├── .gitattributes ├── .gitignore ├── EXAMPLE CODE ├── .htaccess ├── BeautifulSoup.py ├── aa_readme.txt ├── argfile.py ├── argtest.py ├── avelist.py ├── avenum.py ├── average.py ├── badhtml.html ├── celsius.py ├── cleanup.sh ├── clown.txt ├── copytildone.py ├── count1.py ├── count2.py ├── count3.py ├── curl1.py ├── curl2.py ├── curl3.py ├── db1.py ├── db2.py ├── egg.py ├── emaildb.py ├── fahren.py ├── geodata.zip ├── geodata │ ├── README.txt │ ├── geodump.py │ ├── geoload.py │ ├── where.data │ ├── where.html │ └── where.js ├── geojson.py ├── geoxml.py ├── gmane.zip ├── gmane │ ├── Chart.bundle.js │ ├── README.txt │ ├── d3.layout.cloud.js │ ├── d3.v3.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.py │ ├── gline2.htm │ ├── gline3.htm │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.py │ ├── gyear.py │ └── mapping.sqlite ├── grade.py ├── graphics │ ├── graphics.py │ └── histogram.py ├── greet.py ├── grep.py ├── hidden.py ├── intro-short.txt ├── intro.txt ├── json1.py ├── json2.py ├── largest.py ├── mailcount.py ├── mailtop.py ├── mbox-short.txt ├── mbox.txt ├── oauth.py ├── open.py ├── pagerank.zip ├── pagerank │ ├── BeautifulSoup.py │ ├── LICENSE │ ├── README.txt │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── spdump.py │ ├── spider.js │ ├── spider.py │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── pals.py ├── party1.py ├── party2.py ├── party3.py ├── party4.py ├── party5.py ├── pay.py ├── pay2.py ├── pay3.py ├── re01.py ├── re02.py ├── re03.py ├── re04.py ├── re05.py ├── re06.py ├── re07.py ├── re08.py ├── re09.py ├── re10.py ├── re11.py ├── re12.py ├── re13.py ├── re14.py ├── romeo-full.txt ├── romeo.txt ├── roster.py ├── roster.zip ├── roster │ ├── roster.py │ └── roster_data.json ├── search1.py ├── search10.py ├── search2.py ├── search3.py ├── search4.py ├── search5.py ├── search6.py ├── search7.py ├── search8.py ├── search9.py ├── sequence.py ├── socket1.py ├── socket2.py ├── soft.py ├── spamave.py ├── tracks.zip ├── tracks │ ├── Library.xml │ ├── README.txt │ └── tracks.py ├── twdump.py ├── twfriends.py ├── twitter1.py ├── twitter2.py ├── twjoin.py ├── twspider.py ├── twtest.py ├── twurl.py ├── txtcheck.py ├── txtcheck2.py ├── txtcheck3.py ├── txtcount.py ├── txtdelete.py ├── txtmd5.py ├── txtsize.py ├── urljpeg.py ├── urllib1.py ├── urllib2.py ├── urllink2.py ├── urllink3.py ├── urllinks.py ├── urlregex.py ├── urlwords.py ├── whathour.py ├── wikidata.db ├── wikigrade.py ├── wordlist.py ├── words.py ├── words.txt ├── xml1.py └── xml2.py ├── README.md ├── Textbook - Castellano.epub ├── Textbook - English.epub ├── Unit 1 - Introduction ├── A1.1 - Code screenshot.PNG ├── A1.2. - Script execution.PNG └── C1.1 - Firstcode.py ├── Unit 2 - Regular expressions ├── A.2.1 - regex text sample.txt ├── A.2.2 - regex text data.txt └── C2.1 - Programming assignment.py ├── Unit 3 - Networks and sockets └── C3.1. - Programming assignment.py ├── Unit 4 - Programs that surf the web ├── BeautifulSoup.py ├── BeautifulSoup.pyc ├── C4.1. Programming assignment.py └── C4.2. Programming assignment 2.py ├── Unit 5 - Web services and XML ├── BeautifulSoup.py ├── BeautifulSoup.pyc └── C5.1. Programming assignment.py └── Unit 6 - JSON and the REST architecture ├── C.6.1. - Programming assignment 1.py └── C.6.2. 
- Programming assignment 2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #### CUSTOM PART 2 | # For non-coding files 3 | *.docx 4 | *.pdf 5 | 6 | 7 | # Windows image file caches 8 | Thumbs.db 9 | ehthumbs.db 10 | 11 | # Folder config file 12 | Desktop.ini 13 | 14 | # Recycle Bin used on file shares 15 | $RECYCLE.BIN/ 16 | 17 | # Windows Installer files 18 | *.cab 19 | *.msi 20 | *.msm 21 | *.msp 22 | 23 | # Windows shortcuts 24 | *.lnk 25 | 26 | # ========================= 27 | # Operating System Files 28 | # ========================= 29 | 30 | # OSX 31 | # ========================= 32 | 33 | .DS_Store 34 | .AppleDouble 35 | .LSOverride 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | 48 | # Directories potentially created on remote AFP share 49 | .AppleDB 50 | .AppleDesktop 51 | Network Trash Folder 52 | Temporary Items 53 | .apdisk 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /EXAMPLE CODE/.htaccess: -------------------------------------------------------------------------------- 1 | Options +Indexes 2 | AddType text/plain .py 3 | 4 | 5 | Header set Cache-Control "max-age=604800, public" 6 | 7 | Header add Access-Control-Allow-Origin "*" 8 | Header add Access-Control-Allow-Headers "origin, x-requested-with, content-type" 9 | Header add Access-Control-Allow-Methods "GET" 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/aa_readme.txt: -------------------------------------------------------------------------------- 1 | This is the Python 2 version of the sample code 2 | for Python for Informatics.
3 | 4 | The Python 3.0 version of the code is in the folder "code3" 5 | 6 | /Chuck 7 | 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/argfile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | name = sys.argv[1] 4 | handle = open(name, 'r') 5 | text = handle.read() 6 | print name, 'is', len(text), 'bytes' 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/argtest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | print 'Count:', len(sys.argv) 4 | print 'Type:', type(sys.argv) 5 | 6 | for arg in sys.argv: 7 | print 'Argument:', arg 8 | 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/avelist.py: -------------------------------------------------------------------------------- 1 | numlist = list() 2 | while ( True ) : 3 | inp = raw_input('Enter a number: ') 4 | if inp == 'done' : break 5 | value = float(inp) 6 | numlist.append(value) 7 | 8 | average = sum(numlist) / len(numlist) 9 | print 'Average:', average 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/avenum.py: -------------------------------------------------------------------------------- 1 | total = 0 2 | count = 0 3 | while ( True ) : 4 | inp = raw_input('Enter a number: ') 5 | if inp == 'done' : break 6 | value = float(inp) 7 | total = total + value 8 | count = count + 1 9 | 10 | average = total / count 11 | print 'Average:', average 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/average.py: -------------------------------------------------------------------------------- 1 | total = 0 2 | count = 0 3 | while ( True ) : 4 | inp = raw_input('Enter a number: ') 5 | if inp == 'done' : 6 | break 7 | try: 8 | value = float(inp) 9 | except: 10 | print 'Invalid input' 11 | continue 12 | total = total + value 13 | count = count + 1 14 | 15 | average = total / count 16 | print 'Average:', average 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/badhtml.html: -------------------------------------------------------------------------------- 1 |

Hello
2 | First 3 | 4 | Second 7 | 8 | Third 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/celsius.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Celsius Temperature:') 2 | cel = float(inp) 3 | fahr = ( cel * 9.0 ) / 5.0 + 32.0 4 | print fahr 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/cleanup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | rm *.pyc */*.pyc 4 | rm *.sqlite 5 | rm *.zip 6 | 7 | zip -r geodata.zip geodata 8 | zip -r gmane.zip gmane 9 | zip -r pagerank.zip pagerank 10 | zip -r tracks.zip tracks 11 | zip -r roster.zip roster 12 | 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/clown.txt: -------------------------------------------------------------------------------- 1 | the clown ran after the car and the car ran into the tent and the tent fell down on the clown and the car 2 | -------------------------------------------------------------------------------- /EXAMPLE CODE/copytildone.py: -------------------------------------------------------------------------------- 1 | while True: 2 | line = raw_input('> ') 3 | if line[0] == '#' : 4 | continue 5 | if line == 'done': 6 | break 7 | print line 8 | 9 | print 'Done!' 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count1.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | 8 | counts = dict() 9 | for line in fhand: 10 | words = line.split() 11 | for word in words: 12 | if word not in counts: 13 | counts[word] = 1 14 | else: 15 | counts[word] += 1 16 | 17 | print counts 18 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count2.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | fname = raw_input('Enter the file name: ') 4 | try: 5 | fhand = open(fname) 6 | except: 7 | print 'File cannot be opened:', fname 8 | exit() 9 | 10 | counts = dict() 11 | for line in fhand: 12 | line = line.translate(None, string.punctuation) 13 | line = line.lower() 14 | words = line.split() 15 | for word in words: 16 | if word not in counts: 17 | counts[word] = 1 18 | else: 19 | counts[word] += 1 20 | 21 | print counts 22 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count3.py: -------------------------------------------------------------------------------- 1 | import string 2 | fhand = open('romeo-full.txt') 3 | counts = dict() 4 | for line in fhand: 5 | line = line.translate(None, string.punctuation) 6 | line = line.lower() 7 | words = line.split() 8 | for word in words: 9 | if word not in counts: 10 | counts[word] = 1 11 | else: 12 | counts[word] += 1 13 | 14 | # Sort the dictionary by value 15 | lst = list() 16 | for key, val in counts.items(): 17 | lst.append( (val, key) ) 18 | 19 | lst.sort(reverse=True) 20 | 21 | for key, val in lst[:10] : 22 | print key, val 23 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | img = 
urllib.urlopen('http://www.py4inf.com/cover.jpg').read() 4 | fhand = open('cover.jpg', 'w') 5 | fhand.write(img) 6 | fhand.close() 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | img = urllib.urlopen('http://www.py4inf.com/cover.jpg') 4 | fhand = open('cover.jpg', 'w') 5 | size = 0 6 | while True: 7 | info = img.read(100000) 8 | if len(info) < 1 : break 9 | size = size + len(info) 10 | fhand.write(info) 11 | 12 | print size,'characters copied.' 13 | fhand.close() 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib 3 | 4 | print 'Please enter a URL like http://www.py4inf.com/cover.jpg' 5 | urlstr = raw_input().strip() 6 | img = urllib.urlopen(urlstr) 7 | 8 | # Get the last "word" 9 | words = urlstr.split('/') 10 | fname = words[-1] 11 | 12 | # Don't overwrite the file 13 | if os.path.exists(fname) : 14 | if raw_input('Replace '+fname+' (Y/n)?') != 'Y' : 15 | print 'Data not copied' 16 | exit() 17 | print 'Replacing',fname 18 | 19 | fhand = open(fname, 'w') 20 | size = 0 21 | while True: 22 | info = img.read(100000) 23 | if len(info) < 1 : break 24 | size = size + len(info) 25 | fhand.write(info) 26 | 27 | print size,'characters copied to',fname 28 | fhand.close() 29 | -------------------------------------------------------------------------------- /EXAMPLE CODE/db1.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('music.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('DROP TABLE IF EXISTS Tracks ') 7 | cur.execute('CREATE TABLE Tracks (title TEXT, plays INTEGER)') 8 | 9 | conn.close() 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/db2.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('music.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )', 7 | ( 'Thunderstruck', 20 ) ) 8 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )', 9 | ( 'My Way', 15 ) ) 10 | conn.commit() 11 | 12 | print 'Tracks:' 13 | cur.execute('SELECT title, plays FROM Tracks') 14 | for row in cur : 15 | print row 16 | 17 | cur.execute('DELETE FROM Tracks WHERE plays < 100') 18 | 19 | cur.close() 20 | 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/egg.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | if fname == 'na na boo boo' : 3 | print 'NA NA BOO BOO TO YOU - You have been punkd!' 
4 | exit() 5 | 6 | try: 7 | fhand = open(fname) 8 | except: 9 | print 'File cannot be opened:', fname 10 | exit() 11 | count = 0 12 | for line in fhand: 13 | if line.startswith('Subject:') : 14 | count = count + 1 15 | print 'There were', count, 'subject lines in', fname 16 | -------------------------------------------------------------------------------- /EXAMPLE CODE/emaildb.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('emaildb.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute(''' 7 | DROP TABLE IF EXISTS Counts''') 8 | 9 | cur.execute(''' 10 | CREATE TABLE Counts (email TEXT, count INTEGER)''') 11 | 12 | fname = raw_input('Enter file name: ') 13 | if ( len(fname) < 1 ) : fname = 'mbox-short.txt' 14 | fh = open(fname) 15 | for line in fh: 16 | if not line.startswith('From: ') : continue 17 | pieces = line.split() 18 | email = pieces[1] 19 | print email 20 | cur.execute('SELECT count FROM Counts WHERE email = ? ', (email, )) 21 | row = cur.fetchone() 22 | if row is None: 23 | cur.execute('''INSERT INTO Counts (email, count) 24 | VALUES ( ?, 1 )''', ( email, ) ) 25 | else : 26 | cur.execute('UPDATE Counts SET count=count+1 WHERE email = ?', 27 | (email, )) 28 | # This statement commits outstanding changes to disk each 29 | # time through the loop - the program can be made faster 30 | # by moving the commit so it runs only after the loop completes 31 | conn.commit() 32 | 33 | # https://www.sqlite.org/lang_select.html 34 | sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10' 35 | 36 | print 37 | print "Counts:" 38 | for row in cur.execute(sqlstr) : 39 | print str(row[0]), row[1] 40 | 41 | cur.close() 42 | 43 | -------------------------------------------------------------------------------- /EXAMPLE CODE/fahren.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Fahrenheit Temperature:') 2 | fahr = float(inp) 3 | cel = (fahr - 32.0) * 5.0 / 9.0 4 | print cel 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/geodata.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Geocoding API with a Database and 2 | Visualizing data on a Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | You should install the SQLite browser to view and modify 10 | the databases from: 11 | 12 | http://sqlitebrowser.org/ 13 | 14 | The first problem to solve is that the Google geocoding 15 | API is rate limited to 2500 requests per day. So if you have 16 | a lot of data you might need to stop and restart the lookup 17 | process several times. So we break the problem into two 18 | phases. 19 | 20 | In the first phase we take our input data in the file 21 | (where.data) and read it one line at a time, and retrieve the 22 | geocoded response and store it in a database (geodata.sqlite).
23 | Before we use the geocoding API, we simply check to see if 24 | we already have the data for that particular line of input. 25 | 26 | You can re-start the process at any time by removing the file 27 | geodata.sqlite 28 | 29 | Run the geoload.py program. This program will read the input 30 | lines in where.data and for each line check to see if it is already 31 | in the database and if we don't have the data for the location, 32 | call the geocoding API to retrieve the data and store it in 33 | the database. 34 | 35 | Here is a sample run after there is already some data in the 36 | database: 37 | 38 | Mac: python geoload.py 39 | Win: geoload.py 40 | 41 | Found in database Northeastern University 42 | 43 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 44 | 45 | Found in database Technion 46 | 47 | Found in database Viswakarma Institute, Pune, India 48 | 49 | Found in database UMD 50 | 51 | Found in database Tufts University 52 | 53 | Resolving Monash University 54 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Monash+University 55 | Retrieved 2063 characters { "results" : [ 56 | {u'status': u'OK', u'results': ... } 57 | 58 | Resolving Kokshetau Institute of Economics and Management 59 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Kokshetau+Institute+of+Economics+and+Management 60 | Retrieved 1749 characters { "results" : [ 61 | {u'status': u'OK', u'results': ... } 62 | 63 | The first five locations are already in the database and so they 64 | are skipped. The program scans to the point where it finds un-retrieved 65 | locations and starts retrieving them. 66 | 67 | The geoload.py program can be stopped at any time, and there is a counter 68 | that you can use to limit the number of calls to the geocoding 69 | API for each run. 70 | 71 | Once you have some data loaded into geodata.sqlite, you can 72 | visualize the data using the (geodump.py) program. This 73 | program reads the database and writes the file (where.js) 74 | with the location, latitude, and longitude in the form of 75 | executable JavaScript code. 76 | 77 | A run of the geodump.py program is as follows: 78 | 79 | Mac: python geodump.py 80 | Win: geodump.py 81 | 82 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 83 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 84 | ... 85 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 86 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 87 | Kokshetau, Kazakhstan 53.2833333 69.3833333 88 | ... 89 | 12 records written to where.js 90 | Open where.html to view the data in a browser 91 | 92 | The file (where.html) consists of HTML and JavaScript to visualize 93 | a Google Map. It reads the most recent data in where.js to get 94 | the data to be visualized. Here is the format of the where.js file: 95 | 96 | myData = [ 97 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 98 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 99 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 100 | ... 101 | ]; 102 | 103 | This is a JavaScript list of lists. The syntax for JavaScript 104 | list constants is very similar to Python so the syntax should 105 | be familiar to you.
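As an aside, here is a minimal sketch of how that where.js "list of
lists" can be written from Python. The rows below are made-up
placeholders - in this folder the real work is done by geodump.py,
which pulls the rows out of geodata.sqlite:

    # Hypothetical rows; geodump.py reads the real ones from geodata.sqlite
    rows = [ (42.3396998, -71.08975, 'Northeastern University, Boston, MA'),
             (40.6963857, -89.6160811, 'Bradley University, Peoria, IL') ]

    fhand = open('where.js', 'w')
    fhand.write("myData = [\n")
    for num, (lat, lng, where) in enumerate(rows):
        if num > 0 : fhand.write(",\n")
        # A stray single quote in the label would break the JavaScript constant
        where = where.replace("'", "")
        fhand.write("[" + str(lat) + "," + str(lng) + ", '" + where + "']")
    fhand.write("\n];\n")
    fhand.close()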
106 | 107 | Simply open where.html in a browser to see the locations. You 108 | can hover over each map pin to find the location that the 109 | geocoding API returned for the user-entered input. If you 110 | cannot see any data when you open the where.html file, you might 111 | want to check the JavaScript or developer console for your browser. 112 | 113 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js','w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur : 13 | data = str(row[1]) 14 | try: js = json.loads(str(data)) 15 | except: continue 16 | 17 | if not('status' in js and js['status'] == 'OK') : continue 18 | 19 | lat = js["results"][0]["geometry"]["location"]["lat"] 20 | lng = js["results"][0]["geometry"]["location"]["lng"] 21 | if lat == 0 or lng == 0 : continue 22 | where = js['results'][0]['formatted_address'] 23 | where = where.replace("'","") 24 | try : 25 | print where, lat, lng 26 | 27 | count = count + 1 28 | if count > 1 : fhand.write(",\n") 29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 30 | fhand.write(output) 31 | except: 32 | continue 33 | 34 | fhand.write("\n];\n") 35 | cur.close() 36 | fhand.close() 37 | print count, "records written to where.js" 38 | print "Open where.html to view the data in a browser" 39 | 40 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/geoload.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import sqlite3 3 | import json 4 | import time 5 | import ssl 6 | 7 | # If you are in China use this URL: 8 | # serviceurl = "http://maps.google.cn/maps/api/geocode/json?" 9 | serviceurl = "http://maps.googleapis.com/maps/api/geocode/json?" 10 | 11 | # Deal with SSL certificate anomalies Python > 2.7 12 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 13 | scontext = None 14 | 15 | conn = sqlite3.connect('geodata.sqlite') 16 | cur = conn.cursor() 17 | 18 | cur.execute(''' 19 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''') 20 | 21 | fh = open("where.data") 22 | count = 0 23 | for line in fh: 24 | if count > 200 : break 25 | address = line.strip() 26 | print '' 27 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", (buffer(address), )) 28 | 29 | try: 30 | data = cur.fetchone()[0] 31 | print "Found in database ",address 32 | continue 33 | except: 34 | pass 35 | 36 | print 'Resolving', address 37 | url = serviceurl + urllib.urlencode({"sensor":"false", "address": address}) 38 | print 'Retrieving', url 39 | uh = urllib.urlopen(url, context=scontext) 40 | data = uh.read() 41 | print 'Retrieved',len(data),'characters',data[:20].replace('\n',' ') 42 | count = count + 1 43 | try: 44 | js = json.loads(str(data)) 45 | # print js # We print in case unicode causes an error 46 | except: 47 | continue 48 | 49 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 50 | print '==== Failure To Retrieve ====' 51 | print data 52 | break 53 | 54 | cur.execute('''INSERT INTO Locations (address, geodata) 55 | VALUES ( ?, ?
)''', ( buffer(address),buffer(data) ) ) 56 | conn.commit() 57 | time.sleep(1) 58 | 59 | print "Run geodump.py to read the data from the database so you can visualize it on a map." 60 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.data: -------------------------------------------------------------------------------- 1 | Northeastern University 2 | University of Hong Kong, Illinois Institute of Technology, Bradley University 3 | Technion 4 | Viswakarma Institute, Pune, India 5 | UMD 6 | Tufts University 7 | Monash University 8 | Kokshetau Institute of Economics and Management 9 | RSU named S.A. Esenin 10 | Tavrida National V.I. Vernadsky University 11 | UOC 12 | Irkutsk State University 13 | Institute of Technology Telkom 14 | Shanghai Jiao Tong University 15 | University of Ilorin, Kwara State. Nigeria 16 | Monash University Churchill Australia 17 | UNISA 18 | Fachhochschule FH Salzburg 19 | Tampere University of Technology (Tampere, Finland) 20 | Saint Petersburg State University 21 | University of São Paulo 22 | Smolensk State University (Russia) 23 | Institute of Business Administration, Karachi 24 | universidad complutense de madrid 25 | Masdar Institute 26 | University of London 27 | University of Oxford 28 | Tallinn University of Technology 29 | University of Tartu 30 | University of Padua 31 | University of Pune, India 32 | National Kyiv Shevchenko University 33 | UC Berkeley 34 | University of Wisconsin - Madison 35 | Lodz University of Technology 36 | NRU IFMO 37 | Dniepropetrovsk National University (Ukraine), Applied Math Faculty 38 | Dokuz Eylul University, Izmir, Turkey 39 | Beijing normal university 40 | University of Piraeus, Athens 41 | Universidad de Buenos Aires (UBA). Argentina. 42 | SASTRA University 43 | Nagpur University 44 | Duke University 45 | San Francisco State University 46 | FATEC-SP - Faculdade de Tecnologia do Estado de São Paulo 47 | University of Texas at Austin 48 | University of applied sciense of Mikkeli (Finland) 49 | Troy University 50 | Universidade do Minho 51 | National University of Sciences and Technology (NUST)-Pakistan 52 | Pontificia universidad catolica de chile 53 | Illinois State University Joliet Junior College 54 | American University in Cairo (AUC) 55 | Obninsk Technical University of Nuclear Power Engineering, Russia 56 | Vyatka State Humanitarian University 57 | Weizmann Institute of Science (Israel) 58 | University of Washington 59 | Kharkiv State Academy of Municipal Economy, Ukraine 60 | Faculty of Electrical Engineering in Sarajevo, University of Sarajevo 61 | Universidad de Los Andes Colombia 62 | University of Colorado at Boulder 63 | Magnitogorsk State Technical University 64 | USC 65 | Simon Fraser University 66 | Columbia University (New York) 67 | University of Southern California 68 | University of Warsaw 69 | Warsaw University of Technology 70 | (Some place in New Zealand you haven't heard of.) 
71 | Massey university part-time Distance learning 72 | University of Oklahoma 73 | University of Pavia, Italy 74 | University of Missouri - Columbia 75 | Czech Technical University in Prague 76 | Illinois Institute of Technology 77 | Penn State University 78 | University of Utah 79 | Faculty of Science, University of Zagreb - Department of Mathematics 80 | Universitat Politecnica de Valencia 81 | University of Vienna 82 | University of Puerto Rico - Mayaguez Campus 83 | University "Hyperion" of Bucharest 84 | University of New Haven 85 | University of Washington -Bothell 86 | Drexel University 87 | University of Texas at Austin 88 | University of Helsinki 89 | University of Michigan 90 | Carnegie Mellon University 91 | Kazan Federal University 92 | Pondicherry University 93 | Far-Eastern State University 94 | Nanyang Technological University 95 | Slovak University of Technology 96 | NYU 97 | UFABC - Universidade Federal do ABC, Sanso André - SP - Brazil 98 | University of Debrecen 99 | California State University, San Bernardino 100 | National University "Kyiv-Mohyla Academy" (Kyiv, Ukraine) 101 | Laurentian University 102 | Humanities Institute of TV and Radio, Moscow, Russia 103 | University of Cambridge, UK 104 | Payame Noor University, Tehran, Iran 105 | Middle East Technical University 106 | EPFL 107 | Faculty of Technical Sciences, Novi Sad, Serbia 108 | University of Gothenburg, Sweden 109 | Polytechnic University of Timisoara 110 | University of Hawaii (Go, Rainbows!) 111 | Belarusian State University 112 | Haaga-Helia university of applied sciences 113 | JADAVPUR UNIVERSITY 114 | Gauhati University, India 115 | Universidad de Buenos Aires 116 | Università degli Studi di Genova, Genova, Italia 117 | King Mongkut's University of Technology Thonburi 118 | Universidad de la Sabana, Chia, Colombia 119 | State University of New York (SUNY) College at Oswego 120 | Kyrgyz Slavic Russian University 121 | De La Salle University http://www.dlsu.edu.ph 122 | Jawaharlal Nehru Technological University, INDIA 123 | UCL (Université Catholique de Louvain) in Belgium 124 | Boston University 125 | The University of Manchester 126 | Fachhochschule Düsseldorf 127 | Pine Manor College (AA), Harvard University (BA), Lesley University (MEd) 128 | Simón Bolívar University 129 | Indiana University at Bloomington 130 | RPI 131 | University of Ottawa, Canada 132 | Ural Federal University 133 | BITS Pilani 134 | Transilvania University 135 | IIT(BHU), Varanasi, India 136 | EM Lyon 137 | Universidad Central de Venezuela 138 | NTUU "KPI" 139 | Universidade Federal da Paraiba, Brazil 140 | Budapest University of Technology and Economics 141 | Moscow Institute of Physics & Technology (State University) 142 | Saint Petersburg State University of Aerospace Instrumentation, Russia 143 | North Central College, Naperville, IL 144 | Tech. Uni. Denmark (DTU) 145 | Stanford 146 | "Politehnica" Timisoara 147 | National University of Engineering 148 | Monash 149 | Federal University of Campina Grande (UFCG) 150 | Universidade Federal do Rio Grande do Sul (UFRGS) 151 | Universidad Nacional Autónoma de México 152 | University of New South Wales Harvard Business School 153 | University of Tehran 154 | Old Dominion University 155 | Kyiv Unisersity of Oriental Language 156 | Babcock University 157 | University of Essex 158 | Kharkiv National University of Radio Electronics (Ukraine) 159 | Kaunas Technology University 160 | University of Buenos Aires 161 | University of Jaffna. 
162 | R V College of Engineering, Bangalore, India for BE in Instrumentation Technology 163 | Beloit College 164 | UCLA 165 | University of Chicago 166 | University of Sciences and Technology of Oran. Mohamed Boudiaf (USTO-MB). 167 | Zagazig University, Egypt 168 | University of Alberta 169 | Belorussian State University 170 | Jones International University (online) Illinois State Univeristy 171 | University of Florida 172 | Too many to mention. 173 | University of Kerala, India 174 | Politecnico di Milano 175 | Vilnius Gediminas Technical University 176 | Madras university/ Bharthidasan University in India . 177 | Universidade Tecnica de Lisboa - Instituto Superior Técnico 178 | Does not apply. 179 | Stellenbosch University 180 | imt ghazIABAD INDIA 181 | University of Pennsylvania 182 | National Institute of Technology, Jalandhar (India) 183 | Universidad ICESI 184 | Virginia Tech 185 | arizona state university 186 | Universidad del Valle de Guatemala 187 | Mykolas Romeris University, Vilnius, Lithuania 188 | BSU 189 | Distance Learning Center at the Technical University of Kaiserslautern in Germany 190 | Ain shams university, Cairo, Egypt 191 | Universidad Nacional de Colombia 192 | Saint-Petersburg Polytechnic Univesity 193 | NAIT (Northern Alberta Institute of Technology) 194 | Wayne State took courses at U of M 195 | Universidad Nacional, Costa Rica 196 | Marietta College (Ohio) Northwestern University 197 | Grandville 198 | Portland State University, Oregon Institute of Technology 199 | Malayer Azad University, Iran 200 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 43 | 44 | 45 |
46 |

About this Map

47 |

48 | This is a cool map from 49 | www.pythonlearn.com. 50 |

51 | 52 | 53 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.js: -------------------------------------------------------------------------------- 1 | myData = [ 2 | [42.340075,-71.0895367, 'Northeastern, Boston, MA 02115, USA'], 3 | [38.2113643,-85.7470011, 'Bradley Ave, Louisville, KY, USA'], 4 | [32.778949,35.019648, 'Technion/ Sports Building, Haifa'], 5 | [18.4574518,73.8837999, 'Vishwakarma Institutes Play Ground, Yashodhan Society, Kapil Nagar, Kondhwa Budrukh, Vishwakarma, Maharashtra 411048, India'], 6 | [33.1561058,131.826132, 'Japan, 〒875-0002 Ōita-ken, Usuki-shi, Shitanoe, 1232−2 UMD'], 7 | [42.4036847,-71.120482, 'South Hall Tufts University, 30 Lower Campus Rd, Somerville, MA 02144, USA'], 8 | [-37.914517,145.1303881, 'Monash College, Wellington Rd, Clayton VIC 3168, Australia'], 9 | [53.2948229,69.4047872, 'Kokshetau 020000, Kazakhstan'], 10 | [40.7127837,-74.0059413, 'New York, NY, USA'], 11 | [52.2869741,104.3050183, 'Irkutsk, Irkutsk Oblast, Russia'] 12 | ]; 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geojson.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | 4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?' 5 | #serviceurl = 'http://python-data.dr-chuck.net/geojson?' 6 | 7 | while True: 8 | address = raw_input('Enter location: ') 9 | if len(address) < 1 : break 10 | 11 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) 12 | print 'Retrieving', url 13 | uh = urllib.urlopen(url) 14 | data = uh.read() 15 | print 'Retrieved',len(data),'characters' 16 | 17 | try: js = json.loads(str(data)) 18 | except: js = None 19 | if 'status' not in js or js['status'] != 'OK': 20 | print '==== Failure To Retrieve ====' 21 | print data 22 | continue 23 | 24 | print json.dumps(js, indent=4) 25 | 26 | lat = js["results"][0]["geometry"]["location"]["lat"] 27 | lng = js["results"][0]["geometry"]["location"]["lng"] 28 | print 'lat',lat,'lng',lng 29 | location = js['results'][0]['formatted_address'] 30 | print location 31 | 32 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geoxml.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import xml.etree.ElementTree as ET 3 | 4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/xml?' 
5 | 6 | while True: 7 | address = raw_input('Enter location: ') 8 | if len(address) < 1 : break 9 | 10 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) 11 | print 'Retrieving', url 12 | uh = urllib.urlopen(url) 13 | data = uh.read() 14 | print 'Retrieved',len(data),'characters' 15 | print data 16 | tree = ET.fromstring(data) 17 | 18 | 19 | results = tree.findall('result') 20 | lat = results[0].find('geometry').find('location').find('lat').text 21 | lng = results[0].find('geometry').find('location').find('lng').text 22 | location = results[0].find('formatted_address').text 23 | 24 | print 'lat',lat,'lng',lng 25 | print location 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive visualizing the data using the 2 | D3 JavaScript library 3 | 4 | Here is a copy of the Sakai Developer Mailing list from 2006-2014. 5 | 6 | http://mbox.dr-chuck.net/ 7 | 8 | You should install the SQLite browser to view and modify the databases from: 9 | 10 | http://sqlitebrowser.org/ 11 | 12 | The base URL is hard-coded in gmane.py. Make sure to delete the 13 | content.sqlite file if you switch the base url. The gmane.py file 14 | operates as a spider in that it runs slowly and retrieves one mail 15 | message per second so as to avoid getting throttled. It stores all of 16 | its data in a database and can be interrupted and re-started 17 | as often as needed. It may take many hours to pull all the data 18 | down. So you may need to restart several times. 19 | 20 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 21 | email here: 22 | 23 | https://online.dr-chuck.com/files/sakai/email/content.sqlite.zip 24 | 25 | If you download and unzip this, you can "catch up with the 26 | latest" by running gmane.py. 27 | 28 | Navigate to the folder where you extracted the gmane.zip 29 | 30 | Here is a run of gmane.py getting the last five messages of the 31 | sakai developer list: 32 | 33 | Mac: python gmane.py 34 | Win: gmane.py 35 | 36 | How many messages:10 37 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 38 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 39 | http://mbox.dr-chuck.net/sakai.devel/6/7 3586 40 | s-githens@northwestern.edu 2005-12-09T13:32:31-06:00 re: sakaiportallogin and presense 41 | http://mbox.dr-chuck.net/sakai.devel/7/8 10600 42 | john@caret.cam.ac.uk 2005-12-09T13:42:24+00:00 re: lms/vle rants/comments 43 | 44 | The program scans content.sqlite from 1 up to the first message number not 45 | already spidered and starts spidering at that message. It continues spidering 46 | until it has spidered the desired number of messages or it reaches a page 47 | that does not appear to be a properly formatted message. 48 | 49 | Sometimes a message is missing. Perhaps administrators can delete messages 50 | or perhaps they get lost - I don't know.
If your spider stops, and it seems it has hit 51 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 52 | all the other fields blank - and then restart gmane.py. This will unstick the 53 | spidering process and allow it to continue. These empty messages will be ignored in the next 54 | phase of the process. 55 | 56 | One nice thing is that once you have spidered all of the messages and have them in 57 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 58 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 59 | if there are new messages and then quickly retrieve those messages and add them 60 | to content.sqlite. 61 | 62 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed. 63 | This is intentional as it allows you to look at content.sqlite to debug the process. 64 | It would be a bad idea to run any queries against this database as they would be 65 | slow. 66 | 67 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 68 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 69 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 70 | smaller) than content.sqlite because it also compresses the header and body text. 71 | 72 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 73 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 74 | data cleaning process. 75 | 76 | Running gmodel.py works as follows: 77 | 78 | Mac: python gmodel.py 79 | Win: gmodel.py 80 | 81 | Loaded allsenders 1588 and mapping 28 dns mapping 1 82 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 83 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 84 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 85 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 86 | ... 87 | 88 | The gmodel.py program does a number of data cleaning steps: 89 | 90 | Domain names are truncated to two levels for .com, .org, .edu, and .net; 91 | other domain names are truncated to three levels. So si.umich.edu becomes 92 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk (a short sketch of this rule appears below). Also mail addresses are 93 | forced to lower case and some of the @gmane.org addresses like the following 94 | 95 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 96 | 97 | are converted to the real address whenever there is a matching real email 98 | address elsewhere in the message corpus. 99 | 100 | If you look in the content.sqlite database there are two tables that allow 101 | you to map both domain names and individual email addresses that change over 102 | the lifetime of the email list. For example, Steve Githens used the following 103 | email addresses over the life of the Sakai developer list: 104 | 105 | s-githens@northwestern.edu 106 | sgithens@cam.ac.uk 107 | swgithen@mtu.edu 108 | 109 | We can add two entries to the Mapping table 110 | 111 | s-githens@northwestern.edu -> swgithen@mtu.edu 112 | sgithens@cam.ac.uk -> swgithen@mtu.edu 113 | 114 | And so all the mail messages will be collected under one sender even if 115 | they used several email addresses over the lifetime of the mailing list. 116 | 117 | You can also make similar entries in the DNSMapping table if there are multiple 118 | DNS names you want mapped to a single DNS.
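As a quick illustration, the domain-truncation rule described above can
be sketched in Python. This is only a paraphrase of the rule as stated
in this README, not the actual gmodel.py code:

    def truncate_dns(dns):
        pieces = dns.split('.')
        # .com/.org/.edu/.net keep two levels; everything else keeps three
        if pieces[-1] in ('com', 'org', 'edu', 'net'):
            return '.'.join(pieces[-2:])
        return '.'.join(pieces[-3:])

    print truncate_dns('si.umich.edu')      # prints umich.edu
    print truncate_dns('caret.cam.ac.uk')   # prints cam.ac.uk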
In the Sakai data I add the following 119 | mapping: 120 | 121 | iupui.edu -> indiana.edu 122 | 123 | So all the folks from the various Indiana University campuses are tracked together. 124 | 125 | You can re-run gmodel.py over and over as you look at the data, and add mappings 126 | to make the data cleaner and cleaner. When you are done, you will have a nicely 127 | indexed version of the email in index.sqlite. This is the file to use to do data 128 | analysis. With this file, data analysis will be really quick. 129 | 130 | The first, simplest data analysis is to do a "who does the most" and "which 131 | organization does the most"? This is done using gbasic.py: 132 | 133 | Mac: python gbasic.py 134 | Win: gbasic.py 135 | 136 | How many to dump? 5 137 | Loaded messages= 51330 subjects= 25033 senders= 1584 138 | 139 | Top 5 Email list participants 140 | steve.swinsburg@gmail.com 2657 141 | azeckoski@unicon.net 1742 142 | ieb@tfd.co.uk 1591 143 | csev@umich.edu 1304 144 | david.horwitz@uct.ac.za 1184 145 | 146 | Top 5 Email list organizations 147 | gmail.com 7339 148 | umich.edu 6243 149 | uct.ac.za 2451 150 | indiana.edu 2258 151 | unicon.net 2055 152 | 153 | You can look at the data in index.sqlite and if you find a problem, you 154 | can update the Mapping table and DNSMapping table in content.sqlite and 155 | re-run gmodel.py. 156 | 157 | There is a simple visualization of the word frequency in the subject lines 158 | in the file gword.py: 159 | 160 | Mac: python gword.py 161 | Win: gword.py 162 | 163 | Range of counts: 33229 129 164 | Output written to gword.js 165 | 166 | This produces the file gword.js which you can visualize using the file 167 | gword.htm. 168 | 169 | A second visualization is in gline.py. It visualizes email participation by 170 | organizations over time. 171 | 172 | Mac: python gline.py 173 | Win: gline.py 174 | 175 | Loaded messages= 51330 subjects= 25033 senders= 1584 176 | Top 10 Organizations 177 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 178 | Output written to gline.js 179 | 180 | Its output is written to gline.js which is visualized using gline.htm. 181 | If you have a problem with gline.htm, you can try gline2.htm or gline3.htm 182 | to visualize your data. 183 | 184 | Some URLs for visualization ideas: 185 | 186 | https://developers.google.com/chart/ 187 | 188 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 189 | 190 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 191 | 192 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 193 | 194 | http://bost.ocks.org/mike/uberdata/ 195 | 196 | http://mbostock.github.io/d3/talk/20111018/calendar.html 197 | 198 | http://nltk.org/install.html 199 | 200 | As always - comments welcome. 201 | 202 | -- Dr.
Chuck 203 | Sun Sep 29 00:11:01 EDT 2013 204 | 205 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/d3.layout.cloud.js: -------------------------------------------------------------------------------- 1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/ 2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf 3 | (function(exports) { 4 | function cloud() { 5 | var size = [256, 256], 6 | text = cloudText, 7 | font = cloudFont, 8 | fontSize = cloudFontSize, 9 | fontStyle = cloudFontNormal, 10 | fontWeight = cloudFontNormal, 11 | rotate = cloudRotate, 12 | padding = cloudPadding, 13 | spiral = archimedeanSpiral, 14 | words = [], 15 | timeInterval = Infinity, 16 | event = d3.dispatch("word", "end"), 17 | timer = null, 18 | cloud = {}; 19 | 20 | cloud.start = function() { 21 | var board = zeroArray((size[0] >> 5) * size[1]), 22 | bounds = null, 23 | n = words.length, 24 | i = -1, 25 | tags = [], 26 | data = words.map(function(d, i) { 27 | d.text = text.call(this, d, i); 28 | d.font = font.call(this, d, i); 29 | d.style = fontStyle.call(this, d, i); 30 | d.weight = fontWeight.call(this, d, i); 31 | d.rotate = rotate.call(this, d, i); 32 | d.size = ~~fontSize.call(this, d, i); 33 | d.padding = cloudPadding.call(this, d, i); 34 | return d; 35 | }).sort(function(a, b) { return b.size - a.size; }); 36 | 37 | if (timer) clearInterval(timer); 38 | timer = setInterval(step, 0); 39 | step(); 40 | 41 | return cloud; 42 | 43 | function step() { 44 | var start = +new Date, 45 | d; 46 | while (+new Date - start < timeInterval && ++i < n && timer) { 47 | d = data[i]; 48 | d.x = (size[0] * (Math.random() + .5)) >> 1; 49 | d.y = (size[1] * (Math.random() + .5)) >> 1; 50 | cloudSprite(d, data, i); 51 | if (place(board, d, bounds)) { 52 | tags.push(d); 53 | event.word(d); 54 | if (bounds) cloudBounds(bounds, d); 55 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}]; 56 | // Temporary hack 57 | d.x -= size[0] >> 1; 58 | d.y -= size[1] >> 1; 59 | } 60 | } 61 | if (i >= n) { 62 | cloud.stop(); 63 | event.end(tags, bounds); 64 | } 65 | } 66 | } 67 | 68 | cloud.stop = function() { 69 | if (timer) { 70 | clearInterval(timer); 71 | timer = null; 72 | } 73 | return cloud; 74 | }; 75 | 76 | cloud.timeInterval = function(x) { 77 | if (!arguments.length) return timeInterval; 78 | timeInterval = x == null ? Infinity : x; 79 | return cloud; 80 | }; 81 | 82 | function place(board, tag, bounds) { 83 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}], 84 | startX = tag.x, 85 | startY = tag.y, 86 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]), 87 | s = spiral(size), 88 | dt = Math.random() < .5 ? 1 : -1, 89 | t = -dt, 90 | dxdy, 91 | dx, 92 | dy; 93 | 94 | while (dxdy = s(t += dt)) { 95 | dx = ~~dxdy[0]; 96 | dy = ~~dxdy[1]; 97 | 98 | if (Math.min(dx, dy) > maxDelta) break; 99 | 100 | tag.x = startX + dx; 101 | tag.y = startY + dy; 102 | 103 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 || 104 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue; 105 | // TODO only check for collisions within current bounds. 
106 | if (!bounds || !cloudCollide(tag, board, size[0])) { 107 | if (!bounds || collideRects(tag, bounds)) { 108 | var sprite = tag.sprite, 109 | w = tag.width >> 5, 110 | sw = size[0] >> 5, 111 | lx = tag.x - (w << 4), 112 | sx = lx & 0x7f, 113 | msx = 32 - sx, 114 | h = tag.y1 - tag.y0, 115 | x = (tag.y + tag.y0) * sw + (lx >> 5), 116 | last; 117 | for (var j = 0; j < h; j++) { 118 | last = 0; 119 | for (var i = 0; i <= w; i++) { 120 | board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0); 121 | } 122 | x += sw; 123 | } 124 | delete tag.sprite; 125 | return true; 126 | } 127 | } 128 | } 129 | return false; 130 | } 131 | 132 | cloud.words = function(x) { 133 | if (!arguments.length) return words; 134 | words = x; 135 | return cloud; 136 | }; 137 | 138 | cloud.size = function(x) { 139 | if (!arguments.length) return size; 140 | size = [+x[0], +x[1]]; 141 | return cloud; 142 | }; 143 | 144 | cloud.font = function(x) { 145 | if (!arguments.length) return font; 146 | font = d3.functor(x); 147 | return cloud; 148 | }; 149 | 150 | cloud.fontStyle = function(x) { 151 | if (!arguments.length) return fontStyle; 152 | fontStyle = d3.functor(x); 153 | return cloud; 154 | }; 155 | 156 | cloud.fontWeight = function(x) { 157 | if (!arguments.length) return fontWeight; 158 | fontWeight = d3.functor(x); 159 | return cloud; 160 | }; 161 | 162 | cloud.rotate = function(x) { 163 | if (!arguments.length) return rotate; 164 | rotate = d3.functor(x); 165 | return cloud; 166 | }; 167 | 168 | cloud.text = function(x) { 169 | if (!arguments.length) return text; 170 | text = d3.functor(x); 171 | return cloud; 172 | }; 173 | 174 | cloud.spiral = function(x) { 175 | if (!arguments.length) return spiral; 176 | spiral = spirals[x + ""] || x; 177 | return cloud; 178 | }; 179 | 180 | cloud.fontSize = function(x) { 181 | if (!arguments.length) return fontSize; 182 | fontSize = d3.functor(x); 183 | return cloud; 184 | }; 185 | 186 | cloud.padding = function(x) { 187 | if (!arguments.length) return padding; 188 | padding = d3.functor(x); 189 | return cloud; 190 | }; 191 | 192 | return d3.rebind(cloud, event, "on"); 193 | } 194 | 195 | function cloudText(d) { 196 | return d.text; 197 | } 198 | 199 | function cloudFont() { 200 | return "serif"; 201 | } 202 | 203 | function cloudFontNormal() { 204 | return "normal"; 205 | } 206 | 207 | function cloudFontSize(d) { 208 | return Math.sqrt(d.value); 209 | } 210 | 211 | function cloudRotate() { 212 | return (~~(Math.random() * 6) - 3) * 30; 213 | } 214 | 215 | function cloudPadding() { 216 | return 1; 217 | } 218 | 219 | // Fetches a monochrome sprite bitmap for the specified text. 220 | // Load in batches for speed. 
221 | function cloudSprite(d, data, di) { 222 | if (d.sprite) return; 223 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio); 224 | var x = 0, 225 | y = 0, 226 | maxh = 0, 227 | n = data.length; 228 | di--; 229 | while (++di < n) { 230 | d = data[di]; 231 | c.save(); 232 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font; 233 | var w = c.measureText(d.text + "m").width * ratio, 234 | h = d.size << 1; 235 | if (d.rotate) { 236 | var sr = Math.sin(d.rotate * cloudRadians), 237 | cr = Math.cos(d.rotate * cloudRadians), 238 | wcr = w * cr, 239 | wsr = w * sr, 240 | hcr = h * cr, 241 | hsr = h * sr; 242 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5; 243 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr)); 244 | } else { 245 | w = (w + 0x1f) >> 5 << 5; 246 | } 247 | if (h > maxh) maxh = h; 248 | if (x + w >= (cw << 5)) { 249 | x = 0; 250 | y += maxh; 251 | maxh = 0; 252 | } 253 | if (y + h >= ch) break; 254 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio); 255 | if (d.rotate) c.rotate(d.rotate * cloudRadians); 256 | c.fillText(d.text, 0, 0); 257 | c.restore(); 258 | d.width = w; 259 | d.height = h; 260 | d.xoff = x; 261 | d.yoff = y; 262 | d.x1 = w >> 1; 263 | d.y1 = h >> 1; 264 | d.x0 = -d.x1; 265 | d.y0 = -d.y1; 266 | x += w; 267 | } 268 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data, 269 | sprite = []; 270 | while (--di >= 0) { 271 | d = data[di]; 272 | var w = d.width, 273 | w32 = w >> 5, 274 | h = d.y1 - d.y0, 275 | p = d.padding; 276 | // Zero the buffer 277 | for (var i = 0; i < h * w32; i++) sprite[i] = 0; 278 | x = d.xoff; 279 | if (x == null) return; 280 | y = d.yoff; 281 | var seen = 0, 282 | seenRow = -1; 283 | for (var j = 0; j < h; j++) { 284 | for (var i = 0; i < w; i++) { 285 | var k = w32 * j + (i >> 5), 286 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0; 287 | if (p) { 288 | if (j) sprite[k - w32] |= m; 289 | if (j < w - 1) sprite[k + w32] |= m; 290 | m |= (m << 1) | (m >> 1); 291 | } 292 | sprite[k] |= m; 293 | seen |= m; 294 | } 295 | if (seen) seenRow = j; 296 | else { 297 | d.y0++; 298 | h--; 299 | j--; 300 | y++; 301 | } 302 | } 303 | d.y1 = d.y0 + seenRow; 304 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32); 305 | } 306 | } 307 | 308 | // Use mask-based collision detection. 309 | function cloudCollide(tag, board, sw) { 310 | sw >>= 5; 311 | var sprite = tag.sprite, 312 | w = tag.width >> 5, 313 | lx = tag.x - (w << 4), 314 | sx = lx & 0x7f, 315 | msx = 32 - sx, 316 | h = tag.y1 - tag.y0, 317 | x = (tag.y + tag.y0) * sw + (lx >> 5), 318 | last; 319 | for (var j = 0; j < h; j++) { 320 | last = 0; 321 | for (var i = 0; i <= w; i++) { 322 | if (((last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0)) 323 | & board[x + i]) return true; 324 | } 325 | x += sw; 326 | } 327 | return false; 328 | } 329 | 330 | function cloudBounds(bounds, d) { 331 | var b0 = bounds[0], 332 | b1 = bounds[1]; 333 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0; 334 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0; 335 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1; 336 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1; 337 | } 338 | 339 | function collideRects(a, b) { 340 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y; 341 | } 342 | 343 | function archimedeanSpiral(size) { 344 | var e = size[0] / size[1]; 345 | return function(t) { 346 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)]; 347 | }; 348 | } 349 | 350 | function rectangularSpiral(size) { 351 | var dy = 4, 352 | dx = dy * size[0] / size[1], 353 | x = 0, 354 | y = 0; 355 | return function(t) { 356 | var sign = t < 0 ? -1 : 1; 357 | // See triangular numbers: T_n = n * (n + 1) / 2. 358 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) { 359 | case 0: x += dx; break; 360 | case 1: y += dy; break; 361 | case 2: x -= dx; break; 362 | default: y -= dy; break; 363 | } 364 | return [x, y]; 365 | }; 366 | } 367 | 368 | // TODO reuse arrays? 369 | function zeroArray(n) { 370 | var a = [], 371 | i = -1; 372 | while (++i < n) a[i] = 0; 373 | return a; 374 | } 375 | 376 | var cloudRadians = Math.PI / 180, 377 | cw = 1 << 11 >> 5, 378 | ch = 1 << 11, 379 | canvas, 380 | ratio = 1; 381 | 382 | if (typeof document !== "undefined") { 383 | canvas = document.createElement("canvas"); 384 | canvas.width = 1; 385 | canvas.height = 1; 386 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2); 387 | canvas.width = (cw << 5) / ratio; 388 | canvas.height = ch / ratio; 389 | } else { 390 | // node-canvas support 391 | var Canvas = require("canvas"); 392 | canvas = new Canvas(cw << 5, ch); 393 | } 394 | 395 | var c = canvas.getContext("2d"), 396 | spirals = { 397 | archimedean: archimedeanSpiral, 398 | rectangular: rectangularSpiral 399 | }; 400 | c.fillStyle = "red"; 401 | c.textAlign = "center"; 402 | 403 | exports.cloud = cloud; 404 | })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports); 405 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | howmany = int(raw_input("How many to dump? 
")) 7 | 8 | conn = sqlite3.connect('index.sqlite') 9 | conn.text_factory = str 10 | cur = conn.cursor() 11 | 12 | cur.execute('''SELECT Messages.id, sender FROM Messages 13 | JOIN Senders ON Messages.sender_id = Senders.id''') 14 | 15 | sendcounts = dict() 16 | sendorgs = dict() 17 | for message in cur : 18 | sender = message[1] 19 | sendcounts[sender] = sendcounts.get(sender,0) + 1 20 | pieces = sender.split("@") 21 | if len(pieces) != 2 : continue 22 | dns = pieces[1] 23 | sendorgs[dns] = sendorgs.get(dns,0) + 1 24 | 25 | print '' 26 | print 'Top',howmany,'Email list participants' 27 | 28 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 29 | for k in x[:howmany]: 30 | print k, sendcounts[k] 31 | if sendcounts[k] < 10 : break 32 | 33 | print '' 34 | print 'Top',howmany,'Email list organizations' 35 | 36 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 37 | for k in x[:howmany]: 38 | print k, sendorgs[k] 39 | if sendorgs[k] < 10 : break 40 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | months = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | month = message_row[2][:7] 43 | if month not in months : months.append(month) 44 | key = (month, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (month, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | months.sort() 50 | print counts 51 | print months 52 | 53 | fhand = open('gline.js','w') 54 | fhand.write("gline = [ ['Month'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for month in months[1:-1]: 60 | for month in months: 61 | fhand.write(",\n['"+month+"'") 62 | for org in orgs: 63 | key = (month, org) 64 | val = counts.get(key,0) 65 | fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline2.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 21 | 22 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline3.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Line Chart 6 | 7 | 8 | 15 | 16 | 17 | 18 |
19 | 20 |
21 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gmane.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import sqlite3 3 | import time 4 | import ssl 5 | import urllib 6 | from urlparse import urljoin 7 | from urlparse import urlparse 8 | import re 9 | from datetime import datetime, timedelta 10 | 11 | # Not all systems have this so conditionally define parser 12 | try: 13 | import dateutil.parser as parser 14 | except: 15 | pass 16 | 17 | def parsemaildate(md) : 18 | # See if we have dateutil 19 | try: 20 | pdate = parser.parse(tdate) 21 | test_at = pdate.isoformat() 22 | return test_at 23 | except: 24 | pass 25 | 26 | # Non-dateutil version - we try our best 27 | 28 | pieces = md.split() 29 | notz = " ".join(pieces[:4]).strip() 30 | 31 | # Try a bunch of format variations - strptime() is *lame* 32 | dnotz = None 33 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 34 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 35 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 36 | try: 37 | dnotz = datetime.strptime(notz, form) 38 | break 39 | except: 40 | continue 41 | 42 | if dnotz is None : 43 | # print 'Bad Date:',md 44 | return None 45 | 46 | iso = dnotz.isoformat() 47 | 48 | tz = "+0000" 49 | try: 50 | tz = pieces[4] 51 | ival = int(tz) # Only want numeric timezone values 52 | if tz == '-0000' : tz = '+0000' 53 | tzh = tz[:3] 54 | tzm = tz[3:] 55 | tz = tzh+":"+tzm 56 | except: 57 | pass 58 | 59 | return iso+tz 60 | 61 | conn = sqlite3.connect('content.sqlite') 62 | cur = conn.cursor() 63 | conn.text_factory = str 64 | 65 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 66 | 67 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 68 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 69 | subject TEXT, headers TEXT, body TEXT)''') 70 | 71 | start = 0 72 | cur.execute('SELECT max(id) FROM Messages') 73 | try: 74 | row = cur.fetchone() 75 | if row[0] is not None: 76 | start = row[0] 77 | except: 78 | start = 0 79 | row = None 80 | 81 | print start 82 | 83 | many = 0 84 | 85 | # Skip up to five messages 86 | skip = 5 87 | while True: 88 | if ( many < 1 ) : 89 | sval = raw_input('How many messages:') 90 | if ( len(sval) < 1 ) : break 91 | many = int(sval) 92 | 93 | start = start + 1 94 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 95 | try: 96 | row = cur.fetchone() 97 | if row is not None : continue 98 | except: 99 | row = None 100 | 101 | many = many - 1 102 | url = baseurl + str(start) + '/' + str(start + 1) 103 | 104 | try: 105 | # Deal with SSL certificate anomalies Python > 2.7 106 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 107 | # document = urllib.urlopen(url, context=scontext) 108 | 109 | document = urllib.urlopen(url) 110 | 111 | text = document.read() 112 | if document.getcode() != 200 : 113 | print "Error code=",document.getcode(), url 114 | break 115 | except KeyboardInterrupt: 116 | print '' 117 | print 'Program interrupted by user...' 118 | break 119 | except: 120 | print "Unable to retrieve or parse page",url 121 | print sys.exc_info()[0] 122 | break 123 | 124 | print url,len(text) 125 | 126 | if not text.startswith("From "): 127 | if skip < 1 : 128 | print text 129 | print "End of mail stream reached..." 
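        # Several non-mbox responses in a row means we are past the last archived message, so stop the harvest entirely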
130 | quit () 131 | print " Skipping badly formed message" 132 | skip = skip-1 133 | continue 134 | 135 | pos = text.find("\n\n") 136 | if pos > 0 : 137 | hdr = text[:pos] 138 | body = text[pos+2:] 139 | else: 140 | print text 141 | print "Could not find break between headers and body" 142 | break 143 | 144 | skip = 5 # reset skip count 145 | 146 | email = None 147 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 148 | if len(x) == 1 : 149 | email = x[0]; 150 | email = email.strip().lower() 151 | email = email.replace("<","") 152 | else: 153 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 154 | if len(x) == 1 : 155 | email = x[0]; 156 | email = email.strip().lower() 157 | email = email.replace("<","") 158 | 159 | date = None 160 | y = re.findall('\nDate: .*, (.*)\n', hdr) 161 | if len(y) == 1 : 162 | tdate = y[0] 163 | tdate = tdate[:26] 164 | try: 165 | sent_at = parsemaildate(tdate) 166 | except: 167 | print text 168 | print "Parse fail",tdate 169 | break 170 | 171 | subject = None 172 | z = re.findall('\nSubject: (.*)\n', hdr) 173 | if len(z) == 1 : subject = z[0].strip().lower(); 174 | 175 | print " ",email,sent_at,subject 176 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 177 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 178 | 179 | # Only commit every 50th record 180 | # if (many % 50) == 0 : conn.commit() 181 | time.sleep(1) 182 | 183 | conn.commit() 184 | cur.close() 185 | 186 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import re 5 | import zlib 6 | from datetime import datetime, timedelta 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print realsender, sender 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print realsender, sender 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print x,dns 52 | # if dns != dnsmapping.get(dns,dns) : print dns,dnsmapping.get(dns,dns) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(md) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # 
Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print 'Bad Date:',md 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception, e: 124 | # print 'Date ignored ',tdate, e 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | # Open the output database and create empty tables 140 | conn = sqlite3.connect('index.sqlite') 141 | conn.text_factory = str 142 | cur = conn.cursor() 143 | 144 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 145 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 146 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 147 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 148 | 149 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 150 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 151 | sender_id INTEGER, subject_id INTEGER, 152 | headers BLOB, body BLOB)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 154 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 156 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 157 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 158 | (from_id INTEGER, to_id INTEGER)''') 159 | 160 | # Open the mapping information 161 | conn_1 = sqlite3.connect('mapping.sqlite') 162 | conn_1.text_factory = str 163 | cur_1 = conn_1.cursor() 164 | 165 | # Load up the mapping information into memory structures 166 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 167 | for message_row in cur_1 : 168 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 169 | 170 | mapping = dict() 171 | cur_1.execute('''SELECT old,new FROM Mapping''') 172 | for message_row in cur_1 : 173 | old = fixsender(message_row[0]) 174 | new = fixsender(message_row[1]) 175 | mapping[old] = fixsender(new) 176 | 177 | cur_1.close() 178 | 179 | # Open the raw data retrieved from the network 180 | conn_2 = sqlite3.connect('content.sqlite') 181 | 
conn_2.text_factory = str 182 | cur_2 = conn_2.cursor() 183 | 184 | allsenders = list() 185 | cur_2.execute('''SELECT email FROM Messages''') 186 | for message_row in cur_2 : 187 | sender = fixsender(message_row[0]) 188 | if sender is None : continue 189 | if 'gmane.org' in sender : continue 190 | if sender in allsenders: continue 191 | allsenders.append(sender) 192 | 193 | print "Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping) 194 | 195 | cur_2.execute('''SELECT headers, body, sent_at 196 | FROM Messages ORDER BY sent_at''') 197 | 198 | senders = dict() 199 | subjects = dict() 200 | guids = dict() 201 | 202 | count = 0 203 | 204 | for message_row in cur_2 : 205 | hdr = message_row[0] 206 | parsed = parseheader(hdr, allsenders) 207 | if parsed is None: continue 208 | (guid, sender, subject, sent_at) = parsed 209 | 210 | # Apply the sender mapping 211 | sender = mapping.get(sender,sender) 212 | 213 | count = count + 1 214 | if count % 250 == 1 : print count,sent_at, sender 215 | # print guid, sender, subject, sent_at 216 | 217 | if 'gmane.org' in sender: 218 | print "Error in sender ===", sender 219 | 220 | sender_id = senders.get(sender,None) 221 | subject_id = subjects.get(subject,None) 222 | guid_id = guids.get(guid,None) 223 | 224 | if sender_id is None : 225 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 226 | conn.commit() 227 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 228 | try: 229 | row = cur.fetchone() 230 | sender_id = row[0] 231 | senders[sender] = sender_id 232 | except: 233 | print 'Could not retrieve sender id',sender 234 | break 235 | if subject_id is None : 236 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 237 | conn.commit() 238 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 239 | try: 240 | row = cur.fetchone() 241 | subject_id = row[0] 242 | subjects[subject] = subject_id 243 | except: 244 | print 'Could not retrieve subject id',subject 245 | break 246 | # print sender_id, subject_id 247 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 248 | ( guid, sender_id, subject_id, sent_at, zlib.compress(message_row[0]), zlib.compress(message_row[1])) ) 249 | conn.commit() 250 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 251 | try: 252 | row = cur.fetchone() 253 | message_id = row[0] 254 | guids[guid] = message_id 255 | except: 256 | print 'Could not retrieve guid id',guid 257 | break 258 | 259 | # Close the connections 260 | cur.close() 261 | cur_2.close() 262 | 263 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | import string 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | conn.text_factory = str 9 | cur = conn.cursor() 10 | 11 | cur.execute('''SELECT subject_id,subject FROM Messages 12 | JOIN Subjects ON Messages.subject_id = Subjects.id''') 13 | 14 | counts = dict() 15 | for message_row in cur : 16 | text = message_row[1] 17 | text = text.translate(None, string.punctuation) 18 | text = text.translate(None, '1234567890') 19 | text = text.strip() 20 | text = text.lower() 21 | words = text.split() 22 | for word in words: 23 | if len(word) < 4 : continue 24 | counts[word] = counts.get(word,0) + 1 25 | 26 | # Find the top 100 words 27 | words = sorted(counts, key=counts.get, reverse=True) 28 | highest = None 29 | lowest = None 30 | for w in words[:100]: 31 | if highest is None or highest < counts[w] : 32 | highest = counts[w] 33 | if lowest is None or lowest > counts[w] : 34 | lowest = counts[w] 35 | print 'Range of counts:',highest,lowest 36 | 37 | # Spread the font sizes across 20-100 based on the count 38 | bigsize = 80 39 | smallsize = 20 40 | 41 | fhand = open('gword.js','w') 42 | fhand.write("gword = [") 43 | first = True 44 | for k in words[:100]: 45 | if not first : fhand.write( ",\n") 46 | first = False 47 | size = counts[k] 48 | size = (size - lowest) / float(highest - lowest) 49 | size = int((size * bigsize) + smallsize) 50 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 51 | fhand.write( "\n];\n") 52 | 53 | print "Output written to gword.js" 54 | print "Open gword.htm in a browser to view" 55 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | years = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if 
len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | year = message_row[2][:4] 43 | if year not in years : years.append(year) 44 | key = (year, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (year, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | years.sort() 50 | print counts 51 | print years 52 | 53 | fhand = open('gline.js','w') 54 | fhand.write("gline = [ ['Year'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for year in years[1:-1]: 60 | for year in years: 61 | fhand.write(",\n['"+year+"'") 62 | for org in orgs: 63 | key = (year, org) 64 | val = counts.get(key,0) 65 | fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane/mapping.sqlite -------------------------------------------------------------------------------- /EXAMPLE CODE/grade.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter score: ') 2 | try: 3 | score = float(inp) 4 | except: 5 | score = -1 6 | 7 | if score > 1.0 or score < 0.0: 8 | print 'Bad score' 9 | elif score > 0.9: 10 | print 'A' 11 | elif score > 0.8: 12 | print 'B' 13 | elif score > 0.7: 14 | print 'C' 15 | elif score > 0.6: 16 | print 'D' 17 | else: 18 | print 'F' 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/graphics/histogram.py: -------------------------------------------------------------------------------- 1 | import string 2 | from graphics import * 3 | 4 | fname = raw_input("Enter file name:") 5 | if len(fname) == 0 : 6 | print "Assuming mbox-short.txt" 7 | fname = "mbox-short.txt" 8 | infile = open(fname, "r") 9 | 10 | # Set up a 24 element list of zeros 11 | totals = [0] * 24; 12 | print totals; 13 | 14 | # Accumulate the times 15 | for line in infile: 16 | if line[0:5] == "From " : 17 | words = line.split() 18 | time = words[5] 19 | print "Time", time 20 | 21 | # Split time 22 | tsplit = time.split(':') 23 | try : 24 | hour = int(tsplit[0]) 25 | print "Hour", hour 26 | except: 27 | print "Hour not found" 28 | continue 29 | 30 | totals[hour] = totals[hour] + 1 31 | print totals 32 | 33 | bmax = max(totals) 34 | print "Maximum value", bmax 35 | 36 | ymax = ( int(bmax / 10) + 1 ) * 10 37 | 38 | print "Y-Axis Maximum", ymax 39 | 40 | win = GraphWin("Distribution of Commits "+fname, 600,400) 41 | win.setCoords(0,0,1,1) 42 | 43 | # Draw the X-Axis 44 | xaxis = Line(Point(0.1,0.1),Point(0.9,0.1)) 45 | xaxis.draw(win) 46 | 47 | # Label the X-Axis - we have 24 hours (0-23) 48 | # so we need to know each slot's width 49 | width = 0.8 * (1.0 / 24.0) 50 | for i in range(24): 51 | center = (i * width) + (width / 2.0) + 0.1; 52 | txt = Text(Point(center, 0.066), str(i)) 53 | txt.draw(win) 54 | 55 | txt = Text(Point(0.5,0.033),"Hour of the Day"); 56 | txt.draw(win) 57 | 58 | # Draw the Y-Axis 59 | yaxis = Line(Point(0.1,0.1),Point(0.1,0.9)) 60 | yaxis.draw(win) 61 | 62 | # Label the Y-Axis 63 | # we will have 10 labels up to ymax 64 | unit = ymax / 10.0; 65 | for i in range(10) : 66 | 
center = 0.1 + (i + 1) * 0.08; 67 | value = int( (i + 1) * unit ) ; 68 | txt = Text(Point(0.066,center), str(value)) 69 | txt.draw(win) 70 | 71 | 72 | # Draw the bars 73 | for i in range(24): 74 | if totals[i] == 0: 75 | continue 76 | left = i * width + 0.1; 77 | right = i * width + width + 0.1; 78 | height = (float(totals[i]) / ymax) * 0.8; 79 | rec = Rectangle(Point(left,0.1), Point(right,0.1+height)) 80 | rec.setFill('blue') 81 | rec.draw(win) 82 | 83 | win.getMouse() 84 | -------------------------------------------------------------------------------- /EXAMPLE CODE/greet.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter your name:') 2 | print 'Hello', name 3 | -------------------------------------------------------------------------------- /EXAMPLE CODE/grep.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with From and have an at sign 2 | import re 3 | hand = open('mbox.txt') 4 | search = raw_input('Enter a regular expression: ') 5 | count = 0 6 | for line in hand: 7 | line = line.rstrip() 8 | if re.search(search,line) : count = count + 1 9 | 10 | print 'mbox.txt had',count,'lines that matched',search 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/hidden.py: -------------------------------------------------------------------------------- 1 | # Keep this file separate 2 | # https://apps.twitter.com/ 3 | 4 | def oauth() : 5 | return { "consumer_key" : "h7Lu...Ng", 6 | "consumer_secret" : "dNKenAC3New...mmn7Q", 7 | "token_key" : "10185562-eibxCp9n2...P4GEQQOSGI", 8 | "token_secret" : "H0ycCFemmC4wyf1...qoIpBo" } 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 
11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/json1.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = """{ 4 | "name" : "Chuck", 5 | "phone" : { 6 | "type" : "intl", 7 | "number" : "+1 734 303 4456" 8 | }, 9 | "email" : { 10 | "hide" : "yes" 11 | } 12 | }""" 13 | 14 | info = json.loads(data) 15 | print 'Name:',info["name"] 16 | print 'Hide:',info["email"]["hide"] 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/json2.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | input = ''' 4 | [ 5 | { "id" : "001", 6 | "x" : "2", 7 | "name" : "Chuck" 8 | } , 9 | { "id" : "009", 10 | "x" : "7", 11 | "name" : "Chuck" 12 | } 13 | ]''' 14 | 15 | info = json.loads(input) 16 | print 'User count:', len(info) 17 | 18 | for item in info: 19 | print 'Name', item['name'] 20 | print 'Id', item['id'] 21 | print 'Attribute', item['x'] 22 | 23 | -------------------------------------------------------------------------------- /EXAMPLE CODE/largest.py: -------------------------------------------------------------------------------- 1 | largest = None 2 | print 'Before:', largest 3 | for iterval in [3, 41, 12, 9, 74, 15]: 4 | if largest == None or largest < iterval: 5 | largest = iterval 6 | print 'Loop:', iterval, largest 7 | print 'Largest:', largest 8 | 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/mailcount.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | email = pieces[1] 8 | c[email] = c.get(email,0) + 1 9 | 10 | print c 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/mailtop.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | email = pieces[1] 8 | c[email] = c.get(email,0) + 1 9 | 10 | bigc = None 11 | bigw = None 12 | for word in c: 13 | value = c[word] 14 | if bigc == None or value > bigc: 15 | bigw = word 16 | bigc = value 17 | 18 | print bigw, bigc 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /EXAMPLE CODE/open.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox.txt') 2 | count = 0 3 | for line in fhand: 4 | count = count + 1 5 | print 'Line Count:', count 6 | 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/pagerank.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Mac: rm spider.sqlite 17 | Mac: python spider.py 18 | 19 | Win: del spider.sqlite 20 | Win: spider.py 21 | 22 | Enter web url or enter: http://www.dr-chuck.com/ 23 | ['http://www.dr-chuck.com'] 24 | How many pages:2 25 | 1 http://www.dr-chuck.com/ 12 26 | 2 http://www.dr-chuck.com/csev-blog/ 57 27 | How many pages: 28 | 29 | In this sample run, we told it to crawl a website and retrieve two 30 | pages. If you restart the program again and tell it to crawl more 31 | pages, it will not re-crawl any pages already in the database. Upon 32 | restart it goes to a random non-crawled page and starts there. So 33 | each successive run of spider.py is additive. 34 | 35 | Mac: python spider.py 36 | Win: spider.py 37 | 38 | Enter web url or enter: http://www.dr-chuck.com/ 39 | ['http://www.dr-chuck.com'] 40 | How many pages:3 41 | 3 http://www.dr-chuck.com/csev-blog 57 42 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 43 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 44 | How many pages: 45 | 46 | You can have multiple starting points in the same database - 47 | within the program these are called "webs". The spider 48 | chooses randomly amongst all non-visited links across all 49 | the webs. 
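The "restart at a random non-crawled page" behavior is a single query
in spider.py. A minimal sketch of just that selection step (assuming
spider.sqlite already holds the Pages table that spider.py creates):

    import sqlite3

    conn = sqlite3.connect('spider.sqlite')
    cur = conn.cursor()
    # Pick one page at random that has not been retrieved yet
    # (html is NULL) and has not previously failed (error is NULL)
    cur.execute('''SELECT id,url FROM Pages
        WHERE html is NULL and error is NULL
        ORDER BY RANDOM() LIMIT 1''')
    row = cur.fetchone()
    if row is None:
        print 'Nothing left to crawl in this database'
    else:
        print 'Next page to crawl:', row[1]
    cur.close()

Because the next page is chosen with ORDER BY RANDOM(), successive runs
wander the frontier of every web rather than re-crawling what is
already stored.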
50 | 51 | If your code fails complaining about certificate problems, 52 | there is some code (SSL) that can be un-commented to work 53 | around certificate problems. 54 | 55 | If you want to dump the contents of the spider.sqlite file, you can 56 | run spdump.py as follows: 57 | 58 | Mac: python spdump.py 59 | Win: spdump.py 60 | 61 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 62 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 63 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 64 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 65 | 4 rows. 66 | 67 | This shows the number of incoming links, the old page rank, the new page 68 | rank, the id of the page, and the url of the page. The spdump.py program 69 | only shows pages that have at least one incoming link to them. 70 | 71 | Once you have a few pages in the database, you can run Page Rank on the 72 | pages using the sprank.py program. You simply tell it how many Page 73 | Rank iterations to run. 74 | 75 | Mac: python sprank.py 76 | Win: sprank.py 77 | 78 | How many iterations:2 79 | 1 0.546848992536 80 | 2 0.226714939664 81 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 82 | 83 | You can dump the database again to see that page rank has been updated: 84 | 85 | Mac: python spdump.py 86 | Win: spdump.py 87 | 88 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 89 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 90 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 91 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 92 | 4 rows. 93 | 94 | You can run sprank.py as many times as you like and it will simply refine 95 | the page rank the more times you run it. You can even run sprank.py a few times 96 | and then go spider a few more pages with spider.py and then run sprank.py 97 | to converge the page ranks. 98 | 99 | If you want to restart the Page Rank calculations without re-spidering the 100 | web pages, you can use spreset.py 101 | 102 | Mac: python spreset.py 103 | Win: spreset.py 104 | 105 | All pages set to a rank of 1.0 106 | 107 | Mac: python sprank.py 108 | Win: sprank.py 109 | 110 | How many iterations:50 111 | 1 0.546848992536 112 | 2 0.226714939664 113 | 3 0.0659516187242 114 | 4 0.0244199333 115 | 5 0.0102096489546 116 | 6 0.00610244329379 117 | ... 118 | 42 0.000109076928206 119 | 43 9.91987599002e-05 120 | 44 9.02151706798e-05 121 | 45 8.20451504471e-05 122 | 46 7.46150183837e-05 123 | 47 6.7857770908e-05 124 | 48 6.17124694224e-05 125 | 49 5.61236959327e-05 126 | 50 5.10410499467e-05 127 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 128 | 129 | For each iteration of the page rank algorithm it prints the average 130 | change per page of the page rank. The network initially is quite 131 | unbalanced and so the individual page ranks are changing wildly. 132 | But in a few short iterations, the page rank converges. You 133 | should run sprank.py long enough that the page ranks converge. 134 | 135 | If you want to visualize the current top pages in terms of page rank, 136 | run spjson.py to write the pages out in JSON format to be viewed in a 137 | web browser. 138 | 139 | Mac: python spjson.py 140 | Win: spjson.py 141 | 142 | Creating JSON output on spider.js... 143 | How many nodes?
30 144 | Open force.html in a browser to view the visualization 145 | 146 | You can view this data by opening the file force.html in your web browser. 147 | This shows an automatic layout of the nodes and links. You can click and 148 | drag any node and you can also double click on a node to find the URL 149 | that is represented by the node. 150 | 151 | This visualization is provided using the force layout from: 152 | 153 | http://mbostock.github.com/d3/ 154 | 155 | If you rerun the other utilities and then re-run spjson.py - you merely 156 | have to press refresh in the browser to get the new data from spider.js. 157 | 158 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
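<!-- force.js renders the graph into the #chart div above; it expects the global spiderJson that spjson.py writes into spider.js -->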
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print row 14 | count = count + 1 15 | print count, 'rows.' 
16 | cur.close() 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":1,"rank":0.0, "id":1, "url":"http://python-data.dr-chuck.net"}, 3 | {"weight":1,"rank":4.66423227024, "id":4, "url":"http://python-data.dr-chuck.net/comments_42.html"}, 4 | {"weight":1,"rank":1.38142061792, "id":7, "url":"http://python-data.dr-chuck.net/known_by_42.html"}, 5 | {"weight":1,"rank":0.690710255581, "id":9, "url":"http://python-data.dr-chuck.net/known_by_Kaylyn.html"}, 6 | {"weight":2,"rank":2.26669663573, "id":40, "url":"http://python-data.dr-chuck.net/known_by_Takua.html"}, 7 | {"weight":1,"rank":0.690710255581, "id":82, "url":"http://python-data.dr-chuck.net/known_by_Marwan.html"}, 8 | {"weight":2,"rank":7.45553422719, "id":85, "url":"http://python-data.dr-chuck.net/known_by_Samiya.html"}, 9 | {"weight":2,"rank":8.48734569457, "id":145, "url":"http://python-data.dr-chuck.net/known_by_Shihed.html"}, 10 | {"weight":1,"rank":0.518032667194, "id":189, "url":"http://python-data.dr-chuck.net/known_by_Cassidy.html"}, 11 | {"weight":2,"rank":1.56869025396, "id":199, "url":"http://python-data.dr-chuck.net/known_by_Vinnie.html"}, 12 | {"weight":2,"rank":2.54881807574, "id":203, "url":"http://python-data.dr-chuck.net/known_by_Charlee.html"}, 13 | {"weight":1,"rank":8.83695381234, "id":248, "url":"http://python-data.dr-chuck.net/known_by_Atli.html"}, 14 | {"weight":2,"rank":4.16614971195, "id":309, "url":"http://python-data.dr-chuck.net/known_by_Abbiegail.html"}, 15 | {"weight":2,"rank":2.2314317079, "id":326, "url":"http://python-data.dr-chuck.net/known_by_Nisha.html"}, 16 | {"weight":1,"rank":1.21603900362, "id":382, "url":"http://python-data.dr-chuck.net/known_by_Ciar.html"}, 17 | {"weight":1,"rank":1.89945314693, "id":413, "url":"http://python-data.dr-chuck.net/known_by_Brodie.html"}, 18 | {"weight":2,"rank":19.0, "id":501, "url":"http://python-data.dr-chuck.net/known_by_Kylar.html"}, 19 | {"weight":2,"rank":5.3834045047, "id":642, "url":"http://python-data.dr-chuck.net/known_by_Mohamed.html"}, 20 | {"weight":1,"rank":3.93023811326, "id":676, "url":"http://python-data.dr-chuck.net/known_by_Oluwaferanmi.html"}, 21 | {"weight":1,"rank":2.59745947896, "id":813, "url":"http://python-data.dr-chuck.net/known_by_Maree.html"}, 22 | {"weight":1,"rank":1.77055254257, "id":873, "url":"http://python-data.dr-chuck.net/known_by_Shaw.html"}], 23 | "links":[ 24 | {"source":0,"target":1,"value":3}, 25 | {"source":0,"target":2,"value":3}, 26 | {"source":0,"target":0,"value":3}, 27 | {"source":2,"target":3,"value":3}, 28 | {"source":2,"target":4,"value":3}, 29 | {"source":2,"target":5,"value":3}, 30 | {"source":2,"target":6,"value":3}, 31 | {"source":5,"target":7,"value":3}, 32 | {"source":5,"target":8,"value":3}, 33 | {"source":5,"target":9,"value":3}, 34 | {"source":5,"target":10,"value":3}, 35 | {"source":6,"target":11,"value":3}, 36 | {"source":4,"target":12,"value":3}, 37 | {"source":4,"target":13,"value":3}, 38 | {"source":4,"target":14,"value":3}, 39 | {"source":8,"target":15,"value":3}, 40 | {"source":7,"target":16,"value":3}, 41 | {"source":13,"target":17,"value":3}, 42 | {"source":10,"target":18,"value":3}, 43 | {"source":14,"target":19,"value":3}, 44 | {"source":18,"target":20,"value":3}, 45 | {"source":18,"target":17,"value":3}, 46 | {"source":20,"target":9,"value":3}, 47 | {"source":17,"target":6,"value":3}, 48 | 
{"source":9,"target":12,"value":3}]}; -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib 3 | import ssl 4 | from urlparse import urljoin 5 | from urlparse import urlparse 6 | from BeautifulSoup import * 7 | 8 | # Deal with SSL certificate anomalies Python > 2.7 9 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 10 | scontext = None 11 | 12 | conn = sqlite3.connect('spider.sqlite') 13 | cur = conn.cursor() 14 | 15 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 16 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 17 | error INTEGER, old_rank REAL, new_rank REAL)''') 18 | 19 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 20 | (from_id INTEGER, to_id INTEGER)''') 21 | 22 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 23 | 24 | # Check to see if we are already in progress... 25 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 26 | row = cur.fetchone() 27 | if row is not None: 28 | print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl." 29 | else : 30 | starturl = raw_input('Enter web url or enter: ') 31 | if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/' 32 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 33 | web = starturl 34 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 35 | pos = starturl.rfind('/') 36 | web = starturl[:pos] 37 | 38 | if ( len(web) > 1 ) : 39 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) ) 40 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 41 | conn.commit() 42 | 43 | # Get the current webs 44 | cur.execute('''SELECT url FROM Webs''') 45 | webs = list() 46 | for row in cur: 47 | webs.append(str(row[0])) 48 | 49 | print webs 50 | 51 | many = 0 52 | while True: 53 | if ( many < 1 ) : 54 | sval = raw_input('How many pages:') 55 | if ( len(sval) < 1 ) : break 56 | many = int(sval) 57 | many = many - 1 58 | 59 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 60 | try: 61 | row = cur.fetchone() 62 | # print row 63 | fromid = row[0] 64 | url = row[1] 65 | except: 66 | print 'No unretrieved HTML pages found' 67 | many = 0 68 | break 69 | 70 | print fromid, url, 71 | 72 | # If we are retrieving this page, there should be no links from it 73 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 74 | try: 75 | # Deal with SSL certificate anomalies Python > 2.7 76 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 77 | # document = urllib.urlopen(url, context=scontext) 78 | 79 | # Normal Unless you encounter certificate problems 80 | document = urllib.urlopen(url) 81 | 82 | html = document.read() 83 | if document.getcode() != 200 : 84 | print "Error on page: ",document.getcode() 85 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 86 | 87 | if 'text/html' != document.info().gettype() : 88 | print "Ignore non text/html page" 89 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 90 | conn.commit() 91 | continue 92 | 93 | print '('+str(len(html))+')', 94 | 95 | soup = BeautifulSoup(html) 96 | except KeyboardInterrupt: 97 | print '' 98 | print 'Program interrupted by user...' 
99 | break 100 | except: 101 | print "Unable to retrieve or parse page" 102 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 103 | conn.commit() 104 | continue 105 | 106 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 107 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) ) 108 | conn.commit() 109 | 110 | # Retrieve all of the anchor tags 111 | tags = soup('a') 112 | count = 0 113 | for tag in tags: 114 | href = tag.get('href', None) 115 | if ( href is None ) : continue 116 | # Resolve relative references like href="/contact" 117 | up = urlparse(href) 118 | if ( len(up.scheme) < 1 ) : 119 | href = urljoin(url, href) 120 | ipos = href.find('#') 121 | if ( ipos > 1 ) : href = href[:ipos] 122 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 123 | if ( href.endswith('/') ) : href = href[:-1] 124 | # print href 125 | if ( len(href) < 1 ) : continue 126 | 127 | # Check if the URL is in any of the webs 128 | found = False 129 | for web in webs: 130 | if ( href.startswith(web) ) : 131 | found = True 132 | break 133 | if not found : continue 134 | 135 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 136 | count = count + 1 137 | conn.commit() 138 | 139 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 140 | try: 141 | row = cur.fetchone() 142 | toid = row[0] 143 | except: 144 | print 'Could not retrieve id' 145 | continue 146 | # print fromid, toid 147 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 148 | 149 | 150 | print count 151 | 152 | cur.close() 153 | 154 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print "Creating JSON output on spider.js..." 7 | howmany = int(raw_input("How many nodes? 
")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank < rank or maxrank is None : maxrank = rank 22 | if minrank > rank or minrank is None : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print "Error - please run sprank.py to compute page rank" 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print "Open force.html in a browser to view the visualization" 62 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = raw_input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print "Nothing to page rank. Check data." 
40 | quit() 41 | 42 | # Let's do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in prev_ranks.items(): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and send the page rank down each 53 | for (node, old_rank) in prev_ranks.items(): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in next_ranks.items(): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in next_ranks.items(): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # as an indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in prev_ranks.items(): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print i+1, avediff 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print next_ranks.items()[:5] 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in next_ranks.items() : 100 | cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print "All pages set to a rank of 1.0" 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pals.py: -------------------------------------------------------------------------------- 1 | friends = ['Joseph', 'Glenn', 'Sally'] 2 | for friend in friends: 3 | print 'Happy New Year:', friend 4 | print 'Done!' 
5 | 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party1.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def party(self) : 5 | self.x = self.x + 1 6 | print "So far",self.x 7 | 8 | an = PartyAnimal() 9 | 10 | an.party() 11 | an.party() 12 | an.party() 13 | 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party2.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def party(self) : 5 | self.x = self.x + 1 6 | print "So far",self.x 7 | 8 | an = PartyAnimal() 9 | 10 | print "Type", type(an) 11 | print "Dir ", dir(an) 12 | 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party3.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def __init__(self): 5 | print "I am constructed" 6 | 7 | def party(self) : 8 | self.x = self.x + 1 9 | print "So far",self.x 10 | 11 | def __del__(self): 12 | print "I am destructed", self.x 13 | 14 | an = PartyAnimal() 15 | an.party() 16 | an.party() 17 | an.party() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party4.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | name = "" 4 | def __init__(self, nam): 5 | self.name = nam 6 | print self.name,"constructed" 7 | 8 | def party(self) : 9 | self.x = self.x + 1 10 | print self.name,"party count",self.x 11 | 12 | s = PartyAnimal("Sally") 13 | s.party() 14 | 15 | j = PartyAnimal("Jim") 16 | j.party() 17 | s.party() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party5.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | name = "" 4 | def __init__(self, nam): 5 | self.name = nam 6 | print self.name,"constructed" 7 | 8 | def party(self) : 9 | self.x = self.x + 1 10 | print self.name,"party count",self.x 11 | 12 | class FootballFan(PartyAnimal): 13 | points = 0 14 | def touchdown(self): 15 | self.points = self.points + 7 16 | self.party() 17 | print self.name,"points",self.points 18 | 19 | s = PartyAnimal("Sally") 20 | s.party() 21 | 22 | j = FootballFan("Jim") 23 | j.party() 24 | j.touchdown() 25 | 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Hours: ') 2 | hours = float(inp) 3 | inp = raw_input('Enter Rate: ') 4 | rate = float(inp) 5 | pay = hours * rate 6 | print 'Pay:', pay 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay2.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Hours: ') 2 | hours = float(inp) 3 | inp = raw_input('Enter Rate: ') 4 | rate = float(inp) 5 | if hours > 40: 6 | pay = hours * rate + (hours - 40) * rate * 0.5 7 | else: 8 | pay = hours * rate 9 | print 'Pay:', pay 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay3.py: -------------------------------------------------------------------------------- 1 | try: 2 | inp = raw_input('Enter Hours: ') 3 
| hours = float(inp) 4 | inp = raw_input('Enter Rate: ') 5 | rate = float(inp) 6 | if hours > 40: 7 | pay = hours * rate + (hours - 40) * rate * 0.5 8 | else: 9 | pay = hours * rate 10 | print 'Pay:', pay 11 | except: 12 | print 'Error, please enter numeric input' 13 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re01.py: -------------------------------------------------------------------------------- 1 | # Search for lines that contain 'From:' anywhere in the line 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('From:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re02.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with 'From:' 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^From:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re03.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with F, then any two characters, then 'm:' 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^F..m:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re04.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with From and have an at sign 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^From:.+@', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re05.py: -------------------------------------------------------------------------------- 1 | import re 2 | s = 'This message from csev@umich.edu to cwen@iupui.edu is about a meeting @2PM' 3 | lst = re.findall('\S+@\S+', s) 4 | print lst 5 | 6 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re06.py: -------------------------------------------------------------------------------- 1 | # Extract anything that looks like an email address from each line 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('\S+@\S+', line) 7 | if len(x) > 0 : 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re07.py: -------------------------------------------------------------------------------- 1 | # Extract email addresses that start with a letter or digit and end with a letter 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('[a-zA-Z0-9]\S+@\S+[a-zA-Z]', line) 7 | if len(x) > 0 : 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re08.py: -------------------------------------------------------------------------------- 1 | # Extract the value from header lines that start with X 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^X\S*: (\S+)', line) 7 | if not x : continue 8 | print x 9 | 10 |
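A note on the capture-group pattern in re08.py just above: re.findall returns the entire match unless the regular expression contains parentheses, in which case it returns only the parenthesized part. A minimal stand-alone sketch of the difference (the sample line mimics an mbox header; Python 2 print syntax to match the rest of this folder):

import re

line = 'X-DSPAM-Confidence: 0.8475'
# Without a group, findall returns the whole match
print re.findall('^X\S*: \S+', line)    # ['X-DSPAM-Confidence: 0.8475']
# With a group, findall returns only the text inside the parentheses
print re.findall('^X\S*: (\S+)', line)  # ['0.8475']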
-------------------------------------------------------------------------------- /EXAMPLE CODE/re09.py: -------------------------------------------------------------------------------- 1 | # Search for header lines that start with X and have a numeric value 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^X\S*: [0-9.]+', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re10.py: -------------------------------------------------------------------------------- 1 | import re 2 | hand = open('mbox-short.txt') 3 | for line in hand: 4 | line = line.rstrip() 5 | x = re.findall('^X\S*: ([0-9.]+)', line) 6 | if len(x) > 0 : 7 | print x 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re11.py: -------------------------------------------------------------------------------- 1 | # Extract the revision number from lines that start with Details: 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^Details:.*rev=([0-9.]+)', line) 7 | if len(x) > 0: 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re12.py: -------------------------------------------------------------------------------- 1 | # Extract the two-digit hour from lines that start with From 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^From .* ([0-9][0-9]):', line) 7 | if len(x) > 0 : print x 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re13.py: -------------------------------------------------------------------------------- 1 | # Extract the domain from Author: lines that have an at sign 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('Author:.*@(\S+)', line) 7 | if not x : continue 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re14.py: -------------------------------------------------------------------------------- 1 | # Extract the numbers from 'New Revision:' lines and average them 2 | import re 3 | fname = raw_input('Enter file:') 4 | hand = open(fname) 5 | nums = list() 6 | for line in hand: 7 | line = line.rstrip() 8 | x = re.findall('New Revision: ([0-9]+)', line) 9 | if len(x) == 1 : 10 | val = float(x[0]) 11 | nums.append(val) 12 | print len(nums) 13 | print sum(nums)/len(nums) 14 | 15 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/romeo-full.txt: -------------------------------------------------------------------------------- 1 | Romeo and Juliet 2 | Act 2, Scene 2 3 | 4 | SCENE II. Capulet's orchard. 5 | 6 | Enter ROMEO 7 | 8 | ROMEO 9 | 10 | He jests at scars that never felt a wound. 11 | JULIET appears above at a window 12 | 13 | But, soft! what light through yonder window breaks? 14 | It is the east, and Juliet is the sun. 15 | Arise, fair sun, and kill the envious moon, 16 | Who is already sick and pale with grief, 17 | That thou her maid art far more fair than she: 18 | Be not her maid, since she is envious; 19 | Her vestal livery is but sick and green 20 | And none but fools do wear it; cast it off. 21 | It is my lady, O, it is my love! 22 | O, that she knew she were! 23 | She speaks yet she says nothing: what of that?
24 | Her eye discourses; I will answer it. 25 | I am too bold, 'tis not to me she speaks: 26 | Two of the fairest stars in all the heaven, 27 | Having some business, do entreat her eyes 28 | To twinkle in their spheres till they return. 29 | What if her eyes were there, they in her head? 30 | The brightness of her cheek would shame those stars, 31 | As daylight doth a lamp; her eyes in heaven 32 | Would through the airy region stream so bright 33 | That birds would sing and think it were not night. 34 | See, how she leans her cheek upon her hand! 35 | O, that I were a glove upon that hand, 36 | That I might touch that cheek! 37 | 38 | JULIET 39 | 40 | Ay me! 41 | 42 | ROMEO 43 | 44 | She speaks: 45 | O, speak again, bright angel! for thou art 46 | As glorious to this night, being o'er my head 47 | As is a winged messenger of heaven 48 | Unto the white-upturned wondering eyes 49 | Of mortals that fall back to gaze on him 50 | When he bestrides the lazy-pacing clouds 51 | And sails upon the bosom of the air. 52 | 53 | JULIET 54 | 55 | O Romeo, Romeo! wherefore art thou Romeo? 56 | Deny thy father and refuse thy name; 57 | Or, if thou wilt not, be but sworn my love, 58 | And I'll no longer be a Capulet. 59 | 60 | ROMEO 61 | 62 | [Aside] Shall I hear more, or shall I speak at this? 63 | 64 | JULIET 65 | 66 | 'Tis but thy name that is my enemy; 67 | Thou art thyself, though not a Montague. 68 | What's Montague? it is nor hand, nor foot, 69 | Nor arm, nor face, nor any other part 70 | Belonging to a man. O, be some other name! 71 | What's in a name? that which we call a rose 72 | By any other name would smell as sweet; 73 | So Romeo would, were he not Romeo call'd, 74 | Retain that dear perfection which he owes 75 | Without that title. Romeo, doff thy name, 76 | And for that name which is no part of thee 77 | Take all myself. 78 | 79 | ROMEO 80 | 81 | I take thee at thy word: 82 | Call me but love, and I'll be new baptized; 83 | Henceforth I never will be Romeo. 84 | 85 | JULIET 86 | 87 | What man art thou that thus bescreen'd in night 88 | So stumblest on my counsel? 89 | 90 | ROMEO 91 | 92 | By a name 93 | I know not how to tell thee who I am: 94 | My name, dear saint, is hateful to myself, 95 | Because it is an enemy to thee; 96 | Had I it written, I would tear the word. 97 | 98 | JULIET 99 | 100 | My ears have not yet drunk a hundred words 101 | Of that tongue's utterance, yet I know the sound: 102 | Art thou not Romeo and a Montague? 103 | 104 | ROMEO 105 | 106 | Neither, fair saint, if either thee dislike. 107 | 108 | JULIET 109 | 110 | How camest thou hither, tell me, and wherefore? 111 | The orchard walls are high and hard to climb, 112 | And the place death, considering who thou art, 113 | If any of my kinsmen find thee here. 114 | 115 | ROMEO 116 | 117 | With love's light wings did I o'er-perch these walls; 118 | For stony limits cannot hold love out, 119 | And what love can do that dares love attempt; 120 | Therefore thy kinsmen are no let to me. 121 | 122 | JULIET 123 | 124 | If they do see thee, they will murder thee. 125 | 126 | ROMEO 127 | 128 | Alack, there lies more peril in thine eye 129 | Than twenty of their swords: look thou but sweet, 130 | And I am proof against their enmity. 131 | 132 | JULIET 133 | 134 | I would not for the world they saw thee here. 
135 | 136 | ROMEO 137 | 138 | I have night's cloak to hide me from their sight; 139 | And but thou love me, let them find me here: 140 | My life were better ended by their hate, 141 | Than death prorogued, wanting of thy love. 142 | 143 | JULIET 144 | 145 | By whose direction found'st thou out this place? 146 | 147 | ROMEO 148 | 149 | By love, who first did prompt me to inquire; 150 | He lent me counsel and I lent him eyes. 151 | I am no pilot; yet, wert thou as far 152 | As that vast shore wash'd with the farthest sea, 153 | I would adventure for such merchandise. 154 | 155 | JULIET 156 | 157 | Thou know'st the mask of night is on my face, 158 | Else would a maiden blush bepaint my cheek 159 | For that which thou hast heard me speak to-night 160 | Fain would I dwell on form, fain, fain deny 161 | What I have spoke: but farewell compliment! 162 | Dost thou love me? I know thou wilt say 'Ay,' 163 | And I will take thy word: yet if thou swear'st, 164 | Thou mayst prove false; at lovers' perjuries 165 | Then say, Jove laughs. O gentle Romeo, 166 | If thou dost love, pronounce it faithfully: 167 | Or if thou think'st I am too quickly won, 168 | I'll frown and be perverse an say thee nay, 169 | So thou wilt woo; but else, not for the world. 170 | In truth, fair Montague, I am too fond, 171 | And therefore thou mayst think my 'havior light: 172 | But trust me, gentleman, I'll prove more true 173 | Than those that have more cunning to be strange. 174 | I should have been more strange, I must confess, 175 | But that thou overheard'st, ere I was ware, 176 | My true love's passion: therefore pardon me, 177 | And not impute this yielding to light love, 178 | Which the dark night hath so discovered. 179 | 180 | ROMEO 181 | 182 | Lady, by yonder blessed moon I swear 183 | That tips with silver all these fruit-tree tops-- 184 | 185 | JULIET 186 | 187 | O, swear not by the moon, the inconstant moon, 188 | That monthly changes in her circled orb, 189 | Lest that thy love prove likewise variable. 190 | 191 | ROMEO 192 | 193 | What shall I swear by? 194 | 195 | JULIET 196 | 197 | Do not swear at all; 198 | Or, if thou wilt, swear by thy gracious self, 199 | Which is the god of my idolatry, 200 | And I'll believe thee. 201 | 202 | ROMEO 203 | 204 | If my heart's dear love-- 205 | 206 | JULIET 207 | 208 | Well, do not swear: although I joy in thee, 209 | I have no joy of this contract to-night: 210 | It is too rash, too unadvised, too sudden; 211 | Too like the lightning, which doth cease to be 212 | Ere one can say 'It lightens.' Sweet, good night! 213 | This bud of love, by summer's ripening breath, 214 | May prove a beauteous flower when next we meet. 215 | Good night, good night! as sweet repose and rest 216 | Come to thy heart as that within my breast! 217 | 218 | ROMEO 219 | 220 | O, wilt thou leave me so unsatisfied? 221 | 222 | JULIET 223 | 224 | What satisfaction canst thou have to-night? 225 | 226 | ROMEO 227 | 228 | The exchange of thy love's faithful vow for mine. 229 | 230 | JULIET 231 | 232 | I gave thee mine before thou didst request it: 233 | And yet I would it were to give again. 234 | 235 | ROMEO 236 | 237 | Wouldst thou withdraw it? for what purpose, love? 238 | 239 | JULIET 240 | 241 | But to be frank, and give it thee again. 242 | And yet I wish but for the thing I have: 243 | My bounty is as boundless as the sea, 244 | My love as deep; the more I give to thee, 245 | The more I have, for both are infinite. 
246 | 247 | Nurse calls within 248 | 249 | I hear some noise within; dear love, adieu! 250 | Anon, good nurse! Sweet Montague, be true. 251 | Stay but a little, I will come again. 252 | Exit, above 253 | 254 | ROMEO 255 | 256 | O blessed, blessed night! I am afeard. 257 | Being in night, all this is but a dream, 258 | Too flattering-sweet to be substantial. 259 | 260 | Re-enter JULIET, above 261 | 262 | JULIET 263 | 264 | Three words, dear Romeo, and good night indeed. 265 | If that thy bent of love be honourable, 266 | Thy purpose marriage, send me word to-morrow, 267 | By one that I'll procure to come to thee, 268 | Where and what time thou wilt perform the rite; 269 | And all my fortunes at thy foot I'll lay 270 | And follow thee my lord throughout the world. 271 | 272 | Nurse 273 | 274 | [Within] Madam! 275 | 276 | JULIET 277 | 278 | I come, anon.--But if thou mean'st not well, 279 | I do beseech thee-- 280 | 281 | Nurse 282 | [Within] Madam! 283 | 284 | JULIET 285 | 286 | By and by, I come:-- 287 | To cease thy suit, and leave me to my grief: 288 | To-morrow will I send. 289 | 290 | ROMEO 291 | 292 | So thrive my soul-- 293 | 294 | JULIET 295 | 296 | A thousand times good night! 297 | Exit, above 298 | 299 | ROMEO 300 | 301 | A thousand times the worse, to want thy light. 302 | Love goes toward love, as schoolboys from 303 | their books, 304 | But love from love, toward school with heavy looks. 305 | Retiring 306 | 307 | Re-enter JULIET, above 308 | 309 | JULIET 310 | 311 | Hist! Romeo, hist! O, for a falconer's voice, 312 | To lure this tassel-gentle back again! 313 | Bondage is hoarse, and may not speak aloud; 314 | Else would I tear the cave where Echo lies, 315 | And make her airy tongue more hoarse than mine, 316 | With repetition of my Romeo's name. 317 | 318 | ROMEO 319 | 320 | It is my soul that calls upon my name: 321 | How silver-sweet sound lovers' tongues by night, 322 | Like softest music to attending ears! 323 | 324 | JULIET 325 | 326 | Romeo! 327 | 328 | ROMEO 329 | 330 | My dear? 331 | 332 | JULIET 333 | 334 | At what o'clock to-morrow 335 | Shall I send to thee? 336 | 337 | ROMEO 338 | 339 | At the hour of nine. 340 | 341 | JULIET 342 | 343 | I will not fail: 'tis twenty years till then. 344 | I have forgot why I did call thee back. 345 | 346 | ROMEO 347 | 348 | Let me stand here till thou remember it. 349 | 350 | JULIET 351 | 352 | I shall forget, to have thee still stand there, 353 | Remembering how I love thy company. 354 | 355 | ROMEO 356 | 357 | And I'll still stay, to have thee still forget, 358 | Forgetting any other home but this. 359 | 360 | JULIET 361 | 362 | 'Tis almost morning; I would have thee gone: 363 | And yet no further than a wanton's bird; 364 | Who lets it hop a little from her hand, 365 | Like a poor prisoner in his twisted gyves, 366 | And with a silk thread plucks it back again, 367 | So loving-jealous of his liberty. 368 | 369 | ROMEO 370 | 371 | I would I were thy bird. 372 | 373 | JULIET 374 | 375 | Sweet, so would I: 376 | Yet I should kill thee with much cherishing. 377 | Good night, good night! parting is such 378 | sweet sorrow, 379 | That I shall say good night till it be morrow. 380 | 381 | Exit above 382 | 383 | ROMEO 384 | 385 | Sleep dwell upon thine eyes, peace in thy breast! 386 | Would I were sleep and peace, so sweet to rest! 387 | Hence will I to my ghostly father's cell, 388 | His help to crave, and my dear hap to tell. 
389 | 390 | Exit 391 | -------------------------------------------------------------------------------- /EXAMPLE CODE/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('rosterdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Do some setup 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS User; 10 | DROP TABLE IF EXISTS Member; 11 | DROP TABLE IF EXISTS Course; 12 | 13 | CREATE TABLE User ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Course ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | title TEXT UNIQUE 21 | ); 22 | 23 | CREATE TABLE Member ( 24 | user_id INTEGER, 25 | course_id INTEGER, 26 | role INTEGER, 27 | PRIMARY KEY (user_id, course_id) 28 | ) 29 | ''') 30 | 31 | fname = raw_input('Enter file name: ') 32 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 33 | 34 | # [ 35 | # [ "Charley", "si110", 1 ], 36 | # [ "Mea", "si110", 0 ], 37 | 38 | str_data = open(fname).read() 39 | json_data = json.loads(str_data) 40 | 41 | for entry in json_data: 42 | 43 | name = entry[0]; 44 | title = entry[1]; 45 | 46 | print name, title 47 | 48 | cur.execute('''INSERT OR IGNORE INTO User (name) 49 | VALUES ( ? )''', ( name, ) ) 50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 51 | user_id = cur.fetchone()[0] 52 | 53 | cur.execute('''INSERT OR IGNORE INTO Course (title) 54 | VALUES ( ? )''', ( title, ) ) 55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 56 | course_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR REPLACE INTO Member 59 | (user_id, course_id) VALUES ( ?, ? 
)''', 60 | ( user_id, course_id ) ) 61 | 62 | conn.commit() 63 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/roster.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/roster/roster.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('rosterdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Do some setup 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS User; 10 | DROP TABLE IF EXISTS Member; 11 | DROP TABLE IF EXISTS Course; 12 | 13 | CREATE TABLE User ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Course ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | title TEXT UNIQUE 21 | ); 22 | 23 | CREATE TABLE Member ( 24 | user_id INTEGER, 25 | course_id INTEGER, 26 | role INTEGER, 27 | PRIMARY KEY (user_id, course_id) 28 | ) 29 | ''') 30 | 31 | fname = raw_input('Enter file name: ') 32 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 33 | 34 | # [ 35 | # [ "Charley", "si110", 1 ], 36 | # [ "Mea", "si110", 0 ], 37 | 38 | str_data = open(fname).read() 39 | json_data = json.loads(str_data) 40 | 41 | for entry in json_data: 42 | 43 | name = entry[0]; 44 | title = entry[1]; 45 | 46 | print name, title 47 | 48 | cur.execute('''INSERT OR IGNORE INTO User (name) 49 | VALUES ( ? )''', ( name, ) ) 50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 51 | user_id = cur.fetchone()[0] 52 | 53 | cur.execute('''INSERT OR IGNORE INTO Course (title) 54 | VALUES ( ? )''', ( title, ) ) 55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 56 | course_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR REPLACE INTO Member 59 | (user_id, course_id) VALUES ( ?, ? 
)''', 60 | ( user_id, course_id ) ) 61 | 62 | conn.commit() 63 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster/roster_data.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "Charley", 4 | "si110", 5 | 1 6 | ], 7 | [ 8 | "Mea", 9 | "si110", 10 | 0 11 | ], 12 | [ 13 | "Hattie", 14 | "si110", 15 | 0 16 | ], 17 | [ 18 | "Lyena", 19 | "si110", 20 | 0 21 | ], 22 | [ 23 | "Keziah", 24 | "si110", 25 | 0 26 | ], 27 | [ 28 | "Ellyce", 29 | "si110", 30 | 0 31 | ], 32 | [ 33 | "Thalia", 34 | "si110", 35 | 0 36 | ], 37 | [ 38 | "Meabh", 39 | "si110", 40 | 0 41 | ], 42 | [ 43 | "Aria", 44 | "si110", 45 | 0 46 | ], 47 | [ 48 | "Reena", 49 | "si110", 50 | 0 51 | ], 52 | [ 53 | "Ioannis", 54 | "si110", 55 | 0 56 | ], 57 | [ 58 | "Reily", 59 | "si110", 60 | 0 61 | ], 62 | [ 63 | "Sidharth", 64 | "si110", 65 | 0 66 | ], 67 | [ 68 | "Keiara", 69 | "si110", 70 | 0 71 | ], 72 | [ 73 | "Yann", 74 | "si110", 75 | 0 76 | ], 77 | [ 78 | "Marykate", 79 | "si110", 80 | 0 81 | ], 82 | [ 83 | "Dylan", 84 | "si110", 85 | 0 86 | ], 87 | [ 88 | "Kiran", 89 | "si110", 90 | 0 91 | ], 92 | [ 93 | "Faizaan", 94 | "si110", 95 | 0 96 | ], 97 | [ 98 | "Aneshia", 99 | "si110", 100 | 0 101 | ], 102 | [ 103 | "Kamron", 104 | "si110", 105 | 0 106 | ], 107 | [ 108 | "Allen", 109 | "si110", 110 | 0 111 | ], 112 | [ 113 | "Marshall", 114 | "si110", 115 | 0 116 | ], 117 | [ 118 | "Rosa", 119 | "si106", 120 | 1 121 | ], 122 | [ 123 | "Nora", 124 | "si106", 125 | 0 126 | ], 127 | [ 128 | "Mairin", 129 | "si106", 130 | 0 131 | ], 132 | [ 133 | "Zendel", 134 | "si106", 135 | 0 136 | ], 137 | [ 138 | "Honie", 139 | "si106", 140 | 0 141 | ], 142 | [ 143 | "Betsy", 144 | "si106", 145 | 0 146 | ], 147 | [ 148 | "Davie", 149 | "si106", 150 | 0 151 | ], 152 | [ 153 | "Larissa", 154 | "si106", 155 | 0 156 | ], 157 | [ 158 | "Shaurya", 159 | "si106", 160 | 0 161 | ], 162 | [ 163 | "Shania", 164 | "si106", 165 | 0 166 | ], 167 | [ 168 | "Sorcha", 169 | "si106", 170 | 0 171 | ], 172 | [ 173 | "Jeanna", 174 | "si106", 175 | 0 176 | ], 177 | [ 178 | "Temba", 179 | "si106", 180 | 0 181 | ], 182 | [ 183 | "Buse", 184 | "si106", 185 | 0 186 | ], 187 | [ 188 | "Mohammed", 189 | "si106", 190 | 0 191 | ], 192 | [ 193 | "Kayah", 194 | "si106", 195 | 0 196 | ], 197 | [ 198 | "Kareena", 199 | "si106", 200 | 0 201 | ], 202 | [ 203 | "Dineo", 204 | "si106", 205 | 0 206 | ], 207 | [ 208 | "Philippa", 209 | "si106", 210 | 0 211 | ], 212 | [ 213 | "Lia", 214 | "si206", 215 | 1 216 | ], 217 | [ 218 | "Sharlyn", 219 | "si206", 220 | 0 221 | ], 222 | [ 223 | "Linton", 224 | "si206", 225 | 0 226 | ], 227 | [ 228 | "Temilade", 229 | "si206", 230 | 0 231 | ], 232 | [ 233 | "Areez", 234 | "si206", 235 | 0 236 | ], 237 | [ 238 | "MacCartney", 239 | "si206", 240 | 0 241 | ], 242 | [ 243 | "Abubakar", 244 | "si206", 245 | 0 246 | ], 247 | [ 248 | "Derryn", 249 | "si206", 250 | 0 251 | ], 252 | [ 253 | "Elan", 254 | "si206", 255 | 0 256 | ], 257 | [ 258 | "Vikki", 259 | "si206", 260 | 0 261 | ], 262 | [ 263 | "Anisa", 264 | "si206", 265 | 0 266 | ], 267 | [ 268 | "Klevis", 269 | "si206", 270 | 0 271 | ], 272 | [ 273 | "Tait", 274 | "si206", 275 | 0 276 | ], 277 | [ 278 | "Rhea", 279 | "si206", 280 | 0 281 | ], 282 | [ 283 | "Pearsen", 284 | "si206", 285 | 0 286 | ], 287 | [ 288 | "Willow", 289 | "si206", 290 | 0 291 | ], 292 | [ 293 | "Skye", 294 | "si206", 295 | 0 296 | ], 297 | [ 298 | "Caralee", 299 | "si206", 300 | 0 301 | ], 302 | [ 303 | "Charlee", 304 | "si206", 
305 | 0 306 | ], 307 | [ 308 | "Karyn", 309 | "si206", 310 | 0 311 | ], 312 | [ 313 | "Elana", 314 | "si206", 315 | 0 316 | ], 317 | [ 318 | "Maggie", 319 | "si206", 320 | 0 321 | ], 322 | [ 323 | "Eryk", 324 | "si206", 325 | 0 326 | ], 327 | [ 328 | "Zulaikha", 329 | "si301", 330 | 1 331 | ], 332 | [ 333 | "Elshan", 334 | "si301", 335 | 0 336 | ], 337 | [ 338 | "Anastasia", 339 | "si301", 340 | 0 341 | ], 342 | [ 343 | "Connar", 344 | "si301", 345 | 0 346 | ], 347 | [ 348 | "Anay", 349 | "si301", 350 | 0 351 | ], 352 | [ 353 | "Jayla", 354 | "si301", 355 | 0 356 | ], 357 | [ 358 | "Cai", 359 | "si301", 360 | 0 361 | ], 362 | [ 363 | "Zijie", 364 | "si301", 365 | 0 366 | ], 367 | [ 368 | "Riana", 369 | "si301", 370 | 0 371 | ], 372 | [ 373 | "Codie", 374 | "si301", 375 | 0 376 | ], 377 | [ 378 | "Colette", 379 | "si301", 380 | 0 381 | ], 382 | [ 383 | "Lucee", 384 | "si301", 385 | 0 386 | ], 387 | [ 388 | "Tatiana", 389 | "si301", 390 | 0 391 | ], 392 | [ 393 | "Zhong", 394 | "si301", 395 | 0 396 | ], 397 | [ 398 | "Lowri", 399 | "si301", 400 | 0 401 | ], 402 | [ 403 | "Maggy", 404 | "si301", 405 | 0 406 | ], 407 | [ 408 | "Basher", 409 | "si301", 410 | 0 411 | ], 412 | [ 413 | "Tanika", 414 | "si301", 415 | 0 416 | ], 417 | [ 418 | "Aria", 419 | "si301", 420 | 0 421 | ], 422 | [ 423 | "Belle", 424 | "si301", 425 | 0 426 | ], 427 | [ 428 | "Laranya", 429 | "si301", 430 | 0 431 | ], 432 | [ 433 | "Dayna", 434 | "si301", 435 | 0 436 | ], 437 | [ 438 | "Elleanne", 439 | "si301", 440 | 0 441 | ], 442 | [ 443 | "Maanav", 444 | "si310", 445 | 1 446 | ], 447 | [ 448 | "Tamta", 449 | "si310", 450 | 0 451 | ], 452 | [ 453 | "Frazer", 454 | "si310", 455 | 0 456 | ], 457 | [ 458 | "Sacha", 459 | "si310", 460 | 0 461 | ], 462 | [ 463 | "Aidan", 464 | "si310", 465 | 0 466 | ], 467 | [ 468 | "Abel", 469 | "si310", 470 | 0 471 | ], 472 | [ 473 | "Ahtasham", 474 | "si310", 475 | 0 476 | ], 477 | [ 478 | "Avinash", 479 | "si310", 480 | 0 481 | ], 482 | [ 483 | "Colette", 484 | "si310", 485 | 0 486 | ], 487 | [ 488 | "Cohen", 489 | "si310", 490 | 0 491 | ], 492 | [ 493 | "Rori", 494 | "si310", 495 | 0 496 | ], 497 | [ 498 | "Youer", 499 | "si310", 500 | 0 501 | ], 502 | [ 503 | "Jamey", 504 | "si310", 505 | 0 506 | ], 507 | [ 508 | "Makenzie", 509 | "si310", 510 | 0 511 | ], 512 | [ 513 | "Ida", 514 | "si310", 515 | 0 516 | ], 517 | [ 518 | "Alexzander", 519 | "si310", 520 | 0 521 | ], 522 | [ 523 | "Kavita", 524 | "si310", 525 | 0 526 | ], 527 | [ 528 | "Talia", 529 | "si310", 530 | 0 531 | ], 532 | [ 533 | "Anthony", 534 | "si310", 535 | 0 536 | ], 537 | [ 538 | "Elona", 539 | "si334", 540 | 1 541 | ], 542 | [ 543 | "Inan", 544 | "si334", 545 | 0 546 | ], 547 | [ 548 | "Caoilainn", 549 | "si334", 550 | 0 551 | ], 552 | [ 553 | "Ainsley", 554 | "si334", 555 | 0 556 | ], 557 | [ 558 | "Franciszek", 559 | "si334", 560 | 0 561 | ], 562 | [ 563 | "Corrie", 564 | "si334", 565 | 0 566 | ], 567 | [ 568 | "Nolan", 569 | "si334", 570 | 0 571 | ], 572 | [ 573 | "Makala", 574 | "si334", 575 | 0 576 | ], 577 | [ 578 | "Obieluem", 579 | "si334", 580 | 0 581 | ], 582 | [ 583 | "Camryn", 584 | "si334", 585 | 0 586 | ], 587 | [ 588 | "Honie", 589 | "si334", 590 | 0 591 | ], 592 | [ 593 | "Ole", 594 | "si334", 595 | 0 596 | ], 597 | [ 598 | "Raine", 599 | "si334", 600 | 0 601 | ], 602 | [ 603 | "Tyllor", 604 | "si334", 605 | 0 606 | ], 607 | [ 608 | "Diane", 609 | "si334", 610 | 0 611 | ], 612 | [ 613 | "Cullen", 614 | "si334", 615 | 0 616 | ], 617 | [ 618 | "Taylor", 619 | "si334", 620 | 0 621 | ], 622 | [ 623 | 
"Schekina", 624 | "si334", 625 | 0 626 | ], 627 | [ 628 | "Kensey", 629 | "si334", 630 | 0 631 | ], 632 | [ 633 | "Zhi", 634 | "si334", 635 | 0 636 | ], 637 | [ 638 | "Kiran", 639 | "si334", 640 | 0 641 | ], 642 | [ 643 | "Tymoteusz", 644 | "si334", 645 | 0 646 | ], 647 | [ 648 | "Windsor", 649 | "si363", 650 | 1 651 | ], 652 | [ 653 | "Kashish", 654 | "si363", 655 | 0 656 | ], 657 | [ 658 | "Diarmid", 659 | "si363", 660 | 0 661 | ], 662 | [ 663 | "Laura", 664 | "si363", 665 | 0 666 | ], 667 | [ 668 | "Jaskaran", 669 | "si363", 670 | 0 671 | ], 672 | [ 673 | "Presley", 674 | "si363", 675 | 0 676 | ], 677 | [ 678 | "Brooklynn", 679 | "si363", 680 | 0 681 | ], 682 | [ 683 | "Heddle", 684 | "si363", 685 | 0 686 | ], 687 | [ 688 | "Travis", 689 | "si363", 690 | 0 691 | ], 692 | [ 693 | "Alx", 694 | "si363", 695 | 0 696 | ], 697 | [ 698 | "Airen", 699 | "si363", 700 | 0 701 | ], 702 | [ 703 | "Erika", 704 | "si363", 705 | 0 706 | ], 707 | [ 708 | "Mackie", 709 | "si363", 710 | 0 711 | ], 712 | [ 713 | "Wen", 714 | "si363", 715 | 0 716 | ], 717 | [ 718 | "Seaan", 719 | "si363", 720 | 0 721 | ], 722 | [ 723 | "Meghan", 724 | "si363", 725 | 0 726 | ], 727 | [ 728 | "Ryaan", 729 | "si363", 730 | 0 731 | ], 732 | [ 733 | "Imogem", 734 | "si364", 735 | 1 736 | ], 737 | [ 738 | "Harlie", 739 | "si364", 740 | 0 741 | ], 742 | [ 743 | "Ronnie", 744 | "si364", 745 | 0 746 | ], 747 | [ 748 | "Lucca", 749 | "si364", 750 | 0 751 | ], 752 | [ 753 | "Shanelle", 754 | "si364", 755 | 0 756 | ], 757 | [ 758 | "Ieuan", 759 | "si364", 760 | 0 761 | ], 762 | [ 763 | "Anneliese", 764 | "si364", 765 | 0 766 | ], 767 | [ 768 | "Simon", 769 | "si364", 770 | 0 771 | ], 772 | [ 773 | "Sorche", 774 | "si364", 775 | 0 776 | ], 777 | [ 778 | "Nawal", 779 | "si364", 780 | 0 781 | ], 782 | [ 783 | "Adelaide", 784 | "si364", 785 | 0 786 | ], 787 | [ 788 | "Rhia", 789 | "si364", 790 | 0 791 | ], 792 | [ 793 | "Katarzyna", 794 | "si364", 795 | 0 796 | ], 797 | [ 798 | "LLeyton", 799 | "si364", 800 | 0 801 | ], 802 | [ 803 | "Enzo", 804 | "si364", 805 | 0 806 | ], 807 | [ 808 | "Declan", 809 | "si364", 810 | 0 811 | ], 812 | [ 813 | "Emelie", 814 | "si364", 815 | 0 816 | ], 817 | [ 818 | "Baillie", 819 | "si364", 820 | 0 821 | ], 822 | [ 823 | "Shola", 824 | "si364", 825 | 0 826 | ], 827 | [ 828 | "Jenna", 829 | "si422", 830 | 1 831 | ], 832 | [ 833 | "Miles", 834 | "si422", 835 | 0 836 | ], 837 | [ 838 | "Sakina", 839 | "si422", 840 | 0 841 | ], 842 | [ 843 | "Melanie", 844 | "si422", 845 | 0 846 | ], 847 | [ 848 | "Bailie", 849 | "si422", 850 | 0 851 | ], 852 | [ 853 | "Cassy", 854 | "si422", 855 | 0 856 | ], 857 | [ 858 | "Nikash", 859 | "si422", 860 | 0 861 | ], 862 | [ 863 | "Hebe", 864 | "si422", 865 | 0 866 | ], 867 | [ 868 | "Sia", 869 | "si422", 870 | 0 871 | ], 872 | [ 873 | "Skyla", 874 | "si422", 875 | 0 876 | ], 877 | [ 878 | "Jamaal", 879 | "si422", 880 | 0 881 | ], 882 | [ 883 | "Keanna", 884 | "si422", 885 | 0 886 | ], 887 | [ 888 | "Vanya", 889 | "si422", 890 | 0 891 | ], 892 | [ 893 | "Temperance", 894 | "si422", 895 | 0 896 | ], 897 | [ 898 | "Hafiza", 899 | "si422", 900 | 0 901 | ], 902 | [ 903 | "Alx", 904 | "si422", 905 | 0 906 | ], 907 | [ 908 | "Brigitte", 909 | "si422", 910 | 0 911 | ], 912 | [ 913 | "Eliana", 914 | "si422", 915 | 0 916 | ], 917 | [ 918 | "Kayden", 919 | "si422", 920 | 0 921 | ], 922 | [ 923 | "Man", 924 | "si422", 925 | 0 926 | ], 927 | [ 928 | "Jaydyn", 929 | "si422", 930 | 0 931 | ], 932 | [ 933 | "Soukina", 934 | "si430", 935 | 1 936 | ], 937 | [ 938 | "Stephenjunior", 939 | "si430", 
940 | 0 941 | ], 942 | [ 943 | "Buddy", 944 | "si430", 945 | 0 946 | ], 947 | [ 948 | "Holly", 949 | "si430", 950 | 0 951 | ], 952 | [ 953 | "Kamilia", 954 | "si430", 955 | 0 956 | ], 957 | [ 958 | "Cassie", 959 | "si430", 960 | 0 961 | ], 962 | [ 963 | "Kris", 964 | "si430", 965 | 0 966 | ], 967 | [ 968 | "Maia", 969 | "si430", 970 | 0 971 | ], 972 | [ 973 | "Abel", 974 | "si430", 975 | 0 976 | ], 977 | [ 978 | "Tamika", 979 | "si430", 980 | 0 981 | ], 982 | [ 983 | "Deano", 984 | "si430", 985 | 0 986 | ], 987 | [ 988 | "Rosa", 989 | "si430", 990 | 0 991 | ], 992 | [ 993 | "Georgia", 994 | "si430", 995 | 0 996 | ], 997 | [ 998 | "Louie", 999 | "si430", 1000 | 0 1001 | ], 1002 | [ 1003 | "Kassie", 1004 | "si430", 1005 | 0 1006 | ], 1007 | [ 1008 | "Mutinta", 1009 | "si430", 1010 | 0 1011 | ], 1012 | [ 1013 | "Manwen", 1014 | "si430", 1015 | 0 1016 | ] 1017 | ] -------------------------------------------------------------------------------- /EXAMPLE CODE/search1.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | if line.startswith('From:') : 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search10.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | words = line.split() 4 | # print 'Debug:', words 5 | if len(words) == 0 : continue 6 | if words[0] != 'From' : continue 7 | print words[2] 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search2.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if line.startswith('From:') : 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search3.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | # Skip 'uninteresting lines' 5 | if not line.startswith('From:') : 6 | continue 7 | # Process our 'interesting' line 8 | print line 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search4.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if line.find('@uct.ac.za') == -1 : continue 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search5.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if not line.startswith('From ') : continue 5 | words = line.split() 6 | print words[2] 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search6.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | fhand = open(fname) 3 | count = 0 4 | for line in fhand: 5 | if line.startswith('Subject:') : 6 | count = count + 1 7 | print 'There were', count, 'subject lines in', fname 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search7.py: 
-------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | count = 0 8 | for line in fhand: 9 | if line.startswith('Subject:') : 10 | count = count + 1 11 | print 'There were', count, 'subject lines in', fname 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search8.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | words = line.split() 5 | if words[0] != 'From' : continue 6 | print words[2] 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search9.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | words = line.split() 5 | print 'Debug:', words 6 | if words[0] != 'From' : continue 7 | print words[2] 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/sequence.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter a Number:') 2 | n = int(inp) 3 | while n != 1: 4 | print n, # Use comma to suppress newline 5 | if n%2 == 0: # n is even 6 | n = n/2 7 | else: # n is odd 8 | n = n*3+1 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/socket1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 4 | mysock.connect(('www.py4inf.com', 80)) 5 | mysock.send('GET http://www.py4inf.com/code/romeo.txt HTTP/1.0\n\n') 6 | 7 | while True: 8 | data = mysock.recv(512) 9 | if ( len(data) < 1 ) : 10 | break 11 | print data; 12 | 13 | mysock.close() 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/socket2.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | url = raw_input('Enter: ') 4 | words = url.split('/') 5 | host = words[2] 6 | 7 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 8 | mysock.connect((host, 80)) 9 | mysock.send('GET '+url+' HTTP/1.0\n\n') 10 | 11 | while True: 12 | data = mysock.recv(512) 13 | if ( len(data) < 1 ) : 14 | break 15 | print data, 16 | 17 | mysock.close() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/soft.py: -------------------------------------------------------------------------------- 1 | txt = 'but soft what light in yonder window breaks' 2 | words = txt.split() 3 | t = list() 4 | for word in words: 5 | t.append((len(word), word)) 6 | 7 | t.sort(reverse=True) 8 | 9 | res = list() 10 | for length, word in t: 11 | res.append(word) 12 | 13 | print res 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/spamave.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | count = 0 8 | total = 0 9 | for line in fhand: 10 | words = line.split() 11 | if len(words) != 2 : continue 12 | if words[0] != 'X-DSPAM-Confidence:' : continue 13 | try: 14 
| conf = float(words[1]) 15 | except: 16 | continue 17 | count = count + 1 18 | total = total + conf 19 | average = total / count 20 | print 'Average spam confidence:', average 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/tracks.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks/README.txt: -------------------------------------------------------------------------------- 1 | TBD 2 | 3 | -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks/tracks.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('trackdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Make some fresh tables using executescript() 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS Artist; 10 | DROP TABLE IF EXISTS Album; 11 | DROP TABLE IF EXISTS Track; 12 | 13 | CREATE TABLE Artist ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Album ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | artist_id INTEGER, 21 | title TEXT UNIQUE 22 | ); 23 | 24 | CREATE TABLE Track ( 25 | id INTEGER NOT NULL PRIMARY KEY 26 | AUTOINCREMENT UNIQUE, 27 | title TEXT UNIQUE, 28 | album_id INTEGER, 29 | len INTEGER, rating INTEGER, count INTEGER 30 | ); 31 | ''') 32 | 33 | 34 | fname = raw_input('Enter file name: ') 35 | if ( len(fname) < 1 ) : fname = 'Library.xml' 36 | 37 | # Track ID369 38 | # NameAnother One Bites The Dust 39 | # ArtistQueen 40 | def lookup(d, key): 41 | found = False 42 | for child in d: 43 | if found : return child.text 44 | if child.tag == 'key' and child.text == key : 45 | found = True 46 | return None 47 | 48 | stuff = ET.parse(fname) 49 | all = stuff.findall('dict/dict/dict') 50 | print 'Dict count:', len(all) 51 | for entry in all: 52 | if ( lookup(entry, 'Track ID') is None ) : continue 53 | 54 | name = lookup(entry, 'Name') 55 | artist = lookup(entry, 'Artist') 56 | album = lookup(entry, 'Album') 57 | count = lookup(entry, 'Play Count') 58 | rating = lookup(entry, 'Rating') 59 | length = lookup(entry, 'Total Time') 60 | 61 | if name is None or artist is None or album is None : 62 | continue 63 | 64 | print name, artist, album, count, rating, length 65 | 66 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 67 | VALUES ( ? )''', ( artist, ) ) 68 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 69 | artist_id = cur.fetchone()[0] 70 | 71 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 72 | VALUES ( ?, ? )''', ( album, artist_id ) ) 73 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 74 | album_id = cur.fetchone()[0] 75 | 76 | cur.execute('''INSERT OR REPLACE INTO Track 77 | (title, album_id, len, rating, count) 78 | VALUES ( ?, ?, ?, ?, ? 
)''', 79 | ( name, album_id, length, rating, count ) ) 80 | 81 | conn.commit() 82 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | cur.execute('SELECT * FROM Twitter') 6 | count = 0 7 | for row in cur : 8 | print row 9 | count = count + 1 10 | print count, 'rows.' 11 | cur.close() 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twfriends.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | import sqlite3 5 | 6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 7 | 8 | conn = sqlite3.connect('friends.sqlite') 9 | cur = conn.cursor() 10 | 11 | cur.execute('''CREATE TABLE IF NOT EXISTS People 12 | (id INTEGER PRIMARY KEY, name TEXT UNIQUE, retrieved INTEGER)''') 13 | cur.execute('''CREATE TABLE IF NOT EXISTS Follows 14 | (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''') 15 | 16 | while True: 17 | acct = raw_input('Enter a Twitter account, or quit: ') 18 | if ( acct == 'quit' ) : break 19 | if ( len(acct) < 1 ) : 20 | cur.execute('SELECT id, name FROM People WHERE retrieved = 0 LIMIT 1') 21 | try: 22 | (id, acct) = cur.fetchone() 23 | except: 24 | print 'No unretrieved Twitter accounts found' 25 | continue 26 | else: 27 | cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1', 28 | (acct, ) ) 29 | try: 30 | id = cur.fetchone()[0] 31 | except: 32 | cur.execute('INSERT OR IGNORE INTO People (name, retrieved) VALUES ( ?, 0)', 33 | ( acct, ) ) 34 | conn.commit() 35 | if cur.rowcount != 1 : 36 | print 'Error inserting account:',acct 37 | continue 38 | id = cur.lastrowid 39 | 40 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} ) 41 | print 'Retrieving account', acct 42 | connection = urllib.urlopen(url) 43 | data = connection.read() 44 | headers = connection.info().dict 45 | print 'Remaining', headers['x-rate-limit-remaining'] 46 | 47 | js = json.loads(data) 48 | # print json.dumps(js, indent=4) 49 | 50 | cur.execute('UPDATE People SET retrieved=1 WHERE name = ?', (acct, ) ) 51 | 52 | countnew = 0 53 | countold = 0 54 | for u in js['users'] : 55 | friend = u['screen_name'] 56 | print friend 57 | cur.execute('SELECT id FROM People WHERE name = ? 
LIMIT 1', 58 | (friend, ) ) 59 | try: 60 | friend_id = cur.fetchone()[0] 61 | countold = countold + 1 62 | except: 63 | cur.execute('''INSERT OR IGNORE INTO People (name, retrieved) 64 | VALUES ( ?, 0)''', ( friend, ) ) 65 | conn.commit() 66 | if cur.rowcount != 1 : 67 | print 'Error inserting account:',friend 68 | continue 69 | friend_id = cur.lastrowid 70 | countnew = countnew + 1 71 | cur.execute('INSERT OR IGNORE INTO Follows (from_id, to_id) VALUES (?, ?)', 72 | (id, friend_id) ) 73 | print 'New accounts=',countnew,' revisited=',countold 74 | conn.commit() 75 | 76 | cur.close() 77 | 78 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twitter1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | 4 | TWITTER_URL = 'https://api.twitter.com/1.1/statuses/user_timeline.json' 5 | 6 | while True: 7 | print '' 8 | acct = raw_input('Enter Twitter Account:') 9 | if ( len(acct) < 1 ) : break 10 | url = twurl.augment(TWITTER_URL, 11 | {'screen_name': acct, 'count': '2'} ) 12 | print 'Retrieving', url 13 | connection = urllib.urlopen(url) 14 | data = connection.read() 15 | print data[:250] 16 | headers = connection.info().dict 17 | # print headers 18 | print 'Remaining', headers['x-rate-limit-remaining'] 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twitter2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | 5 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 6 | 7 | while True: 8 | print '' 9 | acct = raw_input('Enter Twitter Account:') 10 | if ( len(acct) < 1 ) : break 11 | url = twurl.augment(TWITTER_URL, 12 | {'screen_name': acct, 'count': '5'} ) 13 | print 'Retrieving', url 14 | connection = urllib.urlopen(url) 15 | data = connection.read() 16 | headers = connection.info().dict 17 | print 'Remaining', headers['x-rate-limit-remaining'] 18 | js = json.loads(data) 19 | print json.dumps(js, indent=4) 20 | 21 | for u in js['users'] : 22 | print u['screen_name'] 23 | s = u['status']['text'] 24 | print ' ',s[:50] 25 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twjoin.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('friends.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('SELECT * FROM People') 7 | count = 0 8 | print 'People:' 9 | for row in cur : 10 | if count < 5: print row 11 | count = count + 1 12 | print count, 'rows.' 13 | 14 | cur.execute('SELECT * FROM Follows') 15 | count = 0 16 | print 'Follows:' 17 | for row in cur : 18 | if count < 5: print row 19 | count = count + 1 20 | print count, 'rows.' 21 | 22 | cur.execute('''SELECT * FROM Follows JOIN People 23 | ON Follows.to_id = People.id WHERE Follows.from_id = 2''') 24 | count = 0 25 | print 'Connections for id=2:' 26 | for row in cur : 27 | if count < 5: print row 28 | count = count + 1 29 | print count, 'rows.' 
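# A minimal aside on the JOIN just above (a sketch, not a query from the
# course files): each Follows row is paired with the People row whose id
# equals Follows.to_id, so every printed row mixes columns from both
# tables. The same join can feed an aggregate instead of a listing, e.g.:
#
#   cur.execute('''SELECT COUNT(*) FROM Follows JOIN People
#       ON Follows.to_id = People.id WHERE Follows.from_id = 2''')
#   print 'Joined rows for id=2:', cur.fetchone()[0]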
30 | 31 | cur.close() 32 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twspider.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | import sqlite3 5 | 6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 7 | 8 | conn = sqlite3.connect('spider.sqlite') 9 | cur = conn.cursor() 10 | 11 | cur.execute(''' 12 | CREATE TABLE IF NOT EXISTS Twitter (name TEXT, retrieved INTEGER, friends INTEGER)''') 13 | 14 | while True: 15 | acct = raw_input('Enter a Twitter account, or quit: ') 16 | if ( acct == 'quit' ) : break 17 | if ( len(acct) < 1 ) : 18 | cur.execute('SELECT name FROM Twitter WHERE retrieved = 0 LIMIT 1') 19 | try: 20 | acct = cur.fetchone()[0] 21 | except: 22 | print 'No unretrieved Twitter accounts found' 23 | continue 24 | 25 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} ) 26 | print 'Retrieving', url 27 | connection = urllib.urlopen(url) 28 | data = connection.read() 29 | headers = connection.info().dict 30 | print 'Remaining', headers['x-rate-limit-remaining'] 31 | js = json.loads(data) 32 | # print json.dumps(js, indent=4) 33 | 34 | cur.execute('UPDATE Twitter SET retrieved=1 WHERE name = ?', (acct, ) ) 35 | 36 | countnew = 0 37 | countold = 0 38 | for u in js['users'] : 39 | friend = u['screen_name'] 40 | print friend 41 | cur.execute('SELECT friends FROM Twitter WHERE name = ? LIMIT 1', 42 | (friend, ) ) 43 | try: 44 | count = cur.fetchone()[0] 45 | cur.execute('UPDATE Twitter SET friends = ? WHERE name = ?', 46 | (count+1, friend) ) 47 | countold = countold + 1 48 | except: 49 | cur.execute('''INSERT INTO Twitter (name, retrieved, friends) 50 | VALUES ( ?, 0, 1 )''', ( friend, ) ) 51 | countnew = countnew + 1 52 | print 'New accounts=',countnew,' revisited=',countold 53 | conn.commit() 54 | 55 | cur.close() 56 | 57 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twtest.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from twurl import augment 3 | 4 | print '* Calling Twitter...' 5 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json', 6 | {'screen_name': 'drchuck', 'count': '2'} ) 7 | print url 8 | connection = urllib.urlopen(url) 9 | data = connection.read() 10 | print data 11 | headers = connection.info().dict 12 | print headers 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twurl.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import oauth 3 | import hidden 4 | 5 | def augment(url, parameters) : 6 | secrets = hidden.oauth() 7 | consumer = oauth.OAuthConsumer(secrets['consumer_key'], secrets['consumer_secret']) 8 | token = oauth.OAuthToken(secrets['token_key'],secrets['token_secret']) 9 | 10 | oauth_request = oauth.OAuthRequest.from_consumer_and_token(consumer, 11 | token=token, http_method='GET', http_url=url, parameters=parameters) 12 | oauth_request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, token) 13 | return oauth_request.to_url() 14 | 15 | 16 | def test_me() : 17 | print '* Calling Twitter...' 
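    # What augment() does for the call below (see the function above): it
    # wraps the bare URL and query parameters in an OAuth 1.0a request,
    # signs it with HMAC-SHA1 using the consumer key/secret and access
    # token loaded from hidden.py, and returns the URL with the oauth_*
    # parameters (nonce, timestamp, signature, ...) appended. Twitter
    # rejects the request unless that signature checks out.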
18 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json', 19 | {'screen_name': 'drchuck', 'count': '2'} ) 20 | print url 21 | connection = urllib.urlopen(url) 22 | data = connection.read() 23 | print data 24 | headers = connection.info().dict 25 | print headers 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | continue 10 | fhand = open(thefile,'r') 11 | lines = list() 12 | for line in fhand: 13 | lines.append(line) 14 | fhand.close() 15 | if len(lines) > 1: 16 | print len(lines), thefile 17 | print lines[:4] 18 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | continue 10 | fhand = open(thefile,'r') 11 | lines = list() 12 | for line in fhand: 13 | lines.append(line) 14 | fhand.close() 15 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') : 16 | continue 17 | if len(lines) > 1: 18 | print len(lines), thefile 19 | print lines[:4] 20 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | print 'T-Mobile:',thefile 10 | continue 11 | fhand = open(thefile,'r') 12 | lines = list() 13 | for line in fhand: 14 | lines.append(line) 15 | fhand.close() 16 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') : 17 | print 'iPhone:', thefile 18 | continue 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcount.py: -------------------------------------------------------------------------------- 1 | import os 2 | count = 0 3 | for dirname, dirs, files in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | count = count + 1 7 | 8 | print 'Files:', count 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtdelete.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | print 'T-Mobile:',thefile 10 | os.remove(thefile) 11 | continue 12 | fhand = open(thefile,'r') 13 | lines = list() 14 | for line in fhand: 15 | lines.append(line) 16 | fhand.close() 17 | if len(lines) == 3 and lines[2].startswith('Sent from my 
iPhone') : 18 | print 'iPhone:', thefile 19 | os.remove(thefile) 20 | continue 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtmd5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | from os.path import join 4 | 5 | hashes = dict() 6 | for (dirname, dirs, files) in os.walk('.'): 7 | for filename in files: 8 | if filename.endswith('.txt') : 9 | thefile = os.path.join(dirname,filename) 10 | fhand = open(thefile,'r') 11 | data = fhand.read() 12 | fhand.close() 13 | hash = hashlib.md5(data).hexdigest() 14 | # print thefile, hash 15 | if hash in hashes: 16 | print hashes[hash], thefile 17 | else: 18 | hashes[hash] = thefile 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtsize.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | print os.path.getsize(thefile), thefile 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urljpeg.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import time 3 | 4 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 5 | mysock.connect(('www.py4inf.com', 80)) 6 | mysock.send('GET http://www.py4inf.com/cover.jpg HTTP/1.0\n\n') 7 | 8 | 9 | count = 0 10 | picture = ""; 11 | while True: 12 | data = mysock.recv(5120) 13 | if ( len(data) < 1 ) : break 14 | time.sleep(0.25) 15 | count = count + len(data) 16 | print len(data),count 17 | picture = picture + data 18 | 19 | mysock.close() 20 | 21 | # Look for the end of the header (2 CRLF) 22 | pos = picture.find("\r\n\r\n"); 23 | print 'Header length',pos 24 | print picture[:pos] 25 | 26 | # Skip past the header and save the picture data 27 | picture = picture[pos+4:] 28 | fhand = open("stuff.jpg","wb") 29 | fhand.write(picture); 30 | fhand.close() 31 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllib1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt') 4 | for line in fhand: 5 | print line.strip() 6 | 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllib2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | fhand = urllib.urlopen('http://www.dr-chuck.com/page1.htm') 4 | for line in fhand: 5 | print line.strip() -------------------------------------------------------------------------------- /EXAMPLE CODE/urllink2.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | 11 | soup = BeautifulSoup(html) 12 | 13 | # Retrieve all of the anchor tags 14 | tags = soup('a') 15 | for tag in tags: 16 | # Look at the parts of a tag 17 | print 'TAG:',tag 18 | print 'URL:',tag.get('href', None) 19 | print 
/EXAMPLE CODE/urllink2.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | 11 | soup = BeautifulSoup(html) 12 | 13 | # Retrieve all of the anchor tags 14 | tags = soup('a') 15 | for tag in tags: 16 | # Look at the parts of a tag 17 | print 'TAG:',tag 18 | print 'URL:',tag.get('href', None) 19 | print 'Contents:',tag.contents[0] 20 | print 'Attrs:',tag.attrs 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllink3.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | todo = list() 9 | visited = list() 10 | url = raw_input('Enter - ') 11 | todo.append(url) 12 | 13 | while len(todo) > 0 : 14 | print "====== Todo list count is ",len(todo) 15 | url = todo.pop() 16 | 17 | if ( not url.startswith('http') ) : 18 | print "Skipping", url 19 | continue 20 | 21 | if ( url.find('facebook') > 0 ) : 22 | continue 23 | 24 | if ( url in visited ) : 25 | print "Visited", url 26 | continue 27 | 28 | print "===== Retrieving ", url 29 | 30 | html = urllib.urlopen(url).read() 31 | soup = BeautifulSoup(html) 32 | visited.append(url) 33 | 34 | # Retrieve all of the anchor tags 35 | tags = soup('a') 36 | for tag in tags: 37 | newurl = tag.get('href', None) 38 | if ( newurl != None ) : 39 | todo.append(newurl) 40 | 41 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllinks.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | soup = BeautifulSoup(html) 11 | 12 | # Retrieve all of the anchor tags 13 | tags = soup('a') 14 | for tag in tags: 15 | print tag.get('href', None) 16 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urlregex.py: -------------------------------------------------------------------------------- 1 | # Extract the http:// links from a web page using a regular expression 2 | import urllib 3 | import re 4 | 5 | url = raw_input('Enter - ') 6 | html = urllib.urlopen(url).read() 7 | links = re.findall('href="(http://.*?)"', html) 8 | for link in links: 9 | print link 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urlwords.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | counts = dict() 4 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt') 5 | for line in fhand: 6 | words = line.split() 7 | for word in words: 8 | counts[word] = counts.get(word,0) + 1 9 | print counts 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/whathour.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | time = pieces[5] 8 | parts = time.split(':') 9 | hour = parts[0] 10 | c[hour] = c.get(hour,0) + 1 11 | 12 | lst = list() 13 | for key in c: 14 | value = c[key] 15 | lst.append( (value, key) ) 16 | 17 | lst.sort() 18 | 19 | for value, key in lst: 20 | print key, value 21 | 22 | 23 | --------------------------------------------------------------------------------
/EXAMPLE CODE/wikidata.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/wikidata.db -------------------------------------------------------------------------------- /EXAMPLE CODE/wikigrade.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import string 6 | import sqlite3 7 | import urllib 8 | import xml.etree.ElementTree as ET 9 | from BeautifulSoup import * 10 | 11 | conn = sqlite3.connect('wikidata.db') 12 | cur = conn.cursor() 13 | 14 | cur.execute(''' 15 | CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY, 16 | url TEXT, page BLOB, retrieved_at timestamp)''') 17 | 18 | # A slightly extended dictionary 19 | class sash(dict): 20 | def sortvalues(self,reverse=True): 21 | return sorted(self.items(),key=lambda x: (x[1], x[0]), reverse=reverse) 22 | 23 | def tinyTable(url): 24 | global cur,conn 25 | cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, )) 26 | try: 27 | row = cur.fetchone() 28 | print 'DATE',row[2] 29 | return row[1] 30 | except: 31 | row = None 32 | print 'Retrieving', url 33 | 34 | data = urllib.urlopen(url).read() 35 | if row != None: 36 | cur.execute("UPDATE TinyTable SET page=?,retrieved_at=datetime('now') WHERE id=?", (unicode(data, 'utf-8'), row[0])) 37 | else: 38 | cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8'))) 39 | conn.commit() 40 | return data 41 | 42 | cururl = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7?pageName=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138%2Fhome&action=view&panel=Main&realm=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138' 43 | prefix = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7' 44 | 45 | urls = list() 46 | urls.append(cururl) 47 | visited = list() 48 | editcounts = sash() 49 | postcounts = sash() 50 | 51 | while len(urls) > 0 : 52 | print '=== URLS Yet To Retrieve:',len(urls) 53 | cururl = urls.pop() 54 | if cururl in visited: continue 55 | print 'RETRIEVING',cururl 56 | data = tinyTable(cururl) 57 | visited.append(cururl) 58 | soup = BeautifulSoup(data) 59 | tags = soup('a') 60 | # print 'Tags' 61 | for tag in tags: 62 | print tag 63 | url = tag.get('href',None) 64 | if url == None : continue 65 | # Only follow urls that stay within our prefix 66 | if not url.startswith(prefix) : continue 67 | newurl = urllib.basejoin(cururl,url) 68 | if newurl in visited : continue 69 | # print 'APPENDING',newurl 70 | if newurl.find('action=view') > 0 or newurl.find('action=history') > 0 : 71 | urls.append(newurl) 72 | 73 | print 'EDITS:' 74 | for (key,val) in editcounts.sortvalues(): 75 | print key, val 76 | 77 | for (key,val) in sorted(postcounts.items()): 78 | print key, val 79 | 80 | conn.close() 81 | --------------------------------------------------------------------------------
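Aside: wikigrade.py declares editcounts and postcounts but never fills them, so the sortvalues() method of its sash class is easy to overlook. A minimal, self-contained sketch of what it does, fed hypothetical counts (the data below is made up, not from the course):

# The sash class as defined in wikigrade.py above, exercised with hypothetical data
class sash(dict):
    def sortvalues(self, reverse=True):
        return sorted(self.items(), key=lambda x: (x[1], x[0]), reverse=reverse)

counts = sash()
counts['chuck'] = 3
counts['sally'] = 5
# Items come back ordered by (value, key), highest value first
print counts.sortvalues()    # [('sally', 5), ('chuck', 3)]
--------------------------------------------------------------------------------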
/EXAMPLE CODE/wordlist.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter file: ') 2 | handle = open(name, 'r') 3 | wordlist = list() 4 | for line in handle: 5 | words = line.split() 6 | for word in words: 7 | if word in wordlist: continue 8 | wordlist.append(word) 9 | 10 | wordlist.sort() 11 | print wordlist 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/words.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter file:') 2 | handle = open(name, 'r') 3 | text = handle.read() 4 | words = text.split() 5 | counts = dict() 6 | for word in words: 7 | counts[word] = counts.get(word,0) + 1 8 | 9 | bigcount = None 10 | bigword = None 11 | for word,count in counts.items(): 12 | if bigcount == None or count > bigcount: 13 | bigword = word 14 | bigcount = count 15 | 16 | print bigword, bigcount 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vast amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were repetitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | -------------------------------------------------------------------------------- /EXAMPLE CODE/xml1.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | data = ''' 4 | <person> 5 | <name>Chuck</name> 6 | <phone type="intl"> 7 | +1 734 303 4456 8 | </phone> 9 | <email hide="yes"/> 10 | </person>''' 11 | 12 | tree = ET.fromstring(data) 13 | print 'Name:',tree.find('name').text 14 | print 'Attr:',tree.find('email').get('hide') 15 | -------------------------------------------------------------------------------- /EXAMPLE CODE/xml2.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | input = ''' 4 | <stuff> 5 | <users> 6 | <user x="2"> 7 | <id>001</id> 8 | <name>Chuck</name> 9 | </user> 10 | <user x="7"> 11 | <id>009</id> 12 | <name>Brent</name> 13 | </user> 14 | </users> 15 | </stuff>''' 16 | 17 | stuff = ET.fromstring(input) 18 | lst = stuff.findall('users/user') 19 | print 'User count:', len(lst) 20 | 21 | for item in lst: 22 | print 'Name', item.find('name').text 23 | print 'Id', item.find('id').text 24 | print 'Attribute', item.get("x") 25 | print "" 26 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repository contains the resources and materials I've generated myself during the course ["Using Python to Access Web Data"](https://www.coursera.org/learn/python-network-data/), from the University of Michigan, offered on Coursera 3 | 4 | # Content available 5 | Currently, the only content available is the Python files for the programming assignments I've coded for the course. I also plan to make available documents with my notes from the course, but I'm still finishing them 6 | 7 | # Folder structure 8 | The content follows the same structure as the course: there is a folder for each of the weeks the course is divided into, and each folder contains all the material from that week. There are two types of files: those starting with "C" are the code files containing the exercises, while those starting with "A" are other resources used in, or produced by, the assignments. The two numbers that follow indicate the unit and the index of the document within its type and week, respectively. 9 | 10 | *Example: the file starting with C4.2. is the 2nd code file needed for the programming assignment in Unit 4* 11 | -------------------------------------------------------------------------------- /Textbook - Castellano.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - Castellano.epub -------------------------------------------------------------------------------- /Textbook - English.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - English.epub -------------------------------------------------------------------------------- /Unit 1 - Introduction/A1.1 - Code screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.1 - Code screenshot.PNG -------------------------------------------------------------------------------- /Unit 1 - Introduction/A1.2. - Script execution.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.2. - Script execution.PNG --------------------------------------------------------------------------------
/Unit 1 - Introduction/C1.1 - Firstcode.py: -------------------------------------------------------------------------------- 1 | print("Hello pythonistas!") -------------------------------------------------------------------------------- /Unit 2 - Regular expressions/C2.1 - Programming assignment.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | #Opening the file in which we'll need to find the numbers 4 | sample_file = open('A.2.2 - regex text data.txt') 5 | 6 | #Obtaining strings representing the numbers in that file 7 | text = sample_file.read() #read() returns the entire text at once, not line by line 8 | number_regex = '[0-9]+' 9 | numbers = re.findall(number_regex, text) #Match any run of one or more digits 10 | 11 | #Casting them to integers and getting the total sum 12 | total = sum(int(num) for num in numbers) 13 | 14 | print(total) 15 | 16 | #Closing the file to release its resources 17 | sample_file.close() -------------------------------------------------------------------------------- /Unit 3 - Networks and sockets/C3.1. - Programming assignment.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | #Setting up the socket 4 | mysocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 5 | mysocket.connect( ('www.pythonlearn.com', 80) ) 6 | 7 | #Making the HTTP request that will get us the desired document 8 | mysocket.send("GET http://www.pythonlearn.com/code/intro-short.txt HTTP/1.0 \n\n") 9 | 10 | while True: 11 | #Obtaining the web data 12 | webdata = mysocket.recv(512) 13 | 14 | #When there's no more data left, we'll stop the loop 15 | if len(webdata) < 1: 16 | break 17 | 18 | #Printing the obtained data 19 | print webdata 20 | 21 | mysocket.close() -------------------------------------------------------------------------------- /Unit 4 - Programs that surf the web/BeautifulSoup.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 4 - Programs that surf the web/BeautifulSoup.pyc -------------------------------------------------------------------------------- /Unit 4 - Programs that surf the web/C4.1. Programming assignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | This assignment consists of using urllib to read the HTML from the data files 3 | indicated, parse the data, extract the numbers and compute the sum of the 4 | numbers in the file 5 | 6 | DATA FORMAT: 7 | The file is a table of names and comment counts. You can ignore most of the data 8 | in the file except for lines like the following: 9 | 10 | <tr><td>Modu</td><td><span class="comments">90</span></td></tr> 11 | <tr><td>Kenzie</td><td><span class="comments">88</span></td></tr> 12 | <tr><td>Hubert</td><td><span class="comments">87</span></td></tr> 13 | 14 | You are to find all the <span> tags in the file and pull out the numbers from the 15 | tag and sum the numbers. 16 | 17 | Look at the sample code (http://www.pythonlearn.com/code/urllink2.py) provided. It 18 | shows how to find all of a certain kind of tag, loop through the tags and extract 19 | the various aspects of the tags. 20 | 21 | # Retrieve all of the anchor tags 22 | tags = soup('a') 23 | for tag in tags: 24 | # Look at the parts of a tag 25 | print 'TAG:',tag 26 | print 'URL:',tag.get('href', None) 27 | print 'Contents:',tag.contents[0] 28 | print 'Attrs:',tag.attrs 29 | 30 | You need to adjust this code to look for span tags, pull out the text content of 31 | each span tag, convert it to an integer, and add them all up to complete the assignment. """ 32 | 33 | 34 | import urllib 35 | from BeautifulSoup import * 36 | 37 | sample_url = "http://python-data.dr-chuck.net/comments_42.html" 38 | data_url = "http://python-data.dr-chuck.net/comments_277464.html" 39 | 40 | #Getting the html information and parsing it with BeautifulSoup 41 | html = urllib.urlopen(data_url).read() 42 | soup = BeautifulSoup(html) 43 | 44 | #Getting a list with the "span" tags 45 | tags = soup('span') 46 | 47 | #Summing all the values within the span tags 48 | count = 0 49 | for tag in tags: 50 | 51 | #We need to cast them to int, as they're parsed as text strings 52 | count += int(tag.contents[0]) 53 | 54 | print(count) 55 | 56 | --------------------------------------------------------------------------------
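Aside: once the span tags are in hand, the extract-and-sum pattern from C4.1 above collapses to a single expression. A minimal sketch under the same assumptions as the assignment (Python 2, the BeautifulSoup 3 module bundled with this repo, and the sample URL from the docstring):

import urllib
from BeautifulSoup import *

html = urllib.urlopen('http://python-data.dr-chuck.net/comments_42.html').read()
soup = BeautifulSoup(html)
# Each <span> holds one number as its only text child; cast and sum in one pass
print sum(int(tag.contents[0]) for tag in soup('span'))
--------------------------------------------------------------------------------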
/Unit 4 - Programs that surf the web/C4.2. Programming assignment 2.py: -------------------------------------------------------------------------------- 1 | """ 2 | In this assignment you will write a Python program that expands on http://www.pythonlearn.com/code/urllinks.py 3 | The program will use urllib to read the HTML from the data files below, extract 4 | the href= values from the anchor tags, scan for a tag that is in a particular 5 | position from the top, follow that link, repeat the process a number of times, 6 | and report the last name you find. 7 | 8 | 9 | SAMPLE: 10 | Find the link at position 3 (the first name is 1). Follow that link. Repeat this 11 | process 4 times. The answer is the last name that you retrieve. 12 | The result should be: Anayah 13 | 14 | PROBLEM: 15 | Find the link at position 18 (the first name is 1). Follow that link. Repeat this 16 | process 7 times. The answer is the last name that you retrieve. 17 | Hint: the name starts with S 18 | """ 19 | 20 | import urllib 21 | from BeautifulSoup import * 22 | 23 | #SAMPLE DATA 24 | sample_url = "http://python-data.dr-chuck.net/known_by_Fikret.html" 25 | sample_repetitions = 4 26 | sample_resultPosition = 3 27 | 28 | #ACTUAL PROBLEM DATA 29 | problem_url = "http://python-data.dr-chuck.net/known_by_Max.html" 30 | problem_repetitions = 7 31 | problem_resultPosition = 18 32 | 33 | 34 | #Choosing the type of execution we're trying 35 | type_of_execution = 'problem' 36 | if type_of_execution == 'sample': 37 | (link, repetitions, resultPosition) = (sample_url, sample_repetitions, sample_resultPosition) 38 | 39 | elif type_of_execution == 'problem': 40 | (link, repetitions, resultPosition) = (problem_url, problem_repetitions, problem_resultPosition) 41 | 42 | 43 | #Amount of iterations needed 44 | for times in range(repetitions): 45 | 46 | #Getting the information of the corresponding url 47 | html = urllib.urlopen(link).read() 48 | soup = BeautifulSoup(html) 49 | tags = soup('a') 50 | 51 | #The positions are 1-based, but Python lists are 0-indexed, 52 | #so we subtract 1 from the position 53 | link = tags[resultPosition - 1].get('href') 54 | 55 | #Getting the content of the tag in the specified position. It should correspond to 56 | #the answer we're looking for 57 | result_name = tags[resultPosition - 1].contents[0] 58 | print(result_name) 59 | 60 | --------------------------------------------------------------------------------
/Unit 5 - Web services and XML/BeautifulSoup.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 5 - Web services and XML/BeautifulSoup.pyc -------------------------------------------------------------------------------- /Unit 5 - Web services and XML/C5.1. Programming assignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | EXTRACTING DATA FROM XML 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.pythonlearn.com/code/geoxml.py. The program will prompt for a URL, 5 | read the XML data from that URL using urllib, then parse it, extract the 6 | comment counts from the XML data, and compute the sum of the numbers in the file. 7 | 8 | We provide two files for this assignment. One is a sample file where we give you 9 | the sum for your testing and the other is the actual data you need to process for 10 | the assignment. 11 | 12 | Sample data: http://python-data.dr-chuck.net/comments_42.xml (Sum=2553) 13 | Actual data: http://python-data.dr-chuck.net/comments_277461.xml 14 | 15 | You do not need to save these files to your folder since your program will read 16 | the data directly from the URL. Note: Each student will have a distinct data url 17 | for the assignment - so only use your own data url for analysis. 18 | 19 | 20 | DATA FORMAT AND APPROACH 21 | The data consists of a number of names and comment counts in XML as follows: 22 | 23 | <comment> 24 | <name>Matthias</name> 25 | <count>97</count> 26 | </comment> 27 | 28 | You are to look through all the <comment> tags, find the <count> values and sum 29 | the numbers. The closest sample code that shows how to parse XML is geoxml.py. 30 | But since the nesting of the elements in our data is different than the data we 31 | are parsing in that sample code you will have to make real changes to the code. 32 | 33 | To make the code a little simpler, you can use an XPath selector string to look 34 | through the entire tree of XML for any tag named 'count' with the following line 35 | of code: 36 | 37 | counts = tree.findall('.//count') 38 | 39 | Take a look at the Python ElementTree documentation and look for the supported 40 | XPath syntax for details. You could also work from the top of the XML down to 41 | the comments node and then loop through the child nodes of the comments node. 42 | """
43 | #We'll leave XPath for another time, as it requires further investigation. For 44 | #now we'll look for the count tags by knowing their structure: 45 | #commentinfo -> comments -> comment -> count 46 | 47 | 48 | import urllib 49 | from BeautifulSoup import * 50 | import xml.etree.ElementTree as ET 51 | 52 | sample_data = "http://python-data.dr-chuck.net/comments_42.xml" 53 | actual_data = "http://python-data.dr-chuck.net/comments_277461.xml" 54 | 55 | #We'll work with this generic variable, so we only need to change its source 56 | #and not every one of its appearances in the code 57 | #NOTE: I'm using Sublime Text and it doesn't accept raw_input, so I'll set the URL 58 | #from here instead of from a user prompt 59 | data_url = actual_data 60 | data = urllib.urlopen(data_url).read() 61 | 62 | #xml_data contains the commentinfo object, as it is the main structure, so we 63 | #have to look for the comments element and then for all its comment elements 64 | xml_data = ET.fromstring(data) 65 | search_str = "comments/comment" 66 | count_tags = xml_data.findall(search_str) 67 | 68 | #Computing the sum 69 | total_count = 0 70 | for tag in count_tags: 71 | #We'll find the "count" element inside each "comment" element and add it 72 | count = tag.find('count') 73 | total_count += int(count.text) 74 | 75 | print(total_count) 76 | 77 | 78 | 79 | 80 | --------------------------------------------------------------------------------
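Aside: the XPath selector that the C5.1 docstring quotes (and that the solution above deliberately leaves aside) does the whole search in one call. A minimal sketch of that variant, using the sample URL from the docstring (Sum=2553):

import urllib
import xml.etree.ElementTree as ET

data = urllib.urlopen('http://python-data.dr-chuck.net/comments_42.xml').read()
tree = ET.fromstring(data)
# './/count' matches every <count> element at any depth in the tree
counts = tree.findall('.//count')
print sum(int(count.text) for count in counts)
--------------------------------------------------------------------------------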
/Unit 6 - JSON and the REST architecture/C.6.1. - Programming assignment 1.py: -------------------------------------------------------------------------------- 1 | """ EXTRACTING DATA FROM JSON 2 | In this assignment you will write a Python program somewhat similar to 3 | http://www.pythonlearn.com/code/json2.py. The program will prompt for a URL, read 4 | the JSON data from that URL using urllib, then parse it and extract the comment 5 | counts from the JSON data, compute the sum of the numbers in the file and enter 6 | the sum below: 7 | 8 | We provide two files for this assignment. One is a sample file where we give you 9 | the sum for your testing and the other is the actual data you need to process for 10 | the assignment. 11 | 12 | - Sample data: http://python-data.dr-chuck.net/comments_42.json (Sum=2553) 13 | - Actual data: http://python-data.dr-chuck.net/comments_277465.json 14 | 15 | You do not need to save these files to your folder since your program will read 16 | the data directly from the URL. Note: Each student will have a distinct data url 17 | for the assignment - so only use your own data url for analysis. 18 | 19 | 20 | DATA FORMAT 21 | The data consists of a number of names and comment counts in JSON as follows: 22 | 23 | { 24 | comments: [ 25 | { 26 | name: "Matthias" 27 | count: 97 28 | }, 29 | { 30 | name: "Geomer" 31 | count: 97 32 | } 33 | ... 34 | ] 35 | } 36 | The closest sample code that shows how to parse JSON and extract a list is 37 | json2.py. You might also want to look at geoxml.py to see how to prompt for a URL 38 | and retrieve data from a URL. 39 | """ 40 | 41 | 42 | import urllib 43 | import json 44 | 45 | sample_url = "http://python-data.dr-chuck.net/comments_42.json" 46 | data_url = "http://python-data.dr-chuck.net/comments_277465.json" 47 | 48 | #Reading the URL and parsing its data 49 | urldata = urllib.urlopen(data_url).read() 50 | data = json.loads(urldata) 51 | 52 | #Finding each "count" field and adding its value to the total sum. 53 | total = 0 54 | for comment in data["comments"]: 55 | total += comment["count"] 56 | 57 | print("TOTAL SUM: " + str(total)) -------------------------------------------------------------------------------- /Unit 6 - JSON and the REST architecture/C.6.2. - Programming assignment 2.py: -------------------------------------------------------------------------------- 1 | """ 2 | CALLING A JSON API 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.pythonlearn.com/code/geojson.py. The program will prompt for a location, 5 | contact a web service, retrieve JSON from that service, parse the data, 6 | and retrieve the first place_id from the JSON. A place ID is a textual identifier 7 | that uniquely identifies a place within Google Maps. 8 | 9 | 10 | API ENDPOINTS 11 | To complete this assignment, you should use this API endpoint that has a static 12 | subset of the Google Data: 13 | 14 | http://python-data.dr-chuck.net/geojson 15 | 16 | This API uses the same parameters (sensor and address) as the Google API. This 17 | API also has no rate limit so you can test as often as you like. If you visit 18 | the URL with no parameters, you get a list of all of the address values which 19 | can be used with this API. 20 | 21 | To call the API, you need to provide a sensor=false parameter and the address 22 | that you are requesting as the address= parameter that is properly URL encoded 23 | using the urllib.urlencode() function as shown in 24 | http://www.pythonlearn.com/code/geojson.py 25 | 26 | 27 | TEST DATA / SAMPLE EXECUTION 28 | You can test to see if your program is working with a location of "South Federal 29 | University" which will have a place_id of "ChIJJ8oO7_B_bIcR2AlhC8nKlok". 30 | 31 | 32 | TURN IN 33 | Please run your program to find the place_id for this location: Columbia 34 | University 35 | 36 | Make sure to enter the name and case exactly as above and enter the place_id and 37 | your Python code below. Hint: The first seven characters of the place_id are 38 | "ChIJdeM ...". Make sure to retrieve the data from the URL specified above and 39 | not the normal Google API. Your program should work with the Google API - but the 40 | place_id may not match for this assignment. 41 | """ 42 | 43 | import json 44 | import urllib 45 | 46 | #Storing the given parameters 47 | serviceurl = "http://python-data.dr-chuck.net/geojson?" 48 | sample_address = "South Federal University" 49 | data_address = "Columbia University" 50 | address_wanted = data_address 51 | 52 | #Setting the GET parameters on the URL 53 | parameters = {"sensor": "false", "address": address_wanted} 54 | paramsurl = urllib.urlencode(parameters) 55 | 56 | #Generating the complete URL. Printing it in order to check that it's correct. 57 | queryurl = serviceurl + paramsurl 58 | print("DATA URL: " + queryurl) 59 | 60 | #Obtaining and reading the data 61 | data = urllib.urlopen(queryurl).read() 62 | 63 | #Parsing the data and looking for the field we want. 64 | #That field is inside the "results" array, in its first item (if our address is 65 | #correct we can assume that the result will be the correct one) and on its 66 | #"place_id" field 67 | jsondata = json.loads(str(data)) 68 | place_id = jsondata["results"][0]["place_id"] 69 | print("PLACE ID: " + place_id) 70 | 71 | --------------------------------------------------------------------------------
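Aside: the urllib.urlencode() call that C6.2 relies on is worth seeing in isolation. A minimal sketch of what it produces (the exact pair order may vary, since the input is a dict):

import urllib

parameters = {"sensor": "false", "address": "Columbia University"}
print urllib.urlencode(parameters)
# e.g. sensor=false&address=Columbia+University
# Spaces become '+', and the result is ready to append after the '?' in the URL
--------------------------------------------------------------------------------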