2 | First
3 |
4 | Second
7 |
8 | Third
12 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/celsius.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter Celsius Temperature:')
2 | cel = float(inp)
3 | fahr = ( cel * 9.0 ) / 5.0 + 32.0
4 | print fahr
5 |
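A sample run of celsius.py for reference; by the formula above, 100 degrees
Celsius converts to 212.0 Fahrenheit:

    Mac: python celsius.py
    Win: celsius.py

    Enter Celsius Temperature:100
    212.0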
--------------------------------------------------------------------------------
/EXAMPLE CODE/cleanup.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 |
3 | rm -f *.pyc */*.pyc
4 | rm -f *.sqlite
5 | rm -f *.zip
6 |
7 | zip -r geodata.zip geodata
8 | zip -r gmane.zip gmane
9 | zip -r pagerank.zip pagerank
10 | zip -r tracks.zip tracks
11 | zip -r roster.zip roster
12 |
13 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/clown.txt:
--------------------------------------------------------------------------------
1 | the clown ran after the car and the car ran into the tent and the tent fell down on the clown and the car
2 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/copytildone.py:
--------------------------------------------------------------------------------
1 | while True:
2 | line = raw_input('> ')
3 |     if line.startswith('#') :   # startswith avoids an IndexError on an empty line
4 | continue
5 | if line == 'done':
6 | break
7 | print line
8 |
9 | print 'Done!'
10 |
11 |
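A sample session with copytildone.py for reference: lines starting with '#'
are skipped, and 'done' ends the loop:

    > # this line is skipped
    > hello there
    hello there
    > done
    Done!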
--------------------------------------------------------------------------------
/EXAMPLE CODE/count1.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter the file name: ')
2 | try:
3 | fhand = open(fname)
4 | except:
5 | print 'File cannot be opened:', fname
6 | exit()
7 |
8 | counts = dict()
9 | for line in fhand:
10 | words = line.split()
11 | for word in words:
12 | if word not in counts:
13 | counts[word] = 1
14 | else:
15 | counts[word] += 1
16 |
17 | print counts
18 |
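The if/else counting idiom above can also be written with dict.get(), which
returns a default when a key is absent; a minimal equivalent sketch of the
two loops (Python 2, produces the same counts):

    counts = dict()
    for line in fhand:
        for word in line.split():
            counts[word] = counts.get(word, 0) + 1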
--------------------------------------------------------------------------------
/EXAMPLE CODE/count2.py:
--------------------------------------------------------------------------------
1 | import string
2 |
3 | fname = raw_input('Enter the file name: ')
4 | try:
5 | fhand = open(fname)
6 | except:
7 | print 'File cannot be opened:', fname
8 | exit()
9 |
10 | counts = dict()
11 | for line in fhand:
12 | line = line.translate(None, string.punctuation)
13 | line = line.lower()
14 | words = line.split()
15 | for word in words:
16 | if word not in counts:
17 | counts[word] = 1
18 | else:
19 | counts[word] += 1
20 |
21 | print counts
22 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/count3.py:
--------------------------------------------------------------------------------
1 | import string
2 | fhand = open('romeo-full.txt')
3 | counts = dict()
4 | for line in fhand:
5 | line = line.translate(None, string.punctuation)
6 | line = line.lower()
7 | words = line.split()
8 | for word in words:
9 | if word not in counts:
10 | counts[word] = 1
11 | else:
12 | counts[word] += 1
13 |
14 | # Sort the dictionary by value
15 | lst = list()
16 | for key, val in counts.items():
17 | lst.append( (val, key) )
18 |
19 | lst.sort(reverse=True)
20 |
21 | for key, val in lst[:10] :
22 | print key, val
23 |
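The tuple-list sort above can also be expressed with sorted() and a key
function; this sketch prints essentially the same top ten, word first (tie
order may differ since this sort is stable):

    for word, count in sorted(counts.items(),
                              key=lambda pair: pair[1], reverse=True)[:10]:
        print word, count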
--------------------------------------------------------------------------------
/EXAMPLE CODE/curl1.py:
--------------------------------------------------------------------------------
1 | import urllib
2 |
3 | img = urllib.urlopen('http://www.py4inf.com/cover.jpg').read()
4 | fhand = open('cover.jpg', 'wb')   # binary mode so the image is not corrupted on Windows
5 | fhand.write(img)
6 | fhand.close()
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/curl2.py:
--------------------------------------------------------------------------------
1 | import urllib
2 |
3 | img = urllib.urlopen('http://www.py4inf.com/cover.jpg')
4 | fhand = open('cover.jpg', 'wb')   # binary mode for image data
5 | size = 0
6 | while True:
7 | info = img.read(100000)
8 | if len(info) < 1 : break
9 | size = size + len(info)
10 | fhand.write(info)
11 |
12 | print size,'characters copied.'
13 | fhand.close()
14 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/curl3.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib
3 |
4 | print 'Please enter a URL like http://www.py4inf.com/cover.jpg'
5 | urlstr = raw_input().strip()
6 | img = urllib.urlopen(urlstr)
7 |
8 | # Get the last "word"
9 | words = urlstr.split('/')
10 | fname = words[-1]
11 |
12 | # Don't overwrite the file
13 | if os.path.exists(fname) :
14 | if raw_input('Replace '+fname+' (Y/n)?') != 'Y' :
15 | print 'Data not copied'
16 | exit()
17 | print 'Replacing',fname
18 |
19 | fhand = open(fname, 'wb')   # binary mode for image data
20 | size = 0
21 | while True:
22 | info = img.read(100000)
23 | if len(info) < 1 : break
24 | size = size + len(info)
25 | fhand.write(info)
26 |
27 | print size,'characters copied to',fname
28 | fhand.close()
29 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/db1.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('music.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('DROP TABLE IF EXISTS Tracks ')
7 | cur.execute('CREATE TABLE Tracks (title TEXT, plays INTEGER)')
8 |
9 | conn.close()
10 |
11 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/db2.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('music.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )',
7 | ( 'Thunderstruck', 20 ) )
8 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )',
9 | ( 'My Way', 15 ) )
10 | conn.commit()
11 |
12 | print 'Tracks:'
13 | cur.execute('SELECT title, plays FROM Tracks')
14 | for row in cur :
15 | print row
16 |
17 | cur.execute('DELETE FROM Tracks WHERE plays < 100')
18 | conn.commit()   # commit the DELETE so it is not lost when the program exits
19 |
20 | cur.close()
21 |
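The ? placeholders above let sqlite3 substitute values safely rather than
pasting them into the SQL string. A short sketch of the same pattern with
variables, reusing conn and cur from above (the title and count here are
made up):

    title = 'Back In Black'   # hypothetical values
    plays = 25
    cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )',
        (title, plays))
    conn.commit()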
--------------------------------------------------------------------------------
/EXAMPLE CODE/egg.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter the file name: ')
2 | if fname == 'na na boo boo' :
3 | print 'NA NA BOO BOO TO YOU - You have been punkd!'
4 | exit()
5 |
6 | try:
7 | fhand = open(fname)
8 | except:
9 | print 'File cannot be opened:', fname
10 | exit()
11 | count = 0
12 | for line in fhand:
13 | if line.startswith('Subject:') :
14 | count = count + 1
15 | print 'There were', count, 'subject lines in', fname
16 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/emaildb.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('emaildb.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('''
7 | DROP TABLE IF EXISTS Counts''')
8 |
9 | cur.execute('''
10 | CREATE TABLE Counts (email TEXT, count INTEGER)''')
11 |
12 | fname = raw_input('Enter file name: ')
13 | if ( len(fname) < 1 ) : fname = 'mbox-short.txt'
14 | fh = open(fname)
15 | for line in fh:
16 | if not line.startswith('From: ') : continue
17 | pieces = line.split()
18 | email = pieces[1]
19 | print email
20 | cur.execute('SELECT count FROM Counts WHERE email = ? ', (email, ))
21 | row = cur.fetchone()
22 | if row is None:
23 | cur.execute('''INSERT INTO Counts (email, count)
24 | VALUES ( ?, 1 )''', ( email, ) )
25 | else :
26 | cur.execute('UPDATE Counts SET count=count+1 WHERE email = ?',
27 | (email, ))
28 | # This statement commits outstanding changes to disk each
29 | # time through the loop - the program can be made faster
30 | # by moving the commit so it runs only after the loop completes
31 | conn.commit()
32 |
33 | # https://www.sqlite.org/lang_select.html
34 | sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10'
35 |
36 | print
37 | print "Counts:"
38 | for row in cur.execute(sqlstr) :
39 | print str(row[0]), row[1]
40 |
41 | cur.close()
42 |
43 |
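As the comment in the loop notes, committing once per row is slow; a
self-contained sketch of the faster variant with a single commit after the
loop (assumes the Counts table created above exists; the addresses are
made up):

    import sqlite3
    conn = sqlite3.connect('emaildb.sqlite')
    cur = conn.cursor()
    for email in ['a@x.edu', 'b@y.org', 'a@x.edu']:   # hypothetical data
        cur.execute('SELECT count FROM Counts WHERE email = ? ', (email,))
        row = cur.fetchone()
        if row is None:
            cur.execute('INSERT INTO Counts (email, count) VALUES ( ?, 1 )',
                (email,))
        else:
            cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?',
                (email,))
    conn.commit()   # one commit after the loop instead of one per row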
--------------------------------------------------------------------------------
/EXAMPLE CODE/fahren.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter Fahrenheit Temperature:')
2 | fahr = float(inp)
3 | cel = (fahr - 32.0) * 5.0 / 9.0
4 | print cel
5 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/geodata.zip
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/README.txt:
--------------------------------------------------------------------------------
1 | Using the Google Geocoding API with a Database and
2 | Visualizing the data on a Google Map
3 |
4 | In this project, we are using the Google geocoding API
5 | to clean up some user-entered geographic locations of
6 | university names and then placing the data on a Google
7 | Map.
8 |
9 | You should install the SQLite browser to view and modify
10 | the databases from:
11 |
12 | http://sqlitebrowser.org/
13 |
14 | The first problem to solve is that the Google geocoding
15 | API is rate limited to 2500 requests per day. So if you have
16 | a lot of data you might need to stop and restart the lookup
17 | process several times. So we break the problem into two
18 | phases.
19 |
20 | In the first phase we take our input data in the file
21 | (where.data) and read it one line at a time, and retrieve the
22 | geocoded response and store it in a database (geodata.sqlite).
23 | Before we use the geocoding API, we simply check to see if
24 | we already have the data for that particular line of input.
25 |
26 | You can re-start the process at any time by removing the file
27 | geodata.sqlite
28 |
29 | Run the geoload.py program. This program reads the input
30 | lines in where.data and, for each line, checks to see if it is already
31 | in the database. If we don't have the data for the location, it
32 | calls the geocoding API to retrieve the data and stores it in
33 | the database.
34 |
35 | Here is a sample run after there is already some data in the
36 | database:
37 |
38 | Mac: python geoload.py
39 | Win: geoload.py
40 |
41 | Found in database Northeastern University
42 |
43 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University
44 |
45 | Found in database Technion
46 |
47 | Found in database Viswakarma Institute, Pune, India
48 |
49 | Found in database UMD
50 |
51 | Found in database Tufts University
52 |
53 | Resolving Monash University
54 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Monash+University
55 | Retrieved 2063 characters { "results" : [
56 | {u'status': u'OK', u'results': ... }
57 |
58 | Resolving Kokshetau Institute of Economics and Management
59 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Kokshetau+Institute+of+Economics+and+Management
60 | Retrieved 1749 characters { "results" : [
61 | {u'status': u'OK', u'results': ... }
62 |
63 | The first five locations are already in the database and so they
64 | are skipped. The program scans to the point where it finds un-retrieved
65 | locations and starts retrieving them.
66 |
67 | The geoload.py program can be stopped at any time, and there is a counter
68 | that you can use to limit the number of calls to the geocoding
69 | API for each run.
70 |
71 | Once you have some data loaded into geodata.sqlite, you can
72 | visualize the data using the (geodump.py) program. This
73 | program reads the database and writes the file (where.js)
74 | with the location, latitude, and longitude in the form of
75 | executable JavaScript code.
76 |
77 | A run of the geodump.py program is as follows:
78 |
79 | Mac: python geodump.py
80 | Win: geodump.py
81 |
82 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975
83 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811
84 | ...
85 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667
86 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682
87 | Kokshetau, Kazakhstan 53.2833333 69.3833333
88 | ...
89 | 12 records written to where.js
90 | Open where.html to view the data in a browser
91 |
92 | The file (where.html) consists of HTML and JavaScript to visualize
93 | a Google Map. It reads the most recent data in where.js to get
94 | the data to be visualized. Here is the format of the where.js file:
95 |
96 | myData = [
97 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'],
98 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'],
99 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'],
100 | ...
101 | ];
102 |
103 | This is a JavaScript list of lists. The syntax for JavaScript
104 | list constants is very similar to Python so the syntax should
105 | be familiar to you.
106 |
107 | Simply open where.html in a browser to see the locations. You
108 | can hover over each map pin to find the location that the
109 | geocoding API returned for the user-entered input. If you
110 | cannot see any data when you open the where.html file, you might
111 | want to check the JavaScript or developer console for your browser.
112 |
113 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/geodump.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import json
3 | import codecs
4 |
5 | conn = sqlite3.connect('geodata.sqlite')
6 | cur = conn.cursor()
7 |
8 | cur.execute('SELECT * FROM Locations')
9 | fhand = codecs.open('where.js','w', "utf-8")
10 | fhand.write("myData = [\n")
11 | count = 0
12 | for row in cur :
13 | data = str(row[1])
14 | try: js = json.loads(str(data))
15 | except: continue
16 |
17 | if not('status' in js and js['status'] == 'OK') : continue
18 |
19 | lat = js["results"][0]["geometry"]["location"]["lat"]
20 | lng = js["results"][0]["geometry"]["location"]["lng"]
21 | if lat == 0 or lng == 0 : continue
22 | where = js['results'][0]['formatted_address']
23 | where = where.replace("'","")
24 | try :
25 | print where, lat, lng
26 |
27 | count = count + 1
28 | if count > 1 : fhand.write(",\n")
29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']"
30 | fhand.write(output)
31 | except:
32 | continue
33 |
34 | fhand.write("\n];\n")
35 | cur.close()
36 | fhand.close()
37 | print count, "records written to where.js"
38 | print "Open where.html to view the data in a browser"
39 |
40 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/geoload.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import sqlite3
3 | import json
4 | import time
5 | import ssl
6 |
7 | # If you are in China use this URL:
8 | # serviceurl = "http://maps.google.cn/maps/api/geocode/json?"
9 | serviceurl = "http://maps.googleapis.com/maps/api/geocode/json?"
10 |
11 | # Deal with SSL certificate anomalies Python > 2.7
12 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
13 | scontext = None
14 |
15 | conn = sqlite3.connect('geodata.sqlite')
16 | cur = conn.cursor()
17 |
18 | cur.execute('''
19 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''')
20 |
21 | fh = open("where.data")
22 | count = 0
23 | for line in fh:
24 | if count > 200 : break
25 | address = line.strip()
26 | print ''
27 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", (buffer(address), ))
28 |
29 | try:
30 | data = cur.fetchone()[0]
31 | print "Found in database ",address
32 | continue
33 | except:
34 | pass
35 |
36 | print 'Resolving', address
37 | url = serviceurl + urllib.urlencode({"sensor":"false", "address": address})
38 | print 'Retrieving', url
39 | uh = urllib.urlopen(url, context=scontext)
40 | data = uh.read()
41 | print 'Retrieved',len(data),'characters',data[:20].replace('\n',' ')
42 | count = count + 1
43 | try:
44 | js = json.loads(str(data))
45 | # print js # We print in case unicode causes an error
46 | except:
47 | continue
48 |
49 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') :
50 | print '==== Failure To Retrieve ===='
51 | print data
52 | break
53 |
54 | cur.execute('''INSERT INTO Locations (address, geodata)
55 | VALUES ( ?, ? )''', ( buffer(address),buffer(data) ) )
56 | conn.commit()
57 | time.sleep(1)
58 |
59 | print "Run geodump.py to read the data from the database so you can visualize it on a map."
60 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/where.data:
--------------------------------------------------------------------------------
1 | Northeastern University
2 | University of Hong Kong, Illinois Institute of Technology, Bradley University
3 | Technion
4 | Viswakarma Institute, Pune, India
5 | UMD
6 | Tufts University
7 | Monash University
8 | Kokshetau Institute of Economics and Management
9 | RSU named S.A. Esenin
10 | Tavrida National V.I. Vernadsky University
11 | UOC
12 | Irkutsk State University
13 | Institute of Technology Telkom
14 | Shanghai Jiao Tong University
15 | University of Ilorin, Kwara State. Nigeria
16 | Monash University Churchill Australia
17 | UNISA
18 | Fachhochschule FH Salzburg
19 | Tampere University of Technology (Tampere, Finland)
20 | Saint Petersburg State University
21 | University of São Paulo
22 | Smolensk State University (Russia)
23 | Institute of Business Administration, Karachi
24 | universidad complutense de madrid
25 | Masdar Institute
26 | University of London
27 | University of Oxford
28 | Tallinn University of Technology
29 | University of Tartu
30 | University of Padua
31 | University of Pune, India
32 | National Kyiv Shevchenko University
33 | UC Berkeley
34 | University of Wisconsin - Madison
35 | Lodz University of Technology
36 | NRU IFMO
37 | Dniepropetrovsk National University (Ukraine), Applied Math Faculty
38 | Dokuz Eylul University, Izmir, Turkey
39 | Beijing normal university
40 | University of Piraeus, Athens
41 | Universidad de Buenos Aires (UBA). Argentina.
42 | SASTRA University
43 | Nagpur University
44 | Duke University
45 | San Francisco State University
46 | FATEC-SP - Faculdade de Tecnologia do Estado de São Paulo
47 | University of Texas at Austin
48 | University of applied sciense of Mikkeli (Finland)
49 | Troy University
50 | Universidade do Minho
51 | National University of Sciences and Technology (NUST)-Pakistan
52 | Pontificia universidad catolica de chile
53 | Illinois State University Joliet Junior College
54 | American University in Cairo (AUC)
55 | Obninsk Technical University of Nuclear Power Engineering, Russia
56 | Vyatka State Humanitarian University
57 | Weizmann Institute of Science (Israel)
58 | University of Washington
59 | Kharkiv State Academy of Municipal Economy, Ukraine
60 | Faculty of Electrical Engineering in Sarajevo, University of Sarajevo
61 | Universidad de Los Andes Colombia
62 | University of Colorado at Boulder
63 | Magnitogorsk State Technical University
64 | USC
65 | Simon Fraser University
66 | Columbia University (New York)
67 | University of Southern California
68 | University of Warsaw
69 | Warsaw University of Technology
70 | (Some place in New Zealand you haven't heard of.)
71 | Massey university part-time Distance learning
72 | University of Oklahoma
73 | University of Pavia, Italy
74 | University of Missouri - Columbia
75 | Czech Technical University in Prague
76 | Illinois Institute of Technology
77 | Penn State University
78 | University of Utah
79 | Faculty of Science, University of Zagreb - Department of Mathematics
80 | Universitat Politecnica de Valencia
81 | University of Vienna
82 | University of Puerto Rico - Mayaguez Campus
83 | University "Hyperion" of Bucharest
84 | University of New Haven
85 | University of Washington -Bothell
86 | Drexel University
87 | University of Texas at Austin
88 | University of Helsinki
89 | University of Michigan
90 | Carnegie Mellon University
91 | Kazan Federal University
92 | Pondicherry University
93 | Far-Eastern State University
94 | Nanyang Technological University
95 | Slovak University of Technology
96 | NYU
97 | UFABC - Universidade Federal do ABC, Sanso André - SP - Brazil
98 | University of Debrecen
99 | California State University, San Bernardino
100 | National University "Kyiv-Mohyla Academy" (Kyiv, Ukraine)
101 | Laurentian University
102 | Humanities Institute of TV and Radio, Moscow, Russia
103 | University of Cambridge, UK
104 | Payame Noor University, Tehran, Iran
105 | Middle East Technical University
106 | EPFL
107 | Faculty of Technical Sciences, Novi Sad, Serbia
108 | University of Gothenburg, Sweden
109 | Polytechnic University of Timisoara
110 | University of Hawaii (Go, Rainbows!)
111 | Belarusian State University
112 | Haaga-Helia university of applied sciences
113 | JADAVPUR UNIVERSITY
114 | Gauhati University, India
115 | Universidad de Buenos Aires
116 | Università degli Studi di Genova, Genova, Italia
117 | King Mongkut's University of Technology Thonburi
118 | Universidad de la Sabana, Chia, Colombia
119 | State University of New York (SUNY) College at Oswego
120 | Kyrgyz Slavic Russian University
121 | De La Salle University http://www.dlsu.edu.ph
122 | Jawaharlal Nehru Technological University, INDIA
123 | UCL (Université Catholique de Louvain) in Belgium
124 | Boston University
125 | The University of Manchester
126 | Fachhochschule Düsseldorf
127 | Pine Manor College (AA), Harvard University (BA), Lesley University (MEd)
128 | Simón Bolívar University
129 | Indiana University at Bloomington
130 | RPI
131 | University of Ottawa, Canada
132 | Ural Federal University
133 | BITS Pilani
134 | Transilvania University
135 | IIT(BHU), Varanasi, India
136 | EM Lyon
137 | Universidad Central de Venezuela
138 | NTUU "KPI"
139 | Universidade Federal da Paraiba, Brazil
140 | Budapest University of Technology and Economics
141 | Moscow Institute of Physics & Technology (State University)
142 | Saint Petersburg State University of Aerospace Instrumentation, Russia
143 | North Central College, Naperville, IL
144 | Tech. Uni. Denmark (DTU)
145 | Stanford
146 | "Politehnica" Timisoara
147 | National University of Engineering
148 | Monash
149 | Federal University of Campina Grande (UFCG)
150 | Universidade Federal do Rio Grande do Sul (UFRGS)
151 | Universidad Nacional Autónoma de México
152 | University of New South Wales Harvard Business School
153 | University of Tehran
154 | Old Dominion University
155 | Kyiv Unisersity of Oriental Language
156 | Babcock University
157 | University of Essex
158 | Kharkiv National University of Radio Electronics (Ukraine)
159 | Kaunas Technology University
160 | University of Buenos Aires
161 | University of Jaffna.
162 | R V College of Engineering, Bangalore, India for BE in Instrumentation Technology
163 | Beloit College
164 | UCLA
165 | University of Chicago
166 | University of Sciences and Technology of Oran. Mohamed Boudiaf (USTO-MB).
167 | Zagazig University, Egypt
168 | University of Alberta
169 | Belorussian State University
170 | Jones International University (online) Illinois State Univeristy
171 | University of Florida
172 | Too many to mention.
173 | University of Kerala, India
174 | Politecnico di Milano
175 | Vilnius Gediminas Technical University
176 | Madras university/ Bharthidasan University in India .
177 | Universidade Tecnica de Lisboa - Instituto Superior Técnico
178 | Does not apply.
179 | Stellenbosch University
180 | imt ghazIABAD INDIA
181 | University of Pennsylvania
182 | National Institute of Technology, Jalandhar (India)
183 | Universidad ICESI
184 | Virginia Tech
185 | arizona state university
186 | Universidad del Valle de Guatemala
187 | Mykolas Romeris University, Vilnius, Lithuania
188 | BSU
189 | Distance Learning Center at the Technical University of Kaiserslautern in Germany
190 | Ain shams university, Cairo, Egypt
191 | Universidad Nacional de Colombia
192 | Saint-Petersburg Polytechnic Univesity
193 | NAIT (Northern Alberta Institute of Technology)
194 | Wayne State took courses at U of M
195 | Universidad Nacional, Costa Rica
196 | Marietta College (Ohio) Northwestern University
197 | Grandville
198 | Portland State University, Oregon Institute of Technology
199 | Malayer Azad University, Iran
200 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/where.html:
--------------------------------------------------------------------------------
(The HTML and JavaScript markup of this file was stripped in extraction. As
the README above describes, where.html is a page titled "A Map of
Information" that reads the data in where.js and plots each point on a
Google Map.)
--------------------------------------------------------------------------------
/EXAMPLE CODE/geodata/where.js:
--------------------------------------------------------------------------------
1 | myData = [
2 | [42.340075,-71.0895367, 'Northeastern, Boston, MA 02115, USA'],
3 | [38.2113643,-85.7470011, 'Bradley Ave, Louisville, KY, USA'],
4 | [32.778949,35.019648, 'Technion/ Sports Building, Haifa'],
5 | [18.4574518,73.8837999, 'Vishwakarma Institutes Play Ground, Yashodhan Society, Kapil Nagar, Kondhwa Budrukh, Vishwakarma, Maharashtra 411048, India'],
6 | [33.1561058,131.826132, 'Japan, 〒875-0002 Ōita-ken, Usuki-shi, Shitanoe, 1232−2 UMD'],
7 | [42.4036847,-71.120482, 'South Hall Tufts University, 30 Lower Campus Rd, Somerville, MA 02144, USA'],
8 | [-37.914517,145.1303881, 'Monash College, Wellington Rd, Clayton VIC 3168, Australia'],
9 | [53.2948229,69.4047872, 'Kokshetau 020000, Kazakhstan'],
10 | [40.7127837,-74.0059413, 'New York, NY, USA'],
11 | [52.2869741,104.3050183, 'Irkutsk, Irkutsk Oblast, Russia']
12 | ];
13 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/geojson.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import json
3 |
4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
5 | #serviceurl = 'http://python-data.dr-chuck.net/geojson?'
6 |
7 | while True:
8 | address = raw_input('Enter location: ')
9 | if len(address) < 1 : break
10 |
11 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address})
12 | print 'Retrieving', url
13 | uh = urllib.urlopen(url)
14 | data = uh.read()
15 | print 'Retrieved',len(data),'characters'
16 |
17 | try: js = json.loads(str(data))
18 | except: js = None
19 |     if js is None or 'status' not in js or js['status'] != 'OK':
20 | print '==== Failure To Retrieve ===='
21 | print data
22 | continue
23 |
24 | print json.dumps(js, indent=4)
25 |
26 | lat = js["results"][0]["geometry"]["location"]["lat"]
27 | lng = js["results"][0]["geometry"]["location"]["lng"]
28 | print 'lat',lat,'lng',lng
29 | location = js['results'][0]['formatted_address']
30 | print location
31 |
32 |
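urllib.urlencode() above takes care of percent-encoding the address for the
query string; a quick check of what it produces (Python 2):

    import urllib
    print urllib.urlencode({'address': 'Ann Arbor, MI'})
    # address=Ann+Arbor%2C+MI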
--------------------------------------------------------------------------------
/EXAMPLE CODE/geoxml.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import xml.etree.ElementTree as ET
3 |
4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/xml?'
5 |
6 | while True:
7 | address = raw_input('Enter location: ')
8 | if len(address) < 1 : break
9 |
10 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address})
11 | print 'Retrieving', url
12 | uh = urllib.urlopen(url)
13 | data = uh.read()
14 | print 'Retrieved',len(data),'characters'
15 | print data
16 | tree = ET.fromstring(data)
17 |
18 |
19 | results = tree.findall('result')
20 | lat = results[0].find('geometry').find('location').find('lat').text
21 | lng = results[0].find('geometry').find('location').find('lng').text
22 | location = results[0].find('formatted_address').text
23 |
24 | print 'lat',lat,'lng',lng
25 | print location
26 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane.zip
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/README.txt:
--------------------------------------------------------------------------------
1 | Analyzing an EMAIL Archive and visualizing the data using the
2 | D3 JavaScript library
3 |
4 | Here is a copy of the Sakai Developer Mailing list from 2006-2014.
5 |
6 | http://mbox.dr-chuck.net/
7 |
8 | You should install the SQLite browser to view and modify the databases from:
9 |
10 | http://sqlitebrowser.org/
11 |
12 | The base URL is hard-coded in gmane.py. Make sure to delete the
13 | content.sqlite file if you switch the base URL. The gmane.py file
14 | operates as a spider in that it runs slowly and retrieves one mail
15 | message per second so as to avoid getting throttled. It stores all of
16 | its data in a database and can be interrupted and re-started
17 | as often as needed. It may take many hours to pull all the data
18 | down. So you may need to restart several times.
19 |
20 | To give you a head-start, I have put up 600MB of pre-spidered Sakai
21 | email here:
22 |
23 | https://online.dr-chuck.com/files/sakai/email/content.sqlite.zip
24 |
25 | If you download and unzip this, you can "catch up with the
26 | latest" by running gmane.py.
27 |
28 | Navigate to the folder where you extracted the gmane.zip
29 |
30 | Here is a run of gmane.py getting the last five messages of the
31 | sakai developer list:
32 |
33 | Mac: python gmane.py
34 | Win: gmane.py
35 |
36 | How many messages:10
37 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443
38 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments
39 | http://mbox.dr-chuck.net/sakai.devel/6/7 3586
40 | s-githens@northwestern.edu 2005-12-09T13:32:31-06:00 re: sakaiportallogin and presense
41 | http://mbox.dr-chuck.net/sakai.devel/7/8 10600
42 | john@caret.cam.ac.uk 2005-12-09T13:42:24+00:00 re: lms/vle rants/comments
43 |
44 | The program scans content.sqlite from 1 up to the first message number not
45 | already spidered and starts spidering at that message. It continues spidering
46 | until it has spidered the desired number of messages or it reaches a page
47 | that does not appear to be a properly formatted message.
48 |
49 | Sometimes a message is missing. Perhaps administrators delete messages,
50 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit
51 | a missing message, go into the SQLite Manager and add a row with the missing id - leave
52 | all the other fields blank - and then restart gmane.py. This will unstick the
53 | spidering process and allow it to continue. These empty messages will be ignored in the next
54 | phase of the process.
55 |
56 | One nice thing is that once you have spidered all of the messages and have them in
57 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the
58 | list. gmane.py will quickly scan to the end of the already-spidered pages and check
59 | if there are new messages and then quickly retrieve those messages and add them
60 | to content.sqlite.
61 |
62 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed.
63 | This is intentional as it allows you to look at content.sqlite to debug the process.
64 | It would be a bad idea to run any queries against this database as they would be
65 | slow.
66 |
67 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw
68 | data from content.sqlite and produces a cleaned-up and well-modeled version of the
69 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X
70 | smaller) than content.sqlite because it also compresses the header and body text.
71 |
72 | Each time gmodel.py runs, it completely wipes out and re-builds index.sqlite, allowing
73 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the
74 | data cleaning process.
75 |
76 | Running gmodel.py works as follows:
77 |
78 | Mac: python gmodel.py
79 | Win: gmodel.py
80 |
81 | Loaded allsenders 1588 and mapping 28 dns mapping 1
82 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com
83 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu
84 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu
85 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu
86 | ...
87 |
88 | The gmodel.py program does a number of data cleaning steps (a short sketch of the domain-truncation rule appears at the end of this README).
89 |
90 | Domain names are truncated to two levels for .com, .org, .edu, and .net;
91 | other domain names are truncated to three levels. So si.umich.edu becomes
92 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are
93 | forced to lower case, and some of the @gmane.org addresses like the following
94 |
95 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org
96 |
97 | are converted to the real address whenever there is a matching real email
98 | address elsewhere in the message corpus.
99 |
100 | If you look in the content.sqlite database there are two tables that allow
101 | you to map both domain names and individual email addresses that change over
102 | the lifetime of the email list. For example, Steve Githens used the following
103 | email addresses over the life of the Sakai developer list:
104 |
105 | s-githens@northwestern.edu
106 | sgithens@cam.ac.uk
107 | swgithen@mtu.edu
108 |
109 | We can add two entries to the Mapping table
110 |
111 | s-githens@northwestern.edu -> swgithen@mtu.edu
112 | sgithens@cam.ac.uk -> swgithen@mtu.edu
113 |
114 | And so all the mail messages will be collected under one sender even if
115 | they used several email addresses over the lifetime of the mailing list.
116 |
117 | You can also make similar entries in the DNSMapping table if there are multiple
118 | DNS names you want mapped to a single DNS. In the Sakai data I add the following
119 | mapping:
120 |
121 | iupui.edu -> indiana.edu
122 |
123 | So all the folks from the various Indiana University campuses are tracked together.
124 |
125 | You can re-run the gmodel.py over and over as you look at the data, and add mappings
126 | to make the data cleaner and cleaner. When you are done, you will have a nicely
127 | indexed version of the email in index.sqlite. This is the file to use to do data
128 | analysis. With this file, data analysis will be really quick.
129 |
130 | The first, simplest data analysis is to ask "who does the most?" and "which
131 | organization does the most?". This is done using gbasic.py:
132 |
133 | Mac: python gbasic.py
134 | Win: gbasic.py
135 |
136 | How many to dump? 5
137 | Loaded messages= 51330 subjects= 25033 senders= 1584
138 |
139 | Top 5 Email list participants
140 | steve.swinsburg@gmail.com 2657
141 | azeckoski@unicon.net 1742
142 | ieb@tfd.co.uk 1591
143 | csev@umich.edu 1304
144 | david.horwitz@uct.ac.za 1184
145 |
146 | Top 5 Email list organizations
147 | gmail.com 7339
148 | umich.edu 6243
149 | uct.ac.za 2451
150 | indiana.edu 2258
151 | unicon.net 2055
152 |
153 | You can look at the data in index.sqlite and if you find a problem, you
154 | can update the Mapping table and DNSMapping table in content.sqlite and
155 | re-run gmodel.py.
156 |
157 | There is a simple visualization of the word frequency in the subject lines
158 | in the file gword.py:
159 |
160 | Mac: python gword.py
161 | Win: gword.py
162 |
163 | Range of counts: 33229 129
164 | Output written to gword.js
165 |
166 | This produces the file gword.js which you can visualize using the file
167 | gword.htm.
168 |
169 | A second visualization is in gline.py. It visualizes email participation by
170 | organizations over time.
171 |
172 | Mac: python gline.py
173 | Win: gline.py
174 |
175 | Loaded messages= 51330 subjects= 25033 senders= 1584
176 | Top 10 Organizations
177 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk']
178 | Output written to gline.js
179 |
180 | Its output is written to gline.js which is visualized using gline.htm.
181 | If you have a problem with gline.htm, you can try gline2.htm or gline3.htm
182 | to visualize your data.
183 |
184 | Some URLs for visualization ideas:
185 |
186 | https://developers.google.com/chart/
187 |
188 | https://developers.google.com/chart/interactive/docs/gallery/motionchart
189 |
190 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats
191 |
192 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline
193 |
194 | http://bost.ocks.org/mike/uberdata/
195 |
196 | http://mbostock.github.io/d3/talk/20111018/calendar.html
197 |
198 | http://nltk.org/install.html
199 |
200 | As always - comments welcome.
201 |
202 | -- Dr. Chuck
203 | Sun Sep 29 00:11:01 EDT 2013
204 |
205 |
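As referenced above, here is a small self-contained sketch of the
domain-truncation rule gmodel.py applies; the sample domains come from this
README:

    def truncate_domain(dns):
        # Keep two levels for .com/.org/.edu/.net, three otherwise,
        # and force lower case, per the rules described above.
        dns = dns.lower()
        pieces = dns.split('.')
        if dns.endswith(('.edu', '.com', '.org', '.net')):
            return '.'.join(pieces[-2:])
        return '.'.join(pieces[-3:])

    print truncate_domain('si.umich.edu')      # umich.edu
    print truncate_domain('caret.cam.ac.uk')   # cam.ac.uk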
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/d3.layout.cloud.js:
--------------------------------------------------------------------------------
1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
3 | (function(exports) {
4 | function cloud() {
5 | var size = [256, 256],
6 | text = cloudText,
7 | font = cloudFont,
8 | fontSize = cloudFontSize,
9 | fontStyle = cloudFontNormal,
10 | fontWeight = cloudFontNormal,
11 | rotate = cloudRotate,
12 | padding = cloudPadding,
13 | spiral = archimedeanSpiral,
14 | words = [],
15 | timeInterval = Infinity,
16 | event = d3.dispatch("word", "end"),
17 | timer = null,
18 | cloud = {};
19 |
20 | cloud.start = function() {
21 | var board = zeroArray((size[0] >> 5) * size[1]),
22 | bounds = null,
23 | n = words.length,
24 | i = -1,
25 | tags = [],
26 | data = words.map(function(d, i) {
27 | d.text = text.call(this, d, i);
28 | d.font = font.call(this, d, i);
29 | d.style = fontStyle.call(this, d, i);
30 | d.weight = fontWeight.call(this, d, i);
31 | d.rotate = rotate.call(this, d, i);
32 | d.size = ~~fontSize.call(this, d, i);
33 | d.padding = cloudPadding.call(this, d, i);
34 | return d;
35 | }).sort(function(a, b) { return b.size - a.size; });
36 |
37 | if (timer) clearInterval(timer);
38 | timer = setInterval(step, 0);
39 | step();
40 |
41 | return cloud;
42 |
43 | function step() {
44 | var start = +new Date,
45 | d;
46 | while (+new Date - start < timeInterval && ++i < n && timer) {
47 | d = data[i];
48 | d.x = (size[0] * (Math.random() + .5)) >> 1;
49 | d.y = (size[1] * (Math.random() + .5)) >> 1;
50 | cloudSprite(d, data, i);
51 | if (place(board, d, bounds)) {
52 | tags.push(d);
53 | event.word(d);
54 | if (bounds) cloudBounds(bounds, d);
55 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}];
56 | // Temporary hack
57 | d.x -= size[0] >> 1;
58 | d.y -= size[1] >> 1;
59 | }
60 | }
61 | if (i >= n) {
62 | cloud.stop();
63 | event.end(tags, bounds);
64 | }
65 | }
66 | }
67 |
68 | cloud.stop = function() {
69 | if (timer) {
70 | clearInterval(timer);
71 | timer = null;
72 | }
73 | return cloud;
74 | };
75 |
76 | cloud.timeInterval = function(x) {
77 | if (!arguments.length) return timeInterval;
78 | timeInterval = x == null ? Infinity : x;
79 | return cloud;
80 | };
81 |
82 | function place(board, tag, bounds) {
83 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}],
84 | startX = tag.x,
85 | startY = tag.y,
86 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]),
87 | s = spiral(size),
88 | dt = Math.random() < .5 ? 1 : -1,
89 | t = -dt,
90 | dxdy,
91 | dx,
92 | dy;
93 |
94 | while (dxdy = s(t += dt)) {
95 | dx = ~~dxdy[0];
96 | dy = ~~dxdy[1];
97 |
98 | if (Math.min(dx, dy) > maxDelta) break;
99 |
100 | tag.x = startX + dx;
101 | tag.y = startY + dy;
102 |
103 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 ||
104 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue;
105 | // TODO only check for collisions within current bounds.
106 | if (!bounds || !cloudCollide(tag, board, size[0])) {
107 | if (!bounds || collideRects(tag, bounds)) {
108 | var sprite = tag.sprite,
109 | w = tag.width >> 5,
110 | sw = size[0] >> 5,
111 | lx = tag.x - (w << 4),
112 | sx = lx & 0x7f,
113 | msx = 32 - sx,
114 | h = tag.y1 - tag.y0,
115 | x = (tag.y + tag.y0) * sw + (lx >> 5),
116 | last;
117 | for (var j = 0; j < h; j++) {
118 | last = 0;
119 | for (var i = 0; i <= w; i++) {
120 | board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0);
121 | }
122 | x += sw;
123 | }
124 | delete tag.sprite;
125 | return true;
126 | }
127 | }
128 | }
129 | return false;
130 | }
131 |
132 | cloud.words = function(x) {
133 | if (!arguments.length) return words;
134 | words = x;
135 | return cloud;
136 | };
137 |
138 | cloud.size = function(x) {
139 | if (!arguments.length) return size;
140 | size = [+x[0], +x[1]];
141 | return cloud;
142 | };
143 |
144 | cloud.font = function(x) {
145 | if (!arguments.length) return font;
146 | font = d3.functor(x);
147 | return cloud;
148 | };
149 |
150 | cloud.fontStyle = function(x) {
151 | if (!arguments.length) return fontStyle;
152 | fontStyle = d3.functor(x);
153 | return cloud;
154 | };
155 |
156 | cloud.fontWeight = function(x) {
157 | if (!arguments.length) return fontWeight;
158 | fontWeight = d3.functor(x);
159 | return cloud;
160 | };
161 |
162 | cloud.rotate = function(x) {
163 | if (!arguments.length) return rotate;
164 | rotate = d3.functor(x);
165 | return cloud;
166 | };
167 |
168 | cloud.text = function(x) {
169 | if (!arguments.length) return text;
170 | text = d3.functor(x);
171 | return cloud;
172 | };
173 |
174 | cloud.spiral = function(x) {
175 | if (!arguments.length) return spiral;
176 | spiral = spirals[x + ""] || x;
177 | return cloud;
178 | };
179 |
180 | cloud.fontSize = function(x) {
181 | if (!arguments.length) return fontSize;
182 | fontSize = d3.functor(x);
183 | return cloud;
184 | };
185 |
186 | cloud.padding = function(x) {
187 | if (!arguments.length) return padding;
188 | padding = d3.functor(x);
189 | return cloud;
190 | };
191 |
192 | return d3.rebind(cloud, event, "on");
193 | }
194 |
195 | function cloudText(d) {
196 | return d.text;
197 | }
198 |
199 | function cloudFont() {
200 | return "serif";
201 | }
202 |
203 | function cloudFontNormal() {
204 | return "normal";
205 | }
206 |
207 | function cloudFontSize(d) {
208 | return Math.sqrt(d.value);
209 | }
210 |
211 | function cloudRotate() {
212 | return (~~(Math.random() * 6) - 3) * 30;
213 | }
214 |
215 | function cloudPadding() {
216 | return 1;
217 | }
218 |
219 | // Fetches a monochrome sprite bitmap for the specified text.
220 | // Load in batches for speed.
221 | function cloudSprite(d, data, di) {
222 | if (d.sprite) return;
223 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio);
224 | var x = 0,
225 | y = 0,
226 | maxh = 0,
227 | n = data.length;
228 | di--;
229 | while (++di < n) {
230 | d = data[di];
231 | c.save();
232 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font;
233 | var w = c.measureText(d.text + "m").width * ratio,
234 | h = d.size << 1;
235 | if (d.rotate) {
236 | var sr = Math.sin(d.rotate * cloudRadians),
237 | cr = Math.cos(d.rotate * cloudRadians),
238 | wcr = w * cr,
239 | wsr = w * sr,
240 | hcr = h * cr,
241 | hsr = h * sr;
242 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5;
243 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr));
244 | } else {
245 | w = (w + 0x1f) >> 5 << 5;
246 | }
247 | if (h > maxh) maxh = h;
248 | if (x + w >= (cw << 5)) {
249 | x = 0;
250 | y += maxh;
251 | maxh = 0;
252 | }
253 | if (y + h >= ch) break;
254 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio);
255 | if (d.rotate) c.rotate(d.rotate * cloudRadians);
256 | c.fillText(d.text, 0, 0);
257 | c.restore();
258 | d.width = w;
259 | d.height = h;
260 | d.xoff = x;
261 | d.yoff = y;
262 | d.x1 = w >> 1;
263 | d.y1 = h >> 1;
264 | d.x0 = -d.x1;
265 | d.y0 = -d.y1;
266 | x += w;
267 | }
268 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data,
269 | sprite = [];
270 | while (--di >= 0) {
271 | d = data[di];
272 | var w = d.width,
273 | w32 = w >> 5,
274 | h = d.y1 - d.y0,
275 | p = d.padding;
276 | // Zero the buffer
277 | for (var i = 0; i < h * w32; i++) sprite[i] = 0;
278 | x = d.xoff;
279 | if (x == null) return;
280 | y = d.yoff;
281 | var seen = 0,
282 | seenRow = -1;
283 | for (var j = 0; j < h; j++) {
284 | for (var i = 0; i < w; i++) {
285 | var k = w32 * j + (i >> 5),
286 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0;
287 | if (p) {
288 | if (j) sprite[k - w32] |= m;
289 |           if (j < h - 1) sprite[k + w32] |= m;
290 | m |= (m << 1) | (m >> 1);
291 | }
292 | sprite[k] |= m;
293 | seen |= m;
294 | }
295 | if (seen) seenRow = j;
296 | else {
297 | d.y0++;
298 | h--;
299 | j--;
300 | y++;
301 | }
302 | }
303 | d.y1 = d.y0 + seenRow;
304 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32);
305 | }
306 | }
307 |
308 | // Use mask-based collision detection.
309 | function cloudCollide(tag, board, sw) {
310 | sw >>= 5;
311 | var sprite = tag.sprite,
312 | w = tag.width >> 5,
313 | lx = tag.x - (w << 4),
314 | sx = lx & 0x7f,
315 | msx = 32 - sx,
316 | h = tag.y1 - tag.y0,
317 | x = (tag.y + tag.y0) * sw + (lx >> 5),
318 | last;
319 | for (var j = 0; j < h; j++) {
320 | last = 0;
321 | for (var i = 0; i <= w; i++) {
322 | if (((last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0))
323 | & board[x + i]) return true;
324 | }
325 | x += sw;
326 | }
327 | return false;
328 | }
329 |
330 | function cloudBounds(bounds, d) {
331 | var b0 = bounds[0],
332 | b1 = bounds[1];
333 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0;
334 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0;
335 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1;
336 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1;
337 | }
338 |
339 | function collideRects(a, b) {
340 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y;
341 | }
342 |
343 | function archimedeanSpiral(size) {
344 | var e = size[0] / size[1];
345 | return function(t) {
346 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)];
347 | };
348 | }
349 |
350 | function rectangularSpiral(size) {
351 | var dy = 4,
352 | dx = dy * size[0] / size[1],
353 | x = 0,
354 | y = 0;
355 | return function(t) {
356 | var sign = t < 0 ? -1 : 1;
357 | // See triangular numbers: T_n = n * (n + 1) / 2.
358 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) {
359 | case 0: x += dx; break;
360 | case 1: y += dy; break;
361 | case 2: x -= dx; break;
362 | default: y -= dy; break;
363 | }
364 | return [x, y];
365 | };
366 | }
367 |
368 | // TODO reuse arrays?
369 | function zeroArray(n) {
370 | var a = [],
371 | i = -1;
372 | while (++i < n) a[i] = 0;
373 | return a;
374 | }
375 |
376 | var cloudRadians = Math.PI / 180,
377 | cw = 1 << 11 >> 5,
378 | ch = 1 << 11,
379 | canvas,
380 | ratio = 1;
381 |
382 | if (typeof document !== "undefined") {
383 | canvas = document.createElement("canvas");
384 | canvas.width = 1;
385 | canvas.height = 1;
386 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2);
387 | canvas.width = (cw << 5) / ratio;
388 | canvas.height = ch / ratio;
389 | } else {
390 | // node-canvas support
391 | var Canvas = require("canvas");
392 | canvas = new Canvas(cw << 5, ch);
393 | }
394 |
395 | var c = canvas.getContext("2d"),
396 | spirals = {
397 | archimedean: archimedeanSpiral,
398 | rectangular: rectangularSpiral
399 | };
400 | c.fillStyle = "red";
401 | c.textAlign = "center";
402 |
403 | exports.cloud = cloud;
404 | })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports);
405 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gbasic.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | howmany = int(raw_input("How many to dump? "))
7 |
8 | conn = sqlite3.connect('index.sqlite')
9 | conn.text_factory = str
10 | cur = conn.cursor()
11 |
12 | cur.execute('''SELECT Messages.id, sender FROM Messages
13 | JOIN Senders ON Messages.sender_id = Senders.id''')
14 |
15 | sendcounts = dict()
16 | sendorgs = dict()
17 | for message in cur :
18 | sender = message[1]
19 | sendcounts[sender] = sendcounts.get(sender,0) + 1
20 | pieces = sender.split("@")
21 | if len(pieces) != 2 : continue
22 | dns = pieces[1]
23 | sendorgs[dns] = sendorgs.get(dns,0) + 1
24 |
25 | print ''
26 | print 'Top',howmany,'Email list participants'
27 |
28 | x = sorted(sendcounts, key=sendcounts.get, reverse=True)
29 | for k in x[:howmany]:
30 | print k, sendcounts[k]
31 | if sendcounts[k] < 10 : break
32 |
33 | print ''
34 | print 'Top',howmany,'Email list organizations'
35 |
36 | x = sorted(sendorgs, key=sendorgs.get, reverse=True)
37 | for k in x[:howmany]:
38 | print k, sendorgs[k]
39 | if sendorgs[k] < 10 : break
40 |
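sorted(d, key=d.get, reverse=True) above sorts a dictionary's keys from
most to least frequent; a quick illustration with made-up data:

    d = {'a': 3, 'b': 9, 'c': 1}
    print sorted(d, key=d.get, reverse=True)   # ['b', 'a', 'c']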
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gline.htm:
--------------------------------------------------------------------------------
(The HTML and JavaScript markup of this file was stripped in extraction. As
the README above describes, gline.htm loads gline.js and draws the
per-organization participation-over-time chart in a browser.)
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gline.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | conn = sqlite3.connect('index.sqlite')
7 | conn.text_factory = str
8 | cur = conn.cursor()
9 |
10 | # Determine the top ten organizations
11 | cur.execute('''SELECT Messages.id, sender FROM Messages
12 | JOIN Senders ON Messages.sender_id = Senders.id''')
13 |
14 | sendorgs = dict()
15 | for message_row in cur :
16 | sender = message_row[1]
17 | pieces = sender.split("@")
18 | if len(pieces) != 2 : continue
19 | dns = pieces[1]
20 | sendorgs[dns] = sendorgs.get(dns,0) + 1
21 |
22 | # pick the top schools
23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True)
24 | orgs = orgs[:10]
25 | print "Top 10 Organizations"
26 | print orgs
27 | # orgs = ['total'] + orgs
28 |
29 | # Read through the messages
30 | counts = dict()
31 | months = list()
32 |
33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages
34 | JOIN Senders ON Messages.sender_id = Senders.id''')
35 |
36 | for message_row in cur :
37 | sender = message_row[1]
38 | pieces = sender.split("@")
39 | if len(pieces) != 2 : continue
40 | dns = pieces[1]
41 | if dns not in orgs : continue
42 | month = message_row[2][:7]
43 | if month not in months : months.append(month)
44 | key = (month, dns)
45 | counts[key] = counts.get(key,0) + 1
46 | tkey = (month, 'total')
47 | counts[tkey] = counts.get(tkey,0) + 1
48 |
49 | months.sort()
50 | print counts
51 | print months
52 |
53 | fhand = open('gline.js','w')
54 | fhand.write("gline = [ ['Month'")
55 | for org in orgs:
56 | fhand.write(",'"+org+"'")
57 | fhand.write("]")
58 |
59 | # for month in months[1:-1]:
60 | for month in months:
61 | fhand.write(",\n['"+month+"'")
62 | for org in orgs:
63 | key = (month, org)
64 | val = counts.get(key,0)
65 | fhand.write(","+str(val))
66 | fhand.write("]");
67 |
68 | fhand.write("\n];\n")
69 |
70 | print "Data written to gline.js"
71 | print "Open gline.htm in a browser to view"
72 |
73 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gline2.htm:
--------------------------------------------------------------------------------
(Markup stripped in extraction. gline2.htm is an alternate page for viewing
the gline.js data, offered in the README as a fallback if gline.htm does
not work.)
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gline3.htm:
--------------------------------------------------------------------------------
(Markup stripped in extraction. gline3.htm, a page titled "Line Chart", is
a second fallback for visualizing the gline.js data in a browser.)
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gmane.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import sqlite3
3 | import time
4 | import ssl
5 | import urllib
6 | from urlparse import urljoin
7 | from urlparse import urlparse
8 | import re
9 | from datetime import datetime, timedelta
10 |
11 | # Not all systems have this so conditionally define parser
12 | try:
13 | import dateutil.parser as parser
14 | except:
15 | pass
16 |
17 | def parsemaildate(md) :
18 | # See if we have dateutil
19 | try:
20 |         pdate = parser.parse(md)   # parse the argument, not a global
21 | test_at = pdate.isoformat()
22 | return test_at
23 | except:
24 | pass
25 |
26 | # Non-dateutil version - we try our best
27 |
28 | pieces = md.split()
29 | notz = " ".join(pieces[:4]).strip()
30 |
31 | # Try a bunch of format variations - strptime() is *lame*
32 | dnotz = None
33 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
34 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
35 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
36 | try:
37 | dnotz = datetime.strptime(notz, form)
38 | break
39 | except:
40 | continue
41 |
42 | if dnotz is None :
43 | # print 'Bad Date:',md
44 | return None
45 |
46 | iso = dnotz.isoformat()
47 |
48 | tz = "+0000"
49 | try:
50 | tz = pieces[4]
51 | ival = int(tz) # Only want numeric timezone values
52 | if tz == '-0000' : tz = '+0000'
53 | tzh = tz[:3]
54 | tzm = tz[3:]
55 | tz = tzh+":"+tzm
56 | except:
57 | pass
58 |
59 | return iso+tz
60 |
61 | conn = sqlite3.connect('content.sqlite')
62 | cur = conn.cursor()
63 | conn.text_factory = str
64 |
65 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/"
66 |
67 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages
68 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT,
69 | subject TEXT, headers TEXT, body TEXT)''')
70 |
71 | start = 0
72 | cur.execute('SELECT max(id) FROM Messages')
73 | try:
74 | row = cur.fetchone()
75 | if row[0] is not None:
76 | start = row[0]
77 | except:
78 | start = 0
79 | row = None
80 |
81 | print start
82 |
83 | many = 0
84 |
85 | # Skip up to five messages
86 | skip = 5
87 | while True:
88 | if ( many < 1 ) :
89 | sval = raw_input('How many messages:')
90 | if ( len(sval) < 1 ) : break
91 | many = int(sval)
92 |
93 | start = start + 1
94 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) )
95 | try:
96 | row = cur.fetchone()
97 | if row is not None : continue
98 | except:
99 | row = None
100 |
101 | many = many - 1
102 | url = baseurl + str(start) + '/' + str(start + 1)
103 |
104 | try:
105 | # Deal with SSL certificate anomalies Python > 2.7
106 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
107 | # document = urllib.urlopen(url, context=scontext)
108 |
109 | document = urllib.urlopen(url)
110 |
111 | text = document.read()
112 | if document.getcode() != 200 :
113 | print "Error code=",document.getcode(), url
114 | break
115 | except KeyboardInterrupt:
116 | print ''
117 | print 'Program interrupted by user...'
118 | break
119 | except:
120 | print "Unable to retrieve or parse page",url
121 | print sys.exc_info()[0]
122 | break
123 |
124 | print url,len(text)
125 |
126 | if not text.startswith("From "):
127 | if skip < 1 :
128 | print text
129 | print "End of mail stream reached..."
130 | quit ()
131 | print " Skipping badly formed message"
132 | skip = skip-1
133 | continue
134 |
135 | pos = text.find("\n\n")
136 | if pos > 0 :
137 | hdr = text[:pos]
138 | body = text[pos+2:]
139 | else:
140 | print text
141 | print "Could not find break between headers and body"
142 | break
143 |
144 | skip = 5 # reset skip count
145 |
146 | email = None
147 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
148 | if len(x) == 1 :
149 | email = x[0];
150 | email = email.strip().lower()
151 | email = email.replace("<","")
152 | else:
153 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
154 | if len(x) == 1 :
155 | email = x[0];
156 | email = email.strip().lower()
157 | email = email.replace("<","")
158 |
159 |     sent_at = None
160 |     y = re.findall('\nDate: .*, (.*)\n', hdr)
161 | if len(y) == 1 :
162 | tdate = y[0]
163 | tdate = tdate[:26]
164 | try:
165 | sent_at = parsemaildate(tdate)
166 | except:
167 | print text
168 | print "Parse fail",tdate
169 | break
170 |
171 | subject = None
172 |     z = re.findall('\nSubject: (.*)\n', hdr)
173 |     if len(z) == 1 : subject = z[0].strip().lower()
174 |
175 | print " ",email,sent_at,subject
176 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body)
177 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body))
178 |
179 | # Only commit every 50th record
180 | # if (many % 50) == 0 : conn.commit()
181 | time.sleep(1)
182 |
183 | conn.commit()
184 | cur.close()
185 |
186 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gmodel.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import re
5 | import zlib
6 | from datetime import datetime, timedelta
7 | # Not all systems have this
8 | try:
9 | import dateutil.parser as parser
10 | except:
11 | pass
12 |
13 | dnsmapping = dict()
14 | mapping = dict()
15 |
16 | def fixsender(sender,allsenders=None) :
17 | global dnsmapping
18 | global mapping
19 | if sender is None : return None
20 | sender = sender.strip().lower()
21 | sender = sender.replace('<','').replace('>','')
22 |
23 | # Check if we have a hacked gmane.org from address
24 | if allsenders is not None and sender.endswith('gmane.org') :
25 | pieces = sender.split('-')
26 | realsender = None
27 | for s in allsenders:
28 | if s.startswith(pieces[0]) :
29 | realsender = sender
30 | sender = s
31 | # print realsender, sender
32 | break
33 | if realsender is None :
34 | for s in mapping:
35 | if s.startswith(pieces[0]) :
36 | realsender = sender
37 | sender = mapping[s]
38 | # print realsender, sender
39 | break
40 | if realsender is None : sender = pieces[0]
41 |
42 | mpieces = sender.split("@")
43 | if len(mpieces) != 2 : return sender
44 | dns = mpieces[1]
45 | x = dns
46 | pieces = dns.split(".")
47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") :
48 | dns = ".".join(pieces[-2:])
49 | else:
50 | dns = ".".join(pieces[-3:])
51 | # if dns != x : print x,dns
52 | # if dns != dnsmapping.get(dns,dns) : print dns,dnsmapping.get(dns,dns)
53 | dns = dnsmapping.get(dns,dns)
54 | return mpieces[0] + '@' + dns
55 |
56 | def parsemaildate(md) :
57 | # See if we have dateutil
58 | try:
59 |         pdate = parser.parse(md)
60 | test_at = pdate.isoformat()
61 | return test_at
62 | except:
63 | pass
64 |
65 | # Non-dateutil version - we try our best
66 |
67 | pieces = md.split()
68 | notz = " ".join(pieces[:4]).strip()
69 |
70 | # Try a bunch of format variations - strptime() is *lame*
71 | dnotz = None
72 |     for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
73 |         '%d %b %y %H:%M:%S',
74 |         '%d %b %y %H:%M' ] :
75 | try:
76 | dnotz = datetime.strptime(notz, form)
77 | break
78 | except:
79 | continue
80 |
81 | if dnotz is None :
82 | # print 'Bad Date:',md
83 | return None
84 |
85 | iso = dnotz.isoformat()
86 |
87 | tz = "+0000"
88 | try:
89 | tz = pieces[4]
90 | ival = int(tz) # Only want numeric timezone values
91 | if tz == '-0000' : tz = '+0000'
92 | tzh = tz[:3]
93 | tzm = tz[3:]
94 | tz = tzh+":"+tzm
95 | except:
96 | pass
97 |
98 | return iso+tz
99 |
100 | # Parse out the info...
101 | def parseheader(hdr, allsenders=None):
102 | if hdr is None or len(hdr) < 1 : return None
103 | sender = None
104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
105 | if len(x) >= 1 :
106 | sender = x[0]
107 | else:
108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
109 | if len(x) >= 1 :
110 | sender = x[0]
111 |
112 | # normalize the domain name of Email addresses
113 | sender = fixsender(sender, allsenders)
114 |
115 | date = None
116 | y = re.findall('\nDate: .*, (.*)\n', hdr)
117 | sent_at = None
118 | if len(y) >= 1 :
119 | tdate = y[0]
120 | tdate = tdate[:26]
121 | try:
122 | sent_at = parsemaildate(tdate)
123 | except Exception, e:
124 | # print 'Date ignored ',tdate, e
125 | return None
126 |
127 | subject = None
128 | z = re.findall('\nSubject: (.*)\n', hdr)
129 | if len(z) >= 1 : subject = z[0].strip().lower()
130 |
131 | guid = None
132 | z = re.findall('\nMessage-ID: (.*)\n', hdr)
133 | if len(z) >= 1 : guid = z[0].strip().lower()
134 |
135 | if sender is None or sent_at is None or subject is None or guid is None :
136 | return None
137 | return (guid, sender, subject, sent_at)
138 |
139 | # Open the output database and create empty tables
140 | conn = sqlite3.connect('index.sqlite')
141 | conn.text_factory = str
142 | cur = conn.cursor()
143 |
144 | cur.execute('''DROP TABLE IF EXISTS Messages ''')
145 | cur.execute('''DROP TABLE IF EXISTS Senders ''')
146 | cur.execute('''DROP TABLE IF EXISTS Subjects ''')
147 | cur.execute('''DROP TABLE IF EXISTS Replies ''')
148 |
149 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages
150 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER,
151 | sender_id INTEGER, subject_id INTEGER,
152 | headers BLOB, body BLOB)''')
153 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders
154 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''')
155 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects
156 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''')
157 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies
158 | (from_id INTEGER, to_id INTEGER)''')
159 |
160 | # Open the mapping information
161 | conn_1 = sqlite3.connect('mapping.sqlite')
162 | conn_1.text_factory = str
163 | cur_1 = conn_1.cursor()
164 |
165 | # Load up the mapping information into memory structures
166 | cur_1.execute('''SELECT old,new FROM DNSMapping''')
167 | for message_row in cur_1 :
168 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower()
169 |
170 | mapping = dict()
171 | cur_1.execute('''SELECT old,new FROM Mapping''')
172 | for message_row in cur_1 :
173 | old = fixsender(message_row[0])
174 | new = fixsender(message_row[1])
175 |     mapping[old] = new
176 |
177 | cur_1.close()
178 |
179 | # Open the raw data retrieved from the network
180 | conn_2 = sqlite3.connect('content.sqlite')
181 | conn_2.text_factory = str
182 | cur_2 = conn_2.cursor()
183 |
184 | allsenders = list()
185 | cur_2.execute('''SELECT email FROM Messages''')
186 | for message_row in cur_2 :
187 | sender = fixsender(message_row[0])
188 | if sender is None : continue
189 | if 'gmane.org' in sender : continue
190 | if sender in allsenders: continue
191 | allsenders.append(sender)
192 |
193 | print "Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)
194 |
195 | cur_2.execute('''SELECT headers, body, sent_at
196 | FROM Messages ORDER BY sent_at''')
197 |
198 | senders = dict()
199 | subjects = dict()
200 | guids = dict()
201 |
202 | count = 0
203 |
204 | for message_row in cur_2 :
205 | hdr = message_row[0]
206 | parsed = parseheader(hdr, allsenders)
207 | if parsed is None: continue
208 | (guid, sender, subject, sent_at) = parsed
209 |
210 | # Apply the sender mapping
211 | sender = mapping.get(sender,sender)
212 |
213 | count = count + 1
214 | if count % 250 == 1 : print count,sent_at, sender
215 | # print guid, sender, subject, sent_at
216 |
217 | if 'gmane.org' in sender:
218 | print "Error in sender ===", sender
219 |
220 | sender_id = senders.get(sender,None)
221 | subject_id = subjects.get(subject,None)
222 | guid_id = guids.get(guid,None)
223 |
224 | if sender_id is None :
225 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) )
226 | conn.commit()
227 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, ))
228 | try:
229 | row = cur.fetchone()
230 | sender_id = row[0]
231 | senders[sender] = sender_id
232 | except:
233 | print 'Could not retrieve sender id',sender
234 | break
235 | if subject_id is None :
236 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) )
237 | conn.commit()
238 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, ))
239 | try:
240 | row = cur.fetchone()
241 | subject_id = row[0]
242 | subjects[subject] = subject_id
243 | except:
244 | print 'Could not retrieve subject id',subject
245 | break
246 | # print sender_id, subject_id
247 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
248 | ( guid, sender_id, subject_id, sent_at, zlib.compress(message_row[0]), zlib.compress(message_row[1])) )
249 | conn.commit()
250 | cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, ))
251 | try:
252 | row = cur.fetchone()
253 | message_id = row[0]
254 | guids[guid] = message_id
255 | except:
256 | print 'Could not retrieve guid id',guid
257 | break
258 |
259 | # Close the connections
260 | cur.close()
261 | cur_2.close()
262 |
263 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gword.htm:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gword.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 | import string
6 |
7 | conn = sqlite3.connect('index.sqlite')
8 | conn.text_factory = str
9 | cur = conn.cursor()
10 |
11 | cur.execute('''SELECT subject_id,subject FROM Messages
12 | JOIN Subjects ON Messages.subject_id = Subjects.id''')
13 |
14 | counts = dict()
15 | for message_row in cur :
16 | text = message_row[1]
17 | text = text.translate(None, string.punctuation)
18 | text = text.translate(None, '1234567890')
19 | text = text.strip()
20 | text = text.lower()
21 | words = text.split()
22 | for word in words:
23 | if len(word) < 4 : continue
24 | counts[word] = counts.get(word,0) + 1
25 |
26 | # Find the top 100 words
27 | words = sorted(counts, key=counts.get, reverse=True)
28 | highest = None
29 | lowest = None
30 | for w in words[:100]:
31 | if highest is None or highest < counts[w] :
32 | highest = counts[w]
33 | if lowest is None or lowest > counts[w] :
34 | lowest = counts[w]
35 | print 'Range of counts:',highest,lowest
36 |
37 | # Spread the font sizes across 20-100 based on the count
38 | bigsize = 80
39 | smallsize = 20
40 |
41 | fhand = open('gword.js','w')
42 | fhand.write("gword = [")
43 | first = True
44 | for k in words[:100]:
45 | if not first : fhand.write( ",\n")
46 | first = False
47 | size = counts[k]
48 | size = (size - lowest) / float(highest - lowest)
49 | size = int((size * bigsize) + smallsize)
50 | fhand.write("{text: '"+k+"', size: "+str(size)+"}")
51 | fhand.write( "\n];\n")
52 |
53 | print "Output written to gword.js"
54 | print "Open gword.htm in a browser to view"
55 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/gyear.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | conn = sqlite3.connect('index.sqlite')
7 | conn.text_factory = str
8 | cur = conn.cursor()
9 |
10 | # Determine the top ten organizations
11 | cur.execute('''SELECT Messages.id, sender FROM Messages
12 | JOIN Senders ON Messages.sender_id = Senders.id''')
13 |
14 | sendorgs = dict()
15 | for message_row in cur :
16 | sender = message_row[1]
17 | pieces = sender.split("@")
18 | if len(pieces) != 2 : continue
19 | dns = pieces[1]
20 | sendorgs[dns] = sendorgs.get(dns,0) + 1
21 |
22 | # pick the top organizations
23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True)
24 | orgs = orgs[:10]
25 | print "Top 10 Organizations"
26 | print orgs
27 | # orgs = ['total'] + orgs
28 |
29 | # Read through the messages
30 | counts = dict()
31 | years = list()
32 |
33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages
34 | JOIN Senders ON Messages.sender_id = Senders.id''')
35 |
36 | for message_row in cur :
37 | sender = message_row[1]
38 | pieces = sender.split("@")
39 | if len(pieces) != 2 : continue
40 | dns = pieces[1]
41 | if dns not in orgs : continue
42 | year = message_row[2][:4]
43 | if year not in years : years.append(year)
44 | key = (year, dns)
45 | counts[key] = counts.get(key,0) + 1
46 | tkey = (year, 'total')
47 | counts[tkey] = counts.get(tkey,0) + 1
48 |
49 | years.sort()
50 | print counts
51 | print years
52 |
53 | fhand = open('gline.js','w')
54 | fhand.write("gline = [ ['Year'")
55 | for org in orgs:
56 | fhand.write(",'"+org+"'")
57 | fhand.write("]")
58 |
59 | # for year in years[1:-1]:
60 | for year in years:
61 | fhand.write(",\n['"+year+"'")
62 | for org in orgs:
63 | key = (year, org)
64 | val = counts.get(key,0)
65 | fhand.write(","+str(val))
66 |     fhand.write("]")
67 |
68 | fhand.write("\n];\n")
69 |
70 | print "Data written to gline.js"
71 | print "Open gline.htm in a browser to view"
72 |
73 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/gmane/mapping.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane/mapping.sqlite
--------------------------------------------------------------------------------
/EXAMPLE CODE/grade.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter score: ')
2 | try:
3 | score = float(inp)
4 | except:
5 | score = -1
6 |
7 | if score > 1.0 or score < 0.0:
8 | print 'Bad score'
9 | elif score > 0.9:
10 | print 'A'
11 | elif score > 0.8:
12 | print 'B'
13 | elif score > 0.7:
14 | print 'C'
15 | elif score > 0.6:
16 | print 'D'
17 | else:
18 | print 'F'
19 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/graphics/histogram.py:
--------------------------------------------------------------------------------
1 | import string
2 | from graphics import *
3 |
4 | fname = raw_input("Enter file name:")
5 | if len(fname) == 0 :
6 | print "Assuming mbox-short.txt"
7 | fname = "mbox-short.txt"
8 | infile = open(fname, "r")
9 |
10 | # Set up a 24 element list of zeros
11 | totals = [0] * 24
12 | print totals
13 |
14 | # Accumulate the times
15 | for line in infile:
16 | if line[0:5] == "From " :
17 | words = line.split()
18 | time = words[5]
19 | print "Time", time
20 |
21 | # Split time
22 | tsplit = time.split(':')
23 | try :
24 | hour = int(tsplit[0])
25 | print "Hour", hour
26 | except:
27 | print "Hour not found"
28 | continue
29 |
30 | totals[hour] = totals[hour] + 1
31 | print totals
32 |
33 | bmax = max(totals)
34 | print "Maximum value", bmax
35 |
36 | ymax = ( int(bmax / 10) + 1 ) * 10
37 |
38 | print "Y-Axis Maximum", ymax
39 |
40 | win = GraphWin("Distribution of Commits "+fname, 600,400)
41 | win.setCoords(0,0,1,1)
42 |
43 | # Draw the X-Axis
44 | xaxis = Line(Point(0.1,0.1),Point(0.9,0.1))
45 | xaxis.draw(win)
46 |
47 | # Label the X-Axis - we have 24 hours (0-23)
48 | # so we need to know each slot's width
49 | width = 0.8 * (1.0 / 24.0)
50 | for i in range(24):
51 |     center = (i * width) + (width / 2.0) + 0.1
52 | txt = Text(Point(center, 0.066), str(i))
53 | txt.draw(win)
54 |
55 | txt = Text(Point(0.5,0.033),"Hour of the Day")
56 | txt.draw(win)
57 |
58 | # Draw the Y-Axis
59 | yaxis = Line(Point(0.1,0.1),Point(0.1,0.9))
60 | yaxis.draw(win)
61 |
62 | # Label the Y-Axis
63 | # we will have 10 labels up to ymax
64 | unit = ymax / 10.0
65 | for i in range(10) :
66 |     center = 0.1 + (i + 1) * 0.08
67 |     value = int( (i + 1) * unit )
68 | txt = Text(Point(0.066,center), str(value))
69 | txt.draw(win)
70 |
71 |
72 | # Draw the bars
73 | for i in range(24):
74 | if totals[i] == 0:
75 | continue
76 |     left = i * width + 0.1
77 |     right = i * width + width + 0.1
78 |     height = (float(totals[i]) / ymax) * 0.8
79 | rec = Rectangle(Point(left,0.1), Point(right,0.1+height))
80 | rec.setFill('blue')
81 | rec.draw(win)
82 |
83 | win.getMouse()
84 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/greet.py:
--------------------------------------------------------------------------------
1 | name = raw_input('Enter your name:')
2 | print 'Hello', name
3 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/grep.py:
--------------------------------------------------------------------------------
1 | # Search for lines that start with From and have an at sign
2 | import re
3 | hand = open('mbox.txt')
4 | search = raw_input('Enter a regular expression: ')
5 | count = 0
6 | for line in hand:
7 | line = line.rstrip()
8 | if re.search(search,line) : count = count + 1
9 |
10 | print 'mbox.txt had',count,'lines that matched',search
11 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/hidden.py:
--------------------------------------------------------------------------------
1 | # Keep this file separate
2 | # https://apps.twitter.com/
3 |
4 | def oauth() :
5 | return { "consumer_key" : "h7Lu...Ng",
6 | "consumer_secret" : "dNKenAC3New...mmn7Q",
7 | "token_key" : "10185562-eibxCp9n2...P4GEQQOSGI",
8 | "token_secret" : "H0ycCFemmC4wyf1...qoIpBo" }
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/intro-short.txt:
--------------------------------------------------------------------------------
1 | Why should you learn to write programs?
2 |
3 | Writing programs (or programming) is a very creative
4 | and rewarding activity. You can write programs for
5 | many reasons, ranging from making your living to solving
6 | a difficult data analysis problem to having fun to helping
7 | someone else solve a problem. This book assumes that
8 | everyone needs to know how to program, and that once
9 | you know how to program you will figure out what you want
10 | to do with your newfound skills.
11 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/json1.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | data = """{
4 | "name" : "Chuck",
5 | "phone" : {
6 | "type" : "intl",
7 | "number" : "+1 734 303 4456"
8 | },
9 | "email" : {
10 | "hide" : "yes"
11 | }
12 | }"""
13 |
14 | info = json.loads(data)
15 | print 'Name:',info["name"]
16 | print 'Hide:',info["email"]["hide"]
17 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/json2.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | input = '''
4 | [
5 | { "id" : "001",
6 | "x" : "2",
7 | "name" : "Chuck"
8 | } ,
9 | { "id" : "009",
10 | "x" : "7",
11 | "name" : "Chuck"
12 | }
13 | ]'''
14 |
15 | info = json.loads(input)
16 | print 'User count:', len(info)
17 |
18 | for item in info:
19 | print 'Name', item['name']
20 | print 'Id', item['id']
21 | print 'Attribute', item['x']
22 |
23 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/largest.py:
--------------------------------------------------------------------------------
1 | largest = None
2 | print 'Before:', largest
3 | for iterval in [3, 41, 12, 9, 74, 15]:
4 | if largest == None or largest < iterval:
5 | largest = iterval
6 | print 'Loop:', iterval, largest
7 | print 'Largest:', largest
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/mailcount.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter file name: ')
2 | fhand = open(fname)
3 | c = dict()
4 | for line in fhand:
5 | if not line.startswith('From ') : continue
6 | pieces = line.split()
7 | email = pieces[1]
8 | c[email] = c.get(email,0) + 1
9 |
10 | print c
11 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/mailtop.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter file name: ')
2 | fhand = open(fname)
3 | c = dict()
4 | for line in fhand:
5 | if not line.startswith('From ') : continue
6 | pieces = line.split()
7 | email = pieces[1]
8 | c[email] = c.get(email,0) + 1
9 |
10 | bigc = None
11 | bigw = None
12 | for word in c:
13 | value = c[word]
14 | if bigc == None or value > bigc:
15 | bigw = word
16 | bigc = value
17 |
18 | print bigw, bigc
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/open.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox.txt')
2 | count = 0
3 | for line in fhand:
4 | count = count + 1
5 | print 'Line Count:', count
6 |
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/pagerank.zip
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012, Michael Bostock
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * The name Michael Bostock may not be used to endorse or promote products
15 | derived from this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/README.txt:
--------------------------------------------------------------------------------
1 | Simple Python Search Spider, Page Ranker, and Visualizer
2 |
3 | This is a set of programs that emulate some of the functions of a
4 | search engine. They store their data in a SQLITE3 database named
5 | 'spider.sqlite'. This file can be removed at any time to restart the
6 | process.
7 |
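All three programs share that database; a minimal sketch of the
schema, copied from the CREATE statements in spider.py below:

import sqlite3
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS Pages
    (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
     error INTEGER, old_rank REAL, new_rank REAL)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Links
    (from_id INTEGER, to_id INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
conn.commit()
cur.close()
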
8 | You should install the SQLite browser to view and modify
9 | the databases from:
10 |
11 | http://sqlitebrowser.org/
12 |
13 | This program crawls a web site and pulls a series of pages into the
14 | database, recording the links between pages.
15 |
16 | Mac: rm spider.sqlite
17 | Mac: python spider.py
18 |
19 | Win: del spider.sqlite
20 | Win: spider.py
21 |
22 | Enter web url or enter: http://www.dr-chuck.com/
23 | ['http://www.dr-chuck.com']
24 | How many pages:2
25 | 1 http://www.dr-chuck.com/ 12
26 | 2 http://www.dr-chuck.com/csev-blog/ 57
27 | How many pages:
28 |
29 | In this sample run, we told it to crawl a website and retrieve two
30 | pages. If you restart the program and tell it to crawl more
31 | pages, it will not re-crawl any pages already in the database. Upon
32 | restart it goes to a random non-crawled page and starts there. So
33 | each successive run of spider.py is additive.
34 |
35 | Mac: python spider.py
36 | Win: spider.py
37 |
38 | Enter web url or enter: http://www.dr-chuck.com/
39 | ['http://www.dr-chuck.com']
40 | How many pages:3
41 | 3 http://www.dr-chuck.com/csev-blog 57
42 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1
43 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13
44 | How many pages:
45 |
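Under the hood, the restart behavior comes from picking a random
un-crawled page each time; a minimal sketch of that lookup, using the
same query spider.py runs (this assumes spider.sqlite already exists):

import sqlite3
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
# Un-crawled pages have html NULL and no recorded error
cur.execute('''SELECT id,url FROM Pages
    WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1''')
row = cur.fetchone()
if row is None :
    print 'No unretrieved pages - the crawl is complete'
else :
    print 'Next page to crawl:', row[0], row[1]
cur.close()
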
46 | You can have multiple starting points in the same database -
47 | within the program these are called "webs". The spider
48 | chooses randomly amongst all non-visited links across all
49 | the webs.
50 |
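The "inside a web" test is a simple URL-prefix check; a small sketch
of that test, with the web and link taken from the sample runs above:

webs = ['http://www.dr-chuck.com']
href = 'http://www.dr-chuck.com/csev-blog/'
found = False
for web in webs:
    if href.startswith(web) :
        found = True
        break
print 'Link is inside a web:', found
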
51 | If your code fails with complaints about certificate problems,
52 | there is some SSL-related code that can be un-commented to
53 | work around them.
54 |
55 | If you want to dump the contents of the spider.sqlite file, you can
56 | run spdump.py as follows:
57 |
58 | Mac: python spdump.py
59 | Win: spdump.py
60 |
61 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog')
62 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm')
63 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/')
64 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm')
65 | 4 rows.
66 |
67 | This shows the number of incoming links, the old page rank, the new page
68 | rank, the id of the page, and the url of the page. The spdump.py program
69 | only shows pages that have at least one incoming link to them.
70 |
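The fields come back in that same order, so each printed row can be
unpacked like this (values copied from the first row of the sample
output above):

(inbound, old_rank, new_rank, page_id, url) = \
    (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog')
print page_id, url, 'has', inbound, 'inbound links; rank', old_rank, '->', new_rank
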
71 | Once you have a few pages in the database, you can run Page Rank on the
72 | pages using the sprank.py program. You simply tell it how many Page
73 | Rank iterations to run.
74 |
75 | Mac: python sprank.py
76 | Win: sprank.py
77 |
78 | How many iterations:2
79 | 1 0.546848992536
80 | 2 0.226714939664
81 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)]
82 |
83 | You can dump the database again to see that page rank has been updated:
84 |
85 | Mac: python spdump.py
86 | Win: spdump.py
87 |
88 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog')
89 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm')
90 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/')
91 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm')
92 | 4 rows.
93 |
94 | You can run sprank.py as many times as you like and it will simply refine
95 | the page rank the more times you run it. You can even run sprank.py a few times
96 | and then go spider a few more pages with spider.py and then run sprank.py
97 | to converge the page ranks.
98 |
99 | If you want to restart the Page Rank calculations without re-spidering the
100 | web pages, you can use spreset.py
101 |
102 | Mac: python spreset.py
103 | Win: spreset.py
104 |
105 | All pages set to a rank of 1.0
106 |
107 | Mac: python sprank.py
108 | Win: sprank.py
109 |
110 | How many iterations:50
111 | 1 0.546848992536
112 | 2 0.226714939664
113 | 3 0.0659516187242
114 | 4 0.0244199333
115 | 5 0.0102096489546
116 | 6 0.00610244329379
117 | ...
118 | 42 0.000109076928206
119 | 43 9.91987599002e-05
120 | 44 9.02151706798e-05
121 | 45 8.20451504471e-05
122 | 46 7.46150183837e-05
123 | 47 6.7857770908e-05
124 | 48 6.17124694224e-05
125 | 49 5.61236959327e-05
126 | 50 5.10410499467e-05
127 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)]
128 |
129 | For each iteration of the page rank algorithm it prints the average
130 | change per page of the page rank. The network initially is quite
131 | unbalanced and so the individual page ranks are changing wildly.
132 | But in a few short iterations, the page rank converges. You
133 | should run sprank.py long enough that the page ranks converge.
134 |
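The per-iteration number is simply the mean absolute change in rank
across all pages; a tiny sketch of that computation (the rank values
here are made up for illustration):

prev_ranks = { 1: 1.0, 2: 1.0, 3: 1.0 }
next_ranks = { 1: 0.5, 2: 1.2, 3: 1.3 }
totdiff = 0
for (node, old_rank) in prev_ranks.items():
    totdiff = totdiff + abs(old_rank - next_ranks[node])
print 'Average change per page:', totdiff / len(prev_ranks)
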
135 | If you want to visualize the current top pages in terms of page rank,
136 | run spjson.py to write the pages out in JSON format to be viewed in a
137 | web browser.
138 |
139 | Mac: python spjson.py
140 | Win: spjson.py
141 |
142 | Creating JSON output on spider.js...
143 | How many nodes? 30
144 | Open force.html in a browser to view the visualization
145 |
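Each node record that spjson.py writes has the same shape as the
entries in spider.js elsewhere in this repository; a small sketch that
builds one such record (values copied from that sample file):

import json
node = { "weight": 1, "rank": 4.66423227024, "id": 4,
    "url": "http://python-data.dr-chuck.net/comments_42.html" }
print json.dumps(node)
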
146 | You can view this data by opening the file force.html in your web browser.
147 | This shows an automatic layout of the nodes and links. You can click and
148 | drag any node and you can also double click on a node to find the URL
149 | that is represented by the node.
150 |
151 | This visualization is provided using the force layout from:
152 |
153 | http://mbostock.github.com/d3/
154 |
155 | If you rerun the other utilities and then re-run spjson.py, you merely
156 | have to press refresh in the browser to get the new data from spider.js.
157 |
158 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/force.css:
--------------------------------------------------------------------------------
1 | circle.node {
2 | stroke: #fff;
3 | stroke-width: 1.5px;
4 | }
5 |
6 | line.link {
7 | stroke: #999;
8 | stroke-opacity: .6;
9 | }
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/force.html:
--------------------------------------------------------------------------------
4 | Force-Directed Layout
15 | If you don't see a chart above, check the JavaScript console. You may
16 | need to use a different browser.
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/force.js:
--------------------------------------------------------------------------------
1 | var width = 600,
2 | height = 600;
3 |
4 | var color = d3.scale.category20();
5 |
6 | var dist = (width + height) / 4;
7 |
8 | var force = d3.layout.force()
9 | .charge(-120)
10 | .linkDistance(dist)
11 | .size([width, height]);
12 |
13 | function getrank(rval) {
14 | return (rval/2.0) + 3;
15 | }
16 |
17 | function getcolor(rval) {
18 | return color(rval);
19 | }
20 |
21 | var svg = d3.select("#chart").append("svg")
22 | .attr("width", width)
23 | .attr("height", height);
24 |
25 | function loadData(json) {
26 | force
27 | .nodes(json.nodes)
28 | .links(json.links);
29 |
30 | var k = Math.sqrt(json.nodes.length / (width * height));
31 |
32 | force
33 | .charge(-10 / k)
34 | .gravity(100 * k)
35 | .start();
36 |
37 | var link = svg.selectAll("line.link")
38 | .data(json.links)
39 | .enter().append("line")
40 | .attr("class", "link")
41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); });
42 |
43 | var node = svg.selectAll("circle.node")
44 | .data(json.nodes)
45 | .enter().append("circle")
46 | .attr("class", "node")
47 | .attr("r", function(d) { return getrank(d.rank); } )
48 | .style("fill", function(d) { return getcolor(d.rank); })
49 | .on("dblclick",function(d) {
50 | if ( confirm('Do you want to open '+d.url) )
51 | window.open(d.url,'_new','');
52 | d3.event.stopPropagation();
53 | })
54 | .call(force.drag);
55 |
56 | node.append("title")
57 | .text(function(d) { return d.url; });
58 |
59 | force.on("tick", function() {
60 | link.attr("x1", function(d) { return d.source.x; })
61 | .attr("y1", function(d) { return d.source.y; })
62 | .attr("x2", function(d) { return d.target.x; })
63 | .attr("y2", function(d) { return d.target.y; });
64 |
65 | node.attr("cx", function(d) { return d.x; })
66 | .attr("cy", function(d) { return d.y; });
67 | });
68 |
69 | }
70 | loadData(spiderJson);
71 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/spdump.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
7 | FROM Pages JOIN Links ON Pages.id = Links.to_id
8 | WHERE html IS NOT NULL
9 | GROUP BY id ORDER BY inbound DESC''')
10 |
11 | count = 0
12 | for row in cur :
13 | if count < 50 : print row
14 | count = count + 1
15 | print count, 'rows.'
16 | cur.close()
17 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/spider.js:
--------------------------------------------------------------------------------
1 | spiderJson = {"nodes":[
2 | {"weight":1,"rank":0.0, "id":1, "url":"http://python-data.dr-chuck.net"},
3 | {"weight":1,"rank":4.66423227024, "id":4, "url":"http://python-data.dr-chuck.net/comments_42.html"},
4 | {"weight":1,"rank":1.38142061792, "id":7, "url":"http://python-data.dr-chuck.net/known_by_42.html"},
5 | {"weight":1,"rank":0.690710255581, "id":9, "url":"http://python-data.dr-chuck.net/known_by_Kaylyn.html"},
6 | {"weight":2,"rank":2.26669663573, "id":40, "url":"http://python-data.dr-chuck.net/known_by_Takua.html"},
7 | {"weight":1,"rank":0.690710255581, "id":82, "url":"http://python-data.dr-chuck.net/known_by_Marwan.html"},
8 | {"weight":2,"rank":7.45553422719, "id":85, "url":"http://python-data.dr-chuck.net/known_by_Samiya.html"},
9 | {"weight":2,"rank":8.48734569457, "id":145, "url":"http://python-data.dr-chuck.net/known_by_Shihed.html"},
10 | {"weight":1,"rank":0.518032667194, "id":189, "url":"http://python-data.dr-chuck.net/known_by_Cassidy.html"},
11 | {"weight":2,"rank":1.56869025396, "id":199, "url":"http://python-data.dr-chuck.net/known_by_Vinnie.html"},
12 | {"weight":2,"rank":2.54881807574, "id":203, "url":"http://python-data.dr-chuck.net/known_by_Charlee.html"},
13 | {"weight":1,"rank":8.83695381234, "id":248, "url":"http://python-data.dr-chuck.net/known_by_Atli.html"},
14 | {"weight":2,"rank":4.16614971195, "id":309, "url":"http://python-data.dr-chuck.net/known_by_Abbiegail.html"},
15 | {"weight":2,"rank":2.2314317079, "id":326, "url":"http://python-data.dr-chuck.net/known_by_Nisha.html"},
16 | {"weight":1,"rank":1.21603900362, "id":382, "url":"http://python-data.dr-chuck.net/known_by_Ciar.html"},
17 | {"weight":1,"rank":1.89945314693, "id":413, "url":"http://python-data.dr-chuck.net/known_by_Brodie.html"},
18 | {"weight":2,"rank":19.0, "id":501, "url":"http://python-data.dr-chuck.net/known_by_Kylar.html"},
19 | {"weight":2,"rank":5.3834045047, "id":642, "url":"http://python-data.dr-chuck.net/known_by_Mohamed.html"},
20 | {"weight":1,"rank":3.93023811326, "id":676, "url":"http://python-data.dr-chuck.net/known_by_Oluwaferanmi.html"},
21 | {"weight":1,"rank":2.59745947896, "id":813, "url":"http://python-data.dr-chuck.net/known_by_Maree.html"},
22 | {"weight":1,"rank":1.77055254257, "id":873, "url":"http://python-data.dr-chuck.net/known_by_Shaw.html"}],
23 | "links":[
24 | {"source":0,"target":1,"value":3},
25 | {"source":0,"target":2,"value":3},
26 | {"source":0,"target":0,"value":3},
27 | {"source":2,"target":3,"value":3},
28 | {"source":2,"target":4,"value":3},
29 | {"source":2,"target":5,"value":3},
30 | {"source":2,"target":6,"value":3},
31 | {"source":5,"target":7,"value":3},
32 | {"source":5,"target":8,"value":3},
33 | {"source":5,"target":9,"value":3},
34 | {"source":5,"target":10,"value":3},
35 | {"source":6,"target":11,"value":3},
36 | {"source":4,"target":12,"value":3},
37 | {"source":4,"target":13,"value":3},
38 | {"source":4,"target":14,"value":3},
39 | {"source":8,"target":15,"value":3},
40 | {"source":7,"target":16,"value":3},
41 | {"source":13,"target":17,"value":3},
42 | {"source":10,"target":18,"value":3},
43 | {"source":14,"target":19,"value":3},
44 | {"source":18,"target":20,"value":3},
45 | {"source":18,"target":17,"value":3},
46 | {"source":20,"target":9,"value":3},
47 | {"source":17,"target":6,"value":3},
48 | {"source":9,"target":12,"value":3}]};
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/spider.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import urllib
3 | import ssl
4 | from urlparse import urljoin
5 | from urlparse import urlparse
6 | from BeautifulSoup import *
7 |
8 | # Deal with SSL certificate anomalies Python > 2.7
9 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
10 | scontext = None
11 |
12 | conn = sqlite3.connect('spider.sqlite')
13 | cur = conn.cursor()
14 |
15 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages
16 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
17 | error INTEGER, old_rank REAL, new_rank REAL)''')
18 |
19 | cur.execute('''CREATE TABLE IF NOT EXISTS Links
20 | (from_id INTEGER, to_id INTEGER)''')
21 |
22 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
23 |
24 | # Check to see if we are already in progress...
25 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
26 | row = cur.fetchone()
27 | if row is not None:
28 | print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl."
29 | else :
30 | starturl = raw_input('Enter web url or enter: ')
31 | if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/'
32 | if ( starturl.endswith('/') ) : starturl = starturl[:-1]
33 | web = starturl
34 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) :
35 | pos = starturl.rfind('/')
36 | web = starturl[:pos]
37 |
38 | if ( len(web) > 1 ) :
39 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) )
40 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) )
41 | conn.commit()
42 |
43 | # Get the current webs
44 | cur.execute('''SELECT url FROM Webs''')
45 | webs = list()
46 | for row in cur:
47 | webs.append(str(row[0]))
48 |
49 | print webs
50 |
51 | many = 0
52 | while True:
53 | if ( many < 1 ) :
54 | sval = raw_input('How many pages:')
55 | if ( len(sval) < 1 ) : break
56 | many = int(sval)
57 | many = many - 1
58 |
59 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
60 | try:
61 | row = cur.fetchone()
62 | # print row
63 | fromid = row[0]
64 | url = row[1]
65 | except:
66 | print 'No unretrieved HTML pages found'
67 | many = 0
68 | break
69 |
70 | print fromid, url,
71 |
72 | # If we are retrieving this page, there should be no links from it
73 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) )
74 | try:
75 | # Deal with SSL certificate anomalies Python > 2.7
76 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
77 | # document = urllib.urlopen(url, context=scontext)
78 |
79 | # Normal Unless you encounter certificate problems
80 | document = urllib.urlopen(url)
81 |
82 | html = document.read()
83 | if document.getcode() != 200 :
84 | print "Error on page: ",document.getcode()
85 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) )
86 |
87 | if 'text/html' != document.info().gettype() :
88 | print "Ignore non text/html page"
89 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
90 | conn.commit()
91 | continue
92 |
93 | print '('+str(len(html))+')',
94 |
95 | soup = BeautifulSoup(html)
96 | except KeyboardInterrupt:
97 | print ''
98 | print 'Program interrupted by user...'
99 | break
100 | except:
101 | print "Unable to retrieve or parse page"
102 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
103 | conn.commit()
104 | continue
105 |
106 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
107 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) )
108 | conn.commit()
109 |
110 | # Retrieve all of the anchor tags
111 | tags = soup('a')
112 | count = 0
113 | for tag in tags:
114 | href = tag.get('href', None)
115 | if ( href is None ) : continue
116 | # Resolve relative references like href="/contact"
117 | up = urlparse(href)
118 | if ( len(up.scheme) < 1 ) :
119 | href = urljoin(url, href)
120 | ipos = href.find('#')
121 | if ( ipos > 1 ) : href = href[:ipos]
122 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue
123 | if ( href.endswith('/') ) : href = href[:-1]
124 | # print href
125 | if ( len(href) < 1 ) : continue
126 |
127 | # Check if the URL is in any of the webs
128 | found = False
129 | for web in webs:
130 | if ( href.startswith(web) ) :
131 | found = True
132 | break
133 | if not found : continue
134 |
135 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) )
136 | count = count + 1
137 | conn.commit()
138 |
139 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, ))
140 | try:
141 | row = cur.fetchone()
142 | toid = row[0]
143 | except:
144 | print 'Could not retrieve id'
145 | continue
146 | # print fromid, toid
147 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) )
148 |
149 |
150 | print count
151 |
152 | cur.close()
153 |
154 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/spjson.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | print "Creating JSON output on spider.js..."
7 | howmany = int(raw_input("How many nodes? "))
8 |
9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
10 | FROM Pages JOIN Links ON Pages.id = Links.to_id
11 | WHERE html IS NOT NULL AND ERROR IS NULL
12 | GROUP BY id ORDER BY id,inbound''')
13 |
14 | fhand = open('spider.js','w')
15 | nodes = list()
16 | maxrank = None
17 | minrank = None
18 | for row in cur :
19 | nodes.append(row)
20 | rank = row[2]
21 |     if maxrank is None or rank > maxrank : maxrank = rank
22 |     if minrank is None or rank < minrank : minrank = rank
23 | if len(nodes) > howmany : break
24 |
25 | if maxrank == minrank or maxrank is None or minrank is None:
26 | print "Error - please run sprank.py to compute page rank"
27 | quit()
28 |
29 | fhand.write('spiderJson = {"nodes":[\n')
30 | count = 0
31 | map = dict()
32 | ranks = dict()
33 | for row in nodes :
34 | if count > 0 : fhand.write(',\n')
35 | # print row
36 | rank = row[2]
37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) )
38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',')
39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}')
40 | map[row[3]] = count
41 | ranks[row[3]] = rank
42 | count = count + 1
43 | fhand.write('],\n')
44 |
45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
46 | fhand.write('"links":[\n')
47 |
48 | count = 0
49 | for row in cur :
50 | # print row
51 | if row[0] not in map or row[1] not in map : continue
52 | if count > 0 : fhand.write(',\n')
53 | rank = ranks[row[0]]
54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) )
55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}')
56 | count = count + 1
57 | fhand.write(']};')
58 | fhand.close()
59 | cur.close()
60 |
61 | print "Open force.html in a browser to view the visualization"
62 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/sprank.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | # Find the ids that send out page rank - we are only interested
7 | # in pages in the SCC that have in and out links
8 | cur.execute('''SELECT DISTINCT from_id FROM Links''')
9 | from_ids = list()
10 | for row in cur:
11 | from_ids.append(row[0])
12 |
13 | # Find the ids that receive page rank
14 | to_ids = list()
15 | links = list()
16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
17 | for row in cur:
18 | from_id = row[0]
19 | to_id = row[1]
20 | if from_id == to_id : continue
21 | if from_id not in from_ids : continue
22 | if to_id not in from_ids : continue
23 | links.append(row)
24 | if to_id not in to_ids : to_ids.append(to_id)
25 |
26 | # Get latest page ranks for strongly connected component
27 | prev_ranks = dict()
28 | for node in from_ids:
29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, ))
30 | row = cur.fetchone()
31 | prev_ranks[node] = row[0]
32 |
33 | sval = raw_input('How many iterations:')
34 | many = 1
35 | if ( len(sval) > 0 ) : many = int(sval)
36 |
37 | # Sanity check
38 | if len(prev_ranks) < 1 :
39 | print "Nothing to page rank. Check data."
40 | quit()
41 |
42 | # Let's do Page Rank in memory so it is really fast
43 | for i in range(many):
44 | # print prev_ranks.items()[:5]
45 |     next_ranks = dict()
46 | total = 0.0
47 | for (node, old_rank) in prev_ranks.items():
48 | total = total + old_rank
49 | next_ranks[node] = 0.0
50 | # print total
51 |
52 |     # Find the number of outbound links and send the page rank down each
53 | for (node, old_rank) in prev_ranks.items():
54 | # print node, old_rank
55 | give_ids = list()
56 | for (from_id, to_id) in links:
57 | if from_id != node : continue
58 | # print ' ',from_id,to_id
59 |
60 | if to_id not in to_ids: continue
61 | give_ids.append(to_id)
62 | if ( len(give_ids) < 1 ) : continue
63 | amount = old_rank / len(give_ids)
64 | # print node, old_rank,amount, give_ids
65 |
66 | for id in give_ids:
67 | next_ranks[id] = next_ranks[id] + amount
68 |
69 | newtot = 0
70 | for (node, next_rank) in next_ranks.items():
71 | newtot = newtot + next_rank
72 | evap = (total - newtot) / len(next_ranks)
73 |
74 | # print newtot, evap
75 | for node in next_ranks:
76 | next_ranks[node] = next_ranks[node] + evap
77 |
78 | newtot = 0
79 | for (node, next_rank) in next_ranks.items():
80 | newtot = newtot + next_rank
81 |
82 | # Compute the per-page average change from old rank to new rank
83 | # As indication of convergence of the algorithm
84 | totdiff = 0
85 | for (node, old_rank) in prev_ranks.items():
86 | new_rank = next_ranks[node]
87 | diff = abs(old_rank-new_rank)
88 | totdiff = totdiff + diff
89 |
90 | avediff = totdiff / len(prev_ranks)
91 | print i+1, avediff
92 |
93 | # rotate
94 | prev_ranks = next_ranks
95 |
96 | # Put the final ranks back into the database
97 | print next_ranks.items()[:5]
98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''')
99 | for (id, new_rank) in next_ranks.items() :
100 | cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id))
101 | conn.commit()
102 | cur.close()
103 |
104 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pagerank/spreset.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''')
7 | conn.commit()
8 |
9 | cur.close()
10 |
11 | print "All pages set to a rank of 1.0"
12 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pals.py:
--------------------------------------------------------------------------------
1 | friends = ['Joseph', 'Glenn', 'Sally']
2 | for friend in friends:
3 | print 'Happy New Year:', friend
4 | print 'Done!'
5 |
6 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/party1.py:
--------------------------------------------------------------------------------
1 | class PartyAnimal:
2 | x = 0
3 |
4 | def party(self) :
5 | self.x = self.x + 1
6 | print "So far",self.x
7 |
8 | an = PartyAnimal()
9 |
10 | an.party()
11 | an.party()
12 | an.party()
13 |
14 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/party2.py:
--------------------------------------------------------------------------------
1 | class PartyAnimal:
2 | x = 0
3 |
4 | def party(self) :
5 | self.x = self.x + 1
6 | print "So far",self.x
7 |
8 | an = PartyAnimal()
9 |
10 | print "Type", type(an)
11 | print "Dir ", dir(an)
12 |
13 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/party3.py:
--------------------------------------------------------------------------------
1 | class PartyAnimal:
2 | x = 0
3 |
4 | def __init__(self):
5 | print "I am constructed"
6 |
7 | def party(self) :
8 | self.x = self.x + 1
9 | print "So far",self.x
10 |
11 | def __del__(self):
12 | print "I am destructed", self.x
13 |
14 | an = PartyAnimal()
15 | an.party()
16 | an.party()
17 | an.party()
18 |
19 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/party4.py:
--------------------------------------------------------------------------------
1 | class PartyAnimal:
2 | x = 0
3 | name = ""
4 | def __init__(self, nam):
5 | self.name = nam
6 | print self.name,"constructed"
7 |
8 | def party(self) :
9 | self.x = self.x + 1
10 | print self.name,"party count",self.x
11 |
12 | s = PartyAnimal("Sally")
13 | s.party()
14 |
15 | j = PartyAnimal("Jim")
16 | j.party()
17 | s.party()
18 |
19 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/party5.py:
--------------------------------------------------------------------------------
1 | class PartyAnimal:
2 | x = 0
3 | name = ""
4 | def __init__(self, nam):
5 | self.name = nam
6 | print self.name,"constructed"
7 |
8 | def party(self) :
9 | self.x = self.x + 1
10 | print self.name,"party count",self.x
11 |
12 | class FootballFan(PartyAnimal):
13 | points = 0
14 | def touchdown(self):
15 | self.points = self.points + 7
16 | self.party()
17 | print self.name,"points",self.points
18 |
19 | s = PartyAnimal("Sally")
20 | s.party()
21 |
22 | j = FootballFan("Jim")
23 | j.party()
24 | j.touchdown()
25 |
26 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pay.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter Hours: ')
2 | hours = float(inp)
3 | inp = raw_input('Enter Rate: ')
4 | rate = float(inp)
5 | pay = hours * rate
6 | print 'Pay:', pay
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pay2.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter Hours: ')
2 | hours = float(inp)
3 | inp = raw_input('Enter Rate: ')
4 | rate = float(inp)
5 | if hours > 40:
6 | pay = hours * rate + (hours - 40) * rate * 0.5
7 | else:
8 | pay = hours * rate
9 | print 'Pay:', pay
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/pay3.py:
--------------------------------------------------------------------------------
1 | try:
2 | inp = raw_input('Enter Hours: ')
3 | hours = float(inp)
4 | inp = raw_input('Enter Rate: ')
5 | rate = float(inp)
6 | if hours > 40:
7 | pay = hours * rate + (hours - 40) * rate * 1.5
8 | else:
9 | pay = hours * rate
10 | print 'Pay:', pay
11 | except:
12 | print 'Error, please enter numeric input'
13 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re01.py:
--------------------------------------------------------------------------------
1 | # Search for lines that contain 'From:'
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | if re.search('From:', line) :
7 | print line
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re02.py:
--------------------------------------------------------------------------------
1 | # Search for lines that start with 'From:'
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | if re.search('^From:', line) :
7 | print line
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re03.py:
--------------------------------------------------------------------------------
1 | # Search for lines that start with F, then any two characters, then 'm:'
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | if re.search('^F..m:', line) :
7 | print line
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re04.py:
--------------------------------------------------------------------------------
1 | # Search for lines that start with From and have an at sign
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | if re.search('^From:.+@', line) :
7 | print line
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re05.py:
--------------------------------------------------------------------------------
1 | import re
2 | s = 'This message from csev@umich.edu to cwen@iupui.edu is about a meeting @2PM'
3 | lst = re.findall('\S+@\S+', s)
4 | print lst
5 |
6 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re06.py:
--------------------------------------------------------------------------------
1 | # Find all email-like strings (something@something) on each line
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('\S+@\S+', line)
7 | if len(x) > 0 :
8 | print x
9 |
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re07.py:
--------------------------------------------------------------------------------
1 | # Find email addresses that start with a letter or digit and end with a letter
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('[a-zA-Z0-9]\S+@\S+[a-zA-Z]', line)
7 | if len(x) > 0 :
8 | print x
9 |
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re08.py:
--------------------------------------------------------------------------------
1 | # Extract the value from header lines that start with X
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('^X\S*: (\S+)', line)
7 | if not x : continue
8 | print x
9 |
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re09.py:
--------------------------------------------------------------------------------
1 | # Print header lines that start with X and have a numeric value
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | if re.search('^X\S*: [0-9.]+', line) :
7 | print line
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re10.py:
--------------------------------------------------------------------------------
1 | import re
2 | hand = open('mbox-short.txt')
3 | for line in hand:
4 | line = line.rstrip()
5 | x = re.findall('^X\S*: ([0-9.]+)', line)
6 | if len(x) > 0 :
7 | print x
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re11.py:
--------------------------------------------------------------------------------
1 | # Extract the revision number from lines that start with 'Details:'
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('^Details:.*rev=([0-9.]+)', line)
7 | if len(x) > 0:
8 | print x
9 |
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re12.py:
--------------------------------------------------------------------------------
1 | # Extract the two-digit hour from lines that start with 'From '
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('^From .* ([0-9][0-9]):', line)
7 | if len(x) > 0 : print x
8 |
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re13.py:
--------------------------------------------------------------------------------
1 | # Extract the domain from 'Author:' lines
2 | import re
3 | hand = open('mbox-short.txt')
4 | for line in hand:
5 | line = line.rstrip()
6 | x = re.findall('Author:.*@(\S+)', line)
7 | if not x : continue
8 | print x
9 |
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/re14.py:
--------------------------------------------------------------------------------
1 | # Count and average the New Revision numbers found in a file
2 | import re
3 | fname = raw_input('Enter file:')
4 | hand = open(fname)
5 | nums = list()
6 | for line in hand:
7 | line = line.rstrip()
8 | x = re.findall('New Revision: ([0-9]+)', line)
9 | if len(x) == 1 :
10 | val = float(x[0])
11 | nums.append(val)
12 | print len(nums)
13 | print sum(nums)/len(nums)
14 |
15 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/romeo-full.txt:
--------------------------------------------------------------------------------
1 | Romeo and Juliet
2 | Act 2, Scene 2
3 |
4 | SCENE II. Capulet's orchard.
5 |
6 | Enter ROMEO
7 |
8 | ROMEO
9 |
10 | He jests at scars that never felt a wound.
11 | JULIET appears above at a window
12 |
13 | But, soft! what light through yonder window breaks?
14 | It is the east, and Juliet is the sun.
15 | Arise, fair sun, and kill the envious moon,
16 | Who is already sick and pale with grief,
17 | That thou her maid art far more fair than she:
18 | Be not her maid, since she is envious;
19 | Her vestal livery is but sick and green
20 | And none but fools do wear it; cast it off.
21 | It is my lady, O, it is my love!
22 | O, that she knew she were!
23 | She speaks yet she says nothing: what of that?
24 | Her eye discourses; I will answer it.
25 | I am too bold, 'tis not to me she speaks:
26 | Two of the fairest stars in all the heaven,
27 | Having some business, do entreat her eyes
28 | To twinkle in their spheres till they return.
29 | What if her eyes were there, they in her head?
30 | The brightness of her cheek would shame those stars,
31 | As daylight doth a lamp; her eyes in heaven
32 | Would through the airy region stream so bright
33 | That birds would sing and think it were not night.
34 | See, how she leans her cheek upon her hand!
35 | O, that I were a glove upon that hand,
36 | That I might touch that cheek!
37 |
38 | JULIET
39 |
40 | Ay me!
41 |
42 | ROMEO
43 |
44 | She speaks:
45 | O, speak again, bright angel! for thou art
46 | As glorious to this night, being o'er my head
47 | As is a winged messenger of heaven
48 | Unto the white-upturned wondering eyes
49 | Of mortals that fall back to gaze on him
50 | When he bestrides the lazy-pacing clouds
51 | And sails upon the bosom of the air.
52 |
53 | JULIET
54 |
55 | O Romeo, Romeo! wherefore art thou Romeo?
56 | Deny thy father and refuse thy name;
57 | Or, if thou wilt not, be but sworn my love,
58 | And I'll no longer be a Capulet.
59 |
60 | ROMEO
61 |
62 | [Aside] Shall I hear more, or shall I speak at this?
63 |
64 | JULIET
65 |
66 | 'Tis but thy name that is my enemy;
67 | Thou art thyself, though not a Montague.
68 | What's Montague? it is nor hand, nor foot,
69 | Nor arm, nor face, nor any other part
70 | Belonging to a man. O, be some other name!
71 | What's in a name? that which we call a rose
72 | By any other name would smell as sweet;
73 | So Romeo would, were he not Romeo call'd,
74 | Retain that dear perfection which he owes
75 | Without that title. Romeo, doff thy name,
76 | And for that name which is no part of thee
77 | Take all myself.
78 |
79 | ROMEO
80 |
81 | I take thee at thy word:
82 | Call me but love, and I'll be new baptized;
83 | Henceforth I never will be Romeo.
84 |
85 | JULIET
86 |
87 | What man art thou that thus bescreen'd in night
88 | So stumblest on my counsel?
89 |
90 | ROMEO
91 |
92 | By a name
93 | I know not how to tell thee who I am:
94 | My name, dear saint, is hateful to myself,
95 | Because it is an enemy to thee;
96 | Had I it written, I would tear the word.
97 |
98 | JULIET
99 |
100 | My ears have not yet drunk a hundred words
101 | Of that tongue's utterance, yet I know the sound:
102 | Art thou not Romeo and a Montague?
103 |
104 | ROMEO
105 |
106 | Neither, fair saint, if either thee dislike.
107 |
108 | JULIET
109 |
110 | How camest thou hither, tell me, and wherefore?
111 | The orchard walls are high and hard to climb,
112 | And the place death, considering who thou art,
113 | If any of my kinsmen find thee here.
114 |
115 | ROMEO
116 |
117 | With love's light wings did I o'er-perch these walls;
118 | For stony limits cannot hold love out,
119 | And what love can do that dares love attempt;
120 | Therefore thy kinsmen are no let to me.
121 |
122 | JULIET
123 |
124 | If they do see thee, they will murder thee.
125 |
126 | ROMEO
127 |
128 | Alack, there lies more peril in thine eye
129 | Than twenty of their swords: look thou but sweet,
130 | And I am proof against their enmity.
131 |
132 | JULIET
133 |
134 | I would not for the world they saw thee here.
135 |
136 | ROMEO
137 |
138 | I have night's cloak to hide me from their sight;
139 | And but thou love me, let them find me here:
140 | My life were better ended by their hate,
141 | Than death prorogued, wanting of thy love.
142 |
143 | JULIET
144 |
145 | By whose direction found'st thou out this place?
146 |
147 | ROMEO
148 |
149 | By love, who first did prompt me to inquire;
150 | He lent me counsel and I lent him eyes.
151 | I am no pilot; yet, wert thou as far
152 | As that vast shore wash'd with the farthest sea,
153 | I would adventure for such merchandise.
154 |
155 | JULIET
156 |
157 | Thou know'st the mask of night is on my face,
158 | Else would a maiden blush bepaint my cheek
159 | For that which thou hast heard me speak to-night
160 | Fain would I dwell on form, fain, fain deny
161 | What I have spoke: but farewell compliment!
162 | Dost thou love me? I know thou wilt say 'Ay,'
163 | And I will take thy word: yet if thou swear'st,
164 | Thou mayst prove false; at lovers' perjuries
165 | Then say, Jove laughs. O gentle Romeo,
166 | If thou dost love, pronounce it faithfully:
167 | Or if thou think'st I am too quickly won,
168 | I'll frown and be perverse an say thee nay,
169 | So thou wilt woo; but else, not for the world.
170 | In truth, fair Montague, I am too fond,
171 | And therefore thou mayst think my 'havior light:
172 | But trust me, gentleman, I'll prove more true
173 | Than those that have more cunning to be strange.
174 | I should have been more strange, I must confess,
175 | But that thou overheard'st, ere I was ware,
176 | My true love's passion: therefore pardon me,
177 | And not impute this yielding to light love,
178 | Which the dark night hath so discovered.
179 |
180 | ROMEO
181 |
182 | Lady, by yonder blessed moon I swear
183 | That tips with silver all these fruit-tree tops--
184 |
185 | JULIET
186 |
187 | O, swear not by the moon, the inconstant moon,
188 | That monthly changes in her circled orb,
189 | Lest that thy love prove likewise variable.
190 |
191 | ROMEO
192 |
193 | What shall I swear by?
194 |
195 | JULIET
196 |
197 | Do not swear at all;
198 | Or, if thou wilt, swear by thy gracious self,
199 | Which is the god of my idolatry,
200 | And I'll believe thee.
201 |
202 | ROMEO
203 |
204 | If my heart's dear love--
205 |
206 | JULIET
207 |
208 | Well, do not swear: although I joy in thee,
209 | I have no joy of this contract to-night:
210 | It is too rash, too unadvised, too sudden;
211 | Too like the lightning, which doth cease to be
212 | Ere one can say 'It lightens.' Sweet, good night!
213 | This bud of love, by summer's ripening breath,
214 | May prove a beauteous flower when next we meet.
215 | Good night, good night! as sweet repose and rest
216 | Come to thy heart as that within my breast!
217 |
218 | ROMEO
219 |
220 | O, wilt thou leave me so unsatisfied?
221 |
222 | JULIET
223 |
224 | What satisfaction canst thou have to-night?
225 |
226 | ROMEO
227 |
228 | The exchange of thy love's faithful vow for mine.
229 |
230 | JULIET
231 |
232 | I gave thee mine before thou didst request it:
233 | And yet I would it were to give again.
234 |
235 | ROMEO
236 |
237 | Wouldst thou withdraw it? for what purpose, love?
238 |
239 | JULIET
240 |
241 | But to be frank, and give it thee again.
242 | And yet I wish but for the thing I have:
243 | My bounty is as boundless as the sea,
244 | My love as deep; the more I give to thee,
245 | The more I have, for both are infinite.
246 |
247 | Nurse calls within
248 |
249 | I hear some noise within; dear love, adieu!
250 | Anon, good nurse! Sweet Montague, be true.
251 | Stay but a little, I will come again.
252 | Exit, above
253 |
254 | ROMEO
255 |
256 | O blessed, blessed night! I am afeard.
257 | Being in night, all this is but a dream,
258 | Too flattering-sweet to be substantial.
259 |
260 | Re-enter JULIET, above
261 |
262 | JULIET
263 |
264 | Three words, dear Romeo, and good night indeed.
265 | If that thy bent of love be honourable,
266 | Thy purpose marriage, send me word to-morrow,
267 | By one that I'll procure to come to thee,
268 | Where and what time thou wilt perform the rite;
269 | And all my fortunes at thy foot I'll lay
270 | And follow thee my lord throughout the world.
271 |
272 | Nurse
273 |
274 | [Within] Madam!
275 |
276 | JULIET
277 |
278 | I come, anon.--But if thou mean'st not well,
279 | I do beseech thee--
280 |
281 | Nurse
282 | [Within] Madam!
283 |
284 | JULIET
285 |
286 | By and by, I come:--
287 | To cease thy suit, and leave me to my grief:
288 | To-morrow will I send.
289 |
290 | ROMEO
291 |
292 | So thrive my soul--
293 |
294 | JULIET
295 |
296 | A thousand times good night!
297 | Exit, above
298 |
299 | ROMEO
300 |
301 | A thousand times the worse, to want thy light.
302 | Love goes toward love, as schoolboys from
303 | their books,
304 | But love from love, toward school with heavy looks.
305 | Retiring
306 |
307 | Re-enter JULIET, above
308 |
309 | JULIET
310 |
311 | Hist! Romeo, hist! O, for a falconer's voice,
312 | To lure this tassel-gentle back again!
313 | Bondage is hoarse, and may not speak aloud;
314 | Else would I tear the cave where Echo lies,
315 | And make her airy tongue more hoarse than mine,
316 | With repetition of my Romeo's name.
317 |
318 | ROMEO
319 |
320 | It is my soul that calls upon my name:
321 | How silver-sweet sound lovers' tongues by night,
322 | Like softest music to attending ears!
323 |
324 | JULIET
325 |
326 | Romeo!
327 |
328 | ROMEO
329 |
330 | My dear?
331 |
332 | JULIET
333 |
334 | At what o'clock to-morrow
335 | Shall I send to thee?
336 |
337 | ROMEO
338 |
339 | At the hour of nine.
340 |
341 | JULIET
342 |
343 | I will not fail: 'tis twenty years till then.
344 | I have forgot why I did call thee back.
345 |
346 | ROMEO
347 |
348 | Let me stand here till thou remember it.
349 |
350 | JULIET
351 |
352 | I shall forget, to have thee still stand there,
353 | Remembering how I love thy company.
354 |
355 | ROMEO
356 |
357 | And I'll still stay, to have thee still forget,
358 | Forgetting any other home but this.
359 |
360 | JULIET
361 |
362 | 'Tis almost morning; I would have thee gone:
363 | And yet no further than a wanton's bird;
364 | Who lets it hop a little from her hand,
365 | Like a poor prisoner in his twisted gyves,
366 | And with a silk thread plucks it back again,
367 | So loving-jealous of his liberty.
368 |
369 | ROMEO
370 |
371 | I would I were thy bird.
372 |
373 | JULIET
374 |
375 | Sweet, so would I:
376 | Yet I should kill thee with much cherishing.
377 | Good night, good night! parting is such
378 | sweet sorrow,
379 | That I shall say good night till it be morrow.
380 |
381 | Exit above
382 |
383 | ROMEO
384 |
385 | Sleep dwell upon thine eyes, peace in thy breast!
386 | Would I were sleep and peace, so sweet to rest!
387 | Hence will I to my ghostly father's cell,
388 | His help to crave, and my dear hap to tell.
389 |
390 | Exit
391 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/romeo.txt:
--------------------------------------------------------------------------------
1 | But soft what light through yonder window breaks
2 | It is the east and Juliet is the sun
3 | Arise fair sun and kill the envious moon
4 | Who is already sick and pale with grief
5 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/roster.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sqlite3
3 |
4 | conn = sqlite3.connect('rosterdb.sqlite')
5 | cur = conn.cursor()
6 |
7 | # Do some setup
8 | cur.executescript('''
9 | DROP TABLE IF EXISTS User;
10 | DROP TABLE IF EXISTS Member;
11 | DROP TABLE IF EXISTS Course;
12 |
13 | CREATE TABLE User (
14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
15 | name TEXT UNIQUE
16 | );
17 |
18 | CREATE TABLE Course (
19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
20 | title TEXT UNIQUE
21 | );
22 |
23 | CREATE TABLE Member (
24 | user_id INTEGER,
25 | course_id INTEGER,
26 | role INTEGER,
27 | PRIMARY KEY (user_id, course_id)
28 | )
29 | ''')
30 |
31 | fname = raw_input('Enter file name: ')
32 | if ( len(fname) < 1 ) : fname = 'roster_data.json'
33 |
34 | # [
35 | # [ "Charley", "si110", 1 ],
36 | # [ "Mea", "si110", 0 ],
37 |
38 | str_data = open(fname).read()
39 | json_data = json.loads(str_data)
40 |
41 | for entry in json_data:
42 |
43 |     name = entry[0]
44 |     title = entry[1]
45 |
46 | print name, title
47 |
48 | cur.execute('''INSERT OR IGNORE INTO User (name)
49 | VALUES ( ? )''', ( name, ) )
50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
51 | user_id = cur.fetchone()[0]
52 |
53 | cur.execute('''INSERT OR IGNORE INTO Course (title)
54 | VALUES ( ? )''', ( title, ) )
55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
56 | course_id = cur.fetchone()[0]
57 |
58 | cur.execute('''INSERT OR REPLACE INTO Member
59 | (user_id, course_id) VALUES ( ?, ? )''',
60 | ( user_id, course_id ) )
61 |
62 | conn.commit()
63 |
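64 | # A sketch of how the load could be checked, assuming the sqlite3
65 | # command-line tool is available:
66 | #   sqlite3 rosterdb.sqlite 'SELECT User.name, Course.title, Member.role
67 | #     FROM Member JOIN User JOIN Course
68 | #     ON Member.user_id = User.id AND Member.course_id = Course.id LIMIT 5'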
--------------------------------------------------------------------------------
/EXAMPLE CODE/roster.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/roster.zip
--------------------------------------------------------------------------------
/EXAMPLE CODE/roster/roster.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sqlite3
3 |
4 | conn = sqlite3.connect('rosterdb.sqlite')
5 | cur = conn.cursor()
6 |
7 | # Do some setup
8 | cur.executescript('''
9 | DROP TABLE IF EXISTS User;
10 | DROP TABLE IF EXISTS Member;
11 | DROP TABLE IF EXISTS Course;
12 |
13 | CREATE TABLE User (
14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
15 | name TEXT UNIQUE
16 | );
17 |
18 | CREATE TABLE Course (
19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
20 | title TEXT UNIQUE
21 | );
22 |
23 | CREATE TABLE Member (
24 | user_id INTEGER,
25 | course_id INTEGER,
26 | role INTEGER,
27 | PRIMARY KEY (user_id, course_id)
28 | )
29 | ''')
30 |
31 | fname = raw_input('Enter file name: ')
32 | if ( len(fname) < 1 ) : fname = 'roster_data.json'
33 |
34 | # [
35 | # [ "Charley", "si110", 1 ],
36 | # [ "Mea", "si110", 0 ],
37 |
38 | str_data = open(fname).read()
39 | json_data = json.loads(str_data)
40 |
41 | for entry in json_data:
42 |
43 |     name = entry[0]
44 |     title = entry[1]
45 |
46 | print name, title
47 |
48 | cur.execute('''INSERT OR IGNORE INTO User (name)
49 | VALUES ( ? )''', ( name, ) )
50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
51 | user_id = cur.fetchone()[0]
52 |
53 | cur.execute('''INSERT OR IGNORE INTO Course (title)
54 | VALUES ( ? )''', ( title, ) )
55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
56 | course_id = cur.fetchone()[0]
57 |
58 | cur.execute('''INSERT OR REPLACE INTO Member
59 | (user_id, course_id) VALUES ( ?, ? )''',
60 | ( user_id, course_id ) )
61 |
62 | conn.commit()
63 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/roster/roster_data.json:
--------------------------------------------------------------------------------
1 | [
2 | [
3 | "Charley",
4 | "si110",
5 | 1
6 | ],
7 | [
8 | "Mea",
9 | "si110",
10 | 0
11 | ],
12 | [
13 | "Hattie",
14 | "si110",
15 | 0
16 | ],
17 | [
18 | "Lyena",
19 | "si110",
20 | 0
21 | ],
22 | [
23 | "Keziah",
24 | "si110",
25 | 0
26 | ],
27 | [
28 | "Ellyce",
29 | "si110",
30 | 0
31 | ],
32 | [
33 | "Thalia",
34 | "si110",
35 | 0
36 | ],
37 | [
38 | "Meabh",
39 | "si110",
40 | 0
41 | ],
42 | [
43 | "Aria",
44 | "si110",
45 | 0
46 | ],
47 | [
48 | "Reena",
49 | "si110",
50 | 0
51 | ],
52 | [
53 | "Ioannis",
54 | "si110",
55 | 0
56 | ],
57 | [
58 | "Reily",
59 | "si110",
60 | 0
61 | ],
62 | [
63 | "Sidharth",
64 | "si110",
65 | 0
66 | ],
67 | [
68 | "Keiara",
69 | "si110",
70 | 0
71 | ],
72 | [
73 | "Yann",
74 | "si110",
75 | 0
76 | ],
77 | [
78 | "Marykate",
79 | "si110",
80 | 0
81 | ],
82 | [
83 | "Dylan",
84 | "si110",
85 | 0
86 | ],
87 | [
88 | "Kiran",
89 | "si110",
90 | 0
91 | ],
92 | [
93 | "Faizaan",
94 | "si110",
95 | 0
96 | ],
97 | [
98 | "Aneshia",
99 | "si110",
100 | 0
101 | ],
102 | [
103 | "Kamron",
104 | "si110",
105 | 0
106 | ],
107 | [
108 | "Allen",
109 | "si110",
110 | 0
111 | ],
112 | [
113 | "Marshall",
114 | "si110",
115 | 0
116 | ],
117 | [
118 | "Rosa",
119 | "si106",
120 | 1
121 | ],
122 | [
123 | "Nora",
124 | "si106",
125 | 0
126 | ],
127 | [
128 | "Mairin",
129 | "si106",
130 | 0
131 | ],
132 | [
133 | "Zendel",
134 | "si106",
135 | 0
136 | ],
137 | [
138 | "Honie",
139 | "si106",
140 | 0
141 | ],
142 | [
143 | "Betsy",
144 | "si106",
145 | 0
146 | ],
147 | [
148 | "Davie",
149 | "si106",
150 | 0
151 | ],
152 | [
153 | "Larissa",
154 | "si106",
155 | 0
156 | ],
157 | [
158 | "Shaurya",
159 | "si106",
160 | 0
161 | ],
162 | [
163 | "Shania",
164 | "si106",
165 | 0
166 | ],
167 | [
168 | "Sorcha",
169 | "si106",
170 | 0
171 | ],
172 | [
173 | "Jeanna",
174 | "si106",
175 | 0
176 | ],
177 | [
178 | "Temba",
179 | "si106",
180 | 0
181 | ],
182 | [
183 | "Buse",
184 | "si106",
185 | 0
186 | ],
187 | [
188 | "Mohammed",
189 | "si106",
190 | 0
191 | ],
192 | [
193 | "Kayah",
194 | "si106",
195 | 0
196 | ],
197 | [
198 | "Kareena",
199 | "si106",
200 | 0
201 | ],
202 | [
203 | "Dineo",
204 | "si106",
205 | 0
206 | ],
207 | [
208 | "Philippa",
209 | "si106",
210 | 0
211 | ],
212 | [
213 | "Lia",
214 | "si206",
215 | 1
216 | ],
217 | [
218 | "Sharlyn",
219 | "si206",
220 | 0
221 | ],
222 | [
223 | "Linton",
224 | "si206",
225 | 0
226 | ],
227 | [
228 | "Temilade",
229 | "si206",
230 | 0
231 | ],
232 | [
233 | "Areez",
234 | "si206",
235 | 0
236 | ],
237 | [
238 | "MacCartney",
239 | "si206",
240 | 0
241 | ],
242 | [
243 | "Abubakar",
244 | "si206",
245 | 0
246 | ],
247 | [
248 | "Derryn",
249 | "si206",
250 | 0
251 | ],
252 | [
253 | "Elan",
254 | "si206",
255 | 0
256 | ],
257 | [
258 | "Vikki",
259 | "si206",
260 | 0
261 | ],
262 | [
263 | "Anisa",
264 | "si206",
265 | 0
266 | ],
267 | [
268 | "Klevis",
269 | "si206",
270 | 0
271 | ],
272 | [
273 | "Tait",
274 | "si206",
275 | 0
276 | ],
277 | [
278 | "Rhea",
279 | "si206",
280 | 0
281 | ],
282 | [
283 | "Pearsen",
284 | "si206",
285 | 0
286 | ],
287 | [
288 | "Willow",
289 | "si206",
290 | 0
291 | ],
292 | [
293 | "Skye",
294 | "si206",
295 | 0
296 | ],
297 | [
298 | "Caralee",
299 | "si206",
300 | 0
301 | ],
302 | [
303 | "Charlee",
304 | "si206",
305 | 0
306 | ],
307 | [
308 | "Karyn",
309 | "si206",
310 | 0
311 | ],
312 | [
313 | "Elana",
314 | "si206",
315 | 0
316 | ],
317 | [
318 | "Maggie",
319 | "si206",
320 | 0
321 | ],
322 | [
323 | "Eryk",
324 | "si206",
325 | 0
326 | ],
327 | [
328 | "Zulaikha",
329 | "si301",
330 | 1
331 | ],
332 | [
333 | "Elshan",
334 | "si301",
335 | 0
336 | ],
337 | [
338 | "Anastasia",
339 | "si301",
340 | 0
341 | ],
342 | [
343 | "Connar",
344 | "si301",
345 | 0
346 | ],
347 | [
348 | "Anay",
349 | "si301",
350 | 0
351 | ],
352 | [
353 | "Jayla",
354 | "si301",
355 | 0
356 | ],
357 | [
358 | "Cai",
359 | "si301",
360 | 0
361 | ],
362 | [
363 | "Zijie",
364 | "si301",
365 | 0
366 | ],
367 | [
368 | "Riana",
369 | "si301",
370 | 0
371 | ],
372 | [
373 | "Codie",
374 | "si301",
375 | 0
376 | ],
377 | [
378 | "Colette",
379 | "si301",
380 | 0
381 | ],
382 | [
383 | "Lucee",
384 | "si301",
385 | 0
386 | ],
387 | [
388 | "Tatiana",
389 | "si301",
390 | 0
391 | ],
392 | [
393 | "Zhong",
394 | "si301",
395 | 0
396 | ],
397 | [
398 | "Lowri",
399 | "si301",
400 | 0
401 | ],
402 | [
403 | "Maggy",
404 | "si301",
405 | 0
406 | ],
407 | [
408 | "Basher",
409 | "si301",
410 | 0
411 | ],
412 | [
413 | "Tanika",
414 | "si301",
415 | 0
416 | ],
417 | [
418 | "Aria",
419 | "si301",
420 | 0
421 | ],
422 | [
423 | "Belle",
424 | "si301",
425 | 0
426 | ],
427 | [
428 | "Laranya",
429 | "si301",
430 | 0
431 | ],
432 | [
433 | "Dayna",
434 | "si301",
435 | 0
436 | ],
437 | [
438 | "Elleanne",
439 | "si301",
440 | 0
441 | ],
442 | [
443 | "Maanav",
444 | "si310",
445 | 1
446 | ],
447 | [
448 | "Tamta",
449 | "si310",
450 | 0
451 | ],
452 | [
453 | "Frazer",
454 | "si310",
455 | 0
456 | ],
457 | [
458 | "Sacha",
459 | "si310",
460 | 0
461 | ],
462 | [
463 | "Aidan",
464 | "si310",
465 | 0
466 | ],
467 | [
468 | "Abel",
469 | "si310",
470 | 0
471 | ],
472 | [
473 | "Ahtasham",
474 | "si310",
475 | 0
476 | ],
477 | [
478 | "Avinash",
479 | "si310",
480 | 0
481 | ],
482 | [
483 | "Colette",
484 | "si310",
485 | 0
486 | ],
487 | [
488 | "Cohen",
489 | "si310",
490 | 0
491 | ],
492 | [
493 | "Rori",
494 | "si310",
495 | 0
496 | ],
497 | [
498 | "Youer",
499 | "si310",
500 | 0
501 | ],
502 | [
503 | "Jamey",
504 | "si310",
505 | 0
506 | ],
507 | [
508 | "Makenzie",
509 | "si310",
510 | 0
511 | ],
512 | [
513 | "Ida",
514 | "si310",
515 | 0
516 | ],
517 | [
518 | "Alexzander",
519 | "si310",
520 | 0
521 | ],
522 | [
523 | "Kavita",
524 | "si310",
525 | 0
526 | ],
527 | [
528 | "Talia",
529 | "si310",
530 | 0
531 | ],
532 | [
533 | "Anthony",
534 | "si310",
535 | 0
536 | ],
537 | [
538 | "Elona",
539 | "si334",
540 | 1
541 | ],
542 | [
543 | "Inan",
544 | "si334",
545 | 0
546 | ],
547 | [
548 | "Caoilainn",
549 | "si334",
550 | 0
551 | ],
552 | [
553 | "Ainsley",
554 | "si334",
555 | 0
556 | ],
557 | [
558 | "Franciszek",
559 | "si334",
560 | 0
561 | ],
562 | [
563 | "Corrie",
564 | "si334",
565 | 0
566 | ],
567 | [
568 | "Nolan",
569 | "si334",
570 | 0
571 | ],
572 | [
573 | "Makala",
574 | "si334",
575 | 0
576 | ],
577 | [
578 | "Obieluem",
579 | "si334",
580 | 0
581 | ],
582 | [
583 | "Camryn",
584 | "si334",
585 | 0
586 | ],
587 | [
588 | "Honie",
589 | "si334",
590 | 0
591 | ],
592 | [
593 | "Ole",
594 | "si334",
595 | 0
596 | ],
597 | [
598 | "Raine",
599 | "si334",
600 | 0
601 | ],
602 | [
603 | "Tyllor",
604 | "si334",
605 | 0
606 | ],
607 | [
608 | "Diane",
609 | "si334",
610 | 0
611 | ],
612 | [
613 | "Cullen",
614 | "si334",
615 | 0
616 | ],
617 | [
618 | "Taylor",
619 | "si334",
620 | 0
621 | ],
622 | [
623 | "Schekina",
624 | "si334",
625 | 0
626 | ],
627 | [
628 | "Kensey",
629 | "si334",
630 | 0
631 | ],
632 | [
633 | "Zhi",
634 | "si334",
635 | 0
636 | ],
637 | [
638 | "Kiran",
639 | "si334",
640 | 0
641 | ],
642 | [
643 | "Tymoteusz",
644 | "si334",
645 | 0
646 | ],
647 | [
648 | "Windsor",
649 | "si363",
650 | 1
651 | ],
652 | [
653 | "Kashish",
654 | "si363",
655 | 0
656 | ],
657 | [
658 | "Diarmid",
659 | "si363",
660 | 0
661 | ],
662 | [
663 | "Laura",
664 | "si363",
665 | 0
666 | ],
667 | [
668 | "Jaskaran",
669 | "si363",
670 | 0
671 | ],
672 | [
673 | "Presley",
674 | "si363",
675 | 0
676 | ],
677 | [
678 | "Brooklynn",
679 | "si363",
680 | 0
681 | ],
682 | [
683 | "Heddle",
684 | "si363",
685 | 0
686 | ],
687 | [
688 | "Travis",
689 | "si363",
690 | 0
691 | ],
692 | [
693 | "Alx",
694 | "si363",
695 | 0
696 | ],
697 | [
698 | "Airen",
699 | "si363",
700 | 0
701 | ],
702 | [
703 | "Erika",
704 | "si363",
705 | 0
706 | ],
707 | [
708 | "Mackie",
709 | "si363",
710 | 0
711 | ],
712 | [
713 | "Wen",
714 | "si363",
715 | 0
716 | ],
717 | [
718 | "Seaan",
719 | "si363",
720 | 0
721 | ],
722 | [
723 | "Meghan",
724 | "si363",
725 | 0
726 | ],
727 | [
728 | "Ryaan",
729 | "si363",
730 | 0
731 | ],
732 | [
733 | "Imogem",
734 | "si364",
735 | 1
736 | ],
737 | [
738 | "Harlie",
739 | "si364",
740 | 0
741 | ],
742 | [
743 | "Ronnie",
744 | "si364",
745 | 0
746 | ],
747 | [
748 | "Lucca",
749 | "si364",
750 | 0
751 | ],
752 | [
753 | "Shanelle",
754 | "si364",
755 | 0
756 | ],
757 | [
758 | "Ieuan",
759 | "si364",
760 | 0
761 | ],
762 | [
763 | "Anneliese",
764 | "si364",
765 | 0
766 | ],
767 | [
768 | "Simon",
769 | "si364",
770 | 0
771 | ],
772 | [
773 | "Sorche",
774 | "si364",
775 | 0
776 | ],
777 | [
778 | "Nawal",
779 | "si364",
780 | 0
781 | ],
782 | [
783 | "Adelaide",
784 | "si364",
785 | 0
786 | ],
787 | [
788 | "Rhia",
789 | "si364",
790 | 0
791 | ],
792 | [
793 | "Katarzyna",
794 | "si364",
795 | 0
796 | ],
797 | [
798 | "LLeyton",
799 | "si364",
800 | 0
801 | ],
802 | [
803 | "Enzo",
804 | "si364",
805 | 0
806 | ],
807 | [
808 | "Declan",
809 | "si364",
810 | 0
811 | ],
812 | [
813 | "Emelie",
814 | "si364",
815 | 0
816 | ],
817 | [
818 | "Baillie",
819 | "si364",
820 | 0
821 | ],
822 | [
823 | "Shola",
824 | "si364",
825 | 0
826 | ],
827 | [
828 | "Jenna",
829 | "si422",
830 | 1
831 | ],
832 | [
833 | "Miles",
834 | "si422",
835 | 0
836 | ],
837 | [
838 | "Sakina",
839 | "si422",
840 | 0
841 | ],
842 | [
843 | "Melanie",
844 | "si422",
845 | 0
846 | ],
847 | [
848 | "Bailie",
849 | "si422",
850 | 0
851 | ],
852 | [
853 | "Cassy",
854 | "si422",
855 | 0
856 | ],
857 | [
858 | "Nikash",
859 | "si422",
860 | 0
861 | ],
862 | [
863 | "Hebe",
864 | "si422",
865 | 0
866 | ],
867 | [
868 | "Sia",
869 | "si422",
870 | 0
871 | ],
872 | [
873 | "Skyla",
874 | "si422",
875 | 0
876 | ],
877 | [
878 | "Jamaal",
879 | "si422",
880 | 0
881 | ],
882 | [
883 | "Keanna",
884 | "si422",
885 | 0
886 | ],
887 | [
888 | "Vanya",
889 | "si422",
890 | 0
891 | ],
892 | [
893 | "Temperance",
894 | "si422",
895 | 0
896 | ],
897 | [
898 | "Hafiza",
899 | "si422",
900 | 0
901 | ],
902 | [
903 | "Alx",
904 | "si422",
905 | 0
906 | ],
907 | [
908 | "Brigitte",
909 | "si422",
910 | 0
911 | ],
912 | [
913 | "Eliana",
914 | "si422",
915 | 0
916 | ],
917 | [
918 | "Kayden",
919 | "si422",
920 | 0
921 | ],
922 | [
923 | "Man",
924 | "si422",
925 | 0
926 | ],
927 | [
928 | "Jaydyn",
929 | "si422",
930 | 0
931 | ],
932 | [
933 | "Soukina",
934 | "si430",
935 | 1
936 | ],
937 | [
938 | "Stephenjunior",
939 | "si430",
940 | 0
941 | ],
942 | [
943 | "Buddy",
944 | "si430",
945 | 0
946 | ],
947 | [
948 | "Holly",
949 | "si430",
950 | 0
951 | ],
952 | [
953 | "Kamilia",
954 | "si430",
955 | 0
956 | ],
957 | [
958 | "Cassie",
959 | "si430",
960 | 0
961 | ],
962 | [
963 | "Kris",
964 | "si430",
965 | 0
966 | ],
967 | [
968 | "Maia",
969 | "si430",
970 | 0
971 | ],
972 | [
973 | "Abel",
974 | "si430",
975 | 0
976 | ],
977 | [
978 | "Tamika",
979 | "si430",
980 | 0
981 | ],
982 | [
983 | "Deano",
984 | "si430",
985 | 0
986 | ],
987 | [
988 | "Rosa",
989 | "si430",
990 | 0
991 | ],
992 | [
993 | "Georgia",
994 | "si430",
995 | 0
996 | ],
997 | [
998 | "Louie",
999 | "si430",
1000 | 0
1001 | ],
1002 | [
1003 | "Kassie",
1004 | "si430",
1005 | 0
1006 | ],
1007 | [
1008 | "Mutinta",
1009 | "si430",
1010 | 0
1011 | ],
1012 | [
1013 | "Manwen",
1014 | "si430",
1015 | 0
1016 | ]
1017 | ]
--------------------------------------------------------------------------------
/EXAMPLE CODE/search1.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | count = 0
3 | for line in fhand:
4 | if line.startswith('From:') :
5 | print line
6 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search10.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | for line in fhand:
3 | words = line.split()
4 | # print 'Debug:', words
5 | if len(words) == 0 : continue
6 | if words[0] != 'From' : continue
7 | print words[2]
8 |
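9 | # Checking len(words) == 0 before touching words[0] is what keeps this
10 | # version from crashing on the blank lines that break search8.py.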
--------------------------------------------------------------------------------
/EXAMPLE CODE/search2.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | for line in fhand:
3 | line = line.rstrip()
4 | if line.startswith('From:') :
5 | print line
6 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search3.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | for line in fhand:
3 | line = line.rstrip()
4 | # Skip 'uninteresting lines'
5 | if not line.startswith('From:') :
6 | continue
7 | # Process our 'interesting' line
8 | print line
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search4.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | for line in fhand:
3 | line = line.rstrip()
4 | if line.find('@uct.ac.za') == -1 : continue
5 | print line
6 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search5.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | for line in fhand:
3 | line = line.rstrip()
4 | if not line.startswith('From ') : continue
5 | words = line.split()
6 | print words[2]
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search6.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter the file name: ')
2 | fhand = open(fname)
3 | count = 0
4 | for line in fhand:
5 | if line.startswith('Subject:') :
6 | count = count + 1
7 | print 'There were', count, 'subject lines in', fname
8 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search7.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter the file name: ')
2 | try:
3 | fhand = open(fname)
4 | except:
5 | print 'File cannot be opened:', fname
6 | exit()
7 | count = 0
8 | for line in fhand:
9 | if line.startswith('Subject:') :
10 | count = count + 1
11 | print 'There were', count, 'subject lines in', fname
12 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search8.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | count = 0
3 | for line in fhand:
4 | words = line.split()
5 | if words[0] != 'From' : continue
6 | print words[2]
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/search9.py:
--------------------------------------------------------------------------------
1 | fhand = open('mbox-short.txt')
2 | count = 0
3 | for line in fhand:
4 | words = line.split()
5 | print 'Debug:', words
6 | if words[0] != 'From' : continue
7 | print words[2]
8 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/sequence.py:
--------------------------------------------------------------------------------
1 | inp = raw_input('Enter a Number:')
2 | n = int(inp)
3 | while n != 1:
4 | print n, # Use comma to suppress newline
5 | if n%2 == 0: # n is even
6 | n = n/2
7 | else: # n is odd
8 | n = n*3+1
9 |
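10 | # This prints the Collatz ("hailstone") sequence: halve even numbers,
11 | # triple-plus-one odd ones; it is conjectured to reach 1 for every n > 0.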
--------------------------------------------------------------------------------
/EXAMPLE CODE/socket1.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
4 | mysock.connect(('www.py4inf.com', 80))
5 | mysock.send('GET http://www.py4inf.com/code/romeo.txt HTTP/1.0\n\n')
6 |
7 | while True:
8 | data = mysock.recv(512)
9 | if ( len(data) < 1 ) :
10 | break
11 |     print data
12 |
13 | mysock.close()
14 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/socket2.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 | url = raw_input('Enter: ')
4 | words = url.split('/')
5 | host = words[2]
6 |
7 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
8 | mysock.connect((host, 80))
9 | mysock.send('GET '+url+' HTTP/1.0\n\n')
10 |
11 | while True:
12 | data = mysock.recv(512)
13 | if ( len(data) < 1 ) :
14 | break
15 | print data,
16 |
17 | mysock.close()
18 |
19 |
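20 | # Note: words[2] assumes a URL of the form http://host/page, and the raw
21 | # socket speaks HTTP/1.0 on port 80, so https URLs will not work here.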
--------------------------------------------------------------------------------
/EXAMPLE CODE/soft.py:
--------------------------------------------------------------------------------
1 | txt = 'but soft what light in yonder window breaks'
2 | words = txt.split()
3 | t = list()
4 | for word in words:
5 | t.append((len(word), word))
6 |
7 | t.sort(reverse=True)
8 |
9 | res = list()
10 | for length, word in t:
11 | res.append(word)
12 |
13 | print res
14 |
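15 | # This is the decorate-sort-undecorate pattern: pair each word with its
16 | # length, sort the pairs (longest first), then strip the lengths back off.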
--------------------------------------------------------------------------------
/EXAMPLE CODE/spamave.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter the file name: ')
2 | try:
3 | fhand = open(fname)
4 | except:
5 | print 'File cannot be opened:', fname
6 | exit()
7 | count = 0
8 | total = 0
9 | for line in fhand:
10 | words = line.split()
11 | if len(words) != 2 : continue
12 | if words[0] != 'X-DSPAM-Confidence:' : continue
13 | try:
14 | conf = float(words[1])
15 | except:
16 | continue
17 | count = count + 1
18 | total = total + conf
19 | average = total / count
20 | print 'Average spam confidence:', average
21 |
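22 | # Assumes the file contains at least one X-DSPAM-Confidence: line;
23 | # otherwise count stays 0 and the division raises ZeroDivisionError.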
--------------------------------------------------------------------------------
/EXAMPLE CODE/tracks.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/tracks.zip
--------------------------------------------------------------------------------
/EXAMPLE CODE/tracks/README.txt:
--------------------------------------------------------------------------------
1 | TBD
2 |
3 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/tracks/tracks.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import sqlite3
3 |
4 | conn = sqlite3.connect('trackdb.sqlite')
5 | cur = conn.cursor()
6 |
7 | # Make some fresh tables using executescript()
8 | cur.executescript('''
9 | DROP TABLE IF EXISTS Artist;
10 | DROP TABLE IF EXISTS Album;
11 | DROP TABLE IF EXISTS Track;
12 |
13 | CREATE TABLE Artist (
14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
15 | name TEXT UNIQUE
16 | );
17 |
18 | CREATE TABLE Album (
19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
20 | artist_id INTEGER,
21 | title TEXT UNIQUE
22 | );
23 |
24 | CREATE TABLE Track (
25 | id INTEGER NOT NULL PRIMARY KEY
26 | AUTOINCREMENT UNIQUE,
27 | title TEXT UNIQUE,
28 | album_id INTEGER,
29 | len INTEGER, rating INTEGER, count INTEGER
30 | );
31 | ''')
32 |
33 |
34 | fname = raw_input('Enter file name: ')
35 | if ( len(fname) < 1 ) : fname = 'Library.xml'
36 |
37 | # <key>Track ID</key><integer>369</integer>
38 | # <key>Name</key><string>Another One Bites The Dust</string>
39 | # <key>Artist</key><string>Queen</string>
40 | def lookup(d, key):
41 | found = False
42 | for child in d:
43 | if found : return child.text
44 | if child.tag == 'key' and child.text == key :
45 | found = True
46 | return None
47 |
48 | stuff = ET.parse(fname)
49 | all = stuff.findall('dict/dict/dict')
50 | print 'Dict count:', len(all)
51 | for entry in all:
52 | if ( lookup(entry, 'Track ID') is None ) : continue
53 |
54 | name = lookup(entry, 'Name')
55 | artist = lookup(entry, 'Artist')
56 | album = lookup(entry, 'Album')
57 | count = lookup(entry, 'Play Count')
58 | rating = lookup(entry, 'Rating')
59 | length = lookup(entry, 'Total Time')
60 |
61 | if name is None or artist is None or album is None :
62 | continue
63 |
64 | print name, artist, album, count, rating, length
65 |
66 | cur.execute('''INSERT OR IGNORE INTO Artist (name)
67 | VALUES ( ? )''', ( artist, ) )
68 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, ))
69 | artist_id = cur.fetchone()[0]
70 |
71 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id)
72 | VALUES ( ?, ? )''', ( album, artist_id ) )
73 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, ))
74 | album_id = cur.fetchone()[0]
75 |
76 | cur.execute('''INSERT OR REPLACE INTO Track
77 | (title, album_id, len, rating, count)
78 | VALUES ( ?, ?, ?, ?, ? )''',
79 | ( name, album_id, length, rating, count ) )
80 |
81 | conn.commit()
82 |
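83 | # lookup() works because the plist format stores each dict as a flat list
84 | # of children in which every <key> element is followed by its value element.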
--------------------------------------------------------------------------------
/EXAMPLE CODE/twdump.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 | cur.execute('SELECT * FROM Twitter')
6 | count = 0
7 | for row in cur :
8 | print row
9 | count = count + 1
10 | print count, 'rows.'
11 | cur.close()
12 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twfriends.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import twurl
3 | import json
4 | import sqlite3
5 |
6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'
7 |
8 | conn = sqlite3.connect('friends.sqlite')
9 | cur = conn.cursor()
10 |
11 | cur.execute('''CREATE TABLE IF NOT EXISTS People
12 | (id INTEGER PRIMARY KEY, name TEXT UNIQUE, retrieved INTEGER)''')
13 | cur.execute('''CREATE TABLE IF NOT EXISTS Follows
14 | (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''')
15 |
16 | while True:
17 | acct = raw_input('Enter a Twitter account, or quit: ')
18 | if ( acct == 'quit' ) : break
19 | if ( len(acct) < 1 ) :
20 | cur.execute('SELECT id, name FROM People WHERE retrieved = 0 LIMIT 1')
21 | try:
22 | (id, acct) = cur.fetchone()
23 | except:
24 | print 'No unretrieved Twitter accounts found'
25 | continue
26 | else:
27 | cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1',
28 | (acct, ) )
29 | try:
30 | id = cur.fetchone()[0]
31 | except:
32 | cur.execute('INSERT OR IGNORE INTO People (name, retrieved) VALUES ( ?, 0)',
33 | ( acct, ) )
34 | conn.commit()
35 | if cur.rowcount != 1 :
36 | print 'Error inserting account:',acct
37 | continue
38 | id = cur.lastrowid
39 |
40 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} )
41 | print 'Retrieving account', acct
42 | connection = urllib.urlopen(url)
43 | data = connection.read()
44 | headers = connection.info().dict
45 | print 'Remaining', headers['x-rate-limit-remaining']
46 |
47 | js = json.loads(data)
48 | # print json.dumps(js, indent=4)
49 |
50 | cur.execute('UPDATE People SET retrieved=1 WHERE name = ?', (acct, ) )
51 |
52 | countnew = 0
53 | countold = 0
54 | for u in js['users'] :
55 | friend = u['screen_name']
56 | print friend
57 | cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1',
58 | (friend, ) )
59 | try:
60 | friend_id = cur.fetchone()[0]
61 | countold = countold + 1
62 | except:
63 | cur.execute('''INSERT OR IGNORE INTO People (name, retrieved)
64 | VALUES ( ?, 0)''', ( friend, ) )
65 | conn.commit()
66 | if cur.rowcount != 1 :
67 | print 'Error inserting account:',friend
68 | continue
69 | friend_id = cur.lastrowid
70 | countnew = countnew + 1
71 | cur.execute('INSERT OR IGNORE INTO Follows (from_id, to_id) VALUES (?, ?)',
72 | (id, friend_id) )
73 | print 'New accounts=',countnew,' revisited=',countold
74 | conn.commit()
75 |
76 | cur.close()
77 |
78 |
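79 | # Pressing enter at the prompt picks the next person with retrieved = 0,
80 | # so an interrupted crawl can be resumed where it left off.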
--------------------------------------------------------------------------------
/EXAMPLE CODE/twitter1.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import twurl
3 |
4 | TWITTER_URL = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
5 |
6 | while True:
7 | print ''
8 | acct = raw_input('Enter Twitter Account:')
9 | if ( len(acct) < 1 ) : break
10 | url = twurl.augment(TWITTER_URL,
11 | {'screen_name': acct, 'count': '2'} )
12 | print 'Retrieving', url
13 | connection = urllib.urlopen(url)
14 | data = connection.read()
15 | print data[:250]
16 | headers = connection.info().dict
17 | # print headers
18 | print 'Remaining', headers['x-rate-limit-remaining']
19 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twitter2.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import twurl
3 | import json
4 |
5 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'
6 |
7 | while True:
8 | print ''
9 | acct = raw_input('Enter Twitter Account:')
10 | if ( len(acct) < 1 ) : break
11 | url = twurl.augment(TWITTER_URL,
12 | {'screen_name': acct, 'count': '5'} )
13 | print 'Retrieving', url
14 | connection = urllib.urlopen(url)
15 | data = connection.read()
16 | headers = connection.info().dict
17 | print 'Remaining', headers['x-rate-limit-remaining']
18 | js = json.loads(data)
19 | print json.dumps(js, indent=4)
20 |
21 | for u in js['users'] :
22 | print u['screen_name']
23 | s = u['status']['text']
24 | print ' ',s[:50]
25 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twjoin.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('friends.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('SELECT * FROM People')
7 | count = 0
8 | print 'People:'
9 | for row in cur :
10 | if count < 5: print row
11 | count = count + 1
12 | print count, 'rows.'
13 |
14 | cur.execute('SELECT * FROM Follows')
15 | count = 0
16 | print 'Follows:'
17 | for row in cur :
18 | if count < 5: print row
19 | count = count + 1
20 | print count, 'rows.'
21 |
22 | cur.execute('''SELECT * FROM Follows JOIN People
23 | ON Follows.to_id = People.id WHERE Follows.from_id = 2''')
24 | count = 0
25 | print 'Connections for id=2:'
26 | for row in cur :
27 | if count < 5: print row
28 | count = count + 1
29 | print count, 'rows.'
30 |
31 | cur.close()
32 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twspider.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import twurl
3 | import json
4 | import sqlite3
5 |
6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'
7 |
8 | conn = sqlite3.connect('spider.sqlite')
9 | cur = conn.cursor()
10 |
11 | cur.execute('''
12 | CREATE TABLE IF NOT EXISTS Twitter (name TEXT, retrieved INTEGER, friends INTEGER)''')
13 |
14 | while True:
15 | acct = raw_input('Enter a Twitter account, or quit: ')
16 | if ( acct == 'quit' ) : break
17 | if ( len(acct) < 1 ) :
18 | cur.execute('SELECT name FROM Twitter WHERE retrieved = 0 LIMIT 1')
19 | try:
20 | acct = cur.fetchone()[0]
21 | except:
22 | print 'No unretrieved Twitter accounts found'
23 | continue
24 |
25 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} )
26 | print 'Retrieving', url
27 | connection = urllib.urlopen(url)
28 | data = connection.read()
29 | headers = connection.info().dict
30 | print 'Remaining', headers['x-rate-limit-remaining']
31 | js = json.loads(data)
32 | # print json.dumps(js, indent=4)
33 |
34 | cur.execute('UPDATE Twitter SET retrieved=1 WHERE name = ?', (acct, ) )
35 |
36 | countnew = 0
37 | countold = 0
38 | for u in js['users'] :
39 | friend = u['screen_name']
40 | print friend
41 | cur.execute('SELECT friends FROM Twitter WHERE name = ? LIMIT 1',
42 | (friend, ) )
43 | try:
44 | count = cur.fetchone()[0]
45 | cur.execute('UPDATE Twitter SET friends = ? WHERE name = ?',
46 | (count+1, friend) )
47 | countold = countold + 1
48 | except:
49 | cur.execute('''INSERT INTO Twitter (name, retrieved, friends)
50 | VALUES ( ?, 0, 1 )''', ( friend, ) )
51 | countnew = countnew + 1
52 | print 'New accounts=',countnew,' revisited=',countold
53 | conn.commit()
54 |
55 | cur.close()
56 |
57 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twtest.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | from twurl import augment
3 |
4 | print '* Calling Twitter...'
5 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json',
6 | {'screen_name': 'drchuck', 'count': '2'} )
7 | print url
8 | connection = urllib.urlopen(url)
9 | data = connection.read()
10 | print data
11 | headers = connection.info().dict
12 | print headers
13 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/twurl.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import oauth
3 | import hidden
4 |
5 | def augment(url, parameters) :
6 | secrets = hidden.oauth()
7 | consumer = oauth.OAuthConsumer(secrets['consumer_key'], secrets['consumer_secret'])
8 | token = oauth.OAuthToken(secrets['token_key'],secrets['token_secret'])
9 |
10 | oauth_request = oauth.OAuthRequest.from_consumer_and_token(consumer,
11 | token=token, http_method='GET', http_url=url, parameters=parameters)
12 | oauth_request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, token)
13 | return oauth_request.to_url()
14 |
15 |
16 | def test_me() :
17 | print '* Calling Twitter...'
18 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json',
19 | {'screen_name': 'drchuck', 'count': '2'} )
20 | print url
21 | connection = urllib.urlopen(url)
22 | data = connection.read()
23 | print data
24 | headers = connection.info().dict
25 | print headers
26 |
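27 | # augment() signs the request with OAuth 1.0a (HMAC-SHA1) using the keys
28 | # from hidden.py and returns the URL with the oauth_* parameters attached.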
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtcheck.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join
3 | for (dirname, dirs, files) in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | thefile = os.path.join(dirname,filename)
7 | size = os.path.getsize(thefile)
8 | if size == 2578 or size == 2565:
9 | continue
10 | fhand = open(thefile,'r')
11 | lines = list()
12 | for line in fhand:
13 | lines.append(line)
14 | fhand.close()
15 | if len(lines) > 1:
16 | print len(lines), thefile
17 | print lines[:4]
18 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtcheck2.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join
3 | for (dirname, dirs, files) in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | thefile = os.path.join(dirname,filename)
7 | size = os.path.getsize(thefile)
8 | if size == 2578 or size == 2565:
9 | continue
10 | fhand = open(thefile,'r')
11 | lines = list()
12 | for line in fhand:
13 | lines.append(line)
14 | fhand.close()
15 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') :
16 | continue
17 | if len(lines) > 1:
18 | print len(lines), thefile
19 | print lines[:4]
20 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtcheck3.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join
3 | for (dirname, dirs, files) in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | thefile = os.path.join(dirname,filename)
7 | size = os.path.getsize(thefile)
8 | if size == 2578 or size == 2565:
9 | print 'T-Mobile:',thefile
10 | continue
11 | fhand = open(thefile,'r')
12 | lines = list()
13 | for line in fhand:
14 | lines.append(line)
15 | fhand.close()
16 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') :
17 | print 'iPhone:', thefile
18 | continue
19 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtcount.py:
--------------------------------------------------------------------------------
1 | import os
2 | count = 0
3 | for dirname, dirs, files in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | count = count + 1
7 |
8 | print 'Files:', count
9 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtdelete.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join
3 | for (dirname, dirs, files) in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | thefile = os.path.join(dirname,filename)
7 | size = os.path.getsize(thefile)
8 | if size == 2578 or size == 2565:
9 | print 'T-Mobile:',thefile
10 | os.remove(thefile)
11 | continue
12 | fhand = open(thefile,'r')
13 | lines = list()
14 | for line in fhand:
15 | lines.append(line)
16 | fhand.close()
17 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') :
18 | print 'iPhone:', thefile
19 | os.remove(thefile)
20 | continue
21 |
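22 | # Caution: unlike txtcheck3.py, this version actually calls os.remove(),
23 | # so run it only after the check scripts report the files you expect.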
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtmd5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import hashlib
3 | from os.path import join
4 |
5 | hashes = dict()
6 | for (dirname, dirs, files) in os.walk('.'):
7 | for filename in files:
8 | if filename.endswith('.txt') :
9 | thefile = os.path.join(dirname,filename)
10 | fhand = open(thefile,'r')
11 | data = fhand.read()
12 | fhand.close()
13 | hash = hashlib.md5(data).hexdigest()
14 | # print thefile, hash
15 | if hash in hashes:
16 | print hashes[hash], thefile
17 | else:
18 | hashes[hash] = thefile
19 |
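20 | # Identical files produce identical MD5 digests, so any repeated digest
21 | # reveals a duplicate; an accidental collision is vanishingly unlikely here.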
--------------------------------------------------------------------------------
/EXAMPLE CODE/txtsize.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join
3 | for (dirname, dirs, files) in os.walk('.'):
4 | for filename in files:
5 | if filename.endswith('.txt') :
6 | thefile = os.path.join(dirname,filename)
7 | print os.path.getsize(thefile), thefile
8 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/urljpeg.py:
--------------------------------------------------------------------------------
1 | import socket
2 | import time
3 |
4 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
5 | mysock.connect(('www.py4inf.com', 80))
6 | mysock.send('GET http://www.py4inf.com/cover.jpg HTTP/1.0\n\n')
7 |
8 |
9 | count = 0
10 | picture = ""
11 | while True:
12 | data = mysock.recv(5120)
13 | if ( len(data) < 1 ) : break
14 | time.sleep(0.25)
15 | count = count + len(data)
16 | print len(data),count
17 | picture = picture + data
18 |
19 | mysock.close()
20 |
21 | # Look for the end of the header (2 CRLF)
22 | pos = picture.find("\r\n\r\n")
23 | print 'Header length',pos
24 | print picture[:pos]
25 |
26 | # Skip past the header and save the picture data
27 | picture = picture[pos+4:]
28 | fhand = open("stuff.jpg","wb")
29 | fhand.write(picture)
30 | fhand.close()
31 |
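32 | # The time.sleep(0.25) in the loop lets data accumulate between recv()
33 | # calls, so each read tends to return a fuller 5120-byte buffer.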
--------------------------------------------------------------------------------
/EXAMPLE CODE/urllib1.py:
--------------------------------------------------------------------------------
1 | import urllib
2 |
3 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt')
4 | for line in fhand:
5 | print line.strip()
6 |
7 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/urllib2.py:
--------------------------------------------------------------------------------
1 | import urllib
2 |
3 | fhand = urllib.urlopen('http://www.dr-chuck.com/page1.htm')
4 | for line in fhand:
5 | print line.strip()
--------------------------------------------------------------------------------
/EXAMPLE CODE/urllink2.py:
--------------------------------------------------------------------------------
1 | # Note - this code must run in Python 2.x and you must download
2 | # http://www.pythonlearn.com/code/BeautifulSoup.py
3 | # Into the same folder as this program
4 |
5 | import urllib
6 | from BeautifulSoup import *
7 |
8 | url = raw_input('Enter - ')
9 | html = urllib.urlopen(url).read()
10 |
11 | soup = BeautifulSoup(html)
12 |
13 | # Retrieve all of the anchor tags
14 | tags = soup('a')
15 | for tag in tags:
16 | # Look at the parts of a tag
17 | print 'TAG:',tag
18 | print 'URL:',tag.get('href', None)
19 | print 'Contents:',tag.contents[0]
20 | print 'Attrs:',tag.attrs
21 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/urllink3.py:
--------------------------------------------------------------------------------
1 | # Note - this code must run in Python 2.x and you must download
2 | # http://www.pythonlearn.com/code/BeautifulSoup.py
3 | # Into the same folder as this program
4 |
5 | import urllib
6 | from BeautifulSoup import *
7 |
8 | todo = list()
9 | visited = list()
10 | url = raw_input('Enter - ')
11 | todo.append(url)
12 |
13 | while len(todo) > 0 :
14 | print "====== Todo list count is ",len(todo)
15 | url = todo.pop()
16 |
17 | if ( not url.startswith('http') ) :
18 | print "Skipping", url
19 | continue
20 |
21 | if ( url.find('facebook') > 0 ) :
22 | continue
23 |
24 | if ( url in visited ) :
25 | print "Visited", url
26 | continue
27 |
28 | print "===== Retrieving ", url
29 |
30 | html = urllib.urlopen(url).read()
31 | soup = BeautifulSoup(html)
32 | visited.append(url)
33 |
34 | # Retrieve all of the anchor tags
35 | tags = soup('a')
36 | for tag in tags:
37 | newurl = tag.get('href', None)
38 | if ( newurl != None ) :
39 | todo.append(newurl)
40 |
41 |
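42 | # todo.pop() removes the most recently added link, so this crawler runs
43 | # depth-first and keeps going until the todo list is empty.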
--------------------------------------------------------------------------------
/EXAMPLE CODE/urllinks.py:
--------------------------------------------------------------------------------
1 | # Note - this code must run in Python 2.x and you must download
2 | # http://www.pythonlearn.com/code/BeautifulSoup.py
3 | # Into the same folder as this program
4 |
5 | import urllib
6 | from BeautifulSoup import *
7 |
8 | url = raw_input('Enter - ')
9 | html = urllib.urlopen(url).read()
10 | soup = BeautifulSoup(html)
11 |
12 | # Retrieve all of the anchor tags
13 | tags = soup('a')
14 | for tag in tags:
15 | print tag.get('href', None)
16 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/urlregex.py:
--------------------------------------------------------------------------------
1 | # Extract http:// links from the href attributes in a web page
2 | import urllib
3 | import re
4 |
5 | url = raw_input('Enter - ')
6 | html = urllib.urlopen(url).read()
7 | links = re.findall('href="(http://.*?)"', html)
8 | for link in links:
9 | print link
10 |
11 |
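12 | # The non-greedy .*? stops each match at the first closing quote, so the
13 | # findall returns each href value separately instead of one huge match.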
--------------------------------------------------------------------------------
/EXAMPLE CODE/urlwords.py:
--------------------------------------------------------------------------------
1 | import urllib
2 |
3 | counts = dict()
4 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt')
5 | for line in fhand:
6 | words = line.split()
7 | for word in words:
8 | counts[word] = counts.get(word,0) + 1
9 | print counts
10 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/whathour.py:
--------------------------------------------------------------------------------
1 | fname = raw_input('Enter file name: ')
2 | fhand = open(fname)
3 | c = dict()
4 | for line in fhand:
5 | if not line.startswith('From ') : continue
6 | pieces = line.split()
7 | time = pieces[5]
8 | parts = time.split(':')
9 | hour = parts[0]
10 | c[hour] = c.get(hour,0) + 1
11 |
12 | lst = list()
13 | for key in c:
14 | value = c[key]
15 | lst.append( (value, key) )
16 |
17 | lst.sort()
18 |
19 | for value, key in lst:
20 | print key, value
21 |
22 |
23 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/wikidata.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/wikidata.db
--------------------------------------------------------------------------------
/EXAMPLE CODE/wikigrade.py:
--------------------------------------------------------------------------------
1 | # Note - this code must run in Python 2.x and you must download
2 | # http://www.pythonlearn.com/code/BeautifulSoup.py
3 | # Into the same folder as this program
4 |
5 | import string
6 | import sqlite3
7 | import urllib
8 | import xml.etree.ElementTree as ET
9 | from BeautifulSoup import *
10 |
11 | conn = sqlite3.connect('wikidata.db')
12 | cur = conn.cursor()
13 |
14 | cur.execute('''
15 | CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY,
16 | url TEXT, page BLOB, retrieved_at timestamp)''')
17 |
18 | # A slightly extended dictionary
19 | class sash(dict):
20 | def sortvalues(self,reverse=True):
21 | return sorted(self.items(),key=lambda x: (x[1], x[0]), reverse=reverse)
22 |
23 | def tinyTable(url):
24 | global cur,conn
25 | cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, ))
26 | try:
27 | row = cur.fetchone()
28 | print 'DATE',row[2]
29 | return row[1]
30 | except:
31 | row = None
32 | print 'Retrieving', url
33 |
34 |     data = urllib.urlopen(url).read()
35 | if row != None:
36 | cur.execute("UPDATE TinyTable SET page=?,retrieved_at=datetime('now') WHERE id=?", (unicode(data, 'utf-8'), row[0]))
37 | else:
38 | cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8')))
39 | conn.commit()
40 | return data
41 |
42 | cururl = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7?pageName=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138%2Fhome&action=view&panel=Main&realm=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138'
43 | prefix = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7'
44 |
45 | urls = list()
46 | urls.append(cururl)
47 | visited = list()
48 | editcounts = sash()
49 | postcounts = sash()
50 |
51 | while len(urls) > 0 :
52 | print '=== URLS Yet To Retrieve:',len(urls)
53 | cururl = urls.pop()
54 | if cururl in visited: continue
55 | print 'RETRIEVING',cururl
56 | data = tinyTable(cururl)
57 | visited.append(cururl)
58 | soup = BeautifulSoup(data)
59 | tags = soup('a')
60 | # print 'Tags'
61 | for tag in tags:
62 | print tag
63 | url = tag.get('href',None)
64 | if url == None : continue
65 |         # Only follow links that stay under the ctools prefix above
66 | if not url.startswith(prefix) : continue
67 | newurl = urllib.basejoin(cururl,url)
68 | if newurl in visited : continue
69 | # print 'APPENDING',newurl
70 | if newurl.find('action=view') > 0 or newurl.find('action=history') > 0 :
71 | urls.append(newurl)
72 |
73 | print 'EDITS:'
74 | for (key,val) in editcounts.sortvalues():
75 | print key, val
76 |
77 | for (key,val) in sorted(postcounts.items()):
78 | print key, val
79 |
80 | conn.close()
81 |
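
Aside: the heart of this script is the tinyTable() page cache. A reduced Python 2 sketch of the same pattern, with a hypothetical cache.db and a simplified schema, to make the fetch-or-reuse logic easier to see:

import sqlite3
import urllib

conn = sqlite3.connect('cache.db')   # hypothetical cache database
cur = conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS Pages (url TEXT PRIMARY KEY, page TEXT)')

def get_page(url):
    # Cache hit: reuse the stored copy
    cur.execute('SELECT page FROM Pages WHERE url = ?', (url,))
    row = cur.fetchone()
    if row is not None:
        return row[0]
    # Cache miss: fetch the page and store it for next time
    data = urllib.urlopen(url).read()
    cur.execute('INSERT INTO Pages (url, page) VALUES (?, ?)',
                (url, unicode(data, 'utf-8')))
    conn.commit()
    return data
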
--------------------------------------------------------------------------------
/EXAMPLE CODE/wordlist.py:
--------------------------------------------------------------------------------
1 | name = raw_input('Enter file: ')
2 | handle = open(name, 'r')
3 | wordlist = list()
4 | for line in handle:
5 | words = line.split()
6 | for word in words:
7 | if word in wordlist: continue
8 | wordlist.append(word)
9 |
10 | wordlist.sort()
11 | print wordlist
12 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/words.py:
--------------------------------------------------------------------------------
1 | name = raw_input('Enter file:')
2 | handle = open(name, 'r')
3 | text = handle.read()
4 | words = text.split()
5 | counts = dict()
6 | for word in words:
7 | counts[word] = counts.get(word,0) + 1
8 |
9 | bigcount = None
10 | bigword = None
11 | for word,count in counts.items():
12 | if bigcount == None or count > bigcount:
13 | bigword = word
14 | bigcount = count
15 |
16 | print bigword, bigcount
17 |
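
Aside: the bigword/bigcount loop above is the manual way of finding a maximum. The same result can be had with max() and a key function; a short sketch with made-up counts:

counts = {'the': 5, 'clown': 2, 'car': 3}   # hypothetical counts
bigword = max(counts, key=counts.get)
print bigword, counts[bigword]   # prints: the 5
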
--------------------------------------------------------------------------------
/EXAMPLE CODE/words.txt:
--------------------------------------------------------------------------------
1 | Writing programs or programming is a very creative
2 | and rewarding activity You can write programs for
3 | many reasons ranging from making your living to solving
4 | a difficult data analysis problem to having fun to helping
5 | someone else solve a problem This book assumes that
6 | everyone needs to know how to program and that once
7 | you know how to program, you will figure out what you want
8 | to do with your newfound skills
9 |
10 | We are surrounded in our daily lives with computers ranging
11 | from laptops to cell phones We can think of these computers
12 | as our personal assistants who can take care of many things
13 | on our behalf The hardware in our current-day computers
14 | is essentially built to continuously ask us the question
15 | What would you like me to do next
16 |
17 | Our computers are fast and have vast amounts of memory and
18 | could be very helpful to us if we only knew the language to
19 | speak to explain to the computer what we would like it to
20 | do next If we knew this language we could tell the
21 | computer to do tasks on our behalf that were repetitive
22 | Interestingly, the kinds of things computers can do best
23 | are often the kinds of things that we humans find boring
24 | and mind-numbing
25 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/xml1.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | data = '''
4 | <person>
5 |   <name>Chuck</name>
6 |   <phone type="intl">
7 |     +1 734 303 4456
8 |   </phone>
9 |   <email hide="yes"/>
10 | </person>'''
11 |
12 | tree = ET.fromstring(data)
13 | print 'Name:',tree.find('name').text
14 | print 'Attr:',tree.find('email').get('hide')
15 |
--------------------------------------------------------------------------------
/EXAMPLE CODE/xml2.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | input = '''
4 | <stuff>
5 |   <users>
6 |     <user x="2">
7 |       <id>001</id>
8 |       <name>Chuck</name>
9 |     </user>
10 |     <user x="7">
11 |       <id>009</id>
12 |       <name>Brent</name>
13 |     </user>
14 |   </users>
15 | </stuff>'''
16 |
17 | stuff = ET.fromstring(input)
18 | lst = stuff.findall('users/user')
19 | print 'User count:', len(lst)
20 |
21 | for item in lst:
22 | print 'Name', item.find('name').text
23 | print 'Id', item.find('id').text
24 | print 'Attribute', item.get("x")
25 | print ""
26 |
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | This repository contains the resources and materials I've generated myself during the course ["Using Python to Access Web Data"](https://www.coursera.org/learn/python-network-data/), from the University of Michigan and offered on Coursera.
3 |
4 | # Content available
5 | Currently, the only content available is the Python files corresponding to the programming assignments I've coded for the course. I also plan to make my course notes available, but I'm still finishing them.
6 |
7 | # Folder structure
8 | The content follows the same structure as the course: there's a folder for each of the weeks the course is divided into, and each folder contains all the material from that week. There are two types of files: the ones that start with "C" are the coding files containing the exercises, while the ones that start with "A" are other resources used in, or produced by, the assignments. The two numbers that follow represent the unit and the index of that document within its type and week, respectively.
9 |
10 | *Example: the file starting with C4.2. is the 2nd coding file needed for the programming assignment of Unit 4*
11 |
--------------------------------------------------------------------------------
/Textbook - Castellano.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - Castellano.epub
--------------------------------------------------------------------------------
/Textbook - English.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - English.epub
--------------------------------------------------------------------------------
/Unit 1 - Introduction/A1.1 - Code screenshot.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.1 - Code screenshot.PNG
--------------------------------------------------------------------------------
/Unit 1 - Introduction/A1.2. - Script execution.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.2. - Script execution.PNG
--------------------------------------------------------------------------------
/Unit 1 - Introduction/C1.1 - Firstcode.py:
--------------------------------------------------------------------------------
1 | print("Hello pythonistas!")
--------------------------------------------------------------------------------
/Unit 2 - Regular expressions/C2.1 - Programming assignment.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | #Opening the file in which we'll need to find the numbers
4 | sample_file = open('A.2.2 - regex text data.txt')
5 |
6 | #Obtaining strings representing the numbers in that file
7 | text = sample_file.read() #With read, we read the entire text and not line by line
8 | number_regex = '[0-9]+'
9 | numbers = re.findall(number_regex, text) #Match any combination of one or more digits
10 |
11 | #Casting them to integers and getting the total sum
12 | total = sum(int(num) for num in numbers)
13 |
14 | print(total)
15 |
16 | #Closing the file now that we're done with it
17 | sample_file.close()
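
Aside: a with-statement closes the file automatically, even if an exception occurs, so the explicit close() can't be forgotten. A sketch of the same sum written that way, using the same data file as above:

import re

with open('A.2.2 - regex text data.txt') as sample_file:
    numbers = re.findall('[0-9]+', sample_file.read())

print(sum(int(num) for num in numbers))
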
--------------------------------------------------------------------------------
/Unit 3 - Networks and sockets/C3.1. - Programming assignment.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 | #Setting the socket
4 | mysocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
5 | mysocket.connect( ('www.pythonlearn.com', 80) )
6 |
7 | #Making the HTTP request that will get us the desired document
8 | mysocket.send('GET http://www.pythonlearn.com/code/intro-short.txt HTTP/1.0\r\n\r\n')
9 |
10 | while True:
11 | #Obtaining the web data
12 | webdata = mysocket.recv(512)
13 |
14 | #When there's no more data left, we'll stop the loop
15 | if len(webdata) < 1:
16 | break
17 |
18 | #Printing the obtained data
19 | print webdata
20 |
21 | mysocket.close()
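
Aside: urllib can fetch the same document in a few lines, building the HTTP request and stripping the response headers for us; that is the contrast the course draws with raw sockets. A sketch fetching the same URL as above:

import urllib

fhand = urllib.urlopen('http://www.pythonlearn.com/code/intro-short.txt')
for line in fhand:
    print line.strip()
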
--------------------------------------------------------------------------------
/Unit 4 - Programs that surf the web/BeautifulSoup.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 4 - Programs that surf the web/BeautifulSoup.pyc
--------------------------------------------------------------------------------
/Unit 4 - Programs that surf the web/C4.1. Programming assignment.py:
--------------------------------------------------------------------------------
1 | """
2 | This assignment consists of using urllib to read the HTML from the data files
3 | indicated, parse the data, extract the numbers, and compute the sum of the
4 | numbers in the file.
5 |
6 | DATA FORMAT:
7 | The file is a table of names and comment counts. You can ignore most of the data
8 | in the file except for lines like the following:
9 |
10 | <tr><td>Modu</td><td><span class="comments">90</span></td></tr>
11 | <tr><td>Kenzie</td><td><span class="comments">88</span></td></tr>
12 | <tr><td>Hubert</td><td><span class="comments">87</span></td></tr>
13 |
14 | You are to find all the <span> tags in the file and pull out the numbers from the
15 | <span> tag and sum the numbers.
16 |
17 | Look at the sample code (http://www.pythonlearn.com/code/urllink2.py) provided. It
18 | shows how to find all of a certain kind of tag, loop through the tags and extract
19 | the various aspects of the tags.
20 |
21 | # Retrieve all of the anchor tags
22 | tags = soup('a')
23 | for tag in tags:
24 | # Look at the parts of a tag
25 | print 'TAG:',tag
26 | print 'URL:',tag.get('href', None)
27 | print 'Contents:',tag.contents[0]
28 | print 'Attrs:',tag.attrs
29 |
30 | You need to adjust this code to look for span tags and pull out the text content of the
31 | span tag, convert them to integers and add them up to complete the assignment. """
32 |
33 |
34 | import urllib
35 | from BeautifulSoup import *
36 |
37 | sample_url = "http://python-data.dr-chuck.net/comments_42.html"
38 | data_url = "http://python-data.dr-chuck.net/comments_277464.html"
39 |
40 | #Getting the html information and parsing it with BeautifulSoup
41 | html = urllib.urlopen(data_url).read()
42 | soup = BeautifulSoup(html)
43 |
44 | #Getting a list with the "span" tags
45 | tags = soup('span')
46 |
47 | #Counting the sum of all the values within the span tags
48 | count = 0
49 | for tag in tags:
50 |
51 | #We need to cast them to int, as they're parsed as text strings
52 | count += int(tag.contents[0])
53 |
54 | print(count)
55 |
56 |
--------------------------------------------------------------------------------
/Unit 4 - Programs that surf the web/C4.2. Programming assignment 2.py:
--------------------------------------------------------------------------------
1 | """
2 | In this assignment you will write a Python program that expands on http://www.pythonlearn.com/code/urllinks.py
3 | The program will use urllib to read the HTML from the data files below, extract
4 | the href= values from the anchor tags, scan for a tag that is in a particular
5 | position from the top and follow that link, repeat the process a number of times,
6 | and report the last name you find.
7 |
8 |
9 | SAMPLE:
10 | Find the link at position 3 (the first name is 1). Follow that link. Repeat this
11 | process 4 times. The answer is the last name that you retrieve.
12 | The result should be: Anayah
13 |
14 | PROBLEM:
15 | Find the link at position 18 (the first name is 1). Follow that link. Repeat this
16 | process 7 times. The answer is the last name that you retrieve.
17 | Hint: the name starts with S
18 | """
19 |
20 | import urllib
21 | from BeautifulSoup import *
22 |
23 | #SAMPLE DATA
24 | sample_url = "http://python-data.dr-chuck.net/known_by_Fikret.html"
25 | sample_repetitions = 4
26 | sample_resultPosition = 3
27 |
28 | #ACTUAL PROBLEM DATA
29 | problem_url = "http://python-data.dr-chuck.net/known_by_Max.html"
30 | problem_repetitions = 7
31 | problem_resultPosition = 18
32 |
33 |
34 | #Choosing the type of execution we're trying
35 | type_of_execution = 'problem'
36 | if type_of_execution == 'sample':
37 | (link, repetitions, resultPosition) = (sample_url, sample_repetitions, sample_resultPosition)
38 |
39 | elif type_of_execution == 'problem':
40 | (link, repetitions, resultPosition) = (problem_url, problem_repetitions, problem_resultPosition)
41 |
42 |
43 | #Number of iterations needed
44 | for times in range(repetitions):
45 |
46 | #Getting the information from the corresponding url
47 | html = urllib.urlopen(link).read()
48 | soup = BeautifulSoup(html)
49 | tags = soup('a')
50 |
51 | #The assignment counts the first name as position 1, but Python lists are
52 | #0-indexed, so we subtract 1 from the position
53 | link = tags[resultPosition - 1].get('href')
54 |
55 | #Getting the content of the tag in the specified position. It should correspond to
56 | #the answer we're looking for
57 | result_name = tags[resultPosition - 1].contents[0]
58 | print(result_name)
59 |
60 |
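
Aside: the loop above can be wrapped in a function so the sample and the actual problem become one call each. A sketch using the sample data from the docstring (expected result: Anayah):

import urllib
from BeautifulSoup import *

def follow(url, position, repetitions):
    name = None
    for _ in range(repetitions):
        soup = BeautifulSoup(urllib.urlopen(url).read())
        tag = soup('a')[position - 1]   # positions in the assignment text are 1-based
        url = tag.get('href')
        name = tag.contents[0]
    return name

print follow('http://python-data.dr-chuck.net/known_by_Fikret.html', 3, 4)
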
--------------------------------------------------------------------------------
/Unit 5 - Web services and XML/BeautifulSoup.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 5 - Web services and XML/BeautifulSoup.pyc
--------------------------------------------------------------------------------
/Unit 5 - Web services and XML/C5.1. Programming assignment.py:
--------------------------------------------------------------------------------
1 | """
2 | EXTRACTING DATA FROM XML
3 | In this assignment you will write a Python program somewhat similar to
4 | http://www.pythonlearn.com/code/geoxml.py. The program will prompt for a URL,
5 | read the XML data from that URL using urllib and then parse and extract the
6 | comment counts from the XML data, and compute the sum of the numbers in the file.
7 |
8 | We provide two files for this assignment. One is a sample file where we give you
9 | the sum for your testing and the other is the actual data you need to process for
10 | the assignment.
11 |
12 | Sample data: http://python-data.dr-chuck.net/comments_42.xml (Sum=2553)
13 | Actual data: http://python-data.dr-chuck.net/comments_277461.xml
14 |
15 | You do not need to save these files to your folder since your program will read
16 | the data directly from the URL. Note: Each student will have a distinct data url
17 | for the assignment - so only use your own data url for analysis.
18 |
19 |
20 | DATA FORMAT AND APPROACH
21 | The data consists of a number of names and comment counts in XML as follows:
22 |
23 | <comment>
24 |   <name>Matthias</name>
25 |   <count>97</count>
26 | </comment>
27 |
28 | You are to look through all the <comment> tags and find the <count> values and sum
29 | the numbers. The closest sample code that shows how to parse XML is geoxml.py.
30 | But since the nesting of the elements in our data is different than the data we
31 | are parsing in that sample code you will have to make real changes to the code.
32 |
33 | To make the code a little simpler, you can use an XPath selector string to look
34 | through the entire tree of XML for any tag named 'count' with the following line
35 | of code:
36 |
37 | counts = tree.findall('.//count')
38 |
39 | Take a look at the Python ElementTree documentation and look for the supported
40 | XPath syntax for details. You could also work from the top of the XML down to
41 | the comments node and then loop through the child nodes of the comments node.
42 | """
43 | #We'll leave XPath for another time, as it requires further investigation. For
44 | #now we'll look for the count tags by following the known structure:
45 | #commentinfo -> comments -> comment -> count
46 |
47 |
48 | import urllib
49 | from BeautifulSoup import *
50 | import xml.etree.ElementTree as ET
51 |
52 | sample_data = "http://python-data.dr-chuck.net/comments_42.xml"
53 | actual_data = "http://python-data.dr-chuck.net/comments_277461.xml"
54 |
55 | #We'll work with this generic variable, so we only need to change its value in
56 | #one place rather than in every appearance in the code
57 | #NOTE: I'm using Sublime Text and it doesn't accept raw_input, so I'll set the URL
58 | #here instead of reading it from a user prompt
59 | data_url = actual_data
60 | data = urllib.urlopen(data_url).read()
61 |
62 | #xml_data contains the commentinfo object, as it is the main structure, so we
63 | #have to look for the comments element and then for all its comment elements
64 | xml_data = ET.fromstring(data)
65 | search_str = "comments/comment"
66 | count_tags = xml_data.findall(search_str)
67 |
68 | #Computing the sum
69 | total_count = 0
70 | for tag in count_tags:
71 | #We'll find the "count" element inside each "comment" element and add it
72 | count = tag.find('count')
73 | total_count += int(count.text)
74 |
75 | print(total_count)
76 |
77 |
78 |
79 |
80 |
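
Aside: the XPath shortcut the assignment text mentions but the code above leaves for later. A sketch against the sample data URL from the docstring (expected sum: 2553); findall('.//count') grabs every <count> element at any depth, so no knowledge of the nesting is needed:

import urllib
import xml.etree.ElementTree as ET

data = urllib.urlopen('http://python-data.dr-chuck.net/comments_42.xml').read()
tree = ET.fromstring(data)
counts = tree.findall('.//count')   # every <count> element, however deep
print sum(int(count.text) for count in counts)
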
--------------------------------------------------------------------------------
/Unit 6 - JSON and the REST architecture/C.6.1. - Programming assignment 1.py:
--------------------------------------------------------------------------------
1 | """ EXTRACTING DATA FROM JSON
2 | In this assignment you will write a Python program somewhat similar to
3 | http://www.pythonlearn.com/code/json2.py. The program will prompt for a URL, read
4 | the JSON data from that URL using urllib and then parse and extract the comment
5 | counts from the JSON data, compute the sum of the numbers in the file and enter
6 | the sum below:
7 |
8 | We provide two files for this assignment. One is a sample file where we give you
9 | the sum for your testing and the other is the actual data you need to process for
10 | the assignment.
11 |
12 | - Sample data: http://python-data.dr-chuck.net/comments_42.json (Sum=2553)
13 | - Actual data: http://python-data.dr-chuck.net/comments_277465.json
14 |
15 | You do not need to save these files to your folder since your program will read
16 | the data directly from the URL. Note: Each student will have a distinct data url
17 | for the assignment - so only use your own data url for analysis.
18 |
19 |
20 | DATA FORMAT
21 | The data consists of a number of names and comment counts in JSON as follows:
22 |
23 | {
24 | comments: [
25 | {
26 | name: "Matthias"
27 | count: 97
28 | },
29 | {
30 | name: "Geomer"
31 | count: 97
32 | }
33 | ...
34 | ]
35 | }
36 | The closest sample code that shows how to parse JSON and extract a list is
37 | json2.py. You might also want to look at geoxml.py to see how to prompt for a URL
38 | and retrieve data from a URL.
39 | """
40 |
41 |
42 | import urllib
43 | import json
44 |
45 | sample_url = "http://python-data.dr-chuck.net/comments_42.json"
46 | data_url = "http://python-data.dr-chuck.net/comments_277465.json"
47 |
48 | #Reading the URL and parsing its data
49 | urldata = urllib.urlopen(data_url).read()
50 | data = json.loads(urldata)
51 |
52 | #Finding each "count" field and adding its value to the total sum.
53 | total = 0
54 | for comment in data["comments"]:
55 | total += comment["count"]
56 |
57 | print("TOTAL SUM: ", total)
--------------------------------------------------------------------------------
/Unit 6 - JSON and the REST architecture/C.6.2. - Programming assignment 2.py:
--------------------------------------------------------------------------------
1 | """
2 | CALLING A JSON API
3 | In this assignment you will write a Python program somewhat similar to
4 | http://www.pythonlearn.com/code/geojson.py. The program will prompt for a location,
5 | contact a web service and retrieve JSON for the web service and parse that data,
6 | and retrieve the first place_id from the JSON. A place ID is a textual identifier
7 | that uniquely identifies a place within Google Maps.
8 |
9 |
10 | API ENDPOINTS
11 | To complete this assignment, you should use this API endpoint that has a static
12 | subset of the Google Data:
13 |
14 | http://python-data.dr-chuck.net/geojson
15 |
16 | This API uses the same parameters (sensor and address) as the Google API. This
17 | API also has no rate limit so you can test as often as you like. If you visit
18 | the URL with no parameters, you get a list of all of the address values which
19 | can be used with this API.
20 |
21 | To call the API, you need to provide a sensor=false parameter and the address
22 | that you are requesting as the address= parameter that is properly URL encoded
23 | using the urllib.urlencode() function as shown in
24 | http://www.pythonlearn.com/code/geojson.py
25 |
26 |
27 | TEST DATA / SAMPLE EXECUTION
28 | You can test to see if your program is working with a location of "South Federal
29 | University" which will have a place_id of "ChIJJ8oO7_B_bIcR2AlhC8nKlok".
30 |
31 |
32 | TURN IN
33 | Please run your program to find the place_id for this location: Columbia
34 | University
35 |
36 | Make sure to enter the name and case exactly as above and enter the place_id and
37 | your Python code below. Hint: The first seven characters of the place_id are
38 | "ChIJdeM ...". Make sure to retreive the data from the URL specified above and
39 | not the normal Google API. Your program should work with the Google API - but the
40 | place_id may not match for this assignment.
41 | """
42 |
43 | import json
44 | import urllib
45 |
46 | #Storing the given parameters
47 | serviceurl = "http://python-data.dr-chuck.net/geojson?"
48 | sample_address = "South Federal University"
49 | data_address = "Columbia University"
50 | address_wanted = data_address
51 |
52 | #Setting the GET parameters on the URL
53 | parameters = {"sensor": "false", "address": address_wanted}
54 | paramsurl = urllib.urlencode(parameters)
55 |
56 | #Generating the complete URL. Printing it in order to check if it's correct.
57 | queryurl = serviceurl + paramsurl
58 | print("DATA URL: ", queryurl)
59 |
60 | #Obtaining and reading the data
61 | data = urllib.urlopen(queryurl).read()
62 |
63 | #Parsing the data and looking for the field we want.
64 | #That field is inside the "results" array, in its first item (if our address is
65 | #correct, we can assume that result is the right one), under its
66 | #"place_id" key
67 | jsondata = json.loads(str(data))
68 | place_id = jsondata["results"][0]["place_id"]
69 | print("PLACE ID: ", place_id)
70 |
71 |
--------------------------------------------------------------------------------