├── Capstone ├── Mailing List I │ ├── Content.sqlite Snapshot.PNG │ ├── Gline Visualization.PNG │ ├── Gmodel Index sqlite Screenshot.PNG │ ├── Gmodel.py Application Screenshot.PNG │ ├── Histogram gbasic.PNG │ ├── README.txt │ ├── Second Gline.PNG │ ├── Wordcloud.PNG │ ├── content.sqlite │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.js │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.js │ ├── gword.py │ ├── gyear.py │ ├── index.sqlite │ └── mapping.sqlite └── Pagerank │ ├── LICENSE │ ├── Pagerank Dr.Chuck.PNG │ ├── Pagerank WP.PNG │ ├── README.txt │ ├── bs4 │ ├── __init__.py │ ├── __init__.py.bak │ ├── builder │ │ ├── __init__.py │ │ ├── __init__.py.bak │ │ ├── _html5lib.py │ │ ├── _html5lib.py.bak │ │ ├── _htmlparser.py │ │ ├── _htmlparser.py.bak │ │ ├── _lxml.py │ │ └── _lxml.py.bak │ ├── dammit.py │ ├── dammit.py.bak │ ├── diagnose.py │ ├── diagnose.py.bak │ ├── element.py │ ├── element.py.bak │ ├── testing.py │ ├── testing.py.bak │ └── tests │ │ ├── __init__.py │ │ ├── test_builder_registry.py │ │ ├── test_docs.py │ │ ├── test_html5lib.py │ │ ├── test_html5lib.py.bak │ │ ├── test_htmlparser.py │ │ ├── test_lxml.py │ │ ├── test_lxml.py.bak │ │ ├── test_soup.py │ │ ├── test_soup.py.bak │ │ ├── test_tree.py │ │ └── test_tree.py.bak │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── pagerank orginal.PNG │ ├── spdump.py │ ├── spdump.py Dr. Chuck.PNG │ ├── spdump.py WP.PNG │ ├── spider.js │ ├── spider.py │ ├── spider.sqlite │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── Programming for Everybody ├── Chapter 1 │ └── Hello World.py ├── Chapter 2 │ ├── Assignment 2.2.py │ └── Assignment 2.3.py ├── Chapter 3 │ ├── Assignment 3.1.py │ └── Assignment 3.3.py ├── Chapter 4 │ └── Assignment 4.6.py └── Chapter 5 │ └── Assignment 5.2.py ├── Python Data Structures ├── Atom Editor Test.PNG ├── Chapter 10 │ ├── Assignment 10.2.py │ └── mbox-short.txt ├── Chapter 6 │ └── Assignment 6.5.py ├── Chapter 7 │ ├── Assignment 7.1.py │ ├── Assignment 7.2.py │ ├── mbox-short.txt │ └── words.txt ├── Chapter 8 │ ├── Assignment 8.4.py │ ├── Assignment 8.5.py │ ├── mbox-short.txt │ └── romeo.txt ├── Chapter 9 │ ├── Assignment 9.4.py │ └── mbox-short.txt ├── Directory Test.PNG └── Test.py ├── README.md ├── Using Databases with Python ├── Week 2 │ ├── First Database.db │ ├── First Database.db.sqbpro │ ├── First Database.sql │ ├── emaildb.py │ ├── emaildb.sqlite │ ├── mbox-short.txt │ └── mbox.txt ├── Week 3 │ ├── Library.xml │ ├── README.txt │ ├── trackdb.sqlite │ ├── tracks.py │ ├── trackscomplete.py │ └── tracksdb.py ├── Week 4 │ ├── HW Result.sql │ ├── roster.py │ ├── roster_data.json │ └── rosterdb.sqlite └── Week 5 │ ├── Google API Key.doc │ ├── README.txt │ ├── geodata.sqlite │ ├── geodump.png │ ├── geodump.py │ ├── geoload.png │ ├── geoload.py │ ├── where.data │ ├── where.html │ ├── where.js │ └── where.png └── Using Python to Access Web Data ├── Week 1 ├── Atom Editor Test.PNG ├── Directory Test.PNG └── Test.py ├── Week 2 ├── Week 2.py └── regex_sum_41647.txt ├── Week 3 ├── Week 3.py └── intro-short.txt ├── Week 4 ├── Following Links in HTML Using BeautifulSoup.py └── Scraping HTML Data with BeautifulSoup.py ├── Week 5 └── Extracting Data from XML.py └── Week 6 ├── Extracting Data from JSON.py └── GEOSON API.py /Capstone/Mailing List I/Content.sqlite Snapshot.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Content.sqlite Snapshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gline Visualization.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gline Visualization.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Histogram gbasic.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Histogram gbasic.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and visualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in gmane.py and points to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base URL. Make sure to delete the content.sqlite file if you 25 | switch the base URL. The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down, so you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py.
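If you want a quick sense of how far the spidering has gotten before you restart gmane.py, a minimal sketch like the following prints the message count and the highest message id retrieved so far. It only assumes the Messages table that gmane.py creates and the default file name content.sqlite in the current folder:

    import sqlite3

    # Open the raw spider database created by gmane.py
    conn = sqlite3.connect('content.sqlite')
    cur = conn.cursor()

    # Count the rows and find the highest message id spidered so far
    cur.execute('SELECT COUNT(*), MAX(id) FROM Messages')
    count, maxid = cur.fetchone()
    print('Messages retrieved:', count, 'highest id:', maxid)
    conn.close()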
39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console, so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | Sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs, it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process.
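Because the headers and body in index.sqlite are stored as zlib-compressed BLOBs (see gmodel.py below), you need to decompress them when you read them back for your own analysis. Here is a minimal sketch, assuming gmodel.py has already built index.sqlite with its default Messages table, that prints the start of the body of one message:

    import sqlite3
    import zlib

    # Open the cleaned-up database produced by gmodel.py
    conn = sqlite3.connect('index.sqlite')
    cur = conn.cursor()

    # headers and body are stored as zlib-compressed BLOBs
    cur.execute('SELECT id, body FROM Messages LIMIT 1')
    row = cur.fetchone()
    if row is not None:
        body = zlib.decompress(row[1]).decode()
        print('Message', row[0])
        print(body[:200])   # first 200 characters
    conn.close()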
100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaning steps: 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net; 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also, mail addresses are 118 | forced to lower case, and some of the @gmane.org addresses like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database, there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table: 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS name. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together. 149 | 150 | You can re-run gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to ask "who does the most?" and "which 156 | organization does the most?". This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py.
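You do not have to use the SQLite browser to add mappings; a small script works too. Here is a minimal sketch, assuming only the old/new columns that gmodel.py selects from the Mapping table (note that the copy of gmodel.py in this folder actually reads the mappings from mapping.sqlite, so point the connect() call at whichever file holds your Mapping and DNSMapping tables):

    import sqlite3

    # Open the database that holds the Mapping / DNSMapping tables
    conn = sqlite3.connect('mapping.sqlite')
    cur = conn.cursor()

    # Collapse two older addresses onto the one address we want to keep
    cur.execute('INSERT INTO Mapping (old, new) VALUES (?, ?)',
                ('s-githens@northwestern.edu', 'swgithen@mtu.edu'))
    cur.execute('INSERT INTO Mapping (old, new) VALUES (?, ?)',
                ('sgithens@cam.ac.uk', 'swgithen@mtu.edu'))

    conn.commit()
    conn.close()

Then re-run gmodel.py to rebuild index.sqlite with the new mappings applied.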
181 | 182 | There is a simple visualization of the word frequency in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/Second Gline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Second Gline.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Wordcloud.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Wordcloud.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/content.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/content.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.js: -------------------------------------------------------------------------------- 1 | gline = [ ['Month','umich.edu','gmail.com','swinsborg.com','cam.ac.uk','uct.ac.za','indiana.edu','unicon.net','berkeley.edu','longsight.com','stanford.edu'], 2 | ['2005-12',57,10,0,7,14,12,6,12,0,4], 3 | ['2006-01',114,23,0,19,27,32,10,33,0,13], 4 | ['2006-02',121,28,0,28,23,33,7,28,0,24], 5 | ['2006-03',86,27,0,44,18,43,11,34,1,14], 6 | ['2006-04',125,24,0,54,38,47,42,44,1,25], 7 | ['2006-05',151,26,0,103,51,55,101,76,2,22], 8 | ['2006-06',119,30,0,76,66,70,37,32,0,13], 9 | ['2006-07',86,19,0,47,55,76,37,18,0,39], 10 | ['2006-08',136,70,0,46,44,102,50,20,0,21], 11 | ['2006-09',131,46,0,36,26,46,28,27,0,32], 12 | ['2006-10',109,28,0,74,20,52,35,30,0,31], 13 | ['2006-11',87,55,0,51,47,36,24,35,0,16], 14 | ['2006-12',54,58,0,21,13,46,8,26,0,15], 15 | ['2007-01',84,32,0,42,35,43,10,24,0,26], 16 | ['2007-02',114,51,0,59,44,54,10,18,0,30], 17 | ['2007-03',93,45,4,54,38,64,4,46,0,34], 18 | ['2007-04',68,54,1,46,25,72,10,24,0,27], 19 | ['2007-05',98,45,17,61,61,41,16,62,0,39], 20 | ['2007-06',115,43,3,58,36,39,33,50,0,38], 21 | ['2007-07',126,53,28,89,69,28,59,45,0,34], 22 | ['2007-08',182,37,21,102,50,63,48,46,0,28], 23 | ['2007-09',167,52,31,132,70,47,98,53,0,58], 24 | ['2007-10',104,62,41,97,47,44,100,41,0,73], 25 | ['2007-11',110,46,22,142,51,40,90,41,0,23], 26 | ['2007-12',151,71,21,123,36,33,67,35,0,17], 27 | ['2008-01',126,49,17,64,32,42,24,39,0,15], 28 | ['2008-02',94,39,51,89,30,34,19,14,0,23], 29 | ['2008-03',89,45,14,43,42,39,29,19,0,27], 30 | ['2008-04',140,58,22,99,50,27,57,40,0,26], 31 | ['2008-05',130,60,44,104,36,17,93,26,0,19], 32 | ['2008-06',96,28,13,36,33,21,31,25,0,5], 33 | ['2008-07',115,32,24,75,55,22,59,30,2,7], 34 | ['2008-08',165,42,31,80,65,23,41,39,5,13], 35 | ['2008-09',119,54,31,35,35,44,28,25,0,10], 36 | ['2008-10',85,40,33,60,31,27,33,15,4,3], 37 | ['2008-11',43,23,19,26,19,12,17,11,1,5], 38 | ['2008-12',67,30,18,17,22,12,18,6,3,4], 39 | ['2009-01',46,16,18,19,27,3,1,9,0,4], 40 | ['2009-02',23,43,38,26,17,15,5,9,1,5], 41 | ['2009-03',94,76,56,5,27,18,19,6,7,9], 42 | ['2009-04',74,101,43,2,28,18,42,5,8,10], 43 | ['2009-05',49,122,61,6,29,16,25,13,10,12], 44 | ['2009-06',43,64,41,4,29,11,8,27,8,4], 45 | ['2009-07',67,99,50,12,32,20,21,27,12,4], 46 | ['2009-08',42,59,17,13,35,25,12,8,13,3], 47 | ['2009-09',71,42,23,8,33,22,11,6,9,19], 48 | ['2009-10',77,69,85,4,50,43,29,3,9,8], 49 | ['2009-11',55,40,46,10,26,26,14,11,6,9], 50 | ['2009-12',43,34,26,2,21,16,11,2,3,4], 51 | ['2010-01',57,29,39,3,26,19,10,3,17,10], 52 | ['2010-02',54,36,42,12,22,21,16,2,16,1], 53 | ['2010-03',72,89,53,12,38,22,18,8,14,13], 54 | ['2010-04',41,38,30,2,18,9,15,9,17,24], 55 | ['2010-05',50,32,47,3,34,10,8,2,8,9], 56 | ['2010-06',28,56,47,10,18,12,7,2,19,14], 57 | ['2010-07',53,57,54,10,42,9,4,3,11,10], 58 | ['2010-08',57,47,36,13,41,18,11,1,15,8], 59 | ['2010-09',58,44,34,4,22,21,3,3,18,4], 60 | ['2010-10',42,41,18,2,12,4,4,4,14,11], 61 | ['2010-11',41,34,23,5,13,10,4,0,7,1], 62 | ['2010-12',26,32,13,2,11,8,7,1,7,3], 63 | ['2011-01',35,47,46,5,20,7,2,2,22,6], 64 | ['2011-02',30,58,51,2,15,9,17,1,18,6], 65 | ['2011-03',60,86,54,10,12,17,15,2,52,11], 66 | ['2011-04',38,45,25,3,6,21,6,0,19,3], 67 | ['2011-05',18,39,15,9,13,14,8,1,19,3], 68 | ['2011-06',30,89,22,4,22,10,13,0,7,4], 69 | ['2011-07',45,69,73,5,18,16,6,1,39,5], 70 | ['2011-08',42,45,37,9,13,13,4,4,48,10], 71 | 
['2011-09',40,80,28,5,11,16,12,6,33,18], 72 | ['2011-10',23,59,26,7,4,11,12,0,34,6], 73 | ['2011-11',30,86,42,5,16,9,3,1,23,1], 74 | ['2011-12',26,30,31,2,7,5,5,0,21,13], 75 | ['2012-01',32,54,28,1,20,8,8,0,32,10], 76 | ['2012-02',37,85,62,5,24,10,18,4,36,22], 77 | ['2012-03',42,84,47,2,19,4,19,1,56,9], 78 | ['2012-04',24,50,52,0,11,4,15,1,45,10], 79 | ['2012-05',24,92,66,1,9,12,18,1,36,12], 80 | ['2012-06',53,90,51,2,12,14,54,1,47,1], 81 | ['2012-07',13,59,47,7,7,5,8,2,40,5], 82 | ['2012-08',17,61,51,1,7,10,18,1,41,1], 83 | ['2012-09',21,44,35,1,14,11,16,0,46,9], 84 | ['2012-10',21,51,36,3,19,6,10,0,63,9], 85 | ['2012-11',29,63,62,2,15,20,11,0,71,5], 86 | ['2012-12',12,29,35,0,5,8,9,0,31,9], 87 | ['2013-01',39,44,40,3,5,3,3,0,50,6], 88 | ['2013-02',59,97,66,0,5,25,13,0,54,8], 89 | ['2013-03',60,122,66,1,12,33,30,0,72,11], 90 | ['2013-04',9,34,18,0,4,4,8,0,17,2] 91 | ]; 92 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import 
datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 
153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # 
print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM Messages ORDER BY sent_at''') 192 | 193 | 
senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.js: -------------------------------------------------------------------------------- 1 | gword = [{text: 'sakai', size: 100}, 2 | {text: 'building', size: 71}, 3 | {text: 'tool', size: 26}, 4 | {text: 'with', size: 26}, 5 | {text: 'site', size: 24}, 6 | {text: 'problem', size: 23}, 7 | {text: 'error', size: 22}, 8 | {text: 'from', size: 22}, 9 | {text: 'question', size: 22}, 10 | {text: 'samigo', size: 22}, 11 | {text: 'build', size: 21}, 12 | {text: 'release', size: 21}, 13 | {text: 'trunk', size: 21}, 14 | {text: 'using', size: 21}, 15 | {text: 'resources', size: 21}, 16 | {text: 'issue', size: 21}, 17 | {text: 'user', size: 21}, 18 | {text: 'help', size: 21}, 19 | {text: 'tools', size: 21}, 20 | {text: 'melete', size: 21}, 21 | {text: 'problems', size: 21}, 22 | {text: 'gradebook', size: 21}, 23 | {text: 'maven', size: 21}, 24 | {text: 'mysql', size: 21}, 25 | {text: 'course', size: 21}, 26 | {text: 'tomcat', size: 20}, 27 | {text: 'about', size: 20}, 28 | {text: 'deploying', size: 20}, 29 | {text: 'content', size: 20}, 30 | {text: 'sites', size: 20}, 31 | {text: 'integration', size: 20}, 32 | {text: 'users', size: 20}, 33 | {text: 'email', size: 20}, 34 | {text: 'jira', size: 20}, 35 | {text: 'issues', size: 20}, 36 | {text: 'portal', size: 20}, 37 | {text: 'upgrade', size: 20}, 38 | {text: 'broken', size: 20}, 39 | {text: 'update', size: 20}, 40 | {text: 'change', size: 20}, 41 | {text: 'file', size: 20}, 42 | {text: 'assignment', size: 20}, 43 | {text: 'when', size: 20}, 44 | {text: 'search', size: 20}, 45 | {text: 'code', size: 20}, 46 | {text: 'service', size: 20}, 47 | {text: 'management', size: 20}, 48 | {text: 'webdav', size: 20}, 49 | {text: 'test', size: 20}, 50 | {text: 'errors', size: 20}, 51 | {text: 'oracle', size: 20}, 52 | {text: 'assignments', size: 20}, 53 | {text: 'files', size: 20}, 54 | {text: 'profile', size: 20}, 55 | {text: 'production', size: 20}, 56 | {text: 'page', size: 20}, 57 | {text: 'version', size: 20}, 58 | {text: 'database', size: 20}, 59 | {text: 'hibernate', size: 20}, 60 | {text: 'java', size: 20}, 61 | {text: 'chat', size: 20}, 62 | {text: 'changes', size: 20}, 63 | {text: 'ldap', size: 20}, 64 | {text: 'project', size: 20}, 65 | {text: 'questions', size: 20}, 66 | {text: 'login', size: 20}, 67 | {text: 'testing', size: 20}, 68 | {text: 'info', size: 20}, 69 | {text: 'startup', size: 20}, 70 | {text: 'data', size: 20}, 71 | {text: 'conversion', size: 20}, 72 | {text: 'jforum', size: 20}, 73 | {text: 'performance', size: 20}, 74 | {text: 'kernel', size: 20}, 75 | {text: 'adding', size: 20}, 76 | {text: 'support', size: 20}, 77 | {text: 'import', size: 20}, 78 | {text: 'call', size: 20}, 79 | {text: 'nightly', size: 20}, 80 | {text: 'running', size: 20}, 81 | {text: 'access', size: 20}, 82 | {text: 'branch', size: 20}, 83 | {text: 'into', size: 20}, 84 | {text: 'multiple', size: 20}, 85 | {text: 'message', size: 20}, 86 | {text: 'default', size: 20}, 87 | {text: 'status', size: 
20}, 88 | {text: 'source', size: 20}, 89 | {text: 'create', size: 20}, 90 | {text: 'wiki', size: 20}, 91 | {text: 'scorm', size: 20}, 92 | {text: 'setup', size: 20}, 93 | {text: 'what', size: 20}, 94 | {text: 'more', size: 20}, 95 | {text: 'does', size: 20}, 96 | {text: 'configuration', size: 20}, 97 | {text: 'down', size: 20}, 98 | {text: 'list', size: 20}, 99 | {text: 'getting', size: 20}, 100 | {text: 'server', size: 20} 101 | ]; 102 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the vizualization") 57 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, 
key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Oranizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/index.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/index.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/mapping.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank Dr.Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank Dr.Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 
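The core of each crawl step is simply: fetch a page, then collect every href on it so the links between pages can be recorded in the database. Here is a minimal, self-contained sketch of that step using urllib and the BeautifulSoup copy bundled in the bs4 folder; it is a simplified illustration of the idea, not the actual spider.py code, and the URL is just an example:

    import urllib.request
    from bs4 import BeautifulSoup

    # Fetch one page and parse it
    url = 'http://www.dr-chuck.com/'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Collect the outgoing links - spider.py records these in spider.sqlite
    links = [tag.get('href') for tag in soup('a') if tag.get('href') is not None]
    print(len(links), 'links found on', url)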
58 | 59 | If you want to dump the contents of the spider.sqlite file, you can 60 | run spdump.py as follows: 61 | 62 | Mac: python3 spdump.py 63 | Win: spdump.py 64 | 65 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 66 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 67 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 68 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 69 | 4 rows. 70 | 71 | This shows the number of incoming links, the old page rank, the new page 72 | rank, the id of the page, and the url of the page. The spdump.py program 73 | only shows pages that have at least one incoming link to them. 74 | 75 | Once you have a few pages in the database, you can run Page Rank on the 76 | pages using the sprank.py program. You simply tell it how many Page 77 | Rank iterations to run. 78 | 79 | Mac: python3 sprank.py 80 | Win: sprank.py 81 | 82 | How many iterations:2 83 | 1 0.546848992536 84 | 2 0.226714939664 85 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 86 | 87 | You can dump the database again to see that page rank has been updated: 88 | 89 | Mac: python3 spdump.py 90 | Win: spdump.py 91 | 92 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 93 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 94 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 95 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 96 | 4 rows. 97 | 98 | You can run sprank.py as many times as you like and it will simply refine 99 | the page rank the more times you run it. You can even run sprank.py a few times 100 | and then go spider a few more pages with spider.py and then run sprank.py 101 | to converge the page ranks. 102 | 103 | If you want to restart the Page Rank calculations without re-spidering the 104 | web pages, you can use spreset.py. 105 | 106 | Mac: python3 spreset.py 107 | Win: spreset.py 108 | 109 | All pages set to a rank of 1.0 110 | 111 | Mac: python3 sprank.py 112 | Win: sprank.py 113 | 114 | How many iterations:50 115 | 1 0.546848992536 116 | 2 0.226714939664 117 | 3 0.0659516187242 118 | 4 0.0244199333 119 | 5 0.0102096489546 120 | 6 0.00610244329379 121 | ... 122 | 42 0.000109076928206 123 | 43 9.91987599002e-05 124 | 44 9.02151706798e-05 125 | 45 8.20451504471e-05 126 | 46 7.46150183837e-05 127 | 47 6.7857770908e-05 128 | 48 6.17124694224e-05 129 | 49 5.61236959327e-05 130 | 50 5.10410499467e-05 131 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 132 | 133 | For each iteration of the page rank algorithm, it prints the average 134 | change per page of the page rank. The network initially is quite 135 | unbalanced and so the individual page ranks are changing wildly. 136 | But in a few short iterations, the page rank converges. You 137 | should run sprank.py long enough that the page ranks converge. 138 | 139 | If you want to visualize the current top pages in terms of page rank, 140 | run spjson.py to write the pages out in JSON format to be viewed in a 141 | web browser. 142 | 143 | Mac: python3 spjson.py 144 | Win: spjson.py 145 | 146 | Creating JSON output on spider.js... 147 | How many nodes? 30 148 | Open force.html in a browser to view the visualization 149 | 150 | You can view this data by opening the file force.html in your web browser. 151 | This shows an automatic layout of the nodes and links.
You can click and 152 | drag any node and you can also double click on a node to find the URL 153 | that is represented by the node. 154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js. 161 | 162 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 
95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a

<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p />

". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in list(attrs.keys()): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, str): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in tags that say what encoding the 268 | # document was originally in. This means HTML 5-style 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last result. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from html.parser import HTMLParser 8 | 9 | try: 10 | from html.parser import HTMLParseError 11 | except ImportError as e: 12 | # HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 
73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = chr(real_name) 83 | except (ValueError, OverflowError) as e: 84 | data = "\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. "" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 147 | """ 148 | if isinstance(markup, str): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError as e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_htmlparser.py.bak: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from HTMLParser import HTMLParser 8 | 9 | try: 10 | from HTMLParser import HTMLParseError 11 | except ImportError, e: 12 | # 
HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = unichr(real_name) 83 | except (ValueError, OverflowError), e: 84 | data = u"\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. 
"" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 147 | """ 148 | if isinstance(markup, unicode): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError, e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | 
ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, str): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, str): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 
105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, str): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in list(nsmap.items()): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in list(attrs.items()): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return '\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return '%s' % fragment 249 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_lxml.py.bak: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from StringIO import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 
75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, unicode): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, unicode): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, unicode): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in nsmap.items(): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in attrs.items(): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return u'\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return u'%s' % fragment 249 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from io import StringIO 4 | from html.parser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print("Diagnostic running on Beautiful Soup %s" % __version__) 21 | print("Python version %s" % sys.version) 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print(( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name)) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 39 | except ImportError as e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print("Found html5lib version %s" % html5lib.__version__) 48 | except ImportError as e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print('"%s" looks like a filename. Reading data from the file.' % data) 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) 59 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") 60 | return 61 | print() 62 | 63 | for parser in basic_parsers: 64 | print("Trying to parse your markup with %s" % parser) 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception as e: 70 | print("%s could not parse the markup." 
% parser) 71 | traceback.print_exc() 72 | if success: 73 | print("Here's what %s did with the markup:" % parser) 74 | print(soup.prettify()) 75 | 76 | print("-" * 80) 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 169 | data = rdoc(num_elements) 170 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception as e: 180 | print("%s could not parse the markup." % parser) 181 | traceback.print_exc() 182 | if success: 183 | print("BS4+%s parsed the markup in %.2fs." 
% (parser, b-a)) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/diagnose.py.bak: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from StringIO import StringIO 4 | from HTMLParser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 39 | except ImportError, e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print "Found html5lib version %s" % html5lib.__version__ 48 | except ImportError, e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print '"%s" looks like a filename. Reading data from the file.' % data 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 59 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 60 | return 61 | print 62 | 63 | for parser in basic_parsers: 64 | print "Trying to parse your markup with %s" % parser 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception, e: 70 | print "%s could not parse the markup." 
% parser 71 | traceback.print_exc() 72 | if success: 73 | print "Here's what %s did with the markup:" % parser 74 | print soup.prettify() 75 | 76 | print "-" * 80 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print("%s, %4s, %s" % (event, element.tag, element.text)) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 169 | data = rdoc(num_elements) 170 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception, e: 180 | print "%s could not parse the markup." % parser 181 | traceback.print_exc() 182 | if success: 183 | print "BS4+%s parsed the markup in %.2fs." 
% (parser, b-a) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print "Raw lxml parsed the markup in %.2fs." % (b-a) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print "Raw html5lib parsed the markup in %.2fs." % (b-a) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not 
explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 
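# Aside (not part of the original test file): a minimal sketch of the parser
# lookups that test_builder_registry.py above exercises. "html.parser" always
# ships with the standard library; lookups that need lxml or html5lib only
# succeed when those optional packages are installed. The names below are
# illustrative only.
from bs4 import BeautifulSoup as _DemoSoup
_by_name = _DemoSoup("<p>demo</p>", features="html.parser")  # lookup by builder name
_by_type = _DemoSoup("<p>demo</p>", features="html")         # lookup by markup type
assert _by_name.p.text == "demo" and _by_type.p.text == "demo"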
2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual("

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from pdb import set_trace 5 | import pickle 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 7 | from bs4.builder import HTMLParserTreeBuilder 8 | 9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 10 | 11 | @property 12 | def default_builder(self): 13 | return HTMLParserTreeBuilder() 14 | 15 | def test_namespaced_system_doctype(self): 16 | # html.parser can't handle namespaced doctypes, so skip this one. 17 | pass 18 | 19 | def test_namespaced_public_doctype(self): 20 | # html.parser can't handle namespaced doctypes, so skip this one. 21 | pass 22 | 23 | def test_builder_is_pickled(self): 24 | """Unlike most tree builders, HTMLParserTreeBuilder and will 25 | be restored after pickling. 26 | """ 27 | tree = self.soup("foo") 28 | dumped = pickle.dumps(tree, 2) 29 | loaded = pickle.loads(dumped) 30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError as e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>") 44 | self.assertSoupEquals( 45 | "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>") 46 | self.assertSoupEquals( 47 | "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual("", str(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>") 44 | self.assertSoupEquals( 45 | "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>") 46 | self.assertSoupEquals( 47 | "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /Capstone/Pagerank/pagerank orginal.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/pagerank orginal.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print(row) 14 | count = count + 1 15 | print(count, 'rows.') 16 | cur.close() 17 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py Dr. Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py Dr. 
Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":126,"rank":19.0, "id":6, "url":"https://www.washingtonpost.com"}, 3 | {"weight":7,"rank":0.1367546748927559, "id":12, "url":"https://www.washingtonpost.com/news/arts-and-entertainment/wp/2017/10/26/megyn-kelly-responds-to-those-bill-oreilly-thank-you-notes-its-right-out-of-the-playbook"}, 4 | {"weight":7,"rank":0.1367546748927559, "id":14, "url":"https://www.washingtonpost.com/news/politics/wp/2017/09/25/a-trump-judicial-pick-said-transgender-children-are-proof-that-satans-plan-is-working/?utm_term=.8e0dac432ca8"}, 5 | {"weight":12,"rank":0.1367546748927559, "id":17, "url":"https://www.washingtonpost.com/news/animalia/wp/2017/10/26/this-bears-tongue-was-monstrously-enlarged-a-team-of-vets-gave-him-a-second-chance-at-life"}, 6 | {"weight":7,"rank":0.1367546748927559, "id":18, "url":"https://www.washingtonpost.com/news/speaking-of-science/wp/2017/10/25/this-bug-is-real-and-now-so-are-your-nightmares"}, 7 | {"weight":4,"rank":0.0, "id":20, "url":"https://www.washingtonpost.com/news/early-lead/wp/2017/10/26/joe-girardi-out-as-yankees-manager-and-the-nationals-just-happen-to-have-an-opening"}, 8 | {"weight":9,"rank":0.11777961277413226, "id":22, "url":"https://www.washingtonpost.com/people/mike-debonis"}, 9 | {"weight":4,"rank":0.0, "id":25, "url":"https://www.washingtonpost.com/news/post-politics/wp/2017/10/26/trump-plans-to-declare-the-opioid-crisis-a-public-health-emergency"}, 10 | {"weight":5,"rank":0.0, "id":32, "url":"https://www.washingtonpost.com/people/samantha-schmidt"}, 11 | {"weight":5,"rank":0.0, "id":38, "url":"https://www.washingtonpost.com/people/erik-wemple"}, 12 | {"weight":5,"rank":0.0, "id":41, "url":"https://www.washingtonpost.com/news/the-watch/wp/2017/10/26/mississippi-judge-resigns-after-barring-mother-from-seeing-newborn-because-of-unpaid-court-fees"}, 13 | {"weight":7,"rank":0.2567572862009351, "id":42, "url":"https://www.washingtonpost.com/people/radley-balko"}, 14 | {"weight":8,"rank":0.0, "id":43, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/george-w-bush-is-not-the-resistance-hes-part-of-what-brought-us-trump"}, 15 | {"weight":5,"rank":0.0, "id":45, "url":"https://www.washingtonpost.com/people/damian-paletta"}, 16 | {"weight":7,"rank":0.10600165188954895, "id":47, "url":"https://www.washingtonpost.com/powerpost/booker-graham-and-elias-expected-to-testify-today-in-menendez-corruption-trial/2017/10/26/34d9219c-ba4a-11e7-9e58-e6288544af98_story.html"}, 17 | {"weight":5,"rank":0.0, "id":51, "url":"https://www.washingtonpost.com/people/anna-fifield"}, 18 | {"weight":5,"rank":0.0, "id":53, "url":"https://www.washingtonpost.com/people/william-booth"}, 19 | {"weight":7,"rank":0.018975062118623574, "id":54, "url":"https://www.washingtonpost.com/local/virginia-politics/trump-again-tweets-on-virginias-governors-race-says-northam-will-be-very-weak-on-crime/2017/10/26/4c5d5ea6-ba58-11e7-be94-fabb0f1e9ffb_story.html"}, 20 | {"weight":5,"rank":0.0, "id":57, 
"url":"https://www.washingtonpost.com/people/perry-stein"}, 21 | {"weight":5,"rank":0.0, "id":59, "url":"https://www.washingtonpost.com/news/wonk/wp/2017/10/26/why-mcdonalds-is-beating-out-the-fresh-healthy-competition"}, 22 | {"weight":9,"rank":0.0, "id":64, "url":"https://www.washingtonpost.com/local/obituaries/notable-deaths-so-far-this-year/2017/01/17/750ed23a-dcf5-11e6-acdf-14da832ae861_gallery.html"}, 23 | {"weight":16,"rank":1.3854428993176182, "id":66, "url":"https://www.washingtonpost.com/powerpost/gops-insurgents-step-up-campaign-against-mcconnell/2017/10/25/ec3a5af4-b9a0-11e7-9e58-e6288544af98_story.html?tid=pm_pop"}, 24 | {"weight":10,"rank":0.006418872637825229, "id":74, "url":"https://www.washingtonpost.com/crossword-puzzles"}, 25 | {"weight":4,"rank":0.0, "id":80, "url":"https://www.washingtonpost.com/news/the-fix/wp/2017/10/26/trump-flubs-another-promise-declaring-the-opioid-crisis-a-national-emergency"}, 26 | {"weight":4,"rank":0.0, "id":83, "url":"https://www.washingtonpost.com/news/worldviews/wp/2017/10/26/whats-a-scary-superstition-in-your-part-of-the-world-share-it-with-us"}, 27 | {"weight":4,"rank":0.0, "id":90, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/will-republicans-regret-enabling-a-demagogue-my-iranian-parents-did"}], 28 | "links":[ 29 | {"source":0,"target":0,"value":3}, 30 | {"source":0,"target":1,"value":3}, 31 | {"source":0,"target":2,"value":3}, 32 | {"source":0,"target":3,"value":3}, 33 | {"source":0,"target":4,"value":3}, 34 | {"source":0,"target":5,"value":3}, 35 | {"source":0,"target":6,"value":3}, 36 | {"source":0,"target":7,"value":3}, 37 | {"source":0,"target":8,"value":3}, 38 | {"source":0,"target":9,"value":3}, 39 | {"source":0,"target":10,"value":3}, 40 | {"source":0,"target":11,"value":3}, 41 | {"source":0,"target":12,"value":3}, 42 | {"source":0,"target":13,"value":3}, 43 | {"source":0,"target":14,"value":3}, 44 | {"source":0,"target":15,"value":3}, 45 | {"source":0,"target":16,"value":3}, 46 | {"source":0,"target":17,"value":3}, 47 | {"source":0,"target":18,"value":3}, 48 | {"source":0,"target":19,"value":3}, 49 | {"source":0,"target":20,"value":3}, 50 | {"source":0,"target":21,"value":3}, 51 | {"source":0,"target":22,"value":3}, 52 | {"source":0,"target":23,"value":3}, 53 | {"source":0,"target":24,"value":3}, 54 | {"source":0,"target":25,"value":3}, 55 | {"source":7,"target":0,"value":3}, 56 | {"source":7,"target":7,"value":3}, 57 | {"source":9,"target":9,"value":3}, 58 | {"source":9,"target":0,"value":3}, 59 | {"source":9,"target":21,"value":3}, 60 | {"source":17,"target":0,"value":3}, 61 | {"source":17,"target":17,"value":3}, 62 | {"source":23,"target":0,"value":3}, 63 | {"source":23,"target":23,"value":3}, 64 | {"source":11,"target":11,"value":3}, 65 | {"source":11,"target":0,"value":3}, 66 | {"source":11,"target":21,"value":3}, 67 | {"source":24,"target":0,"value":3}, 68 | {"source":24,"target":24,"value":3}, 69 | {"source":19,"target":0,"value":3}, 70 | {"source":19,"target":19,"value":3}, 71 | {"source":22,"target":22,"value":3}, 72 | {"source":22,"target":0,"value":3}, 73 | {"source":8,"target":8,"value":3}, 74 | {"source":8,"target":0,"value":3}, 75 | {"source":8,"target":21,"value":3}, 76 | {"source":14,"target":0,"value":3}, 77 | {"source":14,"target":14,"value":3}, 78 | {"source":15,"target":15,"value":3}, 79 | {"source":15,"target":0,"value":3}, 80 | {"source":15,"target":21,"value":3}, 81 | {"source":4,"target":0,"value":3}, 82 | {"source":4,"target":4,"value":3}, 83 | 
{"source":13,"target":13,"value":3}, 84 | {"source":13,"target":0,"value":3}, 85 | {"source":13,"target":21,"value":3}, 86 | {"source":21,"target":0,"value":3}, 87 | {"source":21,"target":21,"value":3}, 88 | {"source":5,"target":0,"value":3}, 89 | {"source":5,"target":5,"value":3}, 90 | {"source":16,"target":16,"value":3}, 91 | {"source":16,"target":0,"value":3}, 92 | {"source":16,"target":21,"value":3}, 93 | {"source":18,"target":18,"value":3}, 94 | {"source":18,"target":0,"value":3}, 95 | {"source":18,"target":21,"value":3}, 96 | {"source":25,"target":0,"value":3}, 97 | {"source":25,"target":25,"value":3}, 98 | {"source":3,"target":0,"value":3}, 99 | {"source":3,"target":3,"value":3}, 100 | {"source":3,"target":21,"value":3}, 101 | {"source":12,"target":0,"value":3}, 102 | {"source":12,"target":12,"value":3}, 103 | {"source":20,"target":0,"value":3}, 104 | {"source":20,"target":20,"value":3}, 105 | {"source":1,"target":0,"value":3}, 106 | {"source":1,"target":1,"value":3}, 107 | {"source":6,"target":6,"value":3}, 108 | {"source":6,"target":0,"value":3}, 109 | {"source":6,"target":21,"value":3}, 110 | {"source":10,"target":0,"value":3}, 111 | {"source":10,"target":10,"value":3}, 112 | {"source":10,"target":11,"value":3}, 113 | {"source":2,"target":0,"value":3}, 114 | {"source":2,"target":2,"value":3}]}; -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? 
)', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? 
)', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spider.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? ")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Capstone/Pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | 
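# Aside (not part of the original script): a tiny worked example, with
# hypothetical names, of the rank-distribution step performed below. Each node
# splits its current rank evenly across its outbound links; the later
# "evaporation" step redistributes any rank that nodes without qualifying
# outbound links failed to pass on.
_demo_links = [(1, 2), (1, 3), (2, 1), (3, 1)]
_demo_prev = {1: 1.0, 2: 1.0, 3: 1.0}
_demo_next = {n: 0.0 for n in _demo_prev}
for _node, _old in _demo_prev.items():
    _gives = [_t for (_f, _t) in _demo_links if _f == _node]
    for _t in _gives:
        _demo_next[_t] += _old / len(_gives)
# Node 1 sends 0.5 to each of nodes 2 and 3; nodes 2 and 3 each send 1.0 to
# node 1, so _demo_next == {1: 2.0, 2: 0.5, 3: 0.5} and the total stays 3.0.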
# Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. Check data.") 40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=? 
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 1/Hello World.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 08:59:26 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | print("Hello World!") -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:00:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter your name") 9 | print("Hello %s" % name) -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:01:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | hrs = input("Enter Hours:") 9 | rate = input("Enter Rate:") 10 | cost = float(hrs)*float(rate) 11 | 12 | print("Pay: %s" % cost) 13 | 14 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:03:50 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | try: 9 | h = input("please input your hour:") 10 | hour = float(h) 11 | r = input("please input your rate:") 12 | rate = float(r) 13 | if hour < 0: 14 | print("Please,input your positive numberic") 15 | elif rate < 0: 16 | print("Please,input your positive numberic") 17 | elif hour > 40: 18 | print("%.2f" % (40*rate+(hour-40)*1.5*rate)) 19 | else: 20 | print("%.2f" % (hour*rate)) 21 | except: 22 | print("Please,input your numberic") 23 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:04:59 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | score = float(input("Enter a score between 0.0 and 1.0: ")) 9 | if score<0.0 or score>1.0: 10 | print("Error: Score out of range") 11 | if score<0.6: 12 | print("F") 13 | elif score>=0.6 and score<0.7: 14 | print("D") 15 | elif score>=0.7 and score<0.8: 16 | print("C") 17 | elif score>=0.8 and score<0.9: 18 | print("B") 19 | elif score>=0.9: 20 | print("A") 21 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 4/Assignment 4.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:07:25 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | def 
computepay(hrs,rps): 9 | final_pay = 0.0 10 | hours = float(hrs) 11 | rate_per_hour = float(rps) 12 | if hours>40: 13 | final_pay += 40 * float(rate_per_hour) 14 | hours -= 40 15 | final_pay += hours * rate_per_hour * 1.5 16 | else: 17 | final_pay += hours * rate_per_hour 18 | 19 | return final_pay 20 | 21 | 22 | hrs = input("Enter Hours: ") 23 | rate = input("Enter Rate: ") 24 | p = computepay(hrs,rate) 25 | print(p) 26 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 5/Assignment 5.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:09:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | largest = None 9 | smallest = None 10 | while True: 11 | try: 12 | num = input("Enter a number: ") 13 | if num == "done" : break 14 | n = int(num) 15 | if largest is None: 16 | largest = n 17 | if smallest is None: 18 | smallest = n 19 | if n > largest: 20 | largest = n 21 | if n < smallest: 22 | smallest = n 23 | except: 24 | print('Invalid input') 25 | 26 | print("Maximum is", largest) 27 | print("Minimum is", smallest) 28 | -------------------------------------------------------------------------------- /Python Data Structures/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Chapter 10/Assignment 10.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:29:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter file:") 9 | if len(name) < 1 : name = "mbox-short.txt" 10 | handle = open(name) 11 | a = handle.read() 12 | b = a.split("\n") 13 | d = [] 14 | for i in b: 15 | if i.startswith("From "): 16 | c = i.split(":") 17 | d.append(c[0][-2:]) 18 | d.sort() 19 | counts = {} 20 | for j in d: 21 | counts[j] = d.count(j) 22 | for k, l in counts.items(): 23 | print(k, l) 24 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 6/Assignment 6.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:11:42 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | text = "X-DSPAM-Confidence: 0.8475"; 9 | a = text[-6:] 10 | b = float(a) 11 | print(b) 12 | text.find(":") 13 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:12:55 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use words.txt as the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = fh.read() 12 | print(a.upper().rstrip()) -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:17:11 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use the file name mbox-short.txt as 
the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = [] 12 | for line in fh: 13 | if line.startswith("X-DSPAM-Confidence:"): 14 | a.append(float(line[-6:])) 15 | total = 0 16 | for i in a: 17 | total = total + i 18 | mean = total/(len(a)) 19 | print("Average spam confidence:", mean) 20 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vasts amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were reptitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | 26 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 
6 | """ 7 | 8 | get = input('Please enter file name:') 9 | handle = open(get) 10 | text = list() 11 | for line in handle: 12 | line = line.rstrip() 13 | line = line.split() 14 | for i in line: 15 | if i in text: 16 | continue 17 | else: 18 | text.append(i) 19 | text.sort() 20 | print(text) 21 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:25:10 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name:') 9 | handle = open(file) 10 | count = 0 11 | for line in handle: 12 | line = line.rstrip() 13 | if not line.startswith('From '): 14 | continue 15 | line = line.split() 16 | print(line[1]) 17 | count = count+1 18 | print('There were', count, 'lines in the file with From as the first word') 19 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | 6 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 9/Assignment 9.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:27:17 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name: ') 9 | handle = open(file) 10 | data1 = list() 11 | data2 = dict() 12 | for line in handle: 13 | line = line.rstrip() 14 | if not line.startswith('From '): 15 | continue 16 | line = line.split() 17 | line = line[1] 18 | data1.append(line) 19 | for i in data1: 20 | data2[i] = data2.get(i,0)+1 21 | 22 | word = None 23 | max = None 24 | 25 | for aa, bb in data2.items(): 26 | if max is None or bb > max: 27 | word = aa 28 | max = bb 29 | 30 | print(word, max) -------------------------------------------------------------------------------- /Python Data Structures/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Directory Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Everybody-Coursera 2 | Coursera courses for the Python for Everybody Specialization by the University of Michigan. This specialization teaches the fundamentals on how to get started on learning to use Python. I for myself started out in a non-technical background and found a way to learn the material. 
3 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 2/First Database.db -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db.sqbpro: -------------------------------------------------------------------------------- 1 | CREATE TABLE Ages ( 2 | name VARCHAR(128), 3 | age INTEGER 4 | ) 5 | DELETE FROM Ages; 6 | INSERT INTO Ages (name, age) VALUES ('Davie', 20); 7 | INSERT INTO Ages (name, age) VALUES ('Daanyaal', 20); 8 | INSERT INTO Ages (name, age) VALUES ('Ireayomide', 19); 9 | INSERT INTO Ages (name, age) VALUES ('Jagat', 34); 10 | SELECT hex(name || age) AS X FROM Ages ORDER BY X 11 | 12 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE Ages ( 2 | name VARCHAR(128), 3 | age INTEGER 4 | ) 5 | DELETE FROM Ages; 6 | INSERT INTO Ages (name, age) VALUES ('Davie', 20); 7 | INSERT INTO Ages (name, age) VALUES ('Daanyaal', 20); 8 | INSERT INTO Ages (name, age) VALUES ('Ireayomide', 19); 9 | INSERT INTO Ages (name, age) VALUES ('Jagat', 34); 10 | 11 | SELECT hex(name || age) AS X FROM Ages ORDER BY X 12 | 13 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/emaildb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:12:07 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import sqlite3 9 | 10 | conn = sqlite3.connect('emaildb.sqlite') 11 | cur = conn.cursor() 12 | 13 | cur.execute(''' 14 | DROP TABLE IF EXISTS Counts''') 15 | 16 | cur.execute(''' 17 | CREATE TABLE Counts (org TEXT, count INTEGER)''') 18 | 19 | fname = input('Enter file name: ') 20 | if (len(fname) < 1): fname = 'mbox-short.txt' 21 | fh = open(fname) 22 | for line in fh: 23 | if not line.startswith('From: '): continue 24 | pieces = line.split() 25 | org = pieces[1].split('@')[1] 26 | cur.execute('SELECT count FROM Counts WHERE org = ? 
', (org,)) 27 | row = cur.fetchone() 28 | if row is None: 29 | cur.execute('''INSERT INTO Counts (org, count) 30 | VALUES (?, 1)''', (org,)) 31 | else: 32 | cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?', 33 | (org,)) 34 | conn.commit() 35 | 36 | # https://www.sqlite.org/lang_select.html 37 | sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10' 38 | 39 | for row in cur.execute(sqlstr): 40 | print(str(row[0]), row[1]) 41 | 42 | cur.close() 43 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/emaildb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 2/emaildb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/README.txt: -------------------------------------------------------------------------------- 1 | To export your own Library.xml from iTunes 2 | 3 | File -> Library -> Export Library 4 | 5 | Make sure it is in the correct folder. Of course iTUnes might change 6 | UI and/or export format any time - so good luck :) 7 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/trackdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 3/trackdb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/tracks.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('trackdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Make some fresh tables using executescript() 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS Artist; 10 | DROP TABLE IF EXISTS Album; 11 | DROP TABLE IF EXISTS Track; 12 | CREATE TABLE Artist ( 13 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 14 | name TEXT UNIQUE 15 | ); 16 | CREATE TABLE Genre ( 17 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 18 | name TEXT UNIQUE 19 | ); 20 | CREATE TABLE Album ( 21 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 22 | artist_id INTEGER, 23 | title TEXT UNIQUE 24 | ); 25 | CREATE TABLE Track ( 26 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 27 | title TEXT UNIQUE, 28 | album_id INTEGER, 29 | genre_id INTEGER, 30 | len INTEGER, rating INTEGER, count INTEGER 31 | ); 32 | ''') 33 | 34 | 35 | fname = input('Enter file name: ') 36 | if (len(fname) < 1 ) : fname = 'Library.xml' 37 | 38 | # Track ID369 39 | # NameAnother One Bites The Dust 40 | # ArtistQueen 41 | def lookup(d, key): 42 | found = False 43 | for child in d: 44 | if found : return child.text 45 | if child.tag == 'key' and child.text == key : 46 | found = True 47 | return None 48 | 49 | stuff = ET.parse(fname) 50 | all = stuff.findall('dict/dict/dict') 51 | print('Dict count:', len(all)) 52 | for entry in all: 53 | if ( lookup(entry, 'Track ID') is None ) : continue 54 | 55 | name = lookup(entry, 'Name') 56 | artist = lookup(entry, 'Artist') 57 | album = lookup(entry, 'Album') 58 | count = lookup(entry, 'Play Count') 59 | rating = lookup(entry, 'Rating') 60 | length = 
lookup(entry, 'Total Time') 61 | genre = lookup(entry, 'Genre') 62 | 63 | if name is None or artist is None or album is None or genre is None : 64 | continue 65 | 66 | print(name, artist, album, count, rating, length, genre) 67 | 68 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 69 | VALUES ( ? )''', ( artist, ) ) 70 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 71 | artist_id = cur.fetchone()[0] 72 | 73 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 74 | VALUES ( ? )''', ( genre, ) ) 75 | cur.execute('SELECT id FROM Genre WHERE name = ?', (genre, )) 76 | genre_id = cur.fetchone()[0] 77 | 78 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 79 | VALUES ( ?, ? )''', ( album, artist_id ) ) 80 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 81 | album_id = cur.fetchone()[0] 82 | 83 | cur.execute('''INSERT OR REPLACE INTO Track 84 | (title, album_id, len, rating, count, genre_id) 85 | VALUES ( ?, ?, ?, ?, ?, ?)''', 86 | ( name, album_id, length, rating, count, genre_id) ) 87 | 88 | conn.commit() 89 | 90 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/trackscomplete.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:52:41 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import xml.etree.ElementTree as ET 9 | #parses the xml into element form 10 | import sqlite3 11 | #creates if does not exist database for tracks/if it does just establishes connection to the db 12 | conn = sqlite3.connect('Trackdb.sqlite') 13 | #establishes connection to the database/ 14 | cur = conn.cursor() 15 | 16 | #clears all tables out so no conflicting or bad info to begin 17 | cur.executescript(''' 18 | DROP TABLE IF EXISTS Artist; 19 | DROP TABLE IF EXISTS Album; 20 | DROP TABLE IF EXISTS Genre; 21 | DROP TABLE IF EXISTS Track;''') 22 | 23 | # creates the table 'Artist' id key autoincrements and must be unique, the artists name is stored there and will be referenced by table downstream 24 | cur.execute(''' CREATE TABLE Artist ( 25 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 26 | name TEXT UNIQUE 27 | );''') 28 | 29 | cur.execute(''' CREATE TABLE Genre ( 30 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 31 | name TEXT UNIQUE 32 | );''') # each line starts with the name of the prospective column followed by the schema thats being set for it entries 33 | # 34 | cur.execute(''' CREATE TABLE Album ( 35 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 36 | artist_id INTEGER, 37 | title TEXT UNIQUE 38 | );''') 39 | cur.execute(''' CREATE TABLE Track ( 40 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 41 | title TEXT UNIQUE, 42 | album_id INTEGER, 43 | genre_id INTEGER, 44 | len INTEGER, 45 | rating INTEGER, 46 | count INTEGER 47 | );''') 48 | 49 | 50 | tree = ET.parse(r'C:\Users\\atse\Documents\Python for Everybody Specialization\Using Databases with Python\Week 3\Library.xml') 51 | 52 | root = tree.getroot() 53 | # . 
- selects current node (plist) most useful at beginning path 54 | #/dict/dict//dict - then goes dict branch then next child then // 55 | # //dict selects all child elements 'dict' within that wrung of tree (ladder) 56 | itlist = root.findall('./dict/dict//dict') 57 | 58 | def lookup(dic, key): 59 | found = False 60 | for child in dic: 61 | if found: return child.text 62 | if child.tag == 'key' and child.text == key: 63 | found = True 64 | return None 65 | 66 | print('Dict count', len(itlist)) 67 | for entry in itlist: 68 | if (lookup(entry,'Track ID') is None) : continue 69 | 70 | name = lookup(entry, 'Name') 71 | artist = lookup(entry, 'Artist') 72 | album = lookup(entry, 'Album') 73 | genre = lookup(entry, 'Genre') 74 | length = lookup(entry, 'Total Time') 75 | count = lookup(entry, 'Play Count') 76 | rating = lookup(entry, 'Rating') 77 | 78 | if name is None or artist is None or album is None or genre is None : continue 79 | 80 | 81 | print(name, artist, album, count, rating, length) 82 | #insert the new row into DB and table, then specify colum 83 | #to place information into, BUT b/c using variable 84 | #need to use ? placeholder, b/c value is in a variable 85 | #and not directly entered into the VALUES field 86 | cur.execute('''INSERT or IGNORE INTO Artist (name) 87 | VALUES (?)''', (artist,)) 88 | # grabs the row with the corresponding artist name, impt ONLY 1 though 89 | # selects this info, this way, because no human error, computer handles 90 | # getting the value, and need the value because going to use it in the 91 | # following entry 92 | cur.execute('SELECT id FROM Artist WHERE name = ?', (artist,)) 93 | artist_id = cur.fetchone()[0] 94 | 95 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 96 | VALUES (?)''', ( genre, ) ) 97 | 98 | cur.execute('SELECT id FROM Genre WHERE name = ?', ( genre, ) ) 99 | genre_id = cur.fetchone()[0] 100 | 101 | cur.execute('''INSERT or IGNORE INTO Album (title, artist_id) 102 | VALUES (?,?)''', (album, artist_id)) 103 | cur.execute('SELECT id FROM Album WHERE title = ?', (album,)) 104 | # good way to think here is that row is actually what the cursor is pointing at 105 | album_id = cur.fetchone()[0] 106 | 107 | cur.execute('''INSERT OR REPLACE INTO Track 108 | (title, album_id, genre_id, len, rating, count) 109 | VALUES (?,?,?,?,?,?)''', 110 | (name, album_id, genre_id, length, rating, count)) 111 | 112 | conn.commit() -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/tracksdb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:59:12 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import xml.etree.ElementTree as ET 9 | import sqlite3 10 | 11 | conn = sqlite3.connect('trackdb.sqlite') 12 | cur = conn.cursor() 13 | 14 | # Make some fresh tables using executescript() 15 | cur.executescript(''' 16 | DROP TABLE IF EXISTS Artist; 17 | DROP TABLE IF EXISTS Genre; 18 | DROP TABLE IF EXISTS Album; 19 | DROP TABLE IF EXISTS Track; 20 | CREATE TABLE Artist ( 21 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 22 | name TEXT UNIQUE 23 | ); 24 | CREATE TABLE Genre ( 25 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 26 | name TEXT UNIQUE 27 | ); 28 | CREATE TABLE Album ( 29 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 30 | artist_id INTEGER, 31 | title TEXT UNIQUE 32 | ); 33 | CREATE TABLE Track ( 34 | id INTEGER NOT NULL PRIMARY KEY 35 | AUTOINCREMENT UNIQUE, 36 | title 
TEXT UNIQUE, 37 | album_id INTEGER, 38 | genre_id INTEGER, 39 | len INTEGER, rating INTEGER, count INTEGER 40 | ); 41 | ''') 42 | 43 | 44 | fname = input('Enter file name: ') 45 | if ( len(fname) < 1 ) : fname = 'Library.xml' 46 | 47 | # Track ID369 48 | # NameAnother One Bites The Dust 49 | # ArtistQueen 50 | def lookup(d, key): 51 | found = False 52 | for child in d: 53 | if found : return child.text 54 | if child.tag == 'key' and child.text == key : 55 | found = True 56 | return None 57 | 58 | stuff = ET.parse(fname) 59 | all = stuff.findall('dict/dict/dict') 60 | print ('Dict count:', len(all)) 61 | for entry in all: 62 | if ( lookup(entry, 'Track ID') is None ) : continue 63 | 64 | name = lookup(entry, 'Name') 65 | artist = lookup(entry, 'Artist') 66 | album = lookup(entry, 'Album') 67 | genre = lookup(entry, 'Genre') 68 | count = lookup(entry, 'Play Count') 69 | rating = lookup(entry, 'Rating') 70 | length = lookup(entry, 'Total Time') 71 | 72 | if name is None or artist is None or genre is None or album is None : 73 | continue 74 | 75 | print(name, artist, album, genre, count, rating, length) 76 | 77 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 78 | VALUES ( ? )''', ( artist, ) ) 79 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 80 | artist_id = cur.fetchone()[0] 81 | 82 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 83 | VALUES ( ? )''', ( genre, ) ) 84 | cur.execute('SELECT id FROM Genre WHERE name = ? ', (genre, )) 85 | genre_id = cur.fetchone()[0] 86 | 87 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 88 | VALUES ( ?, ? )''', ( album, artist_id ) ) 89 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 90 | album_id = cur.fetchone()[0] 91 | 92 | cur.execute('''INSERT OR REPLACE INTO Track 93 | (title, album_id, genre_id, len, rating, count) 94 | VALUES ( ?, ?, ?, ?, ?, ? 
)''', 95 | ( name, album_id, genre_id, length, rating, count ) ) 96 | 97 | conn.commit() -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/HW Result.sql: -------------------------------------------------------------------------------- 1 | SELECT hex(User.name || Course.title || Member.role ) AS X FROM 2 | User JOIN Member JOIN Course 3 | ON User.id = Member.user_id AND Member.course_id = Course.id 4 | ORDER BY X 5 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/roster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 20:23:04 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import json 9 | import sqlite3 10 | 11 | conn = sqlite3.connect('rosterdb.sqlite') 12 | cur = conn.cursor() 13 | 14 | # Do some setup 15 | cur.executescript(''' 16 | DROP TABLE IF EXISTS User; 17 | DROP TABLE IF EXISTS Member; 18 | DROP TABLE IF EXISTS Course; 19 | CREATE TABLE User ( 20 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 21 | name TEXT UNIQUE 22 | ); 23 | CREATE TABLE Course ( 24 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 25 | title TEXT UNIQUE 26 | ); 27 | CREATE TABLE Member ( 28 | user_id INTEGER, 29 | course_id INTEGER, 30 | role INTEGER, 31 | PRIMARY KEY (user_id, course_id) 32 | ) 33 | ''') 34 | 35 | fname = input('Enter file name: ') 36 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 37 | 38 | # [ 39 | # [ "Charley", "si110", 1 ], 40 | # [ "Mea", "si110", 0 ], 41 | 42 | str_data = open(fname).read() 43 | json_data = json.loads(str_data) 44 | 45 | for entry in json_data: 46 | 47 | name = entry[0]; 48 | title = entry[1]; 49 | role = entry[2]; 50 | 51 | print(name, title, role) 52 | 53 | cur.execute('''INSERT OR IGNORE INTO User (name) 54 | VALUES ( ? )''', ( name, ) ) 55 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 56 | user_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR IGNORE INTO Course (title) 59 | VALUES ( ? )''', ( title, ) ) 60 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 61 | course_id = cur.fetchone()[0] 62 | 63 | cur.execute('''INSERT OR REPLACE INTO Member 64 | (user_id, course_id, role) VALUES ( ?, ?, ? 
)''', 65 | ( user_id, course_id, role ) ) 66 | 67 | conn.commit() 68 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/rosterdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 4/rosterdb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/Google API Key.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/Google API Key.doc -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Places API with a Database and 2 | Visualizing Data on Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | Note: Windows has difficulty in displaying UTF-8 characters 10 | in the console so for each command window you open, you may need 11 | to type the following command before running this code: 12 | 13 | chcp 65001 14 | 15 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 16 | 17 | 18 | You should install the SQLite browser to view and modify 19 | the databases from: 20 | 21 | http://sqlitebrowser.org/ 22 | 23 | The first problem to solve is that the Google geocoding 24 | API is rate limited to a fixed number of requests per day. 25 | So if you have a lot of data you might need to stop and 26 | restart the lookup process several times. So we break 27 | the problem into two phases. 28 | 29 | In the first phase we take our input data in the file 30 | (where.data) and read it one line at a time, and retrieve the 31 | geocoded response and store it in a database (geodata.sqlite). 32 | Before we use the geocoding API, we simply check to see if 33 | we already have the data for that particular line of input. 34 | 35 | You can re-start the process at any time by removing the file 36 | geodata.sqlite 37 | 38 | Run the geoload.py program. This program will read the input 39 | lines in where.data and for each line check to see if it is already 40 | in the database and if we don't have the data for the location, 41 | call the geocoding API to retrieve the data and store it in 42 | the database. 43 | 44 | As of December 2016, the Google Geocoding APIs changed dramatically. 45 | They moved some functionality that we use from the Geocoding API 46 | into the Places API. Also all the Google Geo-related APIs require an 47 | API key. To complete this assignment without a Google account, 48 | without an API key, or from a country that blocks 49 | access to Google, you can use a subset of that data which is 50 | available at: 51 | 52 | http://py4e-data.dr-chuck.net/geojson 53 | 54 | To use this, simply leave the api_key set to False in 55 | geoload.py. 56 | 57 | This URL only has a subset of the data but it has no rate limit so 58 | it is good for testing. 
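If you just want to sanity-check the fallback service before running
the full loader, a single lookup can be done by hand. Below is a minimal
sketch (not part of the assignment); the address parameter mirrors the
sample URLs shown further down, and the response shape is the same one
geodump.py parses:

    import urllib.request, urllib.parse, json

    serviceurl = 'http://py4e-data.dr-chuck.net/geojson?'
    url = serviceurl + urllib.parse.urlencode({'address': 'Monash University'})
    js = json.loads(urllib.request.urlopen(url).read().decode())
    # 'status' and 'results' are the keys geodump.py relies on
    if js.get('status') == 'OK':
        print(js['results'][0]['formatted_address'])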
59 | 60 | If you want to try this with the API key, follow the 61 | instructions at: 62 | 63 | https://developers.google.com/maps/documentation/geocoding/intro 64 | 65 | and put the API key in the code. 66 | 67 | Here is a sample run after there is already some data in the 68 | database: 69 | 70 | Mac: python3 geoload.py 71 | Win: geoload.py 72 | 73 | Found in database Northeastern University 74 | 75 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 76 | 77 | Found in database Technion 78 | 79 | Found in database Viswakarma Institute, Pune, India 80 | 81 | Found in database UMD 82 | 83 | Found in database Tufts University 84 | 85 | Resolving Monash University 86 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Monash+University 87 | Retrieved 2063 characters { "results" : [ 88 | {u'status': u'OK', u'results': ... } 89 | 90 | Resolving Kokshetau Institute of Economics and Management 91 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Kokshetau+Institute+of+Economics+and+Management 92 | Retrieved 1749 characters { "results" : [ 93 | {u'status': u'OK', u'results': ... } 94 | 95 | The first five locations are already in the database and so they 96 | are skipped. The program scans to the point where it finds un-retrieved 97 | locations and starts retrieving them. 98 | 99 | The geoload.py program can be stopped at any time, and there is a counter 100 | that you can use to limit the number of calls to the geocoding 101 | API for each run. 102 | 103 | Once you have some data loaded into geodata.sqlite, you can 104 | visualize the data using the (geodump.py) program. This 105 | program reads the database and writes the file (where.js) 106 | with the location, latitude, and longitude in the form of 107 | executable JavaScript code. 108 | 109 | A run of the geodump.py program is as follows: 110 | 111 | Mac: python3 geodump.py 112 | Win: geodump.py 113 | 114 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 115 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 116 | ... 117 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 118 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 119 | Kokshetau, Kazakhstan 53.2833333 69.3833333 120 | ... 121 | 12 records written to where.js 122 | Open where.html to view the data in a browser 123 | 124 | The file (where.html) consists of HTML and JavaScript to visualize 125 | a Google Map. It reads the most recent data in where.js to get 126 | the data to be visualized. Here is the format of the where.js file: 127 | 128 | myData = [ 129 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 130 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 131 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 132 | ... 133 | ]; 134 | 135 | This is a JavaScript list of lists. The syntax for JavaScript 136 | list constants is very similar to Python so the syntax should 137 | be familiar to you. 138 | 139 | Simply open where.html in a browser to see the locations. You 140 | can hover over each map pin to find the location that the 141 | geocoding API returned for the user-entered input. If you 142 | cannot see any data when you open the where.html file, you might 143 | want to check the JavaScript or developer console for your browser.
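As a quick check of how much has been cached before you run geodump.py,
you can count the rows in the database. This is a small helper, not part
of the assignment, and it assumes the Locations table that geoload.py
creates (address TEXT, geodata TEXT):

    import sqlite3

    conn = sqlite3.connect('geodata.sqlite')
    cur = conn.cursor()
    cur.execute('SELECT COUNT(*) FROM Locations')   # table created by geoload.py
    print(cur.fetchone()[0], 'addresses cached in geodata.sqlite')
    conn.close()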
144 | 145 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodata.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geodata.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geodump.png -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js', 'w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur : 13 | data = str(row[1].decode()) 14 | try: js = json.loads(str(data)) 15 | except: continue 16 | 17 | if not('status' in js and js['status'] == 'OK') : continue 18 | 19 | lat = js["results"][0]["geometry"]["location"]["lat"] 20 | lng = js["results"][0]["geometry"]["location"]["lng"] 21 | if lat == 0 or lng == 0 : continue 22 | where = js['results'][0]['formatted_address'] 23 | where = where.replace("'", "") 24 | try : 25 | print(where, lat, lng) 26 | 27 | count = count + 1 28 | if count > 1 : fhand.write(",\n") 29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 30 | fhand.write(output) 31 | except: 32 | continue 33 | 34 | fhand.write("\n];\n") 35 | cur.close() 36 | fhand.close() 37 | print(count, "records written to where.js") 38 | print("Open where.html to view the data in a browser") 39 | 40 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geoload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geoload.png -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geoload.py: -------------------------------------------------------------------------------- 1 | import urllib.request, urllib.parse, urllib.error 2 | import http 3 | import sqlite3 4 | import json 5 | import time 6 | import ssl 7 | import sys 8 | 9 | api_key = False 10 | # If you have a Google Places API key, enter it here 11 | api_key = 'AIzaSyDT8qBVoIMqfs6VvXZynQ9YbutG3kDOdmA ' 12 | 13 | if api_key is False: 14 | serviceurl = "http://py4e-data.dr-chuck.net/geojson?" 15 | else : 16 | serviceurl = "https://maps.googleapis.com/maps/api/place/textsearch/json?" 
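# Note: when api_key is set above, requests go to the Google Places Text
# Search endpoint and the key is added as the 'key' query parameter further
# down; leaving api_key = False switches to the py4e mirror, which needs no
# key and is not rate limited. If keyed requests fail, it may be worth
# checking the key string above for stray whitespace before suspecting quota.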
17 | 18 | # Additional detail for urllib 19 | # http.client.HTTPConnection.debuglevel = 1 20 | 21 | conn = sqlite3.connect('geodata.sqlite') 22 | cur = conn.cursor() 23 | 24 | cur.execute(''' 25 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''') 26 | 27 | # Ignore SSL certificate errors 28 | ctx = ssl.create_default_context() 29 | ctx.check_hostname = False 30 | ctx.verify_mode = ssl.CERT_NONE 31 | 32 | fh = open("where.data") 33 | count = 0 34 | for line in fh: 35 | if count > 200 : 36 | print('Retrieved 200 locations, restart to retrieve more') 37 | break 38 | 39 | address = line.strip() 40 | print('') 41 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", 42 | (memoryview(address.encode()), )) 43 | 44 | try: 45 | data = cur.fetchone()[0] 46 | print("Found in database ",address) 47 | continue 48 | except: 49 | pass 50 | 51 | parms = dict() 52 | parms["query"] = address 53 | if api_key is not False: parms['key'] = api_key 54 | url = serviceurl + urllib.parse.urlencode(parms) 55 | 56 | print('Retrieving', url) 57 | uh = urllib.request.urlopen(url, context=ctx) 58 | data = uh.read().decode() 59 | print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' ')) 60 | count = count + 1 61 | 62 | try: 63 | js = json.loads(data) 64 | except: 65 | print(data) # We print in case unicode causes an error 66 | continue 67 | 68 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 69 | print('==== Failure To Retrieve ====') 70 | print(data) 71 | break 72 | 73 | cur.execute('''INSERT INTO Locations (address, geodata) 74 | VALUES ( ?, ? )''', (memoryview(address.encode()), memoryview(data.encode()) ) ) 75 | conn.commit() 76 | if count % 10 == 0 : 77 | print('Pausing for a bit...') 78 | time.sleep(5) 79 | 80 | print("Run geodump.py to read the data from the database so you can vizualize it on a map.") 81 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.data: -------------------------------------------------------------------------------- 1 | AGH University of Science and Technology 2 | Academy of Fine Arts Warsaw Poland 3 | American University in Cairo 4 | Arizona State University 5 | Athens Information Technology 6 | BITS Pilani 7 | Babcock University 8 | Banaras Hindu University 9 | Bangalore University 10 | Baylor University 11 | Beijing normal university 12 | Belarusian State University 13 | Belgrade University 14 | Beloit College 15 | Belorussian State University 16 | Ben Gurion University 17 | Bharthidasan University 18 | Boston University 19 | California Polytechnic State University of San Luis Obispo 20 | California State University San Bernardino 21 | City of Westminster College 22 | Columbia University 23 | Cranfield University 24 | Czech Technical University in Prague 25 | Dartmouth 26 | De Anza College 27 | Distant University of Hagen 28 | Dnipropetrovsk National University 29 | Dokuz Eylul University 30 | Drexel 31 | Drexel University and University of Texas at Austin 32 | Duke University 33 | EM Lyon 34 | Ecole centrale de PARIS 35 | Elon University 36 | Erhvervsakademi Sydvest 37 | Escuela Superior Politecnica del Litoral 38 | Fachhochschule Dusseldorf 39 | Fachhochschule FH Salzburg 40 | Faculdade de Tecnologia do Estado de Sao Paulo 41 | Faculty of Technical Sciences Novi Sad Serbia 42 | Farmingdale State University 43 | Federal University of Minas Gerais 44 | Florida Atlantic University 45 | Franklin Pierce College 46 | Gauhati University 
47 | George Mason University 48 | Georgetown University Law Center 49 | Georgia State University 50 | Grandville 51 | Groep T University 52 | Hanoi University of Science and Technology 53 | Hebrew University 54 | IIIT Hyderabad 55 | IIT KANPUR 56 | IT College of Estonia 57 | IU 58 | IUAV Venezia 59 | Illinois Institute of Technology 60 | Illinois State University Joliet Junior College 61 | Indian Institute of Technology 62 | Indian Institute of Technology Kharagpur India 63 | Indian School of Mines Dhanbad 64 | Indiana University 65 | Indiana University at Bloomington 66 | Institut Superieur de technologies 67 | Institute of Business and Modern Technologies 68 | Instituto Tecnologico de Santo Domingo 69 | International Institute of Information Technology Hyderabad 70 | Irkutsk State University 71 | JADAVPUR UNIVERSITY 72 | Jawaharlal Nehru Technological University 73 | Jawaharlal Nehru University 74 | Jordan University of Science and Technology 75 | K-State 76 | KUL 77 | Kalamazoo College 78 | Kaunas Technology University 79 | Kaunas university of technology 80 | Kazan Federal University 81 | Kent State University 82 | Kharkiv State Academy of Municipal Economy Ukraine 83 | King Mongkuts University of Technology Thonburi 84 | Kokshetau Institute of Economics and Management 85 | Kyiv Polytechnic Institute 86 | Kyiv Polytechnical Institute 87 | Kyiv Unisersity of Oriental Language 88 | Laurentian University 89 | Lisandro Alvarado 90 | Lodz University of Technology 91 | Lviv University 92 | MSU 93 | Madras university 94 | Magnitogorsk State Technical University 95 | Malayer Azad University 96 | Marietta College 97 | Masdar Institute 98 | Matematicki fakultet Beograd 99 | Michigan State University 100 | Middle East Technical University 101 | Missouri University of Science and Technology 102 | Monash 103 | Monash University 104 | Monash University Churchill Australia 105 | Monterrey Institute of Technology and Higher Education 106 | Moscow Engineering-Physics Institute 107 | Moscow Institute of Physics & Technology 108 | Moscow State University 109 | NIT ROURKELA 110 | NYU 111 | Nagpur University 112 | Nanyang Technological University 113 | National Institute of Technology Jalandhar 114 | National Taiwan University 115 | National University of Engineering 116 | North Central College 117 | Northeastern University 118 | Northwestern University 119 | Obninsk Technical University of Nuclear Power Engineering Russia 120 | Old Dominion University 121 | Oregon Institute of Technology 122 | PUCMM 123 | Payame Noor University 124 | Penn State University 125 | Politecnico di Milano 126 | Politehnica University Bucharest 127 | Polytechnic University of Timisoara 128 | Pondicherry University 129 | Pontificia universidad catolica de chile 130 | Portland State University 131 | Purdue University Indianapolis 132 | R V College of Engineering 133 | RPI 134 | Ramapo College of New Jersey 135 | Rochester Institute of Technology 136 | SASTRA University 137 | Saint Petersburg State University 138 | Saint Petersburg State University of Aerospace Instrumentation 139 | Saint-Petersburg Polytechnic Univesity 140 | San Francisco State University 141 | San Jose State University 142 | Shanghai Jiao Tong University 143 | Sharif University of Technology 144 | Simon Bolivar University 145 | Simon Fraser University 146 | Smolensk State University 147 | Sonoma State University 148 | South Federal University 149 | Spiru Haret University 150 | Stanford 151 | State University of Campinas 152 | State University of New York College 
at Oswego 153 | Stellenbosch University 154 | Stonehill College 155 | Tallinn University 156 | Tallinn University of Technology 157 | Tampere University of Technology 158 | Tanta University 159 | Tarrant County College 160 | Technical University of Cluj-Napoca 161 | Technion 162 | Tel Aviv University 163 | The Jerusalem collage of engineering 164 | The University of Latvia 165 | The University of Manchester 166 | The University of South Africa 167 | Transilvania University 168 | Tufts University 169 | UC Berkeley 170 | UCLA 171 | UCSD 172 | UIUC 173 | UMD 174 | UNISA 175 | UNIVERSIDAD DE Buenos Aires 176 | UOC 177 | USC 178 | UW Madison 179 | Universidad Central de Venezuela 180 | Universidad Complutense de Madrid 181 | Universidad Cooperativa de Colombia 182 | Universidad Nacional Autonoma de Mexico 183 | Universidad Nacional Costa Rica 184 | Universidad Nacional de Colombia 185 | Universidad Tecnologica Boliviana 186 | Universidad de Buenos Aires 187 | Universidad de Castilla La Mancha 188 | Universidad de Los Andes Colombia 189 | Universidad de Oriente 190 | Universidad de San Carlos de Guatemala 191 | Universidad de Valladolid 192 | Universidad de la Sabana 193 | Universidad del Valle de Guatemala 194 | Universidade Federal da Paraiba 195 | Universidade Federal de Santa Catarina 196 | Universidade Federal do Rio Grande do Sul 197 | Universidade Federal do Rio de Janeiro 198 | Universidade Tecnica de Lisboa 199 | Universidade de Sao Paulo 200 | Universidade do Minho 201 | Universitas Gadjah Mada 202 | Universitat Politecnica de Valencia 203 | Universite Catholique de Louvain 204 | University College Dublin 205 | University Munich 206 | University of Akron 207 | University of Alberta 208 | University of Amsterdam 209 | University of Arkansas 210 | University of Athens 211 | University of Belgrade 212 | University of Birmingham 213 | University of Buenos Aires 214 | University of Cambridge 215 | University of Central Oklahoma 216 | University of Chicago 217 | University of Cincinnati 218 | University of Colorado at Boulder 219 | University of Connecticut 220 | University of Dallas 221 | University of Debrecen 222 | University of Delaware 223 | University of Erlangen-Nuremberg 224 | University of Essex 225 | University of Evora 226 | University of Florida 227 | University of Gothenburg 228 | University of Greifswald 229 | University of Hamburg 230 | University of Hawaii 231 | University of Helsinki 232 | University of Ilorin Kwara State 233 | University of Jaffna 234 | University of Kansas 235 | University of Kerala 236 | University of London 237 | University of Malaga 238 | University of Malaya 239 | University of Manchester 240 | University of Michigan 241 | University of Missouri - Columbia 242 | University of Moratuwa 243 | University of Mumbai 244 | University of Nebraska 245 | University of Nebraska - Lincoln 246 | University of New Haven 247 | University of New South Wales 248 | University of Notre Dame 249 | University of Oklahoma 250 | University of Ottawa 251 | University of Oxford 252 | University of Padua 253 | University of Pavia Italy 254 | University of Pennsylvania 255 | University of Piraeus Athens 256 | University of Pretoria 257 | University of Salamanca 258 | University of Sao Paulo 259 | University of Sarajevo 260 | University of Southern California 261 | University of Stellenbosch 262 | University of Tartu 263 | University of Tehran 264 | University of Texas 265 | University of Texas at Austin 266 | University of Toronto 267 | University of Tuebingen 268 | University 
of Twente 269 | University of Utah 270 | University of Vienna 271 | University of Warsaw 272 | University of Washington 273 | University of Washington - Bothell 274 | University of Waterloo 275 | University of West Florida 276 | University of Wisconsin 277 | University of the Punjab Lahore 278 | University of the Witwatersrand 279 | Vilnius Gediminas Technical University 280 | Vilnius University 281 | Virginia Commonwealth University 282 | Virginia Tech 283 | Viswakarma Institute Pune India 284 | Warsaw University 285 | Washington State University 286 | Wayne State 287 | Weber State 288 | Weizmann Institute of Science 289 | Western Governors University 290 | Xavier University 291 | Zagazig University 292 | allama iqbal open university islamabad 293 | arizona state university 294 | federal institute of tecnology and education from southeastern Minas Gerais 295 | kansas state university 296 | universidad complutense de madrid 297 | university of Patras 298 | university of padua 299 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 41 | 42 | 43 |
44 |   [where.html markup was stripped in this dump; the visible page text is "About this Map" and "This is a cool map from www.py4e.com."]
49 | 50 | 51 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/where.png -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Directory Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 2/Week 2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:06:30 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import re 9 | pile = open('regex_sum_41647.txt') 10 | gold = pile.read() 11 | copier = re.findall("[0-9]+", gold) 12 | dice = [int(i) for i in copier] 13 | sum = 0 14 | for k in dice: 15 | sum += k 16 | print(sum) 17 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/Week 3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:27:29 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import socket 9 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 10 | mysock.connect(('data.pr4e.org', 80)) 11 | cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode() 12 | mysock.send(cmd) 13 | 14 | lit = list() 15 | while True: 16 | data = mysock.recv(512) 17 | lit.append(data) 18 | if (len(data) < 1): 19 | break 20 | print(data.decode()) 21 | mysock.close() 22 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 
11 | 12 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Following Links in HTML Using BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:12:49 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter URL: ') 16 | num = input('Enter count: ') 17 | pos = input('Enter position: ') 18 | print('Retrieving: ', url) 19 | for times in range(int(num)): 20 | html = urllib.request.urlopen(url, context=ctx).read() 21 | soup = BeautifulSoup(html, 'html.parser') 22 | tags = soup('a') 23 | print('Retrieving: ', tags[int(pos)-1].get('href', None)) 24 | url = tags[int(pos)-1].get('href', None) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Scraping HTML Data with BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:10:37 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | from urllib.request import urlopen 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | 12 | ctx = ssl.create_default_context() 13 | ctx.check_hostname = False 14 | ctx.verify_mode = ssl.CERT_NONE 15 | html = urlopen(' http://py4e-data.dr-chuck.net/comments_41649.html', context=ctx).read() 16 | soup = BeautifulSoup(html, "html.parser") 17 | tags = soup('span') 18 | sum = 0 19 | coun = 0 20 | print('Enter - ') 21 | for tag in tags: 22 | coun += 1 23 | sum += int(tag.contents[0]) 24 | print('Count', coun, '\nSum', sum) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 5/Extracting Data from XML.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:43:48 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import xml.etree.ElementTree as ET 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter location: ') 16 | print ('Retrieving ', url) 17 | html = urllib.request.urlopen(url, context=ctx).read() 18 | print ('Retrieved', len(html), 'characters') 19 | tree = ET.fromstring(html) 20 | print ('Count: ',len(tree.findall('.//count'))) 21 | total = 0 22 | for r in tree.findall("./comments/comment"): 23 | total += int(r.find('count').text) 24 | print ('Sum: ', total) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/Extracting Data from JSON.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:45:18 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | url = input('Enter location: ') 12 | data = urllib.request.urlopen(url).read() 13 | info = json.loads(data) 14 | info = info['comments'] 15 | print ('Retrieving', url, '\nRetrieved', len(data), 'caracters', '\nCount:', len(info)) 16 | num = 0 17 | for item in info: 18 | num += 
int(item['count']) 19 | print ('Sum:', num) 20 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/GEOSON API.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:46:57 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | serviceurl = 'http://python-data.dr-chuck.net/geojson' 12 | address = input('Enter location: ') 13 | url = serviceurl + '?' + urllib.parse.urlencode({'sensor':'false', 'address': address}) 14 | data = urllib.request.urlopen(url).read().decode() 15 | info = json.loads(data) 16 | info = info['results'] 17 | print ('Retrieving', url, '\nRetrieved', len(data), 'characters') 18 | for item in info: 19 | key = item['place_id'] 20 | print ('Place id:', key) 21 | --------------------------------------------------------------------------------
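A hedged variation on the lookup in GEOSON API.py (not part of the graded
script): it prints the latitude and longitude along with the place id,
reusing the same service URL and the results/geometry/location shape that
geodump.py parses. The example address is only illustrative.

    import urllib.request, urllib.parse, json

    serviceurl = 'http://python-data.dr-chuck.net/geojson'
    address = 'University of Michigan'  # illustrative input
    url = serviceurl + '?' + urllib.parse.urlencode({'sensor': 'false', 'address': address})
    js = json.loads(urllib.request.urlopen(url).read().decode())
    result = js['results'][0]          # first match, as in the graded script
    loc = result['geometry']['location']
    print('Place id:', result['place_id'])
    print('Lat/Lng:', loc['lat'], loc['lng'])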