├── Capstone ├── Mailing List I │ ├── Content.sqlite Snapshot.PNG │ ├── Gline Visualization.PNG │ ├── Gmodel Index sqlite Screenshot.PNG │ ├── Gmodel.py Application Screenshot.PNG │ ├── Histogram gbasic.PNG │ ├── README.txt │ ├── Second Gline.PNG │ ├── Wordcloud.PNG │ ├── content.sqlite │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.js │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.js │ ├── gword.py │ ├── gyear.py │ ├── index.sqlite │ └── mapping.sqlite └── Pagerank │ ├── LICENSE │ ├── Pagerank Dr.Chuck.PNG │ ├── Pagerank WP.PNG │ ├── README.txt │ ├── bs4 │ ├── __init__.py │ ├── __init__.py.bak │ ├── builder │ │ ├── __init__.py │ │ ├── __init__.py.bak │ │ ├── _html5lib.py │ │ ├── _html5lib.py.bak │ │ ├── _htmlparser.py │ │ ├── _htmlparser.py.bak │ │ ├── _lxml.py │ │ └── _lxml.py.bak │ ├── dammit.py │ ├── dammit.py.bak │ ├── diagnose.py │ ├── diagnose.py.bak │ ├── element.py │ ├── element.py.bak │ ├── testing.py │ ├── testing.py.bak │ └── tests │ │ ├── __init__.py │ │ ├── test_builder_registry.py │ │ ├── test_docs.py │ │ ├── test_html5lib.py │ │ ├── test_html5lib.py.bak │ │ ├── test_htmlparser.py │ │ ├── test_lxml.py │ │ ├── test_lxml.py.bak │ │ ├── test_soup.py │ │ ├── test_soup.py.bak │ │ ├── test_tree.py │ │ └── test_tree.py.bak │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── pagerank orginal.PNG │ ├── spdump.py │ ├── spdump.py Dr. Chuck.PNG │ ├── spdump.py WP.PNG │ ├── spider.js │ ├── spider.py │ ├── spider.sqlite │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── Programming for Everybody ├── Chapter 1 │ └── Hello World.py ├── Chapter 2 │ ├── Assignment 2.2.py │ └── Assignment 2.3.py ├── Chapter 3 │ ├── Assignment 3.1.py │ └── Assignment 3.3.py ├── Chapter 4 │ └── Assignment 4.6.py └── Chapter 5 │ └── Assignment 5.2.py ├── Python Data Structures ├── Atom Editor Test.PNG ├── Chapter 10 │ ├── Assignment 10.2.py │ └── mbox-short.txt ├── Chapter 6 │ └── Assignment 6.5.py ├── Chapter 7 │ ├── Assignment 7.1.py │ ├── Assignment 7.2.py │ ├── mbox-short.txt │ └── words.txt ├── Chapter 8 │ ├── Assignment 8.4.py │ ├── Assignment 8.5.py │ ├── mbox-short.txt │ └── romeo.txt ├── Chapter 9 │ ├── Assignment 9.4.py │ └── mbox-short.txt ├── Directory Test.PNG └── Test.py ├── README.md ├── Using Databases with Python ├── Week 2 │ ├── First Database.db │ ├── First Database.db.sqbpro │ ├── First Database.sql │ ├── emaildb.py │ ├── emaildb.sqlite │ ├── mbox-short.txt │ └── mbox.txt ├── Week 3 │ ├── Library.xml │ ├── README.txt │ ├── trackdb.sqlite │ ├── tracks.py │ ├── trackscomplete.py │ └── tracksdb.py ├── Week 4 │ ├── HW Result.sql │ ├── roster.py │ ├── roster_data.json │ └── rosterdb.sqlite └── Week 5 │ ├── Google API Key.doc │ ├── README.txt │ ├── geodata.sqlite │ ├── geodump.png │ ├── geodump.py │ ├── geoload.png │ ├── geoload.py │ ├── where.data │ ├── where.html │ ├── where.js │ └── where.png └── Using Python to Access Web Data ├── Week 1 ├── Atom Editor Test.PNG ├── Directory Test.PNG └── Test.py ├── Week 2 ├── Week 2.py └── regex_sum_41647.txt ├── Week 3 ├── Week 3.py └── intro-short.txt ├── Week 4 ├── Following Links in HTML Using BeautifulSoup.py └── Scraping HTML Data with BeautifulSoup.py ├── Week 5 └── Extracting Data from XML.py └── Week 6 ├── Extracting Data from JSON.py └── GEOSON API.py /Capstone/Mailing List I/Content.sqlite Snapshot.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Content.sqlite Snapshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gline Visualization.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gline Visualization.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Histogram gbasic.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Histogram gbasic.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and vizualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in the gmane.py and is hard-coded to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base url. Make sure to delete the content.sqlite file if you 25 | switch the base url. The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down. So you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py. 
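If you download the pre-spidered file, a quick way to check what you already have (and roughly where gmane.py will resume) is to query the Messages table that gmane.py creates in content.sqlite. This is only an illustrative snippet, not part of the course code:

import sqlite3

# content.sqlite holds the raw archive; gmane.py creates the Messages table
conn = sqlite3.connect('content.sqlite')
cur = conn.cursor()
cur.execute('SELECT COUNT(id), MAX(id) FROM Messages')
count, maxid = cur.fetchone()
print('Messages stored:', count, 'highest id:', maxid)
conn.close()

Because the spider scans forward from the first un-spidered message number, the MAX(id) value tells you approximately where the next run of gmane.py will pick up.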
39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an innefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process. 
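If you want to peek inside index.sqlite yourself, keep in mind that gmodel.py stores the headers and body columns as zlib-compressed BLOBs (see the zlib.compress() calls in gmodel.py later in this folder), so any ad-hoc query needs a matching decompress step. A minimal sketch, assuming gmodel.py has already built index.sqlite:

import sqlite3
import zlib

conn = sqlite3.connect('index.sqlite')
cur = conn.cursor()
# headers and body are stored compressed, so decompress them on the way out
cur.execute('SELECT headers, body FROM Messages LIMIT 1')
row = cur.fetchone()
if row is not None:
    print(zlib.decompress(row[0]).decode()[:200])
conn.close()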
100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaning steps: 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net; 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also, mail addresses are 118 | forced to lower case, and some of the @gmane.org addresses like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table: 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS name. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together. 149 | 150 | You can re-run gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to do a "who does the most" and "which 156 | organization does the most"? This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py.
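The mapping entries described above are just rows in two-column (old, new) tables, so you can also add them from a short script instead of the SQLite browser. Note that the gmodel.py included in this folder reads these tables from mapping.sqlite, so point the connection at whichever file actually holds them in your setup. A minimal sketch using the example mappings from above:

import sqlite3

# gmodel.py in this folder reads Mapping and DNSMapping from mapping.sqlite
conn = sqlite3.connect('mapping.sqlite')
cur = conn.cursor()

# Collapse an older address onto the canonical one
cur.execute('INSERT INTO Mapping (old, new) VALUES (?, ?)',
            ('s-githens@northwestern.edu', 'swgithen@mtu.edu'))

# Track all the Indiana University campuses together
cur.execute('INSERT INTO DNSMapping (old, new) VALUES (?, ?)',
            ('iupui.edu', 'indiana.edu'))

conn.commit()
conn.close()

Re-run gmodel.py after any mapping change; it wipes and rebuilds index.sqlite, so the new mappings are applied to every message.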
181 | 182 | There is a simple visualization of the word frequency in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js, which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/Second Gline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Second Gline.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Wordcloud.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Wordcloud.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/content.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/content.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.htm: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.js: -------------------------------------------------------------------------------- 1 | gline = [ ['Month','umich.edu','gmail.com','swinsborg.com','cam.ac.uk','uct.ac.za','indiana.edu','unicon.net','berkeley.edu','longsight.com','stanford.edu'], 2 | ['2005-12',57,10,0,7,14,12,6,12,0,4], 3 | ['2006-01',114,23,0,19,27,32,10,33,0,13], 4 | ['2006-02',121,28,0,28,23,33,7,28,0,24], 5 | ['2006-03',86,27,0,44,18,43,11,34,1,14], 6 | ['2006-04',125,24,0,54,38,47,42,44,1,25], 7 | ['2006-05',151,26,0,103,51,55,101,76,2,22], 8 | ['2006-06',119,30,0,76,66,70,37,32,0,13], 9 | ['2006-07',86,19,0,47,55,76,37,18,0,39], 10 | ['2006-08',136,70,0,46,44,102,50,20,0,21], 11 | ['2006-09',131,46,0,36,26,46,28,27,0,32], 12 | ['2006-10',109,28,0,74,20,52,35,30,0,31], 13 | ['2006-11',87,55,0,51,47,36,24,35,0,16], 14 | ['2006-12',54,58,0,21,13,46,8,26,0,15], 15 | ['2007-01',84,32,0,42,35,43,10,24,0,26], 16 | ['2007-02',114,51,0,59,44,54,10,18,0,30], 17 | ['2007-03',93,45,4,54,38,64,4,46,0,34], 18 | ['2007-04',68,54,1,46,25,72,10,24,0,27], 19 | ['2007-05',98,45,17,61,61,41,16,62,0,39], 20 | ['2007-06',115,43,3,58,36,39,33,50,0,38], 21 | ['2007-07',126,53,28,89,69,28,59,45,0,34], 22 | ['2007-08',182,37,21,102,50,63,48,46,0,28], 23 | ['2007-09',167,52,31,132,70,47,98,53,0,58], 24 | ['2007-10',104,62,41,97,47,44,100,41,0,73], 25 | ['2007-11',110,46,22,142,51,40,90,41,0,23], 26 | ['2007-12',151,71,21,123,36,33,67,35,0,17], 27 | ['2008-01',126,49,17,64,32,42,24,39,0,15], 28 | ['2008-02',94,39,51,89,30,34,19,14,0,23], 29 | ['2008-03',89,45,14,43,42,39,29,19,0,27], 30 | ['2008-04',140,58,22,99,50,27,57,40,0,26], 31 | ['2008-05',130,60,44,104,36,17,93,26,0,19], 32 | ['2008-06',96,28,13,36,33,21,31,25,0,5], 33 | ['2008-07',115,32,24,75,55,22,59,30,2,7], 34 | ['2008-08',165,42,31,80,65,23,41,39,5,13], 35 | ['2008-09',119,54,31,35,35,44,28,25,0,10], 36 | ['2008-10',85,40,33,60,31,27,33,15,4,3], 37 | ['2008-11',43,23,19,26,19,12,17,11,1,5], 38 | ['2008-12',67,30,18,17,22,12,18,6,3,4], 39 | ['2009-01',46,16,18,19,27,3,1,9,0,4], 40 | ['2009-02',23,43,38,26,17,15,5,9,1,5], 41 | ['2009-03',94,76,56,5,27,18,19,6,7,9], 42 | ['2009-04',74,101,43,2,28,18,42,5,8,10], 43 | ['2009-05',49,122,61,6,29,16,25,13,10,12], 44 | ['2009-06',43,64,41,4,29,11,8,27,8,4], 45 | ['2009-07',67,99,50,12,32,20,21,27,12,4], 46 | ['2009-08',42,59,17,13,35,25,12,8,13,3], 47 | ['2009-09',71,42,23,8,33,22,11,6,9,19], 48 | ['2009-10',77,69,85,4,50,43,29,3,9,8], 49 | ['2009-11',55,40,46,10,26,26,14,11,6,9], 50 | ['2009-12',43,34,26,2,21,16,11,2,3,4], 51 | ['2010-01',57,29,39,3,26,19,10,3,17,10], 52 | ['2010-02',54,36,42,12,22,21,16,2,16,1], 53 | ['2010-03',72,89,53,12,38,22,18,8,14,13], 54 | ['2010-04',41,38,30,2,18,9,15,9,17,24], 55 | ['2010-05',50,32,47,3,34,10,8,2,8,9], 56 | ['2010-06',28,56,47,10,18,12,7,2,19,14], 57 | ['2010-07',53,57,54,10,42,9,4,3,11,10], 58 | ['2010-08',57,47,36,13,41,18,11,1,15,8], 59 | ['2010-09',58,44,34,4,22,21,3,3,18,4], 60 | ['2010-10',42,41,18,2,12,4,4,4,14,11], 61 | ['2010-11',41,34,23,5,13,10,4,0,7,1], 62 | ['2010-12',26,32,13,2,11,8,7,1,7,3], 63 | ['2011-01',35,47,46,5,20,7,2,2,22,6], 64 | ['2011-02',30,58,51,2,15,9,17,1,18,6], 65 | ['2011-03',60,86,54,10,12,17,15,2,52,11], 66 | ['2011-04',38,45,25,3,6,21,6,0,19,3], 67 | ['2011-05',18,39,15,9,13,14,8,1,19,3], 68 | ['2011-06',30,89,22,4,22,10,13,0,7,4], 69 | ['2011-07',45,69,73,5,18,16,6,1,39,5], 70 | 
['2011-08',42,45,37,9,13,13,4,4,48,10], 71 | ['2011-09',40,80,28,5,11,16,12,6,33,18], 72 | ['2011-10',23,59,26,7,4,11,12,0,34,6], 73 | ['2011-11',30,86,42,5,16,9,3,1,23,1], 74 | ['2011-12',26,30,31,2,7,5,5,0,21,13], 75 | ['2012-01',32,54,28,1,20,8,8,0,32,10], 76 | ['2012-02',37,85,62,5,24,10,18,4,36,22], 77 | ['2012-03',42,84,47,2,19,4,19,1,56,9], 78 | ['2012-04',24,50,52,0,11,4,15,1,45,10], 79 | ['2012-05',24,92,66,1,9,12,18,1,36,12], 80 | ['2012-06',53,90,51,2,12,14,54,1,47,1], 81 | ['2012-07',13,59,47,7,7,5,8,2,40,5], 82 | ['2012-08',17,61,51,1,7,10,18,1,41,1], 83 | ['2012-09',21,44,35,1,14,11,16,0,46,9], 84 | ['2012-10',21,51,36,3,19,6,10,0,63,9], 85 | ['2012-11',29,63,62,2,15,20,11,0,71,5], 86 | ['2012-12',12,29,35,0,5,8,9,0,31,9], 87 | ['2013-01',39,44,40,3,5,3,3,0,50,6], 88 | ['2013-02',59,97,66,0,5,25,13,0,54,8], 89 | ['2013-03',60,122,66,1,12,33,30,0,72,11], 90 | ['2013-04',9,34,18,0,4,4,8,0,17,2] 91 | ]; 92 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import 
urlparse 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', 
hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | 
continue 80 | 81 | if dnotz is None : 82 | # print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM 
Messages ORDER BY sent_at''') 192 | 193 | senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.js: -------------------------------------------------------------------------------- 1 | gword = [{text: 'sakai', size: 100}, 2 | {text: 'building', size: 71}, 3 | {text: 'tool', size: 26}, 4 | {text: 'with', size: 26}, 5 | {text: 'site', size: 24}, 6 | {text: 'problem', size: 23}, 7 | {text: 'error', size: 22}, 8 | {text: 'from', size: 22}, 9 | {text: 'question', size: 22}, 10 | {text: 'samigo', size: 22}, 11 | {text: 'build', size: 21}, 12 | {text: 'release', size: 21}, 13 | {text: 'trunk', size: 21}, 14 | {text: 'using', size: 21}, 15 | {text: 'resources', size: 21}, 16 | {text: 'issue', size: 21}, 17 | {text: 'user', size: 21}, 18 | {text: 'help', size: 21}, 19 | {text: 'tools', size: 21}, 20 | {text: 'melete', size: 21}, 21 | {text: 'problems', size: 21}, 22 | {text: 'gradebook', size: 21}, 23 | {text: 'maven', size: 21}, 24 | {text: 'mysql', size: 21}, 25 | {text: 'course', size: 21}, 26 | {text: 'tomcat', size: 20}, 27 | {text: 'about', size: 20}, 28 | {text: 'deploying', size: 20}, 29 | {text: 'content', size: 20}, 30 | {text: 'sites', size: 20}, 31 | {text: 'integration', size: 20}, 32 | {text: 'users', size: 20}, 33 | {text: 'email', size: 20}, 34 | {text: 'jira', size: 20}, 35 | {text: 'issues', size: 20}, 36 | {text: 'portal', size: 20}, 37 | {text: 'upgrade', size: 20}, 38 | {text: 'broken', size: 20}, 39 | {text: 'update', size: 20}, 40 | {text: 'change', size: 20}, 41 | {text: 'file', size: 20}, 42 | {text: 'assignment', size: 20}, 43 | {text: 'when', size: 20}, 44 | {text: 'search', size: 20}, 45 | {text: 'code', size: 20}, 46 | {text: 'service', size: 20}, 47 | {text: 'management', size: 20}, 48 | {text: 'webdav', size: 20}, 49 | {text: 'test', size: 20}, 50 | {text: 'errors', size: 20}, 51 | {text: 'oracle', size: 20}, 52 | {text: 'assignments', size: 20}, 53 | {text: 'files', size: 20}, 54 | {text: 'profile', size: 20}, 55 | {text: 'production', size: 20}, 56 | {text: 'page', size: 20}, 57 | {text: 'version', size: 20}, 58 | {text: 'database', size: 20}, 59 | {text: 'hibernate', size: 20}, 60 | {text: 'java', size: 20}, 61 | {text: 'chat', size: 20}, 62 | {text: 'changes', size: 20}, 63 | {text: 'ldap', size: 20}, 64 | {text: 'project', size: 20}, 65 | {text: 'questions', size: 20}, 66 | {text: 'login', size: 20}, 67 | {text: 'testing', size: 20}, 68 | {text: 'info', size: 20}, 69 | {text: 'startup', size: 20}, 70 | {text: 'data', size: 20}, 71 | {text: 'conversion', size: 20}, 72 | {text: 'jforum', size: 20}, 73 | {text: 'performance', size: 20}, 74 | {text: 'kernel', size: 20}, 75 | {text: 'adding', size: 20}, 76 | {text: 'support', size: 20}, 77 | {text: 'import', size: 20}, 78 | {text: 'call', size: 20}, 79 | {text: 'nightly', size: 20}, 80 | {text: 'running', size: 20}, 81 | {text: 'access', size: 20}, 82 | {text: 'branch', size: 20}, 83 | {text: 'into', size: 20}, 84 | {text: 'multiple', size: 20}, 85 | {text: 'message', size: 20}, 86 | {text: 'default', size: 20}, 87 | {text: 'status', size: 
20}, 88 | {text: 'source', size: 20}, 89 | {text: 'create', size: 20}, 90 | {text: 'wiki', size: 20}, 91 | {text: 'scorm', size: 20}, 92 | {text: 'setup', size: 20}, 93 | {text: 'what', size: 20}, 94 | {text: 'more', size: 20}, 95 | {text: 'does', size: 20}, 96 | {text: 'configuration', size: 20}, 97 | {text: 'down', size: 20}, 98 | {text: 'list', size: 20}, 99 | {text: 'getting', size: 20}, 100 | {text: 'server', size: 20} 101 | ]; 102 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the vizualization") 57 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, 
key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Oranizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/index.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/index.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/mapping.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank Dr.Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank Dr.Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 
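Each starting URL becomes its own "web" inside spider.sqlite. If you want to list the starting points you have accumulated without opening the SQLite browser, something like the following works, assuming the Webs table that spider.py creates (check the CREATE TABLE statements in spider.py if your schema differs); this is an illustrative snippet, not part of the assignment code:

import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
# Assumes spider.py keeps one row per starting point in a Webs table
cur.execute('SELECT url FROM Webs')
for (url,) in cur:
    print(url)
conn.close()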
58 | 59 | If you want to dump the contents of the spider.sqlite file, you can 60 | run spdump.py as follows: 61 | 62 | Mac: python3 spdump.py 63 | Win: spdump.py 64 | 65 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 66 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 67 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 68 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 69 | 4 rows. 70 | 71 | This shows the number of incoming links, the old page rank, the new page 72 | rank, the id of the page, and the url of the page. The spdump.py program 73 | only shows pages that have at least one incoming link to them. 74 | 75 | Once you have a few pages in the database, you can run Page Rank on the 76 | pages using the sprank.py program. You simply tell it how many Page 77 | Rank iterations to run. 78 | 79 | Mac: python3 sprank.py 80 | Win: sprank.py 81 | 82 | How many iterations:2 83 | 1 0.546848992536 84 | 2 0.226714939664 85 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 86 | 87 | You can dump the database again to see that page rank has been updated: 88 | 89 | Mac: python3 spdump.py 90 | Win: spdump.py 91 | 92 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 93 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 94 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 95 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 96 | 4 rows. 97 | 98 | You can run sprank.py as many times as you like and it will simply refine 99 | the page rank the more times you run it. You can even run sprank.py a few times 100 | and then go spider a few more pages with spider.py and then run sprank.py 101 | to converge the page ranks. 102 | 103 | If you want to restart the Page Rank calculations without re-spidering the 104 | web pages, you can use spreset.py: 105 | 106 | Mac: python3 spreset.py 107 | Win: spreset.py 108 | 109 | All pages set to a rank of 1.0 110 | 111 | Mac: python3 sprank.py 112 | Win: sprank.py 113 | 114 | How many iterations:50 115 | 1 0.546848992536 116 | 2 0.226714939664 117 | 3 0.0659516187242 118 | 4 0.0244199333 119 | 5 0.0102096489546 120 | 6 0.00610244329379 121 | ... 122 | 42 0.000109076928206 123 | 43 9.91987599002e-05 124 | 44 9.02151706798e-05 125 | 45 8.20451504471e-05 126 | 46 7.46150183837e-05 127 | 47 6.7857770908e-05 128 | 48 6.17124694224e-05 129 | 49 5.61236959327e-05 130 | 50 5.10410499467e-05 131 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 132 | 133 | For each iteration of the page rank algorithm it prints the average 134 | change per page of the page rank. The network initially is quite 135 | unbalanced and so the individual page ranks are changing wildly. 136 | But in a few short iterations, the page rank converges. You 137 | should run sprank.py long enough that the page ranks converge. 138 | 139 | If you want to visualize the current top pages in terms of page rank, 140 | run spjson.py to write the pages out in JSON format to be viewed in a 141 | web browser. 142 | 143 | Mac: python3 spjson.py 144 | Win: spjson.py 145 | 146 | Creating JSON output on spider.js... 147 | How many nodes? 30 148 | Open force.html in a browser to view the visualization 149 | 150 | You can view this data by opening the file force.html in your web browser. 151 | This shows an automatic layout of the nodes and links. 
You can click and 152 | drag any node and you can also double click on a node to find the URL 153 | that is represented by the node. 154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js. 161 | 162 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 
95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider atag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty
tag 113 | will be presented as "
", not "". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "A bold statement.
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('Here's another table:"
45 | '
| ')
48 |
49 | self.assertSoupEquals(
50 | markup,
51 | '
Here\'s another table:'
52 | '
|
Foo |
Bar |
Baz |
foo
68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach thetag; this means the tree is connected. 72 | self.assertEqual(b"
foo
", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = 'foo
\n' 76 | soup = self.soup(markup) 77 | self.assertEqual("foo
\n", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = 'foo
\n\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("foo
\n\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "A bold statement.
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib insertsHere's another table:"
45 | '
| ')
48 |
49 | self.assertSoupEquals(
50 | markup,
51 | '
Here\'s another table:'
52 | '
|
Foo |
Bar |
Baz |
foo
68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach thetag; this means the tree is connected. 72 | self.assertEqual(b"
foo
", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = 'foo
\n' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"foo
\n", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = 'foo
\n\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"foo
\n\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from pdb import set_trace 5 | import pickle 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 7 | from bs4.builder import HTMLParserTreeBuilder 8 | 9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 10 | 11 | @property 12 | def default_builder(self): 13 | return HTMLParserTreeBuilder() 14 | 15 | def test_namespaced_system_doctype(self): 16 | # html.parser can't handle namespaced doctypes, so skip this one. 17 | pass 18 | 19 | def test_namespaced_public_doctype(self): 20 | # html.parser can't handle namespaced doctypes, so skip this one. 21 | pass 22 | 23 | def test_builder_is_pickled(self): 24 | """Unlike most tree builders, HTMLParserTreeBuilder and will 25 | be restored after pickling. 26 | """ 27 | tree = self.soup("foo") 28 | dumped = pickle.dumps(tree, 2) 29 | loaded = pickle.loads(dumped) 30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError as e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "foobar
", "foobar
") 44 | self.assertSoupEquals( 45 | "foobar
", "foobar
") 46 | self.assertSoupEquals( 47 | "foobar
", "foobar
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual("", str(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "foobar
", "foobar
") 44 | self.assertSoupEquals( 45 | "foobar
", "foobar
") 46 | self.assertSoupEquals( 47 | "foobar
", "foobar
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.
17 | 18 | 19 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /Capstone/Pagerank/pagerank orginal.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/pagerank orginal.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print(row) 14 | count = count + 1 15 | print(count, 'rows.') 16 | cur.close() 17 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py Dr. Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py Dr. 
Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":126,"rank":19.0, "id":6, "url":"https://www.washingtonpost.com"}, 3 | {"weight":7,"rank":0.1367546748927559, "id":12, "url":"https://www.washingtonpost.com/news/arts-and-entertainment/wp/2017/10/26/megyn-kelly-responds-to-those-bill-oreilly-thank-you-notes-its-right-out-of-the-playbook"}, 4 | {"weight":7,"rank":0.1367546748927559, "id":14, "url":"https://www.washingtonpost.com/news/politics/wp/2017/09/25/a-trump-judicial-pick-said-transgender-children-are-proof-that-satans-plan-is-working/?utm_term=.8e0dac432ca8"}, 5 | {"weight":12,"rank":0.1367546748927559, "id":17, "url":"https://www.washingtonpost.com/news/animalia/wp/2017/10/26/this-bears-tongue-was-monstrously-enlarged-a-team-of-vets-gave-him-a-second-chance-at-life"}, 6 | {"weight":7,"rank":0.1367546748927559, "id":18, "url":"https://www.washingtonpost.com/news/speaking-of-science/wp/2017/10/25/this-bug-is-real-and-now-so-are-your-nightmares"}, 7 | {"weight":4,"rank":0.0, "id":20, "url":"https://www.washingtonpost.com/news/early-lead/wp/2017/10/26/joe-girardi-out-as-yankees-manager-and-the-nationals-just-happen-to-have-an-opening"}, 8 | {"weight":9,"rank":0.11777961277413226, "id":22, "url":"https://www.washingtonpost.com/people/mike-debonis"}, 9 | {"weight":4,"rank":0.0, "id":25, "url":"https://www.washingtonpost.com/news/post-politics/wp/2017/10/26/trump-plans-to-declare-the-opioid-crisis-a-public-health-emergency"}, 10 | {"weight":5,"rank":0.0, "id":32, "url":"https://www.washingtonpost.com/people/samantha-schmidt"}, 11 | {"weight":5,"rank":0.0, "id":38, "url":"https://www.washingtonpost.com/people/erik-wemple"}, 12 | {"weight":5,"rank":0.0, "id":41, "url":"https://www.washingtonpost.com/news/the-watch/wp/2017/10/26/mississippi-judge-resigns-after-barring-mother-from-seeing-newborn-because-of-unpaid-court-fees"}, 13 | {"weight":7,"rank":0.2567572862009351, "id":42, "url":"https://www.washingtonpost.com/people/radley-balko"}, 14 | {"weight":8,"rank":0.0, "id":43, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/george-w-bush-is-not-the-resistance-hes-part-of-what-brought-us-trump"}, 15 | {"weight":5,"rank":0.0, "id":45, "url":"https://www.washingtonpost.com/people/damian-paletta"}, 16 | {"weight":7,"rank":0.10600165188954895, "id":47, "url":"https://www.washingtonpost.com/powerpost/booker-graham-and-elias-expected-to-testify-today-in-menendez-corruption-trial/2017/10/26/34d9219c-ba4a-11e7-9e58-e6288544af98_story.html"}, 17 | {"weight":5,"rank":0.0, "id":51, "url":"https://www.washingtonpost.com/people/anna-fifield"}, 18 | {"weight":5,"rank":0.0, "id":53, "url":"https://www.washingtonpost.com/people/william-booth"}, 19 | {"weight":7,"rank":0.018975062118623574, "id":54, "url":"https://www.washingtonpost.com/local/virginia-politics/trump-again-tweets-on-virginias-governors-race-says-northam-will-be-very-weak-on-crime/2017/10/26/4c5d5ea6-ba58-11e7-be94-fabb0f1e9ffb_story.html"}, 20 | {"weight":5,"rank":0.0, "id":57, 
"url":"https://www.washingtonpost.com/people/perry-stein"}, 21 | {"weight":5,"rank":0.0, "id":59, "url":"https://www.washingtonpost.com/news/wonk/wp/2017/10/26/why-mcdonalds-is-beating-out-the-fresh-healthy-competition"}, 22 | {"weight":9,"rank":0.0, "id":64, "url":"https://www.washingtonpost.com/local/obituaries/notable-deaths-so-far-this-year/2017/01/17/750ed23a-dcf5-11e6-acdf-14da832ae861_gallery.html"}, 23 | {"weight":16,"rank":1.3854428993176182, "id":66, "url":"https://www.washingtonpost.com/powerpost/gops-insurgents-step-up-campaign-against-mcconnell/2017/10/25/ec3a5af4-b9a0-11e7-9e58-e6288544af98_story.html?tid=pm_pop"}, 24 | {"weight":10,"rank":0.006418872637825229, "id":74, "url":"https://www.washingtonpost.com/crossword-puzzles"}, 25 | {"weight":4,"rank":0.0, "id":80, "url":"https://www.washingtonpost.com/news/the-fix/wp/2017/10/26/trump-flubs-another-promise-declaring-the-opioid-crisis-a-national-emergency"}, 26 | {"weight":4,"rank":0.0, "id":83, "url":"https://www.washingtonpost.com/news/worldviews/wp/2017/10/26/whats-a-scary-superstition-in-your-part-of-the-world-share-it-with-us"}, 27 | {"weight":4,"rank":0.0, "id":90, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/will-republicans-regret-enabling-a-demagogue-my-iranian-parents-did"}], 28 | "links":[ 29 | {"source":0,"target":0,"value":3}, 30 | {"source":0,"target":1,"value":3}, 31 | {"source":0,"target":2,"value":3}, 32 | {"source":0,"target":3,"value":3}, 33 | {"source":0,"target":4,"value":3}, 34 | {"source":0,"target":5,"value":3}, 35 | {"source":0,"target":6,"value":3}, 36 | {"source":0,"target":7,"value":3}, 37 | {"source":0,"target":8,"value":3}, 38 | {"source":0,"target":9,"value":3}, 39 | {"source":0,"target":10,"value":3}, 40 | {"source":0,"target":11,"value":3}, 41 | {"source":0,"target":12,"value":3}, 42 | {"source":0,"target":13,"value":3}, 43 | {"source":0,"target":14,"value":3}, 44 | {"source":0,"target":15,"value":3}, 45 | {"source":0,"target":16,"value":3}, 46 | {"source":0,"target":17,"value":3}, 47 | {"source":0,"target":18,"value":3}, 48 | {"source":0,"target":19,"value":3}, 49 | {"source":0,"target":20,"value":3}, 50 | {"source":0,"target":21,"value":3}, 51 | {"source":0,"target":22,"value":3}, 52 | {"source":0,"target":23,"value":3}, 53 | {"source":0,"target":24,"value":3}, 54 | {"source":0,"target":25,"value":3}, 55 | {"source":7,"target":0,"value":3}, 56 | {"source":7,"target":7,"value":3}, 57 | {"source":9,"target":9,"value":3}, 58 | {"source":9,"target":0,"value":3}, 59 | {"source":9,"target":21,"value":3}, 60 | {"source":17,"target":0,"value":3}, 61 | {"source":17,"target":17,"value":3}, 62 | {"source":23,"target":0,"value":3}, 63 | {"source":23,"target":23,"value":3}, 64 | {"source":11,"target":11,"value":3}, 65 | {"source":11,"target":0,"value":3}, 66 | {"source":11,"target":21,"value":3}, 67 | {"source":24,"target":0,"value":3}, 68 | {"source":24,"target":24,"value":3}, 69 | {"source":19,"target":0,"value":3}, 70 | {"source":19,"target":19,"value":3}, 71 | {"source":22,"target":22,"value":3}, 72 | {"source":22,"target":0,"value":3}, 73 | {"source":8,"target":8,"value":3}, 74 | {"source":8,"target":0,"value":3}, 75 | {"source":8,"target":21,"value":3}, 76 | {"source":14,"target":0,"value":3}, 77 | {"source":14,"target":14,"value":3}, 78 | {"source":15,"target":15,"value":3}, 79 | {"source":15,"target":0,"value":3}, 80 | {"source":15,"target":21,"value":3}, 81 | {"source":4,"target":0,"value":3}, 82 | {"source":4,"target":4,"value":3}, 83 | 
{"source":13,"target":13,"value":3}, 84 | {"source":13,"target":0,"value":3}, 85 | {"source":13,"target":21,"value":3}, 86 | {"source":21,"target":0,"value":3}, 87 | {"source":21,"target":21,"value":3}, 88 | {"source":5,"target":0,"value":3}, 89 | {"source":5,"target":5,"value":3}, 90 | {"source":16,"target":16,"value":3}, 91 | {"source":16,"target":0,"value":3}, 92 | {"source":16,"target":21,"value":3}, 93 | {"source":18,"target":18,"value":3}, 94 | {"source":18,"target":0,"value":3}, 95 | {"source":18,"target":21,"value":3}, 96 | {"source":25,"target":0,"value":3}, 97 | {"source":25,"target":25,"value":3}, 98 | {"source":3,"target":0,"value":3}, 99 | {"source":3,"target":3,"value":3}, 100 | {"source":3,"target":21,"value":3}, 101 | {"source":12,"target":0,"value":3}, 102 | {"source":12,"target":12,"value":3}, 103 | {"source":20,"target":0,"value":3}, 104 | {"source":20,"target":20,"value":3}, 105 | {"source":1,"target":0,"value":3}, 106 | {"source":1,"target":1,"value":3}, 107 | {"source":6,"target":6,"value":3}, 108 | {"source":6,"target":0,"value":3}, 109 | {"source":6,"target":21,"value":3}, 110 | {"source":10,"target":0,"value":3}, 111 | {"source":10,"target":10,"value":3}, 112 | {"source":10,"target":11,"value":3}, 113 | {"source":2,"target":0,"value":3}, 114 | {"source":2,"target":2,"value":3}]}; -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? 
)', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? 
)', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spider.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? ")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Capstone/Pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | 
# Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. Check data.") 40 | quit() 41 | 42 | # Let's do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict() 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and send the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # as an indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=?
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 1/Hello World.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 08:59:26 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | print("Hello World!") -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:00:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter your name") 9 | print("Hello %s" % name) -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:01:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | hrs = input("Enter Hours:") 9 | rate = input("Enter Rate:") 10 | cost = float(hrs)*float(rate) 11 | 12 | print("Pay: %s" % cost) 13 | 14 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:03:50 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | try: 9 | h = input("please input your hour:") 10 | hour = float(h) 11 | r = input("please input your rate:") 12 | rate = float(r) 13 | if hour < 0: 14 | print("Please input a positive number") 15 | elif rate < 0: 16 | print("Please input a positive number") 17 | elif hour > 40: 18 | print("%.2f" % (40*rate+(hour-40)*1.5*rate)) 19 | else: 20 | print("%.2f" % (hour*rate)) 21 | except: 22 | print("Please input a number") 23 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:04:59 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | score = float(input("Enter a score between 0.0 and 1.0: ")) 9 | if score<0.0 or score>1.0: 10 | print("Error: Score out of range") 11 | if score<0.6: 12 | print("F") 13 | elif score>=0.6 and score<0.7: 14 | print("D") 15 | elif score>=0.7 and score<0.8: 16 | print("C") 17 | elif score>=0.8 and score<0.9: 18 | print("B") 19 | elif score>=0.9: 20 | print("A") 21 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 4/Assignment 4.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:07:25 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | def 
computepay(hrs,rps): 9 | final_pay = 0.0 10 | hours = float(hrs) 11 | rate_per_hour = float(rps) 12 | if hours>40: 13 | final_pay += 40 * float(rate_per_hour) 14 | hours -= 40 15 | final_pay += hours * rate_per_hour * 1.5 16 | else: 17 | final_pay += hours * rate_per_hour 18 | 19 | return final_pay 20 | 21 | 22 | hrs = input("Enter Hours: ") 23 | rate = input("Enter Rate: ") 24 | p = computepay(hrs,rate) 25 | print(p) 26 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 5/Assignment 5.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:09:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | largest = None 9 | smallest = None 10 | while True: 11 | try: 12 | num = input("Enter a number: ") 13 | if num == "done" : break 14 | n = int(num) 15 | if largest is None: 16 | largest = n 17 | if smallest is None: 18 | smallest = n 19 | if n > largest: 20 | largest = n 21 | if n < smallest: 22 | smallest = n 23 | except: 24 | print('Invalid input') 25 | 26 | print("Maximum is", largest) 27 | print("Minimum is", smallest) 28 | -------------------------------------------------------------------------------- /Python Data Structures/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Chapter 10/Assignment 10.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:29:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter file:") 9 | if len(name) < 1 : name = "mbox-short.txt" 10 | handle = open(name) 11 | a = handle.read() 12 | b = a.split("\n") 13 | d = [] 14 | for i in b: 15 | if i.startswith("From "): 16 | c = i.split(":") 17 | d.append(c[0][-2:]) 18 | d.sort() 19 | counts = {} 20 | for j in d: 21 | counts[j] = d.count(j) 22 | for k, l in counts.items(): 23 | print(k, l) 24 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 6/Assignment 6.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:11:42 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | text = "X-DSPAM-Confidence: 0.8475"; 9 | a = text[-6:] 10 | b = float(a) 11 | print(b) 12 | text.find(":") 13 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:12:55 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use words.txt as the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = fh.read() 12 | print(a.upper().rstrip()) -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:17:11 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use the file name mbox-short.txt as 
the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = [] 12 | for line in fh: 13 | if line.startswith("X-DSPAM-Confidence:"): 14 | a.append(float(line[-6:])) 15 | total = 0 16 | for i in a: 17 | total = total + i 18 | mean = total/(len(a)) 19 | print("Average spam confidence:", mean) 20 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vasts amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were reptitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | 26 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 
6 | """ 7 | 8 | get = input('Please enter file name:') 9 | handle = open(get) 10 | text = list() 11 | for line in handle: 12 | line = line.rstrip() 13 | line = line.split() 14 | for i in line: 15 | if i in text: 16 | continue 17 | else: 18 | text.append(i) 19 | text.sort() 20 | print(text) 21 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:25:10 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name:') 9 | handle = open(file) 10 | count = 0 11 | for line in handle: 12 | line = line.rstrip() 13 | if not line.startswith('From '): 14 | continue 15 | line = line.split() 16 | print(line[1]) 17 | count = count+1 18 | print('There were', count, 'lines in the file with From as the first word') 19 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | 6 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 9/Assignment 9.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:27:17 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name: ') 9 | handle = open(file) 10 | data1 = list() 11 | data2 = dict() 12 | for line in handle: 13 | line = line.rstrip() 14 | if not line.startswith('From '): 15 | continue 16 | line = line.split() 17 | line = line[1] 18 | data1.append(line) 19 | for i in data1: 20 | data2[i] = data2.get(i,0)+1 21 | 22 | word = None 23 | max = None 24 | 25 | for aa, bb in data2.items(): 26 | if max is None or bb > max: 27 | word = aa 28 | max = bb 29 | 30 | print(word, max) -------------------------------------------------------------------------------- /Python Data Structures/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Directory Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Everybody-Coursera 2 | Coursera courses for the Python for Everybody Specialization by the University of Michigan. This specialization teaches the fundamentals of getting started with Python. I came from a non-technical background myself and still found a way to learn the material. 
3 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 2/First Database.db -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db.sqbpro: -------------------------------------------------------------------------------- 1 |About this Map
45 |46 | This is a cool map from 47 | www.py4e.com. 48 |
49 | 50 | 51 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/where.png -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Directory Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 2/Week 2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:06:30 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import re 9 | pile = open('regex_sum_41647.txt') 10 | gold = pile.read() 11 | copier = re.findall("[0-9]+", gold) 12 | dice = [int(i) for i in copier] 13 | sum = 0 14 | for k in dice: 15 | sum += k 16 | print(sum) 17 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/Week 3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:27:29 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import socket 9 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 10 | mysock.connect(('data.pr4e.org', 80)) 11 | cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode() 12 | mysock.send(cmd) 13 | 14 | lit = list() 15 | while True: 16 | data = mysock.recv(512) 17 | lit.append(data) 18 | if (len(data) < 1): 19 | break 20 | print(data.decode()) 21 | mysock.close() 22 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 
11 | 12 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Following Links in HTML Using BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:12:49 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter URL: ') 16 | num = input('Enter count: ') 17 | pos = input('Enter position: ') 18 | print('Retrieving: ', url) 19 | for times in range(int(num)): 20 | html = urllib.request.urlopen(url, context=ctx).read() 21 | soup = BeautifulSoup(html, 'html.parser') 22 | tags = soup('a') 23 | print('Retrieving: ', tags[int(pos)-1].get('href', None)) 24 | url = tags[int(pos)-1].get('href', None) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Scraping HTML Data with BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:10:37 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | from urllib.request import urlopen 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | 12 | ctx = ssl.create_default_context() 13 | ctx.check_hostname = False 14 | ctx.verify_mode = ssl.CERT_NONE 15 | html = urlopen('http://py4e-data.dr-chuck.net/comments_41649.html', context=ctx).read() 16 | soup = BeautifulSoup(html, "html.parser") 17 | tags = soup('span') 18 | sum = 0 19 | coun = 0 20 | print('Enter - ') 21 | for tag in tags: 22 | coun += 1 23 | sum += int(tag.contents[0]) 24 | print('Count', coun, '\nSum', sum) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 5/Extracting Data from XML.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:43:48 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import xml.etree.ElementTree as ET 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter location: ') 16 | print ('Retrieving ', url) 17 | html = urllib.request.urlopen(url, context=ctx).read() 18 | print ('Retrieved', len(html), 'characters') 19 | tree = ET.fromstring(html) 20 | print ('Count: ',len(tree.findall('.//count'))) 21 | total = 0 22 | for r in tree.findall("./comments/comment"): 23 | total += int(r.find('count').text) 24 | print ('Sum: ', total) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/Extracting Data from JSON.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:45:18 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | url = input('Enter location: ') 12 | data = urllib.request.urlopen(url).read() 13 | info = json.loads(data) 14 | info = info['comments'] 15 | print ('Retrieving', url, '\nRetrieved', len(data), 'characters', '\nCount:', len(info)) 16 | num = 0 17 | for item in info: 18 | num += 
int(item['count']) 19 | print ('Sum:', num) 20 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/GEOSON API.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:46:57 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | serviceurl = 'http://python-data.dr-chuck.net/geojson' 12 | address = input('Enter location: ') 13 | url = serviceurl + '?' + urllib.parse.urlencode({'sensor':'false', 'address': address}) 14 | data = urllib.request.urlopen(url).read().decode() 15 | info = json.loads(data) 16 | info = info['results'] 17 | print ('Retrieving', url, '\nRetrieved', len(data), 'characters') 18 | for item in info: 19 | key = item['place_id'] 20 | print ('Place id:', key) 21 | --------------------------------------------------------------------------------