├── Capstone ├── Mailing List I │ ├── Content.sqlite Snapshot.PNG │ ├── Gline Visualization.PNG │ ├── Gmodel Index sqlite Screenshot.PNG │ ├── Gmodel.py Application Screenshot.PNG │ ├── Histogram gbasic.PNG │ ├── README.txt │ ├── Second Gline.PNG │ ├── Wordcloud.PNG │ ├── content.sqlite │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.js │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.js │ ├── gword.py │ ├── gyear.py │ ├── index.sqlite │ └── mapping.sqlite └── Pagerank │ ├── LICENSE │ ├── Pagerank Dr.Chuck.PNG │ ├── Pagerank WP.PNG │ ├── README.txt │ ├── bs4 │ ├── __init__.py │ ├── __init__.py.bak │ ├── builder │ │ ├── __init__.py │ │ ├── __init__.py.bak │ │ ├── _html5lib.py │ │ ├── _html5lib.py.bak │ │ ├── _htmlparser.py │ │ ├── _htmlparser.py.bak │ │ ├── _lxml.py │ │ └── _lxml.py.bak │ ├── dammit.py │ ├── dammit.py.bak │ ├── diagnose.py │ ├── diagnose.py.bak │ ├── element.py │ ├── element.py.bak │ ├── testing.py │ ├── testing.py.bak │ └── tests │ │ ├── __init__.py │ │ ├── test_builder_registry.py │ │ ├── test_docs.py │ │ ├── test_html5lib.py │ │ ├── test_html5lib.py.bak │ │ ├── test_htmlparser.py │ │ ├── test_lxml.py │ │ ├── test_lxml.py.bak │ │ ├── test_soup.py │ │ ├── test_soup.py.bak │ │ ├── test_tree.py │ │ └── test_tree.py.bak │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── pagerank orginal.PNG │ ├── spdump.py │ ├── spdump.py Dr. Chuck.PNG │ ├── spdump.py WP.PNG │ ├── spider.js │ ├── spider.py │ ├── spider.sqlite │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── Programming for Everybody ├── Chapter 1 │ └── Hello World.py ├── Chapter 2 │ ├── Assignment 2.2.py │ └── Assignment 2.3.py ├── Chapter 3 │ ├── Assignment 3.1.py │ └── Assignment 3.3.py ├── Chapter 4 │ └── Assignment 4.6.py └── Chapter 5 │ └── Assignment 5.2.py ├── Python Data Structures ├── Atom Editor Test.PNG ├── Chapter 10 │ ├── Assignment 10.2.py │ └── mbox-short.txt ├── Chapter 6 │ └── Assignment 6.5.py ├── Chapter 7 │ ├── Assignment 7.1.py │ ├── Assignment 7.2.py │ ├── mbox-short.txt │ └── words.txt ├── Chapter 8 │ ├── Assignment 8.4.py │ ├── Assignment 8.5.py │ ├── mbox-short.txt │ └── romeo.txt ├── Chapter 9 │ ├── Assignment 9.4.py │ └── mbox-short.txt ├── Directory Test.PNG └── Test.py ├── README.md ├── Using Databases with Python ├── Week 2 │ ├── First Database.db │ ├── First Database.db.sqbpro │ ├── First Database.sql │ ├── emaildb.py │ ├── emaildb.sqlite │ ├── mbox-short.txt │ └── mbox.txt ├── Week 3 │ ├── Library.xml │ ├── README.txt │ ├── trackdb.sqlite │ ├── tracks.py │ ├── trackscomplete.py │ └── tracksdb.py ├── Week 4 │ ├── HW Result.sql │ ├── roster.py │ ├── roster_data.json │ └── rosterdb.sqlite └── Week 5 │ ├── Google API Key.doc │ ├── README.txt │ ├── geodata.sqlite │ ├── geodump.png │ ├── geodump.py │ ├── geoload.png │ ├── geoload.py │ ├── where.data │ ├── where.html │ ├── where.js │ └── where.png └── Using Python to Access Web Data ├── Week 1 ├── Atom Editor Test.PNG ├── Directory Test.PNG └── Test.py ├── Week 2 ├── Week 2.py └── regex_sum_41647.txt ├── Week 3 ├── Week 3.py └── intro-short.txt ├── Week 4 ├── Following Links in HTML Using BeautifulSoup.py └── Scraping HTML Data with BeautifulSoup.py ├── Week 5 └── Extracting Data from XML.py └── Week 6 ├── Extracting Data from JSON.py └── GEOSON API.py /Capstone/Mailing List I/Content.sqlite Snapshot.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Content.sqlite Snapshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gline Visualization.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gline Visualization.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel Index sqlite Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Gmodel.py Application Screenshot.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Histogram gbasic.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Histogram gbasic.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and visualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in gmane.py and points to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base URL. Make sure to delete the content.sqlite file if you 25 | switch the base URL. The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down, so you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py.
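If you want a quick sense of how far the spidering has gotten before you restart gmane.py, a minimal sketch like the following prints the message count and the highest message id retrieved so far. It only assumes the Messages table that gmane.py creates and the default file name content.sqlite in the current folder:

    import sqlite3

    # Open the raw spider database created by gmane.py
    conn = sqlite3.connect('content.sqlite')
    cur = conn.cursor()

    # Count the rows and find the highest message id spidered so far
    cur.execute('SELECT COUNT(*), MAX(id) FROM Messages')
    count, maxid = cur.fetchone()
    print('Messages retrieved:', count, 'highest id:', maxid)
    conn.close()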
39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console, so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | Sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs, it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process.
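Because the headers and body in index.sqlite are stored as zlib-compressed BLOBs (see gmodel.py below), you need to decompress them when you read them back for your own analysis. Here is a minimal sketch, assuming gmodel.py has already built index.sqlite with its default Messages table, that prints the start of the body of one message:

    import sqlite3
    import zlib

    # Open the cleaned-up database produced by gmodel.py
    conn = sqlite3.connect('index.sqlite')
    cur = conn.cursor()

    # headers and body are stored as zlib-compressed BLOBs
    cur.execute('SELECT id, body FROM Messages LIMIT 1')
    row = cur.fetchone()
    if row is not None:
        body = zlib.decompress(row[1]).decode()
        print('Message', row[0])
        print(body[:200])   # first 200 characters
    conn.close()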
100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaning steps: 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net; 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also, mail addresses are 118 | forced to lower case, and some of the @gmane.org addresses like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database, there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table: 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS name. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together. 149 | 150 | You can re-run gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to ask "who does the most?" and "which 156 | organization does the most?". This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py.
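You do not have to use the SQLite browser to add mappings; a small script works too. Here is a minimal sketch, assuming only the old/new columns that gmodel.py selects from the Mapping table (note that the copy of gmodel.py in this folder actually reads the mappings from mapping.sqlite, so point the connect() call at whichever file holds your Mapping and DNSMapping tables):

    import sqlite3

    # Open the database that holds the Mapping / DNSMapping tables
    conn = sqlite3.connect('mapping.sqlite')
    cur = conn.cursor()

    # Collapse two older addresses onto the one address we want to keep
    cur.execute('INSERT INTO Mapping (old, new) VALUES (?, ?)',
                ('s-githens@northwestern.edu', 'swgithen@mtu.edu'))
    cur.execute('INSERT INTO Mapping (old, new) VALUES (?, ?)',
                ('sgithens@cam.ac.uk', 'swgithen@mtu.edu'))

    conn.commit()
    conn.close()

Then re-run gmodel.py to rebuild index.sqlite with the new mappings applied.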
181 | 182 | There is a simple visualization of the word frequency in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/Second Gline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Second Gline.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/Wordcloud.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/Wordcloud.PNG -------------------------------------------------------------------------------- /Capstone/Mailing List I/content.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/content.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.js: -------------------------------------------------------------------------------- 1 | gline = [ ['Month','umich.edu','gmail.com','swinsborg.com','cam.ac.uk','uct.ac.za','indiana.edu','unicon.net','berkeley.edu','longsight.com','stanford.edu'], 2 | ['2005-12',57,10,0,7,14,12,6,12,0,4], 3 | ['2006-01',114,23,0,19,27,32,10,33,0,13], 4 | ['2006-02',121,28,0,28,23,33,7,28,0,24], 5 | ['2006-03',86,27,0,44,18,43,11,34,1,14], 6 | ['2006-04',125,24,0,54,38,47,42,44,1,25], 7 | ['2006-05',151,26,0,103,51,55,101,76,2,22], 8 | ['2006-06',119,30,0,76,66,70,37,32,0,13], 9 | ['2006-07',86,19,0,47,55,76,37,18,0,39], 10 | ['2006-08',136,70,0,46,44,102,50,20,0,21], 11 | ['2006-09',131,46,0,36,26,46,28,27,0,32], 12 | ['2006-10',109,28,0,74,20,52,35,30,0,31], 13 | ['2006-11',87,55,0,51,47,36,24,35,0,16], 14 | ['2006-12',54,58,0,21,13,46,8,26,0,15], 15 | ['2007-01',84,32,0,42,35,43,10,24,0,26], 16 | ['2007-02',114,51,0,59,44,54,10,18,0,30], 17 | ['2007-03',93,45,4,54,38,64,4,46,0,34], 18 | ['2007-04',68,54,1,46,25,72,10,24,0,27], 19 | ['2007-05',98,45,17,61,61,41,16,62,0,39], 20 | ['2007-06',115,43,3,58,36,39,33,50,0,38], 21 | ['2007-07',126,53,28,89,69,28,59,45,0,34], 22 | ['2007-08',182,37,21,102,50,63,48,46,0,28], 23 | ['2007-09',167,52,31,132,70,47,98,53,0,58], 24 | ['2007-10',104,62,41,97,47,44,100,41,0,73], 25 | ['2007-11',110,46,22,142,51,40,90,41,0,23], 26 | ['2007-12',151,71,21,123,36,33,67,35,0,17], 27 | ['2008-01',126,49,17,64,32,42,24,39,0,15], 28 | ['2008-02',94,39,51,89,30,34,19,14,0,23], 29 | ['2008-03',89,45,14,43,42,39,29,19,0,27], 30 | ['2008-04',140,58,22,99,50,27,57,40,0,26], 31 | ['2008-05',130,60,44,104,36,17,93,26,0,19], 32 | ['2008-06',96,28,13,36,33,21,31,25,0,5], 33 | ['2008-07',115,32,24,75,55,22,59,30,2,7], 34 | ['2008-08',165,42,31,80,65,23,41,39,5,13], 35 | ['2008-09',119,54,31,35,35,44,28,25,0,10], 36 | ['2008-10',85,40,33,60,31,27,33,15,4,3], 37 | ['2008-11',43,23,19,26,19,12,17,11,1,5], 38 | ['2008-12',67,30,18,17,22,12,18,6,3,4], 39 | ['2009-01',46,16,18,19,27,3,1,9,0,4], 40 | ['2009-02',23,43,38,26,17,15,5,9,1,5], 41 | ['2009-03',94,76,56,5,27,18,19,6,7,9], 42 | ['2009-04',74,101,43,2,28,18,42,5,8,10], 43 | ['2009-05',49,122,61,6,29,16,25,13,10,12], 44 | ['2009-06',43,64,41,4,29,11,8,27,8,4], 45 | ['2009-07',67,99,50,12,32,20,21,27,12,4], 46 | ['2009-08',42,59,17,13,35,25,12,8,13,3], 47 | ['2009-09',71,42,23,8,33,22,11,6,9,19], 48 | ['2009-10',77,69,85,4,50,43,29,3,9,8], 49 | ['2009-11',55,40,46,10,26,26,14,11,6,9], 50 | ['2009-12',43,34,26,2,21,16,11,2,3,4], 51 | ['2010-01',57,29,39,3,26,19,10,3,17,10], 52 | ['2010-02',54,36,42,12,22,21,16,2,16,1], 53 | ['2010-03',72,89,53,12,38,22,18,8,14,13], 54 | ['2010-04',41,38,30,2,18,9,15,9,17,24], 55 | ['2010-05',50,32,47,3,34,10,8,2,8,9], 56 | ['2010-06',28,56,47,10,18,12,7,2,19,14], 57 | ['2010-07',53,57,54,10,42,9,4,3,11,10], 58 | ['2010-08',57,47,36,13,41,18,11,1,15,8], 59 | ['2010-09',58,44,34,4,22,21,3,3,18,4], 60 | ['2010-10',42,41,18,2,12,4,4,4,14,11], 61 | ['2010-11',41,34,23,5,13,10,4,0,7,1], 62 | ['2010-12',26,32,13,2,11,8,7,1,7,3], 63 | ['2011-01',35,47,46,5,20,7,2,2,22,6], 64 | ['2011-02',30,58,51,2,15,9,17,1,18,6], 65 | ['2011-03',60,86,54,10,12,17,15,2,52,11], 66 | ['2011-04',38,45,25,3,6,21,6,0,19,3], 67 | ['2011-05',18,39,15,9,13,14,8,1,19,3], 68 | ['2011-06',30,89,22,4,22,10,13,0,7,4], 69 | ['2011-07',45,69,73,5,18,16,6,1,39,5], 70 | ['2011-08',42,45,37,9,13,13,4,4,48,10], 71 | 
['2011-09',40,80,28,5,11,16,12,6,33,18], 72 | ['2011-10',23,59,26,7,4,11,12,0,34,6], 73 | ['2011-11',30,86,42,5,16,9,3,1,23,1], 74 | ['2011-12',26,30,31,2,7,5,5,0,21,13], 75 | ['2012-01',32,54,28,1,20,8,8,0,32,10], 76 | ['2012-02',37,85,62,5,24,10,18,4,36,22], 77 | ['2012-03',42,84,47,2,19,4,19,1,56,9], 78 | ['2012-04',24,50,52,0,11,4,15,1,45,10], 79 | ['2012-05',24,92,66,1,9,12,18,1,36,12], 80 | ['2012-06',53,90,51,2,12,14,54,1,47,1], 81 | ['2012-07',13,59,47,7,7,5,8,2,40,5], 82 | ['2012-08',17,61,51,1,7,10,18,1,41,1], 83 | ['2012-09',21,44,35,1,14,11,16,0,46,9], 84 | ['2012-10',21,51,36,3,19,6,10,0,63,9], 85 | ['2012-11',29,63,62,2,15,20,11,0,71,5], 86 | ['2012-12',12,29,35,0,5,8,9,0,31,9], 87 | ['2013-01',39,44,40,3,5,3,3,0,50,6], 88 | ['2013-02',59,97,66,0,5,25,13,0,54,8], 89 | ['2013-03',60,122,66,1,12,33,30,0,72,11], 90 | ['2013-04',9,34,18,0,4,4,8,0,17,2] 91 | ]; 92 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import 
datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 
153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # 
print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM Messages ORDER BY sent_at''') 192 | 193 | 
senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.js: -------------------------------------------------------------------------------- 1 | gword = [{text: 'sakai', size: 100}, 2 | {text: 'building', size: 71}, 3 | {text: 'tool', size: 26}, 4 | {text: 'with', size: 26}, 5 | {text: 'site', size: 24}, 6 | {text: 'problem', size: 23}, 7 | {text: 'error', size: 22}, 8 | {text: 'from', size: 22}, 9 | {text: 'question', size: 22}, 10 | {text: 'samigo', size: 22}, 11 | {text: 'build', size: 21}, 12 | {text: 'release', size: 21}, 13 | {text: 'trunk', size: 21}, 14 | {text: 'using', size: 21}, 15 | {text: 'resources', size: 21}, 16 | {text: 'issue', size: 21}, 17 | {text: 'user', size: 21}, 18 | {text: 'help', size: 21}, 19 | {text: 'tools', size: 21}, 20 | {text: 'melete', size: 21}, 21 | {text: 'problems', size: 21}, 22 | {text: 'gradebook', size: 21}, 23 | {text: 'maven', size: 21}, 24 | {text: 'mysql', size: 21}, 25 | {text: 'course', size: 21}, 26 | {text: 'tomcat', size: 20}, 27 | {text: 'about', size: 20}, 28 | {text: 'deploying', size: 20}, 29 | {text: 'content', size: 20}, 30 | {text: 'sites', size: 20}, 31 | {text: 'integration', size: 20}, 32 | {text: 'users', size: 20}, 33 | {text: 'email', size: 20}, 34 | {text: 'jira', size: 20}, 35 | {text: 'issues', size: 20}, 36 | {text: 'portal', size: 20}, 37 | {text: 'upgrade', size: 20}, 38 | {text: 'broken', size: 20}, 39 | {text: 'update', size: 20}, 40 | {text: 'change', size: 20}, 41 | {text: 'file', size: 20}, 42 | {text: 'assignment', size: 20}, 43 | {text: 'when', size: 20}, 44 | {text: 'search', size: 20}, 45 | {text: 'code', size: 20}, 46 | {text: 'service', size: 20}, 47 | {text: 'management', size: 20}, 48 | {text: 'webdav', size: 20}, 49 | {text: 'test', size: 20}, 50 | {text: 'errors', size: 20}, 51 | {text: 'oracle', size: 20}, 52 | {text: 'assignments', size: 20}, 53 | {text: 'files', size: 20}, 54 | {text: 'profile', size: 20}, 55 | {text: 'production', size: 20}, 56 | {text: 'page', size: 20}, 57 | {text: 'version', size: 20}, 58 | {text: 'database', size: 20}, 59 | {text: 'hibernate', size: 20}, 60 | {text: 'java', size: 20}, 61 | {text: 'chat', size: 20}, 62 | {text: 'changes', size: 20}, 63 | {text: 'ldap', size: 20}, 64 | {text: 'project', size: 20}, 65 | {text: 'questions', size: 20}, 66 | {text: 'login', size: 20}, 67 | {text: 'testing', size: 20}, 68 | {text: 'info', size: 20}, 69 | {text: 'startup', size: 20}, 70 | {text: 'data', size: 20}, 71 | {text: 'conversion', size: 20}, 72 | {text: 'jforum', size: 20}, 73 | {text: 'performance', size: 20}, 74 | {text: 'kernel', size: 20}, 75 | {text: 'adding', size: 20}, 76 | {text: 'support', size: 20}, 77 | {text: 'import', size: 20}, 78 | {text: 'call', size: 20}, 79 | {text: 'nightly', size: 20}, 80 | {text: 'running', size: 20}, 81 | {text: 'access', size: 20}, 82 | {text: 'branch', size: 20}, 83 | {text: 'into', size: 20}, 84 | {text: 'multiple', size: 20}, 85 | {text: 'message', size: 20}, 86 | {text: 'default', size: 20}, 87 | {text: 'status', size: 
20}, 88 | {text: 'source', size: 20}, 89 | {text: 'create', size: 20}, 90 | {text: 'wiki', size: 20}, 91 | {text: 'scorm', size: 20}, 92 | {text: 'setup', size: 20}, 93 | {text: 'what', size: 20}, 94 | {text: 'more', size: 20}, 95 | {text: 'does', size: 20}, 96 | {text: 'configuration', size: 20}, 97 | {text: 'down', size: 20}, 98 | {text: 'list', size: 20}, 99 | {text: 'getting', size: 20}, 100 | {text: 'server', size: 20} 101 | ]; 102 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the vizualization") 57 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, 
key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Oranizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Capstone/Mailing List I/index.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/index.sqlite -------------------------------------------------------------------------------- /Capstone/Mailing List I/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Mailing List I/mapping.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank Dr.Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank Dr.Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/Pagerank WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/Pagerank WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 
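The core of each crawl step is simply: fetch a page, then collect every href on it so the links between pages can be recorded in the database. Here is a minimal, self-contained sketch of that step using urllib and the BeautifulSoup copy bundled in the bs4 folder; it is a simplified illustration of the idea, not the actual spider.py code, and the URL is just an example:

    import urllib.request
    from bs4 import BeautifulSoup

    # Fetch one page and parse it
    url = 'http://www.dr-chuck.com/'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Collect the outgoing links - spider.py records these in spider.sqlite
    links = [tag.get('href') for tag in soup('a') if tag.get('href') is not None]
    print(len(links), 'links found on', url)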
58 | 59 | If you want to dump the contents of the spider.sqlite file, you can 60 | run spdump.py as follows: 61 | 62 | Mac: python3 spdump.py 63 | Win: spdump.py 64 | 65 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 66 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 67 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 68 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 69 | 4 rows. 70 | 71 | This shows the number of incoming links, the old page rank, the new page 72 | rank, the id of the page, and the url of the page. The spdump.py program 73 | only shows pages that have at least one incoming link to them. 74 | 75 | Once you have a few pages in the database, you can run Page Rank on the 76 | pages using the sprank.py program. You simply tell it how many Page 77 | Rank iterations to run. 78 | 79 | Mac: python3 sprank.py 80 | Win: sprank.py 81 | 82 | How many iterations:2 83 | 1 0.546848992536 84 | 2 0.226714939664 85 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 86 | 87 | You can dump the database again to see that page rank has been updated: 88 | 89 | Mac: python3 spdump.py 90 | Win: spdump.py 91 | 92 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 93 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 94 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 95 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 96 | 4 rows. 97 | 98 | You can run sprank.py as many times as you like and it will simply refine 99 | the page rank the more times you run it. You can even run sprank.py a few times 100 | and then go spider a few more pages with spider.py and then run sprank.py 101 | to converge the page ranks. 102 | 103 | If you want to restart the Page Rank calculations without re-spidering the 104 | web pages, you can use spreset.py. 105 | 106 | Mac: python3 spreset.py 107 | Win: spreset.py 108 | 109 | All pages set to a rank of 1.0 110 | 111 | Mac: python3 sprank.py 112 | Win: sprank.py 113 | 114 | How many iterations:50 115 | 1 0.546848992536 116 | 2 0.226714939664 117 | 3 0.0659516187242 118 | 4 0.0244199333 119 | 5 0.0102096489546 120 | 6 0.00610244329379 121 | ... 122 | 42 0.000109076928206 123 | 43 9.91987599002e-05 124 | 44 9.02151706798e-05 125 | 45 8.20451504471e-05 126 | 46 7.46150183837e-05 127 | 47 6.7857770908e-05 128 | 48 6.17124694224e-05 129 | 49 5.61236959327e-05 130 | 50 5.10410499467e-05 131 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 132 | 133 | For each iteration of the page rank algorithm, it prints the average 134 | change per page of the page rank. The network initially is quite 135 | unbalanced and so the individual page ranks are changing wildly. 136 | But in a few short iterations, the page rank converges. You 137 | should run sprank.py long enough that the page ranks converge. 138 | 139 | If you want to visualize the current top pages in terms of page rank, 140 | run spjson.py to write the pages out in JSON format to be viewed in a 141 | web browser. 142 | 143 | Mac: python3 spjson.py 144 | Win: spjson.py 145 | 146 | Creating JSON output on spider.js... 147 | How many nodes? 30 148 | Open force.html in a browser to view the visualization 149 | 150 | You can view this data by opening the file force.html in your web browser. 151 | This shows an automatic layout of the nodes and links.
You can click and 152 | drag any node and you can also double click on a node to find the URL 153 | that is represented by the node. 154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js. 161 | 162 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 
95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a

<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p />

". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in list(attrs.keys()): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, str): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in tags that say what encoding the 268 | # document was originally in. This means HTML 5-style 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last result. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from html.parser import HTMLParser 8 | 9 | try: 10 | from html.parser import HTMLParseError 11 | except ImportError as e: 12 | # HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 
73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = chr(real_name) 83 | except (ValueError, OverflowError) as e: 84 | data = "\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. "" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 147 | """ 148 | if isinstance(markup, str): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError as e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_htmlparser.py.bak: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from HTMLParser import HTMLParser 8 | 9 | try: 10 | from HTMLParser import HTMLParseError 11 | except ImportError, e: 12 | # 
HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = unichr(real_name) 83 | except (ValueError, OverflowError), e: 84 | data = u"\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. 
"" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 147 | """ 148 | if isinstance(markup, unicode): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError, e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | 
ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, str): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, str): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 
105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, str): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in list(nsmap.items()): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in list(attrs.items()): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return '\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return '%s' % fragment 249 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/builder/_lxml.py.bak: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from StringIO import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 
75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, unicode): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, unicode): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, unicode): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in nsmap.items(): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in attrs.items(): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return u'\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return u'%s' % fragment 249 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from io import StringIO 4 | from html.parser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print("Diagnostic running on Beautiful Soup %s" % __version__) 21 | print("Python version %s" % sys.version) 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print(( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name)) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 39 | except ImportError as e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print("Found html5lib version %s" % html5lib.__version__) 48 | except ImportError as e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print('"%s" looks like a filename. Reading data from the file.' % data) 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) 59 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") 60 | return 61 | print() 62 | 63 | for parser in basic_parsers: 64 | print("Trying to parse your markup with %s" % parser) 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception as e: 70 | print("%s could not parse the markup." 
% parser) 71 | traceback.print_exc() 72 | if success: 73 | print("Here's what %s did with the markup:" % parser) 74 | print(soup.prettify()) 75 | 76 | print("-" * 80) 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 169 | data = rdoc(num_elements) 170 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception as e: 180 | print("%s could not parse the markup." % parser) 181 | traceback.print_exc() 182 | if success: 183 | print("BS4+%s parsed the markup in %.2fs." 
% (parser, b-a)) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/diagnose.py.bak: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from StringIO import StringIO 4 | from HTMLParser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 39 | except ImportError, e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print "Found html5lib version %s" % html5lib.__version__ 48 | except ImportError, e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print '"%s" looks like a filename. Reading data from the file.' % data 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 59 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 60 | return 61 | print 62 | 63 | for parser in basic_parsers: 64 | print "Trying to parse your markup with %s" % parser 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception, e: 70 | print "%s could not parse the markup." 
% parser 71 | traceback.print_exc() 72 | if success: 73 | print "Here's what %s did with the markup:" % parser 74 | print soup.prettify() 75 | 76 | print "-" * 80 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print("%s, %4s, %s" % (event, element.tag, element.text)) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 169 | data = rdoc(num_elements) 170 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception, e: 180 | print "%s could not parse the markup." % parser 181 | traceback.print_exc() 182 | if success: 183 | print "BS4+%s parsed the markup in %.2fs." 
% (parser, b-a) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print "Raw lxml parsed the markup in %.2fs." % (b-a) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print "Raw html5lib parsed the markup in %.2fs." % (b-a) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not 
explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 
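# Aside (not part of the original test file): a minimal sketch of the parser
# lookups that test_builder_registry.py above exercises. "html.parser" always
# ships with the standard library; lookups that need lxml or html5lib only
# succeed when those optional packages are installed. The names below are
# illustrative only.
from bs4 import BeautifulSoup as _DemoSoup
_by_name = _DemoSoup("<p>demo</p>", features="html.parser")  # lookup by builder name
_by_type = _DemoSoup("<p>demo</p>", features="html")         # lookup by markup type
assert _by_name.p.text == "demo" and _by_type.p.text == "demo"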
2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual("

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "
<p>A <b>bold</b> statement.</p>
" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from pdb import set_trace 5 | import pickle 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 7 | from bs4.builder import HTMLParserTreeBuilder 8 | 9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 10 | 11 | @property 12 | def default_builder(self): 13 | return HTMLParserTreeBuilder() 14 | 15 | def test_namespaced_system_doctype(self): 16 | # html.parser can't handle namespaced doctypes, so skip this one. 17 | pass 18 | 19 | def test_namespaced_public_doctype(self): 20 | # html.parser can't handle namespaced doctypes, so skip this one. 21 | pass 22 | 23 | def test_builder_is_pickled(self): 24 | """Unlike most tree builders, HTMLParserTreeBuilder and will 25 | be restored after pickling. 26 | """ 27 | tree = self.soup("foo") 28 | dumped = pickle.dumps(tree, 2) 29 | loaded = pickle.loads(dumped) 30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError as e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>") 44 | self.assertSoupEquals( 45 | "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>") 46 | self.assertSoupEquals( 47 | "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual("", str(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/bs4/tests/test_lxml.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>") 44 | self.assertSoupEquals( 45 | "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>") 46 | self.assertSoupEquals( 47 | "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /Capstone/Pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /Capstone/Pagerank/pagerank orginal.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/pagerank orginal.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print(row) 14 | count = count + 1 15 | print(count, 'rows.') 16 | cur.close() 17 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py Dr. Chuck.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py Dr. 
Chuck.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spdump.py WP.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spdump.py WP.PNG -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":126,"rank":19.0, "id":6, "url":"https://www.washingtonpost.com"}, 3 | {"weight":7,"rank":0.1367546748927559, "id":12, "url":"https://www.washingtonpost.com/news/arts-and-entertainment/wp/2017/10/26/megyn-kelly-responds-to-those-bill-oreilly-thank-you-notes-its-right-out-of-the-playbook"}, 4 | {"weight":7,"rank":0.1367546748927559, "id":14, "url":"https://www.washingtonpost.com/news/politics/wp/2017/09/25/a-trump-judicial-pick-said-transgender-children-are-proof-that-satans-plan-is-working/?utm_term=.8e0dac432ca8"}, 5 | {"weight":12,"rank":0.1367546748927559, "id":17, "url":"https://www.washingtonpost.com/news/animalia/wp/2017/10/26/this-bears-tongue-was-monstrously-enlarged-a-team-of-vets-gave-him-a-second-chance-at-life"}, 6 | {"weight":7,"rank":0.1367546748927559, "id":18, "url":"https://www.washingtonpost.com/news/speaking-of-science/wp/2017/10/25/this-bug-is-real-and-now-so-are-your-nightmares"}, 7 | {"weight":4,"rank":0.0, "id":20, "url":"https://www.washingtonpost.com/news/early-lead/wp/2017/10/26/joe-girardi-out-as-yankees-manager-and-the-nationals-just-happen-to-have-an-opening"}, 8 | {"weight":9,"rank":0.11777961277413226, "id":22, "url":"https://www.washingtonpost.com/people/mike-debonis"}, 9 | {"weight":4,"rank":0.0, "id":25, "url":"https://www.washingtonpost.com/news/post-politics/wp/2017/10/26/trump-plans-to-declare-the-opioid-crisis-a-public-health-emergency"}, 10 | {"weight":5,"rank":0.0, "id":32, "url":"https://www.washingtonpost.com/people/samantha-schmidt"}, 11 | {"weight":5,"rank":0.0, "id":38, "url":"https://www.washingtonpost.com/people/erik-wemple"}, 12 | {"weight":5,"rank":0.0, "id":41, "url":"https://www.washingtonpost.com/news/the-watch/wp/2017/10/26/mississippi-judge-resigns-after-barring-mother-from-seeing-newborn-because-of-unpaid-court-fees"}, 13 | {"weight":7,"rank":0.2567572862009351, "id":42, "url":"https://www.washingtonpost.com/people/radley-balko"}, 14 | {"weight":8,"rank":0.0, "id":43, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/george-w-bush-is-not-the-resistance-hes-part-of-what-brought-us-trump"}, 15 | {"weight":5,"rank":0.0, "id":45, "url":"https://www.washingtonpost.com/people/damian-paletta"}, 16 | {"weight":7,"rank":0.10600165188954895, "id":47, "url":"https://www.washingtonpost.com/powerpost/booker-graham-and-elias-expected-to-testify-today-in-menendez-corruption-trial/2017/10/26/34d9219c-ba4a-11e7-9e58-e6288544af98_story.html"}, 17 | {"weight":5,"rank":0.0, "id":51, "url":"https://www.washingtonpost.com/people/anna-fifield"}, 18 | {"weight":5,"rank":0.0, "id":53, "url":"https://www.washingtonpost.com/people/william-booth"}, 19 | {"weight":7,"rank":0.018975062118623574, "id":54, "url":"https://www.washingtonpost.com/local/virginia-politics/trump-again-tweets-on-virginias-governors-race-says-northam-will-be-very-weak-on-crime/2017/10/26/4c5d5ea6-ba58-11e7-be94-fabb0f1e9ffb_story.html"}, 20 | {"weight":5,"rank":0.0, "id":57, 
"url":"https://www.washingtonpost.com/people/perry-stein"}, 21 | {"weight":5,"rank":0.0, "id":59, "url":"https://www.washingtonpost.com/news/wonk/wp/2017/10/26/why-mcdonalds-is-beating-out-the-fresh-healthy-competition"}, 22 | {"weight":9,"rank":0.0, "id":64, "url":"https://www.washingtonpost.com/local/obituaries/notable-deaths-so-far-this-year/2017/01/17/750ed23a-dcf5-11e6-acdf-14da832ae861_gallery.html"}, 23 | {"weight":16,"rank":1.3854428993176182, "id":66, "url":"https://www.washingtonpost.com/powerpost/gops-insurgents-step-up-campaign-against-mcconnell/2017/10/25/ec3a5af4-b9a0-11e7-9e58-e6288544af98_story.html?tid=pm_pop"}, 24 | {"weight":10,"rank":0.006418872637825229, "id":74, "url":"https://www.washingtonpost.com/crossword-puzzles"}, 25 | {"weight":4,"rank":0.0, "id":80, "url":"https://www.washingtonpost.com/news/the-fix/wp/2017/10/26/trump-flubs-another-promise-declaring-the-opioid-crisis-a-national-emergency"}, 26 | {"weight":4,"rank":0.0, "id":83, "url":"https://www.washingtonpost.com/news/worldviews/wp/2017/10/26/whats-a-scary-superstition-in-your-part-of-the-world-share-it-with-us"}, 27 | {"weight":4,"rank":0.0, "id":90, "url":"https://www.washingtonpost.com/news/posteverything/wp/2017/10/26/will-republicans-regret-enabling-a-demagogue-my-iranian-parents-did"}], 28 | "links":[ 29 | {"source":0,"target":0,"value":3}, 30 | {"source":0,"target":1,"value":3}, 31 | {"source":0,"target":2,"value":3}, 32 | {"source":0,"target":3,"value":3}, 33 | {"source":0,"target":4,"value":3}, 34 | {"source":0,"target":5,"value":3}, 35 | {"source":0,"target":6,"value":3}, 36 | {"source":0,"target":7,"value":3}, 37 | {"source":0,"target":8,"value":3}, 38 | {"source":0,"target":9,"value":3}, 39 | {"source":0,"target":10,"value":3}, 40 | {"source":0,"target":11,"value":3}, 41 | {"source":0,"target":12,"value":3}, 42 | {"source":0,"target":13,"value":3}, 43 | {"source":0,"target":14,"value":3}, 44 | {"source":0,"target":15,"value":3}, 45 | {"source":0,"target":16,"value":3}, 46 | {"source":0,"target":17,"value":3}, 47 | {"source":0,"target":18,"value":3}, 48 | {"source":0,"target":19,"value":3}, 49 | {"source":0,"target":20,"value":3}, 50 | {"source":0,"target":21,"value":3}, 51 | {"source":0,"target":22,"value":3}, 52 | {"source":0,"target":23,"value":3}, 53 | {"source":0,"target":24,"value":3}, 54 | {"source":0,"target":25,"value":3}, 55 | {"source":7,"target":0,"value":3}, 56 | {"source":7,"target":7,"value":3}, 57 | {"source":9,"target":9,"value":3}, 58 | {"source":9,"target":0,"value":3}, 59 | {"source":9,"target":21,"value":3}, 60 | {"source":17,"target":0,"value":3}, 61 | {"source":17,"target":17,"value":3}, 62 | {"source":23,"target":0,"value":3}, 63 | {"source":23,"target":23,"value":3}, 64 | {"source":11,"target":11,"value":3}, 65 | {"source":11,"target":0,"value":3}, 66 | {"source":11,"target":21,"value":3}, 67 | {"source":24,"target":0,"value":3}, 68 | {"source":24,"target":24,"value":3}, 69 | {"source":19,"target":0,"value":3}, 70 | {"source":19,"target":19,"value":3}, 71 | {"source":22,"target":22,"value":3}, 72 | {"source":22,"target":0,"value":3}, 73 | {"source":8,"target":8,"value":3}, 74 | {"source":8,"target":0,"value":3}, 75 | {"source":8,"target":21,"value":3}, 76 | {"source":14,"target":0,"value":3}, 77 | {"source":14,"target":14,"value":3}, 78 | {"source":15,"target":15,"value":3}, 79 | {"source":15,"target":0,"value":3}, 80 | {"source":15,"target":21,"value":3}, 81 | {"source":4,"target":0,"value":3}, 82 | {"source":4,"target":4,"value":3}, 83 | 
{"source":13,"target":13,"value":3}, 84 | {"source":13,"target":0,"value":3}, 85 | {"source":13,"target":21,"value":3}, 86 | {"source":21,"target":0,"value":3}, 87 | {"source":21,"target":21,"value":3}, 88 | {"source":5,"target":0,"value":3}, 89 | {"source":5,"target":5,"value":3}, 90 | {"source":16,"target":16,"value":3}, 91 | {"source":16,"target":0,"value":3}, 92 | {"source":16,"target":21,"value":3}, 93 | {"source":18,"target":18,"value":3}, 94 | {"source":18,"target":0,"value":3}, 95 | {"source":18,"target":21,"value":3}, 96 | {"source":25,"target":0,"value":3}, 97 | {"source":25,"target":25,"value":3}, 98 | {"source":3,"target":0,"value":3}, 99 | {"source":3,"target":3,"value":3}, 100 | {"source":3,"target":21,"value":3}, 101 | {"source":12,"target":0,"value":3}, 102 | {"source":12,"target":12,"value":3}, 103 | {"source":20,"target":0,"value":3}, 104 | {"source":20,"target":20,"value":3}, 105 | {"source":1,"target":0,"value":3}, 106 | {"source":1,"target":1,"value":3}, 107 | {"source":6,"target":6,"value":3}, 108 | {"source":6,"target":0,"value":3}, 109 | {"source":6,"target":21,"value":3}, 110 | {"source":10,"target":0,"value":3}, 111 | {"source":10,"target":10,"value":3}, 112 | {"source":10,"target":11,"value":3}, 113 | {"source":2,"target":0,"value":3}, 114 | {"source":2,"target":2,"value":3}]}; -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? 
)', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? 
)', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spider.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Capstone/Pagerank/spider.sqlite -------------------------------------------------------------------------------- /Capstone/Pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? ")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Capstone/Pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | 
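# Aside (not part of the original script): a tiny worked example, with
# hypothetical names, of the rank-distribution step performed below. Each node
# splits its current rank evenly across its outbound links; the later
# "evaporation" step redistributes any rank that nodes without qualifying
# outbound links failed to pass on.
_demo_links = [(1, 2), (1, 3), (2, 1), (3, 1)]
_demo_prev = {1: 1.0, 2: 1.0, 3: 1.0}
_demo_next = {n: 0.0 for n in _demo_prev}
for _node, _old in _demo_prev.items():
    _gives = [_t for (_f, _t) in _demo_links if _f == _node]
    for _t in _gives:
        _demo_next[_t] += _old / len(_gives)
# Node 1 sends 0.5 to each of nodes 2 and 3; nodes 2 and 3 each send 1.0 to
# node 1, so _demo_next == {1: 2.0, 2: 0.5, 3: 0.5} and the total stays 3.0.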
# Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. Check data.") 40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=? 
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Capstone/Pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 1/Hello World.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 08:59:26 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | print("Hello World!") -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:00:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter your name") 9 | print("Hello %s" % name) -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 2/Assignment 2.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:01:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | hrs = input("Enter Hours:") 9 | rate = input("Enter Rate:") 10 | cost = float(hrs)*float(rate) 11 | 12 | print("Pay: %s" % cost) 13 | 14 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:03:50 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | try: 9 | h = input("please input your hour:") 10 | hour = float(h) 11 | r = input("please input your rate:") 12 | rate = float(r) 13 | if hour < 0: 14 | print("Please,input your positive numberic") 15 | elif rate < 0: 16 | print("Please,input your positive numberic") 17 | elif hour > 40: 18 | print("%.2f" % (40*rate+(hour-40)*1.5*rate)) 19 | else: 20 | print("%.2f" % (hour*rate)) 21 | except: 22 | print("Please,input your numberic") 23 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 3/Assignment 3.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:04:59 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | score = float(input("Enter a score between 0.0 and 1.0: ")) 9 | if score<0.0 or score>1.0: 10 | print("Error: Score out of range") 11 | if score<0.6: 12 | print("F") 13 | elif score>=0.6 and score<0.7: 14 | print("D") 15 | elif score>=0.7 and score<0.8: 16 | print("C") 17 | elif score>=0.8 and score<0.9: 18 | print("B") 19 | elif score>=0.9: 20 | print("A") 21 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 4/Assignment 4.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:07:25 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | def 
computepay(hrs,rps): 9 | final_pay = 0.0 10 | hours = float(hrs) 11 | rate_per_hour = float(rps) 12 | if hours>40: 13 | final_pay += 40 * float(rate_per_hour) 14 | hours -= 40 15 | final_pay += hours * rate_per_hour * 1.5 16 | else: 17 | final_pay += hours * rate_per_hour 18 | 19 | return final_pay 20 | 21 | 22 | hrs = input("Enter Hours: ") 23 | rate = input("Enter Rate: ") 24 | p = computepay(hrs,rate) 25 | print(p) 26 | -------------------------------------------------------------------------------- /Programming for Everybody/Chapter 5/Assignment 5.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:09:05 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | largest = None 9 | smallest = None 10 | while True: 11 | try: 12 | num = input("Enter a number: ") 13 | if num == "done" : break 14 | n = int(num) 15 | if largest is None: 16 | largest = n 17 | if smallest is None: 18 | smallest = n 19 | if n > largest: 20 | largest = n 21 | if n < smallest: 22 | smallest = n 23 | except: 24 | print('Invalid input') 25 | 26 | print("Maximum is", largest) 27 | print("Minimum is", smallest) 28 | -------------------------------------------------------------------------------- /Python Data Structures/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Chapter 10/Assignment 10.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:29:38 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | name = input("Enter file:") 9 | if len(name) < 1 : name = "mbox-short.txt" 10 | handle = open(name) 11 | a = handle.read() 12 | b = a.split("\n") 13 | d = [] 14 | for i in b: 15 | if i.startswith("From "): 16 | c = i.split(":") 17 | d.append(c[0][-2:]) 18 | d.sort() 19 | counts = {} 20 | for j in d: 21 | counts[j] = d.count(j) 22 | for k, l in counts.items(): 23 | print(k, l) 24 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 6/Assignment 6.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:11:42 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | text = "X-DSPAM-Confidence: 0.8475"; 9 | a = text[-6:] 10 | b = float(a) 11 | print(b) 12 | text.find(":") 13 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:12:55 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use words.txt as the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = fh.read() 12 | print(a.upper().rstrip()) -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/Assignment 7.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:17:11 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | # Use the file name mbox-short.txt as 
the file name 9 | fname = input("Enter file name: ") 10 | fh = open(fname) 11 | a = [] 12 | for line in fh: 13 | if line.startswith("X-DSPAM-Confidence:"): 14 | a.append(float(line[-6:])) 15 | total = 0 16 | for i in a: 17 | total = total + i 18 | mean = total/(len(a)) 19 | print("Average spam confidence:", mean) 20 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 7/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vasts amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were reptitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | 26 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 
6 | """ 7 | 8 | get = input('Please enter file name:') 9 | handle = open(get) 10 | text = list() 11 | for line in handle: 12 | line = line.rstrip() 13 | line = line.split() 14 | for i in line: 15 | if i in text: 16 | continue 17 | else: 18 | text.append(i) 19 | text.sort() 20 | print(text) 21 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/Assignment 8.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:25:10 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name:') 9 | handle = open(file) 10 | count = 0 11 | for line in handle: 12 | line = line.rstrip() 13 | if not line.startswith('From '): 14 | continue 15 | line = line.split() 16 | print(line[1]) 17 | count = count+1 18 | print('There were', count, 'lines in the file with From as the first word') 19 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 8/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | 6 | -------------------------------------------------------------------------------- /Python Data Structures/Chapter 9/Assignment 9.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 09:27:17 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | file = input('Please enter file name: ') 9 | handle = open(file) 10 | data1 = list() 11 | data2 = dict() 12 | for line in handle: 13 | line = line.rstrip() 14 | if not line.startswith('From '): 15 | continue 16 | line = line.split() 17 | line = line[1] 18 | data1.append(line) 19 | for i in data1: 20 | data2[i] = data2.get(i,0)+1 21 | 22 | word = None 23 | max = None 24 | 25 | for aa, bb in data2.items(): 26 | if max is None or bb > max: 27 | word = aa 28 | max = bb 29 | 30 | print(word, max) -------------------------------------------------------------------------------- /Python Data Structures/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Python Data Structures/Directory Test.PNG -------------------------------------------------------------------------------- /Python Data Structures/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Everybody-Coursera 2 | Coursera courses for the Python for Everybody Specialization by the University of Michigan. This specialization teaches the fundamentals on how to get started on learning to use Python. I for myself started out in a non-technical background and found a way to learn the material. 
3 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 2/First Database.db -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.db.sqbpro: -------------------------------------------------------------------------------- 1 | CREATE TABLE Ages ( 2 | name VARCHAR(128), 3 | age INTEGER 4 | ) 5 | DELETE FROM Ages; 6 | INSERT INTO Ages (name, age) VALUES ('Davie', 20); 7 | INSERT INTO Ages (name, age) VALUES ('Daanyaal', 20); 8 | INSERT INTO Ages (name, age) VALUES ('Ireayomide', 19); 9 | INSERT INTO Ages (name, age) VALUES ('Jagat', 34); 10 | SELECT hex(name || age) AS X FROM Ages ORDER BY X 11 | 12 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/First Database.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE Ages ( 2 | name VARCHAR(128), 3 | age INTEGER 4 | ) 5 | DELETE FROM Ages; 6 | INSERT INTO Ages (name, age) VALUES ('Davie', 20); 7 | INSERT INTO Ages (name, age) VALUES ('Daanyaal', 20); 8 | INSERT INTO Ages (name, age) VALUES ('Ireayomide', 19); 9 | INSERT INTO Ages (name, age) VALUES ('Jagat', 34); 10 | 11 | SELECT hex(name || age) AS X FROM Ages ORDER BY X 12 | 13 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/emaildb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:12:07 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import sqlite3 9 | 10 | conn = sqlite3.connect('emaildb.sqlite') 11 | cur = conn.cursor() 12 | 13 | cur.execute(''' 14 | DROP TABLE IF EXISTS Counts''') 15 | 16 | cur.execute(''' 17 | CREATE TABLE Counts (org TEXT, count INTEGER)''') 18 | 19 | fname = input('Enter file name: ') 20 | if (len(fname) < 1): fname = 'mbox-short.txt' 21 | fh = open(fname) 22 | for line in fh: 23 | if not line.startswith('From: '): continue 24 | pieces = line.split() 25 | org = pieces[1].split('@')[1] 26 | cur.execute('SELECT count FROM Counts WHERE org = ? 
', (org,)) 27 | row = cur.fetchone() 28 | if row is None: 29 | cur.execute('''INSERT INTO Counts (org, count) 30 | VALUES (?, 1)''', (org,)) 31 | else: 32 | cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?', 33 | (org,)) 34 | conn.commit() 35 | 36 | # https://www.sqlite.org/lang_select.html 37 | sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10' 38 | 39 | for row in cur.execute(sqlstr): 40 | print(str(row[0]), row[1]) 41 | 42 | cur.close() 43 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 2/emaildb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 2/emaildb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/README.txt: -------------------------------------------------------------------------------- 1 | To export your own Library.xml from iTunes 2 | 3 | File -> Library -> Export Library 4 | 5 | Make sure it is in the correct folder. Of course iTUnes might change 6 | UI and/or export format any time - so good luck :) 7 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/trackdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 3/trackdb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/tracks.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('trackdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Make some fresh tables using executescript() 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS Artist; 10 | DROP TABLE IF EXISTS Album; 11 | DROP TABLE IF EXISTS Track; 12 | CREATE TABLE Artist ( 13 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 14 | name TEXT UNIQUE 15 | ); 16 | CREATE TABLE Genre ( 17 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 18 | name TEXT UNIQUE 19 | ); 20 | CREATE TABLE Album ( 21 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 22 | artist_id INTEGER, 23 | title TEXT UNIQUE 24 | ); 25 | CREATE TABLE Track ( 26 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 27 | title TEXT UNIQUE, 28 | album_id INTEGER, 29 | genre_id INTEGER, 30 | len INTEGER, rating INTEGER, count INTEGER 31 | ); 32 | ''') 33 | 34 | 35 | fname = input('Enter file name: ') 36 | if (len(fname) < 1 ) : fname = 'Library.xml' 37 | 38 | # Track ID369 39 | # NameAnother One Bites The Dust 40 | # ArtistQueen 41 | def lookup(d, key): 42 | found = False 43 | for child in d: 44 | if found : return child.text 45 | if child.tag == 'key' and child.text == key : 46 | found = True 47 | return None 48 | 49 | stuff = ET.parse(fname) 50 | all = stuff.findall('dict/dict/dict') 51 | print('Dict count:', len(all)) 52 | for entry in all: 53 | if ( lookup(entry, 'Track ID') is None ) : continue 54 | 55 | name = lookup(entry, 'Name') 56 | artist = lookup(entry, 'Artist') 57 | album = lookup(entry, 'Album') 58 | count = lookup(entry, 'Play Count') 59 | rating = lookup(entry, 'Rating') 60 | length = 
lookup(entry, 'Total Time') 61 | genre = lookup(entry, 'Genre') 62 | 63 | if name is None or artist is None or album is None or genre is None : 64 | continue 65 | 66 | print(name, artist, album, count, rating, length, genre) 67 | 68 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 69 | VALUES ( ? )''', ( artist, ) ) 70 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 71 | artist_id = cur.fetchone()[0] 72 | 73 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 74 | VALUES ( ? )''', ( genre, ) ) 75 | cur.execute('SELECT id FROM Genre WHERE name = ?', (genre, )) 76 | genre_id = cur.fetchone()[0] 77 | 78 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 79 | VALUES ( ?, ? )''', ( album, artist_id ) ) 80 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 81 | album_id = cur.fetchone()[0] 82 | 83 | cur.execute('''INSERT OR REPLACE INTO Track 84 | (title, album_id, len, rating, count, genre_id) 85 | VALUES ( ?, ?, ?, ?, ?, ?)''', 86 | ( name, album_id, length, rating, count, genre_id) ) 87 | 88 | conn.commit() 89 | 90 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/trackscomplete.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:52:41 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import xml.etree.ElementTree as ET 9 | #parses the xml into element form 10 | import sqlite3 11 | #creates if does not exist database for tracks/if it does just establishes connection to the db 12 | conn = sqlite3.connect('Trackdb.sqlite') 13 | #establishes connection to the database/ 14 | cur = conn.cursor() 15 | 16 | #clears all tables out so no conflicting or bad info to begin 17 | cur.executescript(''' 18 | DROP TABLE IF EXISTS Artist; 19 | DROP TABLE IF EXISTS Album; 20 | DROP TABLE IF EXISTS Genre; 21 | DROP TABLE IF EXISTS Track;''') 22 | 23 | # creates the table 'Artist' id key autoincrements and must be unique, the artists name is stored there and will be referenced by table downstream 24 | cur.execute(''' CREATE TABLE Artist ( 25 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 26 | name TEXT UNIQUE 27 | );''') 28 | 29 | cur.execute(''' CREATE TABLE Genre ( 30 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 31 | name TEXT UNIQUE 32 | );''') # each line starts with the name of the prospective column followed by the schema thats being set for it entries 33 | # 34 | cur.execute(''' CREATE TABLE Album ( 35 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 36 | artist_id INTEGER, 37 | title TEXT UNIQUE 38 | );''') 39 | cur.execute(''' CREATE TABLE Track ( 40 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 41 | title TEXT UNIQUE, 42 | album_id INTEGER, 43 | genre_id INTEGER, 44 | len INTEGER, 45 | rating INTEGER, 46 | count INTEGER 47 | );''') 48 | 49 | 50 | tree = ET.parse(r'C:\Users\\atse\Documents\Python for Everybody Specialization\Using Databases with Python\Week 3\Library.xml') 51 | 52 | root = tree.getroot() 53 | # . 
- selects current node (plist) most useful at beginning path 54 | #/dict/dict//dict - then goes dict branch then next child then // 55 | # //dict selects all child elements 'dict' within that wrung of tree (ladder) 56 | itlist = root.findall('./dict/dict//dict') 57 | 58 | def lookup(dic, key): 59 | found = False 60 | for child in dic: 61 | if found: return child.text 62 | if child.tag == 'key' and child.text == key: 63 | found = True 64 | return None 65 | 66 | print('Dict count', len(itlist)) 67 | for entry in itlist: 68 | if (lookup(entry,'Track ID') is None) : continue 69 | 70 | name = lookup(entry, 'Name') 71 | artist = lookup(entry, 'Artist') 72 | album = lookup(entry, 'Album') 73 | genre = lookup(entry, 'Genre') 74 | length = lookup(entry, 'Total Time') 75 | count = lookup(entry, 'Play Count') 76 | rating = lookup(entry, 'Rating') 77 | 78 | if name is None or artist is None or album is None or genre is None : continue 79 | 80 | 81 | print(name, artist, album, count, rating, length) 82 | #insert the new row into DB and table, then specify colum 83 | #to place information into, BUT b/c using variable 84 | #need to use ? placeholder, b/c value is in a variable 85 | #and not directly entered into the VALUES field 86 | cur.execute('''INSERT or IGNORE INTO Artist (name) 87 | VALUES (?)''', (artist,)) 88 | # grabs the row with the corresponding artist name, impt ONLY 1 though 89 | # selects this info, this way, because no human error, computer handles 90 | # getting the value, and need the value because going to use it in the 91 | # following entry 92 | cur.execute('SELECT id FROM Artist WHERE name = ?', (artist,)) 93 | artist_id = cur.fetchone()[0] 94 | 95 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 96 | VALUES (?)''', ( genre, ) ) 97 | 98 | cur.execute('SELECT id FROM Genre WHERE name = ?', ( genre, ) ) 99 | genre_id = cur.fetchone()[0] 100 | 101 | cur.execute('''INSERT or IGNORE INTO Album (title, artist_id) 102 | VALUES (?,?)''', (album, artist_id)) 103 | cur.execute('SELECT id FROM Album WHERE title = ?', (album,)) 104 | # good way to think here is that row is actually what the cursor is pointing at 105 | album_id = cur.fetchone()[0] 106 | 107 | cur.execute('''INSERT OR REPLACE INTO Track 108 | (title, album_id, genre_id, len, rating, count) 109 | VALUES (?,?,?,?,?,?)''', 110 | (name, album_id, genre_id, length, rating, count)) 111 | 112 | conn.commit() -------------------------------------------------------------------------------- /Using Databases with Python/Week 3/tracksdb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 09:59:12 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import xml.etree.ElementTree as ET 9 | import sqlite3 10 | 11 | conn = sqlite3.connect('trackdb.sqlite') 12 | cur = conn.cursor() 13 | 14 | # Make some fresh tables using executescript() 15 | cur.executescript(''' 16 | DROP TABLE IF EXISTS Artist; 17 | DROP TABLE IF EXISTS Genre; 18 | DROP TABLE IF EXISTS Album; 19 | DROP TABLE IF EXISTS Track; 20 | CREATE TABLE Artist ( 21 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 22 | name TEXT UNIQUE 23 | ); 24 | CREATE TABLE Genre ( 25 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 26 | name TEXT UNIQUE 27 | ); 28 | CREATE TABLE Album ( 29 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 30 | artist_id INTEGER, 31 | title TEXT UNIQUE 32 | ); 33 | CREATE TABLE Track ( 34 | id INTEGER NOT NULL PRIMARY KEY 35 | AUTOINCREMENT UNIQUE, 36 | title 
TEXT UNIQUE, 37 | album_id INTEGER, 38 | genre_id INTEGER, 39 | len INTEGER, rating INTEGER, count INTEGER 40 | ); 41 | ''') 42 | 43 | 44 | fname = input('Enter file name: ') 45 | if ( len(fname) < 1 ) : fname = 'Library.xml' 46 | 47 | # Track ID369 48 | # NameAnother One Bites The Dust 49 | # ArtistQueen 50 | def lookup(d, key): 51 | found = False 52 | for child in d: 53 | if found : return child.text 54 | if child.tag == 'key' and child.text == key : 55 | found = True 56 | return None 57 | 58 | stuff = ET.parse(fname) 59 | all = stuff.findall('dict/dict/dict') 60 | print ('Dict count:', len(all)) 61 | for entry in all: 62 | if ( lookup(entry, 'Track ID') is None ) : continue 63 | 64 | name = lookup(entry, 'Name') 65 | artist = lookup(entry, 'Artist') 66 | album = lookup(entry, 'Album') 67 | genre = lookup(entry, 'Genre') 68 | count = lookup(entry, 'Play Count') 69 | rating = lookup(entry, 'Rating') 70 | length = lookup(entry, 'Total Time') 71 | 72 | if name is None or artist is None or genre is None or album is None : 73 | continue 74 | 75 | print(name, artist, album, genre, count, rating, length) 76 | 77 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 78 | VALUES ( ? )''', ( artist, ) ) 79 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 80 | artist_id = cur.fetchone()[0] 81 | 82 | cur.execute('''INSERT OR IGNORE INTO Genre (name) 83 | VALUES ( ? )''', ( genre, ) ) 84 | cur.execute('SELECT id FROM Genre WHERE name = ? ', (genre, )) 85 | genre_id = cur.fetchone()[0] 86 | 87 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 88 | VALUES ( ?, ? )''', ( album, artist_id ) ) 89 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 90 | album_id = cur.fetchone()[0] 91 | 92 | cur.execute('''INSERT OR REPLACE INTO Track 93 | (title, album_id, genre_id, len, rating, count) 94 | VALUES ( ?, ?, ?, ?, ?, ? 
)''', 95 | ( name, album_id, genre_id, length, rating, count ) ) 96 | 97 | conn.commit() -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/HW Result.sql: -------------------------------------------------------------------------------- 1 | SELECT hex(User.name || Course.title || Member.role ) AS X FROM 2 | User JOIN Member JOIN Course 3 | ON User.id = Member.user_id AND Member.course_id = Course.id 4 | ORDER BY X 5 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/roster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 23 20:23:04 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import json 9 | import sqlite3 10 | 11 | conn = sqlite3.connect('rosterdb.sqlite') 12 | cur = conn.cursor() 13 | 14 | # Do some setup 15 | cur.executescript(''' 16 | DROP TABLE IF EXISTS User; 17 | DROP TABLE IF EXISTS Member; 18 | DROP TABLE IF EXISTS Course; 19 | CREATE TABLE User ( 20 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 21 | name TEXT UNIQUE 22 | ); 23 | CREATE TABLE Course ( 24 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 25 | title TEXT UNIQUE 26 | ); 27 | CREATE TABLE Member ( 28 | user_id INTEGER, 29 | course_id INTEGER, 30 | role INTEGER, 31 | PRIMARY KEY (user_id, course_id) 32 | ) 33 | ''') 34 | 35 | fname = input('Enter file name: ') 36 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 37 | 38 | # [ 39 | # [ "Charley", "si110", 1 ], 40 | # [ "Mea", "si110", 0 ], 41 | 42 | str_data = open(fname).read() 43 | json_data = json.loads(str_data) 44 | 45 | for entry in json_data: 46 | 47 | name = entry[0]; 48 | title = entry[1]; 49 | role = entry[2]; 50 | 51 | print(name, title, role) 52 | 53 | cur.execute('''INSERT OR IGNORE INTO User (name) 54 | VALUES ( ? )''', ( name, ) ) 55 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 56 | user_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR IGNORE INTO Course (title) 59 | VALUES ( ? )''', ( title, ) ) 60 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 61 | course_id = cur.fetchone()[0] 62 | 63 | cur.execute('''INSERT OR REPLACE INTO Member 64 | (user_id, course_id, role) VALUES ( ?, ?, ? 
)''', 65 | ( user_id, course_id, role ) ) 66 | 67 | conn.commit() 68 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 4/rosterdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 4/rosterdb.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/Google API Key.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/Google API Key.doc -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Places API with a Database and 2 | Visualizing Data on Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | Note: Windows has difficulty in displaying UTF-8 characters 10 | in the console so for each command window you open, you may need 11 | to type the following command before running this code: 12 | 13 | chcp 65001 14 | 15 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 16 | 17 | 18 | You should install the SQLite browser to view and modify 19 | the databases from: 20 | 21 | http://sqlitebrowser.org/ 22 | 23 | The first problem to solve is that the Google geocoding 24 | API is rate limited to a fixed number of requests per day. 25 | So if you have a lot of data you might need to stop and 26 | restart the lookup process several times. So we break 27 | the problem into two phases. 28 | 29 | In the first phase we take our input data in the file 30 | (where.data) and read it one line at a time, and retrieve the 31 | geocoded response and store it in a database (geodata.sqlite). 32 | Before we use the geocoding API, we simply check to see if 33 | we already have the data for that particular line of input. 34 | 35 | You can re-start the process at any time by removing the file 36 | geodata.sqlite 37 | 38 | Run the geoload.py program. This program will read the input 39 | lines in where.data and for each line check to see if it is already 40 | in the database and if we don't have the data for the location, 41 | call the geocoding API to retrieve the data and store it in 42 | the database. 43 | 44 | As of December 2016, the Google Geocoding APIs changed dramatically. 45 | They moved some functionality that we use from the Geocoding API 46 | into the Places API. Also all the Google Geo-related APIs require an 47 | API key. To complete this assignment without a Google account, 48 | without an API key, or from a country that blocks 49 | access to Google, you can use a subset of that data which is 50 | available at: 51 | 52 | http://py4e-data.dr-chuck.net/geojson 53 | 54 | To use this, simply leave the api_key set to False in 55 | geoload.py. 56 | 57 | This URL only has a subset of the data but it has no rate limit so 58 | it is good for testing. 
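If you just want to sanity-check the fallback service before running
the full loader, a single lookup can be done by hand. Below is a minimal
sketch (not part of the assignment); the address parameter mirrors the
sample URLs shown further down, and the response shape is the same one
geodump.py parses:

    import urllib.request, urllib.parse, json

    serviceurl = 'http://py4e-data.dr-chuck.net/geojson?'
    url = serviceurl + urllib.parse.urlencode({'address': 'Monash University'})
    js = json.loads(urllib.request.urlopen(url).read().decode())
    # 'status' and 'results' are the keys geodump.py relies on
    if js.get('status') == 'OK':
        print(js['results'][0]['formatted_address'])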
59 | 60 | If you want to try this with the API key, follow the 61 | instructions at: 62 | 63 | https://developers.google.com/maps/documentation/geocoding/intro 64 | 65 | and put the API key in the code. 66 | 67 | Here is a sample run after there is already some data in the 68 | database: 69 | 70 | Mac: python3 geoload.py 71 | Win: geoload.py 72 | 73 | Found in database Northeastern University 74 | 75 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 76 | 77 | Found in database Technion 78 | 79 | Found in database Viswakarma Institute, Pune, India 80 | 81 | Found in database UMD 82 | 83 | Found in database Tufts University 84 | 85 | Resolving Monash University 86 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Monash+University 87 | Retrieved 2063 characters { "results" : [ 88 | {u'status': u'OK', u'results': ... } 89 | 90 | Resolving Kokshetau Institute of Economics and Management 91 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Kokshetau+Institute+of+Economics+and+Management 92 | Retrieved 1749 characters { "results" : [ 93 | {u'status': u'OK', u'results': ... } 94 | 95 | The first five locations are already in the database and so they 96 | are skipped. The program scans to the point where it finds un-retrieved 97 | locations and starts retrieving them. 98 | 99 | The geoload.py program can be stopped at any time, and there is a counter 100 | that you can use to limit the number of calls to the geocoding 101 | API for each run. 102 | 103 | Once you have some data loaded into geodata.sqlite, you can 104 | visualize the data using the (geodump.py) program. This 105 | program reads the database and writes the file (where.js) 106 | with the location, latitude, and longitude in the form of 107 | executable JavaScript code. 108 | 109 | A run of the geodump.py program is as follows: 110 | 111 | Mac: python3 geodump.py 112 | Win: geodump.py 113 | 114 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 115 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 116 | ... 117 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 118 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 119 | Kokshetau, Kazakhstan 53.2833333 69.3833333 120 | ... 121 | 12 records written to where.js 122 | Open where.html to view the data in a browser 123 | 124 | The file (where.html) consists of HTML and JavaScript to visualize 125 | a Google Map. It reads the most recent data in where.js to get 126 | the data to be visualized. Here is the format of the where.js file: 127 | 128 | myData = [ 129 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 130 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 131 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 132 | ... 133 | ]; 134 | 135 | This is a JavaScript list of lists. The syntax for JavaScript 136 | list constants is very similar to Python so the syntax should 137 | be familiar to you. 138 | 139 | Simply open where.html in a browser to see the locations. You 140 | can hover over each map pin to find the location that the 141 | geocoding API returned for the user-entered input. If you 142 | cannot see any data when you open the where.html file, you might 143 | want to check the JavaScript or developer console for your browser.
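As a quick check of how much has been cached before you run geodump.py,
you can count the rows in the database. This is a small helper, not part
of the assignment, and it assumes the Locations table that geoload.py
creates (address TEXT, geodata TEXT):

    import sqlite3

    conn = sqlite3.connect('geodata.sqlite')
    cur = conn.cursor()
    cur.execute('SELECT COUNT(*) FROM Locations')   # table created by geoload.py
    print(cur.fetchone()[0], 'addresses cached in geodata.sqlite')
    conn.close()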
144 | 145 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodata.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geodata.sqlite -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geodump.png -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js', 'w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur : 13 | data = str(row[1].decode()) 14 | try: js = json.loads(str(data)) 15 | except: continue 16 | 17 | if not('status' in js and js['status'] == 'OK') : continue 18 | 19 | lat = js["results"][0]["geometry"]["location"]["lat"] 20 | lng = js["results"][0]["geometry"]["location"]["lng"] 21 | if lat == 0 or lng == 0 : continue 22 | where = js['results'][0]['formatted_address'] 23 | where = where.replace("'", "") 24 | try : 25 | print(where, lat, lng) 26 | 27 | count = count + 1 28 | if count > 1 : fhand.write(",\n") 29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 30 | fhand.write(output) 31 | except: 32 | continue 33 | 34 | fhand.write("\n];\n") 35 | cur.close() 36 | fhand.close() 37 | print(count, "records written to where.js") 38 | print("Open where.html to view the data in a browser") 39 | 40 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geoload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/geoload.png -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/geoload.py: -------------------------------------------------------------------------------- 1 | import urllib.request, urllib.parse, urllib.error 2 | import http 3 | import sqlite3 4 | import json 5 | import time 6 | import ssl 7 | import sys 8 | 9 | api_key = False 10 | # If you have a Google Places API key, enter it here 11 | api_key = 'AIzaSyDT8qBVoIMqfs6VvXZynQ9YbutG3kDOdmA ' 12 | 13 | if api_key is False: 14 | serviceurl = "http://py4e-data.dr-chuck.net/geojson?" 15 | else : 16 | serviceurl = "https://maps.googleapis.com/maps/api/place/textsearch/json?" 
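# Note: when api_key is set above, requests go to the Google Places Text
# Search endpoint and the key is added as the 'key' query parameter further
# down; leaving api_key = False switches to the py4e mirror, which needs no
# key and is not rate limited. If keyed requests fail, it may be worth
# checking the key string above for stray whitespace before suspecting quota.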
17 | 18 | # Additional detail for urllib 19 | # http.client.HTTPConnection.debuglevel = 1 20 | 21 | conn = sqlite3.connect('geodata.sqlite') 22 | cur = conn.cursor() 23 | 24 | cur.execute(''' 25 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''') 26 | 27 | # Ignore SSL certificate errors 28 | ctx = ssl.create_default_context() 29 | ctx.check_hostname = False 30 | ctx.verify_mode = ssl.CERT_NONE 31 | 32 | fh = open("where.data") 33 | count = 0 34 | for line in fh: 35 | if count > 200 : 36 | print('Retrieved 200 locations, restart to retrieve more') 37 | break 38 | 39 | address = line.strip() 40 | print('') 41 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", 42 | (memoryview(address.encode()), )) 43 | 44 | try: 45 | data = cur.fetchone()[0] 46 | print("Found in database ",address) 47 | continue 48 | except: 49 | pass 50 | 51 | parms = dict() 52 | parms["query"] = address 53 | if api_key is not False: parms['key'] = api_key 54 | url = serviceurl + urllib.parse.urlencode(parms) 55 | 56 | print('Retrieving', url) 57 | uh = urllib.request.urlopen(url, context=ctx) 58 | data = uh.read().decode() 59 | print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' ')) 60 | count = count + 1 61 | 62 | try: 63 | js = json.loads(data) 64 | except: 65 | print(data) # We print in case unicode causes an error 66 | continue 67 | 68 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 69 | print('==== Failure To Retrieve ====') 70 | print(data) 71 | break 72 | 73 | cur.execute('''INSERT INTO Locations (address, geodata) 74 | VALUES ( ?, ? )''', (memoryview(address.encode()), memoryview(data.encode()) ) ) 75 | conn.commit() 76 | if count % 10 == 0 : 77 | print('Pausing for a bit...') 78 | time.sleep(5) 79 | 80 | print("Run geodump.py to read the data from the database so you can vizualize it on a map.") 81 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.data: -------------------------------------------------------------------------------- 1 | AGH University of Science and Technology 2 | Academy of Fine Arts Warsaw Poland 3 | American University in Cairo 4 | Arizona State University 5 | Athens Information Technology 6 | BITS Pilani 7 | Babcock University 8 | Banaras Hindu University 9 | Bangalore University 10 | Baylor University 11 | Beijing normal university 12 | Belarusian State University 13 | Belgrade University 14 | Beloit College 15 | Belorussian State University 16 | Ben Gurion University 17 | Bharthidasan University 18 | Boston University 19 | California Polytechnic State University of San Luis Obispo 20 | California State University San Bernardino 21 | City of Westminster College 22 | Columbia University 23 | Cranfield University 24 | Czech Technical University in Prague 25 | Dartmouth 26 | De Anza College 27 | Distant University of Hagen 28 | Dnipropetrovsk National University 29 | Dokuz Eylul University 30 | Drexel 31 | Drexel University and University of Texas at Austin 32 | Duke University 33 | EM Lyon 34 | Ecole centrale de PARIS 35 | Elon University 36 | Erhvervsakademi Sydvest 37 | Escuela Superior Politecnica del Litoral 38 | Fachhochschule Dusseldorf 39 | Fachhochschule FH Salzburg 40 | Faculdade de Tecnologia do Estado de Sao Paulo 41 | Faculty of Technical Sciences Novi Sad Serbia 42 | Farmingdale State University 43 | Federal University of Minas Gerais 44 | Florida Atlantic University 45 | Franklin Pierce College 46 | Gauhati University 
47 | George Mason University 48 | Georgetown University Law Center 49 | Georgia State University 50 | Grandville 51 | Groep T University 52 | Hanoi University of Science and Technology 53 | Hebrew University 54 | IIIT Hyderabad 55 | IIT KANPUR 56 | IT College of Estonia 57 | IU 58 | IUAV Venezia 59 | Illinois Institute of Technology 60 | Illinois State University Joliet Junior College 61 | Indian Institute of Technology 62 | Indian Institute of Technology Kharagpur India 63 | Indian School of Mines Dhanbad 64 | Indiana University 65 | Indiana University at Bloomington 66 | Institut Superieur de technologies 67 | Institute of Business and Modern Technologies 68 | Instituto Tecnologico de Santo Domingo 69 | International Institute of Information Technology Hyderabad 70 | Irkutsk State University 71 | JADAVPUR UNIVERSITY 72 | Jawaharlal Nehru Technological University 73 | Jawaharlal Nehru University 74 | Jordan University of Science and Technology 75 | K-State 76 | KUL 77 | Kalamazoo College 78 | Kaunas Technology University 79 | Kaunas university of technology 80 | Kazan Federal University 81 | Kent State University 82 | Kharkiv State Academy of Municipal Economy Ukraine 83 | King Mongkuts University of Technology Thonburi 84 | Kokshetau Institute of Economics and Management 85 | Kyiv Polytechnic Institute 86 | Kyiv Polytechnical Institute 87 | Kyiv Unisersity of Oriental Language 88 | Laurentian University 89 | Lisandro Alvarado 90 | Lodz University of Technology 91 | Lviv University 92 | MSU 93 | Madras university 94 | Magnitogorsk State Technical University 95 | Malayer Azad University 96 | Marietta College 97 | Masdar Institute 98 | Matematicki fakultet Beograd 99 | Michigan State University 100 | Middle East Technical University 101 | Missouri University of Science and Technology 102 | Monash 103 | Monash University 104 | Monash University Churchill Australia 105 | Monterrey Institute of Technology and Higher Education 106 | Moscow Engineering-Physics Institute 107 | Moscow Institute of Physics & Technology 108 | Moscow State University 109 | NIT ROURKELA 110 | NYU 111 | Nagpur University 112 | Nanyang Technological University 113 | National Institute of Technology Jalandhar 114 | National Taiwan University 115 | National University of Engineering 116 | North Central College 117 | Northeastern University 118 | Northwestern University 119 | Obninsk Technical University of Nuclear Power Engineering Russia 120 | Old Dominion University 121 | Oregon Institute of Technology 122 | PUCMM 123 | Payame Noor University 124 | Penn State University 125 | Politecnico di Milano 126 | Politehnica University Bucharest 127 | Polytechnic University of Timisoara 128 | Pondicherry University 129 | Pontificia universidad catolica de chile 130 | Portland State University 131 | Purdue University Indianapolis 132 | R V College of Engineering 133 | RPI 134 | Ramapo College of New Jersey 135 | Rochester Institute of Technology 136 | SASTRA University 137 | Saint Petersburg State University 138 | Saint Petersburg State University of Aerospace Instrumentation 139 | Saint-Petersburg Polytechnic Univesity 140 | San Francisco State University 141 | San Jose State University 142 | Shanghai Jiao Tong University 143 | Sharif University of Technology 144 | Simon Bolivar University 145 | Simon Fraser University 146 | Smolensk State University 147 | Sonoma State University 148 | South Federal University 149 | Spiru Haret University 150 | Stanford 151 | State University of Campinas 152 | State University of New York College 
at Oswego 153 | Stellenbosch University 154 | Stonehill College 155 | Tallinn University 156 | Tallinn University of Technology 157 | Tampere University of Technology 158 | Tanta University 159 | Tarrant County College 160 | Technical University of Cluj-Napoca 161 | Technion 162 | Tel Aviv University 163 | The Jerusalem collage of engineering 164 | The University of Latvia 165 | The University of Manchester 166 | The University of South Africa 167 | Transilvania University 168 | Tufts University 169 | UC Berkeley 170 | UCLA 171 | UCSD 172 | UIUC 173 | UMD 174 | UNISA 175 | UNIVERSIDAD DE Buenos Aires 176 | UOC 177 | USC 178 | UW Madison 179 | Universidad Central de Venezuela 180 | Universidad Complutense de Madrid 181 | Universidad Cooperativa de Colombia 182 | Universidad Nacional Autonoma de Mexico 183 | Universidad Nacional Costa Rica 184 | Universidad Nacional de Colombia 185 | Universidad Tecnologica Boliviana 186 | Universidad de Buenos Aires 187 | Universidad de Castilla La Mancha 188 | Universidad de Los Andes Colombia 189 | Universidad de Oriente 190 | Universidad de San Carlos de Guatemala 191 | Universidad de Valladolid 192 | Universidad de la Sabana 193 | Universidad del Valle de Guatemala 194 | Universidade Federal da Paraiba 195 | Universidade Federal de Santa Catarina 196 | Universidade Federal do Rio Grande do Sul 197 | Universidade Federal do Rio de Janeiro 198 | Universidade Tecnica de Lisboa 199 | Universidade de Sao Paulo 200 | Universidade do Minho 201 | Universitas Gadjah Mada 202 | Universitat Politecnica de Valencia 203 | Universite Catholique de Louvain 204 | University College Dublin 205 | University Munich 206 | University of Akron 207 | University of Alberta 208 | University of Amsterdam 209 | University of Arkansas 210 | University of Athens 211 | University of Belgrade 212 | University of Birmingham 213 | University of Buenos Aires 214 | University of Cambridge 215 | University of Central Oklahoma 216 | University of Chicago 217 | University of Cincinnati 218 | University of Colorado at Boulder 219 | University of Connecticut 220 | University of Dallas 221 | University of Debrecen 222 | University of Delaware 223 | University of Erlangen-Nuremberg 224 | University of Essex 225 | University of Evora 226 | University of Florida 227 | University of Gothenburg 228 | University of Greifswald 229 | University of Hamburg 230 | University of Hawaii 231 | University of Helsinki 232 | University of Ilorin Kwara State 233 | University of Jaffna 234 | University of Kansas 235 | University of Kerala 236 | University of London 237 | University of Malaga 238 | University of Malaya 239 | University of Manchester 240 | University of Michigan 241 | University of Missouri - Columbia 242 | University of Moratuwa 243 | University of Mumbai 244 | University of Nebraska 245 | University of Nebraska - Lincoln 246 | University of New Haven 247 | University of New South Wales 248 | University of Notre Dame 249 | University of Oklahoma 250 | University of Ottawa 251 | University of Oxford 252 | University of Padua 253 | University of Pavia Italy 254 | University of Pennsylvania 255 | University of Piraeus Athens 256 | University of Pretoria 257 | University of Salamanca 258 | University of Sao Paulo 259 | University of Sarajevo 260 | University of Southern California 261 | University of Stellenbosch 262 | University of Tartu 263 | University of Tehran 264 | University of Texas 265 | University of Texas at Austin 266 | University of Toronto 267 | University of Tuebingen 268 | University 
of Twente 269 | University of Utah 270 | University of Vienna 271 | University of Warsaw 272 | University of Washington 273 | University of Washington - Bothell 274 | University of Waterloo 275 | University of West Florida 276 | University of Wisconsin 277 | University of the Punjab Lahore 278 | University of the Witwatersrand 279 | Vilnius Gediminas Technical University 280 | Vilnius University 281 | Virginia Commonwealth University 282 | Virginia Tech 283 | Viswakarma Institute Pune India 284 | Warsaw University 285 | Washington State University 286 | Wayne State 287 | Weber State 288 | Weizmann Institute of Science 289 | Western Governors University 290 | Xavier University 291 | Zagazig University 292 | allama iqbal open university islamabad 293 | arizona state university 294 | federal institute of tecnology and education from southeastern Minas Gerais 295 | kansas state university 296 | universidad complutense de madrid 297 | university of Patras 298 | university of padua 299 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 41 | 42 | 43 |
44 |   [where.html markup was stripped in this dump; the visible page text is "About this Map" and "This is a cool map from www.py4e.com."]
49 | 50 | 51 | -------------------------------------------------------------------------------- /Using Databases with Python/Week 5/where.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Databases with Python/Week 5/where.png -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Atom Editor Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Atom Editor Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Directory Test.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atse0612/Python-for-Everybody-Coursera/60eac4010931892f89e442497c5f7f3cd043831a/Using Python to Access Web Data/Week 1/Directory Test.PNG -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 1/Test.py: -------------------------------------------------------------------------------- 1 | print("I am writing a line on code!") 2 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 2/Week 2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:06:30 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import re 9 | pile = open('regex_sum_41647.txt') 10 | gold = pile.read() 11 | copier = re.findall("[0-9]+", gold) 12 | dice = [int(i) for i in copier] 13 | sum = 0 14 | for k in dice: 15 | sum += k 16 | print(sum) 17 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/Week 3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 20:27:29 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import socket 9 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 10 | mysock.connect(('data.pr4e.org', 80)) 11 | cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode() 12 | mysock.send(cmd) 13 | 14 | lit = list() 15 | while True: 16 | data = mysock.recv(512) 17 | lit.append(data) 18 | if (len(data) < 1): 19 | break 20 | print(data.decode()) 21 | mysock.close() 22 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 3/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 
11 | 12 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Following Links in HTML Using BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:12:49 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter URL: ') 16 | num = input('Enter count: ') 17 | pos = input('Enter position: ') 18 | print('Retrieving: ', url) 19 | for times in range(int(num)): 20 | html = urllib.request.urlopen(url, context=ctx).read() 21 | soup = BeautifulSoup(html, 'html.parser') 22 | tags = soup('a') 23 | print('Retrieving: ', tags[int(pos)-1].get('href', None)) 24 | url = tags[int(pos)-1].get('href', None) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 4/Scraping HTML Data with BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:10:37 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | from urllib.request import urlopen 9 | from bs4 import BeautifulSoup 10 | import ssl 11 | 12 | ctx = ssl.create_default_context() 13 | ctx.check_hostname = False 14 | ctx.verify_mode = ssl.CERT_NONE 15 | html = urlopen(' http://py4e-data.dr-chuck.net/comments_41649.html', context=ctx).read() 16 | soup = BeautifulSoup(html, "html.parser") 17 | tags = soup('span') 18 | sum = 0 19 | coun = 0 20 | print('Enter - ') 21 | for tag in tags: 22 | coun += 1 23 | sum += int(tag.contents[0]) 24 | print('Count', coun, '\nSum', sum) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 5/Extracting Data from XML.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:43:48 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import xml.etree.ElementTree as ET 10 | import ssl 11 | ctx = ssl.create_default_context() 12 | ctx.check_hostname = False 13 | ctx.verify_mode = ssl.CERT_NONE 14 | 15 | url = input('Enter location: ') 16 | print ('Retrieving ', url) 17 | html = urllib.request.urlopen(url, context=ctx).read() 18 | print ('Retrieved', len(html), 'characters') 19 | tree = ET.fromstring(html) 20 | print ('Count: ',len(tree.findall('.//count'))) 21 | total = 0 22 | for r in tree.findall("./comments/comment"): 23 | total += int(r.find('count').text) 24 | print ('Sum: ', total) 25 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/Extracting Data from JSON.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:45:18 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | url = input('Enter location: ') 12 | data = urllib.request.urlopen(url).read() 13 | info = json.loads(data) 14 | info = info['comments'] 15 | print ('Retrieving', url, '\nRetrieved', len(data), 'caracters', '\nCount:', len(info)) 16 | num = 0 17 | for item in info: 18 | num += 
int(item['count']) 19 | print ('Sum:', num) 20 | -------------------------------------------------------------------------------- /Using Python to Access Web Data/Week 6/GEOSON API.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 16 21:46:57 2017 4 | 5 | @author: atse 6 | """ 7 | 8 | import urllib.request, urllib.parse, urllib.error 9 | import json 10 | 11 | serviceurl = 'http://python-data.dr-chuck.net/geojson' 12 | address = input('Enter location: ') 13 | url = serviceurl + '?' + urllib.parse.urlencode({'sensor':'false', 'address': address}) 14 | data = urllib.request.urlopen(url).read().decode() 15 | info = json.loads(data) 16 | info = info['results'] 17 | print ('Retrieving', url, '\nRetrieved', len(data), 'characters') 18 | for item in info: 19 | key = item['place_id'] 20 | print ('Place id:', key) 21 | --------------------------------------------------------------------------------
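A hedged variation on the lookup in GEOSON API.py (not part of the graded
script): it prints the latitude and longitude along with the place id,
reusing the same service URL and the results/geometry/location shape that
geodump.py parses. The example address is only illustrative.

    import urllib.request, urllib.parse, json

    serviceurl = 'http://python-data.dr-chuck.net/geojson'
    address = 'University of Michigan'  # illustrative input
    url = serviceurl + '?' + urllib.parse.urlencode({'sensor': 'false', 'address': address})
    js = json.loads(urllib.request.urlopen(url).read().decode())
    result = js['results'][0]          # first match, as in the graded script
    loc = result['geometry']['location']
    print('Place id:', result['place_id'])
    print('Lat/Lng:', loc['lat'], loc['lng'])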