51 |
52 |
53 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/geodata/where.js:
--------------------------------------------------------------------------------
1 | myData = [
2 | [42.340082,-71.0894884, 'Northeastern, Boston, MA 02115, USA'],
3 | [38.2113643,-85.7470011, 'Bradley Ave, Louisville, KY, USA'],
4 | [32.778949,35.019648, 'Technion/ Sports Building, Haifa'],
5 | [18.4574518,73.8837999, 'Vishwakarma Institutes Play Ground, Yashodhan Society, Kapil Nagar, Kondhwa Budrukh, Vishwakarma, Maharashtra 411048, India'],
6 | [33.1561058,131.826132, 'Japan, 〒875-0002 Ōita-ken, Usuki-shi, Shitanoe, 1232−2 UMD'],
7 | [42.4036847,-71.120482, 'South Hall Tufts University, 30 Lower Campus Rd, Somerville, MA 02144, USA'],
8 | [-38.1518106,145.1345412, 'Monash University, Frankston VIC 3199, Australia'],
9 | [53.2948229,69.4047872, 'Kokshetau 020000, Kazakhstan'],
10 | [40.7127837,-74.0059413, 'New York, NY, USA'],
11 | [52.2869741,104.3050183, 'Irkutsk, Irkutsk Oblast, Russia'],
12 | [31.1790053,121.4237432, 'Shang Hai Jiao Tong Da Xue Fu Shu Di Liu Ren Min Yi Yuan, Xuhui Qu, Shanghai Shi, China, 200231'],
13 | [8.481302,4.611479, 'University Rd, Ilorin, Nigeria'],
14 | [-34.9222085,138.5921522, 'Yungondi Building, Adelaide SA 5000, Australia'],
15 | [47.80949,13.05501, 'Salzburg, Austria'],
16 | [61.4977524,23.7609535, 'Tampere, Finland'],
17 | [59.9342802,30.3350986, 'St Petersburg, Russia'],
18 | [28.6853472,-106.1015266, 'São Paulo, Chihuahua, Chih., Mexico'],
19 | [54.7903112,32.0503663, 'Smolensk, Smolensk Oblast, Russia'],
20 | [24.8614622,67.0099388, 'Karachi, Pakistan'],
21 | [40.4469796,-3.7278167, 'Av. Complutense, Madrid, Madrid, Spain'],
22 | [24.4325423,54.6174842, 'Masdar Institute Bus Station - Abu Dhabi - United Arab Emirates'],
23 | [51.5266171,-0.1260773, 'University Of London, 1-11 Cartwright Gardens, Kings Cross, London WC1H 9EB, UK'],
24 | [39.5069974,-84.745231, 'Oxford, OH 45056, USA'],
25 | [59.393847,24.6650872, 'TTÜ staadion, 12616 Tallinn, Estonia'],
26 | [58.3733281,26.7265098, 'Tartu Ülikooli Füüsikahoone, 50103 Tartu, Estonia'],
27 | [33.6778327,-117.8151285, 'Padua, Irvine, CA 92614, USA'],
28 | [18.5544976,73.8257325, 'Pune University, Ganeshkhind, Pune, Maharashtra, India'],
29 | [37.8764984,-122.2804342, 'California St, Berkeley, CA, USA'],
30 | [43.0412831,-89.4301473, 'University of Wisconsin-Madison Arboretum, 1207 Seminole Hwy, Madison, WI 53711, USA'],
31 | [51.745806,19.4489068, 'Instytut Informatyki Stosowanej, Politechnika Łódzka, 90-924 Łódź, Poland'],
32 | [38.3946981,27.0322689, 'İnciraltı, Dokuz Eylül Ünv. Hst., 35330 Balçova/İzmir, Turkey'],
33 | [39.9314428,116.3049709, 'Bei Jing Shi Fan Da Xue, Haidian Qu, Beijing Shi, China, 100000'],
34 | [33.9519347,-83.357567, 'Athens, GA, USA'],
35 | [10.7295115,79.0196067, 'Sastra University Road, Tirumalaisamudram, Tamil Nadu 613401, India'],
36 | [21.1470404,79.0397862, 'Nagpur University Campus, Nagpur, Maharashtra 440033, India'],
37 | [41.9197689,-91.649501, 'Duke St SW, Cedar Rapids, IA 52404, USA'],
38 | [37.7634731,-122.4390636, 'States St, San Francisco, CA 94114, USA'],
39 | [-23.5505199,-46.6333094, 'São Paulo, São Paulo - State of São Paulo, Brazil'],
40 | [30.2850284,-97.7335226, 'University of Texas at Austin, Austin, TX, USA'],
41 | [61.6887271,27.2721457, 'Mikkeli, Finland'],
42 | [32.4204729,-85.0323718, 'H. Curtis Pitts Hall, 3413 S Seale Rd, Phenix City, AL 36869, USA'],
43 | [41.557583,-8.397568, 'Universidade do Minho, 4710 Braga, Portugal'],
44 | [28.1655981,112.9526566, 'Yue Lu Shan Guo Jia Da Xue Ke Ji Yuan Chuang Ye Da Sha, Yuelu Qu, Changsha Shi, Hunan Sheng, China, 410006'],
45 | [-33.0444219,-71.6066334, 'Pontificia Universidad Catolica De Valparaiso - Gimpert, Valparaíso, Valparaíso, Región de Valparaíso, Chile'],
46 | [40.6331249,-89.3985283, 'Illinois, USA'],
47 | [30.0199119,31.5001527, 'AUC Library, Cairo Governorate 11835, Egypt'],
48 | [55.1170375,36.5970818, 'Obninsk, Kaluga Oblast, Russia'],
49 | [31.767879,-106.440736, 'Washington, El Paso, TX 79905, USA'],
50 | [49.9935,36.230383, 'Kharkiv, Kharkiv Oblast, Ukraine'],
51 | [43.8562586,18.4130763, 'Sarajevo, Bosnia and Herzegovina'],
52 | [3.4321247,-76.5461709, 'Parqueadero Universidad Del Valle, Cali, Cali, Valle del Cauca, Colombia'],
53 | [40.0082221,-105.2591119, 'Colorado Ave & University Heights, Boulder, CO 80302, USA'],
54 | [53.4129429,59.0016233, 'Magnitogorsk, Chelyabinsk Oblast, Russia'],
55 | [46.4062583,8.9040484, 'Usc, 6749, Switzerland'],
56 | [52.124815,-106.589195, 'Simon Fraser Crescent, Saskatoon, SK S7H, Canada'],
57 | [34.0247033,-81.0131844, 'New York Ave, Columbia, SC 29204, USA'],
58 | [38.1999105,-85.7659121, 'Southern Pkwy, Louisville, KY, USA'],
59 | [14.6063194,121.0977669, 'Warsaw, Pasig, Metro Manila, Philippines'],
60 | [52.2296756,21.0122287, 'Warsaw, Poland'],
61 | [-40.900557,174.885971, 'New Zealand'],
62 | [-40.3850866,175.6140639, 'Massey University, Palmerston North, New Zealand'],
63 | [35.8715218,-97.5672431, 'Noble Ave, Guthrie, OK 73044, USA'],
64 | [45.1847248,9.1582069, '27100 Pavia PV, Italy'],
65 | [38.6598662,-90.3123536, 'Columbia Ave, University City, MO 63130, USA'],
66 | [50.0755381,14.4378005, 'Prague, Czech Republic'],
67 | [41.8313852,-87.6272216, 'Iit Tower, 10 W 35th St, Chicago, IL 60616, USA'],
68 | [40.7933949,-77.8600012, 'State College, PA, USA'],
69 | [33.4249307,-111.8884532, 'Utah, Tempe, AZ 85281, USA'],
70 | [39.4813156,-0.3505, 'Universitat Politècnica, 46022 Valencia, Valencia, Spain'],
71 | [33.6140008,-117.8440006, 'Vienna, Newport Beach, CA 92660, USA'],
72 | [44.4267674,26.1025384, 'Bucharest, Romania'],
73 | [33.7063317,-117.7733121, 'New Haven, Irvine, CA 92620, USA'],
74 | [47.761605,-122.19303, 'UW Bothell & Cascadia College, Bothell, WA 98011, USA'],
75 | [38.6679152,-90.3322259, 'Drexel Dr, University City, MO 63130, USA'],
76 | [32.083852,34.79197, 'Helsinki St, Tel Aviv-Yafo, Israel'],
77 | [42.320138,-83.230993, 'University of Michigan, Dearborn, MI 48128, USA'],
78 | [40.4432289,-79.9441368, 'Carnegie Mellon University, Pausch Bridge, Pittsburgh, PA 15213, USA'],
79 | [55.8304307,49.0660806, 'Kazan, Tatarstan, Russia'],
80 | [12.0263438,79.8492812, 'Pondicherry University, Kalapet, Puducherry 605014, India'],
81 | [30.7897514,120.7760636, 'Jia Xing Nan Yang Zhi Ye Ji Shu Xue Yuan, Xiuzhou Qu, Jiaxing Shi, Zhejiang Sheng, China, 314000'],
82 | [35.712815,135.9711705, 'Nyu, Mihama, Mikata District, Fukui Prefecture 919-1201, Japan'],
83 | [-23.5431786,-46.6291845, 'State of São Paulo, Brazil'],
84 | [47.5584793,21.620443, 'Debrecen, Debrecen University-Botanical Garden, 4032 Hungary'],
85 | [34.1515641,-117.3354402, 'N State St, California, USA'],
86 | [50.4501,30.5234, 'Kiev, Ukraine, 02000'],
87 | [46.4618977,-80.9664534, 'University Laurentian, Copper Cliff, ON P0M 1N0, Canada'],
88 | [55.755826,37.6173, 'Moscow, Russia'],
89 | [52.2016671,0.1177882, 'University Of Cambridge, Cambridge, Cambridge, Cambridgeshire CB2, UK'],
90 | [35.9525664,51.490619, 'Payame Noor, Meygun, Tehran, Iran'],
91 | [35.246756,33.0307541, 'ODTÜ Misafirhane, Kalkanlı'],
92 | [46.5189865,6.5676007, 'EPFL, 1015 Lausanne, Switzerland'],
93 | [45.2671352,19.8335496, 'Novi Sad, Serbia'],
94 | [57.6954209,11.9853213, 'Göteborgs universitetsbibliotek, Renströmsgatan 4, 412 55 Göteborg, Sweden'],
95 | [45.7488716,21.2086793, 'Timișoara, Romania'],
96 | [53.8931837,27.547338, 'Monument to Fallen Professors and Students of the Belarusian State University, Minsk, Belarus'],
97 | [22.4828735,88.394867, 'Jadavpur University Lake, Sahid Smirity Colony, Pancha Sayar, Kolkata, West Bengal 700094'],
98 | [26.1529683,91.6639235, 'Gauhati University, Jalukbari, Guwahati, Assam, India'],
99 | [-34.5178509,-58.4831979, 'Universidad, Vicente López, Buenos Aires, Argentina'],
100 | [44.4061457,8.9682634, 'Università degli studi di Genova - Dipartimento di Medicina Sperimentale (DIMES), 16143 Genova, Italy'],
101 | [13.7164911,100.4874338, 'Thon Buri, Bangkok 10600, Thailand'],
102 | [4.8602595,-74.0333032, 'Universidad De La Sabana, Chía, Chía, Cundinamarca, Colombia'],
103 | [43.4553461,-76.5104973, 'Oswego, NY, USA'],
104 | [17.4930263,78.3906218, 'Jawaharlal Nehru Technological University, Kukatpally Housing Board Colony, Kukatpally, Hyderabad, Telangana 500085, India'],
105 | [50.503887,4.469936, 'Belgium'],
106 | [42.3518484,-71.1107301, 'Boston University Bridge, Massachusetts, USA'],
107 | [64.9078809,-147.7117155, 'Manchester Loop, Fairbanks, AK 99712, USA'],
108 | [51.1877226,6.7938734, 'Fachhochschule Düsseldorf, Stadtbezirk 3, 40225 Düsseldorf, Germany'],
109 | [27.6169691,-99.4631289, 'Simon Bolivar Blvd, Laredo, TX 78045, USA'],
110 | [39.174335,-86.505469, 'Hilltop Garden and Nature Center at Indiana University, 2367 E 10th St, Bloomington, IN 47408, USA'],
111 | [18.9331831,72.8341894, 'KP Shethi Building, Janmabhoomi Marg, Kala Ghoda, Fort, Mumbai, Maharashtra 400001, India'],
112 | [42.3077541,-83.0182189, 'Ottawa St, Windsor, ON, Canada'],
113 | [28.3580163,75.5887989, 'BITS, Pilani, Rajasthan 333031, India'],
114 | [35.8278379,-78.6593111, 'Transylvania Ave, Raleigh, NC 27609, USA'],
115 | [25.25968,82.989115, 'IIT Gymkhana, RR 11, Banaras Hindu University Campus, Varanasi, Uttar Pradesh 221001, India'],
116 | [50.862282,-2.4998561, 'E M Mitchell & Sons, Hermitage, Dorchester, Dorset DT2 7BB, UK'],
117 | [18.4074917,-66.062465, 'Ave Central, San Juan, San Juan, Puerto Rico'],
118 | [50.4471975,30.4522355, 'Obshchezhitiye NTUU KPI №10, Vyborzka St, 2/24, Kyiv, Ukraine'],
119 | [-9.9541653,-67.8384015, 'Tv. Paraíba - Geraldo Fleming, Rio Branco - AC, Brazil'],
120 | [47.497912,19.040235, 'Budapest, Hungary'],
121 | [55.755826,37.6173, 'Moscow, Russia'],
122 | [59.9342802,30.3350986, 'St Petersburg, Russia'],
123 | [41.7508391,-88.1535352, 'Naperville, IL, USA'],
124 | [37.424106,-122.1660756, 'Stanford, CA, USA'],
125 | [45.7484997,21.2399277, 'Cantina Politehnică, Strada Alexandru Vaida - Voievod, Timișoara, Romania'],
126 | [16.4226352,120.5906046, 'National Baguio University, Bokawkan, Baguio, Benguet, Philippines'],
127 | [-35.417,149.1, 'Monash ACT 2904, Australia'],
128 | [-7.2159454,-35.9065247, 'Campo da UFCG - R. Silva Barbosa - Universitário, Campina Grande - PB, 58400-850, Brazil'],
129 | [19.3188895,-99.1843676, 'National Autonomous University of Mexico, Mexico City, Mexico City, Mexico'],
130 | [35.7058075,51.4020909, 'Tehran University, Tehran, Tehran, Iran'],
131 | [36.8838958,-76.3040214, 'Old Dominion University, 5115 Hampton Blvd, Norfolk, VA 23508, USA'],
132 | [50.4501,30.5234, 'Kiev, Ukraine, 02000'],
133 | [32.2366945,-110.9456894, 'Babcock Building, 1717 E Speedway Blvd, Tucson, AZ 85719, USA'],
134 | [44.9715569,-93.231866, 'Essex St SE, Minneapolis, MN 55455, USA'],
135 | [49.9935,36.230383, 'Kharkiv, Kharkiv Oblast, Ukraine'],
136 | [54.8985207,23.9035965, 'Kaunas, Lithuania'],
137 | [42.3423603,-7.8552788, 'Av. de Buenos Aires, 32004 Ourense, Orense, Spain'],
138 | [9.7297203,79.9482992, 'Jaffna College, AB21, Sri Lanka'],
139 | [42.5030209,-89.0295642, 'College St, Beloit, WI 53511, USA'],
140 | [40.5382913,-78.3528584, 'Ucla Ln, Altoona, PA 16602, USA'],
141 | [28.0282578,-82.3924269, 'Chicago Ave, Temple Terrace, FL 33617, USA'],
142 | [30.5848529,31.4843221, 'Rd inside Zagazig University, Shaibet an Nakareyah, Markaz El-Zakazik, Ash Sharqia Governorate, Egypt'],
143 | [33.428283,-111.750401, 'N Alberta, Mesa, AZ 85205, USA'],
144 | [53.8931837,27.547338, 'Monument to Fallen Professors and Students of the Belarusian State University, Minsk, Belarus'],
145 | [28.0735403,-82.4373589, 'University, FL, USA'],
146 | [11.1705436,75.8736048, 'University Rd, Ramanattukara, Kerala, India'],
147 | [45.4723514,9.1964401, 'Via del Vecchio Politecnico, 20121 Milano, Italy'],
148 | [54.6871555,25.2796514, 'Vilnius, Lithuania'],
149 | [20.593684,78.96288, 'India'],
150 | [-33.8812733,18.6264694, 'Stellenbosch University, Cape Town, 7530, South Africa'],
151 | [28.6777345,77.4504666, 'IMT Rd, Block 14, Sector 10, Raj Nagar, Ghaziabad, Uttar Pradesh 201002, India'],
152 | [33.4238104,-111.8869146, 'Pennsylvania, Tempe, AZ 85281, USA'],
153 | [31.3260152,75.5761829, 'Jalandhar, Punjab 144001, India'],
154 | [36.8743583,-76.1745441, 'Virginia Tech Trail, Virginia Beach, VA 23455, USA'],
155 | [33.4232051,-111.8879509, 'State Ave, Tempe, AZ 85281, USA'],
156 | [22.2567635,-97.8345654, 'Guatemala, Cd Madero, Tamps., Mexico'],
157 | [54.6871555,25.2796514, 'Vilnius, Lithuania'],
158 | [1.2246216,19.7878159, 'Basankusu Airport (BSU), N22, Basankusu, Democratic Republic of the Congo'],
159 | [51.165691,10.451526, 'Germany'],
160 | [30.0742446,31.2765847, 'Internal Medicine, Ain Shams University, ممر خاص مستشفى الدمرداش، Al Waili, Cairo Governorate, Egypt'],
161 | [-4.009976,-79.2085378, 'Colombia, Loja, Ecuador'],
162 | [59.9342802,30.3350986, 'St Petersburg, Russia'],
163 | [10.1345309,-85.4467445, 'Universidad Nacional, 150, Nicoya, Costa Rica'],
164 | [33.952602,-84.5499327, 'Marietta, GA, USA'],
165 | [42.9097484,-85.7630885, 'Grandville, MI, USA'],
166 | [34.3020001,48.8145943, 'Malayer, Hamadan, Iran'],
167 | [1.2877936,103.8665551, 'Marina Bay, Singapore']
168 | ];
169 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/README.txt:
--------------------------------------------------------------------------------
1 | Analyzing an EMAIL Archive and visualizing the data using the
2 | D3 JavaScript library
3 |
4 | Here is a copy of the Sakai Developer Mailing list from 2006-2014.
5 |
6 | http://mbox.dr-chuck.net/
7 |
8 | You should install the SQLite browser to view and modify the databases from:
9 |
10 | http://sqlitebrowser.org/
11 |
12 | The base URL is hard-coded in gmane.py. Make sure to delete the
13 | content.sqlite file if you switch the base URL. The gmane.py file
14 | operates as a spider in that it runs slowly and retrieves one mail
15 | message per second so as to avoid getting throttled. It stores all of
16 | its data in a database and can be interrupted and re-started
17 | as often as needed. It may take many hours to pull all the data
18 | down. So you may need to restart several times.
19 |
20 | To give you a head-start, I have put up 600MB of pre-spidered Sakai
21 | email here:
22 |
23 | https://online.dr-chuck.com/files/sakai/email/content.sqlite.zip
24 |
25 | If you download and unzip this, you can "catch up with the
26 | latest" by running gmane.py.
27 |
28 | Navigate to the folder where you extracted the gmane.zip.
29 |
30 | Here is a run of gmane.py retrieving a few messages from the
31 | sakai developer list:
32 |
33 | Mac: python gmane.py
34 | Win: gmane.py
35 |
36 | How many messages:10
37 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443
38 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments
39 | http://mbox.dr-chuck.net/sakai.devel/6/7 3586
40 | s-githens@northwestern.edu 2005-12-09T13:32:31-06:00 re: sakaiportallogin and presense
41 | http://mbox.dr-chuck.net/sakai.devel/7/8 10600
42 | john@caret.cam.ac.uk 2005-12-09T13:42:24+00:00 re: lms/vle rants/comments
43 |
44 | The program scans content.sqlite from 1 up to the first message number not
45 | already spidered and starts spidering at that message. It continues spidering
46 | until it has spidered the desired number of messages or it reaches a page
47 | that does not appear to be a properly formatted message.
48 |
49 | Sometimes a message is missing. Perhaps administrators can delete messages,
50 | or perhaps they get lost - I don't know. If your spider stops and it seems it has hit
51 | a missing message, go into the SQLite browser and add a row with the missing id - leave
52 | all the other fields blank - and then restart gmane.py. This will unstick the
53 | spidering process and allow it to continue. These empty messages will be ignored in the next
54 | phase of the process.
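If you prefer to add the placeholder row from Python rather than the SQLite browser,
a minimal sketch is below (the message id 4553 is just an example - use whatever id
your spider is stuck on):

    import sqlite3

    conn = sqlite3.connect('content.sqlite')
    cur = conn.cursor()

    # Insert an empty row for the missing message id so gmane.py can move past it;
    # every column other than id is left blank
    missing_id = 4553   # example only - use the id your spider is stuck on
    cur.execute('INSERT OR IGNORE INTO Messages (id) VALUES ( ? )', (missing_id,))

    conn.commit()
    cur.close()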
55 |
56 | One nice thing is that once you have spidered all of the messages and have them in
57 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the
58 | list. gmane.py will quickly scan to the end of the already-spidered pages and check
59 | if there are new messages and then quickly retrieve those messages and add them
60 | to content.sqlite.
61 |
62 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed.
63 | This is intentional as it allows you to look at content.sqlite to debug the process.
64 | It would be a bad idea to run any queries against this database as they would be
65 | slow.
66 |
67 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw
68 | data from content.sqlite and produces a cleaned-up and well-modeled version of the
69 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X
70 | smaller) than content.sqlite because it also compresses the header and body text.
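The compression is done with zlib when gmodel.py writes the headers and body columns.
If you want to peek at one of those compressed values in index.sqlite, a minimal sketch
for decompressing a single row looks like this:

    import sqlite3
    import zlib

    conn = sqlite3.connect('index.sqlite')
    conn.text_factory = str
    cur = conn.cursor()

    # headers and body are stored zlib-compressed by gmodel.py
    cur.execute('SELECT headers, body FROM Messages LIMIT 1')
    row = cur.fetchone()
    if row is not None:
        print zlib.decompress(row[0])[:200]   # first part of the headers
        print zlib.decompress(row[1])[:200]   # first part of the body
    cur.close()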
71 |
72 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing
73 | you to adjust its parameters and edit the mapping tables in mapping.sqlite to tweak the
74 | data cleaning process.
75 |
76 | Running gmodel.py works as follows:
77 |
78 | Mac: python gmodel.py
79 | Win: gmodel.py
80 |
81 | Loaded allsenders 1588 and mapping 28 dns mapping 1
82 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com
83 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu
84 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu
85 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu
86 | ...
87 |
88 | The gmodel.py program does a number of data cleaning steps:
89 |
90 | Domain names are truncated to two levels for .com, .org, .edu, and .net;
91 | other domain names are truncated to three levels. So si.umich.edu becomes
92 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also, mail addresses are
93 | forced to lower case, and some of the @gmane.org addresses like the following
94 |
95 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org
96 |
97 | are converted to the real address whenever there is a matching real email
98 | address elsewhere in the message corpus.
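The truncation rule itself is only a few lines (see fixsender() in gmodel.py); a minimal
sketch of just the domain handling is:

    def truncate_domain(dns):
        # .com/.org/.edu/.net keep two levels; everything else keeps three
        pieces = dns.split(".")
        if dns.endswith((".edu", ".com", ".org", ".net")):
            return ".".join(pieces[-2:])
        return ".".join(pieces[-3:])

    print truncate_domain("si.umich.edu")      # umich.edu
    print truncate_domain("caret.cam.ac.uk")   # cam.ac.uk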
99 |
100 | If you look in the mapping.sqlite database there are two tables that allow
101 | you to map both domain names and individual email addresses that change over
102 | the lifetime of the email list. For example, Steve Githens used the following
103 | email addresses over the life of the Sakai developer list:
104 |
105 | s-githens@northwestern.edu
106 | sgithens@cam.ac.uk
107 | swgithen@mtu.edu
108 |
109 | We can add two entries to the Mapping table
110 |
111 | s-githens@northwestern.edu -> swgithen@mtu.edu
112 | sgithens@cam.ac.uk -> swgithen@mtu.edu
113 |
114 | And so all the mail messages will be collected under one sender even if
115 | they used several email addresses over the lifetime of the mailing list.
116 |
117 | You can also make similar entries in the DNSMapping table if there are multiple
118 | DNS names you want mapped to a single DNS. In the Sakai data I add the following
119 | mapping:
120 |
121 | iupui.edu -> indiana.edu
122 |
123 | So all the folks from the various Indiana University campuses are tracked together.
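You can add these rows with the SQLite browser, or with a short Python sketch like the
one below (it assumes only the Mapping and DNSMapping tables in mapping.sqlite with the
old/new columns that gmodel.py reads):

    import sqlite3

    conn = sqlite3.connect('mapping.sqlite')
    cur = conn.cursor()

    # Collapse several addresses used by one person onto a single address
    cur.execute('INSERT INTO Mapping (old, new) VALUES ( ?, ? )',
                ('s-githens@northwestern.edu', 'swgithen@mtu.edu'))
    cur.execute('INSERT INTO Mapping (old, new) VALUES ( ?, ? )',
                ('sgithens@cam.ac.uk', 'swgithen@mtu.edu'))

    # Collapse several DNS names onto a single organization
    cur.execute('INSERT INTO DNSMapping (old, new) VALUES ( ?, ? )',
                ('iupui.edu', 'indiana.edu'))

    conn.commit()
    cur.close()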
124 |
125 | You can re-run gmodel.py over and over as you look at the data, and add mappings
126 | to make the data cleaner and cleaner. When you are done, you will have a nicely
127 | indexed version of the email in index.sqlite. This is the file to use to do data
128 | analysis. With this file, data analysis will be really quick.
129 |
130 | The first, simplest data analysis is to ask "who does the most?" and "which
131 | organization does the most?". This is done using gbasic.py:
132 |
133 | Mac: python gbasic.py
134 | Win: gbasic.py
135 |
136 | How many to dump? 5
137 | Loaded messages= 51330 subjects= 25033 senders= 1584
138 |
139 | Top 5 Email list participants
140 | steve.swinsburg@gmail.com 2657
141 | azeckoski@unicon.net 1742
142 | ieb@tfd.co.uk 1591
143 | csev@umich.edu 1304
144 | david.horwitz@uct.ac.za 1184
145 |
146 | Top 5 Email list organizations
147 | gmail.com 7339
148 | umich.edu 6243
149 | uct.ac.za 2451
150 | indiana.edu 2258
151 | unicon.net 2055
152 |
153 | You can look at the data in index.sqlite and if you find a problem, you
154 | can update the Mapping table and DNSMapping table in content.sqlite and
155 | re-run gmodel.py.
156 |
157 | There is a simple visualization of the word frequency in the subject lines
158 | in the file gword.py:
159 |
160 | Mac: python gword.py
161 | Win: gword.py
162 |
163 | Range of counts: 33229 129
164 | Output written to gword.js
165 |
166 | This produces the file gword.js which you can visualize using the file
167 | gword.htm.
168 |
169 | A second visualization is in gline.py. It visualizes email participation by
170 | organizations over time.
171 |
172 | Mac: python gline.py
173 | Win: gline.py
174 |
175 | Loaded messages= 51330 subjects= 25033 senders= 1584
176 | Top 10 Organizations
177 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk']
178 | Output written to gline.js
179 |
180 | Its output is written to gline.js which is visualized using gline.htm.
181 | If you have a problem with gline.htm, you can try gline2.htm or gline3.htm
182 | to visualize your data.
183 |
184 | Some URLs for visualization ideas:
185 |
186 | https://developers.google.com/chart/
187 |
188 | https://developers.google.com/chart/interactive/docs/gallery/motionchart
189 |
190 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats
191 |
192 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline
193 |
194 | http://bost.ocks.org/mike/uberdata/
195 |
196 | http://mbostock.github.io/d3/talk/20111018/calendar.html
197 |
198 | http://nltk.org/install.html
199 |
200 | As always - comments welcome.
201 |
202 | -- Dr. Chuck
203 | Sun Sep 29 00:11:01 EDT 2013
204 |
205 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gbasic.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | howmany = int(raw_input("How many to dump? "))
7 |
8 | conn = sqlite3.connect('index.sqlite')
9 | conn.text_factory = str
10 | cur = conn.cursor()
11 |
12 | cur.execute('''SELECT Messages.id, sender FROM Messages
13 | JOIN Senders ON Messages.sender_id = Senders.id''')
14 |
15 | sendcounts = dict()
16 | sendorgs = dict()
17 | for message in cur :
18 | sender = message[1]
19 | sendcounts[sender] = sendcounts.get(sender,0) + 1
20 | pieces = sender.split("@")
21 | if len(pieces) != 2 : continue
22 | dns = pieces[1]
23 | sendorgs[dns] = sendorgs.get(dns,0) + 1
24 |
25 | print ''
26 | print 'Top',howmany,'Email list participants'
27 |
28 | x = sorted(sendcounts, key=sendcounts.get, reverse=True)
29 | for k in x[:howmany]:
30 | print k, sendcounts[k]
31 | if sendcounts[k] < 10 : break
32 |
33 | print ''
34 | print 'Top',howmany,'Email list organizations'
35 |
36 | x = sorted(sendorgs, key=sendorgs.get, reverse=True)
37 | for k in x[:howmany]:
38 | print k, sendorgs[k]
39 | if sendorgs[k] < 10 : break
40 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gline.htm:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gline.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | conn = sqlite3.connect('index.sqlite')
7 | conn.text_factory = str
8 | cur = conn.cursor()
9 |
10 | # Determine the top ten organizations
11 | cur.execute('''SELECT Messages.id, sender FROM Messages
12 | JOIN Senders ON Messages.sender_id = Senders.id''')
13 |
14 | sendorgs = dict()
15 | for message_row in cur :
16 | sender = message_row[1]
17 | pieces = sender.split("@")
18 | if len(pieces) != 2 : continue
19 | dns = pieces[1]
20 | sendorgs[dns] = sendorgs.get(dns,0) + 1
21 |
22 | # pick the top schools
23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True)
24 | orgs = orgs[:10]
25 | print "Top 10 Organizations"
26 | print orgs
27 | # orgs = ['total'] + orgs
28 |
29 | # Read through the messages
30 | counts = dict()
31 | months = list()
32 |
33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages
34 | JOIN Senders ON Messages.sender_id = Senders.id''')
35 |
36 | for message_row in cur :
37 | sender = message_row[1]
38 | pieces = sender.split("@")
39 | if len(pieces) != 2 : continue
40 | dns = pieces[1]
41 | if dns not in orgs : continue
42 | month = message_row[2][:7]
43 | if month not in months : months.append(month)
44 | key = (month, dns)
45 | counts[key] = counts.get(key,0) + 1
46 | tkey = (month, 'total')
47 | counts[tkey] = counts.get(tkey,0) + 1
48 |
49 | months.sort()
50 | print counts
51 | print months
52 |
53 | fhand = open('gline.js','w')
54 | fhand.write("gline = [ ['Month'")
55 | for org in orgs:
56 | fhand.write(",'"+org+"'")
57 | fhand.write("]")
58 |
59 | # for month in months[1:-1]:
60 | for month in months:
61 | fhand.write(",\n['"+month+"'")
62 | for org in orgs:
63 | key = (month, org)
64 | val = counts.get(key,0)
65 | fhand.write(","+str(val))
66 | fhand.write("]");
67 |
68 | fhand.write("\n];\n")
69 |
70 | print "Data written to gline.js"
71 | print "Open gline.htm in a browser to view"
72 |
73 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gline2.htm:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gline3.htm:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Line Chart
6 |
7 |
8 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gmane.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import sqlite3
3 | import time
4 | import ssl
5 | import urllib
6 | from urlparse import urljoin
7 | from urlparse import urlparse
8 | import re
9 | from datetime import datetime, timedelta
10 |
11 | # Not all systems have this so conditionally define parser
12 | try:
13 | import dateutil.parser as parser
14 | except:
15 | pass
16 |
17 | def parsemaildate(md) :
18 | # See if we have dateutil
19 | try:
20 | pdate = parser.parse(md)
21 | test_at = pdate.isoformat()
22 | return test_at
23 | except:
24 | pass
25 |
26 | # Non-dateutil version - we try our best
27 |
28 | pieces = md.split()
29 | notz = " ".join(pieces[:4]).strip()
30 |
31 | # Try a bunch of format variations - strptime() is *lame*
32 | dnotz = None
33 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
34 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
35 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
36 | try:
37 | dnotz = datetime.strptime(notz, form)
38 | break
39 | except:
40 | continue
41 |
42 | if dnotz is None :
43 | # print 'Bad Date:',md
44 | return None
45 |
46 | iso = dnotz.isoformat()
47 |
48 | tz = "+0000"
49 | try:
50 | tz = pieces[4]
51 | ival = int(tz) # Only want numeric timezone values
52 | if tz == '-0000' : tz = '+0000'
53 | tzh = tz[:3]
54 | tzm = tz[3:]
55 | tz = tzh+":"+tzm
56 | except:
57 | pass
58 |
59 | return iso+tz
60 |
61 | conn = sqlite3.connect('content.sqlite')
62 | cur = conn.cursor()
63 | conn.text_factory = str
64 |
65 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/"
66 |
67 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages
68 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT,
69 | subject TEXT, headers TEXT, body TEXT)''')
70 |
71 | start = 0
72 | cur.execute('SELECT max(id) FROM Messages')
73 | try:
74 | row = cur.fetchone()
75 | if row[0] is not None:
76 | start = row[0]
77 | except:
78 | start = 0
79 | row = None
80 |
81 | print start
82 |
83 | many = 0
84 |
85 | # Skip up to five messages
86 | skip = 5
87 | while True:
88 | if ( many < 1 ) :
89 | sval = raw_input('How many messages:')
90 | if ( len(sval) < 1 ) : break
91 | many = int(sval)
92 |
93 | start = start + 1
94 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) )
95 | try:
96 | row = cur.fetchone()
97 | if row is not None : continue
98 | except:
99 | row = None
100 |
101 | many = many - 1
102 | url = baseurl + str(start) + '/' + str(start + 1)
103 |
104 | try:
105 | # Deal with SSL certificate anomalies Python > 2.7
106 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
107 | # document = urllib.urlopen(url, context=scontext)
108 |
109 | document = urllib.urlopen(url)
110 |
111 | text = document.read()
112 | if document.getcode() != 200 :
113 | print "Error code=",document.getcode(), url
114 | break
115 | except KeyboardInterrupt:
116 | print ''
117 | print 'Program interrupted by user...'
118 | break
119 | except:
120 | print "Unable to retrieve or parse page",url
121 | print sys.exc_info()[0]
122 | break
123 |
124 | print url,len(text)
125 |
126 | if not text.startswith("From "):
127 | if skip < 1 :
128 | print text
129 | print "End of mail stream reached..."
130 | quit ()
131 | print " Skipping badly formed message"
132 | skip = skip-1
133 | continue
134 |
135 | pos = text.find("\n\n")
136 | if pos > 0 :
137 | hdr = text[:pos]
138 | body = text[pos+2:]
139 | else:
140 | print text
141 | print "Could not find break between headers and body"
142 | break
143 |
144 | skip = 5 # reset skip count
145 |
146 | email = None
147 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
148 | if len(x) == 1 :
149 | email = x[0];
150 | email = email.strip().lower()
151 | email = email.replace("<","")
152 | else:
153 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
154 | if len(x) == 1 :
155 | email = x[0];
156 | email = email.strip().lower()
157 | email = email.replace("<","")
158 |
159 | sent_at = None
160 | y = re.findall('\nDate: .*, (.*)\n', hdr)
161 | if len(y) == 1 :
162 | tdate = y[0]
163 | tdate = tdate[:26]
164 | try:
165 | sent_at = parsemaildate(tdate)
166 | except:
167 | print text
168 | print "Parse fail",tdate
169 | break
170 |
171 | subject = None
172 | z = re.findall('\nSubject: (.*)\n', hdr)
173 | if len(z) == 1 : subject = z[0].strip().lower();
174 |
175 | print " ",email,sent_at,subject
176 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body)
177 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body))
178 |
179 | # Only commit every 50th record
180 | # if (many % 50) == 0 : conn.commit()
181 | time.sleep(1)
182 |
183 | conn.commit()
184 | cur.close()
185 |
186 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gmodel.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import re
5 | import zlib
6 | from datetime import datetime, timedelta
7 | # Not all systems have this
8 | try:
9 | import dateutil.parser as parser
10 | except:
11 | pass
12 |
13 | dnsmapping = dict()
14 | mapping = dict()
15 |
16 | def fixsender(sender,allsenders=None) :
17 | global dnsmapping
18 | global mapping
19 | if sender is None : return None
20 | sender = sender.strip().lower()
21 | sender = sender.replace('<','').replace('>','')
22 |
23 | # Check if we have a hacked gmane.org from address
24 | if allsenders is not None and sender.endswith('gmane.org') :
25 | pieces = sender.split('-')
26 | realsender = None
27 | for s in allsenders:
28 | if s.startswith(pieces[0]) :
29 | realsender = sender
30 | sender = s
31 | # print realsender, sender
32 | break
33 | if realsender is None :
34 | for s in mapping:
35 | if s.startswith(pieces[0]) :
36 | realsender = sender
37 | sender = mapping[s]
38 | # print realsender, sender
39 | break
40 | if realsender is None : sender = pieces[0]
41 |
42 | mpieces = sender.split("@")
43 | if len(mpieces) != 2 : return sender
44 | dns = mpieces[1]
45 | x = dns
46 | pieces = dns.split(".")
47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") :
48 | dns = ".".join(pieces[-2:])
49 | else:
50 | dns = ".".join(pieces[-3:])
51 | # if dns != x : print x,dns
52 | # if dns != dnsmapping.get(dns,dns) : print dns,dnsmapping.get(dns,dns)
53 | dns = dnsmapping.get(dns,dns)
54 | return mpieces[0] + '@' + dns
55 |
56 | def parsemaildate(md) :
57 | # See if we have dateutil
58 | try:
59 | pdate = parser.parse(md)
60 | test_at = pdate.isoformat()
61 | return test_at
62 | except:
63 | pass
64 |
65 | # Non-dateutil version - we try our best
66 |
67 | pieces = md.split()
68 | notz = " ".join(pieces[:4]).strip()
69 |
70 | # Try a bunch of format variations - strptime() is *lame*
71 | dnotz = None
72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
75 | try:
76 | dnotz = datetime.strptime(notz, form)
77 | break
78 | except:
79 | continue
80 |
81 | if dnotz is None :
82 | # print 'Bad Date:',md
83 | return None
84 |
85 | iso = dnotz.isoformat()
86 |
87 | tz = "+0000"
88 | try:
89 | tz = pieces[4]
90 | ival = int(tz) # Only want numeric timezone values
91 | if tz == '-0000' : tz = '+0000'
92 | tzh = tz[:3]
93 | tzm = tz[3:]
94 | tz = tzh+":"+tzm
95 | except:
96 | pass
97 |
98 | return iso+tz
99 |
100 | # Parse out the info...
101 | def parseheader(hdr, allsenders=None):
102 | if hdr is None or len(hdr) < 1 : return None
103 | sender = None
104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
105 | if len(x) >= 1 :
106 | sender = x[0]
107 | else:
108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
109 | if len(x) >= 1 :
110 | sender = x[0]
111 |
112 | # normalize the domain name of Email addresses
113 | sender = fixsender(sender, allsenders)
114 |
115 | date = None
116 | y = re.findall('\nDate: .*, (.*)\n', hdr)
117 | sent_at = None
118 | if len(y) >= 1 :
119 | tdate = y[0]
120 | tdate = tdate[:26]
121 | try:
122 | sent_at = parsemaildate(tdate)
123 | except Exception, e:
124 | # print 'Date ignored ',tdate, e
125 | return None
126 |
127 | subject = None
128 | z = re.findall('\nSubject: (.*)\n', hdr)
129 | if len(z) >= 1 : subject = z[0].strip().lower()
130 |
131 | guid = None
132 | z = re.findall('\nMessage-ID: (.*)\n', hdr)
133 | if len(z) >= 1 : guid = z[0].strip().lower()
134 |
135 | if sender is None or sent_at is None or subject is None or guid is None :
136 | return None
137 | return (guid, sender, subject, sent_at)
138 |
139 | # Open the output database and create empty tables
140 | conn = sqlite3.connect('index.sqlite')
141 | conn.text_factory = str
142 | cur = conn.cursor()
143 |
144 | cur.execute('''DROP TABLE IF EXISTS Messages ''')
145 | cur.execute('''DROP TABLE IF EXISTS Senders ''')
146 | cur.execute('''DROP TABLE IF EXISTS Subjects ''')
147 | cur.execute('''DROP TABLE IF EXISTS Replies ''')
148 |
149 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages
150 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER,
151 | sender_id INTEGER, subject_id INTEGER,
152 | headers BLOB, body BLOB)''')
153 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders
154 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''')
155 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects
156 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''')
157 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies
158 | (from_id INTEGER, to_id INTEGER)''')
159 |
160 | # Open the mapping information
161 | conn_1 = sqlite3.connect('mapping.sqlite')
162 | conn_1.text_factory = str
163 | cur_1 = conn_1.cursor()
164 |
165 | # Load up the mapping information into memory structures
166 | cur_1.execute('''SELECT old,new FROM DNSMapping''')
167 | for message_row in cur_1 :
168 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower()
169 |
170 | mapping = dict()
171 | cur_1.execute('''SELECT old,new FROM Mapping''')
172 | for message_row in cur_1 :
173 | old = fixsender(message_row[0])
174 | new = fixsender(message_row[1])
175 | mapping[old] = fixsender(new)
176 |
177 | cur_1.close()
178 |
179 | # Open the raw data retrieved from the network
180 | conn_2 = sqlite3.connect('content.sqlite')
181 | conn_2.text_factory = str
182 | cur_2 = conn_2.cursor()
183 |
184 | allsenders = list()
185 | cur_2.execute('''SELECT email FROM Messages''')
186 | for message_row in cur_2 :
187 | sender = fixsender(message_row[0])
188 | if sender is None : continue
189 | if 'gmane.org' in sender : continue
190 | if sender in allsenders: continue
191 | allsenders.append(sender)
192 |
193 | print "Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)
194 |
195 | cur_2.execute('''SELECT headers, body, sent_at
196 | FROM Messages ORDER BY sent_at''')
197 |
198 | senders = dict()
199 | subjects = dict()
200 | guids = dict()
201 |
202 | count = 0
203 |
204 | for message_row in cur_2 :
205 | hdr = message_row[0]
206 | parsed = parseheader(hdr, allsenders)
207 | if parsed is None: continue
208 | (guid, sender, subject, sent_at) = parsed
209 |
210 | # Apply the sender mapping
211 | sender = mapping.get(sender,sender)
212 |
213 | count = count + 1
214 | if count % 250 == 1 : print count,sent_at, sender
215 | # print guid, sender, subject, sent_at
216 |
217 | if 'gmane.org' in sender:
218 | print "Error in sender ===", sender
219 |
220 | sender_id = senders.get(sender,None)
221 | subject_id = subjects.get(subject,None)
222 | guid_id = guids.get(guid,None)
223 |
224 | if sender_id is None :
225 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) )
226 | conn.commit()
227 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, ))
228 | try:
229 | row = cur.fetchone()
230 | sender_id = row[0]
231 | senders[sender] = sender_id
232 | except:
233 | print 'Could not retrieve sender id',sender
234 | break
235 | if subject_id is None :
236 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) )
237 | conn.commit()
238 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, ))
239 | try:
240 | row = cur.fetchone()
241 | subject_id = row[0]
242 | subjects[subject] = subject_id
243 | except:
244 | print 'Could not retrieve subject id',subject
245 | break
246 | # print sender_id, subject_id
247 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
248 | ( guid, sender_id, subject_id, sent_at, zlib.compress(message_row[0]), zlib.compress(message_row[1])) )
249 | conn.commit()
250 | cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, ))
251 | try:
252 | row = cur.fetchone()
253 | message_id = row[0]
254 | guids[guid] = message_id
255 | except:
256 | print 'Could not retrieve guid id',guid
257 | break
258 |
259 | # Close the connections
260 | cur.close()
261 | cur_2.close()
262 |
263 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gword.htm:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
37 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gword.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 | import string
6 |
7 | conn = sqlite3.connect('index.sqlite')
8 | conn.text_factory = str
9 | cur = conn.cursor()
10 |
11 | cur.execute('''SELECT subject_id,subject FROM Messages
12 | JOIN Subjects ON Messages.subject_id = Subjects.id''')
13 |
14 | counts = dict()
15 | for message_row in cur :
16 | text = message_row[1]
17 | text = text.translate(None, string.punctuation)
18 | text = text.translate(None, '1234567890')
19 | text = text.strip()
20 | text = text.lower()
21 | words = text.split()
22 | for word in words:
23 | if len(word) < 4 : continue
24 | counts[word] = counts.get(word,0) + 1
25 |
26 | # Find the top 100 words
27 | words = sorted(counts, key=counts.get, reverse=True)
28 | highest = None
29 | lowest = None
30 | for w in words[:100]:
31 | if highest is None or highest < counts[w] :
32 | highest = counts[w]
33 | if lowest is None or lowest > counts[w] :
34 | lowest = counts[w]
35 | print 'Range of counts:',highest,lowest
36 |
37 | # Spread the font sizes across 20-100 based on the count
38 | bigsize = 80
39 | smallsize = 20
40 |
41 | fhand = open('gword.js','w')
42 | fhand.write("gword = [")
43 | first = True
44 | for k in words[:100]:
45 | if not first : fhand.write( ",\n")
46 | first = False
47 | size = counts[k]
48 | size = (size - lowest) / float(highest - lowest)
49 | size = int((size * bigsize) + smallsize)
50 | fhand.write("{text: '"+k+"', size: "+str(size)+"}")
51 | fhand.write( "\n];\n")
52 |
53 | print "Output written to gword.js"
54 | print "Open gword.htm in a browser to view"
55 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/gyear.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import time
3 | import urllib
4 | import zlib
5 |
6 | conn = sqlite3.connect('index.sqlite')
7 | conn.text_factory = str
8 | cur = conn.cursor()
9 |
10 | # Determine the top ten organizations
11 | cur.execute('''SELECT Messages.id, sender FROM Messages
12 | JOIN Senders ON Messages.sender_id = Senders.id''')
13 |
14 | sendorgs = dict()
15 | for message_row in cur :
16 | sender = message_row[1]
17 | pieces = sender.split("@")
18 | if len(pieces) != 2 : continue
19 | dns = pieces[1]
20 | sendorgs[dns] = sendorgs.get(dns,0) + 1
21 |
22 | # pick the top schools
23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True)
24 | orgs = orgs[:10]
25 | print "Top 10 Organizations"
26 | print orgs
27 | # orgs = ['total'] + orgs
28 |
29 | # Read through the messages
30 | counts = dict()
31 | years = list()
32 |
33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages
34 | JOIN Senders ON Messages.sender_id = Senders.id''')
35 |
36 | for message_row in cur :
37 | sender = message_row[1]
38 | pieces = sender.split("@")
39 | if len(pieces) != 2 : continue
40 | dns = pieces[1]
41 | if dns not in orgs : continue
42 | year = message_row[2][:4]
43 | if year not in years : years.append(year)
44 | key = (year, dns)
45 | counts[key] = counts.get(key,0) + 1
46 | tkey = (year, 'total')
47 | counts[tkey] = counts.get(tkey,0) + 1
48 |
49 | years.sort()
50 | print counts
51 | print years
52 |
53 | fhand = open('gline.js','w')
54 | fhand.write("gline = [ ['Year'")
55 | for org in orgs:
56 | fhand.write(",'"+org+"'")
57 | fhand.write("]")
58 |
59 | # for year in years[1:-1]:
60 | for year in years:
61 | fhand.write(",\n['"+year+"'")
62 | for org in orgs:
63 | key = (year, org)
64 | val = counts.get(key,0)
65 | fhand.write(","+str(val))
66 | fhand.write("]");
67 |
68 | fhand.write("\n];\n")
69 |
70 | print "Data written to gline.js"
71 | print "Open gline.htm in a browser to view"
72 |
73 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/gmane/mapping.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/gmane/mapping.sqlite
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/mailing_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/mailing_list.png
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/mailing_lists.txt:
--------------------------------------------------------------------------------
1 | Mailing Lists
2 | 1. Crawl archive of mailing list
3 | 2. Analyse and clean-up
4 | 3. Visualise data
5 |
6 |
7 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/multistep_data_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/multistep_data_analysis.png
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/page_rank_web_search.txt:
--------------------------------------------------------------------------------
1 | Page Rank
2 | 1. Write a simple web page crawler
3 | 2. Compute simple version of Google's Page Rank algorithm
4 | 3. Visualize resulting network
5 |
6 | Search Engine Architecture
7 | 1. Web Crawling
8 | - Browses the WWW in a methodical and automated manner
9 | - Create copy of pages to be indexed for fast searching
10 | a. Create list of websites to crawl
11 | b. Retrieve page
12 | c. Look through for links
13 | d. Add links to list
14 | e. Repeat
15 | 2. Index Building
16 | - Collects, parses and stores data
17 | - Facilitate fast & accurate data retrieval
18 |
19 | 3. Searching
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012, Michael Bostock
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * The name Michael Bostock may not be used to endorse or promote products
15 | derived from this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/README.txt:
--------------------------------------------------------------------------------
1 | Simple Python Search Spider, Page Ranker, and Visualizer
2 |
3 | This is a set of programs that emulate some of the functions of a
4 | search engine. They store their data in a SQLITE3 database named
5 | 'spider.sqlite'. This file can be removed at any time to restart the
6 | process.
7 |
8 | You should install the SQLite browser to view and modify
9 | the databases from:
10 |
11 | http://sqlitebrowser.org/
12 |
13 | This program crawls a web site and pulls a series of pages into the
14 | database, recording the links between pages.
15 |
16 | Mac: rm spider.sqlite
17 | Mac: python spider.py
18 |
19 | Win: del spider.sqlite
20 | Win: spider.py
21 |
22 | Enter web url or enter: http://www.dr-chuck.com/
23 | ['http://www.dr-chuck.com']
24 | How many pages:2
25 | 1 http://www.dr-chuck.com/ 12
26 | 2 http://www.dr-chuck.com/csev-blog/ 57
27 | How many pages:
28 |
29 | In this sample run, we told it to crawl a website and retrieve two
30 | pages. If you restart the program again and tell it to crawl more
31 | pages, it will not re-crawl any pages already in the database. Upon
32 | restart it goes to a random non-crawled page and starts there. So
33 | each successive run of spider.py is additive.
34 |
35 | Mac: python spider.py
36 | Win: spider.py
37 |
38 | Enter web url or enter: http://www.dr-chuck.com/
39 | ['http://www.dr-chuck.com']
40 | How many pages:3
41 | 3 http://www.dr-chuck.com/csev-blog 57
42 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1
43 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13
44 | How many pages:
45 |
46 | You can have multiple starting points in the same database -
47 | within the program these are called "webs". The spider
48 | chooses randomly amongst all non-visited links across all
49 | the webs.
50 |
51 | If your code fails complaining about certificate problems,
52 | there is some code (SSL) that can be un-commented to work
53 | around certificate problems.
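The idea is the same as the commented-out lines in gmane.py: build an SSLContext that
does not verify certificates and pass it to urlopen. A minimal sketch of the idea
(which exact lines to un-comment in spider.py may differ slightly) is:

    import ssl
    import urllib

    url = 'https://www.dr-chuck.com/'   # whatever URL is failing

    # An SSLContext created this way does not verify certificates by default
    # (Python 2.7.9 or later), which works around the certificate complaints
    scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    document = urllib.urlopen(url, context=scontext)
    print document.getcode()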
54 |
55 | If you want to dump the contents of the spider.sqlite file, you can
56 | run spdump.py as follows:
57 |
58 | Mac: python spdump.py
59 | Win: spdump.py
60 |
61 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog')
62 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm')
63 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/')
64 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm')
65 | 4 rows.
66 |
67 | This shows the number of incoming links, the old page rank, the new page
68 | rank, the id of the page, and the url of the page. The spdump.py program
69 | only shows pages that have at least one incoming link to them.
70 |
71 | Once you have a few pages in the database, you can run Page Rank on the
72 | pages using the sprank.py program. You simply tell it how many Page
73 | Rank iterations to run.
74 |
75 | Mac: python sprank.py
76 | Win: sprank.py
77 |
78 | How many iterations:2
79 | 1 0.546848992536
80 | 2 0.226714939664
81 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)]
82 |
83 | You can dump the database again to see that page rank has been updated:
84 |
85 | Mac: python spdump.py
86 | Win: spdump.py
87 |
88 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog')
89 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm')
90 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/')
91 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm')
92 | 4 rows.
93 |
94 | You can run sprank.py as many times as you like and it will simply refine
95 | the page rank the more times you run it. You can even run sprank.py a few times
96 | and then go spider a few more pages with spider.py and then run sprank.py
97 | to converge the page ranks.
98 |
99 | If you want to restart the Page Rank calculations without re-spidering the
100 | web pages, you can use spreset.py
101 |
102 | Mac: python spreset.py
103 | Win: spreset.py
104 |
105 | All pages set to a rank of 1.0
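The reset is essentially a one-line UPDATE; a minimal sketch (assuming only the Pages
table and new_rank column that spdump.py queries - the real spreset.py may differ
slightly) is:

    import sqlite3

    conn = sqlite3.connect('spider.sqlite')
    cur = conn.cursor()

    # Put every page back at the starting rank of 1.0
    cur.execute('UPDATE Pages SET new_rank = 1.0')

    conn.commit()
    cur.close()
    print 'All pages set to a rank of 1.0'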
106 |
107 | Mac: python sprank.py
108 | Win: sprank.py
109 |
110 | How many iterations:50
111 | 1 0.546848992536
112 | 2 0.226714939664
113 | 3 0.0659516187242
114 | 4 0.0244199333
115 | 5 0.0102096489546
116 | 6 0.00610244329379
117 | ...
118 | 42 0.000109076928206
119 | 43 9.91987599002e-05
120 | 44 9.02151706798e-05
121 | 45 8.20451504471e-05
122 | 46 7.46150183837e-05
123 | 47 6.7857770908e-05
124 | 48 6.17124694224e-05
125 | 49 5.61236959327e-05
126 | 50 5.10410499467e-05
127 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)]
128 |
129 | For each iteration of the page rank algorithm it prints the average
130 | change per page of the page rank. The network initially is quite
131 | unbalanced and so the individual page ranks are changing wildly.
132 | But in a few short iterations, the page rank converges. You
133 | should run sprank.py long enough that the page ranks converge.
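For reference, one iteration of a simplified page rank looks roughly like the sketch
below (an in-memory version of the idea, not the actual sprank.py, which reads and
writes spider.sqlite):

    def pagerank_step(links, ranks):
        # links: {page: [pages it links to]}, ranks: {page: current rank}
        new_ranks = dict()
        for page in ranks:
            new_ranks[page] = 0.0
        # each page hands its rank out evenly across its outbound links
        for page, outbound in links.items():
            if len(outbound) < 1 : continue
            share = ranks[page] / len(outbound)
            for target in outbound:
                if target in new_ranks:
                    new_ranks[target] = new_ranks[target] + share
        # spread any "lost" rank back evenly so the total stays constant
        evap = (sum(ranks.values()) - sum(new_ranks.values())) / len(new_ranks)
        for page in new_ranks:
            new_ranks[page] = new_ranks[page] + evap
        # average change per page - the number printed each iteration
        avg = sum(abs(new_ranks[p] - ranks[p]) for p in ranks) / len(ranks)
        return new_ranks, avg

    links = {'a': ['b', 'c'], 'b': ['c'], 'c': ['a']}
    ranks = {'a': 1.0, 'b': 1.0, 'c': 1.0}
    for i in range(10):
        ranks, change = pagerank_step(links, ranks)
        print i + 1, change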
134 |
135 | If you want to visualize the current top pages in terms of page rank,
136 | run spjson.py to write the pages out in JSON format to be viewed in a
137 | web browser.
138 |
139 | Mac: python spjson.py
140 | Win: spjson.py
141 |
142 | Creating JSON output on spider.js...
143 | How many nodes? 30
144 | Open force.html in a browser to view the visualization
145 |
146 | You can view this data by opening the file force.html in your web browser.
147 | This shows an automatic layout of the nodes and links. You can click and
148 | drag any node and you can also double click on a node to find the URL
149 | that is represented by the node.
150 |
151 | This visualization is provided using the force layout from:
152 |
153 | http://mbostock.github.com/d3/
154 |
155 | If you rerun the other utilities and then re-run spjson.py - you merely
156 | have to press refresh in the browser to get the new data from spider.js.
157 |
158 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/force.css:
--------------------------------------------------------------------------------
1 | circle.node {
2 | stroke: #fff;
3 | stroke-width: 1.5px;
4 | }
5 |
6 | line.link {
7 | stroke: #999;
8 | stroke-opacity: .6;
9 | }
10 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/force.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Force-Directed Layout
5 |
6 |
7 |
8 |
9 |
10 |
13 |
14 |
15 | If you don't see a chart above, check the JavaScript console. You may
16 | need to use a different browser.
17 |
18 |
19 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/force.js:
--------------------------------------------------------------------------------
1 | var width = 600,
2 | height = 600;
3 |
4 | var color = d3.scale.category20();
5 |
6 | var dist = (width + height) / 4;
7 |
8 | var force = d3.layout.force()
9 | .charge(-120)
10 | .linkDistance(dist)
11 | .size([width, height]);
12 |
13 | function getrank(rval) {
14 | return (rval/2.0) + 3;
15 | }
16 |
17 | function getcolor(rval) {
18 | return color(rval);
19 | }
20 |
21 | var svg = d3.select("#chart").append("svg")
22 | .attr("width", width)
23 | .attr("height", height);
24 |
25 | function loadData(json) {
26 | force
27 | .nodes(json.nodes)
28 | .links(json.links);
29 |
30 | var k = Math.sqrt(json.nodes.length / (width * height));
31 |
32 | force
33 | .charge(-10 / k)
34 | .gravity(100 * k)
35 | .start();
36 |
37 | var link = svg.selectAll("line.link")
38 | .data(json.links)
39 | .enter().append("line")
40 | .attr("class", "link")
41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); });
42 |
43 | var node = svg.selectAll("circle.node")
44 | .data(json.nodes)
45 | .enter().append("circle")
46 | .attr("class", "node")
47 | .attr("r", function(d) { return getrank(d.rank); } )
48 | .style("fill", function(d) { return getcolor(d.rank); })
49 | .on("dblclick",function(d) {
50 | if ( confirm('Do you want to open '+d.url) )
51 | window.open(d.url,'_new','');
52 | d3.event.stopPropagation();
53 | })
54 | .call(force.drag);
55 |
56 | node.append("title")
57 | .text(function(d) { return d.url; });
58 |
59 | force.on("tick", function() {
60 | link.attr("x1", function(d) { return d.source.x; })
61 | .attr("y1", function(d) { return d.source.y; })
62 | .attr("x2", function(d) { return d.target.x; })
63 | .attr("y2", function(d) { return d.target.y; });
64 |
65 | node.attr("cx", function(d) { return d.x; })
66 | .attr("cy", function(d) { return d.y; });
67 | });
68 |
69 | }
70 | loadData(spiderJson);
71 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/spdump.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
7 | FROM Pages JOIN Links ON Pages.id = Links.to_id
8 | WHERE html IS NOT NULL
9 | GROUP BY id ORDER BY inbound DESC''')
10 |
11 | count = 0
12 | for row in cur :
13 | if count < 50 : print row
14 | count = count + 1
15 | print count, 'rows.'
16 | cur.close()
17 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/spider.js:
--------------------------------------------------------------------------------
1 | spiderJson = {"nodes":[
2 | {"weight":1,"rank":0.0, "id":1, "url":"http://python-data.dr-chuck.net"},
3 | {"weight":1,"rank":4.66423227024, "id":4, "url":"http://python-data.dr-chuck.net/comments_42.html"},
4 | {"weight":1,"rank":1.38142061792, "id":7, "url":"http://python-data.dr-chuck.net/known_by_42.html"},
5 | {"weight":1,"rank":0.690710255581, "id":9, "url":"http://python-data.dr-chuck.net/known_by_Kaylyn.html"},
6 | {"weight":2,"rank":2.26669663573, "id":40, "url":"http://python-data.dr-chuck.net/known_by_Takua.html"},
7 | {"weight":1,"rank":0.690710255581, "id":82, "url":"http://python-data.dr-chuck.net/known_by_Marwan.html"},
8 | {"weight":2,"rank":7.45553422719, "id":85, "url":"http://python-data.dr-chuck.net/known_by_Samiya.html"},
9 | {"weight":2,"rank":8.48734569457, "id":145, "url":"http://python-data.dr-chuck.net/known_by_Shihed.html"},
10 | {"weight":1,"rank":0.518032667194, "id":189, "url":"http://python-data.dr-chuck.net/known_by_Cassidy.html"},
11 | {"weight":2,"rank":1.56869025396, "id":199, "url":"http://python-data.dr-chuck.net/known_by_Vinnie.html"},
12 | {"weight":2,"rank":2.54881807574, "id":203, "url":"http://python-data.dr-chuck.net/known_by_Charlee.html"},
13 | {"weight":1,"rank":8.83695381234, "id":248, "url":"http://python-data.dr-chuck.net/known_by_Atli.html"},
14 | {"weight":2,"rank":4.16614971195, "id":309, "url":"http://python-data.dr-chuck.net/known_by_Abbiegail.html"},
15 | {"weight":2,"rank":2.2314317079, "id":326, "url":"http://python-data.dr-chuck.net/known_by_Nisha.html"},
16 | {"weight":1,"rank":1.21603900362, "id":382, "url":"http://python-data.dr-chuck.net/known_by_Ciar.html"},
17 | {"weight":1,"rank":1.89945314693, "id":413, "url":"http://python-data.dr-chuck.net/known_by_Brodie.html"},
18 | {"weight":2,"rank":19.0, "id":501, "url":"http://python-data.dr-chuck.net/known_by_Kylar.html"},
19 | {"weight":2,"rank":5.3834045047, "id":642, "url":"http://python-data.dr-chuck.net/known_by_Mohamed.html"},
20 | {"weight":1,"rank":3.93023811326, "id":676, "url":"http://python-data.dr-chuck.net/known_by_Oluwaferanmi.html"},
21 | {"weight":1,"rank":2.59745947896, "id":813, "url":"http://python-data.dr-chuck.net/known_by_Maree.html"},
22 | {"weight":1,"rank":1.77055254257, "id":873, "url":"http://python-data.dr-chuck.net/known_by_Shaw.html"}],
23 | "links":[
24 | {"source":0,"target":1,"value":3},
25 | {"source":0,"target":2,"value":3},
26 | {"source":0,"target":0,"value":3},
27 | {"source":2,"target":3,"value":3},
28 | {"source":2,"target":4,"value":3},
29 | {"source":2,"target":5,"value":3},
30 | {"source":2,"target":6,"value":3},
31 | {"source":5,"target":7,"value":3},
32 | {"source":5,"target":8,"value":3},
33 | {"source":5,"target":9,"value":3},
34 | {"source":5,"target":10,"value":3},
35 | {"source":6,"target":11,"value":3},
36 | {"source":4,"target":12,"value":3},
37 | {"source":4,"target":13,"value":3},
38 | {"source":4,"target":14,"value":3},
39 | {"source":8,"target":15,"value":3},
40 | {"source":7,"target":16,"value":3},
41 | {"source":13,"target":17,"value":3},
42 | {"source":10,"target":18,"value":3},
43 | {"source":14,"target":19,"value":3},
44 | {"source":18,"target":20,"value":3},
45 | {"source":18,"target":17,"value":3},
46 | {"source":20,"target":9,"value":3},
47 | {"source":17,"target":6,"value":3},
48 | {"source":9,"target":12,"value":3}]};
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/spider.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import urllib
3 | import ssl
4 | from urlparse import urljoin
5 | from urlparse import urlparse
6 | from BeautifulSoup import *
7 |
8 | # Deal with SSL certificate anomalies Python > 2.7
9 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
10 | scontext = None
11 |
12 | conn = sqlite3.connect('spider.sqlite')
13 | cur = conn.cursor()
14 |
15 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages
16 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
17 | error INTEGER, old_rank REAL, new_rank REAL)''')
18 |
19 | cur.execute('''CREATE TABLE IF NOT EXISTS Links
20 | (from_id INTEGER, to_id INTEGER)''')
21 |
22 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
23 |
24 | # Check to see if we are already in progress...
25 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
26 | row = cur.fetchone()
27 | if row is not None:
28 | print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl."
29 | else :
30 | starturl = raw_input('Enter web url or enter: ')
31 | if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/'
32 | if ( starturl.endswith('/') ) : starturl = starturl[:-1]
33 | web = starturl
34 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) :
35 | pos = starturl.rfind('/')
36 | web = starturl[:pos]
37 |
38 | if ( len(web) > 1 ) :
39 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) )
40 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) )
41 | conn.commit()
42 |
43 | # Get the current webs
44 | cur.execute('''SELECT url FROM Webs''')
45 | webs = list()
46 | for row in cur:
47 | webs.append(str(row[0]))
48 |
49 | print webs
50 |
51 | many = 0
52 | while True:
53 | if ( many < 1 ) :
54 | sval = raw_input('How many pages:')
55 | if ( len(sval) < 1 ) : break
56 | many = int(sval)
57 | many = many - 1
58 |
59 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
60 | try:
61 | row = cur.fetchone()
62 | # print row
63 | fromid = row[0]
64 | url = row[1]
65 | except:
66 | print 'No unretrieved HTML pages found'
67 | many = 0
68 | break
69 |
70 | print fromid, url,
71 |
72 | # If we are retrieving this page, there should be no links from it
73 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) )
74 | try:
75 | # Deal with SSL certificate anomalies Python > 2.7
76 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
77 | # document = urllib.urlopen(url, context=scontext)
78 |
79 | # Normal Unless you encounter certificate problems
80 | document = urllib.urlopen(url)
81 |
82 | html = document.read()
83 | if document.getcode() != 200 :
84 | print "Error on page: ",document.getcode()
85 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) )
86 |
87 | if 'text/html' != document.info().gettype() :
88 | print "Ignore non text/html page"
89 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
90 | conn.commit()
91 | continue
92 |
93 | print '('+str(len(html))+')',
94 |
95 | soup = BeautifulSoup(html)
96 | except KeyboardInterrupt:
97 | print ''
98 | print 'Program interrupted by user...'
99 | break
100 | except:
101 | print "Unable to retrieve or parse page"
102 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
103 | conn.commit()
104 | continue
105 |
106 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
107 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) )
108 | conn.commit()
109 |
110 | # Retrieve all of the anchor tags
111 | tags = soup('a')
112 | count = 0
113 | for tag in tags:
114 | href = tag.get('href', None)
115 | if ( href is None ) : continue
116 | # Resolve relative references like href="/contact"
117 | up = urlparse(href)
118 | if ( len(up.scheme) < 1 ) :
119 | href = urljoin(url, href)
120 | ipos = href.find('#')
121 | if ( ipos > 1 ) : href = href[:ipos]
122 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue
123 | if ( href.endswith('/') ) : href = href[:-1]
124 | # print href
125 | if ( len(href) < 1 ) : continue
126 |
127 | # Check if the URL is in any of the webs
128 | found = False
129 | for web in webs:
130 | if ( href.startswith(web) ) :
131 | found = True
132 | break
133 | if not found : continue
134 |
135 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) )
136 | count = count + 1
137 | conn.commit()
138 |
139 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, ))
140 | try:
141 | row = cur.fetchone()
142 | toid = row[0]
143 | except:
144 | print 'Could not retrieve id'
145 | continue
146 | # print fromid, toid
147 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) )
148 |
149 |
150 | print count
151 |
152 | cur.close()
153 |
154 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/spjson.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | print "Creating JSON output on spider.js..."
7 | howmany = int(raw_input("How many nodes? "))
8 |
9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
10 | FROM Pages JOIN Links ON Pages.id = Links.to_id
11 | WHERE html IS NOT NULL AND ERROR IS NULL
12 | GROUP BY id ORDER BY id,inbound''')
13 |
14 | fhand = open('spider.js','w')
15 | nodes = list()
16 | maxrank = None
17 | minrank = None
18 | for row in cur :
19 | nodes.append(row)
20 | rank = row[2]
21 | if maxrank < rank or maxrank is None : maxrank = rank
22 | if minrank > rank or minrank is None : minrank = rank
23 | if len(nodes) > howmany : break
24 |
25 | if maxrank == minrank or maxrank is None or minrank is None:
26 | print "Error - please run sprank.py to compute page rank"
27 | quit()
28 |
29 | fhand.write('spiderJson = {"nodes":[\n')
30 | count = 0
31 | map = dict()
32 | ranks = dict()
33 | for row in nodes :
34 | if count > 0 : fhand.write(',\n')
35 | # print row
36 | rank = row[2]
37 |     rank = 19 * ( (rank - minrank) / (maxrank - minrank) )  # scale rank into the 0-19 range used for node size and colour
38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',')
39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}')
40 | map[row[3]] = count
41 | ranks[row[3]] = rank
42 | count = count + 1
43 | fhand.write('],\n')
44 |
45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
46 | fhand.write('"links":[\n')
47 |
48 | count = 0
49 | for row in cur :
50 | # print row
51 | if row[0] not in map or row[1] not in map : continue
52 | if count > 0 : fhand.write(',\n')
53 |     # the link "value" written below is fixed at 3, so the source node's
54 |     # rank does not need to be scaled here
55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}')
56 | count = count + 1
57 | fhand.write(']};')
58 | fhand.close()
59 | cur.close()
60 |
61 | print "Open force.html in a browser to view the visualization"
62 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/sprank.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | # Find the ids that send out page rank - we are only interested
7 | # in pages in the SCC that have both in and out links
8 | cur.execute('''SELECT DISTINCT from_id FROM Links''')
9 | from_ids = list()
10 | for row in cur:
11 | from_ids.append(row[0])
12 |
13 | # Find the ids that receive page rank
14 | to_ids = list()
15 | links = list()
16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
17 | for row in cur:
18 | from_id = row[0]
19 | to_id = row[1]
20 | if from_id == to_id : continue
21 | if from_id not in from_ids : continue
22 | if to_id not in from_ids : continue
23 | links.append(row)
24 | if to_id not in to_ids : to_ids.append(to_id)
25 |
26 | # Get latest page ranks for strongly connected component
27 | prev_ranks = dict()
28 | for node in from_ids:
29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, ))
30 | row = cur.fetchone()
31 | prev_ranks[node] = row[0]
32 |
33 | sval = raw_input('How many iterations:')
34 | many = 1
35 | if ( len(sval) > 0 ) : many = int(sval)
36 |
37 | # Sanity check
38 | if len(prev_ranks) < 1 :
39 | print "Nothing to page rank. Check data."
40 | quit()
41 |
42 | # Let's do Page Rank in memory so it is really fast
43 | for i in range(many):
44 | # print prev_ranks.items()[:5]
45 | next_ranks = dict();
46 | total = 0.0
47 | for (node, old_rank) in prev_ranks.items():
48 | total = total + old_rank
49 | next_ranks[node] = 0.0
50 | # print total
51 |
52 |     # Find the number of outbound links and send the page rank down each
53 | for (node, old_rank) in prev_ranks.items():
54 | # print node, old_rank
55 | give_ids = list()
56 | for (from_id, to_id) in links:
57 | if from_id != node : continue
58 | # print ' ',from_id,to_id
59 |
60 | if to_id not in to_ids: continue
61 | give_ids.append(to_id)
62 | if ( len(give_ids) < 1 ) : continue
63 | amount = old_rank / len(give_ids)
64 | # print node, old_rank,amount, give_ids
65 |
66 | for id in give_ids:
67 | next_ranks[id] = next_ranks[id] + amount
68 |
69 | newtot = 0
70 | for (node, next_rank) in next_ranks.items():
71 | newtot = newtot + next_rank
72 |     evap = (total - newtot) / len(next_ranks)  # spread any rank that was not passed along evenly across all nodes
73 |
74 | # print newtot, evap
75 | for node in next_ranks:
76 | next_ranks[node] = next_ranks[node] + evap
77 |
78 | newtot = 0
79 | for (node, next_rank) in next_ranks.items():
80 | newtot = newtot + next_rank
81 |
82 |     # Compute the per-page average change from old rank to new rank
83 |     # as an indication of convergence of the algorithm
84 | totdiff = 0
85 | for (node, old_rank) in prev_ranks.items():
86 | new_rank = next_ranks[node]
87 | diff = abs(old_rank-new_rank)
88 | totdiff = totdiff + diff
89 |
90 | avediff = totdiff / len(prev_ranks)
91 | print i+1, avediff
92 |
93 | # rotate
94 | prev_ranks = next_ranks
95 |
96 | # Put the final ranks back into the database
97 | print next_ranks.items()[:5]
98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''')
99 | for (id, new_rank) in next_ranks.items() :
100 | cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id))
101 | conn.commit()
102 | cur.close()
103 |
104 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/pagerank/spreset.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | conn = sqlite3.connect('spider.sqlite')
4 | cur = conn.cursor()
5 |
6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''')
7 | conn.commit()
8 |
9 | cur.close()
10 |
11 | print "All pages set to a rank of 1.0"
12 |
--------------------------------------------------------------------------------
/python_databases/w5_dbvisualisation/web_crawling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/web_crawling.png
--------------------------------------------------------------------------------