├── .gitignore ├── README.md ├── getting_started_with_python ├── average_loop.py ├── break_loop.py ├── checking_grade.py ├── compute_gross_pay_py2.py ├── compute_gross_pay_py3.py ├── conditional_never_execute_p3.py ├── convert_input_p3.py ├── count_down.py ├── counting_loop.py ├── definite_loop.py ├── definite_loop_friends.py ├── filtering_boolean_loop.py ├── filtering_loop.py ├── for_loop_string.py ├── function.py ├── in_operator.py ├── indentation.py ├── indentation_example.py ├── infinite_loop.py ├── input.py ├── input_question_p2.py ├── input_question_p3.py ├── inside_string.py ├── invalid_loop.py ├── largest_number_loop.py ├── loop.py ├── looping_strings.py ├── multiple_parameters.py ├── new_line.py ├── overtime_pay_try_except.py ├── overtime_pay_try_except_function.py ├── parameters.py ├── parsing.py ├── parsing_2.py ├── return.py ├── slicing_string.py ├── smallest_number.py ├── string_library.py ├── sum_loop.py ├── test.py ├── try_except.py ├── try_except_loop_smallest_largest.py ├── try_except_loop_total_count_average.py └── while.py ├── python_access_web_data ├── test.py ├── w2_regular_expressions │ ├── assignment_parse_extract_regex.py │ ├── mbox-short.txt │ ├── mbox.txt │ ├── notes_regular_expressions.txt │ ├── regex.png │ ├── regex.py │ ├── regex_sum_279460.txt │ └── regex_sum_42.txt ├── w3_networks_sockets │ ├── http.txt │ ├── networked_programs.txt │ ├── socket1.py │ └── urllib1.py ├── w4_web_scraping │ ├── BeautifulSoup.py │ ├── BeautifulSoup.pyc │ ├── assignment_scrap_href_url.py │ ├── assignment_scrap_span_tags.py │ ├── for_i_in_range.py │ ├── scrappin_a_tags.py │ └── webscraping.txt ├── w5_retrieve_parse_xml │ ├── assignment_parse_retrieve_count_xml.py │ ├── geoxml.py │ ├── web_services.txt │ ├── xml.txt │ ├── xml1.py │ └── xml2.py └── w6_json │ ├── access_api.txt │ ├── api.txt │ ├── assignment_extract_data_json.py │ ├── assignment_google_geojson.py │ ├── geojson.py │ ├── json.txt │ ├── json_notes.py │ └── twitter_api │ ├── get_profile_post.py │ 
├── get_twitter_friends.py │ ├── hidden.py │ ├── hidden.pyc │ ├── oauth.py │ ├── oauth.pyc │ ├── twurl.py │ └── twurl.pyc ├── python_data_structures ├── 10_most_common_words.py ├── 10_most_common_words_short.py ├── assignment10.2_open_tuples.py ├── assignment7.1_open_read.py ├── assignment7.2_open_seekaverage.py ├── assignment8.4_open_read_split.py ├── assignment8.5_open_read_split.py ├── assignment9.4_dictionaries_count.py ├── average_list.py ├── concat_lists.py ├── count_mbox.py ├── counting_lines.py ├── dictionaries.py ├── dictionaries_count_commonword.py ├── dictionaries_count_prog.py ├── find_day_mbox.py ├── list_methods.py ├── list_worked_exercise.py ├── lists_immutable.py ├── mbox-short.txt ├── mbox.txt ├── newline_p3.py ├── open_function.py ├── range.py ├── romeo.txt ├── searching.py ├── searching_continue_strip.py ├── searching_in_strip.py ├── searching_strip.py ├── slice_lists.py ├── split.py ├── test.py ├── tuples.py ├── tuples_worked_exercise.py └── words.txt └── python_databases ├── w1_oop ├── inheritance.py ├── inheritance.txt ├── object_lifecycle.py ├── object_lifecycle.txt ├── oop.py ├── oop.txt └── terminology.txt ├── w2_sql ├── assignment_count_email.py ├── assignment_email.db ├── assignment_sql_hex.db ├── crud.txt ├── db_intro.txt ├── domain_db.sqlite ├── emaildb.py ├── emaildb.sqlite ├── mbox-short.txt ├── mbox.txt ├── sql1.db ├── sqlitebrowser-3.8.0v5.dmg └── using_db.txt ├── w3_data_models ├── Library.xml ├── asssignment_musicdb.py ├── data_model_table.png ├── data_models.txt ├── join.txt ├── music_mgt.db ├── trackdb.sqlite ├── trackdb_assign.sqlite └── xml_extract_sql.py ├── w4_many_to_many ├── assignment_roster_role.py ├── exercise_db.png ├── exercise_db.sqlite ├── exercise_many2many.py ├── exercise_roster_m2m.py ├── many_to_many.png ├── many_to_many.txt ├── roster_data.json ├── roster_data_assignment.json ├── roster_role_db.sqlite ├── rosterdb.sqlite └── trackdb_assign.sqlite └── w5_dbvisualisation ├── geocoding.png ├── geocoding.txt ├── 
geodata ├── README.txt ├── geodata.sqlite ├── geodump.py ├── geoload.py ├── where.data ├── where.html └── where.js ├── gmane ├── Chart.bundle.js ├── README.txt ├── d3.layout.cloud.js ├── d3.v3.js ├── gbasic.py ├── gline.htm ├── gline.py ├── gline2.htm ├── gline3.htm ├── gmane.py ├── gmodel.py ├── gword.htm ├── gword.py ├── gyear.py └── mapping.sqlite ├── mailing_list.png ├── mailing_lists.txt ├── multistep_data_analysis.png ├── page_rank_web_search.txt ├── pagerank ├── BeautifulSoup.py ├── LICENSE ├── README.txt ├── d3.v2.js ├── force.css ├── force.html ├── force.js ├── spdump.py ├── spider.js ├── spider.py ├── spjson.py ├── sprank.py └── spreset.py └── web_crawling.png /.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python for Everybody Specialisation 2 | 3 | ## Details on Specialisation 4 | This specialisation is by the University of Michigan on Coursera consisting of 4 courses and 1 capstone project. 5 | 6 | ### Assignment Answers 7 | The answers are entirely created by me. They are not the same as the suggested ones. However, they produce the same results and are roughly similar. Moreover, there are inline notes to help you to understand each step. 8 | 9 | ### Additional Lecture Notes 10 | I created python files on the go when I watched the lectures to document every important point that requires typing code. You will find this very useful when you need help in your assignments. 11 | 12 | ### File names 13 | If the file ends with _p2, this means that the file runs on Python 2.
14 | If the file ends with _p3, this means that the file runs on Python 3.
15 | I tried to create two versions. However, you can easily modify the syntax accordingly. Mostly it's due to the print syntax. 16 | 17 | ### Inline Notes 18 | The notes are in the python files explaining each step with extra notes on what to avoid and what to take note of for that particular script. 19 | 20 | ### Links to Coursera Courses by the University of Michigan 21 | Specialisation
22 | 1. Getting Started with Python
23 | 2. Python Data Structures
24 | 3. Using Python to Access Web Data
25 | 4. Using Databases with Python 26 | 27 | ### Author of Notes and Answers 28 | Ritchie Ng, from the sunny island called Singapore. 29 | 30 | ### Awesome Lecturer 31 | Without him, I would not have been able to create these notes and answers.
32 | Professor Charles Severance, University of Michigan. 33 | 34 | -------------------------------------------------------------------------------- /getting_started_with_python/average_loop.py: -------------------------------------------------------------------------------- 1 | count = 0 2 | sum = 0 3 | 4 | print('Before', count, sum) 5 | 6 | for value in [9, 41, 12, 3, 74, 15]: 7 | count += 1 8 | sum += value 9 | print(count, sum, value) 10 | print('After', count, sum, sum / count) 11 | 12 | -------------------------------------------------------------------------------- /getting_started_with_python/break_loop.py: -------------------------------------------------------------------------------- 1 | while True: 2 | line = input('> ') 3 | if line == 'done': 4 | break 5 | # Breaks out of loop if done 6 | # Otherwise, the loop continues and prints line. 7 | print(line) 8 | print('Done!') -------------------------------------------------------------------------------- /getting_started_with_python/checking_grade.py: -------------------------------------------------------------------------------- 1 | score = raw_input("Your score: ") 2 | 3 | try: 4 | score_int = float(score) 5 | # If try fails, except runs. 6 | except: 7 | score_int = -1 8 | 9 | if 0.90 <= score_int <= 1.00: 10 | print "A" 11 | elif 0.80 <= score_int < 0.90: 12 | print "B" 13 | elif 0.70 <= score_int < 0.80: 14 | print "C" 15 | elif 0.60 <= score_int < 0.70: 16 | print "D" 17 | elif 0.00 <= score_int < 0.60: 18 | print "F" 19 | else: 20 | print "Please enter a score between 0.00 and 1.00." 
-------------------------------------------------------------------------------- /getting_started_with_python/compute_gross_pay_py2.py: -------------------------------------------------------------------------------- 1 | hrs = raw_input("Enter Hours: ") 2 | hrs = float(hrs) 3 | 4 | hourly_rate = raw_input("Enter Hourly Rate: ") 5 | hourly_rate = float(hourly_rate) 6 | 7 | gross_pay = hourly_rate * hrs 8 | print "Gross pay:", gross_pay 9 | -------------------------------------------------------------------------------- /getting_started_with_python/compute_gross_pay_py3.py: -------------------------------------------------------------------------------- 1 | hrs = input('Enter Hours: ') 2 | hrs = float(hrs) 3 | 4 | hourly_rate = input('Enter Hourly Rate: ') 5 | hourly_rate = float(hourly_rate) 6 | 7 | gross_pay = hourly_rate * hrs 8 | print("Gross pay:", gross_pay) 9 | -------------------------------------------------------------------------------- /getting_started_with_python/conditional_never_execute_p3.py: -------------------------------------------------------------------------------- 1 | x = input('What number do you like?') 2 | x = int(x) 3 | # Convert the input into an integer for use with comparison operators 4 | 5 | if x < 2: 6 | print('Below 2') 7 | elif x < 20: 8 | print('Below 20') 9 | elif x < 10: 10 | print('Below 10') 11 | # This will never run. 12 | else: 13 | print('Above 20') 14 | -------------------------------------------------------------------------------- /getting_started_with_python/convert_input_p3.py: -------------------------------------------------------------------------------- 1 | inp = input('Europe floor? 
') 2 | # input() runs first, then it stores the value into variable inp 3 | usf = int(inp) + 1 4 | print("US floor", usf) 5 | -------------------------------------------------------------------------------- /getting_started_with_python/count_down.py: -------------------------------------------------------------------------------- 1 | n = 6 2 | while n > 0: 3 | n -= 1 # augmented assignment & iteration variable 4 | print(n) 5 | print('Blastoff!') 6 | -------------------------------------------------------------------------------- /getting_started_with_python/counting_loop.py: -------------------------------------------------------------------------------- 1 | count = 0 2 | print('Before', count) 3 | 4 | for thing in [9, 41, 12, 3, 74, 15]: 5 | count += 1 6 | # zork = zork + 1 7 | print(count, thing) 8 | 9 | print('After', count) 10 | -------------------------------------------------------------------------------- /getting_started_with_python/definite_loop.py: -------------------------------------------------------------------------------- 1 | for i in [5, 4, 3, 2, 1]: 2 | print i 3 | # Prints 5, 4, 3, 2, 1 using an iteration variable in a finite loop 4 | 5 | print 'Blastoff!' 
6 | 7 | 8 | -------------------------------------------------------------------------------- /getting_started_with_python/definite_loop_friends.py: -------------------------------------------------------------------------------- 1 | friends = ['Joseph', 'Glenn', 'Sally'] 2 | 3 | for friend in friends: 4 | print('Happy New Year', friend) 5 | 6 | print('Done!') 7 | 8 | # Iteration variable friend 9 | # Friend changes each time through the loop -------------------------------------------------------------------------------- /getting_started_with_python/filtering_boolean_loop.py: -------------------------------------------------------------------------------- 1 | found = False 2 | 3 | print('Before', found) 4 | 5 | for value in [9, 41, 12, 3, 74, 15]: 6 | if value == 3: 7 | found = True 8 | break 9 | print(found, value) 10 | 11 | print('After', found) -------------------------------------------------------------------------------- /getting_started_with_python/filtering_loop.py: -------------------------------------------------------------------------------- 1 | print('Before') 2 | 3 | for value in [9, 41, 12, 74, 15]: 4 | if value > 20: 5 | print('Large number', value) 6 | 7 | print('After') -------------------------------------------------------------------------------- /getting_started_with_python/for_loop_string.py: -------------------------------------------------------------------------------- 1 | word = 'banana' 2 | 3 | for letter in word: 4 | print(letter) 5 | -------------------------------------------------------------------------------- /getting_started_with_python/function.py: -------------------------------------------------------------------------------- 1 | # Function is some reusable code --> helps to reduce repeat codes 2 | # Takes arguments as input 3 | # Does computation then returns a result 4 | 5 | # def to define a function 6 | # function_name(argument) to invoke with argument 7 | 8 | def hello(): 9 | print('Hello!') 10 | name = input('Who are you? 
') 11 | print('Welcome ' + name + ".") 12 | 13 | hello() 14 | # Invoking function 15 | 16 | # Built-in function 17 | big = max('1, 2, 3') 18 | print(big) 19 | # Input: string '1, 2, 3' 20 | # Function: max() --> takes input to produce output 21 | # Result: '3' 22 | -------------------------------------------------------------------------------- /getting_started_with_python/in_operator.py: -------------------------------------------------------------------------------- 1 | fruit = 'baanana' 2 | 3 | if 'n' in fruit: 4 | print('Found it!') 5 | 6 | if "m" in fruit: 7 | print('Found it!') 8 | -------------------------------------------------------------------------------- /getting_started_with_python/indentation.py: -------------------------------------------------------------------------------- 1 | # Increase indent for "if" or "for" statement 2 | 3 | # Maintain indent to indicate scope (which lines affected by if/for statement) 4 | 5 | # Reduce indent to indicate end of the block (if/for statement) 6 | 7 | # Blank lines ignored 8 | 9 | # Comments ignored 10 | -------------------------------------------------------------------------------- /getting_started_with_python/indentation_example.py: -------------------------------------------------------------------------------- 1 | x = 5 2 | if x > 2: 3 | print 'Bigger than 2' 4 | print 'Still bigger' 5 | print 'Done with 2' 6 | 7 | for i in range(5): 8 | print i 9 | if i > 2: 10 | print 'Bigger than 2' 11 | print 'Done with i', i 12 | -------------------------------------------------------------------------------- /getting_started_with_python/infinite_loop.py: -------------------------------------------------------------------------------- 1 | n = 5 2 | while n > 0: 3 | print 'Lather' 4 | print 'Rinse' 5 | 6 | print 'Dry off!' 
7 | 8 | -------------------------------------------------------------------------------- /getting_started_with_python/input.py: -------------------------------------------------------------------------------- 1 | # This program says hello and asks for my name. 2 | 3 | print ('Hello World!') 4 | print('What is your name?') # ask for their name 5 | myName = input() 6 | 7 | print('It is good to meet you, ' + myName) 8 | 9 | print('The length of your name is:') 10 | print(len(myName)) 11 | 12 | print('What is your age?') # ask for their age 13 | myAge = input() 14 | 15 | print('You will be ' + str(int(myAge) + 1) + ' in a year.') 16 | 17 | -------------------------------------------------------------------------------- /getting_started_with_python/input_question_p2.py: -------------------------------------------------------------------------------- 1 | nam = raw_input('Who are you?') 2 | print 'Welcome', nam 3 | -------------------------------------------------------------------------------- /getting_started_with_python/input_question_p3.py: -------------------------------------------------------------------------------- 1 | nam = input('Who are you?') 2 | print('Welcome', nam) 3 | 4 | name = raw_input('Enter your name') 5 | print "Hello " + name 6 | -------------------------------------------------------------------------------- /getting_started_with_python/inside_string.py: -------------------------------------------------------------------------------- 1 | fruit = 'banana' 2 | # Character starts from 0 3 | letter = fruit[1] 4 | print letter 5 | 6 | n = 3 7 | letter_2 = fruit[n - 1] 8 | print letter_2 9 | 10 | print len(fruit) 11 | # Counts the number of characters 12 | # 6 13 | # len() is an in-built function -------------------------------------------------------------------------------- /getting_started_with_python/invalid_loop.py: -------------------------------------------------------------------------------- 1 | n = 0 2 | while n > 0: 3 | print 'Zero trip loop' 
4 | 5 | print 'Zero trip loop!' 6 | -------------------------------------------------------------------------------- /getting_started_with_python/largest_number_loop.py: -------------------------------------------------------------------------------- 1 | # Finding the largest value 2 | largest_so_far = -1 3 | print('Before', largest_so_far) 4 | 5 | for the_num in [9, 41, 12, 3, 74, 15]: 6 | if the_num > largest_so_far: 7 | largest_so_far = the_num 8 | print(largest_so_far, the_num) 9 | 10 | print('After', largest_so_far) -------------------------------------------------------------------------------- /getting_started_with_python/loop.py: -------------------------------------------------------------------------------- 1 | n = 5 2 | while n > 0: 3 | print n 4 | n -= 1 5 | # n = n - 1 6 | print 'Blastoff!' 7 | print n 8 | 9 | # Results 10 | # 5 --> print 5 then subtract 1 = 4 11 | # 4 --> print 4 then subtract 1 = 3 12 | # 3 13 | # 2 14 | # 1 --> print 1 then subtract 1 = 0 15 | # Blastoff 16 | # 0 --> result of last subtraction 17 | 18 | # Loops (repeated steps) have iteration variables, n 19 | # Iteration variable changes each time through a loop 20 | # Often go through a sequence of numbers 21 | -------------------------------------------------------------------------------- /getting_started_with_python/looping_strings.py: -------------------------------------------------------------------------------- 1 | fruit = 'banana' 2 | 3 | index = 0 4 | 5 | while index < len(fruit): 6 | letter = fruit[index] 7 | print(index, letter) 8 | index += 1 9 | 10 | print (" ") 11 | 12 | for letter in fruit: 13 | print(letter) 14 | -------------------------------------------------------------------------------- /getting_started_with_python/multiple_parameters.py: -------------------------------------------------------------------------------- 1 | def addtwo(a, b): 2 | added = a + b 3 | return added 4 | # a,b are parameters 5 | 6 | x = addtwo(3, 5) 7 | print(x) 8 | # 3,5 are arguments 
9 | 10 | -------------------------------------------------------------------------------- /getting_started_with_python/new_line.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/getting_started_with_python/new_line.py -------------------------------------------------------------------------------- /getting_started_with_python/overtime_pay_try_except.py: -------------------------------------------------------------------------------- 1 | # Employees get 1.5x the hourly rate for hours work above 40 hours. 2 | # Error message for non-number input. 3 | # One prompt then quit. No loop for this! 4 | 5 | # Concepts: if, elif, else, try, except, input and print 6 | 7 | 8 | hrs = raw_input("Enter Hours: ") 9 | hrs_int = float(hrs) 10 | 11 | hourly_rate = raw_input("Hourly Rate: ") 12 | hourly_rate_int = float(hourly_rate) 13 | overtime_multiplier = 1.5 14 | hourly_overtime = hourly_rate_int * overtime_multiplier 15 | 16 | if hrs_int <= 40: 17 | gross_pay = hrs_int * hourly_rate_int 18 | print gross_pay 19 | else: 20 | hrs_over = hrs_int - 40 21 | gross_pay_overtime = (40 * hourly_rate_int) + (hrs_over * hourly_overtime) 22 | print gross_pay_overtime 23 | 24 | -------------------------------------------------------------------------------- /getting_started_with_python/overtime_pay_try_except_function.py: -------------------------------------------------------------------------------- 1 | # Employees get 1.5x the hourly rate for hours work above 40 hours. 2 | # Error message for non-number input. 3 | # One prompt then quit. No loop for this! 
4 | 5 | # Concepts: if, elif, else, try, except, input, print, and function 6 | 7 | 8 | hourly_rate = raw_input("Hourly rate: ") 9 | hourly_rate_int = float(hourly_rate) 10 | 11 | hours = raw_input("Number of hours: ") 12 | hours_int = float(hours) 13 | 14 | 15 | def computepay(h, r): 16 | overtime_r = r * 1.5 17 | 18 | if h < 40: 19 | return h * r 20 | else: 21 | return (40 * r) + (h - 40) * overtime_r 22 | 23 | p = computepay(hours_int, hourly_rate_int) 24 | print p 25 | -------------------------------------------------------------------------------- /getting_started_with_python/parameters.py: -------------------------------------------------------------------------------- 1 | def greet(lang): 2 | if lang == 'es': 3 | print 'Hola' 4 | elif lang == 'fr': 5 | print 'Bonjour' 6 | else: 7 | print 'Hello' 8 | 9 | greet('') -------------------------------------------------------------------------------- /getting_started_with_python/parsing.py: -------------------------------------------------------------------------------- 1 | # Want to extract domain hotmail.com 2 | data = 'From ritchie_ng@hotmail.com Tues May 31' 3 | at_position = data.find('@') 4 | print(at_position) 5 | 6 | space_position = data.find(' ', at_position) 7 | # Starting from at_position, where's the next space 8 | print(space_position) 9 | 10 | host = data[at_position + 1: space_position] 11 | print(host) -------------------------------------------------------------------------------- /getting_started_with_python/parsing_2.py: -------------------------------------------------------------------------------- 1 | text = "X-DSPAM-Confidence: 0.8475"; 2 | 3 | find_0 = text.find('0') 4 | find_5 = text.find('5') 5 | 6 | number_extraction = float(text[find_0: find_5 + 1]) 7 | print(number_extraction) -------------------------------------------------------------------------------- /getting_started_with_python/return.py: -------------------------------------------------------------------------------- 1 | # 
Return Values 2 | # Function can return a value 3 | # If you want to do something in the function, use return. 4 | 5 | def greet(lang): 6 | # lang is the parameter 7 | if lang == 'es': 8 | return "Hola" 9 | elif lang == 'fr': 10 | return "Bonjour" 11 | else: 12 | return "Hello" 13 | 14 | print(greet('fr'), "Monster") 15 | # fr is the argument 16 | print(greet('es'), "Monster") 17 | # es is the argument 18 | print(greet('en'), "Ritchie") 19 | # en is the argument 20 | -------------------------------------------------------------------------------- /getting_started_with_python/slicing_string.py: -------------------------------------------------------------------------------- 1 | s = 'Monty Python' 2 | print(s[0: 4]) 3 | # Up to but not including 4 | 5 | print(s[6: 7]) 6 | 7 | print(s[6: 20]) 8 | # Only prints 6 to 11 9 | 10 | print(s[:2]) 11 | # Up to but not including 12 | 13 | print(s[8:]) 14 | # From 8 until end 15 | 16 | print(s[:]) 17 | # Prints all 18 | 19 | -------------------------------------------------------------------------------- /getting_started_with_python/smallest_number.py: -------------------------------------------------------------------------------- 1 | smallest = None 2 | # Seen nothing so far, so will wait. 
3 | 4 | print('Before', smallest) 5 | 6 | for value in [9, 41, 12, 3, 74, 15]: 7 | if smallest is None: 8 | smallest = value 9 | # This will happen the first time where smallest = 9 10 | # After the first time, it'll be permanently false 11 | elif value < smallest: 12 | smallest = value 13 | # This runs until the loop iterates through the array 14 | print(smallest, value) 15 | 16 | print('After', smallest) 17 | 18 | # Only use is when checking for None or False or True 19 | # Other times, use comparison operator == 20 | -------------------------------------------------------------------------------- /getting_started_with_python/string_library.py: -------------------------------------------------------------------------------- 1 | greet = ' Hello Bob ' 2 | 3 | zap = greet.lower() 4 | print(zap) 5 | # Converts to lowercase 6 | 7 | large = greet.upper() 8 | print(large) 9 | # Converts to uppercase 10 | 11 | what_type = type(greet) 12 | print(what_type) 13 | 14 | what_dir = greet.find('lo') 15 | print(what_dir) 16 | # Where is this in the string. 
17 | 18 | replace_name = greet.replace('Bob', 'Jane') 19 | print(replace_name) 20 | # It doesn't change the value of greet 21 | 22 | replace_letter = greet.replace('o', 'X') 23 | print(replace_letter) 24 | 25 | strip_left = greet.lstrip() 26 | strip_right = greet.rstrip() 27 | strip_all = greet.strip() 28 | 29 | print(strip_all) 30 | print(strip_left) 31 | print(strip_right) 32 | 33 | start_with = greet.startswith('Hello') 34 | print(start_with) 35 | -------------------------------------------------------------------------------- /getting_started_with_python/sum_loop.py: -------------------------------------------------------------------------------- 1 | zork = 0 2 | print('Before', zork) 3 | 4 | for thing in [9, 41, 12, 3, 74, 15]: 5 | zork += thing 6 | # zork = zork + thing 7 | print(zork, thing) 8 | 9 | print('After', zork) 10 | 11 | 12 | -------------------------------------------------------------------------------- /getting_started_with_python/test.py: -------------------------------------------------------------------------------- 1 | x = 'From marquard@uct.ac.za' 2 | find_at = x.find('@') 3 | print(find_at) 4 | find_dot = x.find('.') 5 | print(find_dot) 6 | 7 | uct_cut = x[find_at + 1: find_dot] 8 | print(uct_cut) 9 | 10 | print(len('banana')*7) 11 | 12 | data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008' 13 | pos = data.find('.') 14 | print(data[pos:pos+3]) -------------------------------------------------------------------------------- /getting_started_with_python/try_except.py: -------------------------------------------------------------------------------- 1 | # Try Fails 2 | first_string = input("What's your age? ") 3 | try: 4 | first_integer_string = int(first_string) 5 | # If try fails, except runs. 6 | # Keep as little code as possible so you know what's failing. 7 | except: 8 | first_integer_string = -1 9 | 10 | print('Your Age', first_integer_string) 11 | 12 | # Try Passes 13 | second_string = input("What's your age? 
") 14 | try: 15 | second_integer_string = int(second_string) 16 | # If try fails, except runs. 17 | except: 18 | second_integer_string = -1 19 | 20 | print('Your Age, Second Try', second_integer_string) 21 | 22 | # Proper Error Log 23 | third_strong = input("What's your age? ") 24 | try: 25 | third_integer_string = int(third_strong) 26 | except: 27 | third_integer_string = -1 28 | 29 | if third_integer_string > 0: 30 | print('Your age:', third_integer_string) 31 | else: 32 | print('Error: not a number') 33 | -------------------------------------------------------------------------------- /getting_started_with_python/try_except_loop_smallest_largest.py: -------------------------------------------------------------------------------- 1 | largest = None 2 | smallest = None 3 | 4 | while True: 5 | num = raw_input('Enter a number: ') 6 | 7 | # Handle edge cases 8 | if num == "done": 9 | break 10 | if len(num) < 1: 11 | break 12 | # Allows user to press enter to complete 13 | 14 | # Work is done 15 | try: 16 | num_int = float(num) 17 | # If try fails, except runs 18 | except: 19 | print "Invalid input" 20 | continue 21 | # Jumps to the start of the loop without running any code below this line 22 | 23 | if smallest is None: 24 | smallest = num 25 | # This will be permanently false after the first iteration. 26 | elif num < smallest: 27 | smallest = num 28 | # Replaces the iteration variable with smaller input num 29 | 30 | if largest is None: 31 | largest = num 32 | # This will be permanently false after the first iteration. 
33 | elif num > largest: 34 | largest = num 35 | # Replaces the iteration variable with larger input num 36 | 37 | print "Maximum is " + largest 38 | print "Minimum is " + smallest 39 | -------------------------------------------------------------------------------- /getting_started_with_python/try_except_loop_total_count_average.py: -------------------------------------------------------------------------------- 1 | count = 0 2 | total = 0 3 | 4 | while True: 5 | inp = raw_input('Enter a number: ') 6 | # Don't count here. Because you will count the last entry which is NaN 7 | 8 | # Handle the edge cases 9 | if inp == 'done': 10 | break 11 | # Breaks out of loop to print "Done!" 12 | if len(inp) < 1: 13 | break 14 | # Allows user to enter an empty line to print "Done!" 15 | 16 | # Do the work. 17 | try: 18 | num = float(inp) 19 | except: 20 | print "Invalid input" 21 | print " " 22 | continue 23 | # continue jumps up to loop without running the code below 24 | count += 1 25 | # count = count + 1 26 | total += num 27 | # total = total + num 28 | print num, total, count 29 | print " " 30 | 31 | print " " 32 | print "Results:" 33 | print "Count = ", count 34 | print "Total = ", total 35 | print "Average = ", total / count 36 | 37 | -------------------------------------------------------------------------------- /getting_started_with_python/while.py: -------------------------------------------------------------------------------- 1 | while True: 2 | print('Who are you?') 3 | name = input() 4 | if name != 'Joe': 5 | continue 6 | print('Hello Joe. 
What is the password?') 7 | password = input() 8 | if password == 'swordfish': 9 | break 10 | print('Access granted.') -------------------------------------------------------------------------------- /python_access_web_data/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_access_web_data/test.py -------------------------------------------------------------------------------- /python_access_web_data/w2_regular_expressions/assignment_parse_extract_regex.py: -------------------------------------------------------------------------------- 1 | # 1. Import regex 2 | # 2. Read file 3 | # 3. Create 4 | # 4. Look for integers re.findall('[0-9]+', line) 5 | # 5. Convert strings to integers 6 | # 6. Sum integers 7 | 8 | # 1. Import regex 9 | import re 10 | 11 | # 2. Read file 12 | fhandle = open('regex_sum_279460.txt') 13 | 14 | # 3. Create list 15 | numlist = list() 16 | 17 | # 4. Look for integers re.findall('[0-9]+', line) 18 | for line in fhandle: 19 | line = line.rstrip() 20 | # Create lists of numbers 21 | num = re.findall('[0-9]+', line) 22 | 23 | # print num 24 | # confirm that numbers are collated 25 | 26 | # print num 27 | # shows max 3 in a list 28 | 29 | # Skip blank lists 30 | if len(num) < 1: 31 | continue 32 | 33 | elif len(num) == 1: 34 | # 5. Convert strings to integers 35 | num1 = int(num[0]) 36 | numlist.append(num1) 37 | elif len(num) == 2: 38 | num1 = int(num[0]) 39 | num2 = int(num[1]) 40 | numlist.append(num1) 41 | numlist.append(num2) 42 | else: 43 | num1 = int(num[0]) 44 | num2 = int(num[1]) 45 | num3 = int(num[2]) 46 | numlist.append(num1) 47 | numlist.append(num2) 48 | numlist.append(num3) 49 | 50 | # 6. 
Sum integers in a list 51 | sum_num_int = sum(numlist) 52 | print len(numlist) 53 | print sum_num_int -------------------------------------------------------------------------------- /python_access_web_data/w2_regular_expressions/notes_regular_expressions.txt: -------------------------------------------------------------------------------- 1 | Regular Expression: regex or regexp 2 | - specialised in string matching 3 | - clever wild card expressions for matching and parsing strings 4 | 5 | 6 | Import regex 7 | import re 8 | re.search() 9 | re.findall() -------------------------------------------------------------------------------- /python_access_web_data/w2_regular_expressions/regex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_access_web_data/w2_regular_expressions/regex.png -------------------------------------------------------------------------------- /python_access_web_data/w2_regular_expressions/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | fhand = open('mbox-short.txt') 4 | 5 | # 1. search() 6 | for line in fhand: 7 | line = line.rstrip() 8 | # Find 'From:' at the beginning of line 9 | # ^: beginning of 10 | # True/false if you find or not 11 | if re.search('^From:', line): 12 | print line 13 | 14 | # 1. search() 15 | for line in fhand: 16 | line = line.rstrip() 17 | # * 0 or more digits, any number of times 18 | # . matches any character 19 | if re.search('^X.*:', line): 20 | print line 21 | 22 | # 1. search() 23 | for line in fhand: 24 | line = line.rstrip() 25 | # \S --> non-blank character 26 | # + --> one or more times 27 | # This basically searches for X- with non-blank characters up to : 28 | if re.search('^X-\S+:', line): 29 | print line 30 | 31 | 32 | # 2. 
findall() 33 | x = 'my 2 favourite numbers are 19 and 24' 34 | # findall: pull out to fill in list 35 | # [0-9] digits and 1 or more digit 36 | # search condition in x 37 | y = re.findall('[0-9]+', x) 38 | print y 39 | 40 | # Find all one or more vowels 41 | y = re.findall('[AEIOU]+', x) 42 | print y 43 | 44 | # 3. Greedy matching expanding out to the max : 45 | x = 'From: Using the : character' 46 | y = re.findall('^F.+:', x) 47 | print y 48 | 49 | # 4. Non-greedy matching to the nearest : 50 | x = 'From: Using the : character' 51 | y = re.findall('^F.+?:', x) 52 | print y 53 | 54 | # \S+ at least one non blank before and after @ 55 | # Greedy 56 | # ( ) gives you what you're looking for 57 | # \S --> non-blank character 58 | line = 'From stephen@u.nus.edu do not' 59 | a = re.findall('^From (\S+@\S+)', line) 60 | print a 61 | 62 | # Extracting only the domain 63 | line = 'From stephen@u.nus.edu do not' 64 | # find @ in line 65 | # ( ) gives you what you're looking for 66 | # [^ ] non blank 67 | # * 0 or more 68 | b = re.findall('@([^ ]*)', line) 69 | print b 70 | 71 | # ^From --> starts From 72 | # .* --> any character before @ 73 | # ( ) --> what you will get 74 | # [^ ]* --> non-blank characters as many as them 75 | c = re.findall('^From .*@([^ ]*)', line) 76 | print c 77 | 78 | # SPAM CONFIDENCE Example 79 | fhand = open('mbox-short.txt') 80 | numlist = list() 81 | for line in fhand: 82 | line = line.rstrip() 83 | stuff = re.findall('^X-DSPAM-Confidence: ([0-9.]+)', line) 84 | # skip those you don't find 85 | print stuff 86 | if len(stuff) != 1: 87 | continue 88 | num = float(stuff[0]) 89 | numlist.append(num) 90 | print 'Maximium:', max(numlist) -------------------------------------------------------------------------------- /python_access_web_data/w3_networks_sockets/http.txt: -------------------------------------------------------------------------------- 1 | HyperText Transport Protocol (HTTP) 2 | Set of rules to allow web browsers to retrieve web documents 
from 3 | servers around the world 4 | 5 | Internet Engineering Task Force (IETF) 6 | 7 | Uniform Resource Locator (URL) 8 | Protocol: http 9 | Host: www.ritchieng.com 10 | Document: /page1.html 11 | 12 | Request Response Cycle 13 | Click --> event in browser 14 | Browser makes connection to server 15 | Makes application GET request 16 | Server respond 17 | 18 | 1. GET 19 | 2. RETRIEVE 20 | 3. DISPLAY 21 | 22 | telnet command 23 | telnet www.dr-chuck.com 80 24 | GET http://www.dr-chuck.com/page1.htm HTTP/1.0 25 | PRESS ENTER TWICE 26 | RETRIEVES 27 | DISPLAY 28 | 29 | Summary 30 | Open a socket 31 | Send a GET 32 | Send a new line 33 | GET back the document 34 | Display the document 35 | DO IT OVER AND OVE -------------------------------------------------------------------------------- /python_access_web_data/w3_networks_sockets/networked_programs.txt: -------------------------------------------------------------------------------- 1 | Transport Control Protocol (TCP) 2 | Built on top of Internet Protocol (IP) 3 | 4 | 5 | TCP Connections or Sockets 6 | Internet or network socket is an endpoint of a bidirectional 7 | inter-process communication flow across an IP-based computer network, 8 | such as the Internet. 9 | 10 | Like a phone call between two applications 11 | 12 | fhandle --> port to look out to exchange data through a socket 13 | 14 | TCP Port Numbers 15 | Port is an application-specific or process-specific 16 | communications endpoint 17 | 18 | Allows multiple networked applications to coexist on same server 19 | 20 | List of well-known TCP port numbers 21 | HTTP 80 22 | HTTPS 443 23 | SSH 24 | IMAP 25 | POP 26 | DNS 27 | SMTP 25 28 | FTP 29 | 30 | Similar to extensions in phone numbers 31 | 32 | Example: 33 | 1. Domain name: www.ritchieng.com 34 | 2. IP Address: 199.10.0.10 35 | 2a. Email: port 25 36 | 2b. Login: port 23 37 | 2c. Web server: port 80 38 | 2d. 
# Fetch a document over a raw TCP socket (HTTP by hand) and print it.
import socket

HOST = 'www.pythonlearn.com'   # server to contact
PORT = 80                      # well-known HTTP port
# Old-style full-URL GET; two newlines terminate the request headers.
REQUEST = 'GET http://www.pythonlearn.com/code/intro-short.txt HTTP/1.0\n\n'


def main():
    """Open a socket, send one GET request, and print the response in chunks."""
    # AF_INET + SOCK_STREAM --> an internet (TCP) socket, like open() for the network
    mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Establish the connection (the "phone call") to host:port
        mysock.connect((HOST, PORT))
        # NOTE: Python 2 str; under Python 3 this would need bytes + sendall.
        mysock.send(REQUEST)

        # Read the response 512 bytes at a time until the server closes.
        while True:
            data = mysock.recv(512)
            if len(data) < 1:
                break
            print(data)
    finally:
        # Fix: close even if connect/send raises (resource leak in original).
        mysock.close()


if __name__ == '__main__':
    main()
# Follow a chain of links: fetch `url`, take the anchor tag at `position`
# (1-based), make its href the next url, and repeat `count` times.
import urllib


def pick_href(tags, position):
    """Return the href of the 1-based `position`-th tag in `tags`.

    Each element must expose .get(attr, default) — a BeautifulSoup Tag or a
    plain dict both work. Returns None when the tag has no href attribute;
    raises IndexError when `position` is out of range.
    """
    return tags[position - 1].get('href', None)


def main():
    # Imported lazily so pick_href stays importable without BeautifulSoup.
    from BeautifulSoup import BeautifulSoup

    url = raw_input('Enter URL to scrap: ')
    count = int(raw_input('Enter count: '))
    position = int(raw_input('Enter position: '))

    # Fix: the original copied every tag into tag_list and del'd it each
    # iteration (and printed the whole list) — the copy was redundant.
    for _ in range(count):
        print('Retrieving: %s' % url)
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        tags = soup('a')               # all anchor tags on the page
        url = pick_href(tags, position)

    print('Retrieving %s' % url)


if __name__ == '__main__':
    main()
# Prompt for a URL and print the href attribute of every anchor tag on it.
import urllib


def main():
    """Fetch one page and list all of its link targets."""
    # Imported lazily: third-party parser, only needed when actually running.
    from BeautifulSoup import BeautifulSoup

    url = raw_input('Enter - ')

    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)

    # Retrieve a list of anchor tags; each tag acts like a dictionary of
    # HTML attributes, so .get('href', None) is safe when href is missing.
    for tag in soup('a'):
        print(tag.get('href', None))


if __name__ == '__main__':
    # Fix: original ran on import; guard makes the module import-safe.
    main()
# Retrieve an XML document and sum the <count> values of its comment nodes.
import urllib
import xml.etree.ElementTree as ET


def comment_counts(xml_text):
    """De-serialize `xml_text` and return each comment's integer count.

    Grabs every 'comment' node anywhere in the tree (the './/' XPath) and
    converts its 'count' child's text to int. Returns a (possibly empty) list.
    """
    tree = ET.fromstring(xml_text)
    return [int(node.find('count').text) for node in tree.findall('.//comment')]


def main():
    # Prompt for the document's location.
    url = raw_input('Enter location: ')
    print('Retrieving %s' % url)

    # Read the raw XML over HTTP.
    data = urllib.urlopen(url).read()
    print('Retrieving %d characters' % len(data))

    # Sum of all comment counts.
    print(sum(comment_counts(data)))


if __name__ == '__main__':
    main()
5 | 6 | while True: 7 | address = raw_input('Enter location: ') 8 | if len(address) < 1 : break 9 | 10 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) 11 | print 'Retrieving', url 12 | uh = urllib.urlopen(url) 13 | data = uh.read() 14 | print 'Retrieved',len(data),'characters' 15 | print data 16 | tree = ET.fromstring(data) 17 | 18 | 19 | results = tree.findall('result') 20 | lat = results[0].find('geometry').find('location').find('lat').text 21 | lng = results[0].find('geometry').find('location').find('lng').text 22 | location = results[0].find('formatted_address').text 23 | 24 | print 'lat',lat,'lng',lng 25 | print location 26 | -------------------------------------------------------------------------------- /python_access_web_data/w5_retrieve_parse_xml/web_services.txt: -------------------------------------------------------------------------------- 1 | Web Services 2 | 3 | Sending data across the web 4 | Dictionary -->(serialize) Wire Protocol (XML and JSON) -->(de-serialize) Java Hashmap 5 | Serialize: 6 | taking internal structure and creating wire format (XML/JSON) - 7 | convert to common format 8 | De-serialize: 9 | taking wire format & creating internal structures in diff languages 10 | allows creating different applications that work in different languages 11 | convert from common format 12 | 13 | JSON: Javascript Object Notation 14 | XML: eXtensible Markup Language 15 | 16 | -------------------------------------------------------------------------------- /python_access_web_data/w5_retrieve_parse_xml/xml.txt: -------------------------------------------------------------------------------- 1 | XML (eXtensible Markup Language) 2 | View as a tree of nodes (elements and nodes) 3 | 4 | Elements 5 | Simple element: just tags including text 6 | Complex element: include other tags 7 | 8 | Tags 9 | Start/end tag 10 | Attributes 11 | Text Content 12 | Self closing tag 13 | Your tags are customised based on what is most useful 14 | 15 | 
# Parse a small XML document with ElementTree.
# NOTE(review): the XML string literal was garbled in this copy (angle
# brackets stripped); reconstructed from the find()/get() calls below —
# confirm against the original py4e xml1.py.
import xml.etree.ElementTree as ET

# ''' delimits the whole multi-line document
DATA = '''
<person>
  <name>Chuck</name>
  <phone type="intl">
    +1 734 303 4456
  </phone>
  <email hide="yes"/>
</person>'''


def parse_person(xml_text):
    """De-serialize `xml_text` and return its fields as a plain dict."""
    tree = ET.fromstring(xml_text)
    return {
        'name': tree.find('name').text,            # simple element text
        'hide': tree.find('email').get('hide'),    # attribute lookup
        'phone': tree.find('phone').text,          # raw text, keeps whitespace
        'type': tree.find('phone').get('type'),
    }


def main():
    person = parse_person(DATA)
    print('Name: %s' % person['name'])
    print('Attr: %s' % person['hide'])
    print('Phone: %s' % person['phone'])
    print('Phone type: %s' % person['type'])


if __name__ == '__main__':
    main()
# Retrieve a JSON document and sum the 'count' field of every comment entry.
import json
import urllib


def comment_counts(json_text):
    """Deserialize `json_text` and return each comment's count as an int.

    Expects the document to be an object with a 'comments' list whose items
    each carry a 'count'. Returns a (possibly empty) list of ints.
    """
    info = json.loads(json_text)
    return [int(entry['count']) for entry in info['comments']]


def main():
    # Prompt for the document's location.
    url = raw_input('Enter location: ')
    print('Retrieving %s' % url)

    # Read the raw JSON over HTTP.
    url_data = urllib.urlopen(url).read()
    print('Retrieved %d' % len(url_data))

    counts = comment_counts(url_data)
    print('Count: %d' % len(counts))
    print('Sum: %d' % sum(counts))


if __name__ == '__main__':
    main()
# Geocode locations via the Google Maps API; print lat/lng and the address.
import urllib
import json

SERVICE_URL = 'http://maps.googleapis.com/maps/api/geocode/json?'


def extract_location(js):
    """Return (lat, lng, formatted_address) from a parsed geocode response.

    `js` is the deserialized JSON dictionary; only the first result is used.
    Raises KeyError/IndexError if the response lacks the expected shape.
    """
    # Fix: lat was read via js["results"][0]["place_id"]["location"]["lat"]
    # — "place_id" is a string, so that always raised on a valid response.
    # Both coordinates live under the "geometry" dictionary.
    point = js["results"][0]["geometry"]["location"]
    address = js["results"][0]["formatted_address"]
    return point["lat"], point["lng"], address


def main():
    while True:
        address = raw_input('Enter location: ')
        if len(address) < 1:
            break

        # urlencode builds the query string from a dictionary.
        url = SERVICE_URL + urllib.urlencode({'sensor': 'false', 'address': address})
        print('Retrieving %s' % url)
        uh = urllib.urlopen(url)
        data = uh.read()
        print('Retrieved %d characters' % len(data))

        # Tolerate malformed JSON in the response.
        try:
            js = json.loads(str(data))
        except ValueError:
            js = None
        # Fix: original did `'status' not in js`, which raises TypeError
        # when js is None; check for None first.
        if js is None or js.get('status') != 'OK':
            print('==== Failure To Retrieve ====')
            print(data)
            continue

        # Pretty-print the whole response for inspection.
        print(json.dumps(js, indent=4))

        lat, lng, location = extract_location(js)
        print('lat %s lng %s' % (lat, lng))
        print(location)


if __name__ == '__main__':
    main()
# ACCESS POST DATA
# RUN THIS
import urllib
from twurl import augment

# Timeline endpoint; augment() OAuth-signs the request parameters.
TIMELINE_URL = 'https://api.twitter.com/1.1/statuses/user_timeline.json'

print('* Calling Twitter...')
signed_url = augment(TIMELINE_URL, {'screen_name': 'drchuck', 'count': '2'})
print(signed_url)

# Issue the GET and show the raw JSON body.
connection = urllib.urlopen(signed_url)
body = connection.read()
print(body)

# Show the response headers as a plain dictionary.
header_dict = connection.info().dict
print(header_dict)
# Keep this file separate
# https://apps.twitter.com/

def oauth():
    """Return this application's Twitter OAuth credentials as a dict.

    Placeholder values — fill in the real keys from the app's settings page.
    """
    credentials = {}
    credentials["consumer_key"] = "h7Lu...Ng"
    credentials["consumer_secret"] = "dNKenAC3New...mmn7Q"
    credentials["token_key"] = "10185562-eibxCp9n2...P4GEQQOSGI"
    credentials["token_secret"] = "H0ycCFemmC4wyf1...qoIpBo"
    return credentials
def test_me():
    """Smoke-test OAuth signing against the live Twitter API.

    Builds a signed URL for drchuck's two most recent posts, fetches it,
    and dumps the raw JSON body plus the response headers. Requires valid
    credentials in hidden.py and network access; prints only, returns None.
    """
    print '* Calling Twitter...'
    # Count: # of post to load
    url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json',
        {'screen_name': 'drchuck', 'count': '2'} )
    # The augmented URL carries the OAuth signature as query parameters.
    print url
    connection = urllib.urlopen(url)

    # Gets the body (JSON text)
    data = connection.read()
    print data

    # Gets dictionary of headers (includes rate-limit info)
    headers = connection.info().dict
    print headers
# Count mail messages per hour of day from an mbox file.
# An envelope line looks like:
#   From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008
# so the timestamp is the 6th word and the hour precedes the first ':'.


def hour_counts(lines):
    """Return {hour_string: message_count} for 'From ' envelope lines.

    `lines` is any iterable of strings (an open file works). 'From:' header
    lines and blank lines are ignored.
    """
    counts = dict()
    for line in lines:
        words = line.split()
        # Guardian pattern: fix — the original only checked for empty lines,
        # so a short 'From' line crashed on words[5] (index out of range).
        if len(words) < 6:
            continue
        # Exclude everything but envelope 'From' lines ('From:' differs).
        if words[0] != 'From':
            continue
        # words[5] is 'HH:MM:SS'; keep only the hour.
        hour = words[5].split(':')[0]
        counts[hour] = counts.get(hour, 0) + 1
    return counts


def main():
    fhandle = open('mbox-short.txt')
    counts = hour_counts(fhandle)
    fhandle.close()   # fix: original leaked the file handle

    # Print (hour, count) pairs sorted by hour, ascending.
    for hour, count in sorted(counts.items()):
        print('%s %s' % (hour, count))


if __name__ == '__main__':
    main()
# Average the X-DSPAM-Confidence values found in an email file.

# The header prefix we look for; everything after it is the number.
PREFIX = 'X-DSPAM-Confidence:'


def spam_confidences(lines):
    """Return the float after every 'X-DSPAM-Confidence:' line in `lines`.

    `lines` is any iterable of strings (an open file works). Lines without
    the prefix are skipped; returns a (possibly empty) list of floats.
    """
    values = []
    for line in lines:
        if line.startswith(PREFIX):
            # Fix: slice by len(PREFIX) instead of a hard-coded 19
            # (the original computed len into unused locals x/y).
            values.append(float(line[len(PREFIX):]))
    return values


def main():
    fname = raw_input('Enter file name: ')
    fhandle = open(fname)
    values = spam_confidences(fhandle)
    fhandle.close()

    count = len(values)
    total = sum(values)

    print('Number of numbers: %d' % count)
    print('Sum of numbers: %s' % total)
    # Fix: a file with no matching headers used to raise ZeroDivisionError.
    if count > 0:
        print('Average of numbers: %s' % (total / count))
    else:
        print('No X-DSPAM-Confidence headers found')


if __name__ == '__main__':
    main()
# assignment8.5_open_read_split.py
# Count the lines whose first word is 'From' and show each sender.

def count_from_lines(fhandle, show=False):
    """Return the number of lines in *fhandle* whose first word is 'From'.

    fhandle: an open text file or any iterable of lines.
    show: when True, print the second word (the sender address) of each
          matching line, mirroring the original script's output.
    """
    count = 0
    for line in fhandle:
        words = line.rstrip().split()
        # Guardian pattern: blank lines have no words[0] to inspect.
        if not words:
            continue
        if words[0] != 'From':
            continue
        count += 1
        if show:
            print(words[1])
    return count


if __name__ == '__main__':
    fname = input('Enter file name: ')  # raw_input() in the Python 2 original
    with open(fname) as fhandle:
        count = count_from_lines(fhandle, show=True)
    print("There were", count, "lines in the file with From as the first word")
# average_list.py
# Collect numbers until the user types 'done', then report the average.
# Stores every value in a list first, so it uses more memory than a
# running-total approach.

def average_numbers(numlist):
    """Return the arithmetic mean of *numlist*, or None when it is empty.

    The original crashed with ZeroDivisionError when 'done' was the
    first input; returning None makes the empty case explicit.
    """
    if not numlist:
        return None
    return sum(numlist) / len(numlist)


if __name__ == '__main__':
    numlist = []
    while True:
        inp = input('Enter a number: ')  # raw_input() in Python 2
        if inp == 'done':
            break
        numlist.append(float(inp))
    print('Average:', average_numbers(numlist))
# dictionaries.py
# Walkthrough of dictionary basics: key-value storage, the counting
# idiom, the get() default trick, and the common iteration patterns.

# Dictionaries store key-value pairs.
# They are like list() except that they use keys instead of positions
# to look up values.
purse = dict()

# keys: money and candy
purse['money'] = 12
purse['candy'] = 3
print(purse)

# Dictionaries are handy for tallies: bump an existing count in place.
purse['candy'] += 2
print(purse)
print(purse['candy'])

# Membership test checks KEYS, not values.
print('candy' in purse)  # True

# Literal syntax with curly braces.
bag = {'chuck': 1, 'ritchie': 0}  # fixed the stray ':' inside the key
print(bag)

# Common pattern of counting occurrences of names.
counts = dict()
names = ['csev', 'qudas', 'john', 'john']
for name in names:
    if name not in counts:
        counts[name] = 1
    else:
        counts[name] += 1
print(counts)

# get() idiom: the second argument is the value returned when the key
# is missing, so the if/else above collapses to a single line.
#   no entry: start from 0
#   entry:    existing count + 1
for name in names:
    counts[name] = counts.get(name, 0) + 1
print(counts)

# Iterating a dict yields its keys.
for key in counts:
    print(key, counts[key])

# In Python 3 keys()/values()/items() return view objects; wrap them in
# list() to display a real list like Python 2 did.
print(list(counts.keys()))
print(list(counts.values()))
print(list(counts.items()))  # list of (key, value) tuples

# Two iteration variables unpack each (key, value) pair.
for a, b in counts.items():
    print(a, b)
# list_methods.py
# Tour of basic list operations: construction, append, membership
# tests, in-place sorting, and the aggregate built-ins.

# Constructor gives an empty list; [] is equivalent.
x = list()
print(type(x))

# dir() lists the methods a list object offers.
print(dir(x))

# Appending (lists may hold mixed types)
x.append('book')
x.append(99)
print(x)

# Is something in the list?
if 99 in x:
    print('99 in x')

if 88 not in x:
    print('88 not in x')

# Ordering a list
a = [3, 1, 2, 5, 4]

# BUG FIX: sort() sorts in place and returns None, so the original
# `print a.sort()` printed None.  Sort first, then print the list
# (or use sorted(a) to get a new sorted list).
a.sort()
print(a)

print(len(a))
print(max(a))
print(min(a))
print(sum(a))
# range.py
# range(n) counts 0 .. n-1.  In Python 3 it is a lazy object, so wrap
# it in list() to display the numbers the way Python 2 printed them.

print(list(range(4)))

friends = ['Ritchie', 'John', 'Sally']
length_friends = len(friends)

print(list(range(length_friends)))

# Preferred: iterate the items directly.
for friend in friends:
    print('Happy New Year!', friend)

# Equivalent loop for when you also need the index; enumerate() is the
# idiomatic replacement for indexing with range(len(...)).
for i, friend in enumerate(friends):
    print('Happy New Year!', friend)
# searching_continue_strip.py
# Print only the 'From:' lines of a mailbox, stripped of the trailing
# newline (rstrip avoids the doubled blank lines print would add).

def print_from_lines(fhandle):
    """Print each line of *fhandle* that starts with 'From:'.

    fhandle: an open text file or any iterable of lines.
    Returns the matching (stripped) lines so callers can inspect them.
    """
    matches = []
    for line in fhandle:
        line = line.rstrip()
        # Skip all other lines.
        if not line.startswith('From:'):
            continue
        # Process the lines we want.
        print(line)
        matches.append(line)
    return matches


if __name__ == '__main__':
    with open('mbox.txt') as fhand:
        print_from_lines(fhand)
# test.py
# Scratch script: build a small dictionary and print it.
stuff = dict()
stuff['money'] = 1
stuff['apples'] = 0
stuff['google'] = 2
print(stuff)  # print() function: Python 2's print statement is a syntax error in Python 3
# tuples_worked_exercise.py
# Print the most common words in romeo.txt, most frequent first.

def most_common_words(fhandle, limit=10):
    """Return up to *limit* (count, word) tuples, highest count first.

    fhandle: an open text file or any iterable of lines.
    limit: how many entries to return; generalizes the hard-coded
           top-10 of the original (default keeps the old behavior).
    """
    counts = dict()
    for line in fhandle:
        for word in line.split():
            # get() returns 0 for a new word, the running count otherwise.
            counts[word] = counts.get(word, 0) + 1

    # Flip each pair to (count, word) so sorting orders by frequency;
    # ties fall back to reverse-alphabetical word order, as before.
    lst = [(val, key) for key, val in counts.items()]
    lst.sort(reverse=True)
    return lst[:limit]


if __name__ == '__main__':
    with open('romeo.txt') as fhand:
        for val, key in most_common_words(fhand):
            print(key, val)
# inheritance.py
# PartyAnimal is the parent class; FootballFan extends it, inheriting
# all of its capabilities and adding a points tally.

class PartyAnimal:
    """A named object that counts how many times it parties."""

    # Instance variables (each object gets its own copy once assigned).
    x = 0
    name = ''

    def __init__(self, nam):
        # Constructor: self is the instance under construction,
        # nam seeds its name.
        self.name = nam
        print(self.name, 'constructed')

    def party(self):
        self.x += 1
        print(self.name, 'party count', self.x)


class FootballFan(PartyAnimal):
    """Extends PartyAnimal: everything above plus a points counter."""

    points = 0

    def touchdown(self):
        self.points += 7
        self.party()  # a touchdown also counts as a party
        print(self.name, 'points', self.points)


if __name__ == '__main__':
    # s.party() is the short form of PartyAnimal.party(s).
    s = PartyAnimal('Sally')
    s.party()

    j = FootballFan('Jim')
    j.party()
    j.touchdown()
# object_lifecycle.py
# First a one-variable class with constructor and destructor hooks,
# then a two-variable class showing that each instance keeps its own
# copy of the instance variables.

class PartyAnimal:
    """One counter plus the special lifecycle methods."""

    x = 0

    def __init__(self):
        # Constructor: runs when the object is created.
        print("I am constructed")

    def party(self):
        self.x = self.x + 1
        print('So far', self.x)

    def __del__(self):
        # Destructor: runs when the object is discarded (rare in practice).
        print('I am destructed', self.x)


class PartyAnimal2:
    """Adds a name so each instance labels its own output."""

    x = 0      # 1st instance variable
    name = ''  # 2nd instance variable

    def __init__(self, nam):
        # self: alias of the instance we're in; nam: name parameter.
        self.name = nam
        print(self.name, 'constructed')

    def party(self):
        self.x = self.x + 1
        print(self.name, 'party count', self.x)


if __name__ == '__main__':
    an = PartyAnimal()
    an.party()
    an.party()
    an.party()

    # Each instance has its own x and name.
    s = PartyAnimal2('Sally')
    s.party()  # short form of PartyAnimal2.party(s)
    j = PartyAnimal2('Jim')
    j.party()
    s.party()
# oop.py
# A class bundles data (x) with code (party); calling a method through
# an instance mutates that instance's data.

class PartyAnimal:
    """Template for PartyAnimal objects: one counter and one method."""

    x = 0  # each object starts its counter at 0

    def party(self):
        # Every method receives the instance as its first parameter;
        # an.party() is the short form of PartyAnimal.party(an).
        self.x = self.x + 1
        print('So far', self.x)


if __name__ == '__main__':
    an = PartyAnimal()
    print(type(an))
    print(dir(an))

    an.party()  # x: 0 -> 1
    an.party()  # x: 1 -> 2
    an.party()  # x: 2 -> 3

    # list() is a built-in class with its own methods (append, pop, ...).
    a = list()
    type(a)
    print(dir(a))
# assignment_count_email.py
# Tally email domains from 'From: ' lines into an SQLite Counts table
# and print them, highest count first.

import sqlite3


def count_domains(fhandle, connection):
    """Scan *fhandle* for 'From: ' lines and upsert per-domain counts.

    fhandle: iterable of mailbox text lines.
    connection: open sqlite3 connection; the Counts(org, count) table
    is dropped and recreated here.
    Returns the (org, count) rows ordered by count descending.
    """
    cur = connection.cursor()
    cur.execute('DROP TABLE IF EXISTS Counts')
    cur.execute('CREATE TABLE Counts(org TEXT, count INTEGER)')

    for line in fhandle:
        if not line.startswith('From: '):
            continue
        # Double-split pattern: 'From: user@host ...' -> 'user@host' -> 'host'
        org = line.split()[1].split('@')[1]

        # ? placeholders keep the SQL safe from injection.
        cur.execute('SELECT count FROM Counts WHERE org = ?', (org,))
        row = cur.fetchone()
        # Same shape as dict.get(): insert at 1 when missing, else +1.
        if row is None:
            cur.execute('INSERT INTO Counts (org, count) VALUES (?, 1)',
                        (org,))
        else:
            cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?',
                        (org,))

    # Commit once after the loop.
    connection.commit()

    rows = cur.execute(
        'SELECT org, count FROM Counts ORDER BY count DESC').fetchall()
    cur.close()
    return rows


if __name__ == '__main__':
    connect_db = sqlite3.connect('domain_db.sqlite')
    fname = input('File name: ')  # raw_input() in the Python 2 original
    with open(fname) as fhandle:
        for org, count in count_domains(fhandle, connect_db):
            print(org, count)
    connect_db.close()
# emaildb.py
# Count messages per sender email address in an SQLite table, then
# report the top senders.  Inspect emaildb.sqlite with SQLite Browser.

import sqlite3


def count_senders(fhandle, connection, limit=10):
    """Tally 'From: ' sender addresses from *fhandle* into Counts.

    fhandle: iterable of mailbox text lines.
    connection: open sqlite3 connection; the Counts(email, count) table
    is dropped and recreated here.
    limit: how many top rows to return (default 10, as in the original).
    Returns (email, count) rows ordered by count descending.
    """
    cur = connection.cursor()
    cur.execute('DROP TABLE IF EXISTS Counts')
    cur.execute('CREATE TABLE Counts (email TEXT, count INTEGER)')

    for line in fhandle:
        if not line.startswith('From: '):
            continue
        email = line.split()[1]
        print(email)

        # First param: ? is a placeholder; second param: a tuple whose
        # items are substituted for the placeholders.
        cur.execute('SELECT count FROM Counts WHERE email = ?', (email,))
        # fetchone() returns the next result row, or None when absent.
        row = cur.fetchone()
        if row is None:
            cur.execute('INSERT INTO Counts (email, count) VALUES (?, 1)',
                        (email,))
        else:
            cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?',
                        (email,))

    # Single commit after the loop: the original committed on every
    # iteration, which its own comment noted makes the program slower.
    connection.commit()

    # DESC puts the largest counts first; LIMIT caps the result set.
    sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT ?'
    rows = cur.execute(sqlstr, (limit,)).fetchall()
    cur.close()
    return rows


if __name__ == '__main__':
    conn = sqlite3.connect('emaildb.sqlite')
    fname = input('Enter file name: ')  # raw_input() in Python 2
    if len(fname) < 1:
        fname = 'mbox-short.txt'
    with open(fname) as fh:
        rows = count_senders(fh, conn)
    print("Counts:")
    for email, count in rows:
        print(email, count)
    conn.close()
https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w2_sql/emaildb.sqlite -------------------------------------------------------------------------------- /python_databases/w2_sql/sql1.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w2_sql/sql1.db -------------------------------------------------------------------------------- /python_databases/w2_sql/sqlitebrowser-3.8.0v5.dmg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w2_sql/sqlitebrowser-3.8.0v5.dmg -------------------------------------------------------------------------------- /python_databases/w2_sql/using_db.txt: -------------------------------------------------------------------------------- 1 | USING DATABASES 2 | 3 | Roles in Large Projects 4 | Application developer: builds logic, looks, and monitors 5 | Database Administrator: design, implementation etc. 6 | 7 | Data Analysis Structure 8 | 1. Read & clean input data (Python) 9 | 2. CUD data (SQL) 10 | 3. Read data (SQL) 11 | 4. 
Output files (R/Excel/D3.js) (Python) 12 | 13 | Database Model or Database Schema 14 | Structure or format of a database 15 | Application of a data model in conjunction with a database management system 16 | 17 | Common Database Systems 18 | Oracle: large, commercial, enterprise-scale, very very tweakable 19 | MySql (open-source): simpler but fast and scalable - commercial open-source 20 | Use for online website 21 | SqlServer: from Microsoft (Access) 22 | Postgress (open-source): imitation of Oracle 23 | 24 | SQLite: Embedded database (built in Python using import) 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /python_databases/w3_data_models/asssignment_musicdb.py: -------------------------------------------------------------------------------- 1 | # 1. Import ElementTree and SQLite libraries 2 | import xml.etree.ElementTree as ET 3 | import sqlite3 4 | 5 | # 2. Call .connect() method to create connection object 6 | connect_db = sqlite3.connect('trackdb_assign.sqlite') 7 | 8 | # 3. Create cursor object 9 | cursor_db = connect_db.cursor() 10 | 11 | # 4. Create Tables 12 | # Artist 13 | # Genre 14 | # Album 15 | # Track 16 | 17 | # Delete table if it exists 18 | cursor_db.execute(''' 19 | DROP TABLE IF EXISTS Artist''') 20 | 21 | # 4a. Create ARTIST TABLE 22 | cursor_db.execute(''' 23 | CREATE TABLE IF NOT EXISTS Artist ( 24 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 25 | name TEXT UNIQUE 26 | ) 27 | ''') 28 | 29 | # Delete table if it exists 30 | cursor_db.execute(''' 31 | DROP TABLE IF EXISTS Genre''') 32 | 33 | # 4b. Create GENRE TABLE 34 | cursor_db.execute(''' 35 | CREATE TABLE IF NOT EXISTS Genre ( 36 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 37 | name TEXT UNIQUE 38 | ) 39 | ''') 40 | 41 | # Delete table if it exists 42 | cursor_db.execute(''' 43 | DROP TABLE IF EXISTS Album''') 44 | 45 | # 4c. 
Create ALBUM TABLE 46 | cursor_db.execute(''' 47 | CREATE TABLE IF NOT EXISTS Album ( 48 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 49 | artist_id INTEGER, 50 | title TEXT UNIQUE 51 | ) 52 | ''') 53 | 54 | # Delete table if it exists 55 | cursor_db.execute(''' 56 | DROP TABLE IF EXISTS Track''') 57 | 58 | # 4d. TRACK TABLE 59 | cursor_db.execute(''' 60 | CREATE TABLE IF NOT EXISTS Track ( 61 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 62 | title TEXT UNIQUE, 63 | album_id INTEGER, 64 | genre_id INTEGER, 65 | len INTEGER, 66 | rating INTEGER, 67 | count INTEGER 68 | ) 69 | ''') 70 | 71 | # 5 Request file name 72 | fname = raw_input('File name: ') 73 | # 5a. Error handling 74 | if len(fname) < 1: 75 | fname = 'Library.xml' 76 | 77 | # 6. Parse XML using ElementTree 78 | stuff = ET.parse(fname) 79 | 80 | # 7. Find all keys and text 81 | # Dictionary nested 3x 82 | all = stuff.findall('dict/dict/dict') 83 | # print 'Dictionary count:', len(all) 84 | 85 | # 8. Define lookup method to seek stuff you want 86 | def lookup(d, key): 87 | found = False 88 | for child in d: 89 | if found: 90 | return child.text 91 | if child.tag == 'key' and child.text == key: 92 | found = True 93 | return None 94 | 95 | # 9. Loop through data to find necessary information 96 | # Name 97 | # Artist 98 | # Album 99 | # Play Count 100 | # Rating 101 | # Total Time 102 | for entry in all: 103 | if lookup(entry, 'Track ID') is None: 104 | continue 105 | 106 | name = lookup(entry, 'Name') 107 | artist = lookup(entry, 'Artist') 108 | album = lookup(entry, 'Album') 109 | count = lookup(entry, 'Play Count') 110 | rating = lookup(entry, 'Rating') 111 | length = lookup(entry, 'Total Time') 112 | genre = lookup(entry, 'Genre') 113 | 114 | if name is None or artist is None or album is None or genre is None: 115 | continue 116 | 117 | # print name, artist, album, genre, count, rating, length 118 | 119 | # 10. Idioms to place data into database 120 | 121 | # 10a. 
Artist Idiom 122 | # If artist don't exist, insert 123 | # If exist, ignore 124 | cursor_db.execute('''INSERT OR IGNORE INTO Artist (name) 125 | VALUES (?)''', (artist,)) 126 | # Retrieve Data 127 | cursor_db.execute('SELECT id FROM Artist WHERE name = ?', (artist,)) 128 | # Call fetchone() method to query db 129 | artist_id = cursor_db.fetchone()[0] 130 | 131 | # 10b. Genre Idiom 132 | # If genre don't exist, insert 133 | # If exist, ignore 134 | cursor_db.execute('''INSERT OR IGNORE INTO Genre (name) 135 | VALUES (?)''', (genre,)) 136 | # Retrieve Data 137 | cursor_db.execute('SELECT id FROM Genre WHERE name = ?', (genre,)) 138 | # Call fetchone() method to query db 139 | genre_id = cursor_db.fetchone()[0] 140 | 141 | # 10c. ALBUM IDIOM 142 | # Album has foreign key to artist 143 | cursor_db.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 144 | VALUES (?, ?)''', (album, artist_id)) 145 | cursor_db.execute('SELECT id FROM Album WHERE title = ?', (album,)) 146 | album_id = cursor_db.fetchone()[0] 147 | 148 | # 10d. 
TRACK IDIOM 149 | cursor_db.execute('''INSERT OR REPLACE INTO Track 150 | (title, album_id, genre_id, len, rating, count) 151 | VALUES (?, ?, ?, ?, ?, ?)''', 152 | (name, album_id, genre_id, length, rating, count)) 153 | 154 | # Commit 155 | connect_db.commit() 156 | 157 | cursor_db.execute('''SELECT Track.title, Artist.name, Album.title, Genre.name 158 | FROM Track 159 | JOIN Genre JOIN Album JOIN Artist 160 | ON Track.genre_id = Genre.ID AND Track.album_id = Album.id 161 | AND Album.artist_id = Artist.id 162 | ORDER BY Artist.name LIMIT 3''') 163 | 164 | -------------------------------------------------------------------------------- /python_databases/w3_data_models/data_model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w3_data_models/data_model_table.png -------------------------------------------------------------------------------- /python_databases/w3_data_models/data_models.txt: -------------------------------------------------------------------------------- 1 | Schema (contract) 2 | Many tables 3 | Some columns related to other tables 4 | 5 | Building Data Model 6 | Don't put same string twice (use relationship) 7 | 8 | For each piece of info 9 | Is the column an object or attribute of another object? 
10 | 11 | Once we define objects, we need to define the relationships 12 | between objects 13 | 14 | Data Modelling of Music Management 15 | Track 16 | Time 17 | Rating 18 | Count 19 | Album 20 | Artist 21 | Genre 22 | 23 | Start with most essential function to determine which 24 | tables to build 25 | Music track management --> start with Tracks 26 | 27 | Move to which are attributes 28 | Time --> part of Tracks 29 | Rating --> part of Tracks 30 | Count --> part of Tracks 31 | 32 | 33 | Move to which are attributes of objects 34 | Album TABLE --> Tracks belong to Album 35 | Artist TABLE --> Album belong to Artist 36 | Genre Table --> Tracks belong to Genre 37 | 38 | Primary key: id (one key for every row to point to other tables) 39 | Logical key: might use this in a WHERE or ORDER BY clause 40 | Foreign key 41 | 42 | WORK FROM OUTWARD IN --> LEAVES TO BRANCH 43 | 1. Artist 44 | 2. Album 45 | 3. Genre 46 | 4. Tracks 47 | -------------------------------------------------------------------------------- /python_databases/w3_data_models/join.txt: -------------------------------------------------------------------------------- 1 | JOIN operation 2 | links across several tables as part of a select operation 3 | 4 | tell JOIN to use the keys that make the connection 5 | between tables using an ON clause 6 | 7 | # JOIN Album & Artist 8 | SELECT Album.title, Artist.name 9 | FROM Album 10 | JOIN Artist 11 | ON Album.artist_id = Artist.id 12 | # SELECT Album.title, Artist.name --> what we want to see 13 | # Artist.name FROM Album JOIN Artist --> join 2 tables (title and name columns) 14 | # ON Album.artist_id = Artist.id --> how tables are linked 15 | # starting arrow --> Album.artist_id (foreign key) 16 | # ending arrow --> Artist.id (primary key) 17 | 18 | # Show source of connection 19 | SELECT Album.title, Album.artist_id, Artist.id, Artist.name 20 | FROM Album 21 | JOIN Artist 22 | ON Album.artist_id = Artist.id 23 | 24 | # JOIN Track & Genre 25 | # Canonical syntax 26 
| SELECT Track.title, Genre.name 27 | FROM Track 28 | JOIN Genre 29 | ON Track.genre_id = Genre.id 30 | 31 | # If no ON clause, there will be combinations 32 | SELECT Track.title, Track.genre_id, Genre.name, Genre.id 33 | FROM Track 34 | JOIN Genre 35 | ON Track.genre_id = Genre.id 36 | 37 | # JOIN --> all possible matches amongst keys 38 | # ON --> pick keys match (foreign and primary key) 39 | 40 | # More complex JOINs 41 | SELECT Track.title, Artist.name, Album.title, Genre.name 42 | FROM Track 43 | JOIN Artist JOIN Album JOIN Genre 44 | ON Track.genre_id = Genre.id and Track.album_id = Album.id and Album.artist_id = Artist.id -------------------------------------------------------------------------------- /python_databases/w3_data_models/music_mgt.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w3_data_models/music_mgt.db -------------------------------------------------------------------------------- /python_databases/w3_data_models/trackdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w3_data_models/trackdb.sqlite -------------------------------------------------------------------------------- /python_databases/w3_data_models/trackdb_assign.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w3_data_models/trackdb_assign.sqlite -------------------------------------------------------------------------------- /python_databases/w3_data_models/xml_extract_sql.py: -------------------------------------------------------------------------------- 1 | # Import ET for parsing XML 2 | import 
xml.etree.ElementTree as ET 3 | import sqlite3 4 | 5 | # Call .connect() method to create connection object 6 | connect_db = sqlite3.connect('trackdb.sqlite') 7 | 8 | # Create cursor object 9 | cursor_db = connect_db.cursor() 10 | 11 | # Make tables if they do not exist 12 | 13 | # Create ARTIST TABLE 14 | cursor_db.execute(''' 15 | CREATE TABLE IF NOT EXISTS Artist ( 16 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 17 | name TEXT UNIQUE 18 | ) 19 | ''') 20 | 21 | # Create ALBUM TABLE 22 | cursor_db.execute(''' 23 | CREATE TABLE IF NOT EXISTS Album ( 24 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 25 | artist_id INTEGER, 26 | title TEXT UNIQUE 27 | ) 28 | ''') 29 | 30 | # TRACK TABLE 31 | cursor_db.execute(''' 32 | CREATE TABLE IF NOT EXISTS Track ( 33 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 34 | title TEXT UNIQUE, 35 | album_id INTEGER, 36 | len INTEGER, 37 | rating INTEGER, 38 | count INTEGER 39 | ) 40 | ''') 41 | 42 | # Request file name 43 | fname = raw_input('Enter file name: ') 44 | if len(fname) < 1: 45 | fname = 'Library.xml' 46 | 47 | 48 | # Parse XML using ET 49 | stuff = ET.parse(fname) 50 | # findall() returns a list of matching Elements 51 | # find() efficiently returns only the first match 52 | # findtext() returns the .text content of the first match 53 | all = stuff.findall('dict/dict/dict') 54 | print 'Dict count:', len(all) 55 | 56 | # Define lookup method 57 | def lookup(d, key): 58 | found = False 59 | for child in d: 60 | if found: 61 | return child.text 62 | if child.tag == 'key' and child.text == key: 63 | found = True 64 | return None 65 | 66 | for entry in all: 67 | if lookup(entry, 'Track ID') is None: 68 | continue 69 | 70 | name = lookup(entry, 'Name') 71 | artist = lookup(entry, 'Artist') 72 | album = lookup(entry, 'Album') 73 | count = lookup(entry, 'Play Count') 74 | rating = lookup(entry, 'Rating') 75 | length = lookup(entry, 'Total Time') 76 | 77 | if name is None or artist is None or album is 
None: 78 | continue 79 | 80 | print name, artist, album, count, rating, length 81 | 82 | # ARTIST IDIOM 83 | # If artist don't exist, insert 84 | # If exist, ignore 85 | cursor_db.execute('''INSERT OR IGNORE INTO Artist (name) 86 | VALUES (?)''', (artist, )) 87 | # Retrieve Data 88 | cursor_db.execute('SELECT id FROM Artist WHERE name = ?', (artist, )) 89 | # Call fetchone() method to query db 90 | artist_id = cursor_db.fetchone()[0] 91 | 92 | # ALBUM IDIOM 93 | # Album has foreign key to artist 94 | cursor_db.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 95 | VALUES (?, ?)''', (album, artist_id)) 96 | cursor_db.execute('SELECT id FROM Album WHERE title = ?', (album, )) 97 | album_id = cursor_db.fetchone()[0] 98 | 99 | # TRACK IDIOM 100 | cursor_db.execute('''INSERT OR REPLACE INTO Track 101 | (title, album_id, len, rating, count) 102 | VALUES (?, ?, ?, ?, ?)''', 103 | (name, album_id, length, rating, count)) 104 | 105 | # Commit 106 | connect_db.commit() -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/assignment_roster_role.py: -------------------------------------------------------------------------------- 1 | # 1. Import JSON and SQLite libraries 2 | import json 3 | import sqlite3 4 | 5 | # 2. Call .connect() method to create connection object 6 | connect_db = sqlite3.connect('roster_role_db.sqlite') 7 | 8 | # 3. Create cursor object to send commands 9 | cursor_db = connect_db.cursor() 10 | 11 | # 4. 
Create tables 12 | # Using multiple SQL commands using .executescript() 13 | # Connector table: Member 14 | # Member: 2 foreign keys, 1 composite primary key (concatenated) 15 | 16 | cursor_db.executescript(''' 17 | DROP TABLE IF EXISTS User; 18 | DROP TABLE IF EXISTS Member; 19 | DROP TABLE IF EXISTS Course; 20 | 21 | CREATE TABLE User ( 22 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 23 | name TEXT UNIQUE 24 | ); 25 | 26 | CREATE TABLE Course ( 27 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 28 | title TEXT UNIQUE 29 | ); 30 | 31 | CREATE TABLE Member ( 32 | user_id INTEGER, 33 | course_id INTEGER, 34 | role INTEGER, 35 | PRIMARY KEY (user_id, course_id) 36 | ) 37 | ''') 38 | 39 | # 5 Request file name 40 | fname = raw_input('File name: ') 41 | # 5a. Error handling 42 | if len(fname) < 1: 43 | fname = 'roster_data_assignment.json' 44 | 45 | # JSON Data example 46 | # [ 47 | # [ 48 | # "Charley", 49 | # "si110", 50 | # 1 51 | # ], 52 | # [ 53 | # "Mea", 54 | # "si110", 55 | # 0 56 | # ], 57 | 58 | # 6. 
Open and load json 59 | str_data = open(fname).read() 60 | json_data = json.loads(str_data) 61 | 62 | for entry in json_data: 63 | name = entry[0] 64 | title = entry[1] 65 | role = entry[2] 66 | print name, title, role 67 | 68 | # User: Insert, Retrieve and Query 69 | # Insert Data 70 | cursor_db.execute(''' 71 | INSERT OR IGNORE INTO User (name) 72 | VALUES (?)''', (name, )) 73 | 74 | # Retrieve Data 75 | cursor_db.execute(''' 76 | SELECT id 77 | FROM User 78 | WHERE name = ?''', (name, )) 79 | 80 | # Call fetchone() method to query db 81 | user_id = cursor_db.fetchone()[0] 82 | 83 | # Course: Insert, Retrieve and Query 84 | # Insert Data 85 | cursor_db.execute(''' 86 | INSERT OR IGNORE INTO Course (title) 87 | VALUES (?)''', (title,)) 88 | 89 | # Retrieve Data 90 | cursor_db.execute(''' 91 | SELECT id 92 | FROM Course 93 | WHERE title = ?''', (title,)) 94 | 95 | # Call fetchone() method to query db 96 | course_id = cursor_db.fetchone()[0] 97 | 98 | # Member: Insert 99 | cursor_db.execute(''' 100 | INSERT OR REPLACE INTO Member (user_id, course_id, role) 101 | VALUES (?, ?, ?)''', (user_id, course_id, role)) 102 | 103 | # Commit changes 104 | connect_db.commit() 105 | -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/exercise_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/exercise_db.png -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/exercise_db.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/exercise_db.sqlite 
-------------------------------------------------------------------------------- /python_databases/w4_many_to_many/exercise_many2many.py: -------------------------------------------------------------------------------- 1 | # 1. Import ElementTree and SQLite libraries 2 | import xml.etree.ElementTree as ET 3 | import sqlite3 4 | 5 | # 2. Call .connect() method to create connection object 6 | connect_db = sqlite3.connect('exercise_db.sqlite') 7 | 8 | # 3. Create cursor object 9 | cursor_db = connect_db.cursor() 10 | 11 | # 4. Create Tables 12 | # User 13 | # Course 14 | # Member 15 | 16 | # Delete table if it exists 17 | cursor_db.execute('DROP TABLE IF EXISTS User') 18 | 19 | # Create table 20 | cursor_db.execute(''' 21 | CREATE TABLE IF NOT EXISTS User ( 22 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 23 | name TEXT, 24 | email TEXT 25 | ) 26 | ''') 27 | 28 | # Delete table if it exists 29 | cursor_db.execute('DROP TABLE IF EXISTS Course') 30 | 31 | # Create table 32 | cursor_db.execute(''' 33 | CREATE TABLE IF NOT EXISTS Course ( 34 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 35 | title TEXT 36 | ) 37 | ''') 38 | 39 | # Delete table if it exists 40 | cursor_db.execute('DROP TABLE IF EXISTS Member') 41 | 42 | # Create CONNECTOR (Junction) table 43 | cursor_db.execute(''' 44 | CREATE TABLE IF NOT EXISTS Member ( 45 | user_id INTEGER, 46 | course_id INTEGER, 47 | role INTEGER, 48 | PRIMARY KEY(user_id, course_id) 49 | ) 50 | ''') 51 | 52 | # Insert User and Course Data 53 | # INSERT INTO User (name, email) VALUES ('Jane', 'jane@me.com'); 54 | # INSERT INTO User (name, email) VALUES ('Ed', 'ed@me.com'); 55 | # INSERT INTO User (name, email) VALUES ('Sue', 'sue@me.com'); 56 | # INSERT INTO Course (title) VALUES ('Python'); 57 | # INSERT INTO Course (title) VALUES ('SQL'); 58 | # INSERT INTO Course (title) VALUES ('PHP'); 59 | 60 | # Insert Member Data 61 | # INSERT INTO Member (user_id, course_id, role) VALUES (1, 1, 1); 62 | # INSERT INTO Member 
(user_id, course_id, role) VALUES (2, 1, 0); 63 | # INSERT INTO Member (user_id, course_id, role) VALUES (3, 1, 0); 64 | # 65 | # INSERT INTO Member (user_id, course_id, role) VALUES (1, 2, 0); 66 | # INSERT INTO Member (user_id, course_id, role) VALUES (2, 2, 1); 67 | # 68 | # INSERT INTO Member (user_id, course_id, role) VALUES (2, 3, 1); 69 | # INSERT INTO Member (user_id, course_id, role) VALUES (3, 3, 0); 70 | 71 | 72 | # Select, Join, On and Order By 73 | # SELECT User.name, Member.role, Course.title 74 | # FROM User 75 | # JOIN Member JOIN Course 76 | # ON Member.user_id = User.id AND Member.course_id = Course.id 77 | # ORDER BY Course.title, Member.role DESC, User.name 78 | 79 | # Course.title first --> Member.role first --> User.name last -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/exercise_roster_m2m.py: -------------------------------------------------------------------------------- 1 | # 1. Import JSON and SQLite libraries 2 | import json 3 | import sqlite3 4 | 5 | # 2. Call .connect() method to create connection object 6 | connect_db = sqlite3.connect('rosterdb.sqlite') 7 | 8 | # 3. Create cursor object to send commands 9 | cursor_db = connect_db.cursor() 10 | 11 | # 4. 
Create tables 12 | # Using multiple SQL commands using .executescript() 13 | # Connector table: Member 14 | # Member: 2 foreign keys, 1 composite primary key (concatenated) 15 | 16 | cursor_db.executescript(''' 17 | DROP TABLE IF EXISTS User; 18 | DROP TABLE IF EXISTS Member; 19 | DROP TABLE IF EXISTS Course; 20 | 21 | CREATE TABLE User ( 22 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 23 | name TEXT UNIQUE 24 | ); 25 | 26 | CREATE TABLE Course ( 27 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 28 | title TEXT UNIQUE 29 | ); 30 | 31 | CREATE TABLE Member ( 32 | user_id INTEGER, 33 | course_id INTEGER, 34 | PRIMARY KEY (user_id, course_id) 35 | ) 36 | ''') 37 | 38 | # 5 Request file name 39 | fname = raw_input('File name: ') 40 | # 5a. Error handling 41 | if len(fname) < 1: 42 | fname = 'roster_data.json' 43 | 44 | # JSON Data example 45 | # [ 46 | # [ 47 | # "Charley", 48 | # "si110", 49 | # 1 50 | # ], 51 | # [ 52 | # "Mea", 53 | # "si110", 54 | # 0 55 | # ], 56 | 57 | # 6. 
Open and load json 58 | str_data = open(fname).read() 59 | json_data = json.loads(str_data) 60 | 61 | for entry in json_data: 62 | name = entry[0] 63 | title = entry[1] 64 | 65 | print name, title 66 | 67 | # User: Insert, Retrieve and Query 68 | # Insert Data 69 | cursor_db.execute(''' 70 | INSERT OR IGNORE INTO User (name) 71 | VALUES (?)''', (name, )) 72 | 73 | # Retrieve Data 74 | cursor_db.execute(''' 75 | SELECT id 76 | FROM User 77 | WHERE name = ?''', (name, )) 78 | 79 | # Call fetchone() method to query db 80 | user_id = cursor_db.fetchone()[0] 81 | 82 | # Course: Insert, Retrieve and Query 83 | # Insert Data 84 | cursor_db.execute(''' 85 | INSERT OR IGNORE INTO Course (title) 86 | VALUES (?)''', (title,)) 87 | 88 | # Retrieve Data 89 | cursor_db.execute(''' 90 | SELECT id 91 | FROM Course 92 | WHERE title = ?''', (title,)) 93 | 94 | # Call fetchone() method to query db 95 | course_id = cursor_db.fetchone()[0] 96 | 97 | # Member: Insert 98 | cursor_db.execute(''' 99 | INSERT OR REPLACE INTO Member (user_id, course_id) 100 | VALUES (?, ?)''', (user_id, course_id)) 101 | 102 | # Commit changes 103 | connect_db.commit() 104 | -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/many_to_many.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/many_to_many.png -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/many_to_many.txt: -------------------------------------------------------------------------------- 1 | One to Many or Many to One 2 | 1 Artist TO MANY Tracks 3 | 1 Genre TO MANY Tracks 4 | 5 | Many to Many 6 | Courses and Users 7 | Decompose to M-O and O-M relationships 8 | 9 | -------------------------------------------------------------------------------- 
/python_databases/w4_many_to_many/roster_role_db.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/roster_role_db.sqlite -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/rosterdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/rosterdb.sqlite -------------------------------------------------------------------------------- /python_databases/w4_many_to_many/trackdb_assign.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w4_many_to_many/trackdb_assign.sqlite -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geocoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/geocoding.png -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geocoding.txt: -------------------------------------------------------------------------------- 1 | Data Mining Technologies 2 | Hadoop (Apache) 3 | Spark (Apache) 4 | Redshift (AWS) 5 | Pentaho 6 | 7 | GeoData 8 | Makes a Google Map from user data 9 | 10 | 11 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/README.txt: 
-------------------------------------------------------------------------------- 1 | Using the Google Geocoding API with a Database and 2 | Visualizing data on Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | You should install the SQLite browser to view and modify 10 | the databases from: 11 | 12 | http://sqlitebrowser.org/ 13 | 14 | The first problem to solve is that the Google geocoding 15 | API is rate limited to 2500 requests per day. So if you have 16 | a lot of data you might need to stop and restart the lookup 17 | process several times. So we break the problem into two 18 | phases. 19 | 20 | In the first phase we take our input data in the file 21 | (where.data) and read it one line at a time, and retrieve the 22 | geocoded response and store it in a database (geodata.sqlite). 23 | Before we use the geocoding API, we simply check to see if 24 | we already have the data for that particular line of input. 25 | 26 | You can re-start the process at any time by removing the file 27 | geodata.sqlite 28 | 29 | Run the geoload.py program. This program will read the input 30 | lines in where.data and for each line check to see if it is already 31 | in the database and if we don't have the data for the location, 32 | call the geocoding API to retrieve the data and store it in 33 | the database. 
34 | 35 | Here is a sample run after there is already some data in the 36 | database: 37 | 38 | Mac: python geoload.py 39 | Win: geoload.py 40 | 41 | Found in database Northeastern University 42 | 43 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 44 | 45 | Found in database Technion 46 | 47 | Found in database Viswakarma Institute, Pune, India 48 | 49 | Found in database UMD 50 | 51 | Found in database Tufts University 52 | 53 | Resolving Monash University 54 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Monash+University 55 | Retrieved 2063 characters { "results" : [ 56 | {u'status': u'OK', u'results': ... } 57 | 58 | Resolving Kokshetau Institute of Economics and Management 59 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Kokshetau+Institute+of+Economics+and+Management 60 | Retrieved 1749 characters { "results" : [ 61 | {u'status': u'OK', u'results': ... } 62 | 63 | The first five locations are already in the database and so they 64 | are skipped. The program scans to the point where it finds un-retrieved 65 | locations and starts retrieving them. 66 | 67 | The geoload.py can be stopped at any time, and there is a counter 68 | that you can use to limit the number of calls to the geocoding 69 | API for each run. 70 | 71 | Once you have some data loaded into geodata.sqlite, you can 72 | visualize the data using the (geodump.py) program. This 73 | program reads the database and writes tile file (where.js) 74 | with the location, latitude, and longitude in the form of 75 | executable JavaScript code. 76 | 77 | A run of the geodump.py program is as follows: 78 | 79 | Mac: python geodump.py 80 | Win: geodump.py 81 | 82 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 83 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 84 | ... 
85 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 86 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 87 | Kokshetau, Kazakhstan 53.2833333 69.3833333 88 | ... 89 | 12 records written to where.js 90 | Open where.html to view the data in a browser 91 | 92 | The file (where.html) consists of HTML and JavaScript to visualize 93 | a Google Map. It reads the most recent data in where.js to get 94 | the data to be visualized. Here is the format of the where.js file: 95 | 96 | myData = [ 97 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 98 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 99 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 100 | ... 101 | ]; 102 | 103 | This is a JavaScript list of lists. The syntax for JavaScript 104 | list constants is very similar to Python so the syntax should 105 | be familiar to you. 106 | 107 | Simply open where.html in a browser to see the locations. You 108 | can hover over each map pin to find the location that the 109 | geocoding API returned for the user-entered input. If you 110 | cannot see any data when you open the where.html file, you might 111 | want to check the JavaScript or developer console for your browser. 
112 | 113 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/geodata.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/geodata/geodata.sqlite -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js', 'w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur: 13 | # [0] address 14 | # [1] actual data 15 | data = str(row[1]) 16 | try: 17 | js = json.loads(str(data)) 18 | except: 19 | continue 20 | 21 | if not('status' in js and js['status'] == 'OK'): 22 | continue 23 | 24 | lat = js["results"][0]["geometry"]["location"]["lat"] 25 | lng = js["results"][0]["geometry"]["location"]["lng"] 26 | if lat == 0 or lng == 0: 27 | continue 28 | where = js['results'][0]['formatted_address'] 29 | where = where.replace("'", "") 30 | try: 31 | print where, lat, lng 32 | 33 | count = count + 1 34 | if count > 1 : fhand.write(",\n") 35 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 36 | fhand.write(output) 37 | except: 38 | continue 39 | 40 | fhand.write("\n];\n") 41 | cur.close() 42 | fhand.close() 43 | print count, "records written to where.js" 44 | print "Open where.html to view the data in a browser" 45 | 46 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/geoload.py: -------------------------------------------------------------------------------- 1 | # 1. 
Imports 2 | import urllib 3 | import sqlite3 4 | import json 5 | import time 6 | import ssl 7 | 8 | # 2. API's URL to connect 9 | serviceurl = "http://maps.googleapis.com/maps/api/geocode/json?" 10 | 11 | # Deal with SSL certificate anomalies Python > 2.7 12 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 13 | scontext = None 14 | 15 | # 2. Call .connect() method to create connection object 16 | conn = sqlite3.connect('geodata.sqlite') 17 | 18 | # 3. Create cursor object 19 | cur = conn.cursor() 20 | 21 | # 4. Create table 22 | cur.execute(''' 23 | CREATE TABLE IF NOT EXISTS Locations ( 24 | address TEXT, 25 | geodata TEXT 26 | ) 27 | ''') 28 | 29 | # Open input data 30 | fh = open("where.data") 31 | count = 0 32 | 33 | # Loop through data 34 | for line in fh: 35 | if count > 200: 36 | break 37 | address = line.strip() 38 | print '' 39 | # Buffer to force it to what we want 40 | # Retrieve data 41 | cur.execute(''' 42 | SELECT geodata 43 | FROM Locations 44 | WHERE address= ?''', (buffer(address), )) 45 | 46 | try: 47 | # Query data 48 | # fetchone() grabs a row 49 | # [0] grabs first column 50 | data = cur.fetchone()[0] 51 | print "Found in database ",address 52 | continue 53 | except: 54 | pass 55 | 56 | print 'Resolving', address 57 | # Create URL 58 | url = serviceurl + urllib.urlencode({"sensor":"false", "address": address}) 59 | print 'Retrieving', url 60 | 61 | # Open URL 62 | uh = urllib.urlopen(url, context=scontext) 63 | 64 | # Read URL 65 | data = uh.read() 66 | print 'Retrieved',len(data),'characters',data[:20].replace('\n',' ') 67 | count = count + 1 68 | try: 69 | js = json.loads(str(data)) 70 | # print js # We print in case unicode causes an error 71 | except: 72 | # If bad JSON 73 | continue 74 | 75 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS'): 76 | print '==== Failure To Retrieve ====' 77 | print data 78 | break 79 | 80 | # SQL Command 81 | cur.execute('''INSERT INTO Locations (address, geodata) 82 | VALUES 
( ?, ? )''', (buffer(address), buffer(data))) 83 | 84 | # Write to disk to ensure we've data if it blows up halfway 85 | conn.commit() 86 | time.sleep(1) 87 | 88 | print "Run geodump.py to read the data from the database so you can visualize it on a map." 89 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/where.data: -------------------------------------------------------------------------------- 1 | Northeastern University 2 | University of Hong Kong, Illinois Institute of Technology, Bradley University 3 | Technion 4 | Viswakarma Institute, Pune, India 5 | UMD 6 | Tufts University 7 | Monash University 8 | Kokshetau Institute of Economics and Management 9 | RSU named S.A. Esenin 10 | Tavrida National V.I. Vernadsky University 11 | UOC 12 | Irkutsk State University 13 | Institute of Technology Telkom 14 | Shanghai Jiao Tong University 15 | University of Ilorin, Kwara State. Nigeria 16 | Monash University Churchill Australia 17 | UNISA 18 | Fachhochschule FH Salzburg 19 | Tampere University of Technology (Tampere, Finland) 20 | Saint Petersburg State University 21 | University of São Paulo 22 | Smolensk State University (Russia) 23 | Institute of Business Administration, Karachi 24 | universidad complutense de madrid 25 | Masdar Institute 26 | University of London 27 | University of Oxford 28 | Tallinn University of Technology 29 | University of Tartu 30 | University of Padua 31 | University of Pune, India 32 | National Kyiv Shevchenko University 33 | UC Berkeley 34 | University of Wisconsin - Madison 35 | Lodz University of Technology 36 | NRU IFMO 37 | Dniepropetrovsk National University (Ukraine), Applied Math Faculty 38 | Dokuz Eylul University, Izmir, Turkey 39 | Beijing normal university 40 | University of Piraeus, Athens 41 | Universidad de Buenos Aires (UBA). Argentina. 
42 | SASTRA University 43 | Nagpur University 44 | Duke University 45 | San Francisco State University 46 | FATEC-SP - Faculdade de Tecnologia do Estado de São Paulo 47 | University of Texas at Austin 48 | University of applied sciense of Mikkeli (Finland) 49 | Troy University 50 | Universidade do Minho 51 | National University of Sciences and Technology (NUST)-Pakistan 52 | Pontificia universidad catolica de chile 53 | Illinois State University Joliet Junior College 54 | American University in Cairo (AUC) 55 | Obninsk Technical University of Nuclear Power Engineering, Russia 56 | Vyatka State Humanitarian University 57 | Weizmann Institute of Science (Israel) 58 | University of Washington 59 | Kharkiv State Academy of Municipal Economy, Ukraine 60 | Faculty of Electrical Engineering in Sarajevo, University of Sarajevo 61 | Universidad de Los Andes Colombia 62 | University of Colorado at Boulder 63 | Magnitogorsk State Technical University 64 | USC 65 | Simon Fraser University 66 | Columbia University (New York) 67 | University of Southern California 68 | University of Warsaw 69 | Warsaw University of Technology 70 | (Some place in New Zealand you haven't heard of.) 
71 | Massey university part-time Distance learning 72 | University of Oklahoma 73 | University of Pavia, Italy 74 | University of Missouri - Columbia 75 | Czech Technical University in Prague 76 | Illinois Institute of Technology 77 | Penn State University 78 | University of Utah 79 | Faculty of Science, University of Zagreb - Department of Mathematics 80 | Universitat Politecnica de Valencia 81 | University of Vienna 82 | University of Puerto Rico - Mayaguez Campus 83 | University "Hyperion" of Bucharest 84 | University of New Haven 85 | University of Washington -Bothell 86 | Drexel University 87 | University of Texas at Austin 88 | University of Helsinki 89 | University of Michigan 90 | Carnegie Mellon University 91 | Kazan Federal University 92 | Pondicherry University 93 | Far-Eastern State University 94 | Nanyang Technological University 95 | Slovak University of Technology 96 | NYU 97 | UFABC - Universidade Federal do ABC, Sanso André - SP - Brazil 98 | University of Debrecen 99 | California State University, San Bernardino 100 | National University "Kyiv-Mohyla Academy" (Kyiv, Ukraine) 101 | Laurentian University 102 | Humanities Institute of TV and Radio, Moscow, Russia 103 | University of Cambridge, UK 104 | Payame Noor University, Tehran, Iran 105 | Middle East Technical University 106 | EPFL 107 | Faculty of Technical Sciences, Novi Sad, Serbia 108 | University of Gothenburg, Sweden 109 | Polytechnic University of Timisoara 110 | University of Hawaii (Go, Rainbows!) 
111 | Belarusian State University 112 | Haaga-Helia university of applied sciences 113 | JADAVPUR UNIVERSITY 114 | Gauhati University, India 115 | Universidad de Buenos Aires 116 | Università degli Studi di Genova, Genova, Italia 117 | King Mongkut's University of Technology Thonburi 118 | Universidad de la Sabana, Chia, Colombia 119 | State University of New York (SUNY) College at Oswego 120 | Kyrgyz Slavic Russian University 121 | De La Salle University http://www.dlsu.edu.ph 122 | Jawaharlal Nehru Technological University, INDIA 123 | UCL (Université Catholique de Louvain) in Belgium 124 | Boston University 125 | The University of Manchester 126 | Fachhochschule Düsseldorf 127 | Pine Manor College (AA), Harvard University (BA), Lesley University (MEd) 128 | Simón Bolívar University 129 | Indiana University at Bloomington 130 | RPI 131 | University of Ottawa, Canada 132 | Ural Federal University 133 | BITS Pilani 134 | Transilvania University 135 | IIT(BHU), Varanasi, India 136 | EM Lyon 137 | Universidad Central de Venezuela 138 | NTUU "KPI" 139 | Universidade Federal da Paraiba, Brazil 140 | Budapest University of Technology and Economics 141 | Moscow Institute of Physics & Technology (State University) 142 | Saint Petersburg State University of Aerospace Instrumentation, Russia 143 | North Central College, Naperville, IL 144 | Tech. Uni. 
Denmark (DTU) 145 | Stanford 146 | "Politehnica" Timisoara 147 | National University of Engineering 148 | Monash 149 | Federal University of Campina Grande (UFCG) 150 | Universidade Federal do Rio Grande do Sul (UFRGS) 151 | Universidad Nacional Autónoma de México 152 | University of New South Wales Harvard Business School 153 | University of Tehran 154 | Old Dominion University 155 | Kyiv Unisersity of Oriental Language 156 | Babcock University 157 | University of Essex 158 | Kharkiv National University of Radio Electronics (Ukraine) 159 | Kaunas Technology University 160 | University of Buenos Aires 161 | University of Jaffna. 162 | R V College of Engineering, Bangalore, India for BE in Instrumentation Technology 163 | Beloit College 164 | UCLA 165 | University of Chicago 166 | University of Sciences and Technology of Oran. Mohamed Boudiaf (USTO-MB). 167 | Zagazig University, Egypt 168 | University of Alberta 169 | Belorussian State University 170 | Jones International University (online) Illinois State Univeristy 171 | University of Florida 172 | Too many to mention. 173 | University of Kerala, India 174 | Politecnico di Milano 175 | Vilnius Gediminas Technical University 176 | Madras university/ Bharthidasan University in India . 177 | Universidade Tecnica de Lisboa - Instituto Superior Técnico 178 | Does not apply. 
179 | Stellenbosch University 180 | imt ghazIABAD INDIA 181 | University of Pennsylvania 182 | National Institute of Technology, Jalandhar (India) 183 | Universidad ICESI 184 | Virginia Tech 185 | arizona state university 186 | Universidad del Valle de Guatemala 187 | Mykolas Romeris University, Vilnius, Lithuania 188 | BSU 189 | Distance Learning Center at the Technical University of Kaiserslautern in Germany 190 | Ain shams university, Cairo, Egypt 191 | Universidad Nacional de Colombia 192 | Saint-Petersburg Polytechnic Univesity 193 | NAIT (Northern Alberta Institute of Technology) 194 | Wayne State took courses at U of M 195 | Universidad Nacional, Costa Rica 196 | Marietta College (Ohio) Northwestern University 197 | Grandville 198 | Portland State University, Oregon Institute of Technology 199 | Malayer Azad University, Iran 200 | Marina Bay, Singapore 201 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 43 | 44 | 45 |
46 |

About this Map

47 |
51 | 52 | 53 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/geodata/where.js: -------------------------------------------------------------------------------- 1 | myData = [ 2 | [42.340082,-71.0894884, 'Northeastern, Boston, MA 02115, USA'], 3 | [38.2113643,-85.7470011, 'Bradley Ave, Louisville, KY, USA'], 4 | [32.778949,35.019648, 'Technion/ Sports Building, Haifa'], 5 | [18.4574518,73.8837999, 'Vishwakarma Institutes Play Ground, Yashodhan Society, Kapil Nagar, Kondhwa Budrukh, Vishwakarma, Maharashtra 411048, India'], 6 | [33.1561058,131.826132, 'Japan, 〒875-0002 Ōita-ken, Usuki-shi, Shitanoe, 1232−2 UMD'], 7 | [42.4036847,-71.120482, 'South Hall Tufts University, 30 Lower Campus Rd, Somerville, MA 02144, USA'], 8 | [-38.1518106,145.1345412, 'Monash University, Frankston VIC 3199, Australia'], 9 | [53.2948229,69.4047872, 'Kokshetau 020000, Kazakhstan'], 10 | [40.7127837,-74.0059413, 'New York, NY, USA'], 11 | [52.2869741,104.3050183, 'Irkutsk, Irkutsk Oblast, Russia'], 12 | [31.1790053,121.4237432, 'Shang Hai Jiao Tong Da Xue Fu Shu Di Liu Ren Min Yi Yuan, Xuhui Qu, Shanghai Shi, China, 200231'], 13 | [8.481302,4.611479, 'University Rd, Ilorin, Nigeria'], 14 | [-34.9222085,138.5921522, 'Yungondi Building, Adelaide SA 5000, Australia'], 15 | [47.80949,13.05501, 'Salzburg, Austria'], 16 | [61.4977524,23.7609535, 'Tampere, Finland'], 17 | [59.9342802,30.3350986, 'St Petersburg, Russia'], 18 | [28.6853472,-106.1015266, 'São Paulo, Chihuahua, Chih., Mexico'], 19 | [54.7903112,32.0503663, 'Smolensk, Smolensk Oblast, Russia'], 20 | [24.8614622,67.0099388, 'Karachi, Pakistan'], 21 | [40.4469796,-3.7278167, 'Av. 
Complutense, Madrid, Madrid, Spain'], 22 | [24.4325423,54.6174842, 'Masdar Institute Bus Station - Abu Dhabi - United Arab Emirates'], 23 | [51.5266171,-0.1260773, 'University Of London, 1-11 Cartwright Gardens, Kings Cross, London WC1H 9EB, UK'], 24 | [39.5069974,-84.745231, 'Oxford, OH 45056, USA'], 25 | [59.393847,24.6650872, 'TTÜ staadion, 12616 Tallinn, Estonia'], 26 | [58.3733281,26.7265098, 'Tartu Ülikooli Füüsikahoone, 50103 Tartu, Estonia'], 27 | [33.6778327,-117.8151285, 'Padua, Irvine, CA 92614, USA'], 28 | [18.5544976,73.8257325, 'Pune University, Ganeshkhind, Pune, Maharashtra, India'], 29 | [37.8764984,-122.2804342, 'California St, Berkeley, CA, USA'], 30 | [43.0412831,-89.4301473, 'University of Wisconsin-Madison Arboretum, 1207 Seminole Hwy, Madison, WI 53711, USA'], 31 | [51.745806,19.4489068, 'Instytut Informatyki Stosowanej, Politechnika Łódzka, 90-924 Łódź, Poland'], 32 | [38.3946981,27.0322689, 'İnciraltı, Dokuz Eylül Ünv. Hst., 35330 Balçova/İzmir, Turkey'], 33 | [39.9314428,116.3049709, 'Bei Jing Shi Fan Da Xue, Haidian Qu, Beijing Shi, China, 100000'], 34 | [33.9519347,-83.357567, 'Athens, GA, USA'], 35 | [10.7295115,79.0196067, 'Sastra University Road, Tirumalaisamudram, Tamil Nadu 613401, India'], 36 | [21.1470404,79.0397862, 'Nagpur University Campus, Nagpur, Maharashtra 440033, India'], 37 | [41.9197689,-91.649501, 'Duke St SW, Cedar Rapids, IA 52404, USA'], 38 | [37.7634731,-122.4390636, 'States St, San Francisco, CA 94114, USA'], 39 | [-23.5505199,-46.6333094, 'São Paulo, São Paulo - State of São Paulo, Brazil'], 40 | [30.2850284,-97.7335226, 'University of Texas at Austin, Austin, TX, USA'], 41 | [61.6887271,27.2721457, 'Mikkeli, Finland'], 42 | [32.4204729,-85.0323718, 'H. 
Curtis Pitts Hall, 3413 S Seale Rd, Phenix City, AL 36869, USA'], 43 | [41.557583,-8.397568, 'Universidade do Minho, 4710 Braga, Portugal'], 44 | [28.1655981,112.9526566, 'Yue Lu Shan Guo Jia Da Xue Ke Ji Yuan Chuang Ye Da Sha, Yuelu Qu, Changsha Shi, Hunan Sheng, China, 410006'], 45 | [-33.0444219,-71.6066334, 'Pontificia Universidad Catolica De Valparaiso - Gimpert, Valparaíso, Valparaíso, Región de Valparaíso, Chile'], 46 | [40.6331249,-89.3985283, 'Illinois, USA'], 47 | [30.0199119,31.5001527, 'AUC Library, Cairo Governorate 11835, Egypt'], 48 | [55.1170375,36.5970818, 'Obninsk, Kaluga Oblast, Russia'], 49 | [31.767879,-106.440736, 'Washington, El Paso, TX 79905, USA'], 50 | [49.9935,36.230383, 'Kharkiv, Kharkiv Oblast, Ukraine'], 51 | [43.8562586,18.4130763, 'Sarajevo, Bosnia and Herzegovina'], 52 | [3.4321247,-76.5461709, 'Parqueadero Universidad Del Valle, Cali, Cali, Valle del Cauca, Colombia'], 53 | [40.0082221,-105.2591119, 'Colorado Ave & University Heights, Boulder, CO 80302, USA'], 54 | [53.4129429,59.0016233, 'Magnitogorsk, Chelyabinsk Oblast, Russia'], 55 | [46.4062583,8.9040484, 'Usc, 6749, Switzerland'], 56 | [52.124815,-106.589195, 'Simon Fraser Crescent, Saskatoon, SK S7H, Canada'], 57 | [34.0247033,-81.0131844, 'New York Ave, Columbia, SC 29204, USA'], 58 | [38.1999105,-85.7659121, 'Southern Pkwy, Louisville, KY, USA'], 59 | [14.6063194,121.0977669, 'Warsaw, Pasig, Metro Manila, Philippines'], 60 | [52.2296756,21.0122287, 'Warsaw, Poland'], 61 | [-40.900557,174.885971, 'New Zealand'], 62 | [-40.3850866,175.6140639, 'Massey University, Palmerston North, New Zealand'], 63 | [35.8715218,-97.5672431, 'Noble Ave, Guthrie, OK 73044, USA'], 64 | [45.1847248,9.1582069, '27100 Pavia PV, Italy'], 65 | [38.6598662,-90.3123536, 'Columbia Ave, University City, MO 63130, USA'], 66 | [50.0755381,14.4378005, 'Prague, Czech Republic'], 67 | [41.8313852,-87.6272216, 'Iit Tower, 10 W 35th St, Chicago, IL 60616, USA'], 68 | [40.7933949,-77.8600012, 'State College, 
PA, USA'], 69 | [33.4249307,-111.8884532, 'Utah, Tempe, AZ 85281, USA'], 70 | [39.4813156,-0.3505, 'Universitat Politècnica, 46022 Valencia, Valencia, Spain'], 71 | [33.6140008,-117.8440006, 'Vienna, Newport Beach, CA 92660, USA'], 72 | [44.4267674,26.1025384, 'Bucharest, Romania'], 73 | [33.7063317,-117.7733121, 'New Haven, Irvine, CA 92620, USA'], 74 | [47.761605,-122.19303, 'UW Bothell & Cascadia College, Bothell, WA 98011, USA'], 75 | [38.6679152,-90.3322259, 'Drexel Dr, University City, MO 63130, USA'], 76 | [32.083852,34.79197, 'Helsinki St, Tel Aviv-Yafo, Israel'], 77 | [42.320138,-83.230993, 'University of Michigan, Dearborn, MI 48128, USA'], 78 | [40.4432289,-79.9441368, 'Carnegie Mellon University, Pausch Bridge, Pittsburgh, PA 15213, USA'], 79 | [55.8304307,49.0660806, 'Kazan, Tatarstan, Russia'], 80 | [12.0263438,79.8492812, 'Pondicherry University, Kalapet, Puducherry 605014, India'], 81 | [30.7897514,120.7760636, 'Jia Xing Nan Yang Zhi Ye Ji Shu Xue Yuan, Xiuzhou Qu, Jiaxing Shi, Zhejiang Sheng, China, 314000'], 82 | [35.712815,135.9711705, 'Nyu, Mihama, Mikata District, Fukui Prefecture 919-1201, Japan'], 83 | [-23.5431786,-46.6291845, 'State of São Paulo, Brazil'], 84 | [47.5584793,21.620443, 'Debrecen, Debrecen University-Botanical Garden, 4032 Hungary'], 85 | [34.1515641,-117.3354402, 'N State St, California, USA'], 86 | [50.4501,30.5234, 'Kiev, Ukraine, 02000'], 87 | [46.4618977,-80.9664534, 'University Laurentian, Copper Cliff, ON P0M 1N0, Canada'], 88 | [55.755826,37.6173, 'Moscow, Russia'], 89 | [52.2016671,0.1177882, 'University Of Cambridge, Cambridge, Cambridge, Cambridgeshire CB2, UK'], 90 | [35.9525664,51.490619, 'Payame Noor, Meygun, Tehran, Iran'], 91 | [35.246756,33.0307541, 'ODTÜ Misafirhane, Kalkanlı'], 92 | [46.5189865,6.5676007, 'EPFL, 1015 Lausanne, Switzerland'], 93 | [45.2671352,19.8335496, 'Novi Sad, Serbia'], 94 | [57.6954209,11.9853213, 'Göteborgs universitetsbibliotek, Renströmsgatan 4, 412 55 Göteborg, Sweden'], 95 | 
[45.7488716,21.2086793, 'Timișoara, Romania'], 96 | [53.8931837,27.547338, 'Monument to Fallen Professors and Students of the Belarusian State University, Minsk, Belarus'], 97 | [22.4828735,88.394867, 'Jadavpur University Lake, Sahid Smirity Colony, Pancha Sayar, Kolkata, West Bengal 700094'], 98 | [26.1529683,91.6639235, 'Gauhati University, Jalukbari, Guwahati, Assam, India'], 99 | [-34.5178509,-58.4831979, 'Universidad, Vicente López, Buenos Aires, Argentina'], 100 | [44.4061457,8.9682634, 'Università degli studi di Genova - Dipartimento di Medicina Sperimentale (DIMES), 16143 Genova, Italy'], 101 | [13.7164911,100.4874338, 'Thon Buri, Bangkok 10600, Thailand'], 102 | [4.8602595,-74.0333032, 'Universidad De La Sabana, Chía, Chía, Cundinamarca, Colombia'], 103 | [43.4553461,-76.5104973, 'Oswego, NY, USA'], 104 | [17.4930263,78.3906218, 'Jawaharlal Nehru Technological University, Kukatpally Housing Board Colony, Kukatpally, Hyderabad, Telangana 500085, India'], 105 | [50.503887,4.469936, 'Belgium'], 106 | [42.3518484,-71.1107301, 'Boston University Bridge, Massachusetts, USA'], 107 | [64.9078809,-147.7117155, 'Manchester Loop, Fairbanks, AK 99712, USA'], 108 | [51.1877226,6.7938734, 'Fachhochschule Düsseldorf, Stadtbezirk 3, 40225 Düsseldorf, Germany'], 109 | [27.6169691,-99.4631289, 'Simon Bolivar Blvd, Laredo, TX 78045, USA'], 110 | [39.174335,-86.505469, 'Hilltop Garden and Nature Center at Indiana University, 2367 E 10th St, Bloomington, IN 47408, USA'], 111 | [18.9331831,72.8341894, 'KP Shethi Building, Janmabhoomi Marg, Kala Ghoda, Fort, Mumbai, Maharashtra 400001, India'], 112 | [42.3077541,-83.0182189, 'Ottawa St, Windsor, ON, Canada'], 113 | [28.3580163,75.5887989, 'BITS, Pilani, Rajasthan 333031, India'], 114 | [35.8278379,-78.6593111, 'Transylvania Ave, Raleigh, NC 27609, USA'], 115 | [25.25968,82.989115, 'IIT Gymkhana, RR 11, Banaras Hindu University Campus, Varanasi, Uttar Pradesh 221001, India'], 116 | [50.862282,-2.4998561, 'E M Mitchell & Sons, 
Hermitage, Dorchester, Dorset DT2 7BB, UK'], 117 | [18.4074917,-66.062465, 'Ave Central, San Juan, San Juan, Puerto Rico'], 118 | [50.4471975,30.4522355, 'Obshchezhitiye NTUU KPI №10, Vyborzka St, 2/24, Kyiv, Ukraine'], 119 | [-9.9541653,-67.8384015, 'Tv. Paraíba - Geraldo Fleming, Rio Branco - AC, Brazil'], 120 | [47.497912,19.040235, 'Budapest, Hungary'], 121 | [55.755826,37.6173, 'Moscow, Russia'], 122 | [59.9342802,30.3350986, 'St Petersburg, Russia'], 123 | [41.7508391,-88.1535352, 'Naperville, IL, USA'], 124 | [37.424106,-122.1660756, 'Stanford, CA, USA'], 125 | [45.7484997,21.2399277, 'Cantina Politehnică, Strada Alexandru Vaida - Voievod, Timișoara, Romania'], 126 | [16.4226352,120.5906046, 'National Baguio University, Bokawkan, Baguio, Benguet, Philippines'], 127 | [-35.417,149.1, 'Monash ACT 2904, Australia'], 128 | [-7.2159454,-35.9065247, 'Campo da UFCG - R. Silva Barbosa - Universitário, Campina Grande - PB, 58400-850, Brazil'], 129 | [19.3188895,-99.1843676, 'National Autonomous University of Mexico, Mexico City, Mexico City, Mexico'], 130 | [35.7058075,51.4020909, 'Tehran University, Tehran, Tehran, Iran'], 131 | [36.8838958,-76.3040214, 'Old Dominion University, 5115 Hampton Blvd, Norfolk, VA 23508, USA'], 132 | [50.4501,30.5234, 'Kiev, Ukraine, 02000'], 133 | [32.2366945,-110.9456894, 'Babcock Building, 1717 E Speedway Blvd, Tucson, AZ 85719, USA'], 134 | [44.9715569,-93.231866, 'Essex St SE, Minneapolis, MN 55455, USA'], 135 | [49.9935,36.230383, 'Kharkiv, Kharkiv Oblast, Ukraine'], 136 | [54.8985207,23.9035965, 'Kaunas, Lithuania'], 137 | [42.3423603,-7.8552788, 'Av. 
de Buenos Aires, 32004 Ourense, Orense, Spain'], 138 | [9.7297203,79.9482992, 'Jaffna College, AB21, Sri Lanka'], 139 | [42.5030209,-89.0295642, 'College St, Beloit, WI 53511, USA'], 140 | [40.5382913,-78.3528584, 'Ucla Ln, Altoona, PA 16602, USA'], 141 | [28.0282578,-82.3924269, 'Chicago Ave, Temple Terrace, FL 33617, USA'], 142 | [30.5848529,31.4843221, 'Rd inside Zagazig University, Shaibet an Nakareyah, Markaz El-Zakazik, Ash Sharqia Governorate, Egypt'], 143 | [33.428283,-111.750401, 'N Alberta, Mesa, AZ 85205, USA'], 144 | [53.8931837,27.547338, 'Monument to Fallen Professors and Students of the Belarusian State University, Minsk, Belarus'], 145 | [28.0735403,-82.4373589, 'University, FL, USA'], 146 | [11.1705436,75.8736048, 'University Rd, Ramanattukara, Kerala, India'], 147 | [45.4723514,9.1964401, 'Via del Vecchio Politecnico, 20121 Milano, Italy'], 148 | [54.6871555,25.2796514, 'Vilnius, Lithuania'], 149 | [20.593684,78.96288, 'India'], 150 | [-33.8812733,18.6264694, 'Stellenbosch University, Cape Town, 7530, South Africa'], 151 | [28.6777345,77.4504666, 'IMT Rd, Block 14, Sector 10, Raj Nagar, Ghaziabad, Uttar Pradesh 201002, India'], 152 | [33.4238104,-111.8869146, 'Pennsylvania, Tempe, AZ 85281, USA'], 153 | [31.3260152,75.5761829, 'Jalandhar, Punjab 144001, India'], 154 | [36.8743583,-76.1745441, 'Virginia Tech Trail, Virginia Beach, VA 23455, USA'], 155 | [33.4232051,-111.8879509, 'State Ave, Tempe, AZ 85281, USA'], 156 | [22.2567635,-97.8345654, 'Guatemala, Cd Madero, Tamps., Mexico'], 157 | [54.6871555,25.2796514, 'Vilnius, Lithuania'], 158 | [1.2246216,19.7878159, 'Basankusu Airport (BSU), N22, Basankusu, Democratic Republic of the Congo'], 159 | [51.165691,10.451526, 'Germany'], 160 | [30.0742446,31.2765847, 'Internal Medicine, Ain Shams University, ممر خاص مستشفى الدمرداش، Al Waili, Cairo Governorate, Egypt'], 161 | [-4.009976,-79.2085378, 'Colombia, Loja, Ecuador'], 162 | [59.9342802,30.3350986, 'St Petersburg, Russia'], 163 | 
[10.1345309,-85.4467445, 'Universidad Nacional, 150, Nicoya, Costa Rica'], 164 | [33.952602,-84.5499327, 'Marietta, GA, USA'], 165 | [42.9097484,-85.7630885, 'Grandville, MI, USA'], 166 | [34.3020001,48.8145943, 'Malayer, Hamadan, Iran'], 167 | [1.2877936,103.8665551, 'Marina Bay, Singapore'] 168 | ]; 169 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive vizualizing the data using the 2 | D3 JavaScript library 3 | 4 | Here is a copy of the Sakai Developer Mailing list from 2006-2014. 5 | 6 | http://mbox.dr-chuck.net/ 7 | 8 | You should install the SQLite browser to view and modify the databases from: 9 | 10 | http://sqlitebrowser.org/ 11 | 12 | The base URL is hard-coded in the gmane.py. Make sure to delete the 13 | content.sqlite file if you switch the base url. The gmane.py file 14 | operates as a spider in that it runs slowly and retrieves one mail 15 | message per second so as to avoid getting throttled. It stores all of 16 | its data in a database and can be interrupted and re-started 17 | as often as needed. It may take many hours to pull all the data 18 | down. So you may need to restart several times. 19 | 20 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 21 | email here: 22 | 23 | https://online.dr-chuck.com/files/sakai/email/content.sqlite.zip 24 | 25 | If you download and unzip this, you can "catch up with the 26 | latest" by running gmane.py. 
27 | 28 | Navigate to the folder where you extracted the gmane.zip 29 | 30 | Here is a run of gmane.py getting the last five messages of the 31 | sakai developer list: 32 | 33 | Mac: python gmane.py 34 | Win: gmane.py 35 | 36 | How many messages:10 37 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 38 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 39 | http://mbox.dr-chuck.net/sakai.devel/6/7 3586 40 | s-githens@northwestern.edu 2005-12-09T13:32:31-06:00 re: sakaiportallogin and presense 41 | http://mbox.dr-chuck.net/sakai.devel/7/8 10600 42 | john@caret.cam.ac.uk 2005-12-09T13:42:24+00:00 re: lms/vle rants/comments 43 | 44 | The program scans content.sqlite from 1 up to the first message number not 45 | already spidered and starts spidering at that message. It continues spidering 46 | until it has spidered the desired number of messages or it reaches a page 47 | that does not appear to be a properly formatted message. 48 | 49 | Sometimes a message is missing. Perhaps administrators can delete messages 50 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 51 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 52 | all the other fields blank - and then restart gmane.py. This will unstick the 53 | spidering process and allow it to continue. These empty messages will be ignored in the next 54 | phase of the process. 55 | 56 | One nice thing is that once you have spidered all of the messages and have them in 57 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 58 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 59 | if there are new messages and then quickly retrieve those messages and add them 60 | to content.sqlite. 61 | 62 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed.
63 | This is intentional as it allows you to look at content.sqlite to debug the process. 64 | It would be a bad idea to run any queries against this database as they would be 65 | slow. 66 | 67 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 68 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 69 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 70 | smaller) than content.sqlite because it also compresses the header and body text. 71 | 72 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 73 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 74 | data cleaning process. 75 | 76 | Running gmodel.py works as follows: 77 | 78 | Mac: python gmodel.py 79 | Win: gmodel.py 80 | 81 | Loaded allsenders 1588 and mapping 28 dns mapping 1 82 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 83 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 84 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 85 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 86 | ... 87 | 88 | The gmodel.py program does a number of data cleaning steps 89 | 90 | Domain names are truncated to two levels for .com, .org, .edu, and .net 91 | other domain names are truncated to three levels. So si.umich.edu becomes 92 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are 93 | forced to lower case and some of the @gmane.org addresses like the following 94 | 95 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 96 | 97 | are converted to the real address whenever there is a matching real email 98 | address elsewhere in the message corpus. 99 | 100 | If you look in the content.sqlite database there are two tables that allow 101 | you to map both domain names and individual email addresses that change over 102 | the lifetime of the email list.
For example, Steve Githens used the following 103 | email addresses over the life of the Sakai developer list: 104 | 105 | s-githens@northwestern.edu 106 | sgithens@cam.ac.uk 107 | swgithen@mtu.edu 108 | 109 | We can add two entries to the Mapping table 110 | 111 | s-githens@northwestern.edu -> swgithen@mtu.edu 112 | sgithens@cam.ac.uk -> swgithen@mtu.edu 113 | 114 | And so all the mail messages will be collected under one sender even if 115 | they used several email addresses over the lifetime of the mailing list. 116 | 117 | You can also make similar entries in the DNSMapping table if there are multiple 118 | DNS names you want mapped to a single DNS. In the Sakai data I add the following 119 | mapping: 120 | 121 | iupui.edu -> indiana.edu 122 | 123 | So all the folks from the various Indiana University campuses are tracked together 124 | 125 | You can re-run the gmodel.py over and over as you look at the data, and add mappings 126 | to make the data cleaner and cleaner. When you are done, you will have a nicely 127 | indexed version of the email in index.sqlite. This is the file to use to do data 128 | analysis. With this file, data analysis will be really quick. 129 | 130 | The first, simplest data analysis is to do a "who does the most" and "which 131 | organization does the most"? This is done using gbasic.py: 132 | 133 | Mac: python gbasic.py 134 | Win: gbasic.py 135 | 136 | How many to dump?
5 137 | Loaded messages= 51330 subjects= 25033 senders= 1584 138 | 139 | Top 5 Email list participants 140 | steve.swinsburg@gmail.com 2657 141 | azeckoski@unicon.net 1742 142 | ieb@tfd.co.uk 1591 143 | csev@umich.edu 1304 144 | david.horwitz@uct.ac.za 1184 145 | 146 | Top 5 Email list organizations 147 | gmail.com 7339 148 | umich.edu 6243 149 | uct.ac.za 2451 150 | indiana.edu 2258 151 | unicon.net 2055 152 | 153 | You can look at the data in index.sqlite and if you find a problem, you 154 | can update the Mapping table and DNSMapping table in content.sqlite and 155 | re-run gmodel.py. 156 | 157 | There is a simple visualization of the word frequency in the subject lines 158 | in the file gword.py: 159 | 160 | Mac: python gword.py 161 | Win: gword.py 162 | 163 | Range of counts: 33229 129 164 | Output written to gword.js 165 | 166 | This produces the file gword.js which you can visualize using the file 167 | gword.htm. 168 | 169 | A second visualization is in gline.py. It visualizes email participation by 170 | organizations over time. 171 | 172 | Mac: python gline.py 173 | Win: gline.py 174 | 175 | Loaded messages= 51330 subjects= 25033 senders= 1584 176 | Top 10 Organizations 177 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 178 | Output written to gline.js 179 | 180 | Its output is written to gline.js which is visualized using gline.htm. 181 | If you have a problem with gline.htm, you can try gline2.htm or gline3.htm 182 | to visualize your data.
183 | 184 | Some URLs for visualization ideas: 185 | 186 | https://developers.google.com/chart/ 187 | 188 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 189 | 190 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 191 | 192 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 193 | 194 | http://bost.ocks.org/mike/uberdata/ 195 | 196 | http://mbostock.github.io/d3/talk/20111018/calendar.html 197 | 198 | http://nltk.org/install.html 199 | 200 | As always - comments welcome. 201 | 202 | -- Dr. Chuck 203 | Sun Sep 29 00:11:01 EDT 2013 204 | 205 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | howmany = int(raw_input("How many to dump? ")) 7 | 8 | conn = sqlite3.connect('index.sqlite') 9 | conn.text_factory = str 10 | cur = conn.cursor() 11 | 12 | cur.execute('''SELECT Messages.id, sender FROM Messages 13 | JOIN Senders ON Messages.sender_id = Senders.id''') 14 | 15 | sendcounts = dict() 16 | sendorgs = dict() 17 | for message in cur : 18 | sender = message[1] 19 | sendcounts[sender] = sendcounts.get(sender,0) + 1 20 | pieces = sender.split("@") 21 | if len(pieces) != 2 : continue 22 | dns = pieces[1] 23 | sendorgs[dns] = sendorgs.get(dns,0) + 1 24 | 25 | print '' 26 | print 'Top',howmany,'Email list participants' 27 | 28 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 29 | for k in x[:howmany]: 30 | print k, sendcounts[k] 31 | if sendcounts[k] < 10 : break 32 | 33 | print '' 34 | print 'Top',howmany,'Email list organizations' 35 | 36 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 37 | for k in x[:howmany]: 38 | print k, sendorgs[k] 39 | if sendorgs[k] < 10 : break 40 | 
-------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | months = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | month = message_row[2][:7] 43 | if month not in months : months.append(month) 44 | key = (month, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (month, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | months.sort() 50 | print counts 51 | print months 52 | 53 | fhand = open('gline.js','w') 54 | fhand.write("gline = [ ['Month'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for month in months[1:-1]: 60 | for month in months: 61 | fhand.write(",\n['"+month+"'") 62 | for org in orgs: 63 | key = (month, org) 64 | val = counts.get(key,0) 65 | 
fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gline2.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 21 | 22 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gline3.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Line Chart 6 | 7 | 8 | 15 | 16 | 17 | 18 |
19 | 20 |
21 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gmane.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import sqlite3 3 | import time 4 | import ssl 5 | import urllib 6 | from urlparse import urljoin 7 | from urlparse import urlparse 8 | import re 9 | from datetime import datetime, timedelta 10 | 11 | # Not all systems have this so conditionally define parser 12 | try: 13 | import dateutil.parser as parser 14 | except: 15 | pass 16 | 17 | def parsemaildate(md) : 18 | # See if we have dateutil 19 | try: 20 | pdate = parser.parse(tdate) 21 | test_at = pdate.isoformat() 22 | return test_at 23 | except: 24 | pass 25 | 26 | # Non-dateutil version - we try our best 27 | 28 | pieces = md.split() 29 | notz = " ".join(pieces[:4]).strip() 30 | 31 | # Try a bunch of format variations - strptime() is *lame* 32 | dnotz = None 33 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 34 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 35 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 36 | try: 37 | dnotz = datetime.strptime(notz, form) 38 | break 39 | except: 40 | continue 41 | 42 | if dnotz is None : 43 | # print 'Bad Date:',md 44 | return None 45 | 46 | iso = dnotz.isoformat() 47 | 48 | tz = "+0000" 49 | try: 50 | tz = pieces[4] 51 | ival = int(tz) # Only want numeric timezone values 52 | if tz == '-0000' : tz = '+0000' 53 | tzh = tz[:3] 54 | tzm = tz[3:] 55 | tz = tzh+":"+tzm 56 | except: 57 | pass 58 | 59 | return iso+tz 60 | 61 | conn = sqlite3.connect('content.sqlite') 62 | cur = conn.cursor() 63 | conn.text_factory = str 64 | 65 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 66 | 67 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 68 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 69 | subject TEXT, headers TEXT, body TEXT)''') 70 | 71 | start = 0 72 | cur.execute('SELECT 
max(id) FROM Messages') 73 | try: 74 | row = cur.fetchone() 75 | if row[0] is not None: 76 | start = row[0] 77 | except: 78 | start = 0 79 | row = None 80 | 81 | print start 82 | 83 | many = 0 84 | 85 | # Skip up to five messages 86 | skip = 5 87 | while True: 88 | if ( many < 1 ) : 89 | sval = raw_input('How many messages:') 90 | if ( len(sval) < 1 ) : break 91 | many = int(sval) 92 | 93 | start = start + 1 94 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 95 | try: 96 | row = cur.fetchone() 97 | if row is not None : continue 98 | except: 99 | row = None 100 | 101 | many = many - 1 102 | url = baseurl + str(start) + '/' + str(start + 1) 103 | 104 | try: 105 | # Deal with SSL certificate anomalies Python > 2.7 106 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 107 | # document = urllib.urlopen(url, context=scontext) 108 | 109 | document = urllib.urlopen(url) 110 | 111 | text = document.read() 112 | if document.getcode() != 200 : 113 | print "Error code=",document.getcode(), url 114 | break 115 | except KeyboardInterrupt: 116 | print '' 117 | print 'Program interrupted by user...' 118 | break 119 | except: 120 | print "Unable to retrieve or parse page",url 121 | print sys.exc_info()[0] 122 | break 123 | 124 | print url,len(text) 125 | 126 | if not text.startswith("From "): 127 | if skip < 1 : 128 | print text 129 | print "End of mail stream reached..." 
130 | quit () 131 | print " Skipping badly formed message" 132 | skip = skip-1 133 | continue 134 | 135 | pos = text.find("\n\n") 136 | if pos > 0 : 137 | hdr = text[:pos] 138 | body = text[pos+2:] 139 | else: 140 | print text 141 | print "Could not find break between headers and body" 142 | break 143 | 144 | skip = 5 # reset skip count 145 | 146 | email = None 147 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 148 | if len(x) == 1 : 149 | email = x[0]; 150 | email = email.strip().lower() 151 | email = email.replace("<","") 152 | else: 153 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 154 | if len(x) == 1 : 155 | email = x[0]; 156 | email = email.strip().lower() 157 | email = email.replace("<","") 158 | 159 | date = None 160 | y = re.findall('\Date: .*, (.*)\n', hdr) 161 | if len(y) == 1 : 162 | tdate = y[0] 163 | tdate = tdate[:26] 164 | try: 165 | sent_at = parsemaildate(tdate) 166 | except: 167 | print text 168 | print "Parse fail",tdate 169 | break 170 | 171 | subject = None 172 | z = re.findall('\Subject: (.*)\n', hdr) 173 | if len(z) == 1 : subject = z[0].strip().lower(); 174 | 175 | print " ",email,sent_at,subject 176 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 177 | VALUES ( ?, ?, ?, ?, ?, ? 
)''', ( start, email, sent_at, subject, hdr, body)) 178 | 179 | # Only commit every 50th record 180 | # if (many % 50) == 0 : conn.commit() 181 | time.sleep(1) 182 | 183 | conn.commit() 184 | cur.close() 185 | 186 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import re 5 | import zlib 6 | from datetime import datetime, timedelta 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print realsender, sender 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print realsender, sender 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print x,dns 52 | # if dns != dnsmapping.get(dns,dns) : print dns,dnsmapping.get(dns,dns) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | 
def parsemaildate(md) :
    """Convert a mail date like '18 Sep 2013 23:31:35 -0500' to an
    ISO 8601 string with timezone, or return None on failure."""
    # See if we have dateutil
    try:
        # Fixed: was parser.parse(tdate) -- an undefined name in this
        # scope -- so the dateutil fast path always raised NameError
        # and silently fell through to the manual parser below.
        pdate = parser.parse(md)
        test_at = pdate.isoformat()
        return test_at
    except:
        pass

    # Non-dateutil version - we try our best

    pieces = md.split()
    notz = " ".join(pieces[:4]).strip()

    # Try a bunch of format variations - strptime() is *lame*
    dnotz = None
    for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
        '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
        '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except:
            continue

    if dnotz is None :
        # print 'Bad Date:',md
        return None

    iso = dnotz.isoformat()

    # Normalize a trailing numeric zone like '-0500' to '-05:00'.
    # Fixed: validate before assigning so a non-numeric zone ('EST')
    # no longer leaks into the result; we fall back to '+0000'.
    tz = "+0000"
    try:
        ival = int(pieces[4]) # Only want numeric timezone values
        tz = pieces[4]
        if tz == '-0000' : tz = '+0000'
        tzh = tz[:3]
        tzm = tz[3:]
        tz = tzh+":"+tzm
    except:
        pass

    return iso+tz

# Parse out the info...
101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception, e: 124 | # print 'Date ignored ',tdate, e 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | # Open the output database and create empty tables 140 | conn = sqlite3.connect('index.sqlite') 141 | conn.text_factory = str 142 | cur = conn.cursor() 143 | 144 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 145 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 146 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 147 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 148 | 149 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 150 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 151 | sender_id INTEGER, subject_id INTEGER, 152 | headers BLOB, body BLOB)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 154 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 156 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 157 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 
158 | (from_id INTEGER, to_id INTEGER)''') 159 | 160 | # Open the mapping information 161 | conn_1 = sqlite3.connect('mapping.sqlite') 162 | conn_1.text_factory = str 163 | cur_1 = conn_1.cursor() 164 | 165 | # Load up the mapping information into memory structures 166 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 167 | for message_row in cur_1 : 168 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 169 | 170 | mapping = dict() 171 | cur_1.execute('''SELECT old,new FROM Mapping''') 172 | for message_row in cur_1 : 173 | old = fixsender(message_row[0]) 174 | new = fixsender(message_row[1]) 175 | mapping[old] = fixsender(new) 176 | 177 | cur_1.close() 178 | 179 | # Open the raw data retrieved from the network 180 | conn_2 = sqlite3.connect('content.sqlite') 181 | conn_2.text_factory = str 182 | cur_2 = conn_2.cursor() 183 | 184 | allsenders = list() 185 | cur_2.execute('''SELECT email FROM Messages''') 186 | for message_row in cur_2 : 187 | sender = fixsender(message_row[0]) 188 | if sender is None : continue 189 | if 'gmane.org' in sender : continue 190 | if sender in allsenders: continue 191 | allsenders.append(sender) 192 | 193 | print "Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping) 194 | 195 | cur_2.execute('''SELECT headers, body, sent_at 196 | FROM Messages ORDER BY sent_at''') 197 | 198 | senders = dict() 199 | subjects = dict() 200 | guids = dict() 201 | 202 | count = 0 203 | 204 | for message_row in cur_2 : 205 | hdr = message_row[0] 206 | parsed = parseheader(hdr, allsenders) 207 | if parsed is None: continue 208 | (guid, sender, subject, sent_at) = parsed 209 | 210 | # Apply the sender mapping 211 | sender = mapping.get(sender,sender) 212 | 213 | count = count + 1 214 | if count % 250 == 1 : print count,sent_at, sender 215 | # print guid, sender, subject, sent_at 216 | 217 | if 'gmane.org' in sender: 218 | print "Error in sender ===", sender 219 | 220 | sender_id = 
senders.get(sender,None) 221 | subject_id = subjects.get(subject,None) 222 | guid_id = guids.get(guid,None) 223 | 224 | if sender_id is None : 225 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 226 | conn.commit() 227 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 228 | try: 229 | row = cur.fetchone() 230 | sender_id = row[0] 231 | senders[sender] = sender_id 232 | except: 233 | print 'Could not retrieve sender id',sender 234 | break 235 | if subject_id is None : 236 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 237 | conn.commit() 238 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 239 | try: 240 | row = cur.fetchone() 241 | subject_id = row[0] 242 | subjects[subject] = subject_id 243 | except: 244 | print 'Could not retrieve subject id',subject 245 | break 246 | # print sender_id, subject_id 247 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 248 | ( guid, sender_id, subject_id, sent_at, zlib.compress(message_row[0]), zlib.compress(message_row[1])) ) 249 | conn.commit() 250 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 251 | try: 252 | row = cur.fetchone() 253 | message_id = row[0] 254 | guids[guid] = message_id 255 | except: 256 | print 'Could not retrieve guid id',guid 257 | break 258 | 259 | # Close the connections 260 | cur.close() 261 | cur_2.close() 262 | 263 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | import string 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | conn.text_factory = str 9 | cur = conn.cursor() 10 | 11 | cur.execute('''SELECT subject_id,subject FROM Messages 12 | JOIN Subjects ON Messages.subject_id = Subjects.id''') 13 | 14 | counts = dict() 15 | for message_row in cur : 16 | text = message_row[1] 17 | text = text.translate(None, string.punctuation) 18 | text = text.translate(None, '1234567890') 19 | text = text.strip() 20 | text = text.lower() 21 | words = text.split() 22 | for word in words: 23 | if len(word) < 4 : continue 24 | counts[word] = counts.get(word,0) + 1 25 | 26 | # Find the top 100 words 27 | words = sorted(counts, key=counts.get, reverse=True) 28 | highest = None 29 | lowest = None 30 | for w in words[:100]: 31 | if highest is None or highest < counts[w] : 32 | highest = counts[w] 33 | if lowest is None or lowest > counts[w] : 34 | lowest = counts[w] 35 | print 'Range of counts:',highest,lowest 36 | 37 | # Spread the font sizes across 20-100 based on the count 38 | bigsize = 80 39 | smallsize = 20 40 | 41 | fhand = open('gword.js','w') 42 | fhand.write("gword = [") 43 | first = True 44 | for k in words[:100]: 45 | if not first : 
fhand.write( ",\n") 46 | first = False 47 | size = counts[k] 48 | size = (size - lowest) / float(highest - lowest) 49 | size = int((size * bigsize) + smallsize) 50 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 51 | fhand.write( "\n];\n") 52 | 53 | print "Output written to gword.js" 54 | print "Open gword.htm in a browser to view" 55 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | years = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | year = message_row[2][:4] 43 | if year not in years : years.append(year) 44 | key = (year, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (year, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | years.sort() 50 | print counts 51 | print years 52 | 53 | fhand = 
open('gline.js','w') 54 | fhand.write("gline = [ ['Year'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for year in years[1:-1]: 60 | for year in years: 61 | fhand.write(",\n['"+year+"'") 62 | for org in orgs: 63 | key = (year, org) 64 | val = counts.get(key,0) 65 | fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/gmane/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/gmane/mapping.sqlite -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/mailing_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/mailing_list.png -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/mailing_lists.txt: -------------------------------------------------------------------------------- 1 | Mailing Lists 2 | 1. Crawl archive of mailing list 3 | 2. Analyse and clean-up 4 | 3. 
Visualise data 5 | 6 | 7 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/multistep_data_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/multistep_data_analysis.png -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/page_rank_web_search.txt: -------------------------------------------------------------------------------- 1 | Page Rank 2 | 1. Write a simple web page crawler 3 | 2. Compute simple version of Google's Page Rank algorithm 4 | 3. Visualize resulting network 5 | 6 | Search Engine Architecture 7 | 1. Web Crawling 8 | - Browses the WWW in a methodical and automated manner 9 | - Create copy of pages to be indexed for fast searching 10 | a. Create list of websites to crawl 11 | b. Retrieve page 12 | c. Look through for links 13 | d. Add links to list 14 | e. Repeat 15 | 2. Index Building 16 | - Collects, parses and stores data 17 | - Facilitate fast & accurate data retrieval 18 | 19 | 3. Searching 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 
 15 | 16 | Mac: rm spider.sqlite 17 | Mac: python spider.py 18 | 19 | Win: del spider.sqlite 20 | Win: spider.py 21 | 22 | Enter web url or enter: http://www.dr-chuck.com/ 23 | ['http://www.dr-chuck.com'] 24 | How many pages:2 25 | 1 http://www.dr-chuck.com/ 12 26 | 2 http://www.dr-chuck.com/csev-blog/ 57 27 | How many pages: 28 | 29 | In this sample run, we told it to crawl a website and retrieve two 30 | pages. If you restart the program again and tell it to crawl more 31 | pages, it will not re-crawl any pages already in the database. Upon 32 | restart it goes to a random non-crawled page and starts there. So 33 | each successive run of spider.py is additive. 34 | 35 | Mac: python spider.py 36 | Win: spider.py 37 | 38 | Enter web url or enter: http://www.dr-chuck.com/ 39 | ['http://www.dr-chuck.com'] 40 | How many pages:3 41 | 3 http://www.dr-chuck.com/csev-blog 57 42 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 43 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 44 | How many pages: 45 | 46 | You can have multiple starting points in the same database - 47 | within the program these are called "webs". The spider 48 | chooses randomly amongst all non-visited links across all 49 | the webs. 50 | 51 | If your code fails complaining about certificate problems, 52 | there is some code (SSL) that can be un-commented to work 53 | around certificate problems. 54 | 55 | If you want to dump the contents of the spider.sqlite file, you can 56 | run spdump.py as follows: 57 | 58 | Mac: python spdump.py 59 | Win: spdump.py 60 | 61 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 62 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 63 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 64 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 65 | 4 rows. 66 | 67 | This shows the number of incoming links, the old page rank, the new page 68 | rank, the id of the page, and the url of the page. 
The spdump.py program 69 | only shows pages that have at least one incoming link to them. 70 | 71 | Once you have a few pages in the database, you can run Page Rank on the 72 | pages using the sprank.py program. You simply tell it how many Page 73 | Rank iterations to run. 74 | 75 | Mac: python sprank.py 76 | Win: sprank.py 77 | 78 | How many iterations:2 79 | 1 0.546848992536 80 | 2 0.226714939664 81 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 82 | 83 | You can dump the database again to see that page rank has been updated: 84 | 85 | Mac: python spdump.py 86 | Win: spdump.py 87 | 88 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 89 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 90 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 91 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 92 | 4 rows. 93 | 94 | You can run sprank.py as many times as you like and it will simply refine 95 | the page rank the more times you run it. You can even run sprank.py a few times 96 | and then go spider a few more pages sith spider.py and then run sprank.py 97 | to converge the page ranks. 98 | 99 | If you want to restart the Page Rank calculations without re-spidering the 100 | web pages, you can use spreset.py 101 | 102 | Mac: python spreset.py 103 | Win: spreset.py 104 | 105 | All pages set to a rank of 1.0 106 | 107 | Mac: python sprank.py 108 | Win: sprank.py 109 | 110 | How many iterations:50 111 | 1 0.546848992536 112 | 2 0.226714939664 113 | 3 0.0659516187242 114 | 4 0.0244199333 115 | 5 0.0102096489546 116 | 6 0.00610244329379 117 | ... 
118 | 42 0.000109076928206 119 | 43 9.91987599002e-05 120 | 44 9.02151706798e-05 121 | 45 8.20451504471e-05 122 | 46 7.46150183837e-05 123 | 47 6.7857770908e-05 124 | 48 6.17124694224e-05 125 | 49 5.61236959327e-05 126 | 50 5.10410499467e-05 127 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 128 | 129 | For each iteration of the page rank algorithm it prints the average 130 | change per page of the page rank. The network initially is quite 131 | unbalanced and so the individual page ranks are changeing wildly. 132 | But in a few short iterations, the page rank converges. You 133 | should run prank.py long enough that the page ranks converge. 134 | 135 | If you want to visualize the current top pages in terms of page rank, 136 | run spjson.py to write the pages out in JSON format to be viewed in a 137 | web browser. 138 | 139 | Mac: python spjson.py 140 | Win: spjson.py 141 | 142 | Creating JSON output on spider.js... 143 | How many nodes? 30 144 | Open force.html in a browser to view the visualization 145 | 146 | You can view this data by opening the file force.html in your web browser. 147 | This shows an automatic layout of the nodes and links. You can click and 148 | drag any node and you can also double click on a node to find the URL 149 | that is represented by the node. 150 | 151 | This visualization is provided using the force layout from: 152 | 153 | http://mbostock.github.com/d3/ 154 | 155 | If you rerun the other utilities and then re-run spjson.py - you merely 156 | have to press refresh in the browser to get the new data from spider.js. 
157 | 158 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", 
function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print row 14 | count = count + 1 15 | print count, 'rows.' 16 | cur.close() 17 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":1,"rank":0.0, "id":1, "url":"http://python-data.dr-chuck.net"}, 3 | {"weight":1,"rank":4.66423227024, "id":4, "url":"http://python-data.dr-chuck.net/comments_42.html"}, 4 | {"weight":1,"rank":1.38142061792, "id":7, "url":"http://python-data.dr-chuck.net/known_by_42.html"}, 5 | {"weight":1,"rank":0.690710255581, "id":9, "url":"http://python-data.dr-chuck.net/known_by_Kaylyn.html"}, 6 | {"weight":2,"rank":2.26669663573, "id":40, "url":"http://python-data.dr-chuck.net/known_by_Takua.html"}, 7 | {"weight":1,"rank":0.690710255581, "id":82, "url":"http://python-data.dr-chuck.net/known_by_Marwan.html"}, 8 | {"weight":2,"rank":7.45553422719, "id":85, "url":"http://python-data.dr-chuck.net/known_by_Samiya.html"}, 9 | {"weight":2,"rank":8.48734569457, "id":145, "url":"http://python-data.dr-chuck.net/known_by_Shihed.html"}, 10 | {"weight":1,"rank":0.518032667194, "id":189, "url":"http://python-data.dr-chuck.net/known_by_Cassidy.html"}, 11 | {"weight":2,"rank":1.56869025396, "id":199, 
"url":"http://python-data.dr-chuck.net/known_by_Vinnie.html"}, 12 | {"weight":2,"rank":2.54881807574, "id":203, "url":"http://python-data.dr-chuck.net/known_by_Charlee.html"}, 13 | {"weight":1,"rank":8.83695381234, "id":248, "url":"http://python-data.dr-chuck.net/known_by_Atli.html"}, 14 | {"weight":2,"rank":4.16614971195, "id":309, "url":"http://python-data.dr-chuck.net/known_by_Abbiegail.html"}, 15 | {"weight":2,"rank":2.2314317079, "id":326, "url":"http://python-data.dr-chuck.net/known_by_Nisha.html"}, 16 | {"weight":1,"rank":1.21603900362, "id":382, "url":"http://python-data.dr-chuck.net/known_by_Ciar.html"}, 17 | {"weight":1,"rank":1.89945314693, "id":413, "url":"http://python-data.dr-chuck.net/known_by_Brodie.html"}, 18 | {"weight":2,"rank":19.0, "id":501, "url":"http://python-data.dr-chuck.net/known_by_Kylar.html"}, 19 | {"weight":2,"rank":5.3834045047, "id":642, "url":"http://python-data.dr-chuck.net/known_by_Mohamed.html"}, 20 | {"weight":1,"rank":3.93023811326, "id":676, "url":"http://python-data.dr-chuck.net/known_by_Oluwaferanmi.html"}, 21 | {"weight":1,"rank":2.59745947896, "id":813, "url":"http://python-data.dr-chuck.net/known_by_Maree.html"}, 22 | {"weight":1,"rank":1.77055254257, "id":873, "url":"http://python-data.dr-chuck.net/known_by_Shaw.html"}], 23 | "links":[ 24 | {"source":0,"target":1,"value":3}, 25 | {"source":0,"target":2,"value":3}, 26 | {"source":0,"target":0,"value":3}, 27 | {"source":2,"target":3,"value":3}, 28 | {"source":2,"target":4,"value":3}, 29 | {"source":2,"target":5,"value":3}, 30 | {"source":2,"target":6,"value":3}, 31 | {"source":5,"target":7,"value":3}, 32 | {"source":5,"target":8,"value":3}, 33 | {"source":5,"target":9,"value":3}, 34 | {"source":5,"target":10,"value":3}, 35 | {"source":6,"target":11,"value":3}, 36 | {"source":4,"target":12,"value":3}, 37 | {"source":4,"target":13,"value":3}, 38 | {"source":4,"target":14,"value":3}, 39 | {"source":8,"target":15,"value":3}, 40 | {"source":7,"target":16,"value":3}, 41 | 
{"source":13,"target":17,"value":3}, 42 | {"source":10,"target":18,"value":3}, 43 | {"source":14,"target":19,"value":3}, 44 | {"source":18,"target":20,"value":3}, 45 | {"source":18,"target":17,"value":3}, 46 | {"source":20,"target":9,"value":3}, 47 | {"source":17,"target":6,"value":3}, 48 | {"source":9,"target":12,"value":3}]}; -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib 3 | import ssl 4 | from urlparse import urljoin 5 | from urlparse import urlparse 6 | from BeautifulSoup import * 7 | 8 | # Deal with SSL certificate anomalies Python > 2.7 9 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 10 | scontext = None 11 | 12 | conn = sqlite3.connect('spider.sqlite') 13 | cur = conn.cursor() 14 | 15 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 16 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 17 | error INTEGER, old_rank REAL, new_rank REAL)''') 18 | 19 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 20 | (from_id INTEGER, to_id INTEGER)''') 21 | 22 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 23 | 24 | # Check to see if we are already in progress... 25 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 26 | row = cur.fetchone() 27 | if row is not None: 28 | print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl." 
29 | else : 30 | starturl = raw_input('Enter web url or enter: ') 31 | if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/' 32 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 33 | web = starturl 34 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 35 | pos = starturl.rfind('/') 36 | web = starturl[:pos] 37 | 38 | if ( len(web) > 1 ) : 39 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) ) 40 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 41 | conn.commit() 42 | 43 | # Get the current webs 44 | cur.execute('''SELECT url FROM Webs''') 45 | webs = list() 46 | for row in cur: 47 | webs.append(str(row[0])) 48 | 49 | print webs 50 | 51 | many = 0 52 | while True: 53 | if ( many < 1 ) : 54 | sval = raw_input('How many pages:') 55 | if ( len(sval) < 1 ) : break 56 | many = int(sval) 57 | many = many - 1 58 | 59 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 60 | try: 61 | row = cur.fetchone() 62 | # print row 63 | fromid = row[0] 64 | url = row[1] 65 | except: 66 | print 'No unretrieved HTML pages found' 67 | many = 0 68 | break 69 | 70 | print fromid, url, 71 | 72 | # If we are retrieving this page, there should be no links from it 73 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 74 | try: 75 | # Deal with SSL certificate anomalies Python > 2.7 76 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 77 | # document = urllib.urlopen(url, context=scontext) 78 | 79 | # Normal Unless you encounter certificate problems 80 | document = urllib.urlopen(url) 81 | 82 | html = document.read() 83 | if document.getcode() != 200 : 84 | print "Error on page: ",document.getcode() 85 | cur.execute('UPDATE Pages SET error=? 
WHERE url=?', (document.getcode(), url) ) 86 | 87 | if 'text/html' != document.info().gettype() : 88 | print "Ignore non text/html page" 89 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 90 | conn.commit() 91 | continue 92 | 93 | print '('+str(len(html))+')', 94 | 95 | soup = BeautifulSoup(html) 96 | except KeyboardInterrupt: 97 | print '' 98 | print 'Program interrupted by user...' 99 | break 100 | except: 101 | print "Unable to retrieve or parse page" 102 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 103 | conn.commit() 104 | continue 105 | 106 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 107 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) ) 108 | conn.commit() 109 | 110 | # Retrieve all of the anchor tags 111 | tags = soup('a') 112 | count = 0 113 | for tag in tags: 114 | href = tag.get('href', None) 115 | if ( href is None ) : continue 116 | # Resolve relative references like href="/contact" 117 | up = urlparse(href) 118 | if ( len(up.scheme) < 1 ) : 119 | href = urljoin(url, href) 120 | ipos = href.find('#') 121 | if ( ipos > 1 ) : href = href[:ipos] 122 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 123 | if ( href.endswith('/') ) : href = href[:-1] 124 | # print href 125 | if ( len(href) < 1 ) : continue 126 | 127 | # Check if the URL is in any of the webs 128 | found = False 129 | for web in webs: 130 | if ( href.startswith(web) ) : 131 | found = True 132 | break 133 | if not found : continue 134 | 135 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 136 | count = count + 1 137 | conn.commit() 138 | 139 | cur.execute('SELECT id FROM Pages WHERE url=? 
LIMIT 1', ( href, )) 140 | try: 141 | row = cur.fetchone() 142 | toid = row[0] 143 | except: 144 | print 'Could not retrieve id' 145 | continue 146 | # print fromid, toid 147 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 148 | 149 | 150 | print count 151 | 152 | cur.close() 153 | 154 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print "Creating JSON output on spider.js..." 7 | howmany = int(raw_input("How many nodes? ")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank < rank or maxrank is None : maxrank = rank 22 | if minrank > rank or minrank is None : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print "Error - please run sprank.py to compute page rank" 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM 
Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print "Open force.html in a browser to view the visualization" 62 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = raw_input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print "Nothing to page rank. Check data." 
40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in prev_ranks.items(): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in prev_ranks.items(): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in next_ranks.items(): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in next_ranks.items(): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in prev_ranks.items(): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print i+1, avediff 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print next_ranks.items()[:5] 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in next_ranks.items() : 100 | cur.execute('''UPDATE Pages SET new_rank=? 
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print "All pages set to a rank of 1.0" 12 | -------------------------------------------------------------------------------- /python_databases/w5_dbvisualisation/web_crawling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritchieng/python-for-everybody/3a185b2d34badd5f64ff067b0ec9185b76f92a70/python_databases/w5_dbvisualisation/web_crawling.png --------------------------------------------------------------------------------

48 | This is a cool map from 49 | www.pythonlearn.com. 50 |