├── .gitignore ├── .project ├── .pydevproject ├── Anaconda_CheatSheet.pdf ├── Course 1 - Programming for everybody (Getting started) ├── ex01 │ ├── first.py │ └── test.py ├── ex02 │ ├── ex02_02.py │ └── ex02_03.py ├── ex03 │ ├── ex03_01.py │ └── ex03_03.py ├── ex04 │ └── ex04_06.py └── ex05 │ ├── ex05_01.py │ └── ex05_02.py ├── Course 2 - Python data structures ├── ex06 │ └── ex06_05.py ├── ex07 │ ├── ex07_01.py │ ├── ex07_02.py │ ├── mbox-short.txt │ └── words.txt ├── ex08 │ ├── ex08_04.py │ ├── ex08_05.py │ ├── mbox-short.txt │ └── romeo.txt ├── ex09 │ ├── ex09_04.py │ └── mbox-short.txt └── ex10 │ ├── ex10_02.py │ └── mbox-short.txt ├── Course 3 - Using Python to access web data ├── ex11 │ ├── ex11.py │ ├── regex_sum_27824.txt │ └── regex_sum_42.txt ├── ex12 │ ├── following_links.py │ ├── scraping.py │ └── socket1.py └── ex13 │ ├── extracting_data_from_JSON.py │ ├── extracting_data_from_xml.py │ └── using_GeoJSON_API.py ├── Course 4 - Using databases with Python ├── ex15 │ ├── Library.xml │ ├── emaildb.sqlite │ ├── ex15 - hexa.db │ ├── ex15_counting_e-mail_in_a_database.py │ ├── ex15_multi-table_database_tracks.py │ ├── ex15_roster.py │ ├── mbox.txt │ ├── roster_data.json │ ├── rosterdb.sqlite │ └── trackdb.sqlite └── ex16 │ ├── README.txt │ ├── geodata.sqlite │ ├── geodump running.jpg │ ├── geodump.py │ ├── geoload running.jpg │ ├── geoload.py │ ├── where.data │ ├── where.html │ ├── where.js │ └── zoomed map with added location.jpg ├── Course 5 - Capstone - Retrieving, processing and visualising data with Python ├── ex17 │ ├── LICENSE │ ├── README.txt │ ├── __pycache__ │ │ └── spider.cpython-36.pyc │ ├── d3.v2.js │ ├── dr-chuck-site-dump.jpg │ ├── dr-chuck-site-top25.jpg │ ├── force.css │ ├── force.html │ ├── force.js │ ├── spdump.py │ ├── spider-coincube.sqlite │ ├── spider-dr-chuck.sqlite │ ├── spider.js │ ├── spider.py │ ├── spider.sqlite │ ├── spjson.py │ ├── sprank.py │ ├── spreset.py │ ├── variance-site-dump.jpg │ └── variance-top25.jpg └── ex18 │ └── 
gmane │ ├── README.txt │ ├── content.sqlite │ ├── content.sqlite-journal.temp │ ├── content.sqlite.first.jpg │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── gbasic.py.running.jpg │ ├── gbasic.py.running2.jpg │ ├── gline.htm │ ├── gline.jpg │ ├── gline.js │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gmodel.py.running.jpg │ ├── gword.htm │ ├── gword.jpg │ ├── gword.js │ ├── gword.py │ ├── gyear.py │ ├── index.sqlite │ ├── index.sqlite.second.jpg │ └── mapping.sqlite ├── Python for Everybody.pdf └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | othw_cmd.JPG 2 | othw_editor.JPG 3 | pythonlearn.epub 4 | */*.jpg 5 | */*/*.jpg -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | py4e 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | 6 | python 3.6 7 | Default 8 | 9 | -------------------------------------------------------------------------------- /Anaconda_CheatSheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Anaconda_CheatSheet.pdf -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex01/first.py: -------------------------------------------------------------------------------- 1 | print('hello world') -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex01/test.py: 
-------------------------------------------------------------------------------- 1 | print('test') 2 | -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex02/ex02_02.py: -------------------------------------------------------------------------------- 1 | # Welcoming the user 2 | name = input("Enter your name") 3 | print("Hello", name) -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex02/ex02_03.py: -------------------------------------------------------------------------------- 1 | # Ask user to input hours worked and rate 2 | hrs = input("Enter Hours:") 3 | rate = input("Enter Rate:") 4 | 5 | # Compute pay and print 6 | pay = float(hrs) * float(rate) 7 | print("Pay:", pay) -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex03/ex03_01.py: -------------------------------------------------------------------------------- 1 | # Asking input from user on hours and rate 2 | hrs = input("Enter Hours:") 3 | rate = input("Rate per Hour:") 4 | 5 | # Converting the input into float numbers 6 | h = float(hrs) 7 | r = float(rate) 8 | 9 | # Print gross pay using a multiplier for rates above 40 hours 10 | if h <= 40.0 : 11 | print(h * r) 12 | else : 13 | multiplier = 1.5 14 | print((40 * r) + ((h - 40) * r * multiplier)) -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex03/ex03_03.py: -------------------------------------------------------------------------------- 1 | score = input("Enter Score: ") 2 | try : 3 | sc = float(score) 4 | except : 5 | print("Error, please enter a number.") 6 | quit() 7 | 8 | if sc >= 1.0 : 9 | print("Please enter a number smaller than or equal to 1.0.") 10 | elif sc >= 0.9 : 11 | print("A") 12 | elif sc 
>= 0.8 : 13 | print("B") 14 | elif sc >= 0.7 : 15 | print("C") 16 | elif sc >= 0.6 : 17 | print("D") 18 | elif sc >= 0.0 : 19 | print("F") 20 | else : 21 | print("Please enter a number greater than or equal to 0.0.") -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex04/ex04_06.py: -------------------------------------------------------------------------------- 1 | # Defining the "computepay" function to return the gross pay according to schedule 2 | def computepay(h, r): 3 | if h <= 40.0 : 4 | return h * r 5 | else : 6 | m = 1.5 7 | return (40 * r) + ((h - 40) * r * m) 8 | 9 | 10 | # Asking input from user on hours and rate 11 | hrs = input("Enter Hours: ") 12 | rate = input("Rate per Hour: ") 13 | 14 | # Converting the input into float numbers 15 | h = float(hrs) 16 | r = float(rate) 17 | 18 | # Print gross pay using a multiplier for rates above 40 hours 19 | print(computepay(h, r)) -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex05/ex05_01.py: -------------------------------------------------------------------------------- 1 | # Initialising variables 2 | count = 0 3 | total = 0 4 | 5 | # Starting loop 6 | while True: 7 | # Ask for the user input 8 | sval = input('Enter a number: ') 9 | 10 | # If user types 'done' then exit 11 | if sval == 'done': 12 | break 13 | 14 | # Trying to convert user input to a value, but if it is not working, give an error message 15 | try: 16 | fval = float(sval) 17 | except: 18 | print('Invalid input') 19 | continue 20 | 21 | # Adjusting counters 22 | count = count + 1 23 | total = total + fval 24 | 25 | # Printing the total, the number of inputs and the average 26 | print(total, count, total/count) -------------------------------------------------------------------------------- /Course 1 - Programming for everybody (Getting started)/ex05/ex05_02.py: 
-------------------------------------------------------------------------------- 1 | # Initialising variables 2 | smallest = None 3 | largest = None 4 | 5 | # Starting loop 6 | while True: 7 | # Ask for the user input 8 | sval = input('Enter a number: ') 9 | 10 | # If user types 'done' then exit 11 | if sval == 'done': break 12 | 13 | # Trying to convert user input to a value, but if it is not working, give an error message 14 | try: 15 | fval = float(sval) 16 | except: 17 | print('Invalid input') 18 | continue 19 | 20 | # Adjusting counters 21 | if ((smallest is None) or (fval < smallest)): 22 | smallest = int(fval) 23 | if ((largest is None) or (fval > largest)): 24 | largest = int(fval) 25 | 26 | # Printing the total, the number of inputs and the average 27 | print('Maximum is', largest) 28 | print('Minimum is', smallest) -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex06/ex06_05.py: -------------------------------------------------------------------------------- 1 | text = "X-DSPAM-Confidence: 0.8475"; 2 | 3 | # Finding the colon ':' sign in the text 4 | colon = text.find(':') 5 | # Taking the part of the text after the colon 6 | snumforslice = text[colon+1:] 7 | # Stripping the text taken 8 | snum = snumforslice.strip() 9 | # Printing the converted text to float 10 | print(float(snum)) -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex07/ex07_01.py: -------------------------------------------------------------------------------- 1 | # Use words.txt as the file name 2 | fname = input("Enter file name: ") 3 | fh = open(fname) 4 | ftext = fh.read() 5 | ftext = ftext.strip() 6 | print(ftext.upper()) -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex07/ex07_02.py: -------------------------------------------------------------------------------- 1 | # Use the file 
name mbox-short.txt as the file name 2 | fname = input("Enter file name: ") 3 | fh = open(fname) 4 | total = 0 5 | count = 0 6 | for line in fh: 7 | if not line.startswith("X-DSPAM-Confidence:") : 8 | continue 9 | else : 10 | snumt = line[20:] 11 | snum = snumt.strip() 12 | num = float(snum) 13 | total = total + num 14 | count = count + 1 15 | avg = total / count 16 | print("Average spam confidence:",avg) 17 | -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex07/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vasts amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were reptitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | -------------------------------------------------------------------------------- /Course 2 - 
Python data structures/ex08/ex08_04.py: -------------------------------------------------------------------------------- 1 | # Opening the file 2 | fname = input("Enter file name: ") 3 | fh = open(fname) 4 | 5 | # Defining the list to be built 6 | lst = list() 7 | 8 | # Reading the file line-by-line 9 | for line in fh: 10 | # For each line, splitting the line into words 11 | line.rstrip() 12 | ls = line.split() 13 | # For each word checking if already being on the list and appending to it if not 14 | for word in ls: 15 | if word not in lst: lst.append(word) 16 | 17 | # Sorting the list and printing 18 | lst.sort() 19 | print(lst) -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex08/ex08_05.py: -------------------------------------------------------------------------------- 1 | fname = input("Enter file name: ") 2 | if len(fname) < 1 : fname = "mbox-short.txt" 3 | 4 | fh = open(fname) 5 | count = 0 6 | 7 | # Reading file line-by-line 8 | for line in fh: 9 | line.rstrip() 10 | # Looking for lines starting with 'From' 11 | if line.startswith('From ') : 12 | # Splitting lines starting with 'From' 13 | ls = line.split() 14 | # Printing the second item (e-mail address) of the split 15 | print(ls[1]) 16 | count = count + 1 17 | 18 | # Printing the count of e-mail addresses 19 | print("There were", count, "lines in the file with From as the first word") -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex08/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex09/ex09_04.py: 
-------------------------------------------------------------------------------- 1 | # Asking user to enter the source file 2 | fname = input("Enter file:") 3 | if len(fname) < 1 : fname = "mbox-short.txt" 4 | # Opening the source file 5 | fhandle = open(fname) 6 | 7 | # Creating a dictionary for the senders' e-mail addresses 8 | sender = dict() 9 | 10 | # Reading file line-by-line 11 | for line in fhandle: 12 | line.rstrip() 13 | # Looking for lines starting with 'From' 14 | if line.startswith('From ') : 15 | # Splitting lines starting with 'From' 16 | ls = line.split() 17 | # Adding the second item (e-mail address) of the split if not yet in the dictionary and counting 18 | sender[ls[1]] = sender.get(ls[1], 0) + 1 19 | 20 | # Selecting the most frequently occuring e-mail address and its count 21 | most_email = None 22 | most_count = None 23 | for email,count in sender.items(): 24 | if most_count is None or count > most_count : 25 | most_email = email 26 | most_count = count 27 | 28 | # Printing the most frequent e-mail address and its number of occurence 29 | print(most_email, most_count) -------------------------------------------------------------------------------- /Course 2 - Python data structures/ex10/ex10_02.py: -------------------------------------------------------------------------------- 1 | # Asking user to enter the source file 2 | fname = input("Enter file:") 3 | if len(fname) < 1 : fname = "mbox-short.txt" 4 | # Opening the source file 5 | fhandle = open(fname) 6 | 7 | # Creating a dictionary for the hours 8 | hours = dict() 9 | 10 | # Reading file line-by-line 11 | for line in fhandle: 12 | line.rstrip() 13 | # Looking for lines starting with 'From' 14 | if line.startswith('From ') : 15 | # Splitting lines starting with 'From' 16 | ls = line.split() 17 | # Taking the split with the time 18 | time = ls[5] 19 | # Splitting the time 20 | tm = time.split(':') 21 | # Taking the split with the hour 22 | hour = tm[0] 23 | # Adding the hour in the 
dictionary and counting 24 | hours[hour] = hours.get(hour, 0) + 1 25 | 26 | # Printing the hours and their counts in ascending order by hours 27 | for k,v in sorted(hours.items()): 28 | print(k,v) -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex11/ex11.py: -------------------------------------------------------------------------------- 1 | # Importing the regex library 2 | import re 3 | 4 | # Asking user to enter the source file 5 | fname = input("Enter file name: ") 6 | if len(fname) < 1 : fname = "regex_sum_42.txt" 7 | # Opening the source file 8 | fhandle = open(fname) 9 | 10 | # Initialising the sum of the numbers 11 | total = 0 12 | 13 | # Reading file line-by-line 14 | for line in fhandle: 15 | # Finding all the numbers as strings into listOfNums 16 | listOfNums = re.findall('([0-9]+)', line.rstrip()) 17 | # If there is not any number in a line go to the next loop... 18 | if len(listOfNums) < 1 : 19 | continue 20 | else : 21 | # ... else looping through the list of numbers found... 22 | for snum in listOfNums : 23 | # ... and add up the converted numbers 24 | total = total + int(snum) 25 | 26 | # Printing the sum of the numbers 27 | print(total) 28 | -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex11/regex_sum_27824.txt: -------------------------------------------------------------------------------- 1 | This file contains the actual data for your assignment - good luck! 2 | 3 | 4 | Why should you learn to write programs? 5 | 6 | Writing programs (or programming) is a very creative 7 | and rewarding activity. You can write programs for 8 | many reasons, ranging from making your living to solving 9 | a difficult data analysis problem to having fun to helping 10 | someone else solve a problem. 
This book assumes that 11 | everyone needs to know how to program, and that once 12 | you know how to program you will figure out what you want 13 | to do with your newfound skills. 14 | 15 | We are surrounded in our daily lives with computers ranging 16 | from laptops to cell phones. We can think of these computers 17 | as our personal assistants who can take care of many things 18 | on our behalf. The hardware in our current-day computers 19 | 7971 is essentially built to continuously ask us the question, 20 | 3634 What would you like me to do next? 6057 21 | 22 | Programmers add an operating system and a set of applications 23 | to the hardware and we end up with a Personal Digital 24 | Assistant that is quite helpful and capable of helping 25 | us do many different things. 26 | 27 | Our computers are fast and have vast amounts of memory and 28 | could be very helpful to us if we only knew the language to 29 | speak to explain to the computer what we would like it to 30 | 5789 do next. If we knew this language, we could tell the 4701 31 | computer to do tasks on our behalf that were repetitive. 32 | Interestingly, the kinds of things computers can do best 33 | are often the kinds of things that we humans find boring 34 | and mind-numbing. 35 | 36 | For example, look at the first three paragraphs of this 37 | chapter and tell me the most commonly used word and how 38 | many times the word is used. While you were able to read 39 | and understand the words in a few seconds, counting them 40 | is almost painful because it is not the kind of problem 41 | that human minds are designed to solve. 
For a computer 42 | the opposite is true, reading and understanding text 43 | from a piece of paper is hard for a computer to do 44 | but counting the words and telling you how many times 45 | the most used word was used is very easy for the 46 | 2412 computer: 216 47 | 48 | Our personal information analysis assistant quickly 49 | told us that the word to was used sixteen times in the 50 | first three paragraphs of this chapter. 51 | 52 | This very fact that computers are good at things 53 | that humans are not is why you need to become 54 | skilled at talking computer language. Once you learn 55 | this new language, you can delegate mundane tasks 56 | to your partner (the computer), leaving more time 57 | for you to do the 58 | things that you are uniquely suited for. You bring 59 | creativity, intuition, and inventiveness to this 60 | partnership. 61 | 62 | Creativity and motivation 63 | 9783 1690 64 | 589 While this book is not intended for professional programmers, professional 5572 65 | programming can be a very rewarding job both financially and personally. 66 | Building useful, elegant, and clever programs for others to use is a very 67 | creative activity. Your computer or Personal Digital Assistant (PDA) 68 | usually contains many different programs from many different groups of 69 | programmers, each competing for your attention and interest. They try 70 | their best to meet your needs and give you a great user experience in the 71 | process. In some situations, when you choose a piece of software, the 72 | programmers are directly compensated because of your choice. 73 | 74 | If we think of programs as the creative output of groups of programmers, 75 | perhaps the following figure is a more sensible version of our PDA: 76 | 77 | For now, our primary motivation is not to make money or please end users, but 78 | instead for us to be more productive in handling the data and 79 | information that we will encounter in our lives. 
80 | When you first start, you will be both the programmer and the end user of 81 | your programs. As you gain skill as a programmer and 82 | 6807 programming feels more creative to you, your thoughts may turn 6119 83 | toward developing programs for others. 84 | 85 | 9372 Computer hardware architecture 1821 86 | 87 | Before 3145 we 3306 start 558 learning the language we 88 | speak to give instructions to computers to 89 | develop software, we need to learn a small amount about 90 | how computers are built. 91 | 92 | Central Processing Unit (or CPU) is 93 | the part of the computer that is built to be obsessed 94 | with what is next? If your computer is rated 95 | at three Gigahertz, it means that the CPU will ask What next? 96 | three billion times per second. You are going to have to 97 | learn how to talk fast to keep up with the CPU. 98 | 99 | Main Memory is used to store information 100 | that the CPU needs in a hurry. The main memory is nearly as 101 | fast as the CPU. But the information stored in the main 102 | memory vanishes when the computer is turned off. 103 | 104 | Secondary Memory is also used to store 105 | information, but it is much slower than the main memory. 106 | The advantage of the secondary memory is that it can 107 | store information even when there is no power to the 108 | computer. Examples of secondary memory are disk drives 109 | or flash memory (typically found in USB sticks and portable 110 | music players). 111 | 112 | Input and Output Devices are simply our 113 | screen, keyboard, mouse, microphone, speaker, touchpad, etc. 114 | They are all of the ways we interact with the computer. 115 | 116 | These days, most computers also have a 117 | Network Connection to retrieve information over a network. 118 | We can think of the network as a very slow place to store and 119 | retrieve data that might not always be up. So in a sense, 120 | the network is a slower and at times unreliable form of 121 | Secondary Memory. 
122 | 123 | 4108 While most of the detail of how these components work is best left 124 | to computer builders, it helps to have some terminology 125 | so we can talk about these different parts as we write our programs. 126 | 2439 127 | As a programmer, your job is to use and orchestrate 128 | each of these resources to solve the problem that you need to solve 129 | 6725 and analyze the data you get from the solution. As a programmer you will 130 | mostly be talking to the CPU and telling it what to 131 | do next. Sometimes you will tell the CPU to use the main memory, 132 | secondary 4782 memory, 3702 network, 4024 or the input/output devices. 133 | 134 | You need to be the person who answers the CPU's What next? 135 | question. But it would be very uncomfortable to shrink you 136 | down to five mm tall and insert you into the computer just so you 137 | could issue a command three billion times per second. So instead, 138 | you must write down your instructions in advance. 139 | We call these stored instructions a program and the act 140 | of writing these instructions down and getting the instructions to 141 | be correct programming. 142 | 143 | 5151 6693 581 144 | 145 | In the rest of this book, we will try to turn you into a person 146 | who is skilled in the art of programming. In the end you will be a 147 | programmer --- perhaps not a professional programmer, but 148 | at least you will have the skills to look at a data/information 149 | analysis problem and develop a program to solve the problem. 150 | 2000 2477 151 | 8429 problem solving 7557 152 | 153 | In a sense, you need two skills to be a programmer: 154 | 155 | First, you need to know the programming language (Python) - 156 | you need to know the vocabulary and the grammar. You need to be able 157 | to spell the words in this new language properly and know how to construct 158 | 5272 well-formed sentences in this new language. 7193 159 | 160 | Second, you need to tell a story. 
In writing a story, 161 | you combine words and sentences to convey an idea to the reader. 162 | There is a skill and art in constructing the story, and skill in 163 | story writing is improved by doing some writing and getting some 164 | feedback. In programming, our program is the story and the 165 | problem you are trying to solve is the idea. 166 | 167 | itemize 168 | 169 | Once you learn one programming language such as Python, you will 170 | find it much easier to learn a second programming language such 171 | as JavaScript or C++. The new programming language has very different 172 | vocabulary and grammar but the problem-solving skills 173 | will be the same across all programming languages. 174 | 175 | You will learn the vocabulary and sentences of Python pretty quickly. 176 | It will take longer for you to be able to write a coherent program 177 | to solve a brand-new problem. We teach programming much like we teach 178 | 4790 writing. We start reading and explaining programs, then we write 1143 179 | simple programs, and then we write increasingly complex programs over time. 180 | At some point you get your muse and see the patterns on your own 181 | and can see more naturally how to take a problem and 182 | write a program that solves that problem. And once you get 183 | to that point, programming becomes a very pleasant and creative process. 184 | 185 | 7747 We start with the vocabulary and structure of Python programs. Be patient 179 186 | as the simple examples remind you of when you started reading for the first 187 | time. 188 | 8360 4425 2668 189 | Words and sentences 190 | 191 | Unlike human languages, the Python vocabulary is actually pretty small. 192 | We call this vocabulary the reserved words. These are words that 193 | have very special meaning to Python. When Python sees these words in 194 | a Python program, they have one and only one meaning to Python. 
Later 195 | as you write programs you will make up your own words that have meaning to 196 | you called variables. You will have great latitude in choosing 197 | your names for your variables, but you cannot use any of Python's 198 | reserved words as a name for a variable. 199 | 200 | When we train a dog, we use special words like 201 | sit, stay, and fetch. When you talk to a dog and 202 | don't use any of the reserved words, they just look at you with a 203 | quizzical look on their face until you say a reserved word. 204 | For example, if you say, 205 | I wish more people would walk to improve their overall health, 206 | what most dogs likely hear is, 207 | blah blah blah walk blah blah blah blah. 208 | That is because walk is a reserved word in dog language. 209 | 210 | The reserved words in the language where humans talk to 211 | Python include the following: 212 | 213 | and del from not while 214 | as elif global or with 215 | assert else if pass yield 216 | break except import print 217 | class exec in raise 218 | continue finally is return 219 | def for lambda try 220 | 221 | That is it, and unlike a dog, Python is already completely trained. 222 | When you say try, Python will try every time you say it without 223 | fail. 224 | 6741 225 | We will learn these reserved words and how they are used in good time, 226 | 577 but for now we will focus on the Python equivalent of speak (in 227 | human-to-dog language). The nice thing about telling Python to speak 228 | is that we can even tell it what to say by giving it a message in quotes: 229 | 230 | And we have even written our first syntactically correct Python sentence. 231 | Our sentence starts with the reserved word print followed 232 | by a string of text of our choosing enclosed in single quotes. 
233 | 234 | Conversing with Python 235 | 236 | Now that we have a word and a simple sentence that we know in Python, 237 | we need to know how to start a conversation with Python to test 238 | our new language skills. 239 | 240 | Before you can converse with Python, you must first install the Python 241 | 2435 software on your computer and learn how to start Python on your 242 | computer. That is too much detail for this chapter so I suggest 243 | that you consult www.py4e.com where I have detailed 244 | instructions and screencasts of setting up and starting Python 245 | on Macintosh and Windows systems. At some point, you will be in 246 | a terminal or command window and you will type python and 247 | the Python interpreter will start executing in interactive mode 248 | and appear somewhat as follows: 249 | interactive mode 250 | 9658 251 | The >>> prompt is the Python interpreter's way of asking you, What 252 | do you want me to do next? Python is ready to have a conversation with 253 | you. All you have to know is how to speak the Python language. 254 | 255 | 1246 Let's say for example that you did not know even the simplest Python language 81 256 | words or sentences. You might want to use the standard line that astronauts 257 | use 9474 when 7688 they 4987 land on a faraway planet and try to speak with the inhabitants 258 | of the planet: 259 | 260 | This is not going so well. Unless you think of something quickly, 261 | the inhabitants of the planet are likely to stab you with their spears, 262 | put you on a spit, roast you over a fire, and eat you for dinner. 263 | 264 | At this point, you should also realize that while Python 265 | 7286 is amazingly complex and powerful and very picky about 266 | the syntax you use to communicate with it, Python is 267 | not intelligent. You are really just having a conversation 268 | with yourself, but using proper syntax. 
269 | 270 | In a sense, when you use a program written by someone else 271 | the conversation is between you and those other 272 | programmers with Python acting as an intermediary. Python 273 | is a way for the creators of programs to express how the 274 | conversation is supposed to proceed. And 275 | in just a few more chapters, you will be one of those 276 | programmers using Python to talk to the users of your program. 277 | 278 | Before we leave our first conversation with the Python 279 | interpreter, you should probably know the proper way 280 | to say good-bye when interacting with the inhabitants 281 | of Planet Python: 282 | 283 | You will notice that the error is different for the first two 284 | incorrect attempts. The second error is different because 285 | if is a reserved word and Python saw the reserved word 286 | and thought we were trying to say something but got the syntax 287 | of the sentence wrong. 288 | 9667 289 | Terminology: interpreter and compiler 290 | 749 291 | Python is a high-level language intended to be relatively 292 | straightforward for humans to read and write and for computers 293 | to read and process. Other high-level languages include Java, C++, 294 | PHP, Ruby, Basic, Perl, JavaScript, and many more. The actual hardware 295 | inside the Central Processing Unit (CPU) does not understand any 296 | of these high-level languages. 297 | 5978 298 | The CPU understands a language we call machine language. Machine 299 | language is very simple and frankly very tiresome to write because it 300 | is represented all in zeros and ones. 301 | 302 | Machine language seems quite simple on the surface, given that there 303 | are only zeros and ones, but its syntax is even more complex 304 | and 8985 far 3142 more 1330 intricate than Python. So very few programmers ever write 305 | machine language. 
Instead we build various translators to allow 306 | programmers to write in high-level languages like Python or JavaScript 307 | 6699 and these translators convert the programs to machine language for actual 308 | execution by the CPU. 309 | 310 | Since machine language is tied to the computer hardware, machine language 311 | is not portable across different types of hardware. Programs written in 312 | high-level languages can be moved between different computers by using a 313 | different interpreter on the new machine or recompiling the code to create 314 | 7192 a machine language version of the program for the new machine. 1097 315 | 5657 316 | These programming language translators fall into two general categories: 317 | (one) interpreters and (two) compilers. 318 | 319 | An interpreter reads the source code of the program as written by the 320 | programmer, parses the source code, and interprets the instructions on the fly. 321 | Python is an interpreter and when we are running Python interactively, 322 | we can type a line of Python (a sentence) and Python processes it immediately 323 | and is ready for us to type another line of Python. 324 | 325 | Some of the lines of Python tell Python that you want it to remember some 326 | value for later. We need to pick a name for that value to be remembered and 327 | we can use that symbolic name to retrieve the value later. We use the 328 | term variable to refer to the labels we use to refer to this stored data. 329 | 330 | In this example, we ask Python to remember the value six and use the label x 331 | so we can retrieve the value later. We verify that Python has actually remembered 332 | the value using x and multiply 333 | it by seven and put the newly computed value in y. Then we ask Python to print out 334 | the value currently in y. 
335 | 336 | Even though we are typing these commands into Python one line at a time, Python 337 | is treating them as an ordered sequence of statements with later statements able 338 | to retrieve data created in earlier statements. We are writing our first 339 | simple paragraph with four sentences in a logical and meaningful order. 340 | 341 | It is the nature of an interpreter to be able to have an interactive conversation 342 | as shown above. A compiler needs to be handed the entire program in a file, and then 343 | it runs a process to translate the high-level source code into machine language 344 | and then the compiler puts the resulting machine language into a file for later 345 | execution. 346 | 347 | If you have a Windows system, often these executable machine language programs have a 348 | suffix of .exe or .dll which stand for executable and dynamic link 349 | library respectively. In Linux and Macintosh, there is no suffix that uniquely marks 350 | a file as executable. 351 | 352 | If you were to open an executable file in a text editor, it would look 353 | completely crazy and be unreadable: 354 | 355 | It is not easy to read or write machine language, so it is nice that we have 356 | compilers that allow us to write in high-level 357 | 8728 languages like Python or C. 358 | 359 | Now at this point in our discussion of compilers and interpreters, you should 360 | be wondering a bit about the Python interpreter itself. What language is 361 | it written in? Is it written in a compiled language? When we type 362 | python, what exactly is happening? 363 | 364 | The Python interpreter is written in a high-level language called C. 365 | You can look at the actual source code for the Python interpreter by 366 | going to www.python.org and working your way to their source code. 367 | So Python is a program itself and it is compiled into machine code. 
368 | When you installed Python on your computer (or the vendor installed it), 369 | you 9124 copied 4446 a 4202 machine-code copy of the translated Python program onto your 370 | system. In Windows, the executable machine code for Python itself is likely 371 | in a file. 372 | 373 | That is more than you really need to know to be a Python programmer, but 374 | sometimes it pays to answer those little nagging questions right at 375 | 8716 the beginning. 376 | 377 | 2096 Writing a program 8218 378 | 379 | Typing commands into the Python interpreter is a great way to experiment 380 | with Python's features, but it is not recommended for solving more complex problems. 381 | 382 | When we want to write a program, 383 | we use a text editor to write the Python instructions into a file, 384 | which is called a script. By 385 | convention, Python scripts have names that end with .py. 386 | 387 | script 388 | 389 | To execute the script, you have to tell the Python interpreter 390 | the name of the file. In a Unix or Windows command window, 391 | you would type python hello.py as follows: 392 | 2939 393 | We call the Python interpreter and tell it to read its source code from 394 | the file hello.py instead of prompting us for lines of Python code 395 | interactively. 396 | 397 | You will notice that there was no need to have quit() at the end of 398 | the Python program in the file. When Python is reading your source code 399 | from a file, it knows to stop when it reaches the end of the file. 400 | 401 | What is a program? 402 | 403 | The definition of a program at its most basic is a sequence 404 | of Python statements that have been crafted to do something. 405 | Even our simple hello.py script is a program. It is a one-line 406 | program and is not particularly useful, but in the strictest definition, 407 | it is a Python program. 
408 | 409 | It might be easiest to understand what a program is by thinking about a problem 410 | that a program might be built to solve, and then looking at a program 411 | that would solve that problem. 412 | 413 | Lets say you are doing Social Computing research on Facebook posts and 414 | you are interested in the most frequently used word in a series of posts. 415 | You could print out the stream of Facebook posts and pore over the text 416 | looking for the most common word, but that would take a long time and be very 417 | mistake prone. You would be smart to write a Python program to handle the 418 | task quickly and accurately so you can spend the weekend doing something 419 | fun. 420 | 421 | For example, look at the following text about a clown and a car. Look at the 422 | text and figure out the most common word and how many times it occurs. 423 | 424 | Then imagine that you are doing this task looking at millions of lines of 425 | text. Frankly it would be quicker for you to learn Python and write a 426 | Python program to count the words than it would be to manually 427 | scan the words. 428 | 429 | The even better news is that I already came up with a simple program to 430 | find the most common word in a text file. I wrote it, 431 | tested it, and now I am giving it to you to use so you can save some time. 432 | 433 | You don't even need to know Python to use this program. You will need to get through 434 | Chapter ten of this book to fully understand the awesome Python techniques that were 435 | used to make the program. You are the end user, you simply use the program and marvel 436 | at its cleverness and how it saved you so much manual effort. 437 | You simply type the code 438 | into a file called words.py and run it or you download the source 439 | code from http://www.py4e.com/code3/ and run it. 
440 | 441 | This is a good example of how Python and the Python language are acting as an intermediary 442 | between you (the end user) and me (the programmer). Python is a way for us to exchange useful 443 | instruction sequences (i.e., programs) in a common language that can be used by anyone who 444 | installs Python on their computer. So neither of us are talking to Python, 445 | instead we are communicating with each other through Python. 446 | 447 | The building blocks of programs 448 | 449 | In the next few chapters, we will learn more about the vocabulary, sentence structure, 450 | paragraph structure, and story structure of Python. We will learn about the powerful 451 | capabilities of Python and how to compose those capabilities together to create useful 452 | programs. 453 | 454 | There are some low-level conceptual patterns that we use to construct programs. These 455 | constructs are not just for Python programs, they are part of every programming language 456 | from machine language up to the high-level languages. 457 | 458 | description 459 | 460 | Get data from the outside world. This might be 461 | reading data from a file, or even some kind of sensor like 462 | a microphone or GPS. In our initial programs, our input will come from the user 463 | typing data on the keyboard. 464 | 465 | Display the results of the program on a screen 466 | or store them in a file or perhaps write them to a device like a 467 | speaker to play music or speak text. 468 | 469 | Perform statements one after 470 | another in the order they are encountered in the script. 471 | 472 | Check for certain conditions and 473 | then execute or skip a sequence of statements. 474 | 475 | Perform some set of statements 476 | repeatedly, usually with 477 | some variation. 478 | 479 | Write a set of instructions once and give them a name 480 | and then reuse those instructions as needed throughout your program. 
481 | 482 | description 483 | 484 | It sounds almost too simple to be true, and of course it is never 485 | so simple. It is like saying that walking is simply 486 | putting one foot in front of the other. The art 487 | of writing a program is composing and weaving these 488 | basic elements together many times over to produce something 489 | that is useful to its users. 490 | 491 | The word counting program above directly uses all of 492 | these patterns except for one. 493 | 494 | What could possibly go wrong? 495 | 496 | As we saw in our earliest conversations with Python, we must 497 | communicate very precisely when we write Python code. The smallest 498 | deviation or mistake will cause Python to give up looking at your 499 | program. 500 | 501 | Beginning programmers often take the fact that Python leaves no 502 | room for errors as evidence that Python is mean, hateful, and cruel. 503 | While Python seems to like everyone else, Python knows them 504 | personally and holds a grudge against them. Because of this grudge, 505 | Python takes our perfectly written programs and rejects them as 506 | unfit just to torment us. 507 | 508 | There is little to be gained by arguing with Python. It is just a tool. 509 | It has no emotions and it is happy and ready to serve you whenever you 510 | need it. Its error messages sound harsh, but they are just Python's 511 | call for help. It has looked at what you typed, and it simply cannot 512 | understand what you have entered. 513 | 514 | Python is much more like a dog, loving you unconditionally, having a few 515 | key words that it understands, looking you with a sweet look on its 516 | face (>>>), and waiting for you to say something it understands. 517 | When Python says SyntaxError: invalid syntax, it is simply wagging 518 | its tail and saying, You seemed to say something but I just don't 519 | understand what you meant, but please keep talking to me (>>>). 
520 | 521 | As your programs become increasingly sophisticated, you will encounter three 522 | general types of errors: 523 | 524 | description 525 | 526 | These are the first errors you will make and the easiest 527 | to fix. A syntax error means that you have violated the grammar rules of Python. 528 | Python does its best to point right at the line and character where 529 | it noticed it was confused. The only tricky bit of syntax errors is that sometimes 530 | the mistake that needs fixing is actually earlier in the program than where Python 531 | noticed it was confused. So the line and character that Python indicates in 532 | a syntax error may just be a starting point for your investigation. 533 | 534 | A logic error is when your program has good syntax but there is a mistake 535 | in the order of the statements or perhaps a mistake in how the statements relate to one another. 536 | A good example of a logic error might be, take a drink from your water bottle, put it 537 | in your backpack, walk to the library, and then put the top back on the bottle. 538 | 539 | A semantic error is when your description of the steps to take 540 | is syntactically perfect and in the right order, but there is simply a mistake in 541 | the program. The program is perfectly correct but it does not do what 542 | you intended for it to do. A simple example would 543 | be if you were giving a person directions to a restaurant and said, ...when you reach 544 | the intersection with the gas station, turn left and go one mile and the restaurant 545 | is a red building on your left. Your friend is very late and calls you to tell you that 546 | they are on a farm and walking around behind a barn, with no sign of a restaurant. 547 | Then you say did you turn left or right at the gas station? and 548 | they say, I followed your directions perfectly, I have 549 | them written down, it says turn left and go one mile at the gas station. 
Then you say, 550 | I am very sorry, because while my instructions were syntactically correct, they 551 | sadly contained a small but undetected semantic error.. 552 | 553 | description 554 | 555 | Again in all three types of errors, Python is merely trying its hardest to 556 | do exactly what you have asked. 557 | 558 | The learning journey 559 | 560 | As you progress through the rest of the book, don't be afraid if the concepts 561 | don't seem to fit together well the first time. When you were learning to speak, 562 | it was not a problem for your first few years that you just made cute gurgling noises. 563 | And it was OK if it took six months for you to move from simple vocabulary to 564 | simple sentences and took five or six more years to move from sentences to paragraphs, and a 565 | few more years to be able to write an interesting complete short story on your own. 566 | 567 | We want you to learn Python much more rapidly, so we teach it all at the same time 568 | over the next few chapters. 569 | But it is like learning a new language that takes time to absorb and understand 570 | before it feels natural. 571 | That leads to some confusion as we visit and revisit 572 | topics to try to get you to see the big picture while we are defining the tiny 573 | fragments that make up that big picture. While the book is written linearly, and 574 | if you are taking a course it will progress in a linear fashion, don't hesitate 575 | to be very nonlinear in how you approach the material. Look forwards and backwards 576 | and read with a light touch. By skimming more advanced material without 577 | fully understanding the details, you can get a better understanding of the why? 578 | of programming. By reviewing previous material and even redoing earlier 579 | exercises, you will realize that you actually learned a lot of material even 580 | if the material you are currently staring at seems a bit impenetrable. 
581 | 582 | Usually when you are learning your first programming language, there are a few 583 | wonderful Ah Hah! moments where you can look up from pounding away at some rock 584 | with a hammer and chisel and step away and see that you are indeed building 585 | a beautiful sculpture. 586 | 587 | If something seems particularly hard, there is usually no value in staying up all 588 | night and staring at it. Take a break, take a nap, have a snack, explain what you 589 | are having a problem with to someone (or perhaps your dog), and then come back to it with 590 | fresh eyes. I assure you that once you learn the programming concepts in the book 591 | you will look back and see that it was all really easy and elegant and it simply 592 | took you a bit of time to absorb it. 593 | 42 594 | The end 595 | -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex12/following_links.py: -------------------------------------------------------------------------------- 1 | # This program follows links at a given position for a given number of times and lists the resulting chain 2 | 3 | # Importing 4 | from urllib.request import urlopen 5 | from bs4 import BeautifulSoup 6 | import ssl 7 | 8 | # Initialising the count and total 9 | name = list() 10 | 11 | # Ignoring SSL certificate errors 12 | ctx = ssl.create_default_context() 13 | ctx.check_hostname = False 14 | ctx.verify_mode = ssl.CERT_NONE 15 | 16 | # Asking user to input parameters 17 | url = input('Enter URL: ') 18 | pos_str = input('Scanning for a tag that is at the following position relative to the first name in the list: ') 19 | rep_str = input('Repeating the process to follow the link for the following number of times: ') 20 | 21 | # Converting the inputs 22 | position = int(pos_str) 23 | repeat = int(rep_str) 24 | 25 | # Looping through the layers of webpages 4 times 26 | for repeat in range(repeat): 27 | # Reading the whole fine into a single 
long string 28 | html = urlopen(url, context=ctx).read() 29 | 30 | # Creating an organised string (soup) with BeautifulSoup 31 | soup = BeautifulSoup(html, 'html.parser') 32 | 33 | # Retrieving all of the 'a' tags 34 | tags = soup('a') 35 | 36 | # Adding the name of the person at the given position to the list 37 | name.append(tags[position-1].contents[0]) 38 | 39 | # Updating the URL for the next loop 40 | url = tags[position-1].get('href', None) 41 | 42 | # Printing the list with the names 43 | print(name) -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex12/scraping.py: -------------------------------------------------------------------------------- 1 | # This program scrapes a website for numbers and returns their count and sum. 2 | 3 | # Importing 4 | from urllib.request import urlopen 5 | from bs4 import BeautifulSoup 6 | import ssl 7 | 8 | # Initialising the count and total 9 | count = 0 10 | total = 0 11 | 12 | # Ignoring SSL certificate errors 13 | ctx = ssl.create_default_context() 14 | ctx.check_hostname = False 15 | ctx.verify_mode = ssl.CERT_NONE 16 | 17 | # Asking user to input the URL 18 | url = input('Enter URL: ') 19 | html = urlopen(url, context=ctx).read() 20 | 21 | # Creating an organised string (soup) with BeautifulSoup 22 | soup = BeautifulSoup(html, "html.parser") 23 | 24 | # Retrieve all of the span tags 25 | tags = soup('span') 26 | for tag in tags: 27 | # Trying to convert the tag's content into integer 28 | try: 29 | total = total + int(tag.contents[0]) 30 | count = count + 1 31 | except: 32 | continue 33 | 34 | # Printing the results 35 | print('Count',count) 36 | print('Sum',total) -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex12/socket1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | mysock = 
socket.socket(socket.AF_INET, socket.SOCK_STREAM) 4 | mysock.connect(('data.pr4e.org', 80)) 5 | cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode() 6 | mysock.send(cmd) 7 | 8 | while True: 9 | data = mysock.recv(512) 10 | if (len(data) < 1): 11 | break 12 | print(data.decode()) 13 | mysock.close() 14 | -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex13/extracting_data_from_JSON.py: -------------------------------------------------------------------------------- 1 | # This programme reads an online JSON file from which it extracts the values of the "count" items and sums those values 2 | 3 | # Importing libraries 4 | import urllib.request, urllib.parse, urllib.error 5 | import json 6 | 7 | # Initialising 8 | total_count = 0 9 | 10 | # Asking user to input the source URL of the JSON data file 11 | url = input('Enter URL: ') 12 | print('Retrieving',url) 13 | uhandle = urllib.request.urlopen(url) 14 | data = uhandle.read() 15 | 16 | # Transforming the text of the JSON file into a tree 17 | tree = json.loads(data) 18 | 19 | # Looping through all the comments under the "comments" item 20 | for comment in tree['comments']: 21 | # Converting the text of the "count" items into an integer and summing 22 | try: 23 | total_count = total_count + int(comment['count']) 24 | except: 25 | continue 26 | 27 | # Printing the result 28 | print('Retrieved',len(data),'characters') 29 | print('Count:',len(tree['comments'])) 30 | print('Sum:',total_count) -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex13/extracting_data_from_xml.py: -------------------------------------------------------------------------------- 1 | # This programme reads an online XML file from which it extracts the values of the tags and sums those values 2 | 3 | # Importing libraries 4 | import urllib.request, urllib.parse, urllib.error 5 | 
import xml.etree.ElementTree as ET 6 | 7 | # Initialising 8 | total_count = 0 9 | 10 | # Asking user to input the source URL of the XML data file 11 | url = input('Enter URL: ') 12 | uhandle = urllib.request.urlopen(url) 13 | data = uhandle.read() 14 | 15 | # Transforming the text of the XML file to a tree 16 | tree = ET.fromstring(data) 17 | 18 | # Finding all the tags and putting them into a list 19 | counts_str = tree.findall('.//count') 20 | 21 | # Looping through all the nodes 22 | for count_str in counts_str: 23 | # Converting the text of the nodes into an integer and summing 24 | try: 25 | total_count = total_count + int(count_str.text) 26 | except: 27 | continue 28 | 29 | # Printing the result 30 | print('Receiving',len(data),'characters') 31 | print('Count:',len(counts_str)) 32 | print('Sum:',total_count) -------------------------------------------------------------------------------- /Course 3 - Using Python to access web data/ex13/using_GeoJSON_API.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # This programme reads an online JSON file from which it extracts the first 3 | # "place_id", which uniquely identifies a place on Google Maps 4 | #=============================================================================== 5 | 6 | # Importing libraries 7 | import urllib.request, urllib.parse, urllib.error 8 | import json 9 | 10 | # Initialising 11 | service_url = 'http://py4e-data.dr-chuck.net/geojson?' 
12 | 13 | # Asking user to input the source URL of the JSON data file 14 | loc = input('Enter location: ') 15 | 16 | # Concatinating the URL for the request 17 | url = service_url + urllib.parse.urlencode({'address' : loc}) 18 | print('Retrieving', loc, 'here:', url) 19 | 20 | # Opening connection to the JSON data file 21 | uhandle = urllib.request.urlopen(url) 22 | data = uhandle.read().decode() 23 | print('Retrieved', len(data), 'characters') 24 | 25 | # Transforming the text of the JSON file into a tree 26 | try: 27 | tree = json.loads(data) 28 | except: 29 | tree = None 30 | 31 | # Finding the first "place_id" in the JSON data file 32 | place_id = tree["results"][0]["place_id"] 33 | 34 | # Printing the result 35 | print('Place ID:', place_id) -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/emaildb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex15/emaildb.sqlite -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/ex15 - hexa.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex15/ex15 - hexa.db -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/ex15_counting_e-mail_in_a_database.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # This programme reads an offline txt file and counts the number of email 3 | # messages per organization (i.e. 
domain name of the email address) using 4 | # a database to maintain the counts. 5 | #=============================================================================== 6 | 7 | # Importing libraries 8 | import sqlite3 9 | 10 | #--- Creating and connecting to database 11 | conn = sqlite3.connect('emaildb.sqlite') 12 | cur = conn.cursor() 13 | 14 | #--- Initialising database 15 | cur.execute('DROP TABLE IF EXISTS Counts') 16 | cur.execute(''' 17 | CREATE TABLE Counts (org TEXT, count INTEGER)''') 18 | 19 | #--- Asking user to input the data file name and provide 'mbox.txt' as a fallback 20 | fname = input('Enter file name: ') 21 | if (len(fname) < 1): fname = 'mbox.txt' 22 | 23 | #--- Opening and reading the file 24 | fh = open(fname) 25 | for line in fh: 26 | #--- Skipping the irrelevant lines 27 | if not line.startswith('From: '): continue 28 | #--- Splitting the lines, taking the split with the e-mail, then splitting the e-mail to get the domain 29 | email = line.split()[1] 30 | domain = email.split('@')[1] 31 | #--- Getting the current count value from the database 32 | cur.execute('SELECT count FROM Counts WHERE org = ? ', (domain,)) 33 | row = cur.fetchone() 34 | #--- If the e-mail is not yet in the database, then add it... 35 | if row is None: 36 | cur.execute('''INSERT INTO Counts (org, count) 37 | VALUES (?, 1)''', (domain,)) 38 | #--- ... 
else update the count value in the database 39 | else: 40 | cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?', 41 | (domain,)) 42 | #--- Commit changes to the database 43 | conn.commit() 44 | 45 | #--- Querying the results 46 | sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10' 47 | for row in cur.execute(sqlstr): 48 | print(str(row[0]), row[1]) 49 | 50 | #--- Closing connection with the database 51 | cur.close() 52 | -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/ex15_multi-table_database_tracks.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # This application will read an iTunes export file in XML and produce a properly 3 | # normalized database. 4 | #=============================================================================== 5 | 6 | # Importing libraries 7 | import xml.etree.ElementTree as ET 8 | import sqlite3 9 | 10 | #--- Creating and connecting to database 11 | conn = sqlite3.connect('trackdb.sqlite') 12 | cur = conn.cursor() 13 | 14 | #--- Initialising database 15 | cur.executescript(''' 16 | DROP TABLE IF EXISTS Artist; 17 | DROP TABLE IF EXISTS Genre; 18 | DROP TABLE IF EXISTS Album; 19 | DROP TABLE IF EXISTS Track; 20 | ''') 21 | cur.executescript(''' 22 | CREATE TABLE Artist ( 23 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 24 | name TEXT UNIQUE 25 | ); 26 | 27 | CREATE TABLE Genre ( 28 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 29 | name TEXT UNIQUE 30 | ); 31 | 32 | CREATE TABLE Album ( 33 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 34 | artist_id INTEGER, 35 | title TEXT UNIQUE 36 | ); 37 | 38 | CREATE TABLE Track ( 39 | id INTEGER NOT NULL PRIMARY KEY 40 | AUTOINCREMENT UNIQUE, 41 | title TEXT UNIQUE, 42 | album_id INTEGER, 43 | genre_id INTEGER, 44 | len INTEGER, rating 
INTEGER, count INTEGER 45 | ); 46 | ''') 47 | 48 | #--- Asking user to input the data file name and provide 'Library.xml' as a fallback 49 | fname = input('Enter file name: ') 50 | if ( len(fname) < 1 ) : fname = 'Library.xml' 51 | 52 | #--- The structure of the of the interesting objects in the XML 53 | # Track ID369 54 | # NameAnother One Bites The Dust 55 | # ArtistQueen 56 | 57 | #--- Defining a function to lookup the value of a 'key' tag 58 | def lookup(d, key): 59 | found = False 60 | for child in d: 61 | if found : return child.text 62 | if child.tag == 'key' and child.text == key : 63 | found = True 64 | return None 65 | 66 | #--- Opening and parsing the file 67 | stuff = ET.parse(fname) 68 | #--- Selecting the 'dict' tags in the third depth and counting them 69 | all = stuff.findall('dict/dict/dict') 70 | print('Dict count:', len(all)) 71 | #--- Looping through all result elements 'dict' 72 | for entry in all: 73 | #--- Trying to look up the different data fields. If not found, moving on to the next element 74 | if ( lookup(entry, 'Track ID') is None ) : continue 75 | 76 | name = lookup(entry, 'Name') 77 | artist = lookup(entry, 'Artist') 78 | album = lookup(entry, 'Album') 79 | genre = lookup(entry, 'Genre') 80 | count = lookup(entry, 'Play Count') 81 | rating = lookup(entry, 'Rating') 82 | length = lookup(entry, 'Total Time') 83 | 84 | #--- If a data field is not found, then move on to the next element 85 | if name is None or artist is None or album is None or genre is None: 86 | continue 87 | 88 | #--- Printing the data field of the search result element for the user 89 | print(name, artist, album, genre, count, rating, length) 90 | 91 | #--- Updating the relevant tables with the data field of the search result element 92 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 93 | VALUES ( ? )''', ( artist, ) ) 94 | cur.execute('SELECT id FROM Artist WHERE name = ? 
#===============================================================================
# This application will read roster data in JSON format, parse the file, and
# then produce an SQLite database that contains a User, Course, and Member
# table and populate the tables from the data file.
#===============================================================================

# Importing libraries
import json
import sqlite3

#--- Creating and connecting to database
conn = sqlite3.connect('rosterdb.sqlite')
cur = conn.cursor()

#--- Initialising database: drop any stale tables, then recreate the schema
cur.executescript('''
DROP TABLE IF EXISTS User;
DROP TABLE IF EXISTS Member;
DROP TABLE IF EXISTS Course;

CREATE TABLE User (
    id     INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    name   TEXT UNIQUE
);

CREATE TABLE Course (
    id     INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    title  TEXT UNIQUE
);

CREATE TABLE Member (
    user_id     INTEGER,
    course_id   INTEGER,
    role        INTEGER,
    PRIMARY KEY (user_id, course_id)
)
''')

#--- Asking user to input the JSON data file name and provide 'roster_data.json'
#--- as a fallback (the original comment named a different file than the code used)
fname = input('Enter file name: ')
if len(fname) < 1:
    fname = 'roster_data.json'

#--- The structure of the JSON object is the following:
# [
#   [ "Charley", "si110", 1 ],
#   [ "Mea", "si110", 0 ],

#--- Opening and parsing the file into a JSON object.
# 'with' guarantees the file handle is closed (the original leaked it),
# and json.load reads straight from the handle.
with open(fname) as fh:
    json_data = json.load(fh)

#--- Looping through the JSON object
for entry in json_data:

    #--- Each entry is a [name, course title, role] triple
    name, title, role = entry[0], entry[1], entry[2]

    #--- Printing the record so the user can follow progress
    print((name, title, role))

    #--- INSERT OR IGNORE keeps names/titles unique; the SELECT then
    #--- fetches the (new or pre-existing) row id for the join table.
    cur.execute('''INSERT OR IGNORE INTO User (name)
        VALUES ( ? )''', ( name, ) )
    cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
    user_id = cur.fetchone()[0]

    cur.execute('''INSERT OR IGNORE INTO Course (title)
        VALUES ( ? )''', ( title, ) )
    cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
    course_id = cur.fetchone()[0]

    #--- INSERT OR REPLACE keeps exactly one Member row per (user, course) pair
    cur.execute('''INSERT OR REPLACE INTO Member
        (user_id, course_id, role) VALUES ( ?, ?, ? )''',
        ( user_id, course_id, role ) )

#--- Committing the changes once, after all rows are inserted
conn.commit()
129 | "si110", 130 | 0 131 | ], 132 | [ 133 | "Maaz", 134 | "si110", 135 | 0 136 | ], 137 | [ 138 | "Marina", 139 | "si110", 140 | 0 141 | ], 142 | [ 143 | "Rhys", 144 | "si110", 145 | 0 146 | ], 147 | [ 148 | "Oriana", 149 | "si110", 150 | 0 151 | ], 152 | [ 153 | "Evelyn", 154 | "si110", 155 | 0 156 | ], 157 | [ 158 | "Tigan", 159 | "si110", 160 | 0 161 | ], 162 | [ 163 | "Reuben", 164 | "si110", 165 | 0 166 | ], 167 | [ 168 | "Aadit", 169 | "si110", 170 | 0 171 | ], 172 | [ 173 | "Arooba", 174 | "si110", 175 | 0 176 | ], 177 | [ 178 | "Neave", 179 | "si110", 180 | 0 181 | ], 182 | [ 183 | "Kaira", 184 | "si110", 185 | 0 186 | ], 187 | [ 188 | "Igor", 189 | "si110", 190 | 0 191 | ], 192 | [ 193 | "Vuyolwethu", 194 | "si110", 195 | 0 196 | ], 197 | [ 198 | "Garoa", 199 | "si110", 200 | 0 201 | ], 202 | [ 203 | "Obieluem", 204 | "si110", 205 | 0 206 | ], 207 | [ 208 | "Alecia", 209 | "si110", 210 | 0 211 | ], 212 | [ 213 | "Pablo", 214 | "si110", 215 | 0 216 | ], 217 | [ 218 | "Lara", 219 | "si110", 220 | 0 221 | ], 222 | [ 223 | "Roxanna", 224 | "si106", 225 | 1 226 | ], 227 | [ 228 | "Iman", 229 | "si106", 230 | 0 231 | ], 232 | [ 233 | "Edie", 234 | "si106", 235 | 0 236 | ], 237 | [ 238 | "Cambell", 239 | "si106", 240 | 0 241 | ], 242 | [ 243 | "Kiarash", 244 | "si106", 245 | 0 246 | ], 247 | [ 248 | "Stuart", 249 | "si106", 250 | 0 251 | ], 252 | [ 253 | "Naideen", 254 | "si106", 255 | 0 256 | ], 257 | [ 258 | "Ammara", 259 | "si106", 260 | 0 261 | ], 262 | [ 263 | "Thara", 264 | "si106", 265 | 0 266 | ], 267 | [ 268 | "Ishbel", 269 | "si106", 270 | 0 271 | ], 272 | [ 273 | "Heyden", 274 | "si106", 275 | 0 276 | ], 277 | [ 278 | "Aaryn", 279 | "si106", 280 | 0 281 | ], 282 | [ 283 | "Abdul", 284 | "si106", 285 | 0 286 | ], 287 | [ 288 | "Josephina", 289 | "si106", 290 | 0 291 | ], 292 | [ 293 | "Chen", 294 | "si106", 295 | 0 296 | ], 297 | [ 298 | "Zayne", 299 | "si106", 300 | 0 301 | ], 302 | [ 303 | "Todd", 304 | "si106", 305 | 0 306 | ], 307 | [ 308 | 
"Miyha", 309 | "si106", 310 | 0 311 | ], 312 | [ 313 | "Murry", 314 | "si106", 315 | 0 316 | ], 317 | [ 318 | "Kacy", 319 | "si106", 320 | 0 321 | ], 322 | [ 323 | "Harman", 324 | "si106", 325 | 0 326 | ], 327 | [ 328 | "Tyllor", 329 | "si106", 330 | 0 331 | ], 332 | [ 333 | "Jonah", 334 | "si106", 335 | 0 336 | ], 337 | [ 338 | "Rayna", 339 | "si206", 340 | 1 341 | ], 342 | [ 343 | "Rylie", 344 | "si206", 345 | 0 346 | ], 347 | [ 348 | "Maeya", 349 | "si206", 350 | 0 351 | ], 352 | [ 353 | "Elleanne", 354 | "si206", 355 | 0 356 | ], 357 | [ 358 | "Ryleigh", 359 | "si206", 360 | 0 361 | ], 362 | [ 363 | "Sophi", 364 | "si206", 365 | 0 366 | ], 367 | [ 368 | "Easton", 369 | "si206", 370 | 0 371 | ], 372 | [ 373 | "Bobbie", 374 | "si206", 375 | 0 376 | ], 377 | [ 378 | "Caley", 379 | "si206", 380 | 0 381 | ], 382 | [ 383 | "Meabh", 384 | "si206", 385 | 0 386 | ], 387 | [ 388 | "Kenneth", 389 | "si206", 390 | 0 391 | ], 392 | [ 393 | "Heather", 394 | "si206", 395 | 0 396 | ], 397 | [ 398 | "Aaima", 399 | "si206", 400 | 0 401 | ], 402 | [ 403 | "Gigha", 404 | "si206", 405 | 0 406 | ], 407 | [ 408 | "Moayd", 409 | "si206", 410 | 0 411 | ], 412 | [ 413 | "Kaydie", 414 | "si206", 415 | 0 416 | ], 417 | [ 418 | "Zayn", 419 | "si206", 420 | 0 421 | ], 422 | [ 423 | "Kaytie", 424 | "si206", 425 | 0 426 | ], 427 | [ 428 | "Malisa", 429 | "si206", 430 | 0 431 | ], 432 | [ 433 | "Ceol", 434 | "si206", 435 | 0 436 | ], 437 | [ 438 | "Kaeden", 439 | "si206", 440 | 0 441 | ], 442 | [ 443 | "Meah", 444 | "si206", 445 | 0 446 | ], 447 | [ 448 | "Scout", 449 | "si206", 450 | 0 451 | ], 452 | [ 453 | "Lukmaan", 454 | "si206", 455 | 0 456 | ], 457 | [ 458 | "Enoghado", 459 | "si206", 460 | 0 461 | ], 462 | [ 463 | "Elyse", 464 | "si206", 465 | 0 466 | ], 467 | [ 468 | "Ellisha", 469 | "si206", 470 | 0 471 | ], 472 | [ 473 | "Mahek", 474 | "si206", 475 | 0 476 | ], 477 | [ 478 | "Shazil", 479 | "si206", 480 | 0 481 | ], 482 | [ 483 | "Xavier", 484 | "si206", 485 | 0 486 | ], 487 | [ 488 
| "Elodie", 489 | "si206", 490 | 0 491 | ], 492 | [ 493 | "Shayaan", 494 | "si206", 495 | 0 496 | ], 497 | [ 498 | "Saul", 499 | "si206", 500 | 0 501 | ], 502 | [ 503 | "Ishwari", 504 | "si206", 505 | 0 506 | ], 507 | [ 508 | "Alessandra", 509 | "si206", 510 | 0 511 | ], 512 | [ 513 | "Fraser", 514 | "si206", 515 | 0 516 | ], 517 | [ 518 | "Estelle", 519 | "si206", 520 | 0 521 | ], 522 | [ 523 | "Braeden", 524 | "si206", 525 | 0 526 | ], 527 | [ 528 | "Daylen", 529 | "si206", 530 | 0 531 | ], 532 | [ 533 | "Conlyn", 534 | "si206", 535 | 0 536 | ], 537 | [ 538 | "Abdihakim", 539 | "si206", 540 | 0 541 | ], 542 | [ 543 | "Kaleb", 544 | "si206", 545 | 0 546 | ], 547 | [ 548 | "Karol", 549 | "si206", 550 | 0 551 | ], 552 | [ 553 | "Lilyana", 554 | "si206", 555 | 0 556 | ], 557 | [ 558 | "Jesuseun", 559 | "si206", 560 | 0 561 | ], 562 | [ 563 | "Talorcan", 564 | "si206", 565 | 0 566 | ], 567 | [ 568 | "Windsor", 569 | "si206", 570 | 0 571 | ], 572 | [ 573 | "Airen", 574 | "si206", 575 | 0 576 | ], 577 | [ 578 | "Rayan", 579 | "si206", 580 | 0 581 | ], 582 | [ 583 | "Blair", 584 | "si301", 585 | 1 586 | ], 587 | [ 588 | "Daksh", 589 | "si301", 590 | 0 591 | ], 592 | [ 593 | "Dhani", 594 | "si301", 595 | 0 596 | ], 597 | [ 598 | "Conlly", 599 | "si301", 600 | 0 601 | ], 602 | [ 603 | "Radmiras", 604 | "si301", 605 | 0 606 | ], 607 | [ 608 | "Nicki", 609 | "si301", 610 | 0 611 | ], 612 | [ 613 | "Likitta", 614 | "si301", 615 | 0 616 | ], 617 | [ 618 | "Shwetika", 619 | "si301", 620 | 0 621 | ], 622 | [ 623 | "Kaycie", 624 | "si301", 625 | 0 626 | ], 627 | [ 628 | "Leiten", 629 | "si301", 630 | 0 631 | ], 632 | [ 633 | "Madisen", 634 | "si301", 635 | 0 636 | ], 637 | [ 638 | "Nelly", 639 | "si301", 640 | 0 641 | ], 642 | [ 643 | "Clark", 644 | "si301", 645 | 0 646 | ], 647 | [ 648 | "Guy", 649 | "si301", 650 | 0 651 | ], 652 | [ 653 | "Teagan", 654 | "si301", 655 | 0 656 | ], 657 | [ 658 | "Alba", 659 | "si301", 660 | 0 661 | ], 662 | [ 663 | "Ty", 664 | "si301", 665 | 0 
666 | ], 667 | [ 668 | "Carrie", 669 | "si301", 670 | 0 671 | ], 672 | [ 673 | "Husnain", 674 | "si301", 675 | 0 676 | ], 677 | [ 678 | "Regan", 679 | "si301", 680 | 0 681 | ], 682 | [ 683 | "Keryn", 684 | "si301", 685 | 0 686 | ], 687 | [ 688 | "Hui", 689 | "si301", 690 | 0 691 | ], 692 | [ 693 | "Celeste", 694 | "si301", 695 | 0 696 | ], 697 | [ 698 | "Eshaal", 699 | "si301", 700 | 0 701 | ], 702 | [ 703 | "Cadie", 704 | "si301", 705 | 0 706 | ], 707 | [ 708 | "Mirren", 709 | "si301", 710 | 0 711 | ], 712 | [ 713 | "Areeb", 714 | "si301", 715 | 0 716 | ], 717 | [ 718 | "Vasyl", 719 | "si301", 720 | 0 721 | ], 722 | [ 723 | "Rachael", 724 | "si301", 725 | 0 726 | ], 727 | [ 728 | "Annalicia", 729 | "si301", 730 | 0 731 | ], 732 | [ 733 | "Mikolaj", 734 | "si301", 735 | 0 736 | ], 737 | [ 738 | "Ruairi", 739 | "si301", 740 | 0 741 | ], 742 | [ 743 | "Zubair", 744 | "si301", 745 | 0 746 | ], 747 | [ 748 | "Clarisse", 749 | "si301", 750 | 0 751 | ], 752 | [ 753 | "Arda", 754 | "si301", 755 | 0 756 | ], 757 | [ 758 | "Alfred", 759 | "si301", 760 | 0 761 | ], 762 | [ 763 | "Anita", 764 | "si301", 765 | 0 766 | ], 767 | [ 768 | "Robby", 769 | "si301", 770 | 0 771 | ], 772 | [ 773 | "Sinali", 774 | "si301", 775 | 0 776 | ], 777 | [ 778 | "Joss", 779 | "si301", 780 | 0 781 | ], 782 | [ 783 | "Milie", 784 | "si310", 785 | 1 786 | ], 787 | [ 788 | "Jacqui", 789 | "si310", 790 | 0 791 | ], 792 | [ 793 | "Fionnah", 794 | "si310", 795 | 0 796 | ], 797 | [ 798 | "Luic", 799 | "si310", 800 | 0 801 | ], 802 | [ 803 | "Krista", 804 | "si310", 805 | 0 806 | ], 807 | [ 808 | "Amie", 809 | "si310", 810 | 0 811 | ], 812 | [ 813 | "Edith", 814 | "si310", 815 | 0 816 | ], 817 | [ 818 | "Evey", 819 | "si310", 820 | 0 821 | ], 822 | [ 823 | "Carmen", 824 | "si310", 825 | 0 826 | ], 827 | [ 828 | "Marla", 829 | "si310", 830 | 0 831 | ], 832 | [ 833 | "Avani", 834 | "si310", 835 | 0 836 | ], 837 | [ 838 | "Teagan", 839 | "si310", 840 | 0 841 | ], 842 | [ 843 | "Forbes", 844 | "si310", 845 | 
0 846 | ], 847 | [ 848 | "Shayna", 849 | "si310", 850 | 0 851 | ], 852 | [ 853 | "Oliwia", 854 | "si310", 855 | 0 856 | ], 857 | [ 858 | "Zita", 859 | "si310", 860 | 0 861 | ], 862 | [ 863 | "Maison", 864 | "si310", 865 | 0 866 | ], 867 | [ 868 | "Jarred", 869 | "si310", 870 | 0 871 | ], 872 | [ 873 | "Kean", 874 | "si310", 875 | 0 876 | ], 877 | [ 878 | "Glydel", 879 | "si310", 880 | 0 881 | ], 882 | [ 883 | "Harish", 884 | "si310", 885 | 0 886 | ], 887 | [ 888 | "Rahman", 889 | "si310", 890 | 0 891 | ], 892 | [ 893 | "Christian", 894 | "si310", 895 | 0 896 | ], 897 | [ 898 | "Aamna", 899 | "si310", 900 | 0 901 | ], 902 | [ 903 | "Melania", 904 | "si310", 905 | 0 906 | ], 907 | [ 908 | "Reigan", 909 | "si310", 910 | 0 911 | ], 912 | [ 913 | "Andrew", 914 | "si310", 915 | 0 916 | ], 917 | [ 918 | "Zachery", 919 | "si310", 920 | 0 921 | ], 922 | [ 923 | "Aurlah", 924 | "si310", 925 | 0 926 | ], 927 | [ 928 | "Laison", 929 | "si334", 930 | 1 931 | ], 932 | [ 933 | "Iagan", 934 | "si334", 935 | 0 936 | ], 937 | [ 938 | "Jeffrey", 939 | "si334", 940 | 0 941 | ], 942 | [ 943 | "Alvern", 944 | "si334", 945 | 0 946 | ], 947 | [ 948 | "Zachary", 949 | "si334", 950 | 0 951 | ], 952 | [ 953 | "Presley", 954 | "si334", 955 | 0 956 | ], 957 | [ 958 | "Naideen", 959 | "si334", 960 | 0 961 | ], 962 | [ 963 | "Matthias", 964 | "si334", 965 | 0 966 | ], 967 | [ 968 | "Chala", 969 | "si334", 970 | 0 971 | ], 972 | [ 973 | "Uzma", 974 | "si334", 975 | 0 976 | ], 977 | [ 978 | "Kevin", 979 | "si334", 980 | 0 981 | ], 982 | [ 983 | "Xavier", 984 | "si334", 985 | 0 986 | ], 987 | [ 988 | "Orrin", 989 | "si334", 990 | 0 991 | ], 992 | [ 993 | "Zaynab", 994 | "si334", 995 | 0 996 | ], 997 | [ 998 | "Kye", 999 | "si334", 1000 | 0 1001 | ], 1002 | [ 1003 | "Kia", 1004 | "si334", 1005 | 0 1006 | ], 1007 | [ 1008 | "Ebony", 1009 | "si334", 1010 | 0 1011 | ], 1012 | [ 1013 | "Morwena", 1014 | "si334", 1015 | 0 1016 | ], 1017 | [ 1018 | "Lyndsay", 1019 | "si334", 1020 | 0 1021 | ], 1022 | [ 
1023 | "Jagat", 1024 | "si334", 1025 | 0 1026 | ], 1027 | [ 1028 | "Kirsty", 1029 | "si334", 1030 | 0 1031 | ], 1032 | [ 1033 | "Regan", 1034 | "si334", 1035 | 0 1036 | ], 1037 | [ 1038 | "Clove", 1039 | "si334", 1040 | 0 1041 | ], 1042 | [ 1043 | "Jude", 1044 | "si334", 1045 | 0 1046 | ], 1047 | [ 1048 | "Cacie", 1049 | "si334", 1050 | 0 1051 | ], 1052 | [ 1053 | "Caolain", 1054 | "si334", 1055 | 0 1056 | ], 1057 | [ 1058 | "Aileigh", 1059 | "si334", 1060 | 0 1061 | ], 1062 | [ 1063 | "Macy", 1064 | "si334", 1065 | 0 1066 | ], 1067 | [ 1068 | "Sol", 1069 | "si334", 1070 | 0 1071 | ], 1072 | [ 1073 | "Aaryn", 1074 | "si334", 1075 | 0 1076 | ], 1077 | [ 1078 | "Oskar", 1079 | "si334", 1080 | 0 1081 | ], 1082 | [ 1083 | "Kiah", 1084 | "si334", 1085 | 0 1086 | ], 1087 | [ 1088 | "Eliza", 1089 | "si334", 1090 | 0 1091 | ], 1092 | [ 1093 | "Bayleigh", 1094 | "si334", 1095 | 0 1096 | ], 1097 | [ 1098 | "Murdina", 1099 | "si334", 1100 | 0 1101 | ], 1102 | [ 1103 | "Sohaa", 1104 | "si363", 1105 | 1 1106 | ], 1107 | [ 1108 | "Cliodhna", 1109 | "si363", 1110 | 0 1111 | ], 1112 | [ 1113 | "Kyla", 1114 | "si363", 1115 | 0 1116 | ], 1117 | [ 1118 | "Emma", 1119 | "si363", 1120 | 0 1121 | ], 1122 | [ 1123 | "Sorley", 1124 | "si363", 1125 | 0 1126 | ], 1127 | [ 1128 | "Frankie", 1129 | "si363", 1130 | 0 1131 | ], 1132 | [ 1133 | "Blaise", 1134 | "si363", 1135 | 0 1136 | ], 1137 | [ 1138 | "Rowyn", 1139 | "si363", 1140 | 0 1141 | ], 1142 | [ 1143 | "Pele", 1144 | "si363", 1145 | 0 1146 | ], 1147 | [ 1148 | "Lindsay", 1149 | "si363", 1150 | 0 1151 | ], 1152 | [ 1153 | "Sonni", 1154 | "si363", 1155 | 0 1156 | ], 1157 | [ 1158 | "Nihal", 1159 | "si363", 1160 | 0 1161 | ], 1162 | [ 1163 | "Elsi", 1164 | "si363", 1165 | 0 1166 | ], 1167 | [ 1168 | "Kruz", 1169 | "si363", 1170 | 0 1171 | ], 1172 | [ 1173 | "Pedram", 1174 | "si363", 1175 | 0 1176 | ], 1177 | [ 1178 | "Caolain", 1179 | "si363", 1180 | 0 1181 | ], 1182 | [ 1183 | "Symon", 1184 | "si363", 1185 | 0 1186 | ], 1187 | [ 1188 | 
"Simonne", 1189 | "si363", 1190 | 0 1191 | ], 1192 | [ 1193 | "Raith", 1194 | "si363", 1195 | 0 1196 | ], 1197 | [ 1198 | "Rubi", 1199 | "si363", 1200 | 0 1201 | ], 1202 | [ 1203 | "Cadon", 1204 | "si363", 1205 | 0 1206 | ], 1207 | [ 1208 | "Trey", 1209 | "si363", 1210 | 0 1211 | ], 1212 | [ 1213 | "Gytis", 1214 | "si363", 1215 | 0 1216 | ], 1217 | [ 1218 | "Elshan", 1219 | "si363", 1220 | 0 1221 | ], 1222 | [ 1223 | "Rhiannin", 1224 | "si363", 1225 | 0 1226 | ], 1227 | [ 1228 | "Flyn", 1229 | "si363", 1230 | 0 1231 | ], 1232 | [ 1233 | "Cormac", 1234 | "si363", 1235 | 0 1236 | ], 1237 | [ 1238 | "Alina", 1239 | "si363", 1240 | 0 1241 | ], 1242 | [ 1243 | "Millie", 1244 | "si363", 1245 | 0 1246 | ], 1247 | [ 1248 | "Jorji", 1249 | "si363", 1250 | 0 1251 | ], 1252 | [ 1253 | "Stevie", 1254 | "si363", 1255 | 0 1256 | ], 1257 | [ 1258 | "Celina", 1259 | "si363", 1260 | 0 1261 | ], 1262 | [ 1263 | "Peaches", 1264 | "si363", 1265 | 0 1266 | ], 1267 | [ 1268 | "Meryl", 1269 | "si363", 1270 | 0 1271 | ], 1272 | [ 1273 | "Bronwen", 1274 | "si363", 1275 | 0 1276 | ], 1277 | [ 1278 | "Kalvyn", 1279 | "si363", 1280 | 0 1281 | ], 1282 | [ 1283 | "Donald", 1284 | "si363", 1285 | 0 1286 | ], 1287 | [ 1288 | "Nevin", 1289 | "si363", 1290 | 0 1291 | ], 1292 | [ 1293 | "Crispin", 1294 | "si363", 1295 | 0 1296 | ], 1297 | [ 1298 | "Kaelynn", 1299 | "si363", 1300 | 0 1301 | ], 1302 | [ 1303 | "Braeden", 1304 | "si363", 1305 | 0 1306 | ], 1307 | [ 1308 | "Karli", 1309 | "si364", 1310 | 1 1311 | ], 1312 | [ 1313 | "Harleen", 1314 | "si364", 1315 | 0 1316 | ], 1317 | [ 1318 | "Florin", 1319 | "si364", 1320 | 0 1321 | ], 1322 | [ 1323 | "Phinehas", 1324 | "si364", 1325 | 0 1326 | ], 1327 | [ 1328 | "Ellia", 1329 | "si364", 1330 | 0 1331 | ], 1332 | [ 1333 | "Carla", 1334 | "si364", 1335 | 0 1336 | ], 1337 | [ 1338 | "Yuri", 1339 | "si364", 1340 | 0 1341 | ], 1342 | [ 1343 | "Ana", 1344 | "si364", 1345 | 0 1346 | ], 1347 | [ 1348 | "Habeeb", 1349 | "si364", 1350 | 0 1351 | ], 1352 | [ 
1353 | "Haley", 1354 | "si364", 1355 | 0 1356 | ], 1357 | [ 1358 | "Shauntel", 1359 | "si364", 1360 | 0 1361 | ], 1362 | [ 1363 | "Olaoluwapolorimi", 1364 | "si364", 1365 | 0 1366 | ], 1367 | [ 1368 | "Haneeah", 1369 | "si364", 1370 | 0 1371 | ], 1372 | [ 1373 | "Eryk", 1374 | "si364", 1375 | 0 1376 | ], 1377 | [ 1378 | "Anousha", 1379 | "si364", 1380 | 0 1381 | ], 1382 | [ 1383 | "Annaleigh", 1384 | "si364", 1385 | 0 1386 | ], 1387 | [ 1388 | "Taqwa", 1389 | "si364", 1390 | 0 1391 | ], 1392 | [ 1393 | "Hogan", 1394 | "si364", 1395 | 0 1396 | ], 1397 | [ 1398 | "Tasia", 1399 | "si364", 1400 | 0 1401 | ], 1402 | [ 1403 | "Sophie", 1404 | "si364", 1405 | 0 1406 | ], 1407 | [ 1408 | "Ghyll", 1409 | "si364", 1410 | 0 1411 | ], 1412 | [ 1413 | "CJ", 1414 | "si364", 1415 | 0 1416 | ], 1417 | [ 1418 | "Sahaib", 1419 | "si364", 1420 | 0 1421 | ], 1422 | [ 1423 | "Keziah", 1424 | "si364", 1425 | 0 1426 | ], 1427 | [ 1428 | "Keiron", 1429 | "si364", 1430 | 0 1431 | ], 1432 | [ 1433 | "Dalton", 1434 | "si364", 1435 | 0 1436 | ], 1437 | [ 1438 | "Abdurraheem", 1439 | "si364", 1440 | 0 1441 | ], 1442 | [ 1443 | "Caitlinn", 1444 | "si364", 1445 | 0 1446 | ], 1447 | [ 1448 | "Toby", 1449 | "si364", 1450 | 0 1451 | ], 1452 | [ 1453 | "Taliya", 1454 | "si364", 1455 | 0 1456 | ], 1457 | [ 1458 | "Dyllon", 1459 | "si364", 1460 | 0 1461 | ], 1462 | [ 1463 | "Roman", 1464 | "si364", 1465 | 0 1466 | ], 1467 | [ 1468 | "Caoilfhinn", 1469 | "si364", 1470 | 0 1471 | ], 1472 | [ 1473 | "Ismail", 1474 | "si364", 1475 | 0 1476 | ], 1477 | [ 1478 | "Karley", 1479 | "si364", 1480 | 0 1481 | ], 1482 | [ 1483 | "Brajan", 1484 | "si364", 1485 | 0 1486 | ], 1487 | [ 1488 | "Almaas", 1489 | "si364", 1490 | 0 1491 | ], 1492 | [ 1493 | "Callie", 1494 | "si364", 1495 | 0 1496 | ], 1497 | [ 1498 | "Jess", 1499 | "si422", 1500 | 1 1501 | ], 1502 | [ 1503 | "Dillon", 1504 | "si422", 1505 | 0 1506 | ], 1507 | [ 1508 | "Mehmet", 1509 | "si422", 1510 | 0 1511 | ], 1512 | [ 1513 | "Micaila", 1514 | "si422", 
1515 | 0 1516 | ], 1517 | [ 1518 | "Alexx", 1519 | "si422", 1520 | 0 1521 | ], 1522 | [ 1523 | "Dolan", 1524 | "si422", 1525 | 0 1526 | ], 1527 | [ 1528 | "Braden", 1529 | "si422", 1530 | 0 1531 | ], 1532 | [ 1533 | "Elena", 1534 | "si422", 1535 | 0 1536 | ], 1537 | [ 1538 | "Joaquin", 1539 | "si422", 1540 | 0 1541 | ], 1542 | [ 1543 | "Miley", 1544 | "si422", 1545 | 0 1546 | ], 1547 | [ 1548 | "Dearbhla", 1549 | "si422", 1550 | 0 1551 | ], 1552 | [ 1553 | "Francisca", 1554 | "si422", 1555 | 0 1556 | ], 1557 | [ 1558 | "Bracken", 1559 | "si422", 1560 | 0 1561 | ], 1562 | [ 1563 | "Stewarty", 1564 | "si422", 1565 | 0 1566 | ], 1567 | [ 1568 | "Tea", 1569 | "si422", 1570 | 0 1571 | ], 1572 | [ 1573 | "Stephen", 1574 | "si422", 1575 | 0 1576 | ], 1577 | [ 1578 | "Abar", 1579 | "si422", 1580 | 0 1581 | ], 1582 | [ 1583 | "Martin", 1584 | "si430", 1585 | 1 1586 | ], 1587 | [ 1588 | "Carol", 1589 | "si430", 1590 | 0 1591 | ], 1592 | [ 1593 | "Coray", 1594 | "si430", 1595 | 0 1596 | ], 1597 | [ 1598 | "Reggie", 1599 | "si430", 1600 | 0 1601 | ], 1602 | [ 1603 | "Jayhan", 1604 | "si430", 1605 | 0 1606 | ], 1607 | [ 1608 | "Phoenix", 1609 | "si430", 1610 | 0 1611 | ], 1612 | [ 1613 | "Cieran", 1614 | "si430", 1615 | 0 1616 | ], 1617 | [ 1618 | "Annaliesse", 1619 | "si430", 1620 | 0 1621 | ], 1622 | [ 1623 | "Eabha", 1624 | "si430", 1625 | 0 1626 | ], 1627 | [ 1628 | "Cesar", 1629 | "si430", 1630 | 0 1631 | ], 1632 | [ 1633 | "Nikol", 1634 | "si430", 1635 | 0 1636 | ], 1637 | [ 1638 | "Anesu", 1639 | "si430", 1640 | 0 1641 | ], 1642 | [ 1643 | "Elspeth", 1644 | "si430", 1645 | 0 1646 | ], 1647 | [ 1648 | "Greg", 1649 | "si430", 1650 | 0 1651 | ], 1652 | [ 1653 | "Chi", 1654 | "si430", 1655 | 0 1656 | ], 1657 | [ 1658 | "Kaia", 1659 | "si430", 1660 | 0 1661 | ], 1662 | [ 1663 | "Kaceylee", 1664 | "si430", 1665 | 0 1666 | ], 1667 | [ 1668 | "Madelyn", 1669 | "si430", 1670 | 0 1671 | ], 1672 | [ 1673 | "Ayan", 1674 | "si430", 1675 | 0 1676 | ], 1677 | [ 1678 | "Zuzia", 1679 | 
"si430", 1680 | 0 1681 | ], 1682 | [ 1683 | "Tasia", 1684 | "si430", 1685 | 0 1686 | ], 1687 | [ 1688 | "Renars", 1689 | "si430", 1690 | 0 1691 | ], 1692 | [ 1693 | "Fia", 1694 | "si430", 1695 | 0 1696 | ], 1697 | [ 1698 | "Trey", 1699 | "si430", 1700 | 0 1701 | ], 1702 | [ 1703 | "Leven", 1704 | "si430", 1705 | 0 1706 | ], 1707 | [ 1708 | "VJay", 1709 | "si430", 1710 | 0 1711 | ], 1712 | [ 1713 | "Fiza", 1714 | "si430", 1715 | 0 1716 | ], 1717 | [ 1718 | "Sanaa", 1719 | "si430", 1720 | 0 1721 | ], 1722 | [ 1723 | "Ingrid", 1724 | "si430", 1725 | 0 1726 | ], 1727 | [ 1728 | "Laurajane", 1729 | "si430", 1730 | 0 1731 | ], 1732 | [ 1733 | "Cyrus", 1734 | "si430", 1735 | 0 1736 | ], 1737 | [ 1738 | "Juniper", 1739 | "si430", 1740 | 0 1741 | ], 1742 | [ 1743 | "Aref", 1744 | "si430", 1745 | 0 1746 | ], 1747 | [ 1748 | "Lex", 1749 | "si430", 1750 | 0 1751 | ], 1752 | [ 1753 | "Deshawn", 1754 | "si430", 1755 | 0 1756 | ], 1757 | [ 1758 | "Raigen", 1759 | "si430", 1760 | 0 1761 | ], 1762 | [ 1763 | "Karl", 1764 | "si430", 1765 | 0 1766 | ], 1767 | [ 1768 | "Aron", 1769 | "si430", 1770 | 0 1771 | ], 1772 | [ 1773 | "Navneet", 1774 | "si430", 1775 | 0 1776 | ], 1777 | [ 1778 | "Ruaidhri", 1779 | "si430", 1780 | 0 1781 | ], 1782 | [ 1783 | "Cate", 1784 | "si430", 1785 | 0 1786 | ], 1787 | [ 1788 | "Gary", 1789 | "si430", 1790 | 0 1791 | ], 1792 | [ 1793 | "Nazia", 1794 | "si430", 1795 | 0 1796 | ], 1797 | [ 1798 | "Efan", 1799 | "si430", 1800 | 0 1801 | ], 1802 | [ 1803 | "Khizar", 1804 | "si430", 1805 | 0 1806 | ], 1807 | [ 1808 | "Swarnalakshmi", 1809 | "si430", 1810 | 0 1811 | ], 1812 | [ 1813 | "Dorian", 1814 | "si430", 1815 | 0 1816 | ], 1817 | [ 1818 | "Roisin", 1819 | "si430", 1820 | 0 1821 | ], 1822 | [ 1823 | "Anona", 1824 | "si430", 1825 | 0 1826 | ] 1827 | ] -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/rosterdb.sqlite: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex15/rosterdb.sqlite -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex15/trackdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex15/trackdb.sqlite -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Places API with a Database and 2 | Visualizing Data on Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | Note: Windows has difficulty in displaying UTF-8 characters 10 | in the console so for each command window you open, you may need 11 | to type the following command before running this code: 12 | 13 | chcp 65001 14 | 15 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 16 | 17 | 18 | You should install the SQLite browser to view and modify 19 | the databases from: 20 | 21 | http://sqlitebrowser.org/ 22 | 23 | The first problem to solve is that the Google geocoding 24 | API is rate limited to a fixed number of requests per day. 25 | So if you have a lot of data you might need to stop and 26 | restart the lookup process several times. So we break 27 | the problem into two phases. 
28 | 29 | In the first phase we take our input data in the file 30 | (where.data) and read it one line at a time, and retrieve the 31 | geocoded response and store it in a database (geodata.sqlite). 32 | Before we use the geocoding API, we simply check to see if 33 | we already have the data for that particular line of input. 34 | 35 | You can re-start the process at any time by removing the file 36 | geodata.sqlite 37 | 38 | Run the geoload.py program. This program will read the input 39 | lines in where.data and for each line check to see if it is already 40 | in the database and if we don't have the data for the location, 41 | call the geocoding API to retrieve the data and store it in 42 | the database. 43 | 44 | As of December 2016, the Google Geocoding APIs changed dramatically. 45 | They moved some functionality that we use from the Geocoding API 46 | into the Places API. Also all the Google Geo-related APIs require an 47 | API key. To complete this assignment without a Google account, 48 | without an API key, or from a country that blocks 49 | access to Google, you can use a subset of that data which is 50 | available at: 51 | 52 | http://py4e-data.dr-chuck.net/geojson 53 | 54 | To use this, simply leave the api_key set to False in 55 | geoload.py. 56 | 57 | This URL only has a subset of the data but it has no rate limit so 58 | it is good for testing. 59 | 60 | If you want to try this with the API key, follow the 61 | instructions at: 62 | 63 | https://developers.google.com/maps/documentation/geocoding/intro 64 | 65 | and put the API key in the code. 
66 | 67 | Here is a sample run after there is already some data in the 68 | database: 69 | 70 | Mac: python3 geoload.py 71 | Win: geoload.py 72 | 73 | Found in database Northeastern University 74 | 75 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 76 | 77 | Found in database Technion 78 | 79 | Found in database Viswakarma Institute, Pune, India 80 | 81 | Found in database UMD 82 | 83 | Found in database Tufts University 84 | 85 | Resolving Monash University 86 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Monash+University 87 | Retrieved 2063 characters { "results" : [ 88 | {u'status': u'OK', u'results': ... } 89 | 90 | Resolving Kokshetau Institute of Economics and Management 91 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Kokshetau+Institute+of+Economics+and+Management 92 | Retrieved 1749 characters { "results" : [ 93 | {u'status': u'OK', u'results': ... } 94 | 95 | The first five locations are already in the database and so they 96 | are skipped. The program scans to the point where it finds un-retrieved 97 | locations and starts retrieving them. 98 | 99 | The geoload.py can be stopped at any time, and there is a counter 100 | that you can use to limit the number of calls to the geocoding 101 | API for each run. 102 | 103 | Once you have some data loaded into geodata.sqlite, you can 104 | visualize the data using the (geodump.py) program. This 105 | program reads the database and writes the file (where.js) 106 | with the location, latitude, and longitude in the form of 107 | executable JavaScript code. 108 | 109 | A run of the geodump.py program is as follows: 110 | 111 | Mac: python3 geodump.py 112 | Win: geodump.py 113 | 114 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 115 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 116 | ...
117 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 118 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 119 | Kokshetau, Kazakhstan 53.2833333 69.3833333 120 | ... 121 | 12 records written to where.js 122 | Open where.html to view the data in a browser 123 | 124 | The file (where.html) consists of HTML and JavaScript to visualize 125 | a Google Map. It reads the most recent data in where.js to get 126 | the data to be visualized. Here is the format of the where.js file: 127 | 128 | myData = [ 129 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 130 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 131 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 132 | ... 133 | ]; 134 | 135 | This is a JavaScript list of lists. The syntax for JavaScript 136 | list constants is very similar to Python so the syntax should 137 | be familiar to you. 138 | 139 | Simply open where.html in a browser to see the locations. You 140 | can hover over each map pin to find the location that the 141 | geocoding API returned for the user-entered input. If you 142 | cannot see any data when you open the where.html file, you might 143 | want to check the JavaScript or developer console for your browser.
144 | 145 | -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/geodata.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex16/geodata.sqlite -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/geodump running.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex16/geodump running.jpg -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/geodump.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # This project uses the Google GeoCoding API to retrieve data and then uses 3 | # Google Maps to visualize the data. 4 | # This part of the project reads the data from a database and writes the 5 | # parts into a javascript file for future visualisation.
#===============================================================================

#--- Importing libraries
import sqlite3
import json
import codecs

#--- Opening connection to the SQLite database
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()

#--- Reading everything from the Locations table into the cursor
cur.execute('SELECT * FROM Locations')

#--- Opening the JavaScript output file; codecs.open forces UTF-8 output
fhand = codecs.open('where.js', 'w', "utf-8")
fhand.write("myData = [\n")
count = 0   # number of records actually written to where.js

#--- Looping through all the rows read from the database
for row in cur:
    #--- row[1] is the stored geodata blob; decode it and parse as JSON.
    # Narrowed from a bare 'except:' -- only malformed JSON is skipped now.
    data = row[1].decode()
    try:
        js = json.loads(data)
    except ValueError:
        continue

    #--- Skipping entries whose geocoding request did not succeed
    if not ('status' in js and js['status'] == 'OK'):
        continue

    #--- Getting the latitude, longitude and the formatted address
    lat = js["results"][0]["geometry"]["location"]["lat"]
    lng = js["results"][0]["geometry"]["location"]["lng"]
    if lat == 0 or lng == 0:
        continue
    where = js['results'][0]['formatted_address']
    # Single quotes would break the JS string literal written below
    where = where.replace("'", "")

    #--- Printing the record; Windows consoles may fail on non-ASCII output
    # (see README note about chcp 65001) -- such records are skipped, as before.
    try:
        print(where, lat, lng)

        count = count + 1
        if count > 1:
            fhand.write(",\n")
        output = "[" + str(lat) + "," + str(lng) + ", '" + where + "']"
        fhand.write(output)
    except UnicodeEncodeError:
        continue

#--- Finishing the JavaScript file and closing all resources
fhand.write("\n];\n")
cur.close()
conn.close()   # the original left the database connection open
fhand.close()

#--- Printing the number of records written into the javascript file and a message
#--- to continue with visualisation of the data in 'where.html'
print(count, "records written to where.js")
print("Open where.html to view the data in a browser")

#===============================================================================
# This project uses the Google GeoCoding API to retrieve data and then uses
# Google Maps to visualize the data.
# This part of the project reads the data from a file, requests geocode of
# the read location from the Google Maps Geocoding API and writes the results
# into a database.
#===============================================================================

#--- Importing libraries
import urllib.request, urllib.parse, urllib.error
import http
import sqlite3
import json
import time
import ssl
import sys
import os

#--- Initialising
# SECURITY(fix): a real Google API key used to be committed here as a string
# literal.  Credentials must never be checked into source control -- the key
# is now read from the GOOGLE_API_KEY environment variable instead, and the
# previously leaked key should be revoked in the Google Cloud console.
api_key = os.environ.get('GOOGLE_API_KEY') or False

if api_key is False:
    # Rate-limited teaching proxy that needs no key
    serviceurl = "http://py4e-data.dr-chuck.net/geojson?"
else:
    serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"

# Additional detail for urllib
# http.client.HTTPConnection.debuglevel = 1

#--- Opening connection to the SQLite database
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()

#--- Initialising database
cur.execute('''
CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''')

#--- Ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#--- Opening connection to the data file
fh = open("where.data")
count = 0
#--- Looping through the data file, retrieving at most 200 locations per run
for line in fh:
    # NOTE(fix): the original condition was 'count > 200', which actually
    # allowed 201 retrievals before breaking (off by one vs. the message).
    if count >= 200:
        print('Retrieved 200 locations, restart to retrieve more')
        break

    #--- Getting the geodata of the address from the database
    address = line.strip()
    print('')
    # Addresses are stored as BLOBs (via memoryview) for compatibility with
    # geodata.sqlite files produced by earlier runs of this script.
    cur.execute("SELECT geodata FROM Locations WHERE address= ?",
                (memoryview(address.encode()),))

    #--- Skipping the address if its geodata is already in the database.
    # NOTE(fix): replaced the bare 'try/except: pass' around fetchone()[0]
    # with an explicit None check, so genuine database errors are not hidden.
    row = cur.fetchone()
    if row is not None:
        print("Found in database ", address)
        continue

    #--- Constructing the encoded URL using the address and the API key
    parms = dict()
    parms["address"] = address
    if api_key is not False:
        parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    #--- Printing the URL for the user's convenience
    print('Retrieving', url)

    #--- Submitting the service request by opening the URL
    uh = urllib.request.urlopen(url, context=ctx)
    data = uh.read().decode()
    #--- Printing the first 20 characters of the result
    print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' '))
    count = count + 1

    #--- Loading the received result in as a JSON object
    try:
        js = json.loads(data)
    except ValueError:
        print(data)  # We print in case unicode causes an error
        continue

    #--- Stopping with an error message if something is wrong with the status
    if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS'):
        print('==== Failure To Retrieve ====')
        print(data)
        break

    #--- Adding the received result into the SQLite database
    cur.execute('''INSERT INTO Locations (address, geodata)
            VALUES ( ?, ? )''',
                (memoryview(address.encode()), memoryview(data.encode())))
    conn.commit()

    #--- Adding a delay after every 10th request to be polite to the API
    if count % 10 == 0:
        print('Pausing for a bit...')
        time.sleep(1)

#--- Releasing the file handle and database resources.
# NOTE(fix): the original script never closed the data file, cursor or
# connection.
fh.close()
cur.close()
conn.close()

#--- Printing a message at the end to continue with dumping the data using 'geodump.py'
print("Run geodump.py to read the data from the database so you can vizualize it on a map.")
44 | Federal University of Minas Gerais 45 | Florida Atlantic University 46 | Franklin Pierce College 47 | Gauhati University 48 | George Mason University 49 | Georgetown University Law Center 50 | Georgia State University 51 | Grandville 52 | Groep T University 53 | Hanoi University of Science and Technology 54 | Hebrew University 55 | IIIT Hyderabad 56 | IIT KANPUR 57 | IT College of Estonia 58 | IU 59 | IUAV Venezia 60 | Illinois Institute of Technology 61 | Illinois State University Joliet Junior College 62 | Indian Institute of Technology 63 | Indian Institute of Technology Kharagpur India 64 | Indian School of Mines Dhanbad 65 | Indiana University 66 | Indiana University at Bloomington 67 | Institut Superieur de technologies 68 | Institute of Business and Modern Technologies 69 | Instituto Tecnologico de Santo Domingo 70 | International Institute of Information Technology Hyderabad 71 | Irkutsk State University 72 | JADAVPUR UNIVERSITY 73 | Jawaharlal Nehru Technological University 74 | Jawaharlal Nehru University 75 | Jordan University of Science and Technology 76 | K-State 77 | KUL 78 | Kalamazoo College 79 | Kaunas Technology University 80 | Kaunas university of technology 81 | Kazan Federal University 82 | Kent State University 83 | Kharkiv State Academy of Municipal Economy Ukraine 84 | King Mongkuts University of Technology Thonburi 85 | Kokshetau Institute of Economics and Management 86 | Kyiv Polytechnic Institute 87 | Kyiv Polytechnical Institute 88 | Kyiv Unisersity of Oriental Language 89 | Laurentian University 90 | Lisandro Alvarado 91 | Lodz University of Technology 92 | Lviv University 93 | MSU 94 | Madras university 95 | Magnitogorsk State Technical University 96 | Malayer Azad University 97 | Marietta College 98 | Masdar Institute 99 | Matematicki fakultet Beograd 100 | Michigan State University 101 | Middle East Technical University 102 | Missouri University of Science and Technology 103 | Monash 104 | Monash University 105 | Monash 
University Churchill Australia 106 | Monterrey Institute of Technology and Higher Education 107 | Moscow Engineering-Physics Institute 108 | Moscow Institute of Physics & Technology 109 | Moscow State University 110 | NIT ROURKELA 111 | NYU 112 | Nagpur University 113 | Nanyang Technological University 114 | National Institute of Technology Jalandhar 115 | National Taiwan University 116 | National University of Engineering 117 | North Central College 118 | Northeastern University 119 | Northwestern University 120 | Obninsk Technical University of Nuclear Power Engineering Russia 121 | Old Dominion University 122 | Oregon Institute of Technology 123 | PUCMM 124 | Payame Noor University 125 | Penn State University 126 | Politecnico di Milano 127 | Politehnica University Bucharest 128 | Polytechnic University of Timisoara 129 | Pondicherry University 130 | Pontificia universidad catolica de chile 131 | Portland State University 132 | Purdue University Indianapolis 133 | R V College of Engineering 134 | RPI 135 | Ramapo College of New Jersey 136 | Rochester Institute of Technology 137 | SASTRA University 138 | Saint Petersburg State University 139 | Saint Petersburg State University of Aerospace Instrumentation 140 | Saint-Petersburg Polytechnic Univesity 141 | San Francisco State University 142 | San Jose State University 143 | Shanghai Jiao Tong University 144 | Sharif University of Technology 145 | Simon Bolivar University 146 | Simon Fraser University 147 | Smolensk State University 148 | Sonoma State University 149 | South Federal University 150 | Spiru Haret University 151 | Stanford 152 | State University of Campinas 153 | State University of New York College at Oswego 154 | Stellenbosch University 155 | Stonehill College 156 | Tallinn University 157 | Tallinn University of Technology 158 | Tampere University of Technology 159 | Tanta University 160 | Tarrant County College 161 | Technical University of Cluj-Napoca 162 | Technion 163 | Tel Aviv University 164 | 
The Jerusalem collage of engineering 165 | The University of Latvia 166 | The University of Manchester 167 | The University of South Africa 168 | Transilvania University 169 | Tufts University 170 | UC Berkeley 171 | UCLA 172 | UCSD 173 | UIUC 174 | UMD 175 | UNISA 176 | UNIVERSIDAD DE Buenos Aires 177 | UOC 178 | USC 179 | UW Madison 180 | Universidad Central de Venezuela 181 | Universidad Complutense de Madrid 182 | Universidad Cooperativa de Colombia 183 | Universidad Nacional Autonoma de Mexico 184 | Universidad Nacional Costa Rica 185 | Universidad Nacional de Colombia 186 | Universidad Tecnologica Boliviana 187 | Universidad de Buenos Aires 188 | Universidad de Castilla La Mancha 189 | Universidad de Los Andes Colombia 190 | Universidad de Oriente 191 | Universidad de San Carlos de Guatemala 192 | Universidad de Valladolid 193 | Universidad de la Sabana 194 | Universidad del Valle de Guatemala 195 | Universidade Federal da Paraiba 196 | Universidade Federal de Santa Catarina 197 | Universidade Federal do Rio Grande do Sul 198 | Universidade Federal do Rio de Janeiro 199 | Universidade Tecnica de Lisboa 200 | Universidade de Sao Paulo 201 | Universidade do Minho 202 | Universitas Gadjah Mada 203 | Universitat Politecnica de Valencia 204 | Universite Catholique de Louvain 205 | University College Dublin 206 | University Munich 207 | University of Akron 208 | University of Alberta 209 | University of Amsterdam 210 | University of Arkansas 211 | University of Athens 212 | University of Belgrade 213 | University of Birmingham 214 | University of Buenos Aires 215 | University of Cambridge 216 | University of Central Oklahoma 217 | University of Chicago 218 | University of Cincinnati 219 | University of Colorado at Boulder 220 | University of Connecticut 221 | University of Dallas 222 | University of Debrecen 223 | University of Delaware 224 | University of Erlangen-Nuremberg 225 | University of Essex 226 | University of Evora 227 | University of Florida 228 | 
University of Gothenburg 229 | University of Greifswald 230 | University of Hamburg 231 | University of Hawaii 232 | University of Helsinki 233 | University of Ilorin Kwara State 234 | University of Jaffna 235 | University of Kansas 236 | University of Kerala 237 | University of London 238 | University of Malaga 239 | University of Malaya 240 | University of Manchester 241 | University of Michigan 242 | University of Missouri - Columbia 243 | University of Moratuwa 244 | University of Mumbai 245 | University of Nebraska 246 | University of Nebraska - Lincoln 247 | University of New Haven 248 | University of New South Wales 249 | University of Notre Dame 250 | University of Oklahoma 251 | University of Ottawa 252 | University of Oxford 253 | University of Padua 254 | University of Pavia Italy 255 | University of Pennsylvania 256 | University of Piraeus Athens 257 | University of Pretoria 258 | University of Salamanca 259 | University of Sao Paulo 260 | University of Sarajevo 261 | University of Southern California 262 | University of Stellenbosch 263 | University of Tartu 264 | University of Tehran 265 | University of Texas 266 | University of Texas at Austin 267 | University of Toronto 268 | University of Tuebingen 269 | University of Twente 270 | University of Utah 271 | University of Vienna 272 | University of Warsaw 273 | University of Washington 274 | University of Washington - Bothell 275 | University of Waterloo 276 | University of West Florida 277 | University of Wisconsin 278 | University of the Punjab Lahore 279 | University of the Witwatersrand 280 | Vilnius Gediminas Technical University 281 | Vilnius University 282 | Virginia Commonwealth University 283 | Virginia Tech 284 | Viswakarma Institute Pune India 285 | Warsaw University 286 | Washington State University 287 | Wayne State 288 | Weber State 289 | Weizmann Institute of Science 290 | Western Governors University 291 | Xavier University 292 | Zagazig University 293 | allama iqbal open university 
islamabad 294 | arizona state university 295 | federal institute of tecnology and education from southeastern Minas Gerais 296 | kansas state university 297 | universidad complutense de madrid 298 | university of Patras 299 | university of padua 300 | -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 45 | 46 | 47 |
48 |

About this Map

49 |

50 | This is a cool map from 51 | www.py4e.com. 52 |

53 | 54 | 55 | -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/where.js: -------------------------------------------------------------------------------- 1 | myData = [ 2 | [50.06688579999999,19.9136192, 'aleja Adama Mickiewicza 30, 30-059 Kraków, Poland'], 3 | [52.2394019,21.0150792, 'Krakowskie Przedmieście 5, 00-068 Warszawa, Poland'], 4 | [33.4641541,-111.9231478, '1475 N Scottsdale Rd, Scottsdale, AZ 85257, USA'], 5 | [38.0399391,23.8030901, 'Monumental Plaza, Building C, 1st Floor, Leof. Kifisias 44, Marousi 151 25, Greece'], 6 | [28.3639976,75.58696809999999, 'VidyaVihar Campus, Pilani, Rajasthan 333031, India'], 7 | [6.8919631,3.7186605, 'Ilishan Remo Ogun State Nigeria, ILISHAN REMO, Nigeria'], 8 | [25.2677203,82.99125819999999, 'Ajagara, Banaras Hindu University Campus, Varanasi, Uttar Pradesh 221005, India'], 9 | [12.9503878,77.5022224, 'Mysore Road, Jnana Bharathi, Bengaluru, Karnataka 560056, India'], 10 | [31.549841,-97.1143146, '1301 S University Parks Dr, Waco, TX 76706, USA'], 11 | [39.9619537,116.3662615, '19 Xinjiekou Outer St, BeiTaiPingZhuang, Haidian Qu, Beijing Shi, China, 100875'], 12 | [53.8930389,27.5455567, 'praspiekt Niezaliežnasci 4, Minsk, Belarus'], 13 | [44.8184339,20.4575676, 'Studentski trg 1, Beograd, Serbia'], 14 | [42.5030333,-89.0309048, '700 College St, Beloit, WI 53511, USA'], 15 | [53.8930389,27.5455567, 'praspiekt Niezaliežnasci 4, Minsk, Belarus'], 16 | [10.6779085,78.74454879999999, 'Palkalaiperur, Tiruchirappalli, Tamil Nadu 620024, India'], 17 | [42.3504997,-71.1053991, 'Boston, MA 02215, USA'], 18 | [47.486135,19.057964, 'Budapest, Fővám tér 8., 1093 Hungary'], 19 | [35.3050053,-120.6624942, 'San Luis Obispo, CA 93407, USA'] 20 | ]; 21 | -------------------------------------------------------------------------------- /Course 4 - Using databases with Python/ex16/zoomed map with added location.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 4 - Using databases with Python/ex16/zoomed map with added location.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 
15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 
and then go spider a few more pages with spider.py and then run sprank.py
should run sprank.py long enough that the page ranks converge.
154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js. 161 | 162 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/__pycache__/spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/__pycache__/spider.cpython-36.pyc -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/dr-chuck-site-dump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/dr-chuck-site-dump.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/dr-chuck-site-top25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/dr-chuck-site-top25.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | 
stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { 
#===============================================================================
# Dumps the contents of the spider.sqlite database: for every crawled page
# with at least one incoming link it prints (inbound links, old page rank,
# new page rank, page id, url) -- at most 50 rows -- followed by the total
# row count.
#===============================================================================
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
     FROM Pages JOIN Links ON Pages.id = Links.to_id
     WHERE html IS NOT NULL
     GROUP BY id ORDER BY inbound DESC''')

count = 0
for row in cur:
    # Only the first 50 rows are printed, but every row is counted.
    if count < 50:
        print(row)
    count = count + 1
print(count, 'rows.')

# NOTE(fix): close the connection as well, not just the cursor, so the
# database file handle is released deterministically.
cur.close()
conn.close()
| spiderJson = {"nodes":[ 2 | {"weight":21,"rank":19.0, "id":1, "url":"http://variance.hu"}, 3 | {"weight":27,"rank":15.558784255770082, "id":22, "url":"http://variance.hu/2018/01/03/kilenc-ev"}, 4 | {"weight":22,"rank":19.0, "id":107, "url":"http://variance.hu/tag/ant"}, 5 | {"weight":23,"rank":19.0, "id":150, "url":"http://variance.hu/tag/block"}, 6 | {"weight":22,"rank":19.0, "id":157, "url":"http://variance.hu/tag/bor"}, 7 | {"weight":22,"rank":19.0, "id":213, "url":"http://variance.hu/tag/cyptocurrency"}, 8 | {"weight":20,"rank":15.558784255770082, "id":299, "url":"http://variance.hu/tag/gephaz"}, 9 | {"weight":21,"rank":15.558784255770082, "id":341, "url":"http://variance.hu/tag/jobs"}, 10 | {"weight":21,"rank":15.558784255770082, "id":342, "url":"http://variance.hu/tag/job-seeking"}, 11 | {"weight":22,"rank":15.756062326850063, "id":354, "url":"http://variance.hu/tag/kotveny"}, 12 | {"weight":20,"rank":15.558784255770082, "id":361, "url":"http://variance.hu/tag/linear-regression"}, 13 | {"weight":20,"rank":16.528019996293462, "id":372, "url":"http://variance.hu/tag/manipulation"}, 14 | {"weight":27,"rank":15.609796663166051, "id":469, "url":"http://variance.hu/tag/python"}, 15 | {"weight":20,"rank":15.558784255770082, "id":553, "url":"http://variance.hu/tag/ta"}, 16 | {"weight":20,"rank":15.558784255770082, "id":556, "url":"http://variance.hu/tag/tangle"}, 17 | {"weight":21,"rank":15.558784255770082, "id":602, "url":"http://variance.hu/tag/utxo"}, 18 | {"weight":20,"rank":15.558784255770082, "id":621, "url":"http://variance.hu/tag/whales"}, 19 | {"weight":29,"rank":15.558784255770082, "id":636, "url":"http://variance.hu/tag/zec"}, 20 | {"weight":19,"rank":0.0, "id":661, "url":"http://variance.hu/2017/08/28/ismerkedes-a-bittrex-api-val-python"}, 21 | {"weight":9,"rank":0.00812587020466735, "id":675, "url":"http://variance.hu/2014/01/09/enervalt-piaci-hangulat-mellett-szaguld-az-oko-szektor"}], 22 | "links":[ 23 | {"source":0,"target":0,"value":3}, 24 | 
{"source":0,"target":1,"value":3}, 25 | {"source":0,"target":2,"value":3}, 26 | {"source":0,"target":3,"value":3}, 27 | {"source":0,"target":4,"value":3}, 28 | {"source":0,"target":5,"value":3}, 29 | {"source":0,"target":6,"value":3}, 30 | {"source":0,"target":7,"value":3}, 31 | {"source":0,"target":8,"value":3}, 32 | {"source":0,"target":9,"value":3}, 33 | {"source":0,"target":10,"value":3}, 34 | {"source":0,"target":11,"value":3}, 35 | {"source":0,"target":12,"value":3}, 36 | {"source":0,"target":13,"value":3}, 37 | {"source":0,"target":14,"value":3}, 38 | {"source":0,"target":15,"value":3}, 39 | {"source":0,"target":16,"value":3}, 40 | {"source":0,"target":17,"value":3}, 41 | {"source":2,"target":0,"value":3}, 42 | {"source":2,"target":2,"value":3}, 43 | {"source":2,"target":3,"value":3}, 44 | {"source":2,"target":4,"value":3}, 45 | {"source":2,"target":5,"value":3}, 46 | {"source":2,"target":6,"value":3}, 47 | {"source":2,"target":7,"value":3}, 48 | {"source":2,"target":8,"value":3}, 49 | {"source":2,"target":9,"value":3}, 50 | {"source":2,"target":10,"value":3}, 51 | {"source":2,"target":11,"value":3}, 52 | {"source":2,"target":12,"value":3}, 53 | {"source":2,"target":13,"value":3}, 54 | {"source":2,"target":14,"value":3}, 55 | {"source":2,"target":15,"value":3}, 56 | {"source":2,"target":16,"value":3}, 57 | {"source":2,"target":17,"value":3}, 58 | {"source":2,"target":1,"value":3}, 59 | {"source":13,"target":0,"value":3}, 60 | {"source":13,"target":13,"value":3}, 61 | {"source":13,"target":2,"value":3}, 62 | {"source":13,"target":3,"value":3}, 63 | {"source":13,"target":4,"value":3}, 64 | {"source":13,"target":5,"value":3}, 65 | {"source":13,"target":6,"value":3}, 66 | {"source":13,"target":7,"value":3}, 67 | {"source":13,"target":8,"value":3}, 68 | {"source":13,"target":9,"value":3}, 69 | {"source":13,"target":10,"value":3}, 70 | {"source":13,"target":11,"value":3}, 71 | {"source":13,"target":12,"value":3}, 72 | {"source":13,"target":14,"value":3}, 73 | 
{"source":13,"target":15,"value":3}, 74 | {"source":13,"target":16,"value":3}, 75 | {"source":13,"target":17,"value":3}, 76 | {"source":13,"target":1,"value":3}, 77 | {"source":5,"target":0,"value":3}, 78 | {"source":5,"target":5,"value":3}, 79 | {"source":5,"target":2,"value":3}, 80 | {"source":5,"target":3,"value":3}, 81 | {"source":5,"target":4,"value":3}, 82 | {"source":5,"target":6,"value":3}, 83 | {"source":5,"target":7,"value":3}, 84 | {"source":5,"target":8,"value":3}, 85 | {"source":5,"target":9,"value":3}, 86 | {"source":5,"target":10,"value":3}, 87 | {"source":5,"target":11,"value":3}, 88 | {"source":5,"target":12,"value":3}, 89 | {"source":5,"target":13,"value":3}, 90 | {"source":5,"target":14,"value":3}, 91 | {"source":5,"target":15,"value":3}, 92 | {"source":5,"target":16,"value":3}, 93 | {"source":5,"target":17,"value":3}, 94 | {"source":5,"target":1,"value":3}, 95 | {"source":15,"target":0,"value":3}, 96 | {"source":15,"target":15,"value":3}, 97 | {"source":15,"target":2,"value":3}, 98 | {"source":15,"target":3,"value":3}, 99 | {"source":15,"target":4,"value":3}, 100 | {"source":15,"target":5,"value":3}, 101 | {"source":15,"target":6,"value":3}, 102 | {"source":15,"target":7,"value":3}, 103 | {"source":15,"target":8,"value":3}, 104 | {"source":15,"target":9,"value":3}, 105 | {"source":15,"target":10,"value":3}, 106 | {"source":15,"target":11,"value":3}, 107 | {"source":15,"target":12,"value":3}, 108 | {"source":15,"target":13,"value":3}, 109 | {"source":15,"target":14,"value":3}, 110 | {"source":15,"target":16,"value":3}, 111 | {"source":15,"target":17,"value":3}, 112 | {"source":15,"target":1,"value":3}, 113 | {"source":4,"target":0,"value":3}, 114 | {"source":4,"target":4,"value":3}, 115 | {"source":4,"target":2,"value":3}, 116 | {"source":4,"target":3,"value":3}, 117 | {"source":4,"target":5,"value":3}, 118 | {"source":4,"target":6,"value":3}, 119 | {"source":4,"target":7,"value":3}, 120 | {"source":4,"target":8,"value":3}, 121 | 
{"source":4,"target":9,"value":3}, 122 | {"source":4,"target":10,"value":3}, 123 | {"source":4,"target":11,"value":3}, 124 | {"source":4,"target":12,"value":3}, 125 | {"source":4,"target":13,"value":3}, 126 | {"source":4,"target":14,"value":3}, 127 | {"source":4,"target":15,"value":3}, 128 | {"source":4,"target":16,"value":3}, 129 | {"source":4,"target":17,"value":3}, 130 | {"source":4,"target":1,"value":3}, 131 | {"source":14,"target":0,"value":3}, 132 | {"source":14,"target":14,"value":3}, 133 | {"source":14,"target":2,"value":3}, 134 | {"source":14,"target":3,"value":3}, 135 | {"source":14,"target":4,"value":3}, 136 | {"source":14,"target":5,"value":3}, 137 | {"source":14,"target":6,"value":3}, 138 | {"source":14,"target":7,"value":3}, 139 | {"source":14,"target":8,"value":3}, 140 | {"source":14,"target":9,"value":3}, 141 | {"source":14,"target":10,"value":3}, 142 | {"source":14,"target":11,"value":3}, 143 | {"source":14,"target":12,"value":3}, 144 | {"source":14,"target":13,"value":3}, 145 | {"source":14,"target":15,"value":3}, 146 | {"source":14,"target":16,"value":3}, 147 | {"source":14,"target":17,"value":3}, 148 | {"source":14,"target":1,"value":3}, 149 | {"source":12,"target":0,"value":3}, 150 | {"source":12,"target":12,"value":3}, 151 | {"source":12,"target":18,"value":3}, 152 | {"source":12,"target":2,"value":3}, 153 | {"source":12,"target":3,"value":3}, 154 | {"source":12,"target":4,"value":3}, 155 | {"source":12,"target":5,"value":3}, 156 | {"source":12,"target":6,"value":3}, 157 | {"source":12,"target":7,"value":3}, 158 | {"source":12,"target":8,"value":3}, 159 | {"source":12,"target":9,"value":3}, 160 | {"source":12,"target":10,"value":3}, 161 | {"source":12,"target":11,"value":3}, 162 | {"source":12,"target":13,"value":3}, 163 | {"source":12,"target":14,"value":3}, 164 | {"source":12,"target":15,"value":3}, 165 | {"source":12,"target":16,"value":3}, 166 | {"source":12,"target":17,"value":3}, 167 | {"source":12,"target":1,"value":3}, 168 | 
{"source":7,"target":0,"value":3}, 169 | {"source":7,"target":7,"value":3}, 170 | {"source":7,"target":8,"value":3}, 171 | {"source":7,"target":2,"value":3}, 172 | {"source":7,"target":3,"value":3}, 173 | {"source":7,"target":4,"value":3}, 174 | {"source":7,"target":5,"value":3}, 175 | {"source":7,"target":6,"value":3}, 176 | {"source":7,"target":9,"value":3}, 177 | {"source":7,"target":10,"value":3}, 178 | {"source":7,"target":11,"value":3}, 179 | {"source":7,"target":12,"value":3}, 180 | {"source":7,"target":13,"value":3}, 181 | {"source":7,"target":14,"value":3}, 182 | {"source":7,"target":15,"value":3}, 183 | {"source":7,"target":16,"value":3}, 184 | {"source":7,"target":17,"value":3}, 185 | {"source":7,"target":1,"value":3}, 186 | {"source":16,"target":0,"value":3}, 187 | {"source":16,"target":16,"value":3}, 188 | {"source":16,"target":2,"value":3}, 189 | {"source":16,"target":3,"value":3}, 190 | {"source":16,"target":4,"value":3}, 191 | {"source":16,"target":5,"value":3}, 192 | {"source":16,"target":6,"value":3}, 193 | {"source":16,"target":7,"value":3}, 194 | {"source":16,"target":8,"value":3}, 195 | {"source":16,"target":9,"value":3}, 196 | {"source":16,"target":10,"value":3}, 197 | {"source":16,"target":11,"value":3}, 198 | {"source":16,"target":12,"value":3}, 199 | {"source":16,"target":13,"value":3}, 200 | {"source":16,"target":14,"value":3}, 201 | {"source":16,"target":15,"value":3}, 202 | {"source":16,"target":17,"value":3}, 203 | {"source":16,"target":1,"value":3}, 204 | {"source":9,"target":0,"value":3}, 205 | {"source":9,"target":9,"value":3}, 206 | {"source":9,"target":19,"value":3}, 207 | {"source":9,"target":2,"value":3}, 208 | {"source":9,"target":3,"value":3}, 209 | {"source":9,"target":4,"value":3}, 210 | {"source":9,"target":5,"value":3}, 211 | {"source":9,"target":6,"value":3}, 212 | {"source":9,"target":7,"value":3}, 213 | {"source":9,"target":8,"value":3}, 214 | {"source":9,"target":10,"value":3}, 215 | {"source":9,"target":11,"value":3}, 
216 | {"source":9,"target":12,"value":3}, 217 | {"source":9,"target":13,"value":3}, 218 | {"source":9,"target":14,"value":3}, 219 | {"source":9,"target":15,"value":3}, 220 | {"source":9,"target":16,"value":3}, 221 | {"source":9,"target":17,"value":3}, 222 | {"source":9,"target":1,"value":3}, 223 | {"source":19,"target":0,"value":3}, 224 | {"source":19,"target":19,"value":3}, 225 | {"source":19,"target":9,"value":3}, 226 | {"source":19,"target":2,"value":3}, 227 | {"source":19,"target":3,"value":3}, 228 | {"source":19,"target":4,"value":3}, 229 | {"source":19,"target":5,"value":3}, 230 | {"source":18,"target":0,"value":3}, 231 | {"source":18,"target":18,"value":3}, 232 | {"source":18,"target":12,"value":3}, 233 | {"source":18,"target":2,"value":3}, 234 | {"source":18,"target":3,"value":3}, 235 | {"source":18,"target":4,"value":3}, 236 | {"source":18,"target":5,"value":3}, 237 | {"source":18,"target":6,"value":3}, 238 | {"source":18,"target":7,"value":3}, 239 | {"source":18,"target":8,"value":3}, 240 | {"source":18,"target":9,"value":3}, 241 | {"source":18,"target":10,"value":3}, 242 | {"source":18,"target":11,"value":3}, 243 | {"source":18,"target":13,"value":3}, 244 | {"source":18,"target":14,"value":3}, 245 | {"source":18,"target":15,"value":3}, 246 | {"source":18,"target":16,"value":3}, 247 | {"source":18,"target":17,"value":3}, 248 | {"source":18,"target":1,"value":3}, 249 | {"source":3,"target":0,"value":3}, 250 | {"source":3,"target":3,"value":3}, 251 | {"source":3,"target":2,"value":3}, 252 | {"source":3,"target":4,"value":3}, 253 | {"source":3,"target":5,"value":3}, 254 | {"source":3,"target":6,"value":3}, 255 | {"source":3,"target":7,"value":3}, 256 | {"source":3,"target":8,"value":3}, 257 | {"source":3,"target":9,"value":3}, 258 | {"source":3,"target":10,"value":3}, 259 | {"source":3,"target":11,"value":3}, 260 | {"source":3,"target":12,"value":3}, 261 | {"source":3,"target":13,"value":3}, 262 | {"source":3,"target":14,"value":3}, 263 | 
{"source":3,"target":15,"value":3}, 264 | {"source":3,"target":16,"value":3}, 265 | {"source":3,"target":17,"value":3}, 266 | {"source":3,"target":1,"value":3}, 267 | {"source":1,"target":0,"value":3}, 268 | {"source":1,"target":1,"value":3}, 269 | {"source":1,"target":2,"value":3}, 270 | {"source":1,"target":3,"value":3}, 271 | {"source":1,"target":4,"value":3}, 272 | {"source":1,"target":5,"value":3}, 273 | {"source":1,"target":6,"value":3}, 274 | {"source":1,"target":7,"value":3}, 275 | {"source":1,"target":8,"value":3}, 276 | {"source":1,"target":9,"value":3}, 277 | {"source":1,"target":10,"value":3}, 278 | {"source":1,"target":11,"value":3}, 279 | {"source":1,"target":12,"value":3}, 280 | {"source":1,"target":13,"value":3}, 281 | {"source":1,"target":14,"value":3}, 282 | {"source":1,"target":15,"value":3}, 283 | {"source":1,"target":16,"value":3}, 284 | {"source":1,"target":17,"value":3}, 285 | {"source":10,"target":0,"value":3}, 286 | {"source":10,"target":10,"value":3}, 287 | {"source":10,"target":2,"value":3}, 288 | {"source":10,"target":3,"value":3}, 289 | {"source":10,"target":4,"value":3}, 290 | {"source":10,"target":5,"value":3}, 291 | {"source":10,"target":6,"value":3}, 292 | {"source":10,"target":7,"value":3}, 293 | {"source":10,"target":8,"value":3}, 294 | {"source":10,"target":9,"value":3}, 295 | {"source":10,"target":11,"value":3}, 296 | {"source":10,"target":12,"value":3}, 297 | {"source":10,"target":13,"value":3}, 298 | {"source":10,"target":14,"value":3}, 299 | {"source":10,"target":15,"value":3}, 300 | {"source":10,"target":16,"value":3}, 301 | {"source":10,"target":17,"value":3}, 302 | {"source":10,"target":1,"value":3}, 303 | {"source":17,"target":0,"value":3}, 304 | {"source":17,"target":17,"value":3}, 305 | {"source":17,"target":2,"value":3}, 306 | {"source":17,"target":3,"value":3}, 307 | {"source":17,"target":4,"value":3}, 308 | {"source":17,"target":5,"value":3}, 309 | {"source":17,"target":6,"value":3}, 310 | 
{"source":17,"target":7,"value":3}, 311 | {"source":17,"target":8,"value":3}, 312 | {"source":17,"target":9,"value":3}, 313 | {"source":17,"target":10,"value":3}, 314 | {"source":17,"target":11,"value":3}, 315 | {"source":17,"target":12,"value":3}, 316 | {"source":17,"target":13,"value":3}, 317 | {"source":17,"target":14,"value":3}, 318 | {"source":17,"target":15,"value":3}, 319 | {"source":17,"target":16,"value":3}, 320 | {"source":17,"target":1,"value":3}, 321 | {"source":8,"target":0,"value":3}, 322 | {"source":8,"target":8,"value":3}, 323 | {"source":8,"target":7,"value":3}, 324 | {"source":8,"target":2,"value":3}, 325 | {"source":8,"target":3,"value":3}, 326 | {"source":8,"target":4,"value":3}, 327 | {"source":8,"target":5,"value":3}, 328 | {"source":8,"target":6,"value":3}, 329 | {"source":8,"target":9,"value":3}, 330 | {"source":8,"target":10,"value":3}, 331 | {"source":8,"target":11,"value":3}, 332 | {"source":8,"target":12,"value":3}, 333 | {"source":8,"target":13,"value":3}, 334 | {"source":8,"target":14,"value":3}, 335 | {"source":8,"target":15,"value":3}, 336 | {"source":8,"target":16,"value":3}, 337 | {"source":8,"target":17,"value":3}, 338 | {"source":8,"target":1,"value":3}, 339 | {"source":6,"target":0,"value":3}, 340 | {"source":6,"target":6,"value":3}, 341 | {"source":6,"target":2,"value":3}, 342 | {"source":6,"target":3,"value":3}, 343 | {"source":6,"target":4,"value":3}, 344 | {"source":6,"target":5,"value":3}, 345 | {"source":6,"target":7,"value":3}, 346 | {"source":6,"target":8,"value":3}, 347 | {"source":6,"target":9,"value":3}, 348 | {"source":6,"target":10,"value":3}, 349 | {"source":6,"target":11,"value":3}, 350 | {"source":6,"target":12,"value":3}, 351 | {"source":6,"target":13,"value":3}, 352 | {"source":6,"target":14,"value":3}, 353 | {"source":6,"target":15,"value":3}, 354 | {"source":6,"target":16,"value":3}, 355 | {"source":6,"target":17,"value":3}, 356 | {"source":6,"target":1,"value":3}, 357 | {"source":11,"target":0,"value":3}, 
358 | {"source":11,"target":11,"value":3}, 359 | {"source":11,"target":2,"value":3}, 360 | {"source":11,"target":3,"value":3}, 361 | {"source":11,"target":4,"value":3}, 362 | {"source":11,"target":5,"value":3}]}; -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? 
)', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? 
WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? 
LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/spider.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/spider.sqlite -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? 
")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/sprank.py: 
-------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. 
Check data.") 40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=? 
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/variance-site-dump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/variance-site-dump.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/variance-top25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex17/variance-top25.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and vizualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions 
at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in the gmane.py and is hard-coded to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base url. Make sure to delete the content.sqlite file if you 25 | switch the base url. The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down. So you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py. 
39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. 
This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an innefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process. 100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 
112 | 113 | The gmodel.py program does a number of data cleaing steps 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are 118 | forced to lower case and some of the @gmane.org address like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together 149 | 150 | You can re-run the gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 
154 | 155 | The first, simplest data analysis is to do a "who does the most" and "which 156 | organization does the most"? This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py. 181 | 182 | There is a simple visualization of the word frequency in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm.
206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/content.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/content.sqlite -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/content.sqlite-journal.temp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/content.sqlite-journal.temp -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/content.sqlite.first.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, 
processing and visualising data with Python/ex18/gmane/content.sqlite.first.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/d3.layout.cloud.js: -------------------------------------------------------------------------------- 1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/ 2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf 3 | (function(exports) { 4 | function cloud() { 5 | var size = [256, 256], 6 | text = cloudText, 7 | font = cloudFont, 8 | fontSize = cloudFontSize, 9 | fontStyle = cloudFontNormal, 10 | fontWeight = cloudFontNormal, 11 | rotate = cloudRotate, 12 | padding = cloudPadding, 13 | spiral = archimedeanSpiral, 14 | words = [], 15 | timeInterval = Infinity, 16 | event = d3.dispatch("word", "end"), 17 | timer = null, 18 | cloud = {}; 19 | 20 | cloud.start = function() { 21 | var board = zeroArray((size[0] >> 5) * size[1]), 22 | bounds = null, 23 | n = words.length, 24 | i = -1, 25 | tags = [], 26 | data = words.map(function(d, i) { 27 | d.text = text.call(this, d, i); 28 | d.font = font.call(this, d, i); 29 | d.style = fontStyle.call(this, d, i); 30 | d.weight = fontWeight.call(this, d, i); 31 | d.rotate = rotate.call(this, d, i); 32 | d.size = ~~fontSize.call(this, d, i); 33 | d.padding = cloudPadding.call(this, d, i); 34 | return d; 35 | }).sort(function(a, b) { return b.size - a.size; }); 36 | 37 | if (timer) clearInterval(timer); 38 | timer = setInterval(step, 0); 39 | step(); 40 | 41 | return cloud; 42 | 43 | function step() { 44 | var start = +new Date, 45 | d; 46 | while (+new Date - start < timeInterval && ++i < n && timer) { 47 | d = data[i]; 48 | d.x = (size[0] * (Math.random() + .5)) >> 1; 49 | d.y = (size[1] * (Math.random() + .5)) >> 1; 50 | cloudSprite(d, data, i); 51 | if (place(board, d, bounds)) { 52 | tags.push(d); 53 | event.word(d); 54 | if 
(bounds) cloudBounds(bounds, d); 55 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}]; 56 | // Temporary hack 57 | d.x -= size[0] >> 1; 58 | d.y -= size[1] >> 1; 59 | } 60 | } 61 | if (i >= n) { 62 | cloud.stop(); 63 | event.end(tags, bounds); 64 | } 65 | } 66 | } 67 | 68 | cloud.stop = function() { 69 | if (timer) { 70 | clearInterval(timer); 71 | timer = null; 72 | } 73 | return cloud; 74 | }; 75 | 76 | cloud.timeInterval = function(x) { 77 | if (!arguments.length) return timeInterval; 78 | timeInterval = x == null ? Infinity : x; 79 | return cloud; 80 | }; 81 | 82 | function place(board, tag, bounds) { 83 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}], 84 | startX = tag.x, 85 | startY = tag.y, 86 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]), 87 | s = spiral(size), 88 | dt = Math.random() < .5 ? 1 : -1, 89 | t = -dt, 90 | dxdy, 91 | dx, 92 | dy; 93 | 94 | while (dxdy = s(t += dt)) { 95 | dx = ~~dxdy[0]; 96 | dy = ~~dxdy[1]; 97 | 98 | if (Math.min(dx, dy) > maxDelta) break; 99 | 100 | tag.x = startX + dx; 101 | tag.y = startY + dy; 102 | 103 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 || 104 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue; 105 | // TODO only check for collisions within current bounds. 106 | if (!bounds || !cloudCollide(tag, board, size[0])) { 107 | if (!bounds || collideRects(tag, bounds)) { 108 | var sprite = tag.sprite, 109 | w = tag.width >> 5, 110 | sw = size[0] >> 5, 111 | lx = tag.x - (w << 4), 112 | sx = lx & 0x7f, 113 | msx = 32 - sx, 114 | h = tag.y1 - tag.y0, 115 | x = (tag.y + tag.y0) * sw + (lx >> 5), 116 | last; 117 | for (var j = 0; j < h; j++) { 118 | last = 0; 119 | for (var i = 0; i <= w; i++) { 120 | board[x + i] |= (last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0); 121 | } 122 | x += sw; 123 | } 124 | delete tag.sprite; 125 | return true; 126 | } 127 | } 128 | } 129 | return false; 130 | } 131 | 132 | cloud.words = function(x) { 133 | if (!arguments.length) return words; 134 | words = x; 135 | return cloud; 136 | }; 137 | 138 | cloud.size = function(x) { 139 | if (!arguments.length) return size; 140 | size = [+x[0], +x[1]]; 141 | return cloud; 142 | }; 143 | 144 | cloud.font = function(x) { 145 | if (!arguments.length) return font; 146 | font = d3.functor(x); 147 | return cloud; 148 | }; 149 | 150 | cloud.fontStyle = function(x) { 151 | if (!arguments.length) return fontStyle; 152 | fontStyle = d3.functor(x); 153 | return cloud; 154 | }; 155 | 156 | cloud.fontWeight = function(x) { 157 | if (!arguments.length) return fontWeight; 158 | fontWeight = d3.functor(x); 159 | return cloud; 160 | }; 161 | 162 | cloud.rotate = function(x) { 163 | if (!arguments.length) return rotate; 164 | rotate = d3.functor(x); 165 | return cloud; 166 | }; 167 | 168 | cloud.text = function(x) { 169 | if (!arguments.length) return text; 170 | text = d3.functor(x); 171 | return cloud; 172 | }; 173 | 174 | cloud.spiral = function(x) { 175 | if (!arguments.length) return spiral; 176 | spiral = spirals[x + ""] || x; 177 | return cloud; 178 | }; 179 | 180 | cloud.fontSize = function(x) { 181 | if (!arguments.length) return fontSize; 182 | fontSize = d3.functor(x); 183 | return cloud; 184 | }; 185 | 186 | cloud.padding = function(x) { 187 | if (!arguments.length) return padding; 188 | padding = d3.functor(x); 189 | return cloud; 190 | }; 191 | 192 | return d3.rebind(cloud, event, "on"); 193 | } 194 | 195 | function cloudText(d) { 196 | return d.text; 197 | } 198 | 199 | function cloudFont() { 200 | return "serif"; 201 | } 202 | 203 | function cloudFontNormal() { 204 | return "normal"; 205 | } 206 | 207 | function cloudFontSize(d) { 208 | return Math.sqrt(d.value); 209 | } 210 | 211 | function cloudRotate() { 212 | 
return (~~(Math.random() * 6) - 3) * 30; 213 | } 214 | 215 | function cloudPadding() { 216 | return 1; 217 | } 218 | 219 | // Fetches a monochrome sprite bitmap for the specified text. 220 | // Load in batches for speed. 221 | function cloudSprite(d, data, di) { 222 | if (d.sprite) return; 223 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio); 224 | var x = 0, 225 | y = 0, 226 | maxh = 0, 227 | n = data.length; 228 | di--; 229 | while (++di < n) { 230 | d = data[di]; 231 | c.save(); 232 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font; 233 | var w = c.measureText(d.text + "m").width * ratio, 234 | h = d.size << 1; 235 | if (d.rotate) { 236 | var sr = Math.sin(d.rotate * cloudRadians), 237 | cr = Math.cos(d.rotate * cloudRadians), 238 | wcr = w * cr, 239 | wsr = w * sr, 240 | hcr = h * cr, 241 | hsr = h * sr; 242 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5; 243 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr)); 244 | } else { 245 | w = (w + 0x1f) >> 5 << 5; 246 | } 247 | if (h > maxh) maxh = h; 248 | if (x + w >= (cw << 5)) { 249 | x = 0; 250 | y += maxh; 251 | maxh = 0; 252 | } 253 | if (y + h >= ch) break; 254 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio); 255 | if (d.rotate) c.rotate(d.rotate * cloudRadians); 256 | c.fillText(d.text, 0, 0); 257 | c.restore(); 258 | d.width = w; 259 | d.height = h; 260 | d.xoff = x; 261 | d.yoff = y; 262 | d.x1 = w >> 1; 263 | d.y1 = h >> 1; 264 | d.x0 = -d.x1; 265 | d.y0 = -d.y1; 266 | x += w; 267 | } 268 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data, 269 | sprite = []; 270 | while (--di >= 0) { 271 | d = data[di]; 272 | var w = d.width, 273 | w32 = w >> 5, 274 | h = d.y1 - d.y0, 275 | p = d.padding; 276 | // Zero the buffer 277 | for (var i = 0; i < h * w32; i++) sprite[i] = 0; 278 | x = d.xoff; 279 | if (x == null) return; 280 | y = d.yoff; 281 | var seen = 0, 282 | seenRow = -1; 283 | for (var j = 0; j < h; 
j++) { 284 | for (var i = 0; i < w; i++) { 285 | var k = w32 * j + (i >> 5), 286 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0; 287 | if (p) { 288 | if (j) sprite[k - w32] |= m; 289 | if (j < w - 1) sprite[k + w32] |= m; 290 | m |= (m << 1) | (m >> 1); 291 | } 292 | sprite[k] |= m; 293 | seen |= m; 294 | } 295 | if (seen) seenRow = j; 296 | else { 297 | d.y0++; 298 | h--; 299 | j--; 300 | y++; 301 | } 302 | } 303 | d.y1 = d.y0 + seenRow; 304 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32); 305 | } 306 | } 307 | 308 | // Use mask-based collision detection. 309 | function cloudCollide(tag, board, sw) { 310 | sw >>= 5; 311 | var sprite = tag.sprite, 312 | w = tag.width >> 5, 313 | lx = tag.x - (w << 4), 314 | sx = lx & 0x7f, 315 | msx = 32 - sx, 316 | h = tag.y1 - tag.y0, 317 | x = (tag.y + tag.y0) * sw + (lx >> 5), 318 | last; 319 | for (var j = 0; j < h; j++) { 320 | last = 0; 321 | for (var i = 0; i <= w; i++) { 322 | if (((last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0)) 323 | & board[x + i]) return true; 324 | } 325 | x += sw; 326 | } 327 | return false; 328 | } 329 | 330 | function cloudBounds(bounds, d) { 331 | var b0 = bounds[0], 332 | b1 = bounds[1]; 333 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0; 334 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0; 335 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1; 336 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1; 337 | } 338 | 339 | function collideRects(a, b) { 340 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y; 341 | } 342 | 343 | function archimedeanSpiral(size) { 344 | var e = size[0] / size[1]; 345 | return function(t) { 346 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)]; 347 | }; 348 | } 349 | 350 | function rectangularSpiral(size) { 351 | var dy = 4, 352 | dx = dy * size[0] / size[1], 353 | x = 0, 354 | y = 0; 355 | return function(t) { 356 | var sign = t < 0 ? 
-1 : 1; 357 | // See triangular numbers: T_n = n * (n + 1) / 2. 358 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) { 359 | case 0: x += dx; break; 360 | case 1: y += dy; break; 361 | case 2: x -= dx; break; 362 | default: y -= dy; break; 363 | } 364 | return [x, y]; 365 | }; 366 | } 367 | 368 | // TODO reuse arrays? 369 | function zeroArray(n) { 370 | var a = [], 371 | i = -1; 372 | while (++i < n) a[i] = 0; 373 | return a; 374 | } 375 | 376 | var cloudRadians = Math.PI / 180, 377 | cw = 1 << 11 >> 5, 378 | ch = 1 << 11, 379 | canvas, 380 | ratio = 1; 381 | 382 | if (typeof document !== "undefined") { 383 | canvas = document.createElement("canvas"); 384 | canvas.width = 1; 385 | canvas.height = 1; 386 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2); 387 | canvas.width = (cw << 5) / ratio; 388 | canvas.height = ch / ratio; 389 | } else { 390 | // node-canvas support 391 | var Canvas = require("canvas"); 392 | canvas = new Canvas(cw << 5, ch); 393 | } 394 | 395 | var c = canvas.getContext("2d"), 396 | spirals = { 397 | archimedean: archimedeanSpiral, 398 | rectangular: rectangularSpiral 399 | }; 400 | c.fillStyle = "red"; 401 | c.textAlign = "center"; 402 | 403 | exports.cloud = cloud; 404 | })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports); 405 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gbasic.py.running.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, 
processing and visualising data with Python/ex18/gmane/gbasic.py.running.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gbasic.py.running2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gbasic.py.running2.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gline.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gline.js: -------------------------------------------------------------------------------- 1 | gline = [ ['Year','umich.edu','indiana.edu','ucdavis.edu','ufp.pt','uct.ac.za','berkeley.edu','columbia.edu','etudes.org','gmail.com','mac.com'], 2 | ['2005-12',57,12,11,10,14,12,13,5,10,12], 3 | ['2006-01',93,29,28,29,25,25,22,26,16,12] 4 | ]; 5 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : 
continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 
18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 
101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = 
z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = 
".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 
101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = 
conn_1 = sqlite3.connect('mapping.sqlite')
cur_1 = conn_1.cursor()

# Domain-level address normalization (old domain -> new domain) consumed
# by fixsender() via the module-level dnsmapping dict.
cur_1.execute('''SELECT old,new FROM DNSMapping''')
for message_row in cur_1:
    dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower()

# Per-person address mapping for people who posted from several addresses.
mapping = dict()
cur_1.execute('''SELECT old,new FROM Mapping''')
for message_row in cur_1:
    old = fixsender(message_row[0])
    new = fixsender(message_row[1])
    mapping[old] = fixsender(new)

# Done with mapping.sqlite
conn_1.close()

# Open the main content read-only so we can never corrupt the raw archive.
conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True)
cur_1 = conn_1.cursor()

# Distinct, normalized sender addresses; fixsender uses this list to
# repair obscured addresses when parsing headers below.
allsenders = list()
cur_1.execute('''SELECT email FROM Messages''')
for message_row in cur_1:
    sender = fixsender(message_row[0])
    if sender is None:
        continue
    if 'gmane.org' in sender:
        continue
    if sender in allsenders:
        continue
    allsenders.append(sender)

print("Loaded allsenders", len(allsenders), "and mapping", len(mapping), "dns mapping", len(dnsmapping))

cur_1.execute('''SELECT headers, body, sent_at
    FROM Messages ORDER BY sent_at''')

# Caches of ids already inserted, to avoid a SELECT round-trip per row.
senders = dict()
subjects = dict()
guids = dict()

count = 0

for message_row in cur_1:
    hdr = message_row[0]
    parsed = parseheader(hdr, allsenders)
    if parsed is None:
        continue
    (guid, sender, subject, sent_at) = parsed

    # Apply the per-person sender mapping
    sender = mapping.get(sender, sender)

    count = count + 1
    if count % 250 == 1:
        print(count, sent_at, sender)

    # Should have been normalized away by fixsender - flag, don't abort.
    if 'gmane.org' in sender:
        print("Error in sender ===", sender)

    sender_id = senders.get(sender, None)
    subject_id = subjects.get(subject, None)

    # NOTE: 'except Exception' (not a bare 'except:') below so that
    # KeyboardInterrupt/SystemExit still abort the run.
    if sender_id is None:
        cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', (sender,))
        conn.commit()
        cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', (sender,))
        try:
            row = cur.fetchone()
            sender_id = row[0]
            senders[sender] = sender_id
        except Exception:
            print('Could not retrieve sender id', sender)
            break
    if subject_id is None:
        cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', (subject,))
        conn.commit()
        cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', (subject,))
        try:
            row = cur.fetchone()
            subject_id = row[0]
            subjects[subject] = subject_id
        except Exception:
            print('Could not retrieve subject id', subject)
            break

    # Headers/bodies are stored compressed; datetime(?) lets SQLite
    # normalize the date string so it sorts and compares correctly.
    cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
                (guid, sender_id, subject_id, sent_at,
                 zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())))
    conn.commit()
    cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', (guid,))
    try:
        row = cur.fetchone()
        message_id = row[0]
        guids[guid] = message_id
    except Exception:
        print('Could not retrieve guid id', guid)
        break

cur.close()
cur_1.close()
conn.close()
conn_1.close()
'changes', size: 28}, 10 | {text: 'problem', size: 26}, 11 | {text: 'working', size: 25}, 12 | {text: 'message', size: 25}, 13 | {text: 'into', size: 24}, 14 | {text: 'content', size: 24}, 15 | {text: 'site', size: 24}, 16 | {text: 'workspace', size: 24}, 17 | {text: 'melete', size: 24}, 18 | {text: 'course', size: 23}, 19 | {text: 'broken', size: 23}, 20 | {text: 'from', size: 23}, 21 | {text: 'password', size: 23}, 22 | {text: 'forgotten', size: 23}, 23 | {text: 'feature', size: 23}, 24 | {text: 'profile', size: 23}, 25 | {text: 'rutgers', size: 23}, 26 | {text: 'accessservlet', size: 23}, 27 | {text: 'aliases', size: 23}, 28 | {text: 'unexpectedly', size: 23}, 29 | {text: 'taken', size: 23}, 30 | {text: 'portalxlogin', size: 23}, 31 | {text: 'samigo', size: 23}, 32 | {text: 'oracle', size: 23}, 33 | {text: 'eclipse', size: 23}, 34 | {text: 'view', size: 23}, 35 | {text: 'tools', size: 23}, 36 | {text: 'update', size: 23}, 37 | {text: 'version', size: 23}, 38 | {text: 'maven', size: 22}, 39 | {text: 'email', size: 22}, 40 | {text: 'center', size: 22}, 41 | {text: 'jforum', size: 22}, 42 | {text: 'files', size: 22}, 43 | {text: 'syllabus', size: 22}, 44 | {text: 'desktop', size: 21}, 45 | {text: 'connection', size: 21}, 46 | {text: 'file', size: 21}, 47 | {text: 'worksite', size: 21}, 48 | {text: 'portal', size: 21}, 49 | {text: 'visual', size: 21}, 50 | {text: 'basic', size: 21}, 51 | {text: 'different', size: 21}, 52 | {text: 'missing', size: 21}, 53 | {text: 'upload', size: 21}, 54 | {text: 'importing', size: 21}, 55 | {text: 'option', size: 21}, 56 | {text: 'information', size: 21}, 57 | {text: 'creating', size: 21}, 58 | {text: 'staleobjectstateexception', size: 21}, 59 | {text: 'updating', size: 21}, 60 | {text: 'sakaiiframemyworkspace', size: 21}, 61 | {text: 'memory', size: 20}, 62 | {text: 'collab', size: 20}, 63 | {text: 'code', size: 20}, 64 | {text: 'section', size: 20}, 65 | {text: 'question', size: 20}, 66 | {text: 'status', size: 20}, 67 | {text: 
'production', size: 20}, 68 | {text: 'extending', size: 20}, 69 | {text: 'javaxsqlbasedatasource', size: 20}, 70 | {text: 'apis', size: 20}, 71 | {text: 'wiki', size: 20}, 72 | {text: 'using', size: 20}, 73 | {text: 'tests', size: 20}, 74 | {text: 'branch', size: 20}, 75 | {text: 'permissions', size: 20}, 76 | {text: 'support', size: 20}, 77 | {text: 'size', size: 20}, 78 | {text: 'page', size: 20}, 79 | {text: 'users', size: 20}, 80 | {text: 'sakaiperson', size: 20}, 81 | {text: 'database', size: 20}, 82 | {text: 'casfilter', size: 20}, 83 | {text: 'html', size: 20}, 84 | {text: 'editors', size: 20}, 85 | {text: 'reordering', size: 20}, 86 | {text: 'suppressing', size: 20}, 87 | {text: 'annoying', size: 20}, 88 | {text: 'macos', size: 20}, 89 | {text: 'limit', size: 20}, 90 | {text: 'exceeded', size: 20}, 91 | {text: 'without', size: 20}, 92 | {text: 'uploading', size: 20}, 93 | {text: 'documentation', size: 20}, 94 | {text: 'provider', size: 20}, 95 | {text: 'cannot', size: 20}, 96 | {text: 'development', size: 20}, 97 | {text: 'sakaiscript', size: 20}, 98 | {text: 'again', size: 20}, 99 | {text: 'assigning', size: 20}, 100 | {text: 'quota', size: 20} 101 | ]; 102 | -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = 
import sqlite3
import time
import zlib
import string

# Build a word cloud (gword.js, rendered by gword.htm) from the subject
# lines stored in index.sqlite.

conn = sqlite3.connect('index.sqlite')
cur = conn.cursor()

# subject id -> subject text
cur.execute('SELECT id, subject FROM Subjects')
subjects = dict()
for message_row in cur:
    subjects[message_row[0]] = message_row[1]

# Count words of 4+ characters across every message's subject line.
cur.execute('SELECT subject_id FROM Messages')
counts = dict()
for message_row in cur:
    text = subjects[message_row[0]]
    # Strip punctuation and digits before splitting into words.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', '1234567890'))
    text = text.strip()
    text = text.lower()
    words = text.split()
    for word in words:
        if len(word) < 4:
            continue
        counts[word] = counts.get(word, 0) + 1

# Top 100 words by frequency, plus the count range for font scaling.
x = sorted(counts, key=counts.get, reverse=True)
highest = None
lowest = None
for k in x[:100]:
    if highest is None or highest < counts[k]:
        highest = counts[k]
    if lowest is None or lowest > counts[k]:
        lowest = counts[k]
print('Range of counts:', highest, lowest)

# Spread the font sizes across 20-100 based on the count
bigsize = 80
smallsize = 20

# Guard the denominator: when every top word has the same count the
# original divided by zero.
spread = float(highest - lowest) if highest != lowest else 1.0

with open('gword.js', 'w') as fhand:
    fhand.write("gword = [")
    first = True
    for k in x[:100]:
        if not first:
            fhand.write(",\n")
        first = False
        size = counts[k]
        size = (size - lowest) / spread
        size = int((size * bigsize) + smallsize)
        fhand.write("{text: '"+k+"', size: "+str(size)+"}")
    fhand.write("\n];\n")

print("Output written to gword.js")
print("Open gword.htm in a browser to see the vizualization")
import sqlite3
import time
import urllib.request, urllib.parse, urllib.error
import zlib

# Aggregate message counts for the top-10 sending organizations, bucketed
# by year, and emit them as a chart table into gline.js.

conn = sqlite3.connect('index.sqlite')
cur = conn.cursor()

# sender id -> e-mail address
cur.execute('SELECT id, sender FROM Senders')
senders = dict()
for message_row in cur:
    senders[message_row[0]] = message_row[1]

# message id -> (guid, sender_id, subject_id, sent_at)
cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
messages = dict()
for message_row in cur:
    messages[message_row[0]] = (message_row[1], message_row[2], message_row[3], message_row[4])

print("Loaded messages=", len(messages), "senders=", len(senders))

# Tally messages per sending domain (the part after '@').
sendorgs = dict()
for message_id, message in messages.items():
    sender = message[1]
    pieces = senders[sender].split("@")
    if len(pieces) != 2:
        continue
    dns = pieces[1]
    sendorgs[dns] = sendorgs.get(dns, 0) + 1

# pick the top schools
orgs = sorted(sendorgs, key=sendorgs.get, reverse=True)
orgs = orgs[:10]
print("Top 10 Oranizations")
print(orgs)

# Count messages per (year, org).  sent_at was stored via SQLite's
# datetime(), so its first four characters are the year.
counts = dict()
years = list()
for message_id, message in messages.items():
    sender = message[1]
    pieces = senders[sender].split("@")
    if len(pieces) != 2:
        continue
    dns = pieces[1]
    if dns not in orgs:
        continue
    year = message[3][:4]
    if year not in years:
        years.append(year)
    counts[(year, dns)] = counts.get((year, dns), 0) + 1
    counts[(year, 'total')] = counts.get((year, 'total'), 0) + 1

years.sort()

with open('gline.js', 'w') as fhand:
    # Header row: 'Year' followed by one column per organization.
    fhand.write("gline = [ ['Year'")
    for org in orgs:
        fhand.write(",'" + org + "'")
    fhand.write("]")

    # Skip the first and last buckets - they are usually partial years.
    for year in years[1:-1]:
        fhand.write(",\n['" + year + "'")
        for org in orgs:
            val = counts.get((year, org), 0)
            fhand.write("," + str(val))
        fhand.write("]")

    fhand.write("\n];\n")

print("Output written to gline.js")
print("Open gline.htm to visualize the data")
https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/index.sqlite -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/index.sqlite.second.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/index.sqlite.second.jpg -------------------------------------------------------------------------------- /Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Course 5 - Capstone - Retrieving, processing and visualising data with Python/ex18/gmane/mapping.sqlite -------------------------------------------------------------------------------- /Python for Everybody.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thmstm/py4e/1ac22d81d43c30f09afff4ad0b53296f3bcf45f1/Python for Everybody.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py4e 2 | Coursera - Python for Everybody codes 3 | https://www.coursera.org/specializations/python 4 | --------------------------------------------------------------------------------