├── .gitignore ├── Makefile ├── README.md ├── all_indexes.sql ├── bin ├── build.py ├── exercises.py └── fetch.py ├── exercises.rst ├── model.py ├── notes.org ├── problem.txt ├── slides.txt └── ugly_and_not.py /.gitignore: -------------------------------------------------------------------------------- 1 | cache 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | tmp/exercises.pdf: tmp/exercises.dvi 4 | cd tmp && dvipdf exercises.dvi 5 | 6 | tmp/exercises.dvi: tmp/exercises.latex 7 | cd tmp && latex exercises.latex 8 | 9 | tmp/exercises.latex: exercises.rst tmp 10 | rst2latex.py --documentoptions=letterpaper exercises.rst > tmp/exercises.latex 11 | 12 | tmp: 13 | mkdir tmp 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | SQL for Python Developers 3 | ========================= 4 | 5 | This repository contains all of the materials 6 | that go with my 2012 PyCon tutorial “SQL For Python Developers” 7 | which is on YouTube: 8 | 9 | https://www.youtube.com/watch?v=Thd8yoBou7k 10 | 11 | The movie database used in the tutorial was assembled from the 12 | [IMDB’s non-commercial data files](https://developer.imdb.com/non-commercial-datasets/) 13 | using scripts in the `bin/` directory, 14 | but I haven’t tested whether they still work. 15 | -------------------------------------------------------------------------------- /all_indexes.sql: -------------------------------------------------------------------------------- 1 | -- Create indexes that support searches and joins from all directions. 2 | 3 | create index idx_movie_title_binary on movie(title COLLATE BINARY); 4 | create index idx_movie_title_nocase on movie(title COLLATE NOCASE); 5 | create index idx_movie_year on movie(year); 6 | 7 | create index idx_actor_name_binary on actor(name COLLATE BINARY); 8 | create index idx_actor_name_nocase on actor(name COLLATE NOCASE); 9 | 10 | create index idx_role_movie on role(movie_id); 11 | create index idx_role_actor on role(actor_id); 12 | create index idx_role_name_binary on role(name COLLATE BINARY); 13 | create index idx_role_name_nocase on role(name COLLATE NOCASE); 14 | -------------------------------------------------------------------------------- /bin/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | POSTGRES=False or True 4 | 5 | import gzip 6 | import os 7 | import re 8 | import sqlite3 9 | import sys 10 | 11 | if POSTGRES: 12 | import psycopg2 13 | 14 | line_re = re.compile(r''' 15 | (?P[^\t]+)? 16 | \t+ 17 | (?P")?(?P[^"]+?)"? 18 | \ +\((?P[\d?]{4})(/(?P[IVX]+))?\) 19 | (\ +\{(?P[^}]+)\})? 20 | (\ *( 21 | \((?Pas\ [^)]+( \([IV]+\))?)\)\)? 22 | | \((?Palso\ as\ [^)]+)\) 23 | | \((?Pattached)\) 24 | | \((?P.*archive\ footage[^)]*)\) 25 | | \((?P.*archive\ sound[^)]*)\) 26 | | (?P7|30) 27 | | (?PCompound\ Guard\ \#3) 28 | | \((?Pcredit\ only[^)]*)\) 29 | | \((?Pfrom [^)]*)\) 30 | | (?PHunter\ \#2) 31 | | \((?Pin\ talks|in\ negotiations)\) 32 | | \((?P(\d+\ )?episode[^)]*)\) 33 | | \((?PV)\) 34 | | \((?Prumored)\) 35 | | \((?Pscenes\ deleted)\) 36 | | \((?Psegment [^)]*)\) 37 | | \((?Psinging\ voice[^)]*)\) 38 | | \((?Psong:? [^)]+)\) 39 | | \((?Psongs?)\) 40 | | \{\{(?PSUSPENDED)\}\} 41 | | \((?PTV)\) 42 | | \((?Punconfirmed)\) 43 | | \((?Puncredited[^)]*)\)\ ?\)? 44 | | \((?PVG)\) 45 | | \(?(?Pvoice[^)]*)\) 46 | | \(?(?P;\ \(voice\))\) 47 | | \((?P\d\d\d\d\ scenes)\) 48 | | \((?P\d\d\d\d\ ?-?\ ?[?\d]?[?\d]?[?\d]?[?\d]?)\) 49 | | Chris 50 | ))* 51 | (\ +\[(?P.+?)\] )? 52 | (\ +<(?P\d+)>)? 53 | \n$ 54 | ''', re.X) 55 | 56 | def import_actors(committer, db, filename, gender): 57 | 58 | lines = iter(gzip.open(filename)) 59 | 60 | while not next(lines).startswith('Name'): 61 | continue 62 | 63 | next(lines) # skip the title underlines 64 | 65 | for line in lines: 66 | 67 | if line.startswith('-----------------------'): 68 | break 69 | if line == '\n': 70 | continue 71 | line = line.decode('latin-1') 72 | 73 | m = line_re.match(line) 74 | if m is None: 75 | print 'Could not parse %r' % line 76 | continue 77 | 78 | g = m.groupdict() 79 | 80 | if g['actor_name']: 81 | actor = g['actor_name'] 82 | 83 | if (g['archive_footage'] 84 | or g['archive_sound'] 85 | or g['credit_only'] 86 | or g['episode'] 87 | or g['in_talks'] 88 | or g['is_televsion'] 89 | or g['rumored'] 90 | or g['scenes_deleted'] 91 | or g['song'] 92 | or g['songs'] 93 | or g['suspended'] 94 | or g['television'] 95 | or g['unconfirmed'] 96 | or g['uncredited'] 97 | or g['videogame'] 98 | ): 99 | continue 100 | 101 | nth = g['nth_movie_that_year'] or '' 102 | made_for_video = bool(g['made_for_video']) 103 | 104 | if POSTGRES: 105 | s = 'INSERT INTO actor_title_role VALUES (%s,%s,%s,%s,%s,%s,%s)' 106 | if g['year'] == '????': 107 | g['year'] = None 108 | else: 109 | s = 'INSERT INTO actor_title_role VALUES (?, ?, ?, ?, ?, ?, ?)' 110 | db.execute(s, 111 | (actor, gender, 112 | g['movie_title'], g['year'], nth, made_for_video, 113 | g['role'])) 114 | 115 | committer.commit() 116 | 117 | if __name__ == '__main__': 118 | if POSTGRES: 119 | connection = psycopg2.connect('dbname=movie') 120 | db = connection.cursor() 121 | else: 122 | if os.path.exists('movie.db'): 123 | print 'Error: database already exists' 124 | sys.exit(1) 125 | connection = db = sqlite3.connect('movie.db') 126 | db.execute(''' 127 | CREATE TABLE actor_title_role ( 128 | actor_name TEXT, gender TEXT, 129 | title TEXT, year INTEGER, nth TEXT, for_video BOOLEAN, 130 | role_name TEXT 131 | ); 132 | ''') 133 | import_actors(connection, db, 'cache/actors.list.gz', 'm') 134 | import_actors(connection, db, 'cache/actresses.list.gz', 'f') 135 | 136 | for cmd in ''' 137 | 138 | CREATE TABLE movie ( 139 | id SERIAL, title TEXT, year INTEGER, nth TEXT, for_video BOOLEAN 140 | ); 141 | CREATE TABLE actor ( 142 | id SERIAL, name TEXT, gender TEXT 143 | ); 144 | CREATE TABLE role ( 145 | movie_id INTEGER, actor_id INTEGER, name TEXT 146 | ); 147 | 148 | INSERT INTO movie (title, year, nth, for_video) 149 | SELECT DISTINCT title, year, nth, for_video 150 | FROM actor_title_role; 151 | 152 | INSERT INTO actor (name, gender) 153 | SELECT DISTINCT actor_name, gender 154 | FROM actor_title_role; 155 | 156 | CREATE INDEX tmp1 ON movie (title, year); 157 | CREATE INDEX tmp2 ON actor (name); 158 | 159 | INSERT INTO role (movie_id, actor_id, name) 160 | SELECT movie.id, actor.id, role_name 161 | FROM actor_title_role 162 | JOIN movie ON ( 163 | movie.title = actor_title_role.title AND 164 | movie.year = actor_title_role.year AND 165 | movie.nth = actor_title_role.nth 166 | ) 167 | JOIN actor ON ( 168 | actor.name = actor_title_role.actor_name AND 169 | actor.gender = actor_title_role.gender 170 | ); 171 | 172 | DROP TABLE actor_title_role; 173 | DROP INDEX tmp1; 174 | DROP INDEX tmp2; 175 | 176 | VACUUM; 177 | 178 | '''.split(';'): 179 | db.execute(cmd) 180 | connection.commit() 181 | 182 | # CREATE INDEX role_unique ON role (role, movie_id, actor_id); 183 | # TODO: why is there a movie from year 7? Fix my RE. 184 | -------------------------------------------------------------------------------- /bin/exercises.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from unittest import TestCase 4 | 5 | class Case(TestCase): 6 | def runTest(self): 7 | pass 8 | 9 | case = Case() 10 | 11 | if __name__ == '__main__': 12 | import sqlite3 13 | db = sqlite3.connect('movie.db') 14 | c = db.cursor() 15 | with open('exercise1.sql') as f: 16 | command = f.read() 17 | c.execute(command) 18 | result = c.fetchall() 19 | # "How many movies are in the database?" 20 | case.assertEqual(result, [(530257,)]) 21 | -------------------------------------------------------------------------------- /bin/fetch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import shutil 5 | from urllib2 import urlopen 6 | 7 | ftp = 'ftp://ftp.fu-berlin.de/pub/misc/movies/database/' 8 | 9 | if __name__ == '__main__': 10 | for filename in 'actors.list.gz', 'actresses.list.gz', 'movies.list.gz': 11 | shutil.copyfileobj( 12 | urlopen(ftp + filename), 13 | open(os.path.join('cache', filename), 'w'), 14 | ) 15 | -------------------------------------------------------------------------------- /exercises.rst: -------------------------------------------------------------------------------- 1 | 2 | ===================================== 3 | First Steps: SQL Tutorial Exercises 4 | ===================================== 5 | 6 | :PyCon: 2012 7 | :Where: Santa Clara, California 8 | :Who: Brandon Rhodes 9 | 10 | 11 | | *Assumption #1: You know Python* 12 | | *Assumption #2: You want to learn SQL* 13 | 14 | ------------------------------------------------------------------------ 15 | 16 | Learning the SELECT statement 17 | ============================= 18 | 19 | 1. Getting our feet wet 20 | ----------------------- 21 | 22 | .. Introduce the idea of a table with “.table” and “.schema”, using both 23 | the terminology “column/row” and “record/field.” Mention that 24 | migration is problematic. Show them SELECT, SELECT-WHERE, COUNT(*), 25 | LIMIT, and the operators “=”, “<>”, “[NOT] LIKE”, “[NOT] GLOB”, 26 | “AND”, and “OR”. 27 | 28 | a. In what year was “Double Indemnity” released? 29 | 30 | .. SELECT year FROM movie WHERE title = 'Double Indemnity'; 31 | 32 | b. How many movies were released in 1929? What about 1930? 1931? 33 | 34 | .. SELECT COUNT(*) FROM movie WHERE year = 1929; 35 | SELECT COUNT(*) FROM movie WHERE year = 1930; 36 | SELECT COUNT(*) FROM movie WHERE year = 1931; 37 | 38 | c. How many movies ever have been released to theaters, and how many 39 | were instead released direct-to-video? 40 | 41 | .. SELECT COUNT(*) FROM movie WHERE for_video = 0; 42 | SELECT COUNT(*) FROM movie WHERE for_video = 1; 43 | 44 | d. … what about movies with “Yoga” in their title? 45 | 46 | .. SELECT COUNT(*) FROM movie WHERE title LIKE '%yoga%' AND for_video = 0; 47 | SELECT COUNT(*) FROM movie WHERE title LIKE '%yoga%' AND for_video = 1; 48 | 49 | e. Draw a two-circle Venn diagram showing how many movies have the word 50 | “life” in their title and how many include the word “death.” 51 | 52 | .. SELECT COUNT(*) FROM movie 53 | WHERE title LIKE '%life%' AND title LIKE '%death%'; 54 | SELECT COUNT(*) FROM movie 55 | WHERE title NOT LIKE '%life%' AND title LIKE'%death%'; 56 | SELECT COUNT(*) FROM movie 57 | WHERE title LIKE '%life%' AND title NOT LIKE '%death%'; 58 | 59 | 2. Putting things well in order 60 | ------------------------------- 61 | 62 | .. Show how ORDER BY can be used with table column names and with 63 | expressions. Explain that SELECT and, thus, LIMIT is normally random 64 | in its delivery of rows, but that ORDER can make them stable, and 65 | thus make OFFSET interesting for paging through data. 66 | 67 | a. What was the year that the very first movies were released, and what 68 | were their titles? 69 | 70 | .. SELECT * FROM movie ORDER BY year ASC LIMIT 10; 71 | 72 | b. What movie has the longest title ever? 73 | 74 | .. SELECT * FROM movie ORDER BY LENGTH(title) DESC LIMIT 3; 75 | or, for clarity, and to lead into next topic: 76 | SELECT LENGTH(title), * FROM movie ORDER BY 1 DESC LIMIT 3; 77 | 78 | c. ... which actor or actress? 79 | 80 | .. SELECT * FROM actor ORDER BY LENGTH(name) DESC LIMIT 3; 81 | 82 | d. Which movie(s), not counting movies whose titles start with 83 | punctuation, come first in alphabetical order, and which come last? 84 | 85 | .. SELECT * FROM movie WHERE title GLOB 'A*' ORDER BY title ASC LIMIT 10; 86 | SELECT * FROM movie WHERE title GLOB 'Z*' ORDER BY title DESC LIMIT 10; 87 | SELECT * FROM movie WHERE title GLOB 'ZZ*' ORDER BY title DESC LIMIT 10; 88 | 89 | e. If someone searched your movie database for movies with “gun” in 90 | their title and you displayed them in the order they were released, 91 | which movies would be on the second page of 10 results each? 92 | 93 | .. SELECT * FROM movie WHERE title LIKE '%gun%' 94 | ORDER BY year LIMIT 10 OFFSET 10; 95 | 96 | 3. Making distinctions 97 | ---------------------- 98 | 99 | .. Now replace the “*” in SELECT with explicit field names. Show that 100 | because we now get less information, running DISTINCT on the output 101 | gives us a smaller result set. 102 | 103 | a. Create a query which, as a sanity check, outputs the two genders 104 | represented in the ``actor`` table. 105 | 106 | .. SELECT DISTINCT gender FROM actor; 107 | 108 | b. Which years in human history have seen at least one movie released? 109 | 110 | .. SELECT DISTINCT year FROM movie; 111 | 112 | c. Produce a list of role names, with each name appearing only once, 113 | which start with the word “Bystander.” 114 | 115 | .. SELECT DISTINCT role FROM role WHERE role LIKE 'bystander%'; 116 | 117 | 4. Counting groups 118 | ------------------ 119 | 120 | .. Introduce GROUP BY and show how it can produce multiple rows, all of 121 | which have a COUNT(*) summary. Also show SUM(). 122 | 123 | a. Write a single query that shows how many men, and how many women, 124 | have ever had roles in film. 125 | 126 | .. SELECT gender, count(*) FROM actor GROUP BY 1; 127 | 128 | b. What was the average length of a film's name in 1905? …in 1990? 129 | 130 | .. SELECT sum(length(title)) / count(*) FROM movie 131 | WHERE year = 1990; 132 | 133 | c. What is the most common movie name ever? 134 | 135 | .. SELECT count(*), title FROM movie GROUP BY 2 ORDER BY 1 DESC LIMIT 10; 136 | 137 | d. … character name ever? 138 | 139 | .. SELECT count(*), role FROM role GROUP BY 2 ORDER BY 1 DESC LIMIT 10; 140 | 141 | 5. Joining up 142 | ------------- 143 | 144 | .. Show how JOIN lets you create an N×M table that combines two real 145 | tables, but how a WHERE clause can reduce the N×M to an interesting 146 | set of rows. Note that field names can (and, in real life, sometimes 147 | must) now be qualified with their table name. 148 | 149 | a. Which movies have featured a character named “King Arthur”? 150 | 151 | .. SELECT * FROM movie JOIN role ON (movie.id = movie_id) 152 | WHERE role = 'King Arthur'; 153 | 154 | b. Which actors have played a character named “King Arthur”? 155 | 156 | .. SELECT * FROM actor JOIN role ON (actor.id = actor_id) 157 | WHERE role = 'King Arthur'; 158 | 159 | c. Which movie had the largest cast ever? 160 | 161 | .. SELECT COUNT(*), title, movie.id 162 | FROM movie JOIN role ON (movie.id = movie_id) 163 | GROUP BY 2 ORDER BY 1 DESC LIMIT 10; 164 | 165 | d. Which 12 actors hold the record for being credited in the most 166 | movies? 167 | 168 | .. SELECT COUNT(*), name 169 | FROM actor JOIN role ON (actor.id = actor_id) 170 | WHERE role <> '' 171 | GROUP BY 2 ORDER BY 1 DESC LIMIT 10; 172 | 173 | e. Which actors have most often reprised the same role, bringing it back 174 | in movie after movie? 175 | 176 | .. SELECT count(*), name, role FROM actor 177 | JOIN role ON (actor.id = actor_id) 178 | WHERE role <> '' 179 | GROUP BY 2, 3 ORDER BY 1 DESC LIMIT 10; 180 | 181 | 6. What will you be having? 182 | --------------------------- 183 | 184 | .. Explain, finally, how HAVING filters rows after aggregation has taken 185 | .. place. Show how an alias lets you name an aggregate column for 186 | .. easier use in the HAVING clause. 187 | 188 | a. Produce a list of actors who have played exactly 99 roles. 189 | 190 | .. SELECT COUNT(*) AS role_count, actor_id, name 191 | FROM actor JOIN role ON (actor.id = actor_id) 192 | WHERE role <> '' 193 | GROUP BY 2, 3 194 | HAVING role_count = 99; 195 | 196 | b. Produce a list of actors who have been in exactly 99 films. 197 | 198 | .. SELECT COUNT(DISTINCT movie_id) AS movie_count, 199 | actor_id, name 200 | FROM actor 201 | JOIN role ON (actor.id = actor_id) 202 | WHERE role <> '' 203 | GROUP BY 2, 3 204 | HAVING movie_count = 99; 205 | 206 | c. In which years was the average length of a film title greater than 20 207 | characters? 208 | 209 | .. SELECT year, AVG(LENGTH(title)) AS average 210 | FROM movie GROUP BY 1 HAVING average > 20; 211 | 212 | Further topics 213 | -------------- 214 | 215 | | Subqueries 216 | | Inserting and updating data 217 | | Transactions 218 | 219 | | *Break* 220 | 221 | | Indexes and performance 222 | | The DB-API and batch operations 223 | | The SQLAlchemy ORM, and others 224 | 225 | Quick Reference 226 | =============== 227 | 228 | :: 229 | 230 | Table Result 231 | ----- ------ 232 | 233 | row x 234 | row row row -> row row -> row A 235 | row x 236 | row x 237 | row row row -> row row -> row B 238 | row row row / 239 | row x 240 | row row row \ 241 | row row row -> row x 242 | row x 243 | 244 | WHERE GROUP BY HAVING ORDER BY 245 | 246 | 247 | The chart above is designed to help you remember 248 | the order in which the major operations of a SELECT take place. 249 | The “paging” restrictions LIMIT and OFFSET occur last, 250 | after all of the steps above have already taken place. 251 | 252 | The SQL language supports several basic expressions. 253 | Several that you will be using in this tutorial are:: 254 | 255 | a + b, a - b, a * b, a / b, et cetera 256 | a = b, a < b, a > b, a <> b 257 | COUNT(), SUM() 258 | LENGTH(string) 259 | string [NOT] LIKE '%case insensitive pattern%' 260 | string [NOT] GLOB 'Case sensitive pattern*' 261 | cond1 AND cond2 262 | cond1 OR cond2 263 | 264 | The basic CRUD (create, read, update, delete) operations are:: 265 | 266 | 1. INSERT INTO table VALUES (a, b, ...); 267 | INSERT INTO table SELECT ...; 268 | 2. SELECT expr, expr, ... FROM table JOIN table ... WHERE ...; 269 | 3. UPDATE table SET field = value, ... WHERE ...; 270 | 4. DELETE FROM table WHERE ...; 271 | 272 | .. 1. The Basics 273 | .. ------------- 274 | 275 | .. * CREATE TABLE 276 | .. * DROP TABLE 277 | .. * CRUD operations: insert, select, update, delete 278 | .. * INSERT 279 | .. * UPDATE 280 | .. * DELETE 281 | .. * Batching operations 282 | 283 | .. 2. Relational Algebra 284 | .. --------------------- 285 | 286 | .. * FOREIGN KEY 287 | .. * PRIMARY KEY 288 | 289 | .. 3. Indexing 290 | .. ----------- 291 | 292 | .. * DB-API 293 | .. * CREATE INDEX 294 | .. * DROP INDEX 295 | .. * Speed of inserting with index vs creating index afterward 296 | 297 | .. 4. Transactions 298 | .. --------------- 299 | 300 | .. * Consistency models 301 | .. * BEGIN 302 | .. * COMMIT 303 | .. * ROLLBACK 304 | .. * CREATE TEMPORARY TABLE 305 | 306 | .. 5. Aggregation 307 | .. -------------- 308 | 309 | .. * HAVING 310 | .. * GROUP BY 311 | .. * OFFSET / LIMIT 312 | .. * SELECT DISTINCT is like GROUP BY but lacks ability to compute SUM() etc 313 | 314 | .. 6. ORMs 315 | .. ------- 316 | 317 | .. * Models 318 | .. * Relations 319 | .. * Lazy vs eager loading 320 | .. * Units of work 321 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Column, ForeignKey 2 | from sqlalchemy import Integer, String, Boolean 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy.orm import relationship, backref 5 | from sqlalchemy.ext.declarative import declarative_base 6 | 7 | Base = declarative_base() 8 | 9 | class Movie(Base): 10 | __tablename__ = 'movie' 11 | 12 | id = Column(Integer, primary_key=True) 13 | title = Column(String) 14 | year = Column(Integer) 15 | nth = Column(String) 16 | for_video = Column(Boolean) 17 | 18 | def __repr__(self): 19 | return '' % ( 20 | self.title, 21 | (' (%s)' % self.nth if self.nth else ''), 22 | self.year, 23 | (' (V)' if self.for_video else '') 24 | ) 25 | 26 | class Actor(Base): 27 | __tablename__ = 'actor' 28 | 29 | id = Column(Integer, primary_key=True) 30 | name = Column(String) 31 | gender = Column(String) 32 | 33 | def __repr__(self): 34 | suffix = 'or' if self.gender == 'm' else 'ress' 35 | return '' % (suffix, self.name) 36 | 37 | class Role(Base): 38 | __tablename__ = 'role' 39 | 40 | movie_id = Column(Integer, ForeignKey('movie.id'), 41 | primary_key=True) 42 | actor_id = Column(Integer, ForeignKey('actor.id'), 43 | primary_key=True) 44 | name = Column(String, primary_key=True) 45 | 46 | movie = relationship('Movie', backref=backref('roles')) 47 | actor = relationship('Actor', backref=backref('roles')) 48 | 49 | def __repr__(self): 50 | return '' % (self.name) 51 | 52 | def make_session(echo=True): 53 | engine = create_engine('sqlite:///movie.db', echo=echo) 54 | Session = sessionmaker(bind=engine) 55 | return Session() 56 | 57 | if __name__ == '__main__': 58 | session = make_session() 59 | for movie in session.query(Movie).filter_by( 60 | title='Hamlet'): 61 | print movie 62 | for role in movie.roles: 63 | print ' ', role 64 | -------------------------------------------------------------------------------- /notes.org: -------------------------------------------------------------------------------- 1 | 2 | * Overview 3 | ** Data modeling - see slides 4 | *** MongoDB: Roles and actors inline 5 | *** MongoDB: Roles with actor_ids inline 6 | *** SQL: Binary relation between movie and actor 7 | 8 | * Querying 9 | ** Show them exploratory ".tables" ".schema" 10 | ** Show them "* LIMIT 3" with ".mode column" ".header on" 11 | ** Go through exercises 12 | *** Have them do the exercises 13 | *** Review 14 | *** Talk about performance 15 | *** Because whole DB cannot be sorted, we need indexes; add them 16 | *** When ANALYZE comes up, DROP TABLE sqlite_stat1 or sqlite_stat3 17 | ** Mention plural vs singular once they have seen examples 18 | 19 | * JOIN 20 | ** Movies <= 1899, with # of actors 21 | Need LEFT OUTER JOIN 22 | ** Query: movies <= 1899, with their most popular actor 23 | Solution is to create a *subquery* 24 | ** LEFT OUTER JOIN and importance of where criteria go 25 | select * from movie join role on (movie.id = movie_id) 26 | where title = 'Happiness' 27 | and role.name like '%Himself%'; 28 | select * from movie left outer join role 29 | on (movie.id = movie_id) 30 | where title = 'Happiness' 31 | and role.name like '%Himself%'; 32 | select * from movie left outer join role on 33 | (movie.id = movie_id and role.name like '%Himself%') 34 | where title = 'Happiness'; 35 | * Subquery 36 | ** List of actors and their most recent movie "<=2011" 37 | * Different conventions of how to name ids and fields 38 | * raw data structures : objects :: DB-API : ORM 39 | 40 | * ugly_and_not.py 41 | ** start by doing problem in pure SQL 42 | ** compare multi-SELECT approach to JOIN, talk about difference 43 | 44 | * ugly_and_not.py ugly 45 | ** add another level that prints out actor names 46 | ** toggle echo=True and complain about number of queries 47 | ** replace actor-name query with a JOIN 48 | ** change to Dr. Strangelove and show that actor appears twice 49 | ** adjust iterator over JOIN so it only prints actor name once 50 | 51 | * ugly_and_not.py pretty1 52 | ** show the pretty syntax 53 | ** toggle echo=True and complain about number of queries 54 | ** admit that I try to misuse query() by naming the collection 55 | ** move to pretty2 to show how it is really done 56 | 57 | * creating data 58 | ** CREATE TABLE, DROP TABLE 59 | ** INSERT, INSERT-SELECT, concatenated INSERTs for performance 60 | ** UPDATE, DELETE 61 | 62 | * transactions 63 | ** show how inserting a movie at one prompt is not shown in the other 64 | ** but the prompt that created the movie can see it 65 | ** auto-commit vs auto-begin 66 | ** discuss append-only model; old rows are discarded not overwritten 67 | ** consistency models: read-committed vs serializable 68 | ** rebuild a table with BEGIN; DROP TABLE; CREATE TABLE; INSERT; COMMIT 69 | 70 | * temporary tables 71 | 72 | * thinking about performance 73 | ** Gustavo Duarte 74 | ** disk heads moving between index and table, requests queuing up 75 | ** logging to a different device than your database 76 | 77 | -------------------------------------------------------------------------------- /problem.txt: -------------------------------------------------------------------------------- 1 | sqlite> SELECT movie.title, movie.year, role.name, actor.name FROM movie 2 | JOIN role ON (movie.id = role.movie_id) JOIN actor ON (role.actor_id = 3 | actor.id) WHERE movie.title = 'Batman' AND role.name = 'Batman/Bruce 4 | Wayne'; 5 | -------------------------------------------------------------------------------- /slides.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | The Structured Query Language (SQL) 8 | for Python Developers 9 | 10 | 11 | 12 | 13 | 14 | 15 | PyCon 2012 16 | Brandon Rhodes 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | Files 25 | ————— 26 | 27 | • movie.db database 28 | • virtualenv tool 29 | • SQLAlchemy package 30 | • SQLAlchemy documentation 31 | • SQLite3 documentation 32 | • model.py script 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | Principles 43 | —————————— 44 | 45 | • The use of real tools, not just slides 46 | • Seeing me recover from mistakes 47 | • Big picture, not stuff you can look up 48 | • Ask questions! 49 | 50 | 51 | 52 | The Big Picture 53 | ——————————————— 54 | 55 | Python vs SQL: Statements — procedures vs queries 56 | Flow — explicit vs implicit 57 | Variables — objects vs tables 58 | Data types — containers vs relations 59 | Consistency — locks vs transactions 60 | 61 | Query optimization, both single-table and JOIN 62 | 63 | DB-API: Quoting and security 64 | Combining statements for efficiency 65 | 66 | ORM: "Object-Relational Mapper" as persistence 67 | Objects are prettier than the DB-API 68 | Using JOIN to expose your query 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | The Gist 77 | ———————— 78 | 79 | 80 | • One container: TABLE 81 | • One statement: INSERT 82 | • One abstraction: JOIN 83 | 84 | 85 | Why? 86 | 87 | it scales 88 | 89 | 90 | 91 | Another approach: MongoDB 92 | 93 | {title: "Sleuth", 94 | year: 1972, 95 | roles: [ 96 | {name: "Andrew Wyke", 97 | actor: {name: "Laurence Olivier", born: 1907} 98 | }, 99 | {name: "Milo Tindle", 100 | actor: {name: "Michael Caine", born: 1933} 101 | }]} 102 | 103 | {title: "Sleuth", 104 | year: 1972, 105 | roles: [{name: "Andrew Wyke", actor: 1145125}, 106 | {name: "Milo Tindle", actor: 221217} 107 | ]} 108 | 109 | 110 | Role 111 | ┌────────┬────────┬─────────────┐ 112 | │movie_id│actor_id│ name │ 113 | │ . │ . │ . │ 114 | │ 391853│ 1145125│ Andrew Wyke │ 115 | │ . │ . │ . │ 116 | │ 391853│ 221217│ Milo Tindle │ 117 | │ . │ . │ . │ 118 | └────────┴────────┴─────────────┘ 119 | Movie Actor 120 | ┌──────┬──────┬────┐ ┌───────┬──────────────────┬──────┐ 121 | │ id │ name │year│ │ id │ name │gender│ 122 | │ . │ . │ │ │ . │ . │ . │ 123 | │391853│Sleuth│1972│ │ 221217│Caine, Michael (I)│ m │ 124 | │ . │ . │ │ │ . │ . │ . │ 125 | └──────┴──────┴────┘ │1145125│Olivier, Laurence │ m │ 126 | │ . │ . │ . │ 127 | └───────┴──────────────────┴──────┘ 128 | 129 | 130 | 131 | SELECT * FROM table; 132 | SELECT * FROM table LIMIT 3; 133 | SELECT * FROM table WHERE column = value; 134 | SELECT * FROM table WHERE column < value OR column > value; 135 | SELECT * FROM table WHERE column <> value; -- does not eq 136 | SELECT * FROM table ORDER BY column ASC LIMIT 5; 137 | SELECT * FROM table ORDER BY length(column) DESC LIMIT 7; 138 | SELECT column, length(column) FROM table ...; 139 | SELECT DISTINCT gender FROM actor; 140 | 141 | 142 | 143 | ┌┐ ↘ ┌┐ ⎫┌┐ ┌┐ 144 | ├┤ ↗ └┘ ⎭└┘ → └┘ 145 | ├┤ ┌┐ 146 | ├┤ → ├┤ ┌┐ ⎫ 147 | ├┤ └┘ ↘ ├┤ ⎪ ↘ ┌┐ 148 | ├┤ ↗ ├┤ ⎪┌┐ ↗ ├┤ 149 | ├┤ → ┌┐ ↘ ├┤ ⎪└┘ ↘ └┘ 150 | ├┤ └┘ └┘ ⎭ LIMIT 151 | ├┤ WHERE GROUP BY HAVING ORDER BY OFFSET 152 | ├┤ 153 | ├┤ → ┌┐ ┌┐ ⎫ 154 | ├┤ ├┤ ↗ ├┤ ⎪┌┐ ┌┐ 155 | ├┤ └┘ ↘ ├┤ ⎪└┘ → └┘ 156 | ├┤ → ┌┐ ↗ └┘ ⎭ 157 | ├┤ └┘ 158 | ├┤ ┌┐ 159 | └┘ └┘ 160 | 161 | 162 | 163 | EXPLAIN QUERY PLAN SELECT * FROM movie JOIN role ON 164 | (movie.id = movie_id) JOIN actor ON (actor_id = actor.id) 165 | WHERE movie.year >= 1920 AND movie.year <= 1921 AND 166 | actor.name >= 'Smith' AND actor.name <= 'Smitha'; 167 | 168 | CREATE INDEX i1 ON movie (year); 169 | CREATE INDEX i2 ON actor (name); 170 | 171 | (more explains) 172 | 173 | CREATE INDEX i3 ON role (movie_id); 174 | CREATE INDEX i4 ON role (actor_id); 175 | 176 | (more explains) 177 | 178 | ANALYZE; 179 | 180 | Local variables: 181 | mode:text 182 | mode:page 183 | fill-column:59 184 | End: 185 | -------------------------------------------------------------------------------- /ugly_and_not.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import sys 3 | from sqlalchemy.orm import joinedload, subqueryload 4 | from model import Movie, Role, Actor, make_session 5 | 6 | if __name__ == '__main__' and sys.argv[-1] == 'ugly': 7 | 8 | connection = sqlite3.connect('movie.db') 9 | connection.row_factory = sqlite3.Row 10 | cursor = connection.cursor() 11 | title = 'Inception' 12 | 13 | cursor.execute("SELECT * FROM movie WHERE title = :t", 14 | {'t': title}) 15 | for mrow in cursor.fetchall(): 16 | print mrow['title'], 'was made in', mrow['year'] 17 | print 'The characters were named:' 18 | cursor.execute( 19 | "SELECT role.name AS role_name, actor.name AS actor_name" 20 | " FROM role JOIN actor ON (role.actor_id = actor.id)" 21 | " WHERE movie_id = :id", 22 | {'id': mrow['id']}) 23 | for rrow in cursor.fetchall(): 24 | print rrow[0], '/', rrow[1] 25 | 26 | if __name__ == '__main__' and sys.argv[-1] == 'pretty1': 27 | 28 | 29 | session = make_session(echo=False) 30 | title = 'Inception' 31 | 32 | for movie in session.query(Movie).filter_by( 33 | title=title 34 | ): 35 | print movie.title, 'was made in', movie.year 36 | print 'Cast:' 37 | for role in movie.roles: 38 | print '%r played %r' % ( 39 | role.actor.name, role.name) 40 | 41 | if __name__ == '__main__' and sys.argv[-1] == 'pretty2': 42 | 43 | session = make_session(echo=False) 44 | title = 'Inception' 45 | 46 | for movie in session.query(Movie).filter_by( 47 | title=title 48 | ): 49 | print movie.title, 'was made in', movie.year 50 | for role in session.query(Role).filter_by( 51 | movie=movie).options(joinedload('actor') 52 | ): 53 | print '%r played %r' % ( 54 | role.actor.name, role.name) 55 | --------------------------------------------------------------------------------