├── .gitattributes
├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── F-SQL_Misc.pptx
├── MySQL_Install.md
├── README.md
├── TODO.md
├── binder
│   ├── postBuild
│   └── requirements.txt
├── practice_db_restaurants.sql
├── queries1.ipynb
├── queries2.ipynb
├── session1
│   ├── A1-ER_modeling.pptx
│   ├── A2-ER_modeling-GalleryPainter.pptx
│   ├── A2-ER_modeling-WaterUtility.pptx
│   ├── A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx
│   ├── A4-ER_modeling-CreateTablesInSQL.pptx
│   ├── A5-Inserting_Data_in_MySQL_using_Python.ipynb
│   ├── A7-Citibike.ipynb
│   ├── A8-ERD_No_Future_Records.mwb
│   ├── assignment1.pdf
│   ├── assignment2.md
│   ├── cellular_operator_ER_diagram.PNG
│   ├── practice_questions.md
│   └── practice_questions_solutions.md
├── session2
│   ├── A-Navigation_Queries.ipynb
│   ├── A-SQL_Intro_Navigating_DB.pptx
│   ├── B-SQL_Selection_Queries.pptx
│   ├── B-Selection_Queries.ipynb
│   ├── C-Schemas.pdf
│   ├── C-Schemas.pptx
│   ├── C1-ERD_Facebook.mwb
│   ├── C2-ERD_IMDB.mwb
│   └── assignment_selection_queries.ipynb
├── session3
│   ├── B3-Filtering_Queries.ipynb
│   ├── B3-SQL_Filtering.pptx
│   ├── assignment_filtering_queries.ipynb
│   └── practice_questions_filtering.md
├── session4
│   ├── C-Join_Queries.ipynb
│   ├── C-SQL_Joins.pptx
│   ├── README.md
│   ├── assignment_join_queries.ipynb
│   └── practice_questions_joins_restaurants.md
├── session5
│   ├── D-SQL_Aggregation_Queries.ipynb
│   ├── D-SQL_Aggregation_Queries.pptx
│   ├── assignment_aggregate_queries.ipynb
│   ├── practice_queries_aggregation.md
│   ├── practice_queries_aggregation_solution.ipynb
│   ├── practice_queries_join_and_aggregation.md
│   └── practice_queries_join_and_aggregation_solutions.ipynb
├── session6
│   ├── F-SQL_Subqueries-7.1.pptx
│   ├── F-SQL_Subqueries.pptx
│   ├── assignment_combined.ipynb
│   ├── book_vs_gender.sql
│   ├── books_and_political_views.sql
│   ├── music_recommendations.sql
│   ├── music_recommendations2.sql
│   └── music_recommendations3.sql
└── session7
    ├── G-Window_queries.pptx
    ├── README.md
    └── music_by_gender_rank_example.sql
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=nbstripout
2 |
3 | *.ipynb diff=ipynb
4 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow to help you get started with Actions
2 |
3 | name: Check that all notebooks work
4 |
5 | # Controls when the action will run. Triggers the workflow on push or pull request
6 | # events but only for the master branch
7 | on:
8 |   push:
9 |     branches: [ master ]
10 |   pull_request:
11 |     branches: [ master ]
12 | 
13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
14 | jobs:
15 |   # This workflow contains a single job called "build"
16 |   build:
17 |     # The type of runner that the job will run on
18 |     runs-on: ubuntu-latest
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.8]
22 | 
23 |     # Steps represent a sequence of tasks that will be executed as part of the job
24 |     steps:
25 |       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
26 |       - uses: actions/checkout@v2
27 | 
28 |       - name: Set up Python ${{ matrix.python-version }}
29 |         uses: actions/setup-python@v1
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 | 
33 |       - name: Install dependencies and Flake8/Flake8-nb for linting
34 |         run: |
35 |           python3 -m pip install --upgrade pip
36 |           pip3 install -r binder/requirements.txt
37 |       - name: Lint with flake8
38 |         run: |
39 |           # stop the build if there are Python syntax errors or undefined names
40 |           # flake8-nb session1/*.ipynb --count --select=E9,F63,F7,F82 --show-source --statistics
41 |           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
42 |           # flake8-nb session1/*.ipynb --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
43 |       - name: Run all the notebooks using nbconvert
44 |         run: |
45 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session1/A5-Inserting_Data_in_MySQL_using_Python.ipynb
46 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session1/A7-Citibike.ipynb
47 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/A-Navigation_Queries.ipynb
48 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/B-Selection_Queries.ipynb
49 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session3/B3-Filtering_Queries.ipynb
50 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session4/C-Join_Queries.ipynb
51 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 sql_assignment_template.ipynb
52 | 
53 |           # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/assignment_selection_queries.ipynb
54 |           # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session3/assignment_filtering_queries.ipynb
55 |           # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session4/assignment_join_queries.ipynb
56 |           # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/assignment_aggregate_queries.ipynb
57 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/practice_queries_aggregation_solution.ipynb
58 |           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/practice_queries_join_and_aggregation_solutions.ipynb
59 |           # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session6/assignment_combined.ipynb
60 |
61 |
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/F-SQL_Misc.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/F-SQL_Misc.pptx
--------------------------------------------------------------------------------
/MySQL_Install.md:
--------------------------------------------------------------------------------
1 | # Install MySQL server
2 |
3 | The instructions below can be used to install MySQL from scratch on any Linux/Ubuntu machine and to import the databases that we will use as examples. (Most of) these steps have already been completed in the image that is being used for the class, but I keep the instructions here for reference, in case a student wants to set up a MySQL installation on a new machine.
4 |
5 | ## Install Software on Ubuntu
6 |
7 | Login to the terminal and type:
8 |
9 | `sudo apt-get update`
10 |
11 | and then
12 |
13 | `sudo apt-get -y install mysql-server`
14 |
15 | During installation, you will be prompted to create a password for the "root" user. You can use any password you like, but make sure that you remember it. In our own installation, we used the password `dwdstudent2015`.
16 |
17 | After a successful installation, you will be able to access the MySQL server from the console by typing:
18 |
19 | `mysql -u root -p`
20 |
21 | Inside the mysql console, you can execute SQL commands. For example, the command:
22 |
23 | `mysql> SHOW DATABASES;`
24 |
25 | will show you the available databases. The first time you run the command, you will see something like:
26 |
27 |
28 | | Database |
29 | |--------------------|
30 | | information_schema |
31 | | mysql |
32 | | performance_schema |
33 | ```
34 | 3 rows in set (0.00 sec)
35 | ```
36 |
37 | For now, let's exit the command-line interface by typing the command `QUIT`:
38 |
39 | `mysql> QUIT`
40 |
41 | ### Making the MySQL server accessible from the host machine
42 |
43 | We need to change a couple of things to make the MySQL database accessible from the host machine. First, we need to change the MySQL configuration file to allow it to respond to connections from the outside world.
44 |
45 | 1. Make sure that port 3306 is open in the security settings of your EC2 instance.
46 |
47 | 2. Go and edit the file `/etc/mysql/mysql.conf.d/mysqld.cnf` and find the parameter `bind-address`. By default, the setting is `bind-address = 127.0.0.1`. Change it to `bind-address = 0.0.0.0` in order to allow connections from any machine.
48 |
49 | 3. Connect to MySQL (type `mysql -u root -p` in the shell) and then within MySQL run the following commands:
50 |
51 | `mysql> CREATE USER 'root'@'%' IDENTIFIED BY 'dwdstudent2015';`
52 |
53 | `mysql> GRANT ALL PRIVILEGES ON *.* TO 'root'@'%';`
54 |
55 | `mysql> FLUSH PRIVILEGES;`
56 |
57 | `mysql> exit`
58 |
59 | And then, in the shell:
60 |
61 | `sudo service mysql restart`
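As an optional sanity check, you can log back into the MySQL console and confirm that the remote-access `root` account was created (you should see a row with host `%` in addition to the local `root` account):

`mysql> SELECT user, host FROM mysql.user WHERE user = 'root';`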
62 |
63 | _Note: This is an insecure setup, as it provides admin access to your database to anyone who has the IP address of your machine and the password._
64 |
65 | ## Import databases
66 |
67 | Now, we are ready to fetch the datasets and store them in the database.
68 |
69 | ### Facebook
70 |
71 | Import a database of the Facebook profiles of the first NYU users (back from 2004-6), before Facebook started paying attention to these annoying issues of privacy and security :-)
72 |
73 | `!zcat data/facebook.sql.gz | mysql -u root -pdwdstudent2015`
74 |
75 |
76 | _Warning_: Importing the Facebook data will take approximately 15-20 minutes, during which the machine will look unresponsive. Please do not stop it.
77 |
78 | ### IMDB
79 |
80 | This database contains a set of tables from the IMDB database.
81 |
82 | `!zcat data/imdb.sql.gz | mysql -u root -pdwdstudent2015`
83 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ipeirotis/introduction-to-databases/blob/master/)
2 |
3 | # Relational Databases and SQL
4 |
5 | This is a brief introductory module to relational databases and SQL. It mainly targets people who are interested in learning SQL, and it does not cover topics such as indexing, transactions, stored procedures, etc.
6 |
7 | ## Videos for the class
8 |
9 | [Videos for the class](https://www.youtube.com/playlist?list=PLqAPn_b_yx0QcOgEvAKQQ5yzplFI-FOQI)
10 |
11 | ## Indicative Schedule
12 |
13 | ### Module 1: Entity-Relationship Model and Relational Databases
14 |
15 | * Entities, Primary Keys, and Attributes
16 | * Relations
17 | * Cardinality: One-to-One, One-to-Many, Many-to-Many
18 | * From ER Diagram to a Relational Schema
19 | * (Optional) SQL Statements for Creating Tables
20 | * (Optional) Populating a Database with Data
21 | * Activity 1: Artist-Gallery-Painting example
22 | * Activity 2: Creating a relational schema from an ER diagram
23 | * Activity 3: From Spreadsheet to a Normalized Database
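As a minimal illustration of the (optional) table-creation step, here is a trimmed-down sketch of two of the tables from `practice_db_restaurants.sql`, showing a one-to-many relationship implemented with a foreign key:

```sql
CREATE TABLE Restaurant (
    restCode NUMERIC(3) NOT NULL,      -- primary key of the "one" side
    restName VARCHAR(30) NOT NULL,
    CONSTRAINT Rest_pk PRIMARY KEY (restCode)
);

CREATE TABLE Rating (
    code CHAR(4) NOT NULL,
    restCode NUMERIC(3) NOT NULL,      -- foreign key pointing back to Restaurant
    starRating NUMERIC(1) NOT NULL,
    CONSTRAINT Rating_pk PRIMARY KEY (code),
    CONSTRAINT Rating_restCode_fk FOREIGN KEY (restCode) REFERENCES Restaurant(restCode)
);
```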
24 |
25 |
26 | ### Module 2: Selection Queries
27 |
28 | * Understand the design of our example databases
29 | * Navigating a Database: `USE`, `SHOW TABLES`, `DESCRIBE`
30 | * Selection queries: `SELECT *`, `SELECT column`, `column AS`
31 | * Selection queries: `DISTINCT`, `ORDER BY`, `LIMIT`
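For instance, against the `Restaurants` practice database (`practice_db_restaurants.sql`), the navigation and selection constructs above look roughly like this:

```sql
USE Restaurants;
SHOW TABLES;
DESCRIBE Restaurant;

-- The three most expensive restaurants, with a renamed column
SELECT restName AS name, cuisine, avgPrice
FROM Restaurant
ORDER BY avgPrice DESC
LIMIT 3;

-- The distinct cuisines that appear in the table
SELECT DISTINCT cuisine
FROM Restaurant;
```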
32 |
33 | ### Module 3: Filtering Queries
34 |
35 | * `WHERE` clause
36 | * Boolean conditions: `AND`, `OR`, `NOT`, `BETWEEN`
37 | * Containment condition: `IN`
38 | * Approximate matches: `LIKE`
39 | * `NULL` values
40 | * `CASE WHEN` clause
41 | * Attribute-level functions: NULL functions, date functions, etc.
42 | * Activity: Find People that Live in "New York" (exploration for data cleaning)
43 | * TODO: Create separate slides for Null Functions, Date Functions, and String functions
44 | * TODO: Create videos for CASE WHEN, Null Functions, Date Functions, String functions
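A sketch of filtering queries that combine several of these constructs, again using the `Restaurants` practice database:

```sql
SELECT restName,
       borough,
       CASE WHEN avgPrice >= 150 THEN 'expensive' ELSE 'affordable' END AS price_tier
FROM Restaurant
WHERE cuisine IN ('Greek', 'Italian')
  AND yearEst BETWEEN 2000 AND 2015
  AND restName LIKE '%Taverna%';

-- NULL values require IS NULL / IS NOT NULL, not "= NULL"
SELECT code, cID, restCode
FROM Rating
WHERE ratingDate IS NULL;
```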
45 |
46 | ### Module 4: JOIN queries
47 |
48 | * Inner Joins
49 | * Self Joins
50 | * Outer Joins
51 | * Antijoins and Semijoins
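For example, on the `Restaurants` practice database, an inner join and a left-join-based antijoin look roughly like:

```sql
-- Inner join: every rating next to the restaurant it refers to
SELECT r.restName, g.starRating, g.comments
FROM Restaurant r
INNER JOIN Rating g ON g.restCode = r.restCode;

-- Antijoin via LEFT JOIN: restaurants that have never been rated
SELECT r.restName
FROM Restaurant r
LEFT JOIN Rating g ON g.restCode = r.restCode
WHERE g.restCode IS NULL;
```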
52 |
53 | ### Module 5: Aggregate queries
54 |
55 | * Aggregation functions (COUNT, COUNT DISTINCT, SUM, AVG, MAX, MIN, STDDEV, GROUP_CONCAT)
56 | * GROUP BY on a single attribute
57 | * GROUP BY on multiple attributes
58 | * HAVING clause
59 | * Integrated JOIN and GROUP BY queries
60 | * TODO: Add an example for GROUP_CONCAT (e.g. for movie genres)
61 | * TODO: In video ["Integrated Queries: Statistics on Directors"](https://www.youtube.com/watch?v=aeXWO4xHsTw&list=PLqAPn_b_yx0QcOgEvAKQQ5yzplFI-FOQI&index=42) at sec 37 remove the "For movies from the Year 2000"
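A sketch of an integrated JOIN + GROUP BY query on the `Restaurants` practice database:

```sql
SELECT r.cuisine,
       COUNT(*)          AS num_ratings,
       AVG(g.starRating) AS avg_stars
FROM Restaurant r
JOIN Rating g ON g.restCode = r.restCode
GROUP BY r.cuisine
HAVING COUNT(*) >= 2        -- keep only cuisines with at least two ratings
ORDER BY avg_stars DESC;
```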
62 |
63 | ### Module 6: Subqueries
64 |
65 | * Subqueries with single-value results
66 | * Semijoins and Antijoins using subqueries with the IN clause
67 | * Subqueries with derived tables
68 | * Comparison of WITH, temporary tables, views, and tables
69 | * Activity 1: Music recommendations
70 | * Activity 2: Compare Tastes Across Demographic Segments
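Two small sketches on the `Restaurants` practice database: a single-value subquery and a semijoin expressed with `IN`:

```sql
-- Single-value subquery: restaurants priced above the overall average
SELECT restName, avgPrice
FROM Restaurant
WHERE avgPrice > (SELECT AVG(avgPrice) FROM Restaurant);

-- Semijoin with IN: critics who rated at least one Brooklyn restaurant
SELECT firstN, lastN
FROM Critic
WHERE cID IN (SELECT g.cID
              FROM Rating g
              JOIN Restaurant r ON r.restCode = g.restCode
              WHERE r.borough = 'Brooklyn');
```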
71 |
72 | ### Module 7: Window queries
73 |
74 | * Window definition: `OVER(ORDER BY)`
75 | * Ranking window functions: `RANK`, `DENSE_RANK`, etc.
76 | * Aggregation functions and windows `OVER(PARTITION BY ORDER BY)`
77 | * Offset window functions: `LEAD`, `LAG`, etc.
78 | * Aggregation functions and windows
79 | * Frame definitions and rolling aggregations
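A sketch of a window query on the `Restaurants` practice database (requires MySQL 8+), combining a ranking function with a windowed aggregate:

```sql
SELECT r.borough,
       r.restName,
       g.starRating,
       RANK() OVER (PARTITION BY r.borough ORDER BY g.starRating DESC) AS rank_in_borough,
       AVG(g.starRating) OVER (PARTITION BY r.borough)                 AS avg_stars_in_borough
FROM Rating g
JOIN Restaurant r ON r.restCode = g.restCode;
```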
80 |
81 |
82 |
83 | ### Planned modules
84 |
85 | * Temporal data (e.g., following https://www.stratascratch.com/guides/sql-time-and-date-skills)
86 | * Geospatial data
87 | * String functions
88 |
89 | * UNION
90 | * ANY/ALL
91 | * ROLLUP / GROUPING
92 | * EXISTS
93 |
94 |
95 |
96 | ## Additional Resources for Learning SQL
97 |
98 | * [StrataScratch](https://platform.stratascratch.com/coding), [Leetcode](https://leetcode.com/problemset/database/), [DataLemur](https://datalemur.com/): SQL Interview questions for data science positions in many tech companies
99 | * [Mode SQL Tutorial](https://mode.com/sql-tutorial/): A well-written and well-organized tutorial for SQL, with material starting from the very basics and going up to very advanced topics.
100 | * [Introduction to SQL](https://www.khanacademy.org/computing/computer-programming/sql) from Khan Academy. Introductory course, with videos explaining the various aspects of SQL.
101 | * [W3Schools SQL](http://www.w3schools.com/sql/): An introduction to SQL with hands-on examples
102 | * [Learn SQL](https://www.codecademy.com/learn/learn-sql) and [SQL: Analyzing Business Metrics](https://www.codecademy.com/learn/sql-analyzing-business-metrics): Two short, self-directed online courses from Codecademy
103 | * [SQL Tutorial](http://www.w3resource.com/sql/tutorials.php)
104 | * [Learning MySQL](http://shop.oreilly.com/product/9780596008642.do): A useful textbook for those interested in learning more about SQL
105 | * [W3 Resource](https://www.w3resource.com/sql/tutorials.php) and [SQL exercises](https://www.w3resource.com/sql-exercises/)
106 | * [Become a SELECT Star!](https://gumroad.com/l/sql-zine) by Julia Evans: A very useful e-zine that summarizes in a cartoonish way most of the SQL concepts that we cover in class. Worth the $12.
107 | * [How to Teach People SQL](https://dataschool.com/how-to-teach-people-sql/): Great visualizations for the various SQL operations that we learn.
108 | * [SQL Data Manipulation for Data Scientists](https://www.stratascratch.com/guides/): Advanced data manipulation techniques using SQL.
109 | * [Best practices for SQL](https://data36.com/sql-best-practices-data-analysts/): A set of useful guidelines for writing readable SQL statements.
110 |
111 |
112 |
113 |
114 |
115 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 |
2 | * Move UNION earlier as well?
3 |
4 | * COALESCE
5 |
6 |
7 |
8 |
9 | Practice
10 |
11 | * Read the "Learning MySQL" textbook, chapter 7.
12 | * Work on the online SQL Tutorial by W3Schools
13 | * Work on the SQL course on Codecademy
14 | * Work on the SQL course on Khan Academy
15 |
--------------------------------------------------------------------------------
/binder/postBuild:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | jupyter contrib nbextension install --user
4 |
5 | # Allow collapsible headers
6 | jupyter nbextension enable collapsible_headings/main
7 |
8 | # Add a code formatter - requires yapf package
9 | jupyter nbextension enable code_prettify/code_prettify
10 |
11 | # Adds the "Solution" hiding option
12 | jupyter nbextension enable exercise2/main
13 |
14 | # Spellchecking
15 | jupyter nbextension enable spellchecker/main
16 |
17 |
18 | # Install nbstripout on git and clean the notebooks
19 | nbstripout --install --attributes .gitattributes
20 | nbstripout notes/*.ipynb
21 |
22 |
--------------------------------------------------------------------------------
/binder/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 |
3 | tqdm
4 | matplotlib
5 |
6 | PyMySQL
7 | sqlalchemy
8 | sql_magic
9 |
10 | pandas
11 | openpyxl
12 |
13 | flake8
14 | flake8-nb
15 | jupyter
16 |
17 | mistune>=2.0.3 # not directly required, pinned by Snyk to avoid a vulnerability
18 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability
19 |
--------------------------------------------------------------------------------
/practice_db_restaurants.sql:
--------------------------------------------------------------------------------
1 | DROP DATABASE IF EXISTS Restaurants;
2 |
3 | CREATE DATABASE Restaurants;
4 |
5 | USE Restaurants;
6 |
7 | CREATE TABLE Restaurant(
8 |     restCode NUMERIC(3) NOT NULL,
9 |     restName VARCHAR(30) NOT NULL,
10 |     cuisine VARCHAR(25) NOT NULL,
11 |     borough VARCHAR(30) NOT NULL,
12 |     yearEst SMALLINT NOT NULL CHECK (yearEst >= 1990 AND yearEst <= 2019),
13 |     avgPrice NUMERIC(5,2) NOT NULL,
14 |     CONSTRAINT Rest_pk PRIMARY KEY (restCode)
15 | );
16 | 
17 | CREATE TABLE Critic(
18 |     cID NUMERIC(3) NOT NULL,
19 |     firstN VARCHAR(25) NOT NULL,
20 |     lastN VARCHAR(30) NOT NULL,
21 |     affiliation VARCHAR(25),
22 |     CONSTRAINT Critic_pk PRIMARY KEY (cID)
23 | );
24 | 
25 | CREATE TABLE Rating(
26 |     code CHAR(4) NOT NULL,
27 |     cID NUMERIC(3) NOT NULL,
28 |     restCode NUMERIC(3) NOT NULL,
29 |     starRating NUMERIC(1) NOT NULL,
30 |     ratingDate DATE,
31 |     comments VARCHAR(250),
32 |     CONSTRAINT Rating_pk PRIMARY KEY (code),
33 |     CONSTRAINT Rating_rID_fk FOREIGN KEY(cID) REFERENCES Critic(cID),
34 |     CONSTRAINT Rating_restCode_fk FOREIGN KEY(restCode) REFERENCES Restaurant(restCode)
35 | );
36 |
37 |
38 | ALTER TABLE Rating
39 | ADD CHECK (starRating <= 5);
40 |
41 | INSERT INTO Restaurant VALUES (101, 'Pok Pok', 'Thai', 'Brooklyn', 2005, 100.00);
42 | INSERT INTO Restaurant VALUES (102, 'Kiin Thai', 'Thai', 'Manhattan', 2013, 75.00);
43 | INSERT INTO Restaurant VALUES (103, 'Carbone', 'Italian', 'Manhattan', 2010, 150.00);
44 | INSERT INTO Restaurant VALUES (104, 'Il Mulino', 'Italian', 'Manhattan', 1999, 250.00);
45 | INSERT INTO Restaurant VALUES (105, 'Don Peppe', 'Italian', 'Queens', 1998, 75.00);
46 | INSERT INTO Restaurant VALUES (106, 'Loukoumi Taverna', 'Greek', 'Queens', 1994, 130.00);
47 | INSERT INTO Restaurant VALUES (107, 'Nisi', 'Greek', 'Manhattan', 2014, 100.00);
48 | INSERT INTO Restaurant VALUES (108, 'Ela Taverna', 'Greek', 'Brooklyn', 2015, 150.00);
49 | INSERT INTO Restaurant VALUES (109, 'Jianbing Company', 'Chinese', 'Brooklyn', 2010, 75.00);
50 | INSERT INTO Restaurant VALUES (110, 'Han Dynasty', 'Chinese', 'Manhattan', 2012, 125.00);
51 | INSERT INTO Restaurant VALUES (111, 'Antonio Trattoria', 'Italian', 'Bronx', 2008, 75.00);
52 |
53 | INSERT INTO Critic VALUES (201,'Sarah', 'Martinez','NYT');
54 | INSERT INTO Critic VALUES (202,'Daniel', 'Lewis', 'WP');
55 | INSERT INTO Critic VALUES (203,'Brittany', 'Harris', 'Vogue');
56 | INSERT INTO Critic VALUES (204,'Mike', 'Anderson',NULL);
57 | INSERT INTO Critic VALUES (205,'Chris', 'Jackson','NYT');
58 | INSERT INTO Critic VALUES (206,'Elizabeth', 'Thomas','Chronicle');
59 | INSERT INTO Critic VALUES (207,'James', 'Cameron','NYP');
60 | INSERT INTO Critic VALUES (208,'Ashley', 'White','NYT');
61 | INSERT INTO Critic (cID, lastN, firstN) VALUES (209, 'Clarke','George');
62 | INSERT INTO Critic VALUES (210,'Sean', 'Thompson','NYP');
63 |
64 | INSERT INTO Rating VALUES ('R1', 201,101,2,'2014-11-13', 'Good food, bad service');
65 | INSERT INTO Rating VALUES ('R2', 201,101,4,'2017-01-15', 'Amazing deserts, so-so appetizers');
66 | INSERT INTO Rating VALUES ('R3', 202,106,4,NULL, 'Great atmosphere, friendly staff');
67 | INSERT INTO Rating VALUES ('R4', 203,103,2,'2015-02-01', 'Disappointed');
68 | INSERT INTO Rating VALUES ('R5', 203,108,4,'2016-03-01', 'Great fish');
69 | INSERT INTO Rating VALUES ('R6', 203,108,2,'2018-06-30', 'Not as good as before');
70 | INSERT INTO Rating VALUES ('R7', 204,101,3,'2017-10-23', NULL);
71 | INSERT INTO Rating VALUES ('R8', 205,103,3,'2012-02-16', NULL);
72 | INSERT INTO Rating (code, cID, restCode,ratingDate,starRating) VALUES ('R9', 205,104,'2000-02-16',2);
73 | INSERT INTO Rating (code, cID, restCode,starRating,comments) VALUES ('R10', 205,108,5, 'Must try fish');
74 | INSERT INTO Rating VALUES ('R11', 206,107,3,'2016-07-02', 'Great food, rude staff');
75 | INSERT INTO Rating VALUES ('R12', 206,106,5,'2001-12-21', 'Loved everything');
76 | INSERT INTO Rating VALUES ('R13', 208,104,3,'2003-06-30', 'Overpriced');
77 | INSERT INTO Rating VALUES ('R14', 209,104,3,'1005-07-30', NULL);
78 |
--------------------------------------------------------------------------------
/session1/A1-ER_modeling.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A1-ER_modeling.pptx
--------------------------------------------------------------------------------
/session1/A2-ER_modeling-GalleryPainter.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A2-ER_modeling-GalleryPainter.pptx
--------------------------------------------------------------------------------
/session1/A2-ER_modeling-WaterUtility.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A2-ER_modeling-WaterUtility.pptx
--------------------------------------------------------------------------------
/session1/A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx
--------------------------------------------------------------------------------
/session1/A4-ER_modeling-CreateTablesInSQL.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A4-ER_modeling-CreateTablesInSQL.pptx
--------------------------------------------------------------------------------
/session1/A5-Inserting_Data_in_MySQL_using_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "view-in-github",
7 | "colab_type": "text"
8 | },
9 | "source": [
10 | "<a href=\"https://colab.research.google.com/github/ipeirotis/introduction-to-databases/blob/master/session1/A5-Inserting_Data_in_MySQL_using_Python.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "id": "Zq3aQJmsEt1L",
18 | "outputId": "df0cf6b1-57aa-49e8-86e6-425f96359196",
19 | "colab": {
20 | "base_uri": "https://localhost:8080/"
21 | }
22 | },
23 | "outputs": [
24 | {
25 | "output_type": "stream",
26 | "name": "stdout",
27 | "text": [
28 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.8/44.8 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
29 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m40.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
30 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
31 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.9/121.9 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
32 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.4/93.4 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
33 | "\u001b[?25h"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "!sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "IhpLu2WHEt02"
45 | },
46 | "source": [
47 | "## Inserting data in MySQL using Python\n",
48 | "\n",
49 | "First let's start with a basic piece of code that fetches the data that we want to insert in the database. For our example, we will get the data about the Citibike stations, using the correspoding API call provided by the Citibike website:"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {
56 | "id": "g-_StVU-Et03"
57 | },
58 | "outputs": [],
59 | "source": [
60 | "import requests\n",
61 | "import uuid\n",
62 | "from datetime import date, datetime, timedelta"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {
69 | "id": "Ues1lO5FEt09"
70 | },
71 | "outputs": [],
72 | "source": [
73 | "# Let's get the data from the Citibike API\n",
74 | "url = \"https://gbfs.citibikenyc.com/gbfs/en/station_information.json\"\n",
75 | "results = requests.get(url).json()"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 4,
81 | "metadata": {
82 | "id": "Jzs_lCvxEt1B"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "# We only need a subset of the data in the JSON returned by the Citibike API, so we keep only we need\n",
87 | "data = results[\"data\"][\"stations\"]"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 5,
93 | "metadata": {
94 | "id": "gTbzvvk7J-Fd",
95 | "outputId": "1a85e518-ece5-46f2-d7ec-b0d0963d175c",
96 | "colab": {
97 | "base_uri": "https://localhost:8080/"
98 | }
99 | },
100 | "outputs": [
101 | {
102 | "output_type": "execute_result",
103 | "data": {
104 | "text/plain": [
105 | "{'rental_methods': ['KEY', 'CREDITCARD'],\n",
106 | " 'lat': 40.763604677958625,\n",
107 | " 'external_id': 'b442a648-e9f4-4893-951a-64d258bc0e55',\n",
108 | " 'lon': -73.98917958140373,\n",
109 | " 'capacity': 30,\n",
110 | " 'station_id': 'b442a648-e9f4-4893-951a-64d258bc0e55',\n",
111 | " 'eightd_has_key_dispenser': False,\n",
112 | " 'station_type': 'classic',\n",
113 | " 'region_id': '71',\n",
114 | " 'electric_bike_surcharge_waiver': False,\n",
115 | " 'name': 'W 50 St & 9 Ave',\n",
116 | " 'has_kiosk': True,\n",
117 | " 'short_name': '6854.05',\n",
118 | " 'rental_uris': {'android': 'https://bkn.lft.to/lastmile_qr_scan',\n",
119 | " 'ios': 'https://bkn.lft.to/lastmile_qr_scan'},\n",
120 | " 'eightd_station_services': []}"
121 | ]
122 | },
123 | "metadata": {},
124 | "execution_count": 5
125 | }
126 | ],
127 | "source": [
128 | "data[1]"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 6,
134 | "metadata": {
135 | "id": "jAlZwKq8J0OM",
136 | "outputId": "2773c6fe-9716-49e5-f7b0-b00fac0fc09b",
137 | "colab": {
138 | "base_uri": "https://localhost:8080/"
139 | }
140 | },
141 | "outputs": [
142 | {
143 | "output_type": "execute_result",
144 | "data": {
145 | "text/plain": [
146 | "2098"
147 | ]
148 | },
149 | "metadata": {},
150 | "execution_count": 6
151 | }
152 | ],
153 | "source": [
154 | "len(data)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 7,
160 | "metadata": {
161 | "id": "KxuM5dGREt1Q"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "from sqlalchemy import create_engine\n",
166 | "from sqlalchemy import text\n",
167 | "\n",
168 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/\".format(\n",
169 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\"\n",
170 | ")\n",
171 | "\n",
172 | "engine = create_engine(conn_string)"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {
178 | "id": "VzsnOW_AEt1U"
179 | },
180 | "source": [
181 | "Once we have connected successfully, we need to create our database:"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 8,
187 | "metadata": {
188 | "id": "TQT4IE2FEt1U"
189 | },
190 | "outputs": [],
191 | "source": [
192 | "# Query to create a database\n",
193 | "# In this example, we will try to create the (existing) database \"public\"\n",
194 | "# But in general, we can give any name to the database\n",
195 | "db_name = \"public\"\n",
196 | "create_db_query = (\n",
197 | " f\"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'\"\n",
198 | ")\n",
199 | "\n",
200 | "# Create a database\n",
201 | "with engine.connect() as connection:\n",
202 | " connection.execute(text(create_db_query))"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "id": "3sa0ArJhEt1Z"
209 | },
210 | "source": [
211 | "Then we create the table where we will store our data. For our example, we will just import three fields in the database: station_id, station_name, and number_of_docks"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 9,
217 | "metadata": {
218 | "id": "qzWnULWfEt1a",
219 | "outputId": "9ca027be-41d9-4862-905c-d090d6ecb232",
220 | "colab": {
221 | "base_uri": "https://localhost:8080/"
222 | }
223 | },
224 | "outputs": [
225 | {
226 | "output_type": "stream",
227 | "name": "stdout",
228 | "text": [
229 | "d94ec55c\n"
230 | ]
231 | }
232 | ],
233 | "source": [
234 | "# To avoid conflicts between people writing in the same database, we add a random suffix in the tables\n",
235 | "# We only create the variable once while running the notebook\n",
236 | "if \"suffix\" not in globals():\n",
237 | " suffix = str(uuid.uuid4())[:8]\n",
238 | "print(suffix)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 10,
244 | "metadata": {
245 | "id": "GW0oeuOcEt1e"
246 | },
247 | "outputs": [],
248 | "source": [
249 | "table_name = f\"Docks_{suffix}\"\n",
250 | "\n",
251 | "# Drop the table if there is one already\n",
252 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{table_name}\"\n",
253 | "with engine.connect() as connection:\n",
254 | " connection.execute(text(drop_table_query))\n",
255 | "\n",
256 | "# Create a table\n",
257 | "create_table_query = f\"\"\"CREATE TABLE IF NOT EXISTS {db_name}.{table_name}\n",
258 | " (station_id varchar(50),\n",
259 | " station_name varchar(50),\n",
260 | " capacity int,\n",
261 | " PRIMARY KEY(station_id)\n",
262 | " )\"\"\"\n",
263 | "\n",
264 | "with engine.connect() as connection:\n",
265 | " connection.execute(text(create_table_query))\n"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {
271 | "id": "hdI9lReyEt1h"
272 | },
273 | "source": [
274 | "Finally, we import the data into our table, using the INSERT command. (_Note: The `INSERT IGNORE` directs the database to ignore attempts to insert another tuple with the same primary key. In our case, we do not want to allow two entries for the same `station_id`._)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {
281 | "id": "qQLqOddcEt1i"
282 | },
283 | "outputs": [],
284 | "source": [
285 | "query_template = f\"\"\"\n",
286 | " INSERT IGNORE INTO\n",
287 | " {db_name}.{table_name}(station_id, station_name, capacity)\n",
288 | " VALUES (:station_id, :station_name, :capacity)\n",
289 | " \"\"\"\n",
290 | "\n",
291 | "# THIS IS PROHIBITED\n",
292 | "# query = \"INSERT INTO citibike.Docks(station_id, station_name, number_of_docks) \" + \\\n",
293 | "# \"VALUES (\"+entry[\"id\"]+\", \"+entry[\"stationName\"]+\", \"+entry[\"totalDocks\"]+\")\"\n",
294 | "\n",
295 | "with engine.connect() as connection:\n",
296 | " for entry in data:\n",
297 | " query_parameters = {\n",
298 | " \"station_id\": entry[\"station_id\"],\n",
299 | " \"station_name\": entry[\"name\"],\n",
300 | " \"capacity\": entry[\"capacity\"]\n",
301 | " }\n",
302 | " print(\"Inserting station\", entry[\"station_id\"], \"at\", entry[\"name\"], \"with\", entry[\"capacity\"], \"docks\")\n",
303 | " connection.execute(text(query_template), query_parameters)\n",
304 | " connection.commit()"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "source": [
310 | "## Query the Database to retrieve the data"
311 | ],
312 | "metadata": {
313 | "id": "0y3Dn5m-DEmn"
314 | }
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {
319 | "id": "nZPtYSJOEt1s"
320 | },
321 | "source": [
322 | "Now let's see how to query the database"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 12,
328 | "metadata": {
329 | "id": "qcdbX7AWEt1t"
330 | },
331 | "outputs": [],
332 | "source": [
333 | "with engine.connect() as connection:\n",
334 | " results = connection.execute(text(f\"SELECT station_id, station_name, capacity FROM {db_name}.{table_name}\"))\n",
335 | " rows = results.mappings().all()\n"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 13,
341 | "metadata": {
342 | "id": "BT-lYjnXEt1w",
343 | "outputId": "2f4deaca-3759-48ce-d9b5-9ac9a3ab9563",
344 | "colab": {
345 | "base_uri": "https://localhost:8080/"
346 | }
347 | },
348 | "outputs": [
349 | {
350 | "output_type": "stream",
351 | "name": "stdout",
352 | "text": [
353 | "Number of rows: 2098\n",
354 | "=============================================\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "# Let's check how many data points we got back\n",
360 | "print(f\"Number of rows: {len(rows)}\")\n",
361 | "print(\"=============================================\")"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "source": [
367 | "# And now let's go over the results\n",
368 | "for row in rows:\n",
369 | " print(\"Station ID:\", row['station_id'])\n",
370 | " print(\"Station Name:\", row['station_name'])\n",
371 | " print(\"Number of Docks:\", row['capacity'])\n",
372 | " print(\"=============================================\")"
373 | ],
374 | "metadata": {
375 | "id": "yc9ZgIoNvXvY"
376 | },
377 | "execution_count": null,
378 | "outputs": []
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "id": "Ifc_6hwwEt1z"
384 | },
385 | "source": [
386 | "Finally, let's clean up and close our database connection."
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 15,
392 | "metadata": {
393 | "id": "AWMXngKcEt1z"
394 | },
395 | "outputs": [],
396 | "source": [
397 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{table_name}\"\n",
398 | "with engine.connect() as connection:\n",
399 | " connection.execute(text(drop_table_query))"
400 | ]
401 | },
402 | {
403 | "cell_type": "markdown",
404 | "metadata": {
405 | "id": "W9fjdQPNLwfh"
406 | },
407 | "source": [
408 | "## Exercise\n",
409 | "\n",
410 | "At `https://gbfs.citibikenyc.com/gbfs/en/station_status.json` we can access the live status of all the stations (e.g., bikes available etc). Using the approach outlined above, create a table in the database (using the same table suffix that we created above) and store the data in the database."
411 | ]
412 | }
413 | ],
414 | "metadata": {
415 | "colab": {
416 | "name": "A5-Inserting_Data_in_MySQL_using_Python.ipynb",
417 | "provenance": [],
418 | "include_colab_link": true
419 | },
420 | "kernelspec": {
421 | "display_name": "Python 3",
422 | "language": "python",
423 | "name": "python3"
424 | },
425 | "language_info": {
426 | "codemirror_mode": {
427 | "name": "ipython",
428 | "version": 3
429 | },
430 | "file_extension": ".py",
431 | "mimetype": "text/x-python",
432 | "name": "python",
433 | "nbconvert_exporter": "python",
434 | "pygments_lexer": "ipython3",
435 | "version": "3.8.2"
436 | }
437 | },
438 | "nbformat": 4,
439 | "nbformat_minor": 0
440 | }
--------------------------------------------------------------------------------
/session1/A7-Citibike.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# !sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic tqdm"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import requests"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# This gives information for each station that remains stable over time\n",
28 | "url_stations = \"https://gbfs.citibikenyc.com/gbfs/en/station_information.json\""
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# This gives the live status of all the stations (e.g., bikes available etc)\n",
38 | "url_status = \"https://gbfs.citibikenyc.com/gbfs/en/station_status.json\""
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# We fetch for now just the time-invariant data\n",
48 | "results = requests.get(url_stations).json()"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# We only need a subset of the data in the JSON returned by the Citibike API, so we keep only what we need\n",
58 | "stations = results[\"data\"][\"stations\"]"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# We will not be using dataframes for this insertion task. (See the A6 notebook if you want to use Pandas)\n",
68 | "# We just put the data in a dataframe to understand what is going on.\n",
69 | "import pandas as pd\n",
70 | "\n",
71 | "df_stations = pd.DataFrame(stations)\n",
72 | "df_stations.head(5)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "import sqlalchemy\n",
82 | "from sqlalchemy import create_engine\n",
83 | "\n",
84 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/\".format(\n",
85 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\"\n",
86 | ")\n",
87 | "\n",
88 | "engine = create_engine(conn_string)\n",
89 | "\n",
90 | "db_name = \"public\"\n",
91 | "create_db_query = (\n",
92 | " f\"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'\"\n",
93 | ")\n",
94 | "\n",
95 | "# Create a database\n",
96 | "engine.execute(create_db_query)\n",
97 | "\n",
98 | "# And lets switch to the database\n",
99 | "engine.execute(f\"USE {db_name}\")\n",
100 | "\n",
101 | "# To avoid conflicts between people writing in the same database, we add a random suffix in the tables\n",
102 | "# We only create the variable once while running the notebook\n",
103 | "import uuid\n",
104 | "\n",
105 | "if \"suffix\" not in globals():\n",
106 | " suffix = str(uuid.uuid4())[:8]\n",
107 | "print(suffix)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "# Create the two tables. One for storing the time-invariant station data\n",
117 | "# and another table to store the time-varying station status data\n",
118 | "stations_table = f\"Stations_{suffix}\"\n",
119 | "\n",
120 | "sql = f\"\"\"CREATE TABLE IF NOT EXISTS {stations_table}\n",
121 | " (station_id int, \n",
122 | " name varchar(250), \n",
123 | " capacity int,\n",
124 | " lat float,\n",
125 | " lon float,\n",
126 | " region_id int,\n",
127 | " short_name varchar(250),\n",
128 | " rental_url varchar(250),\n",
129 | " eightd_has_key_dispenser bool,\n",
130 | " PRIMARY KEY(station_id)\n",
131 | " )\"\"\"\n",
132 | "engine.execute(sql)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# Create the time-varying table\n",
142 | "status_table = f\"Status_{suffix}\"\n",
143 | "sql = f\"\"\"CREATE TABLE IF NOT EXISTS {status_table}\n",
144 | " (station_id int, \n",
145 | " last_reported datetime,\n",
146 | " num_bikes_available int,\n",
147 | " num_ebikes_available int,\n",
148 | " num_bikes_disabled int,\n",
149 | " num_docks_available int,\n",
150 | " num_docks_disabled int,\n",
151 | " is_installed bool,\n",
152 | " is_renting bool,\n",
153 | " is_returning bool,\n",
154 | " PRIMARY KEY(station_id, last_reported)\n",
155 | " )\"\"\"\n",
156 | "engine.execute(sql)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "stations[0]"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "# We fetch for now just the time-invariant data\n",
175 | "# Notice that we have the INSERT IGNORE so that even when we add the same entry\n",
176 | "# again, we do not get an error that the line exists. We do get warnings\n",
177 | "# but this is expected\n",
178 | "\n",
179 | "from sqlalchemy.sql import text\n",
180 | "from tqdm.autonotebook import tqdm\n",
181 | "\n",
182 | "query_template = text(\n",
183 | " f\"\"\"INSERT IGNORE INTO {db_name}.{stations_table}\n",
184 | " (station_id, name, capacity, lat, lon,\n",
185 | " region_id, short_name, rental_url, eightd_has_key_dispenser) \n",
186 | " VALUES (:station_id, :name, :capacity, :lat, :lon, :region_id, \n",
187 | " :short_name, :rental_url, :eightd_has_key_dispenser)\"\"\"\n",
188 | ")\n",
189 | "\n",
190 | "# The tqdm(stations) shows a progress bar\n",
191 | "for entry in tqdm(stations):\n",
192 | "\n",
193 | " query_parameters = {\n",
194 | " \"station_id\": int(entry[\"station_id\"]),\n",
195 | " \"name\": entry.get(\"name\"),\n",
196 | " \"capacity\": entry.get(\"capacity\"),\n",
197 | " \"lat\": entry.get(\"lat\"),\n",
198 | " \"lon\": entry.get(\"lon\"),\n",
199 | " \"region_id\": entry.get(\"region_id\"),\n",
200 | " \"short_name\": entry.get(\"short_name\"),\n",
201 | " \"rental_url\": entry.get(\"rental_url\"),\n",
202 | " \"eightd_has_key_dispenser\": entry.get(\"eightd_has_key_dispenser\"),\n",
203 | " }\n",
204 | "\n",
205 | " engine.execute(query_template, **query_parameters)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "check = pd.read_sql(f\"SELECT * FROM {db_name}.{stations_table}\", con=engine)\n",
215 | "check"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "%matplotlib inline\n",
225 | "check.plot(kind=\"scatter\", x=\"lon\", y=\"lat\", s=1, figsize=(10, 10))"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "results = requests.get(url_status).json()\n",
235 | "status = results[\"data\"][\"stations\"]\n",
236 | "status[0]"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "# Now we fetch the data about the time varying elements of the citibike stations\n",
246 | "from datetime import datetime\n",
247 | "\n",
248 | "query_template = text(\n",
249 | " f\"\"\"INSERT IGNORE INTO {db_name}.{status_table}(station_id, \n",
250 | " num_bikes_available,\n",
251 | " num_ebikes_available,\n",
252 | " num_bikes_disabled,\n",
253 | " num_docks_available,\n",
254 | " num_docks_disabled,\n",
255 | " is_installed,\n",
256 | " is_renting,\n",
257 | " is_returning,\n",
258 | " last_reported) \n",
259 | " VALUES (:station_id, :num_bikes_available, :num_ebikes_available, :num_bikes_disabled,\n",
260 | " :num_docks_available, :num_docks_disabled, :is_installed, :is_renting, :is_returning, :last_reported)\"\"\"\n",
261 | ")\n",
262 | "\n",
263 | "for entry in tqdm(status):\n",
264 | " query_parameters = {\n",
265 | " \"station_id\": int(entry[\"station_id\"]),\n",
266 | " \"num_bikes_available\": entry[\"num_bikes_available\"],\n",
267 | " \"num_bikes_disabled\": entry[\"num_bikes_disabled\"],\n",
268 | " \"num_ebikes_available\": entry[\"num_ebikes_available\"],\n",
269 | " \"num_docks_available\": entry[\"num_docks_available\"],\n",
270 | " \"num_docks_disabled\": entry[\"num_docks_disabled\"],\n",
271 | " \"is_installed\": entry[\"is_installed\"],\n",
272 | " \"is_renting\": entry[\"is_renting\"],\n",
273 | " \"is_returning\": entry[\"is_returning\"],\n",
274 | " \"last_reported\": datetime.fromtimestamp(entry[\"last_reported\"]),\n",
275 | " }\n",
276 | "\n",
277 | " engine.execute(query_template, **query_parameters)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "check = pd.read_sql(f\"SELECT * FROM {db_name}.{status_table}\", con=engine)\n",
287 | "check"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{status_table}\"\n",
297 | "print(drop_table_query)\n",
298 | "engine.execute(drop_table_query)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{stations_table}\"\n",
308 | "print(drop_table_query)\n",
309 | "engine.execute(drop_table_query)"
310 | ]
311 | }
312 | ],
313 | "metadata": {
314 | "kernelspec": {
315 | "display_name": "Python 3",
316 | "language": "python",
317 | "name": "python3"
318 | },
319 | "language_info": {
320 | "codemirror_mode": {
321 | "name": "ipython",
322 | "version": 3
323 | },
324 | "file_extension": ".py",
325 | "mimetype": "text/x-python",
326 | "name": "python",
327 | "nbconvert_exporter": "python",
328 | "pygments_lexer": "ipython3",
329 | "version": "3.8.2"
330 | }
331 | },
332 | "nbformat": 4,
333 | "nbformat_minor": 2
334 | }
335 |
--------------------------------------------------------------------------------
/session1/A8-ERD_No_Future_Records.mwb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A8-ERD_No_Future_Records.mwb
--------------------------------------------------------------------------------
/session1/assignment1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/assignment1.pdf
--------------------------------------------------------------------------------
/session1/assignment2.md:
--------------------------------------------------------------------------------
1 | # ER Diagram and Database Design for Time Card Application
2 |
3 |
4 | ## Instructions
5 |
6 | * Create an ER diagram, illustrating the entities, their attributes, the relationship among entities, and the cardinalities of the entities
7 | * Translate the ER diagram into a set of tables, indicating clearly the attributes of each table, the primary key of each table, and the foreign keys that are used to implement the relationships.
8 | * Write the SQL that generates the tables that you created in the step above.
9 |
10 | ## Scenario
11 |
12 | The company you work for wants to digitize their time cards. You are asked to design the database for submitting and approving time cards.
13 |
14 | * Each timecard should have a unique id, hours worked, date submitted, and status, which is either approved, not approved, or pending.
15 | * Each employee has a unique id, name and address, and method of payment: either direct deposit or physical check.
16 | * Each employee submits a time card every pay period (i.e., in 1 year, they will submit multiple time cards).
17 | * Each manager has a unique id and a name.
18 | * Each employee is associated with exactly one manager; each manager is in charge of multiple employees.
19 | * Each manager approves time cards. A manager may also approve timecards for employees who are not necessarily managed by him/her.
20 | * _Trickier part_: How would you handle the case when each manager is also an employee?
21 |
22 | _A quick reminder: In databases, we can limit the domain of values of a variable in two ways:_
23 |
24 | * By making the variable a foreign key, pointing to a different table where we store the potential values.
25 | * By making the domain of the variable an "ENUM" variable that lists the (pre-defined) set of values that the variable can take.
26 |
27 | _Option (1) allows for a bit more flexibility, allowing us to modify the domain of values over time. Option (2) is preferred when the set of values is small, pre-determined, and unlikely to change in the future._
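_As a small illustration of the two approaches (the table and column names below are hypothetical and not part of the expected solution):_

```sql
-- Option (1): a lookup table plus a foreign key (hypothetical names)
CREATE TABLE PaymentMethod (
    method VARCHAR(20) PRIMARY KEY
);

CREATE TABLE EmployeeExampleFK (
    empID  INT PRIMARY KEY,
    method VARCHAR(20) NOT NULL,
    FOREIGN KEY (method) REFERENCES PaymentMethod(method)
);

-- Option (2): an ENUM column listing the pre-defined set of values (hypothetical names)
CREATE TABLE EmployeeExampleEnum (
    empID  INT PRIMARY KEY,
    method ENUM('direct deposit', 'physical check') NOT NULL
);
```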
28 |
29 | ## Deliverables
30 |
31 | You can submit your diagram and SQL queries as a Word or PDF file, or in any other format that we can easily read.
32 |
--------------------------------------------------------------------------------
/session1/cellular_operator_ER_diagram.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/cellular_operator_ER_diagram.PNG
--------------------------------------------------------------------------------
/session1/practice_questions.md:
--------------------------------------------------------------------------------
1 | # Cellular Operator Database Questions
2 |
3 | Look at the diagram "Cellular Operator ER diagram"
4 |
5 | 
6 |
7 | ## a. Can a customer have an unlimited number of plans?
8 | ## b. Can a customer exist without a plan?
9 | ## c. Is it possible to create a plan without knowing who the customer is?
10 | ## d. Does the operator want to limit the types of handsets that can be linked to a specific plan type?
11 | ## e. Is it possible to maintain data regarding a handset without connecting it to a plan?
12 | ## f. Can a handset be associated with multiple plans?
13 | ## g. Assume a handset type exists that can utilize multiple operating systems. Could this situation be accommodated within the model included in Figure 2-24?
14 | ## h. Is the company able to track a manufacturer without maintaining information about its handsets?
15 | ## i. Can the same operating system be used on multiple handset types?
16 | ## j. There are two relationships between Customer and Plan. Explain how they differ.
17 | ## k. Characterize the degree and the cardinalities of the relationship that connects Customer to itself. Explain its meaning.
18 | ## l. Is it possible to link a handset to a specific customer in a plan with multiple customers?
19 | ## m. Can the company track a handset without identifying its operating system?
20 |
--------------------------------------------------------------------------------
/session1/practice_questions_solutions.md:
--------------------------------------------------------------------------------
1 | # Cellular Operator Database Questions
2 |
3 | Look at the diagram "Cellular Operator ER diagram"
4 |
5 | 
6 |
7 | Cellular Operator Database Questions:
8 | ## a. Can a customer have an unlimited number of plans?
9 |
10 | Yes. A Customer may be responsible for 0, 1, or many Plans.
11 |
12 | ## b. Can a customer exist without a plan?
13 | Yes. The minimum cardinality of the Belongs relationship from the Customer to the Plan states that a Customer may exist without a Plan (the minimum cardinality is 0).
14 |
15 | ## c. Is it possible to create a plan without knowing who the customer is?
16 | No. The minimum cardinality of both the “responsible for” and “belongs” relationships between Plan and Customer states that at least one Customer must be related to a Plan.
17 |
18 | ## d. Does the operator want to limit the types of handsets that can be linked to a specific plan type?
19 |
20 | Yes, the cellular operator requires that a Handset (that is a particular type and a particular operating system) is linked to one Plan (that is a particular type of plan). This business rule is to be implemented in this design by indirectly requiring that a Plan Type has 0:M Plans, and each Plan is associated with certain Handsets, and each Handset is of some Handset Type. A given Plan Type is related to Handset Type through the intermediary entity types in this design.
21 |
22 | _Alternative interpretation: No, there is nothing in the current model that creates a condition that would limit – in advance – the handset types that can be related to a specific plan type._
23 |
24 | ## e. Is it possible to maintain data regarding a handset without connecting it to a plan?
25 |
26 | Yes. The minimum cardinality of the Includes relationship between Plan and Handset states that a Handset may be included in 0 or 1 plan. The 0 minimum cardinality means that we can track data about the handset even if it is not connected to a plan; the Handset has optional participation in the Includes relationship with Plan.
27 |
28 | ## f. Can a handset be associated with multiple plans?
29 |
30 | No. The minimum cardinality of the Includes relationship between Plan and Handset states that a Handset may be included in 0 or 1 plan, not multiple plans.
31 |
32 | ## g. Assume a handset type exists that can utilize multiple operating systems. Could this situation be accommodated within the model included in Figure 2-24?
33 |
34 | No. The current model shows that a handset type is associated with one and only one operating system.
35 |
36 | ## h. Is the company able to track a manufacturer without maintaining information about its handsets?
37 |
38 | Yes. The minimum cardinality of the relationship between Manufacturer and Handset Type indicates that we can track data about a Manufacturer even if we have no (or zero) Handset Types in our database.
39 |
40 | ## i. Can the same operating system be used on multiple handset types?
41 |
42 | Yes. The maximum cardinality on the relationship between Operating System and Handset Type indicates that an Operating System may be used on 0, 1, or many Handset types.
43 |
44 | ## j. There are two relationships between Customer and Plan. Explain how they differ.
45 |
46 | The Responsible For relationship is an overall 1:M relationship between Customer and Plan. A Customer can be responsible for 0, 1, or many Plans yet any one Plan will be linked to only 1 Customer for responsibility purposes. The Belongs relationship is an overall M:M relationship that permits the linking of multiple customers to a single plan, as in the case of family members being part of a particular plan or different plans.
47 |
48 | ## k. Characterize the degree and the cardinalities of the relationship that connects Customer to itself. Explain its meaning.
49 |
50 | The “Family Member” relationship that connects Customer to itself has a degree of one (it is a unary, or recursive, relationship). It permits the tracking of each family member as a Customer. Any Customer may be a Family Member of 0, 1, or many Customer(s); conversely, as a Family Member, a Customer may be linked to 0 or 1 Customer.
51 |
52 | ## l. Is it possible to link a handset to a specific customer in a plan with multiple customers?
53 | No, this is not possible in the current model. However, the model could be adjusted by adding an Associative Entity that ties a particular Customer instance to a particular Plan instance; that combination can then be associated with a particular Handset. This extension also makes it easy to track additional data about that specific Customer-Plan combination (a minimal SQL sketch of this idea appears at the end of this document).
54 |
55 | ## m. Can the company track a handset without identifying its operating system?
56 | No. The relationship from Handset Type to Operating System has a cardinality of one and only one; the minimum of 1 means that Handset Type has mandatory participation with Operating System, so a handset's type always identifies its operating system.
57 |
58 |
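59 | ## SQL sketch for questions (k) and (l)
60 | 
61 | The unary “Family Member” relationship from (k) and the Associative Entity suggested in (l) are easier to see in SQL form. The sketch below is illustrative only: the table and column names (`Customer`, `Plan`, `Handset`, `PlanMembership`, `family_member_of`, and so on) are assumptions made for this example and do not come from the diagram.
62 | 
63 | ```sql
64 | -- Minimal, self-contained MySQL sketch; all names are illustrative assumptions.
65 | 
66 | CREATE TABLE Customer (
67 |   customer_id      INT PRIMARY KEY,
68 |   customer_name    VARCHAR(100) NOT NULL,
69 |   family_member_of INT NULL, -- unary "Family Member" relationship (question k): 0 or 1 related Customer
70 |   FOREIGN KEY (family_member_of) REFERENCES Customer (customer_id)
71 | );
72 | 
73 | CREATE TABLE Plan (
74 |   plan_id   INT PRIMARY KEY,
75 |   plan_name VARCHAR(100) NOT NULL
76 | );
77 | 
78 | CREATE TABLE Handset (
79 |   handset_id INT PRIMARY KEY,
80 |   serial_no  VARCHAR(50) NOT NULL
81 | );
82 | 
83 | -- Associative entity (question l): one row per Customer-Plan combination,
84 | -- which can then point to the specific Handset that this customer uses on
85 | -- this plan and can carry extra data about the combination.
86 | CREATE TABLE PlanMembership (
87 |   customer_id INT NOT NULL,
88 |   plan_id     INT NOT NULL,
89 |   handset_id  INT NULL, -- optional: the handset assigned to this customer on this plan
90 |   joined_on   DATE,     -- example of additional data the associative entity can track
91 |   PRIMARY KEY (customer_id, plan_id),
92 |   FOREIGN KEY (customer_id) REFERENCES Customer (customer_id),
93 |   FOREIGN KEY (plan_id)     REFERENCES Plan (plan_id),
94 |   FOREIGN KEY (handset_id)  REFERENCES Handset (handset_id)
95 | );
96 | ```
97 | 
98 | With `PlanMembership` in place, a handset can be attached to one specific customer within a multi-customer plan, which the current model cannot express.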
--------------------------------------------------------------------------------
/session2/A-Navigation_Queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "QBeMTyL26ULI"
7 | },
8 | "source": [
9 | "# SQL Queries: Navigating a database"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {
15 | "id": "w8oa_JjV7F_h"
16 | },
17 | "source": [
18 | "## Setup"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "0SuVR-1d7Jr4"
25 | },
26 | "source": [
27 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "id": "I6vMQAK86ipS"
35 | },
36 | "outputs": [],
37 | "source": [
38 | "!pip3 install -U PyMySQL sqlalchemy sql_magic "
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "id": "vOuWkjz36ULS"
46 | },
47 | "outputs": [],
48 | "source": [
49 | "%reload_ext sql_magic"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "id": "KE_bhLw16ULK"
57 | },
58 | "outputs": [],
59 | "source": [
60 | "from sqlalchemy import create_engine\n",
61 | "\n",
62 | "conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8'.format(\n",
63 | " host = 'db.ipeirotis.org', \n",
64 | " user = 'student',\n",
65 | " password = 'dwdstudent2015',\n",
66 | " encoding = 'utf-8')\n",
67 | "engine = create_engine(conn_string)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "id": "fivqywfX6ULV"
75 | },
76 | "outputs": [],
77 | "source": [
78 | "%config SQL.conn_name = 'engine'"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {
84 | "id": "dsiY_1S46ULY"
85 | },
86 | "source": [
87 | "## Navigation Queries"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "id": "1psXlZbQ6ULZ"
95 | },
96 | "outputs": [],
97 | "source": [
98 | "%%read_sql\n",
99 | "show databases"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "id": "G64fIWqC6ULc"
107 | },
108 | "outputs": [],
109 | "source": [
110 | "%%read_sql\n",
111 | "use imdb"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "id": "EwGGEhe_6ULh"
119 | },
120 | "outputs": [],
121 | "source": [
122 | "%%read_sql\n",
123 | "show tables"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "id": "bFfQ9mqa6ULm"
131 | },
132 | "outputs": [],
133 | "source": [
134 | "%%read_sql\n",
135 | "describe actors"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {
142 | "id": "escUGjfpedCj"
143 | },
144 | "outputs": [],
145 | "source": [
146 | "%%read_sql\n",
147 | "describe movies"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "id": "lQhe8-fm6ULt"
155 | },
156 | "outputs": [],
157 | "source": [
158 | "%%read_sql\n",
159 | "describe roles"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "id": "0yDQhYOvedCk"
167 | },
168 | "outputs": [],
169 | "source": [
170 | "%%read_sql\n",
171 | "use facebook"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {
178 | "id": "5r-1inpjedCk"
179 | },
180 | "outputs": [],
181 | "source": [
182 | "%%read_sql\n",
183 | "show tables"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "id": "kr5eW5p1edCk"
191 | },
192 | "outputs": [],
193 | "source": [
194 | "%%read_sql\n",
195 | "describe Profiles"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {
202 | "id": "lA32DuPfedCl"
203 | },
204 | "outputs": [],
205 | "source": []
206 | }
207 | ],
208 | "metadata": {
209 | "colab": {
210 | "name": "A-Navigation_Queries.ipynb",
211 | "provenance": []
212 | },
213 | "kernelspec": {
214 | "display_name": "Python 3",
215 | "language": "python",
216 | "name": "python3"
217 | },
218 | "language_info": {
219 | "codemirror_mode": {
220 | "name": "ipython",
221 | "version": 3
222 | },
223 | "file_extension": ".py",
224 | "mimetype": "text/x-python",
225 | "name": "python",
226 | "nbconvert_exporter": "python",
227 | "pygments_lexer": "ipython3",
228 | "version": "3.8.5"
229 | }
230 | },
231 | "nbformat": 4,
232 | "nbformat_minor": 1
233 | }
234 |
--------------------------------------------------------------------------------
/session2/A-SQL_Intro_Navigating_DB.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/A-SQL_Intro_Navigating_DB.pptx
--------------------------------------------------------------------------------
/session2/B-SQL_Selection_Queries.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/B-SQL_Selection_Queries.pptx
--------------------------------------------------------------------------------
/session2/B-Selection_Queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "view-in-github",
7 | "colab_type": "text"
8 | },
9 | "source": [
10 | "
"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {
16 | "id": "QBeMTyL26ULI"
17 | },
18 | "source": [
19 | "# SQL: Selection Queries"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "heading_collapsed": true,
26 | "id": "w8oa_JjV7F_h"
27 | },
28 | "source": [
29 | "## Setup"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {
35 | "hidden": true,
36 | "id": "0SuVR-1d7Jr4"
37 | },
38 | "source": [
39 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "hidden": true,
47 | "id": "I6vMQAK86ipS"
48 | },
49 | "outputs": [],
50 | "source": [
51 | "!sudo pip3 install -U -q PyMySQL 'sqlalchemy<2.0' sql_magic"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "hidden": true,
59 | "id": "vOuWkjz36ULS"
60 | },
61 | "outputs": [],
62 | "source": [
63 | "%reload_ext sql_magic"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "hidden": true,
71 | "id": "KE_bhLw16ULK"
72 | },
73 | "outputs": [],
74 | "source": [
75 | "from sqlalchemy import create_engine\n",
76 | "\n",
77 | "conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8'.format(\n",
78 | " host='db.ipeirotis.org', \n",
79 | " user='student',\n",
80 | " password='dwdstudent2015',\n",
81 | " encoding='utf-8')\n",
82 | "engine = create_engine(conn_string).connect()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "hidden": true,
90 | "id": "fivqywfX6ULV"
91 | },
92 | "outputs": [],
93 | "source": [
94 | "%config SQL.conn_name = 'engine'"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {
100 | "heading_collapsed": true,
101 | "id": "yPvFpNjaxW4C"
102 | },
103 | "source": [
104 | "## `SELECT *` "
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {
110 | "hidden": true,
111 | "id": "IuEncSZuxW4C"
112 | },
113 | "source": [
114 | "### IMDb"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "hidden": true,
122 | "id": "DGOuI3fFxW4D"
123 | },
124 | "outputs": [],
125 | "source": [
126 | "%%read_sql\n",
127 | "USE imdb"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {
133 | "hidden": true,
134 | "id": "oHodKBJ9xW4D"
135 | },
136 | "source": [
137 | "#### Return all movies"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "hidden": true,
145 | "id": "2Zww7GjgxW4D"
146 | },
147 | "outputs": [],
148 | "source": [
149 | "%%read_sql\n",
150 | "SELECT *\n",
151 | "FROM movies"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {
157 | "hidden": true,
158 | "id": "gbOg4kbqxW4E"
159 | },
160 | "source": [
161 | "#### Return all directors\n"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "hidden": true,
169 | "id": "QaKgWz2sxW4F"
170 | },
171 | "outputs": [],
172 | "source": [
173 | "%%read_sql\n",
174 | "SELECT *\n",
175 | "FROM directors"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {
181 | "hidden": true,
182 | "id": "ju8SsxtwxW4F"
183 | },
184 | "source": [
185 | "#### Return all actors\n"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "hidden": true,
193 | "id": "0c2iftjExW4F"
194 | },
195 | "outputs": [],
196 | "source": [
197 | "%%read_sql\n",
198 | "SELECT *\n",
199 | "FROM actors"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {
205 | "hidden": true,
206 | "id": "rFk9FivRxW4F"
207 | },
208 | "source": [
209 | "#### Return all roles\n"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "hidden": true,
217 | "id": "IvsNDS2sxW4G"
218 | },
219 | "outputs": [],
220 | "source": [
221 | "%%read_sql\n",
222 | "SELECT *\n",
223 | "FROM roles"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {
229 | "hidden": true,
230 | "id": "jGxenkfNxW4G"
231 | },
232 | "source": [
233 | "#### Return all genres for the movies"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {
240 | "hidden": true,
241 | "id": "nTgSFPSLxW4G"
242 | },
243 | "outputs": [],
244 | "source": [
245 | "%%read_sql\n",
246 | "SELECT *\n",
247 | "FROM movies_genres"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "heading_collapsed": true,
254 | "hidden": true,
255 | "id": "pzXIxBXrxW4G"
256 | },
257 | "source": [
258 | "### Facebook"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "hidden": true,
266 | "id": "Wix5cQ16xW4H"
267 | },
268 | "outputs": [],
269 | "source": [
270 | "%%read_sql\n",
271 | "USE facebook"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {
277 | "hidden": true,
278 | "id": "lBI1dg24xW4H"
279 | },
280 | "source": [
281 | "#### Return all students"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {
288 | "hidden": true,
289 | "id": "6FLhWtCQxW4H"
290 | },
291 | "outputs": [],
292 | "source": [
293 | "%%read_sql\n",
294 | "SELECT *\n",
295 | "FROM Profiles"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {
301 | "hidden": true,
302 | "id": "FJZI_EEsxW4H"
303 | },
304 | "source": [
305 | "#### Return the hobbies of all students"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "hidden": true,
313 | "id": "BCtC4H-UxW4I"
314 | },
315 | "outputs": [],
316 | "source": [
317 | "%%read_sql\n",
318 | "SELECT *\n",
319 | "FROM Hobbies"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {
325 | "hidden": true,
326 | "id": "ngFHnxf2xW4I"
327 | },
328 | "source": [
329 | "#### Return the relationship status for all students"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {
336 | "hidden": true,
337 | "id": "2Ryic6aSxW4I"
338 | },
339 | "outputs": [],
340 | "source": [
341 | "%%read_sql\n",
342 | "SELECT *\n",
343 | "FROM Relationship"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {
349 | "hidden": true,
350 | "id": "fk3nJOgYxW4I"
351 | },
352 | "source": [
353 | "#### Return what students are looking for"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "hidden": true,
361 | "id": "3F3rnkRnxW4I"
362 | },
363 | "outputs": [],
364 | "source": [
365 | "%%read_sql\n",
366 | "SELECT *\n",
367 | "FROM LookingFor"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {
374 | "hidden": true,
375 | "id": "JSTMaQXJxW4J"
376 | },
377 | "outputs": [],
378 | "source": []
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "heading_collapsed": true,
384 | "id": "fVwBIvGDxW4J"
385 | },
386 | "source": [
387 | "## `SELECT` _attr_"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {
393 | "heading_collapsed": true,
394 | "hidden": true,
395 | "id": "kQJtzrHYxW4J"
396 | },
397 | "source": [
398 | "### IMDb"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "hidden": true,
406 | "id": "sv_uYUxcxW4J"
407 | },
408 | "outputs": [],
409 | "source": [
410 | "%%read_sql\n",
411 | "USE imdb"
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {
417 | "hidden": true,
418 | "id": "ZoKJnhhPxW4J"
419 | },
420 | "source": [
421 | "#### Return the first and last names of actors\n",
422 | "\n"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {
429 | "hidden": true,
430 | "id": "Lc5YDvQtxW4K"
431 | },
432 | "outputs": [],
433 | "source": [
434 | "%%read_sql\n",
435 | "SELECT first_name, last_name\n",
436 | "FROM actors"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {
442 | "hidden": true,
443 | "id": "2vrL_r77xW4K"
444 | },
445 | "source": [
446 | "#### Return year and ranking for each movie"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "hidden": true,
454 | "id": "jGiKGiu-xW4K"
455 | },
456 | "outputs": [],
457 | "source": [
458 | "%%read_sql\n",
459 | "SELECT year, rating\n",
460 | "FROM movies "
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {
466 | "heading_collapsed": true,
467 | "hidden": true,
468 | "id": "psXpcf0pxW4K"
469 | },
470 | "source": [
471 | "### Facebook"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {
478 | "hidden": true,
479 | "id": "azk-jU76xW4L"
480 | },
481 | "outputs": [],
482 | "source": [
483 | "%%read_sql\n",
484 | "USE facebook"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {
490 | "hidden": true,
491 | "id": "fP07lq9axW4L"
492 | },
493 | "source": [
494 | "#### Return Name, Sex, and Birthday of all students\n",
495 | "\n"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {
502 | "hidden": true,
503 | "id": "nykOUqP2xW4L"
504 | },
505 | "outputs": [],
506 | "source": [
507 | "%%read_sql\n",
508 | "SELECT Name, Sex, Birthday\n",
509 | "FROM Profiles"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {
515 | "hidden": true,
516 | "id": "hXsIb1JFxW4L"
517 | },
518 | "source": [
519 | "#### Return Sex, and Political Views of all students\n"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "hidden": true,
527 | "id": "L9-QRoVRxW4L"
528 | },
529 | "outputs": [],
530 | "source": [
531 | "%%read_sql\n",
532 | "SELECT Sex, PoliticalViews\n",
533 | "FROM Profiles"
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {
539 | "hidden": true,
540 | "id": "ogmxY8GqxW4M"
541 | },
542 | "source": [
543 | "#### Return the Relationship status column"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {
550 | "hidden": true,
551 | "id": "nZgAtjWExW4M"
552 | },
553 | "outputs": [],
554 | "source": [
555 | "%%read_sql\n",
556 | "SELECT Status\n",
557 | "FROM Relationship"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {
563 | "heading_collapsed": true,
564 | "id": "0LKcTbgqxW4M"
565 | },
566 | "source": [
567 | "## `SELECT` _attr_ `AS` _alias_"
568 | ]
569 | },
570 | {
571 | "cell_type": "markdown",
572 | "metadata": {
573 | "heading_collapsed": true,
574 | "hidden": true,
575 | "id": "-dEvRtrgxW4M"
576 | },
577 | "source": [
578 | "### IMDb"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "metadata": {
585 | "hidden": true,
586 | "id": "R9HUKaOxxW4M"
587 | },
588 | "outputs": [],
589 | "source": [
590 | "%%read_sql\n",
591 | "USE imdb"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "metadata": {
597 | "hidden": true,
598 | "id": "sl9XSy7pxW4N"
599 | },
600 | "source": [
601 | "#### Return id, first, and last names of actors. Rename id to “actor_id”\n"
602 | ]
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": null,
607 | "metadata": {
608 | "hidden": true,
609 | "id": "J_DJf9tDxW4N"
610 | },
611 | "outputs": [],
612 | "source": [
613 | "%%read_sql\n",
614 | "SELECT id AS actor_id, first_name, last_name\n",
615 | "FROM actors"
616 | ]
617 | },
618 | {
619 | "cell_type": "markdown",
620 | "metadata": {
621 | "hidden": true,
622 | "id": "u-_QtxARxW4N"
623 | },
624 | "source": [
625 | "#### Return name, year, and rank for each movie. Rename name to “movie_title”, year to “release_year”, and rank to “rating”\n"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": null,
631 | "metadata": {
632 | "hidden": true,
633 | "id": "rzMXuNCYxW4O"
634 | },
635 | "outputs": [],
636 | "source": [
637 | "%%read_sql\n",
638 | "SELECT name AS movie_title, year AS release_year, rating\n",
639 | "FROM movies"
640 | ]
641 | },
642 | {
643 | "cell_type": "markdown",
644 | "metadata": {
645 | "heading_collapsed": true,
646 | "hidden": true,
647 | "id": "ELd2839yxW4O"
648 | },
649 | "source": [
650 | "### Facebook"
651 | ]
652 | },
653 | {
654 | "cell_type": "code",
655 | "execution_count": null,
656 | "metadata": {
657 | "hidden": true,
658 | "id": "dVKrn1eexW4O"
659 | },
660 | "outputs": [],
661 | "source": [
662 | "%%read_sql\n",
663 | "USE facebook"
664 | ]
665 | },
666 | {
667 | "cell_type": "markdown",
668 | "metadata": {
669 | "hidden": true,
670 | "id": "JEMcLdzjxW4O"
671 | },
672 | "source": [
673 | "#### Return Sex and Status of all students. Rename Sex to Gender and Status to UniversityStatus\n"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "metadata": {
680 | "hidden": true,
681 | "id": "AXYi60ujxW4P"
682 | },
683 | "outputs": [],
684 | "source": [
685 | "%%read_sql\n",
686 | "SELECT Sex AS Gender, Status AS UniversityStatus\n",
687 | "FROM Profiles"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {
693 | "heading_collapsed": true,
694 | "id": "saRGTmsCxW4P"
695 | },
696 | "source": [
697 | "## `SELECT DISTINCT`"
698 | ]
699 | },
700 | {
701 | "cell_type": "markdown",
702 | "metadata": {
703 | "hidden": true,
704 | "id": "pyEYz880xW4P"
705 | },
706 | "source": [
707 | "### IMDb"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "metadata": {
714 | "hidden": true,
715 | "id": "whoxqP7kxW4P"
716 | },
717 | "outputs": [],
718 | "source": [
719 | "%%read_sql\n",
720 | "USE imdb"
721 | ]
722 | },
723 | {
724 | "cell_type": "markdown",
725 | "metadata": {
726 | "hidden": true,
727 | "id": "Azc9xnTaxW4P"
728 | },
729 | "source": [
730 | "#### Find all the movie genres\n"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": null,
736 | "metadata": {
737 | "hidden": true,
738 | "id": "WLjFcGdwxW4P"
739 | },
740 | "outputs": [],
741 | "source": [
742 | "%%read_sql\n",
743 | "SELECT DISTINCT genre\n",
744 | "FROM movies_genres"
745 | ]
746 | },
747 | {
748 | "cell_type": "markdown",
749 | "metadata": {
750 | "hidden": true,
751 | "id": "f8Ga6i3DxW4Q"
752 | },
753 | "source": [
754 | "### Facebook"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": null,
760 | "metadata": {
761 | "hidden": true,
762 | "id": "De8LSi6cxW4Q"
763 | },
764 | "outputs": [],
765 | "source": [
766 | "%%read_sql\n",
767 | "USE facebook"
768 | ]
769 | },
770 | {
771 | "cell_type": "markdown",
772 | "metadata": {
773 | "hidden": true,
774 | "id": "AkIDG3KsxW4Q"
775 | },
776 | "source": [
777 | "#### Return the distinct PoliticalViews from the Profiles table\n",
778 | "\n"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": null,
784 | "metadata": {
785 | "hidden": true,
786 | "id": "b-ymmWVkxW4Q"
787 | },
788 | "outputs": [],
789 | "source": [
790 | "%%read_sql\n",
791 | "SELECT DISTINCT PoliticalViews\n",
792 | "FROM Profiles"
793 | ]
794 | },
795 | {
796 | "cell_type": "markdown",
797 | "metadata": {
798 | "hidden": true,
799 | "id": "kS0nAehsxW4Q"
800 | },
801 | "source": [
802 | "#### Return the distinct Sex values from the Profiles table\n"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": null,
808 | "metadata": {
809 | "hidden": true,
810 | "id": "B9LuPxiKxW4R"
811 | },
812 | "outputs": [],
813 | "source": [
814 | "%%read_sql\n",
815 | "SELECT DISTINCT Sex\n",
816 | "FROM Profiles"
817 | ]
818 | },
819 | {
820 | "cell_type": "markdown",
821 | "metadata": {
822 | "hidden": true,
823 | "id": "46-bSulWxW4R"
824 | },
825 | "source": [
826 | "#### Find what students are “LookingFor”\n"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": null,
832 | "metadata": {
833 | "hidden": true,
834 | "id": "cCYsU1QLxW4R"
835 | },
836 | "outputs": [],
837 | "source": [
838 | "%%read_sql\n",
839 | "SELECT DISTINCT LookingFor\n",
840 | "FROM LookingFor"
841 | ]
842 | },
843 | {
844 | "cell_type": "markdown",
845 | "metadata": {
846 | "hidden": true,
847 | "id": "TgSJZ15AxW4R"
848 | },
849 | "source": [
850 | "#### Find all possible “Relationship” statuses\n"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": null,
856 | "metadata": {
857 | "hidden": true,
858 | "id": "dxqC8E6WxW4S"
859 | },
860 | "outputs": [],
861 | "source": [
862 | "%%read_sql\n",
863 | "SELECT DISTINCT Status\n",
864 | "FROM Relationship"
865 | ]
866 | },
867 | {
868 | "cell_type": "markdown",
869 | "metadata": {
870 | "hidden": true,
871 | "id": "Qjv2YeSHxW4S"
872 | },
873 | "source": [
874 | "#### Find all possible Concentrations"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": null,
880 | "metadata": {
881 | "hidden": true,
882 | "id": "FIGzIj1-xW4S"
883 | },
884 | "outputs": [],
885 | "source": [
886 | "%%read_sql\n",
887 | "SELECT DISTINCT Concentration\n",
888 | "FROM Concentration"
889 | ]
890 | },
891 | {
892 | "cell_type": "markdown",
893 | "metadata": {
894 | "heading_collapsed": true,
895 | "id": "hjO4AvmvxW4S"
896 | },
897 | "source": [
898 | "## `ORDER BY` and `LIMIT`"
899 | ]
900 | },
901 | {
902 | "cell_type": "markdown",
903 | "metadata": {
904 | "heading_collapsed": true,
905 | "hidden": true,
906 | "id": "FbDgVSSKxW4S"
907 | },
908 | "source": [
909 | "### IMDb"
910 | ]
911 | },
912 | {
913 | "cell_type": "code",
914 | "execution_count": null,
915 | "metadata": {
916 | "hidden": true,
917 | "id": "_skmdZT2xW4T"
918 | },
919 | "outputs": [],
920 | "source": [
921 | "%%read_sql\n",
922 | "USE imdb"
923 | ]
924 | },
925 | {
926 | "cell_type": "markdown",
927 | "metadata": {
928 | "hidden": true,
929 | "id": "JYZy92-AxW4T"
930 | },
931 | "source": [
932 | "#### Find the top-10 ranked movies\n",
933 | "* Rank by “rank” first (descending order)\n",
934 | "* Break ties using “year”\n",
935 | "* Break remaining ties using “name”\n"
936 | ]
937 | },
938 | {
939 | "cell_type": "code",
940 | "execution_count": null,
941 | "metadata": {
942 | "hidden": true,
943 | "id": "3tjOeTblxW4T"
944 | },
945 | "outputs": [],
946 | "source": [
947 | "%%read_sql\n",
948 | "SELECT *\n",
949 | "FROM movies\n",
950 | "ORDER BY rating DESC, year, name\n",
951 | "LIMIT 10"
952 | ]
953 | },
954 | {
955 | "cell_type": "markdown",
956 | "metadata": {
957 | "hidden": true,
958 | "id": "6aRMgGMMxW4U"
959 | },
960 | "source": [
961 | "#### List all the distinct years of the movies, in descending order\n",
962 | "\n"
963 | ]
964 | },
965 | {
966 | "cell_type": "code",
967 | "execution_count": null,
968 | "metadata": {
969 | "hidden": true,
970 | "id": "hbtwEevuxW4U"
971 | },
972 | "outputs": [],
973 | "source": [
974 | "%%read_sql\n",
975 | "SELECT DISTINCT year\n",
976 | "FROM movies\n",
977 | "ORDER BY year DESC"
978 | ]
979 | },
980 | {
981 | "cell_type": "markdown",
982 | "metadata": {
983 | "heading_collapsed": true,
984 | "hidden": true,
985 | "id": "SxFeqckTxW4U"
986 | },
987 | "source": [
988 | "### Facebook"
989 | ]
990 | },
991 | {
992 | "cell_type": "code",
993 | "execution_count": null,
994 | "metadata": {
995 | "hidden": true,
996 | "id": "26MWAK52xW4U"
997 | },
998 | "outputs": [],
999 | "source": [
1000 | "%%read_sql\n",
1001 | "USE facebook"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "markdown",
1006 | "metadata": {
1007 | "hidden": true,
1008 | "id": "IZuvNF0_xW4U"
1009 | },
1010 | "source": [
1011 | "#### List the first 50 students that joined Facebook at NYU (use the “MemberSince” attribute)"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": null,
1017 | "metadata": {
1018 | "hidden": true,
1019 | "id": "lfYfRjJFxW4V"
1020 | },
1021 | "outputs": [],
1022 | "source": [
1023 | "%%read_sql\n",
1024 | "SELECT *\n",
1025 | "FROM Profiles\n",
1026 | "ORDER BY MemberSince \n",
1027 | "LIMIT 50"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "markdown",
1032 | "metadata": {
1033 | "hidden": true,
1034 | "id": "vDiaOtTExW4V"
1035 | },
1036 | "source": [
1037 | "#### List the 10 students that have not updated their profiles for the longest time (use the “LastUpdate” attribute) – what is the problem?"
1038 | ]
1039 | },
1040 | {
1041 | "cell_type": "code",
1042 | "execution_count": null,
1043 | "metadata": {
1044 | "hidden": true,
1045 | "id": "c7pv49jbxW4V"
1046 | },
1047 | "outputs": [],
1048 | "source": [
1049 | "%%read_sql\n",
1050 | "SELECT *\n",
1051 | "FROM Profiles\n",
1052 | "-- WHERE LastUpdate IS NOT NULL -- We need this filtering condition for the query to work as expected\n",
1053 | "ORDER BY LastUpdate\n",
1054 | "LIMIT 10"
1055 | ]
1056 | }
1057 | ],
1058 | "metadata": {
1059 | "colab": {
1060 | "name": "A-Navigation_Queries.ipynb",
1061 | "provenance": [],
1062 | "include_colab_link": true
1063 | },
1064 | "kernelspec": {
1065 | "display_name": "Python 3",
1066 | "language": "python",
1067 | "name": "python3"
1068 | },
1069 | "language_info": {
1070 | "codemirror_mode": {
1071 | "name": "ipython",
1072 | "version": 3
1073 | },
1074 | "file_extension": ".py",
1075 | "mimetype": "text/x-python",
1076 | "name": "python",
1077 | "nbconvert_exporter": "python",
1078 | "pygments_lexer": "ipython3",
1079 | "version": "3.8.2"
1080 | }
1081 | },
1082 | "nbformat": 4,
1083 | "nbformat_minor": 0
1084 | }
--------------------------------------------------------------------------------
/session2/C-Schemas.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C-Schemas.pdf
--------------------------------------------------------------------------------
/session2/C-Schemas.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C-Schemas.pptx
--------------------------------------------------------------------------------
/session2/C1-ERD_Facebook.mwb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C1-ERD_Facebook.mwb
--------------------------------------------------------------------------------
/session2/C2-ERD_IMDB.mwb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C2-ERD_IMDB.mwb
--------------------------------------------------------------------------------
/session2/assignment_selection_queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "view-in-github"
8 | },
9 | "source": [
10 | "
"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {
16 | "colab_type": "text",
17 | "id": "SkZqhwkf8NUg"
18 | },
19 | "source": [
20 | "# Session 2: Selection Queries Assignment\n",
21 | "\n",
22 | "\n",
23 | "In this segment we will connect to the *Music* database."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {
29 | "colab_type": "text",
30 | "id": "5u_6yLTDT6Kn"
31 | },
32 | "source": [
33 | "## Setup"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "colab": {},
41 | "colab_type": "code",
42 | "id": "O9o9NsaO8hMy"
43 | },
44 | "outputs": [],
45 | "source": [
46 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "colab": {},
54 | "colab_type": "code",
55 | "id": "EkIL-uRK8NUi"
56 | },
57 | "outputs": [],
58 | "source": [
59 | "# This code creates a connection to the database\n",
60 | "from sqlalchemy import create_engine\n",
61 | "\n",
62 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
63 | " host=\"db.ipeirotis.org\",\n",
64 | " user=\"student\",\n",
65 | " db=\"music\",\n",
66 | " password=\"dwdstudent2015\",\n",
67 | " encoding=\"utf8mb4\",\n",
68 | ")\n",
69 | "\n",
70 | "engine = create_engine(conn_string)\n",
71 | "con = engine.connect()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "colab": {},
79 | "colab_type": "code",
80 | "id": "z7muzQXTUFkU"
81 | },
82 | "outputs": [],
83 | "source": [
84 | "%reload_ext sql_magic"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "colab": {},
92 | "colab_type": "code",
93 | "id": "uHRIPxBvUGfC"
94 | },
95 | "outputs": [],
96 | "source": [
97 | "%config SQL.conn_name = 'engine'"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "This is an example of how you can write an SQL query in the notebook.\n",
105 | "You write your SQL query after the `%%read_sql` line"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "colab": {
113 | "base_uri": "https://localhost:8080/",
114 | "height": 390
115 | },
116 | "colab_type": "code",
117 | "id": "sWa1Uv_6X9zi",
118 | "outputId": "bbc44d81-02b5-4b8b-b776-42f378ed941a"
119 | },
120 | "outputs": [],
121 | "source": [
122 | "%%read_sql\n",
123 | "SELECT * \n",
124 | "FROM played"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {
130 | "colab_type": "text",
131 | "id": "H0hhloRRUJlV"
132 | },
133 | "source": [
134 | "## Question 1: Show all the tables that appear in the Music database"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "colab": {},
142 | "colab_type": "code",
143 | "id": "eL_CnyPRUSGI"
144 | },
145 | "outputs": [],
146 | "source": [
147 | "%%read_sql\n"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {
153 | "colab_type": "text",
154 | "id": "Hz_1yX-EUeBQ"
155 | },
156 | "source": [
157 | "## Question 2: Show the attributes available for each artist"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {
164 | "colab": {},
165 | "colab_type": "code",
166 | "id": "HXy0Ygy3Uf_m"
167 | },
168 | "outputs": [],
169 | "source": [
170 | "%%read_sql\n"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "colab_type": "text",
177 | "id": "UozymxRTW-wx"
178 | },
179 | "source": [
180 | "## Question 3: Show the attributes available for each album"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "colab": {},
188 | "colab_type": "code",
189 | "id": "ZwtqsRoGW-wz"
190 | },
191 | "outputs": [],
192 | "source": [
193 | "%%read_sql\n"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {
199 | "colab_type": "text",
200 | "id": "8NFSCApmXGZ8"
201 | },
202 | "source": [
203 | "## Question 4: Show the attributes available for each track"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {
210 | "colab": {},
211 | "colab_type": "code",
212 | "id": "bcr5iccDXGZ9"
213 | },
214 | "outputs": [],
215 | "source": [
216 | "%%read_sql\n"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {
222 | "colab_type": "text",
223 | "id": "nezZleqbUeI_"
224 | },
225 | "source": [
226 | "## Question 5: Show all the artists"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "colab": {},
234 | "colab_type": "code",
235 | "id": "rNfpSzT3UgrM"
236 | },
237 | "outputs": [],
238 | "source": [
239 | "%%read_sql\n"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {
245 | "colab_type": "text",
246 | "id": "_j2bCzADXODe"
247 | },
248 | "source": [
249 | "## Question 6: Show all the albums"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "colab": {},
257 | "colab_type": "code",
258 | "id": "o9h005rlXODf"
259 | },
260 | "outputs": [],
261 | "source": [
262 | "%%read_sql\n"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {
268 | "colab_type": "text",
269 | "id": "HF9cHpSDXaZd"
270 | },
271 | "source": [
272 | "## Question 7: Show all the tracks"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "colab": {},
280 | "colab_type": "code",
281 | "id": "mvLQdbdiXSmw"
282 | },
283 | "outputs": [],
284 | "source": [
285 | "%%read_sql\n"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {
291 | "colab_type": "text",
292 | "id": "ntj5f4n8U3dT"
293 | },
294 | "source": [
295 | "## Question 8: List all the names of the artists, without the artist ids, sorted alphabetically "
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {
302 | "colab": {},
303 | "colab_type": "code",
304 | "id": "mPCpyhbwU3dV"
305 | },
306 | "outputs": [],
307 | "source": [
308 | "%%read_sql\n"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {
314 | "colab_type": "text",
315 | "id": "23bSA8IUU1Og"
316 | },
317 | "source": [
318 | "## Question 9: Show all the album names and the corresponding artist id, but do not show the album_id. Rename the album_name attribute to album_title."
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {
325 | "colab": {},
326 | "colab_type": "code",
327 | "id": "hTD97qjaU1Oi"
328 | },
329 | "outputs": [],
330 | "source": [
331 | "%%read_sql\n"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {
337 | "colab_type": "text",
338 | "id": "eWgdmtHHURaS"
339 | },
340 | "source": [
341 | "## Question 10: List the 10 shortest tracks, in terms of playing time"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "colab": {},
349 | "colab_type": "code",
350 | "id": "ygI_N9gBUhWy"
351 | },
352 | "outputs": [],
353 | "source": [
354 | "%%read_sql\n"
355 | ]
356 | }
357 | ],
358 | "metadata": {
359 | "anaconda-cloud": {},
360 | "colab": {
361 | "collapsed_sections": [],
362 | "include_colab_link": true,
363 | "name": "Session2-Assignment",
364 | "provenance": []
365 | },
366 | "kernelspec": {
367 | "display_name": "Python 3",
368 | "language": "python",
369 | "name": "python3"
370 | },
371 | "language_info": {
372 | "codemirror_mode": {
373 | "name": "ipython",
374 | "version": 3
375 | },
376 | "file_extension": ".py",
377 | "mimetype": "text/x-python",
378 | "name": "python",
379 | "nbconvert_exporter": "python",
380 | "pygments_lexer": "ipython3",
381 | "version": "3.8.2"
382 | }
383 | },
384 | "nbformat": 4,
385 | "nbformat_minor": 1
386 | }
387 |
--------------------------------------------------------------------------------
/session3/B3-Filtering_Queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "QBeMTyL26ULI"
8 | },
9 | "source": [
10 | "# SQL: Filtering Queries"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {
16 | "colab_type": "text",
17 | "id": "w8oa_JjV7F_h"
18 | },
19 | "source": [
20 | "## Setup"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "colab_type": "text",
27 | "id": "0SuVR-1d7Jr4"
28 | },
29 | "source": [
30 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook."
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "colab": {},
38 | "colab_type": "code",
39 | "id": "I6vMQAK86ipS"
40 | },
41 | "outputs": [],
42 | "source": [
43 | "# !sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "colab": {},
51 | "colab_type": "code",
52 | "id": "vOuWkjz36ULS"
53 | },
54 | "outputs": [],
55 | "source": [
56 | "%reload_ext sql_magic"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "colab": {},
64 | "colab_type": "code",
65 | "id": "KE_bhLw16ULK"
66 | },
67 | "outputs": [],
68 | "source": [
69 | "from sqlalchemy import create_engine\n",
70 | "\n",
71 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/?charset=utf8\".format(\n",
72 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\", encoding=\"utf-8\"\n",
73 | ")\n",
74 | "engine = create_engine(conn_string)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "colab": {},
82 | "colab_type": "code",
83 | "id": "fivqywfX6ULV"
84 | },
85 | "outputs": [],
86 | "source": [
87 | "%config SQL.conn_name = 'engine'"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {
93 | "heading_collapsed": true
94 | },
95 | "source": [
96 | "## `WHERE`: Equality Conditions"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {
102 | "hidden": true
103 | },
104 | "source": [
105 | "### IMDb"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "hidden": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "%%read_sql\n",
117 | "USE imdb"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {
123 | "hidden": true
124 | },
125 | "source": [
126 | "#### Find the movie entry with id 64729."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "hidden": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "%%read_sql\n",
138 | "SELECT *\n",
139 | "FROM movies\n",
140 | "WHERE id = 64729"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {
146 | "hidden": true
147 | },
148 | "source": [
149 | "#### Find the movie entry with movie title ‘Pulp Fiction’"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "hidden": true
157 | },
158 | "outputs": [],
159 | "source": [
160 | "%%read_sql\n",
161 | "SELECT *\n",
162 | "FROM movies\n",
163 | "WHERE name = 'Pulp Fiction'"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {
169 | "hidden": true
170 | },
171 | "source": [
172 | "#### Find the id of the movie “Schindler's List”. (Attention to the quote)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "hidden": true
180 | },
181 | "outputs": [],
182 | "source": [
183 | "%%read_sql\n",
184 | "SELECT *\n",
185 | "FROM movies\n",
186 | "WHERE name = 'Schindler\\'s List'"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "hidden": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "%%read_sql\n",
198 | "SELECT *\n",
199 | "FROM movies\n",
200 | "WHERE name = \"Schindler's List\""
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {
206 | "heading_collapsed": true,
207 | "hidden": true
208 | },
209 | "source": [
210 | "#### List all the roles for the movie with id 290070. Sort them alphabetically\n"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "hidden": true
218 | },
219 | "outputs": [],
220 | "source": [
221 | "%%read_sql\n",
222 | "SELECT *\n",
223 | "FROM roles\n",
224 | "WHERE movie_id = 290070\n",
225 | "ORDER BY role"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {
231 | "heading_collapsed": true
232 | },
233 | "source": [
234 | "## `WHERE`: Boolean Operators"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {
240 | "heading_collapsed": true,
241 | "hidden": true
242 | },
243 | "source": [
244 | "### IMDb"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "hidden": true
252 | },
253 | "outputs": [],
254 | "source": [
255 | "%%read_sql\n",
256 | "USE imdb"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "hidden": true
263 | },
264 | "source": [
265 | "#### Fetch all info for actresses (female gender) whose first name is Skyler"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "hidden": true
273 | },
274 | "outputs": [],
275 | "source": [
276 | "%%read_sql\n",
277 | "SELECT *\n",
278 | "FROM actors\n",
279 | "WHERE gender = 'F' AND first_name = 'Skyler'"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {
285 | "hidden": true
286 | },
287 | "source": [
288 | "#### Fetch all info for the director Steven Spielberg\n"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {
295 | "hidden": true
296 | },
297 | "outputs": [],
298 | "source": [
299 | "%%read_sql\n",
300 | "SELECT *\n",
301 | "FROM directors\n",
302 | "WHERE first_name = 'Steven' AND last_name = 'Spielberg'"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {
308 | "hidden": true
309 | },
310 | "source": [
311 | "#### Fetch all info for the directors with last names Scorsese, Polanski, and Spielberg. Use the OR for your Boolean query.\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "hidden": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "%%read_sql\n",
323 | "SELECT *\n",
324 | "FROM directors\n",
325 | "WHERE last_name = 'Scorsese' OR last_name = 'Polanski' Or last_name = 'Spielberg'"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {
331 | "hidden": true
332 | },
333 | "source": [
334 | "#### Fetch all info for the directors Quentin Tarantino, Stanley Kubrick, and Orson Welles."
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": null,
340 | "metadata": {
341 | "hidden": true
342 | },
343 | "outputs": [],
344 | "source": [
345 | "%%read_sql\n",
346 | "SELECT *\n",
347 | "FROM directors\n",
348 | "WHERE (first_name = 'Quentin' AND last_name = 'Tarantino') OR \n",
349 | " (first_name = 'Stanley' AND last_name = 'Kubrick') OR \n",
350 | " (first_name = 'Orson' AND last_name = 'Welles')"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {
356 | "heading_collapsed": true
357 | },
358 | "source": [
359 | "## `WHERE`: Inequality Queries"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {
365 | "hidden": true
366 | },
367 | "source": [
368 | "### IMDb"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {
375 | "hidden": true
376 | },
377 | "outputs": [],
378 | "source": [
379 | "%%read_sql\n",
380 | "USE imdb"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {
386 | "hidden": true
387 | },
388 | "source": [
389 | "#### Find all information about movies that were released before 1895 (excl)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {
396 | "hidden": true
397 | },
398 | "outputs": [],
399 | "source": [
400 | "%%read_sql\n",
401 | "SELECT *\n",
402 | "FROM movies\n",
403 | "WHERE year<1895"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {
409 | "hidden": true
410 | },
411 | "source": [
412 | "#### Find all information about movies released between 1895 and 1898 (excl) \n",
413 | "\n",
414 | "Try both using Boolean operators and using the BETWEEN operator"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "hidden": true
422 | },
423 | "outputs": [],
424 | "source": [
425 | "%%read_sql\n",
426 | "SELECT *\n",
427 | "FROM movies\n",
428 | "WHERE year>1895 AND year<1898"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {
435 | "hidden": true
436 | },
437 | "outputs": [],
438 | "source": [
439 | "%%read_sql\n",
440 | "SELECT *\n",
441 | "FROM movies\n",
442 | "WHERE year BETWEEN 1896 AND 1897 -- notice that BETWEEN is inclusive on both sides"
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "metadata": {
448 | "hidden": true
449 | },
450 | "source": [
451 | "#### Find all information about movies that were released before 1895 and after 2006 (inclusive)"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "metadata": {
458 | "hidden": true
459 | },
460 | "outputs": [],
461 | "source": [
462 | "%%read_sql\n",
463 | "SELECT *\n",
464 | "FROM movies\n",
465 | "WHERE year<=1895 OR year>=2006"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "metadata": {
471 | "heading_collapsed": true
472 | },
473 | "source": [
474 | "## The `IN` operator"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {
480 | "heading_collapsed": true,
481 | "hidden": true
482 | },
483 | "source": [
484 | "### IMDb"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {
491 | "hidden": true
492 | },
493 | "outputs": [],
494 | "source": [
495 | "%%read_sql\n",
496 | "USE imdb"
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "metadata": {
502 | "hidden": true
503 | },
504 | "source": [
505 | "#### Fetch all info for the directors with last names Scorsese, Polanski, and Spielberg. Use `IN` for your Boolean query."
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "metadata": {
512 | "hidden": true
513 | },
514 | "outputs": [],
515 | "source": [
516 | "%%read_sql\n",
517 | "SELECT * \n",
518 | "FROM directors\n",
519 | "WHERE last_name IN ( 'Scorsese', 'Spielberg', 'Polanski' );"
520 | ]
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "metadata": {
525 | "hidden": true
526 | },
527 | "source": [
528 | "#### Fetch all info for the directors Quentin Tarantino, Stanley Kubrick, and Orson Welles. Use `IN` for your Boolean query.\n"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": null,
534 | "metadata": {
535 | "hidden": true
536 | },
537 | "outputs": [],
538 | "source": [
539 | "%%read_sql\n",
540 | "SELECT * FROM directors\n",
541 | "WHERE (first_name, last_name) IN (\n",
542 | " ('Quentin', 'Tarantino'), \n",
543 | " ('Stanley', 'Kubrick'), \n",
544 | " ('Orson', 'Welles') \n",
545 | ")\n"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {
551 | "heading_collapsed": true
552 | },
553 | "source": [
554 | "## The `LIKE` operator for approximate queries"
555 | ]
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "metadata": {
560 | "heading_collapsed": true,
561 | "hidden": true
562 | },
563 | "source": [
564 | "### IMDb"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {
571 | "hidden": true
572 | },
573 | "outputs": [],
574 | "source": [
575 | "%%read_sql\n",
576 | "USE imdb"
577 | ]
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {
582 | "hidden": true
583 | },
584 | "source": [
585 | "#### Find the entry for Alfred Hitchcock\n",
586 | "\n",
587 | "Hint: Use an approximation for his first name\n"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": null,
593 | "metadata": {
594 | "hidden": true
595 | },
596 | "outputs": [],
597 | "source": [
598 | "%%read_sql\n",
599 | "SELECT *\n",
600 | "FROM directors\n",
601 | "WHERE last_name = 'Hitchcock' AND first_name LIKE 'A%%' -- The double %% is only necessary when writing SQL \n",
602 | " -- within Jupyter notebooks. "
603 | ]
604 | },
605 | {
606 | "cell_type": "markdown",
607 | "metadata": {
608 | "hidden": true
609 | },
610 | "source": [
611 | "#### Find the Godfather movies, released in 1972, 1974, and 1990"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": null,
617 | "metadata": {
618 | "hidden": true
619 | },
620 | "outputs": [],
621 | "source": [
622 | "%%read_sql\n",
623 | "SELECT *\n",
624 | "FROM movies\n",
625 | "WHERE name LIKE 'Godfather%%' AND -- The double %% is only necessary when writing SQL \n",
626 | " year IN (1972, 1974, 1990) -- within Jupyter notebooks. "
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "metadata": {
633 | "hidden": true
634 | },
635 | "outputs": [],
636 | "source": []
637 | }
638 | ],
639 | "metadata": {
640 | "colab": {
641 | "name": "A-Navigation_Queries.ipynb",
642 | "provenance": []
643 | },
644 | "kernelspec": {
645 | "display_name": "Python 3",
646 | "language": "python",
647 | "name": "python3"
648 | },
649 | "language_info": {
650 | "codemirror_mode": {
651 | "name": "ipython",
652 | "version": 3
653 | },
654 | "file_extension": ".py",
655 | "mimetype": "text/x-python",
656 | "name": "python",
657 | "nbconvert_exporter": "python",
658 | "pygments_lexer": "ipython3",
659 | "version": "3.8.5"
660 | }
661 | },
662 | "nbformat": 4,
663 | "nbformat_minor": 1
664 | }
665 |
--------------------------------------------------------------------------------
/session3/B3-SQL_Filtering.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session3/B3-SQL_Filtering.pptx
--------------------------------------------------------------------------------
/session3/assignment_filtering_queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Session 3 Assignment",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.6"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 3: Filtering Queries Assignment\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Music* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
69 | ],
70 | "execution_count": null,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "EkIL-uRK8NUi"
77 | },
78 | "source": [
79 | "# This code creates a connection to the database\n",
80 | "from sqlalchemy import create_engine\n",
81 | "\n",
82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
83 | " host=\"db.ipeirotis.org\",\n",
84 | " user=\"student\",\n",
85 | " db=\"music\",\n",
86 | " password=\"dwdstudent2015\",\n",
87 | " encoding=\"utf8mb4\",\n",
88 | ")\n",
89 | "\n",
90 | "engine = create_engine(conn_string)\n",
91 | "con = engine.connect()"
92 | ],
93 | "execution_count": null,
94 | "outputs": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "metadata": {
99 | "id": "z7muzQXTUFkU"
100 | },
101 | "source": [
102 | "%reload_ext sql_magic"
103 | ],
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "metadata": {
110 | "id": "uHRIPxBvUGfC"
111 | },
112 | "source": [
113 | "%config SQL.conn_name = 'engine'"
114 | ],
115 | "execution_count": null,
116 | "outputs": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "UJmBOzZyLnTI"
122 | },
123 | "source": [
124 | "This is an example of how you can write an SQL query in the notebook. You write your SQL query after the `%%read_sql` line"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "metadata": {
130 | "id": "sWa1Uv_6X9zi"
131 | },
132 | "source": [
133 | "%%read_sql\n",
134 | "SELECT * \n",
135 | "FROM played"
136 | ],
137 | "execution_count": null,
138 | "outputs": []
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {
143 | "id": "H0hhloRRUJlV"
144 | },
145 | "source": [
146 | "## Question 1: Show the entry for the artist with id equal to 5."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "metadata": {
152 | "id": "eL_CnyPRUSGI"
153 | },
154 | "source": [
155 | "%%read_sql\n"
156 | ],
157 | "execution_count": null,
158 | "outputs": []
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {
163 | "id": "Hz_1yX-EUeBQ"
164 | },
165 | "source": [
166 | "## Question 2: Show the entry for the artist named `The Rolling Stones` "
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "metadata": {
172 | "id": "HXy0Ygy3Uf_m"
173 | },
174 | "source": [
175 | "%%read_sql\n"
176 | ],
177 | "execution_count": null,
178 | "outputs": []
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {
183 | "id": "UozymxRTW-wx"
184 | },
185 | "source": [
186 | "## Question 3: Using the `id` of Rolling Stones from Question 2, list all the albums of `The Rolling Stones`"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "metadata": {
192 | "id": "ZwtqsRoGW-wz"
193 | },
194 | "source": [
195 | "%%read_sql\n"
196 | ],
197 | "execution_count": null,
198 | "outputs": []
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {
203 | "id": "8NFSCApmXGZ8"
204 | },
205 | "source": [
206 | "## Question 4: Find the tracks for the artist with id `3`, from the artist's album with id `2`."
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "metadata": {
212 | "id": "bcr5iccDXGZ9"
213 | },
214 | "source": [
215 | "%%read_sql"
216 | ],
217 | "execution_count": null,
218 | "outputs": []
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "id": "nezZleqbUeI_"
224 | },
225 | "source": [
226 | "## Question 5: Find the tracts with names that earlier alphabetically than (is less than) `M`.\n",
227 | "\n",
228 | "Note that inequality queries can be used with text and not only with numbers."
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "metadata": {
234 | "id": "rNfpSzT3UgrM"
235 | },
236 | "source": [
237 | "%%read_sql\n"
238 | ],
239 | "execution_count": null,
240 | "outputs": []
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {
245 | "id": "_j2bCzADXODe"
246 | },
247 | "source": [
248 | "## Question 6: Find all albums with a title that begins with a character greater than `E` (*not* inclusive of albums that start with `E`) but less than `S` (again, *not* inclusive of albums that start with `S`)."
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "metadata": {
254 | "id": "o9h005rlXODf"
255 | },
256 | "source": [
257 | "%%read_sql\n"
258 | ],
259 | "execution_count": null,
260 | "outputs": []
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {
265 | "id": "HF9cHpSDXaZd"
266 | },
267 | "source": [
268 | "## Question 7: List the 10 longest tracks in terms of time length."
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "metadata": {
274 | "id": "mvLQdbdiXSmw"
275 | },
276 | "source": [
277 | "%%read_sql\n"
278 | ],
279 | "execution_count": null,
280 | "outputs": []
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {
285 | "id": "ntj5f4n8U3dT"
286 | },
287 | "source": [
288 | "## Question 8: List all the tracts for the artists with ids 1, 3, and 5. Show two variations of the query. One query using the `OR` boolean condition, and one query using the `IN` operation."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "metadata": {
294 | "id": "mPCpyhbwU3dV"
295 | },
296 | "source": [
297 | "%%read_sql\n",
298 | "# Using the OR"
299 | ],
300 | "execution_count": null,
301 | "outputs": []
302 | },
303 | {
304 | "cell_type": "code",
305 | "metadata": {
306 | "id": "kGPCoB8NgZZP"
307 | },
308 | "source": [
309 | "%%read_sql\n",
310 | "# Using the IN"
311 | ],
312 | "execution_count": null,
313 | "outputs": []
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {
318 | "id": "23bSA8IUU1Og"
319 | },
320 | "source": [
321 | "## Question 9: Find all the tracks that include the word \"Love\" anywhere in the title. It is fine to include tracks where `love` is part of a bigger word (e.g, `lovebird`).\n",
322 | "\n",
323 | "A bit of a complication: In the Colab environment, you need to enter the `%` character twice in a `LIKE` string instead of just once in MySQL Workbench."
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "metadata": {
329 | "id": "hTD97qjaU1Oi"
330 | },
331 | "source": [
332 | "%%read_sql\n"
333 | ],
334 | "execution_count": null,
335 | "outputs": []
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {
340 | "id": "eWgdmtHHURaS"
341 | },
342 | "source": [
343 | "## Question 10: List all the tracks by artist with id `1` starting with the letter `L` and order them in descending order of time length."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "metadata": {
349 | "id": "ygI_N9gBUhWy"
350 | },
351 | "source": [
352 | "%%read_sql\n"
353 | ],
354 | "execution_count": null,
355 | "outputs": []
356 | }
357 | ]
358 | }
--------------------------------------------------------------------------------
/session3/practice_questions_filtering.md:
--------------------------------------------------------------------------------
1 | # Filtering practice queries.
2 |
3 | ## Restaurants Database
4 |
5 | 1. Output the names of all the Thai restaurants stored in your DB.
6 | 2. Output the names of all the Brooklyn restaurants stored in your DB there were established prior
7 | to 2012.
8 | 3. Show the list of restaurants together with their cuisine and location with the average price
9 | higher than $120.00
10 | 4. Show the names of all the food critics who work in NYT or NYP together with the corresponding
11 | affiliation.
12 | 5. Output the first and last names of the reviewers who are freelancers and to not have any
13 | affiliation.
14 | 6. Output the first and last names of authors whose last name starts with the letter ‘A’.
15 | 7. Show all the restaurant names whose length is less than 10 characters.
16 | 8. Output all the records from the Rating table for the reviews made by the critic with cID 202.
17 | 9. Output all the records from the Rating table for the reviews made by the critic with cID 210.
18 | 10. Output all the records from the Rating table for the reviews with the starRating greater than
19 | 11. Output the names of all the Italian Manhattan restaurants.
20 | 12. Output all the names of all the Bronx restaurants with the average prices greater than $100.00
21 |
22 | ## Facebook Database
23 |
24 | 1. Get the names and sex of all liberal students
25 | 2. Get the names, sex, and political views of liberal and very liberal students
26 | 3. Find all students who live in “Weinstein Hall”, independent of their room number
27 | 4. Find all students with first name “Richard”
28 | 5. Find all students with first names starting with P and last names starting with I (e.g. Panos Ipeirotis)
29 |
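30 | ## Example
31 | 
32 | As a sketch of the filtering pattern used throughout the Restaurants questions above, question 1 can be answered with a simple `WHERE` clause. Table and column names (`Restaurant`, `restName`, `cuisine`) are taken from the solution notebooks elsewhere in this repository; adjust them if your own schema differs.
33 | 
34 | ```sql
35 | -- Question 1: names of all the Thai restaurants
36 | SELECT restName
37 | FROM Restaurant
38 | WHERE cuisine = 'Thai';
39 | ```
40 | 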
--------------------------------------------------------------------------------
/session4/C-SQL_Joins.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session4/C-SQL_Joins.pptx
--------------------------------------------------------------------------------
/session4/README.md:
--------------------------------------------------------------------------------
1 | A nice explanation of inner vs outer joins:
2 | https://www.stratascratch.com/blog/types-of-pandas-joins-and-how-to-use-them-in-python/
3 |
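4 | As a quick SQL-side sketch of the same idea, using the `Restaurant` and `Rating` tables from the restaurants practice database (table and column names follow the session 5 solution notebooks; adjust if your schema differs): an `INNER JOIN` keeps only restaurants that have at least one rating, while a `LEFT JOIN` also keeps the restaurants with no ratings, filling the `Rating` columns with NULL.
5 | 
6 | ```sql
7 | -- Inner join: only restaurants that have at least one rating
8 | SELECT R.restName, T.starRating
9 | FROM Restaurant R INNER JOIN Rating T ON R.restCode = T.restCode;
10 | 
11 | -- Left (outer) join: all restaurants; starRating is NULL when there is no rating
12 | SELECT R.restName, T.starRating
13 | FROM Restaurant R LEFT JOIN Rating T ON R.restCode = T.restCode;
14 | ```
15 | 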
--------------------------------------------------------------------------------
/session4/assignment_join_queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Session 4 Assignment",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-pysethon",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.6"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 4: Join Queries Assignment\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Music* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
69 | ],
70 | "execution_count": null,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "EkIL-uRK8NUi"
77 | },
78 | "source": [
79 | "# This code creates a connection to the database\n",
80 | "from sqlalchemy import create_engine\n",
81 | "\n",
82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
83 | " host=\"db.ipeirotis.org\",\n",
84 | " user=\"student\",\n",
85 | " db=\"music\",\n",
86 | " password=\"dwdstudent2015\",\n",
87 | " encoding=\"utf8mb4\",\n",
88 | ")\n",
89 | "\n",
90 | "engine = create_engine(conn_string)\n",
91 | "con = engine.connect()"
92 | ],
93 | "execution_count": null,
94 | "outputs": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "metadata": {
99 | "id": "z7muzQXTUFkU"
100 | },
101 | "source": [
102 | "%reload_ext sql_magic"
103 | ],
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "metadata": {
110 | "id": "uHRIPxBvUGfC"
111 | },
112 | "source": [
113 | "%config SQL.conn_name = 'engine'"
114 | ],
115 | "execution_count": null,
116 | "outputs": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "-7ZZAPj1yfrZ"
122 | },
123 | "source": [
124 | "This is an example of how you can write an SQL query in the notebook.\n",
125 | "You write your SQL query after the `%%read_sql` line."
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "metadata": {
131 | "id": "sWa1Uv_6X9zi"
132 | },
133 | "source": [
134 | "%%read_sql\n",
135 | "SELECT * \n",
136 | "FROM played"
137 | ],
138 | "execution_count": null,
139 | "outputs": []
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {
144 | "id": "H0hhloRRUJlV"
145 | },
146 | "source": [
147 | "## Question 1: List all the album names by the band `New Order`"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "metadata": {
153 | "id": "eL_CnyPRUSGI"
154 | },
155 | "source": [
156 | "%%read_sql"
157 | ],
158 | "execution_count": null,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {
164 | "id": "Hz_1yX-EUeBQ"
165 | },
166 | "source": [
167 | "## Question 2: List the tracks for the album `Second Coming`"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "metadata": {
173 | "id": "HXy0Ygy3Uf_m"
174 | },
175 | "source": [
176 | "%%read_sql\n"
177 | ],
178 | "execution_count": null,
179 | "outputs": []
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {
184 | "id": "UozymxRTW-wx"
185 | },
186 | "source": [
187 | "## Question 3: List all the track names, the corresponding album name, and the corresponding artist name"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "metadata": {
193 | "id": "ZwtqsRoGW-wz"
194 | },
195 | "source": [
196 | "%%read_sql"
197 | ],
198 | "execution_count": null,
199 | "outputs": []
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "id": "8NFSCApmXGZ8"
205 | },
206 | "source": [
207 | "## Question 4: List all all the tracks by the artist `The Stone Roses` and rank them by time length, from shortest to longest"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "metadata": {
213 | "id": "bcr5iccDXGZ9"
214 | },
215 | "source": [
216 | "%%read_sql\n"
217 | ],
218 | "execution_count": null,
219 | "outputs": []
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {
224 | "id": "nezZleqbUeI_"
225 | },
226 | "source": [
227 | "## Question 5: The table `played` contains the tracks that the user listened to, and the time that they listened to the songs. List the _distinct_ names of the artists that the user has listened to."
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "metadata": {
233 | "id": "rNfpSzT3UgrM"
234 | },
235 | "source": [
236 | "%%read_sql\n"
237 | ],
238 | "execution_count": null,
239 | "outputs": []
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {
244 | "id": "_j2bCzADXODe"
245 | },
246 | "source": [
247 | "## Question 6: List the name of the artists and albums that have tracks with time length more than 10 minutes."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "metadata": {
253 | "id": "o9h005rlXODf"
254 | },
255 | "source": [
256 | "%%read_sql"
257 | ],
258 | "execution_count": null,
259 | "outputs": []
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {
264 | "id": "HF9cHpSDXaZd"
265 | },
266 | "source": [
267 | "## Question 7: List the album name, the artist name, and the track names, where both the name of the album and the name of the track contain the string `love` (it is fine if it is part of a longer word)."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "metadata": {
273 | "id": "mvLQdbdiXSmw"
274 | },
275 | "source": [
276 | "%%read_sql"
277 | ],
278 | "execution_count": null,
279 | "outputs": []
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {
284 | "id": "ntj5f4n8U3dT"
285 | },
286 | "source": [
287 | "## Question 8: The table played contains the tracks that the user listened to, and the time that they listened to the songs. List the tracks that are in the database, but which the user has never listened to. (Note: Need an outer join)."
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "metadata": {
293 | "id": "mPCpyhbwU3dV"
294 | },
295 | "source": [
296 | "%%read_sql\n"
297 | ],
298 | "execution_count": null,
299 | "outputs": []
300 | }
301 | ]
302 | }
--------------------------------------------------------------------------------
/session4/practice_questions_joins_restaurants.md:
--------------------------------------------------------------------------------
1 | # JOIN practice queries.
2 |
3 | ## Restaurants Database
4 |
5 |
6 | 1. Output the names of the restaurants together with the comments written for these restaurants:
7 |
8 | a. Your output should include only those restaurants for which reviews were submitted; do
9 | not output empty comments (NULL values);
10 |
11 | b. Your output should contain the names of all the restaurants from your database; if some
12 | restaurants are not reviewed or their comments are empty, these restaurants should
13 | still be included in your output (with NULL values for the comment attribute);
14 |
15 | c. Your output should contain the names of all the restaurants from your database; if some
16 | restaurants are not reviewed or their comments are empty, these restaurants should
17 | still be included in your output (with NULL values for the comment attribute); make sure
18 | that the output contains only distinct records (e.g., if a restaurant has two reviews with
19 | empty comments, there should be only one record in the output relation corresponding
20 | to this situation).
21 |
22 | 2. For every review stored in the database output the review id, first and last names of the critic
23 | together with the comments left by this critic for each of the reviews.
24 |
25 | 3. Output the critic’s first and last names, restaurant name, and the star rating assigned by the
26 | critic to the restaurant for all the reviews where the star rating is greater or equal to 3.
27 |
28 | 4. For all the Manhattan restaurants output the following information regarding all the reviews
29 | submitted for these restaurants: the name of the restaurant, its cuisine, the name of the food
30 | critic, food critic’s affiliation, star rating assigned by the critic, date the review was written and
31 | the comments:
32 |
33 | a. Include in the output only those restaurants for which there are reviews in your DB.
34 |
35 | b. Include in your output all the information about all the Manhattan restaurants. If there
36 | are Manhattan restaurants for which there are no reviews then the fields for the critic
37 | and review should be null.
38 |
39 | 5. Output the rating code, the critic’s name who submitted this rating, the borough of the
40 | restaurant for which the rating was given, and the star rating for all the ratings submitted after
41 | January 1, 2010.
42 |
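43 | As a sketch of the join pattern these questions rely on, here is one way to approach question 3 (critic names, restaurant name, and star ratings of at least 3). Table and column names (`Critic`, `Rating`, `Restaurant`, `firstN`, `lastN`, `restName`, `starRating`, `cID`, `restCode`) follow the session 5 solution notebooks; adjust them if your schema differs.
44 | 
45 | ```sql
46 | -- Question 3: critic name, restaurant name, and star rating for ratings >= 3
47 | SELECT C.firstN, C.lastN, T.restName, R.starRating
48 | FROM Critic C
49 |     INNER JOIN Rating R ON R.cID = C.cID
50 |     INNER JOIN Restaurant T ON T.restCode = R.restCode
51 | WHERE R.starRating >= 3;
52 | ```
53 | 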
--------------------------------------------------------------------------------
/session5/D-SQL_Aggregation_Queries.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session5/D-SQL_Aggregation_Queries.pptx
--------------------------------------------------------------------------------
/session5/assignment_aggregate_queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Session 5: Aggregate Queries Assignment",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.6"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 5: Aggregate Queries Assignment\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Music* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "\n",
69 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
70 | ],
71 | "execution_count": null,
72 | "outputs": []
73 | },
74 | {
75 | "cell_type": "code",
76 | "metadata": {
77 | "id": "EkIL-uRK8NUi"
78 | },
79 | "source": [
80 | "# This code creates a connection to the database\n",
81 | "from sqlalchemy import create_engine\n",
82 | "\n",
83 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
84 | " host=\"db.ipeirotis.org\",\n",
85 | " user=\"student\",\n",
86 | " db=\"music\",\n",
87 | " password=\"dwdstudent2015\",\n",
88 | " encoding=\"utf8mb4\",\n",
89 | ")\n",
90 | "\n",
91 | "engine = create_engine(conn_string)\n",
92 | "con = engine.connect()"
93 | ],
94 | "execution_count": null,
95 | "outputs": []
96 | },
97 | {
98 | "cell_type": "code",
99 | "metadata": {
100 | "id": "z7muzQXTUFkU"
101 | },
102 | "source": [
103 | "%reload_ext sql_magic"
104 | ],
105 | "execution_count": null,
106 | "outputs": []
107 | },
108 | {
109 | "cell_type": "code",
110 | "metadata": {
111 | "id": "uHRIPxBvUGfC"
112 | },
113 | "source": [
114 | "%config SQL.conn_name = 'engine'"
115 | ],
116 | "execution_count": null,
117 | "outputs": []
118 | },
119 | {
120 | "cell_type": "code",
121 | "metadata": {
122 | "id": "sWa1Uv_6X9zi"
123 | },
124 | "source": [
125 | "%%read_sql\n",
126 | "SELECT * \n",
127 | "FROM played"
128 | ],
129 | "execution_count": null,
130 | "outputs": []
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {
135 | "id": "pJIqxXsU3Ldx"
136 | },
137 | "source": [
138 | "## **ATTENTION: Remember that the primary key for an album consists of both the artist id and the album id. Similarly, the primary key for the tracks consists of the track id, the album id, and the artist id.**"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {
144 | "id": "H0hhloRRUJlV"
145 | },
146 | "source": [
147 | "## Question 1: Count the number of artists in the database"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "metadata": {
153 | "id": "eL_CnyPRUSGI"
154 | },
155 | "source": [
156 | "%%read_sql\n"
157 | ],
158 | "execution_count": null,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {
164 | "id": "McZQXcp__OCu"
165 | },
166 | "source": [
167 | "## Question 2: Count the number of tracks in the database"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "metadata": {
173 | "id": "W7GayX2v_Nl8"
174 | },
175 | "source": [
176 | ""
177 | ],
178 | "execution_count": null,
179 | "outputs": []
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {
184 | "id": "Hz_1yX-EUeBQ"
185 | },
186 | "source": [
187 | "## Question 3: Show the average and standard deviation of the track length"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "metadata": {
193 | "id": "HXy0Ygy3Uf_m"
194 | },
195 | "source": [
196 | "%%read_sql\n"
197 | ],
198 | "execution_count": null,
199 | "outputs": []
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "id": "UozymxRTW-wx"
205 | },
206 | "source": [
207 | "## Question 4: Show the earliest and latest date that the user has played a song"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "metadata": {
213 | "id": "ZwtqsRoGW-wz"
214 | },
215 | "source": [
216 | "%%read_sql\n"
217 | ],
218 | "execution_count": null,
219 | "outputs": []
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {
224 | "id": "8NFSCApmXGZ8"
225 | },
226 | "source": [
227 | "## Question 5: For each artist id, count the number of albums in the database. You only need to show the id of the artist, not the name of the artist."
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "metadata": {
233 | "id": "bcr5iccDXGZ9"
234 | },
235 | "source": [
236 | "%%read_sql\n"
237 | ],
238 | "execution_count": null,
239 | "outputs": []
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {
244 | "id": "nezZleqbUeI_"
245 | },
246 | "source": [
247 | "## Question 6: For each album id, count the number of tracks for that album. You only need to show the id of the album, not its name."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "metadata": {
253 | "id": "rNfpSzT3UgrM"
254 | },
255 | "source": [
256 | "%%read_sql\n"
257 | ],
258 | "execution_count": null,
259 | "outputs": []
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {
264 | "id": "_j2bCzADXODe"
265 | },
266 | "source": [
267 | "## Question 7: For each album id, show the total length of all the tracks in the album."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "metadata": {
273 | "id": "o9h005rlXODf"
274 | },
275 | "source": [
276 | "%%read_sql\n"
277 | ],
278 | "execution_count": null,
279 | "outputs": []
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {
284 | "id": "HF9cHpSDXaZd"
285 | },
286 | "source": [
287 | "## Question 8: List all the album id's, where the total album length (across all the album's tracks) is longer than 60 minutes."
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "metadata": {
293 | "id": "mvLQdbdiXSmw"
294 | },
295 | "source": [
296 | "%%read_sql"
297 | ],
298 | "execution_count": null,
299 | "outputs": []
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {
304 | "id": "ntj5f4n8U3dT"
305 | },
306 | "source": [
307 | "## Question 9: Find all track names that appear in more than one album, and show how many artists used the same track name (if any)."
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "metadata": {
313 | "id": "mPCpyhbwU3dV"
314 | },
315 | "source": [
316 | "%%read_sql"
317 | ],
318 | "execution_count": null,
319 | "outputs": []
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "id": "23bSA8IUU1Og"
325 | },
326 | "source": [
327 | "## Question 10: For each artist id, list the first and the last time that a user listened to a song by this artist."
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "metadata": {
333 | "id": "hTD97qjaU1Oi"
334 | },
335 | "source": [
336 | "%%read_sql "
337 | ],
338 | "execution_count": null,
339 | "outputs": []
340 | }
341 | ]
342 | }
--------------------------------------------------------------------------------
/session5/practice_queries_aggregation.md:
--------------------------------------------------------------------------------
1 | # Aggregation, practice queries
2 |
3 | ## Restaurant Database
4 |
5 | GROUP BY, aggregation functions (MAX, MIN, COUNT, etc.)
6 | Review also HAVING (a condition applied to the GROUP BY groups) and ORDER BY.
7 |
8 | 1. How many Manhattan restaurants are listed in your database;
9 | 2. Output the affiliation (or '-' for freelancers) and how many critics are associated with this affiliation;
10 | 3. Output the critic id together with the maximal star rating ever issued by this critic;
11 | 4. Output the critic id and the restaurant code together with the maximal star rating ever issued by this critic for this restaurant;
12 | 5. For every borough, cuisine pair output the minimal price and order the output by borough in the ascending order (consider only the restaurants outside of Manhattan);
13 | 6. For every borough, cuisine pair output the minimal price where the minimal price is greater than $100;
14 | 7. For every borough, cuisine pair output the minimal price where the minimal price is greater than $100 and order the output by the price value in the descending order.
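8 | 
9 | As a reminder of how these clauses combine, the skeleton below answers question 7 and mirrors the solution notebook in this folder: `GROUP BY` forms the (borough, cuisine) groups, the aggregate computes the minimal price per group, `HAVING` filters the groups, and `ORDER BY` sorts the result. Column names (`borough`, `cuisine`, `avgPrice`) are the ones used in the solution notebook.
10 | 
11 | ```sql
12 | -- Question 7: minimal price per (borough, cuisine), only where it exceeds $100, sorted descending
13 | SELECT borough, cuisine, MIN(avgPrice) AS minPrice
14 | FROM Restaurant
15 | GROUP BY borough, cuisine
16 | HAVING minPrice > 100
17 | ORDER BY minPrice DESC;
18 | ```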
--------------------------------------------------------------------------------
/session5/practice_queries_aggregation_solution.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Practice Aggregate Queries: Solutions",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.8.2"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 5: Practice Aggregate Queries: Solutions\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Restaurants* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
69 | ],
70 | "execution_count": null,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "EkIL-uRK8NUi"
77 | },
78 | "source": [
79 | "# This code creates a connection to the database\n",
80 | "from sqlalchemy import create_engine\n",
81 | "\n",
82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
83 | " host=\"db.ipeirotis.org\",\n",
84 | " user=\"student\",\n",
85 | " db=\"restaurants\",\n",
86 | " password=\"dwdstudent2015\",\n",
87 | " encoding=\"utf8mb4\",\n",
88 | ")\n",
89 | "\n",
90 | "engine = create_engine(conn_string)\n",
91 | "con = engine.connect()"
92 | ],
93 | "execution_count": null,
94 | "outputs": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "metadata": {
99 | "id": "z7muzQXTUFkU"
100 | },
101 | "source": [
102 | "%reload_ext sql_magic"
103 | ],
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "metadata": {
110 | "id": "uHRIPxBvUGfC"
111 | },
112 | "source": [
113 | "%config SQL.conn_name = 'engine'"
114 | ],
115 | "execution_count": null,
116 | "outputs": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "H0hhloRRUJlV"
122 | },
123 | "source": [
124 | "## Question 1: How many Manhattan restaurants are listed in your database"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "metadata": {
130 | "id": "eL_CnyPRUSGI"
131 | },
132 | "source": [
133 | "%%read_sql\n",
134 | "SELECT borough, COUNT(*) AS num_restaurants\n",
135 | "FROM Restaurant\n",
136 | "WHERE borough = 'Manhattan'\n",
137 | "GROUP BY borough"
138 | ],
139 | "execution_count": null,
140 | "outputs": []
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {
145 | "id": "McZQXcp__OCu"
146 | },
147 | "source": [
148 | "## Question 2: Output the affiliation (or '-' for freelancers) and how many critics are associated with this affiliation;"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "metadata": {
154 | "id": "W7GayX2v_Nl8"
155 | },
156 | "source": [
157 | "%%read_sql\n",
158 | "SELECT affiliation, COUNT(*) AS num_critics\n",
159 | "FROM Critic\n",
160 | "GROUP BY affiliation"
161 | ],
162 | "execution_count": null,
163 | "outputs": []
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {
168 | "id": "uhU3fLp9RoiR"
169 | },
170 | "source": [
171 | "If we want to replace the NULL value in affiliation with `-`, we use the `COALESCE` command, that replaces NULL with the value that we pass as parameter."
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "metadata": {
177 | "id": "QSyuhaZZRY-O"
178 | },
179 | "source": [
180 | "%%read_sql\n",
181 | "SELECT COALESCE(affiliation, '-') AS affiliation, COUNT(*) AS num_critics\n",
182 | "FROM Critic\n",
183 | "GROUP BY affiliation"
184 | ],
185 | "execution_count": null,
186 | "outputs": []
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "id": "Hz_1yX-EUeBQ"
192 | },
193 | "source": [
194 | "## Question 3: Output the critic id together with the maximal star rating ever issued by this critic;"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "metadata": {
200 | "id": "HXy0Ygy3Uf_m"
201 | },
202 | "source": [
203 | "%%read_sql\n",
204 | "SELECT cID, MAX(starRating) AS maxRating\n",
205 | "FROM Rating\n",
206 | "GROUP BY cID"
207 | ],
208 | "execution_count": null,
209 | "outputs": []
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {
214 | "id": "UozymxRTW-wx"
215 | },
216 | "source": [
217 | "## Question 4: Output the critic id and the restaurant code together with the maximal star rating ever issued by this critic for this restaurant;"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "metadata": {
223 | "id": "ZwtqsRoGW-wz"
224 | },
225 | "source": [
226 | "%%read_sql\n",
227 | "SELECT cID, restCode, MAX(starRating) AS maxRating\n",
228 | "FROM Rating\n",
229 | "GROUP BY cID, restCode"
230 | ],
231 | "execution_count": null,
232 | "outputs": []
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {
237 | "id": "8NFSCApmXGZ8"
238 | },
239 | "source": [
240 | "## Question 5: For every borough, cuisine pair output the minimal price and order the output by borough in the ascending order (consider only the restaurants outside of Manhattan);"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "metadata": {
246 | "id": "bcr5iccDXGZ9"
247 | },
248 | "source": [
249 | "%%read_sql\n",
250 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n",
251 | "FROM Restaurant\n",
252 | "WHERE borough <> \"Manhattan\"\n",
253 | "GROUP BY borough, cuisine"
254 | ],
255 | "execution_count": null,
256 | "outputs": []
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {
261 | "id": "nezZleqbUeI_"
262 | },
263 | "source": [
264 | "## Question 6: For every borough, cuisine pair output the minimal price where the minimal price is greater than 100"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "metadata": {
270 | "id": "rNfpSzT3UgrM"
271 | },
272 | "source": [
273 | "%%read_sql\n",
274 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n",
275 | "FROM Restaurant\n",
276 | "GROUP BY borough, cuisine\n",
277 | "HAVING minPrice>100"
278 | ],
279 | "execution_count": null,
280 | "outputs": []
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {
285 | "id": "_j2bCzADXODe"
286 | },
287 | "source": [
288 | "## Question 7: For every borough, cuisine pair output the minimal price where the minimal price is greater than 100 and order the output by the price value in the descending order."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "metadata": {
294 | "id": "o9h005rlXODf"
295 | },
296 | "source": [
297 | "%%read_sql\n",
298 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n",
299 | "FROM Restaurant\n",
300 | "GROUP BY borough, cuisine\n",
301 | "HAVING minPrice>100\n",
302 | "ORDER BY minPrice DESC"
303 | ],
304 | "execution_count": null,
305 | "outputs": []
306 | }
307 | ]
308 | }
--------------------------------------------------------------------------------
/session5/practice_queries_join_and_aggregation.md:
--------------------------------------------------------------------------------
1 | # Join and Aggregation
2 |
3 | ## Restaurant Database
4 |
5 | 1. Output the restaurant name together with the number of reviews submitted for this restaurant.
6 | 2. For every Manhattan restaurant output its name and the number of reviews submitted for this
7 | restaurant.
8 | 3. For every restaurant that was reviewed more than once output its name and the number of
9 | reviews submitted for this restaurant.
10 | 4. Output the critic's last name and the restaurant name together with the maximal star rating
11 | ever issued by this critic for this restaurant.
12 | 5. For each cuisine-borough pair, output the number of the corresponding restaurants.
13 | 6. For each NYT reporter, output the number of distinct restaurants this reporter reviewed.
14 | 7. For every news outlet, output the average star rating submitted by all the reviewers of this
15 | outlet.
16 | a. consider only Italian restaurants
17 | b. consider only Italian restaurants outside of Manhattan
18 | 8. For every borough output the max star rating submitted for any restaurant within this borough
19 | (in which borough do you have the best restaurant)
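20 | 
21 | As a sketch of the join-then-aggregate pattern behind these questions (full worked answers are in the solutions notebook in this folder), question 1 can be approached by joining `Restaurant` to `Rating` and grouping by restaurant name; a `LEFT JOIN` with `COUNT(code)` also reports 0 for restaurants that have no reviews. Column names follow the solutions notebook; adjust if your schema differs.
22 | 
23 | ```sql
24 | -- Question 1: restaurant name and number of reviews (0 when there are none)
25 | SELECT restName, COUNT(code) AS num_reviews
26 | FROM Restaurant R LEFT JOIN Rating T ON R.restCode = T.restCode
27 | GROUP BY restName;
28 | ```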
--------------------------------------------------------------------------------
/session5/practice_queries_join_and_aggregation_solutions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Practice Aggregate+Join Queries: Solutions",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.6"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 5: Practice Aggregate+Join Queries: Solutions\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Restaurants* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "!sudo pip3 install PyMySQL sqlalchemy sql_magic"
69 | ],
70 | "execution_count": null,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "EkIL-uRK8NUi"
77 | },
78 | "source": [
79 | "# This code creates a connection to the database\n",
80 | "from sqlalchemy import create_engine\n",
81 | "\n",
82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
83 | " host=\"db.ipeirotis.org\",\n",
84 | " user=\"student\",\n",
85 | " db=\"restaurants\",\n",
86 | " password=\"dwdstudent2015\",\n",
87 | " encoding=\"utf8mb4\",\n",
88 | ")\n",
89 | "\n",
90 | "engine = create_engine(conn_string)\n",
91 | "con = engine.connect()"
92 | ],
93 | "execution_count": null,
94 | "outputs": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "metadata": {
99 | "id": "z7muzQXTUFkU"
100 | },
101 | "source": [
102 | "%reload_ext sql_magic"
103 | ],
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "metadata": {
110 | "id": "uHRIPxBvUGfC"
111 | },
112 | "source": [
113 | "%config SQL.conn_name = 'engine'"
114 | ],
115 | "execution_count": null,
116 | "outputs": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "H0hhloRRUJlV"
122 | },
123 | "source": [
124 | "## Question 1: Output the restaurant name together with the number of reviews submitted for this restaurant."
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {
130 | "id": "kwraKNb8aZjd"
131 | },
132 | "source": [
133 | "Let's run first the join query, and look at the table.\n",
134 | "\n",
135 | "This will be the table on which the GROUP BY query will operate.\n",
136 | "\n",
137 | "We order by `restName` to visually illustrate the groups that will be created."
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "metadata": {
143 | "id": "QZjEITHKaSlT"
144 | },
145 | "source": [
146 | "%%read_sql\n",
147 | "SELECT *\n",
148 | "FROM Restaurant R INNER JOIN Rating T ON R.restCode=T.restCode\n",
149 | "ORDER BY restName"
150 | ],
151 | "execution_count": null,
152 | "outputs": []
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {
157 | "id": "B3GCj21VcD_9"
158 | },
159 | "source": [
160 | "Now notice the output when we use a LEFT JOIN instead of an INNER JOIN. Notice the extra restaurants that appear, which have received no reviews (and therefore the `code` and `cID` are NULL)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "metadata": {
166 | "id": "9EtFQf2NcCzq"
167 | },
168 | "source": [
169 | "%%read_sql\n",
170 | "SELECT *\n",
171 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n",
172 | "ORDER BY restName"
173 | ],
174 | "execution_count": null,
175 | "outputs": []
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {
180 | "id": "ILA0T03xbT1Z"
181 | },
182 | "source": [
183 | "We now execute the GROUP BY. Notice how we use the `COUNT` command to count the total number of reviews, and the reviews with comments. Since this is an `INNER JOIN`, the `COUNT(*)` and `COUNT(code)` return the same values."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "metadata": {
189 | "id": "eL_CnyPRUSGI"
190 | },
191 | "source": [
192 | "%%read_sql\n",
193 | "SELECT restName, \n",
194 | " COUNT(*) AS cnt, \n",
195 | " COUNT(code) AS num_reviews, \n",
196 | " COUNT(comments) AS num_reviews_with_comments\n",
197 | "FROM Restaurant R INNER JOIN Rating T ON R.restCode=T.restCode\n",
198 | "GROUP BY restName"
199 | ],
200 | "execution_count": null,
201 | "outputs": []
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {
206 | "id": "TAX86BZ-cdsn"
207 | },
208 | "source": [
209 | "Now, let's switch to a LEFT JOIN. Notice an important change. We cannot rely on `COUNT(*)` anymore to count the number of reviews, and we need to be using the `COUNT(code)`. For example `Nisi` has one review: both the `cnt` and the `num_reviews` column are 1. However, for the `Don Peppe`, which has no reviews, we see that `cnt` is still 1, but `num_reviews` is 0."
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "metadata": {
215 | "id": "uOIAW9jscSzM"
216 | },
217 | "source": [
218 | "%%read_sql\n",
219 | "SELECT restName, \n",
220 | " COUNT(*) AS cnt, \n",
221 | " COUNT(code) AS num_reviews, \n",
222 | " COUNT(comments) AS num_reviews_with_comments\n",
223 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n",
224 | "GROUP BY restName"
225 | ],
226 | "execution_count": null,
227 | "outputs": []
228 | },
229 | {
230 | "cell_type": "code",
231 | "metadata": {
232 | "id": "FQB0fIIGvNZu"
233 | },
234 | "source": [
235 | "%%read_sql\n",
236 | "SELECT restName, cuisine, borough,\n",
237 | " COUNT(*) AS cnt, \n",
238 | " COUNT(code) AS num_reviews, \n",
239 | " COUNT(comments) AS num_reviews_with_comments\n",
240 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n",
241 | "GROUP BY restName, cuisine, borough\n",
242 | "ORDER BY cuisine, borough"
243 | ],
244 | "execution_count": null,
245 | "outputs": []
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {
250 | "id": "McZQXcp__OCu"
251 | },
252 | "source": [
253 | "## Question 2: For every Manhattan restaurant output its name and the number of reviews submitted for this restaurant."
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "metadata": {
259 | "id": "W7GayX2v_Nl8"
260 | },
261 | "source": [
262 | "%%read_sql\n",
263 | "SELECT restName, COUNT(code) AS num_reviews, COUNT(comments) AS num_reviews_with_comments\n",
264 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n",
265 | "WHERE borough = 'Manhattan'\n",
266 | "GROUP BY restName"
267 | ],
268 | "execution_count": null,
269 | "outputs": []
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {
274 | "id": "Hz_1yX-EUeBQ"
275 | },
276 | "source": [
277 | "## Question 3: For every restaurant that was reviewed more than once output it name and the number or reviews submitted for this restaurant."
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "metadata": {
283 | "id": "HXy0Ygy3Uf_m"
284 | },
285 | "source": [
286 | "%%read_sql\n",
287 | "SELECT restName, COUNT(code) AS num_reviews, COUNT(comments) AS num_reviews_with_comments\n",
288 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n",
289 | "GROUP BY restName\n",
290 | "HAVING num_reviews>1"
291 | ],
292 | "execution_count": null,
293 | "outputs": []
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {
298 | "id": "UozymxRTW-wx"
299 | },
300 | "source": [
301 | "## Question 4: Output the critic's last name and the restaurant name together with the maximal star rating ever issued by this critic for this restaurant."
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {
307 | "id": "wAqIzSNGfk9j"
308 | },
309 | "source": [
310 | "Again, let's execute first the JOIN of all the tables that we need, so that we can see the data that we will be aggregating. Since we want to organize by critic's last name and restaurant name, we also add the corresponding ORDER BY, so that we can visually group together the rows that we will be aggregating."
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "metadata": {
316 | "id": "ZwtqsRoGW-wz"
317 | },
318 | "source": [
319 | "%%read_sql\n",
320 | "SELECT *\n",
321 | "FROM Critic C \n",
322 | " INNER JOIN Rating R ON R.cID = C.cID\n",
323 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n",
324 | "ORDER BY restName, lastN"
325 | ],
326 | "execution_count": null,
327 | "outputs": []
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "id": "2KH4AJvzgP_u"
333 | },
334 | "source": [
335 | "Now, let's report the number of reviews that a critic wrote for the restaurant, together with the mix and max ratings. Obviously, when there is only one review, the min and max ratings are the same."
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "metadata": {
341 | "id": "Uobp11rkfdOk"
342 | },
343 | "source": [
344 | "%%read_sql\n",
345 | "SELECT restName, lastN, \n",
346 | " COUNT(R.code) AS num_reviews, \n",
347 | " MAX(R.starRating) AS maxRating, \n",
348 | " MIN(R.starRating) AS minRating \n",
349 | "FROM Critic C \n",
350 | " INNER JOIN Rating R ON R.cID = C.cID\n",
351 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n",
352 | "GROUP BY restName, lastN\n",
353 | "ORDER BY restName, lastN"
354 | ],
355 | "execution_count": null,
356 | "outputs": []
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {
361 | "id": "8NFSCApmXGZ8"
362 | },
363 | "source": [
364 | "## Question 5: For each cuisine-borough pair, output the number of the corresponding restaurants."
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "metadata": {
370 | "id": "bcr5iccDXGZ9"
371 | },
372 | "source": [
373 | "%%read_sql\n",
374 | "SELECT cuisine, borough, COUNT(*) AS num_restaurants\n",
375 | "FROM Restaurant\n",
376 | "GROUP BY cuisine, borough"
377 | ],
378 | "execution_count": null,
379 | "outputs": []
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {
384 | "id": "nezZleqbUeI_"
385 | },
386 | "source": [
387 | "## Question 6: For each NYT reporter, output the number of distinct restaurants this reporter reviewed."
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "metadata": {
393 | "id": "rNfpSzT3UgrM"
394 | },
395 | "source": [
396 | "%%read_sql\n",
397 | "SELECT *\n",
398 | "FROM Critic C\n",
399 | " INNER JOIN Rating R ON R.cID = C.cID\n",
400 | "ORDER BY C.cID, R.restCode"
401 | ],
402 | "execution_count": null,
403 | "outputs": []
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "id": "6O4EwRToibt0"
409 | },
410 | "source": [
411 | "Now let's execute the GROUP BY. Notice that we only GROUP by the `C.cID` which is the primary key for the table `Critic C`. Since the `cID` is the primary key for that table, we can also add in the SELECT clause the other attributes of the critic (which are unique for a given cID) without adding these attributes in the GROUP BY clause."
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "metadata": {
417 | "id": "YWIO7tvQh16s"
418 | },
419 | "source": [
420 | "%%read_sql\n",
421 | "SELECT C.cID, \n",
422 | " firstN, lastN, affiliation, \n",
423 | " COUNT(DISTINCT R.restCode) AS num_distinct_restaurants\n",
424 | "FROM Critic C\n",
425 | " INNER JOIN Rating R ON R.cID = C.cID\n",
426 | "GROUP BY C.cID"
427 | ],
428 | "execution_count": null,
429 | "outputs": []
430 | },
431 | {
432 | "cell_type": "code",
433 | "metadata": {
434 | "id": "ISuJTDayiTS5"
435 | },
436 | "source": [
437 | "%%read_sql\n",
438 | "SELECT C.cID, \n",
439 | " firstN, lastN, affiliation, \n",
440 | " COUNT(DISTINCT R.restCode) AS num_distinct_restaurants\n",
441 | "FROM Critic C\n",
442 | " INNER JOIN Rating R ON R.cID = C.cID\n",
443 | "WHERE affiliation = 'NYT'\n",
444 | "GROUP BY C.cID"
445 | ],
446 | "execution_count": null,
447 | "outputs": []
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {
452 | "id": "_j2bCzADXODe"
453 | },
454 | "source": [
455 | "## Question 7: For every news outlet, output the average star rating submitted by all the reviewers of this outlet. a. consider only Italian restaurants b. consider only Italian restaurants outside of Manhattan"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "metadata": {
461 | "id": "o9h005rlXODf"
462 | },
463 | "source": [
464 | "%%read_sql\n",
465 | "SELECT *\n",
466 | "FROM Critic C\n",
467 | " INNER JOIN Rating R ON R.cID = C.cID\n",
468 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n",
469 | "ORDER BY C.affiliation"
470 | ],
471 | "execution_count": null,
472 | "outputs": []
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {
477 | "id": "OjtmFW39vk8U"
478 | },
479 | "source": [
480 | "Here is the same join as above, but now limited to Italian restaurants"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "metadata": {
486 | "id": "XHXY6wEkuGES"
487 | },
488 | "source": [
489 | "%%read_sql\n",
490 | "SELECT *\n",
491 | "FROM Critic C\n",
492 | " INNER JOIN Rating R ON R.cID = C.cID\n",
493 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n",
494 | "WHERE cuisine = 'Italian'\n",
495 | "ORDER BY C.affiliation"
496 | ],
497 | "execution_count": null,
498 | "outputs": []
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {
503 | "id": "0SOsIIepaAcd"
504 | },
505 | "source": [
506 | "If we limit to Italian restaurants outside Manhattan, you will see that we get nothing back, as there are no reviews for Italian restaurants outside Manhattan. (Notice that there *are* Italian restaurants outside Manhattan, but no reviews for these restaurants.)"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "metadata": {
512 | "id": "nAHmBW2luLDY"
513 | },
514 | "source": [
515 | "%%read_sql\n",
516 | "SELECT *\n",
517 | "FROM Critic C\n",
518 | " INNER JOIN Rating R ON R.cID = C.cID\n",
519 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n",
520 | "WHERE cuisine = 'Italian' AND borough <> 'Manhattan'\n",
521 | "ORDER BY C.affiliation"
522 | ],
523 | "execution_count": null,
524 | "outputs": []
525 | },
526 | {
527 | "cell_type": "code",
528 | "metadata": {
529 | "id": "Fuh0ziUB5f2k"
530 | },
531 | "source": [
532 | "%%read_sql\n",
533 | "SELECT C.affiliation, COUNT(*) AS num_reviews, AVG(starRating) AS avgRating\n",
534 | "FROM Critic C\n",
535 | " INNER JOIN Rating R ON R.cID = C.cID\n",
536 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n",
537 | "WHERE cuisine = 'Italian' \n",
538 | "GROUP BY C.affiliation\n",
539 | "ORDER BY C.affiliation"
540 | ],
541 | "execution_count": null,
542 | "outputs": []
543 | },
544 | {
545 | "cell_type": "code",
546 | "metadata": {
547 | "id": "vfpkBEmPvdk0"
548 | },
549 | "source": [
550 | "%%read_sql\n",
551 | "SELECT C.affiliation, COUNT(*) AS num_reviews, AVG(starRating) AS avgRating\n",
552 | "FROM Critic C\n",
553 | " INNER JOIN Rating R ON R.cID = C.cID\n",
554 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n",
555 | "WHERE cuisine = 'Italian' AND borough <> 'Manhattan'\n",
556 | "GROUP BY C.affiliation\n",
557 | "ORDER BY C.affiliation"
558 | ],
559 | "execution_count": null,
560 | "outputs": []
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {
565 | "id": "dFYmhG1DX3Ob"
566 | },
567 | "source": [
568 | "## Question 8: For every borough output the max star rating submitted for any restaurant within this borough (in which borough do you have the best restaurant)"
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "metadata": {
574 | "id": "F3nEhrb8FBjm"
575 | },
576 | "source": [
577 | "%%read_sql\n",
578 | "SELECT *\n",
579 | "FROM Restaurant R \n",
580 | " INNER JOIN Rating T ON T.restCode = R.restCode\n",
581 | "ORDER BY borough"
582 | ],
583 | "execution_count": null,
584 | "outputs": []
585 | },
586 | {
587 | "cell_type": "code",
588 | "metadata": {
589 | "id": "CToQxJjeFDyJ"
590 | },
591 | "source": [
592 | "%%read_sql\n",
593 | "SELECT borough, MAX(starRating) AS maxRating\n",
594 | "FROM Restaurant R \n",
595 | " INNER JOIN Rating T ON T.restCode = R.restCode\n",
596 | "GROUP BY borough\n",
597 | "ORDER BY borough"
598 | ],
599 | "execution_count": null,
600 | "outputs": []
601 | },
602 | {
603 | "cell_type": "code",
604 | "metadata": {
605 | "id": "oza36RHjX7Ij"
606 | },
607 | "source": [
608 | "%%read_sql\n",
609 | "SELECT *\n",
610 | "FROM Restaurant R \n",
611 | " LEFT JOIN Rating T ON T.restCode = R.restCode\n",
612 | "ORDER BY borough"
613 | ],
614 | "execution_count": null,
615 | "outputs": []
616 | },
617 | {
618 | "cell_type": "code",
619 | "metadata": {
620 | "id": "48I66Fp4DTcP"
621 | },
622 | "source": [
623 | "%%read_sql\n",
624 | "SELECT borough, MAX(starRating) AS maxRating\n",
625 | "FROM Restaurant R \n",
626 | " LEFT JOIN Rating T ON T.restCode = R.restCode\n",
627 | "GROUP BY borough\n",
628 | "ORDER BY borough"
629 | ],
630 | "execution_count": null,
631 | "outputs": []
632 | }
633 | ]
634 | }
--------------------------------------------------------------------------------
/session6/F-SQL_Subqueries-7.1.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session6/F-SQL_Subqueries-7.1.pptx
--------------------------------------------------------------------------------
/session6/F-SQL_Subqueries.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session6/F-SQL_Subqueries.pptx
--------------------------------------------------------------------------------
/session6/assignment_combined.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "colab": {
7 | "name": "Session 6: Putting All Together Queries",
8 | "provenance": [],
9 | "collapsed_sections": [],
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.6"
28 | }
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "SkZqhwkf8NUg"
45 | },
46 | "source": [
47 | "# Session 6: Putting All Together Assignment\n",
48 | "\n",
49 | "\n",
50 | "In this segment we will connect to the *Music* database."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "id": "5u_6yLTDT6Kn"
57 | },
58 | "source": [
59 | "## Setup"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "id": "O9o9NsaO8hMy"
66 | },
67 | "source": [
68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic"
69 | ],
70 | "execution_count": null,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "EkIL-uRK8NUi"
77 | },
78 | "source": [
79 | "# This code creates a connection to the database\n",
80 | "from sqlalchemy import create_engine\n",
81 | "\n",
82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n",
83 | " host=\"db.ipeirotis.org\",\n",
84 | " user=\"student\",\n",
85 | " db=\"music\",\n",
86 | " password=\"dwdstudent2015\",\n",
87 | " encoding=\"utf8mb4\",\n",
88 | ")\n",
89 | "\n",
90 | "engine = create_engine(conn_string)\n",
91 | "con = engine.connect()"
92 | ],
93 | "execution_count": null,
94 | "outputs": []
95 | },
96 | {
97 | "cell_type": "code",
98 | "metadata": {
99 | "id": "z7muzQXTUFkU"
100 | },
101 | "source": [
102 | "%reload_ext sql_magic"
103 | ],
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "metadata": {
110 | "id": "uHRIPxBvUGfC"
111 | },
112 | "source": [
113 | "%config SQL.conn_name = 'engine'"
114 | ],
115 | "execution_count": null,
116 | "outputs": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "NdgXvtmRLayC"
122 | },
123 | "source": [
124 | "This is an example of how you can write an SQL query in the notebook.\n",
125 | " You write your SQL query after the `%%read_sql` line."
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "metadata": {
131 | "id": "sWa1Uv_6X9zi"
132 | },
133 | "source": [
134 | "%%read_sql\n",
135 | "SELECT * \n",
136 | "FROM played"
137 | ],
138 | "execution_count": null,
139 | "outputs": []
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {
144 | "id": "H0hhloRRUJlV"
145 | },
146 | "source": [
147 | "## Question 1: Show the name of the artist and the number of albums for each artist in the database. Name the column that shows the number of albums as `num_albums`."
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "metadata": {
153 | "id": "eL_CnyPRUSGI"
154 | },
155 | "source": [
156 | "%%read_sql\n"
157 | ],
158 | "execution_count": null,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {
164 | "id": "Hz_1yX-EUeBQ"
165 | },
166 | "source": [
167 |         "## Question 2: Show the name of the album, the number of tracks in the album, and the total length of the album. Name the column that shows the number of tracks `num_tracks`, and name the column that shows the length of the album in minutes `length_minutes`."
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "metadata": {
173 | "id": "HXy0Ygy3Uf_m"
174 | },
175 | "source": [
176 | "%%read_sql\n",
177 | "\n"
178 | ],
179 | "execution_count": null,
180 | "outputs": []
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {
185 | "id": "UozymxRTW-wx"
186 | },
187 | "source": [
188 | "## Question 3: Expand query 2, to show not only the name of the album but also the name of the artist, _in addition_ to the information already shown in query 2 (i.e., name of the album, number of tracks, and album length)."
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "metadata": {
194 | "id": "ZwtqsRoGW-wz"
195 | },
196 | "source": [
197 | "%%read_sql\n"
198 | ],
199 | "execution_count": null,
200 | "outputs": []
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {
205 | "id": "8NFSCApmXGZ8"
206 | },
207 | "source": [
208 | "## Question 4: For every artist, show the average length of the tracks, and the standard deviation for the track length.\n",
209 | "\n"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "metadata": {
215 | "id": "bcr5iccDXGZ9"
216 | },
217 | "source": [
218 | "%%read_sql\n"
219 | ],
220 | "execution_count": null,
221 | "outputs": []
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {
226 | "id": "nezZleqbUeI_"
227 | },
228 | "source": [
229 | "## Question 5: Analyze the songs that the user has played and, for each artist, show the first and last time that the user has listened to the artist."
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "metadata": {
235 | "id": "rNfpSzT3UgrM"
236 | },
237 | "source": [
238 | "%%read_sql\n"
239 | ],
240 | "execution_count": null,
241 | "outputs": []
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {
246 | "id": "_j2bCzADXODe"
247 | },
248 | "source": [
249 | "## Question 6: Show the name of each track and the time it was played. Include tracks that were not played at all by the user. (Hint: You need an outer join.)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "metadata": {
255 | "id": "o9h005rlXODf"
256 | },
257 | "source": [
258 | "%%read_sql\n"
259 | ],
260 | "execution_count": null,
261 | "outputs": []
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {
266 | "id": "HF9cHpSDXaZd"
267 | },
268 | "source": [
269 | "## Question 7: List all the names of the albums and the album length (across all the album's tracks), where the total album length is longer than 60 minutes."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "metadata": {
275 | "id": "mvLQdbdiXSmw"
276 | },
277 | "source": [
278 | "%%read_sql\n"
279 | ],
280 | "execution_count": null,
281 | "outputs": []
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {
286 | "id": "ntj5f4n8U3dT"
287 | },
288 | "source": [
289 | "## Question 8: Show the names of the artists that have more than 12 tracks in the database."
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "metadata": {
295 | "id": "mPCpyhbwU3dV"
296 | },
297 | "source": [
298 | "%%read_sql"
299 | ],
300 | "execution_count": null,
301 | "outputs": []
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {
306 | "id": "23bSA8IUU1Og"
307 | },
308 | "source": [
309 | "## Question 9: Show how many tracks are on each of the albums by the artist `New Order`. List the name of the album and the number of tracks in the output."
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "metadata": {
315 | "id": "hTD97qjaU1Oi"
316 | },
317 | "source": [
318 | "%%read_sql\n"
319 | ],
320 | "execution_count": null,
321 | "outputs": []
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {
326 | "id": "eWgdmtHHURaS"
327 | },
328 | "source": [
329 |         "## Question 10: Analyze the songs that the user has played and, for each album, show the first and last time that the user listened to the album. Include albums that the user has *not* listened to at all. Also list the artist name for the album. (Hint: you need an *outer* join for this.)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "metadata": {
335 | "id": "ygI_N9gBUhWy"
336 | },
337 | "source": [
338 | "%%read_sql\n"
339 | ],
340 | "execution_count": null,
341 | "outputs": []
342 | }
343 | ]
344 | }
--------------------------------------------------------------------------------
/session6/book_vs_gender.sql:
--------------------------------------------------------------------------------
1 | # Total number of users
2 | SELECT COUNT(*)
3 | FROM Profiles;
4 |
5 | SET @allprofiles = (SELECT COUNT(*) FROM Profiles);
6 | # 25784
7 |
8 | SELECT @allprofiles;
9 |
10 | # Break down the user by sex
11 | SELECT Sex,COUNT(*)
12 | FROM Profiles
13 | GROUP BY Sex;
14 |
15 | # Female 12311
16 | # Male 8975
17 | # NULL 4498
18 |
19 |
20 | # Instead of counting all users, let's focus only on users
21 | # that have listed books they like in their profiles
22 | SELECT P.Sex, COUNT(DISTINCT B.ProfileID) AS num_profiles
23 | FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID
24 | GROUP BY P.Sex;
25 |
26 | # NULL   643
27 | # Female 8753
28 | # Male 5974
29 |
30 | SET @males = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID AND P.Sex='Male');
31 | SET @females = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID AND P.Sex='Female');
32 | SET @everyone = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID);
33 |
34 |
35 | # We will only consider books that are liked by a reasonable number
36 | # of users. We will put the threshold at 10, but we can change it.
37 | # We will also save the results in a temporary table.
38 | # We divide by the total number of people that like books.
39 | DROP TABLE IF EXISTS popular_books;
40 | CREATE TEMPORARY TABLE popular_books AS
41 | SELECT Book, COUNT(*) AS cnt, COUNT(*)/@everyone AS perc
42 | FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID
43 | GROUP BY Book
44 | HAVING cnt >= 10
45 | ORDER BY cnt DESC;
46 |
47 | SELECT * FROM popular_books;
48 |
49 | # We now calculate the number of men / women that like each of the popular books
50 | # It is absolutely crucial here to use a LEFT JOIN so that we can keep
51 | # the list of all popular books, even if no men / no women liked that book.
52 | #
53 | # *** There are a lot of nuanced things in this join. ***
54 | #
55 | # a. Notice that we have the condition P.Sex = 'Male' in the JOIN condition.
56 | #    If we put the condition in the WHERE clause instead, the WHERE clause is
57 | #    executed after the LEFT JOIN and eliminates all the non-matched Books. (See the sketch after this file.)
58 | #
59 | # b. Notice that we COUNT the P.ProfileID. If we count F.ProfileID instead,
60 | #    the results will be completely different (and wrong). That may seem like a
61 | #    head-scratcher, but you need to remember the behavior of LEFT JOINs for
62 | #    unmatched rows. Try executing the LEFT JOIN without the GROUP BY / aggregations
63 | #    to understand what is going on before the GROUP BY aggregation. Select both
64 | #    F.ProfileID and P.ProfileID, which superficially seem to be the same,
65 | #    as we have the equality condition P.ProfileID = F.ProfileID in the JOIN clause.
66 | #
67 | # c. We use a bit of "smoothing" and add 0.5 to the numerator and 1 to the denominator
68 | #    when we calculate the percentage. That is to avoid zeros, as we
69 | #    will be dividing by perc_men and perc_women in the next query.
70 | #
71 | DROP TABLE IF EXISTS book_men ;
72 | CREATE TEMPORARY TABLE book_men AS
73 | SELECT B.Book,
74 | COUNT(DISTINCT P.ProfileID) AS cnt_men,
75 | (COUNT(DISTINCT P.ProfileID)+0.5)/(@males+1) AS perc_men
76 | FROM popular_books B
77 | LEFT JOIN FavoriteBooks F ON B.Book = F.Book
78 | LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID AND P.Sex = 'Male'
79 | GROUP BY B.Book
80 | ORDER BY perc_men DESC;
81 |
82 |
83 |
84 |
85 | # We repeat the process for women. Same nuances apply here as in the join just above.
86 | DROP TABLE IF EXISTS book_women;
87 | CREATE TEMPORARY TABLE book_women AS
88 | SELECT B.Book,
89 | COUNT(DISTINCT P.ProfileID) AS cnt_women,
90 | (COUNT(DISTINCT P.ProfileID)+0.5)/(@females+1) AS perc_women
91 | FROM popular_books B
92 | LEFT JOIN FavoriteBooks F ON B.Book = F.Book
93 | LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID AND P.Sex = 'Female'
94 | GROUP BY B.Book
95 | ORDER BY perc_women DESC;
96 |
97 |
98 |
99 |
100 | # Once we have our subqueries in place, we join the two tables and calculate the
101 | # "lift". The lift is defined as the "probability of seeing something in one population"
102 | # divided by the "probability of seeing something in a contrasting population".
103 | # In this case, we compare percentages (~probabilities) in the populations of men vs women
104 | #
105 | # Notice here that we calculate the lift by dividing by perc_men (or perc_women later),
106 | # hence the need to have a non-zero value for perc_men when creating the book_men table.
107 | #
108 | # We could have done it with women vs the overall population as well, but the "overall"
109 | # population includes women as well, so the contrast is not great.
110 | #
111 | # Alternatively, we could have done women vs rest; and men vs rest. We leave that
112 | # calculation as an exercise for the interested reader.
113 | SELECT B.Book, B.cnt, B.perc,
114 | M.cnt_men, M.perc_men,
115 | F.cnt_women, F.perc_women,
116 | perc_men /perc_women AS lift_men_vs_women,
117 | perc_women / perc_men AS lift_women_vs_men
118 | FROM popular_books B
119 | LEFT JOIN book_men M ON M.Book = B.Book
120 | LEFT JOIN book_women F ON F.Book = B.Book
121 | ORDER BY lift_women_vs_men DESC, cnt_women DESC;
122 |
123 |
--------------------------------------------------------------------------------
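A minimal sketch contrasting the two placements of the P.Sex = 'Male' filter discussed in nuance (a) of book_vs_gender.sql above, reusing the same popular_books, FavoriteBooks, and Profiles tables. The first query keeps every popular book (unmatched rows carry NULLs, so the count simply becomes 0); the second drops books with no male fans, because the WHERE filter runs after the LEFT JOIN and removes the NULL-extended rows.

-- Filter in the ON clause: every popular book survives; books with no male fans get cnt_men = 0
SELECT B.Book, COUNT(DISTINCT P.ProfileID) AS cnt_men
FROM popular_books B
  LEFT JOIN FavoriteBooks F ON B.Book = F.Book
  LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID AND P.Sex = 'Male'
GROUP BY B.Book;

-- Filter in the WHERE clause: books with no male fans disappear from the result
SELECT B.Book, COUNT(DISTINCT P.ProfileID) AS cnt_men
FROM popular_books B
  LEFT JOIN FavoriteBooks F ON B.Book = F.Book
  LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID
WHERE P.Sex = 'Male'
GROUP BY B.Book;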
/session6/books_and_political_views.sql:
--------------------------------------------------------------------------------
1 | USE facebook;
2 |
3 | -- We introduce variables to avoid hardcoding
4 | SET @all_students = (SELECT COUNT(*) FROM Profiles);
5 |
6 | -- This is our master table. Contains the total likes for each book.
7 | -- It also contains the "percentage" which normalizes the number of likes
8 | -- with the population that is available to like a book
9 | DROP TABLE IF EXISTS book_likes;
10 | CREATE TEMPORARY TABLE book_likes AS
11 | SELECT Book, COUNT(ProfileID) AS cnt, COUNT(ProfileID)/@all_students AS perc
12 | FROM FavoriteBooks
13 | GROUP BY Book
14 | ORDER BY cnt DESC;
15 |
16 |
17 |
18 | -- We now introduce the table that stores the likes for the liberal population,
19 | -- together with the normalized percentage, after dividing with the number of
20 | -- liberal students
21 | SET @liberals = (SELECT COUNT(*) AS cnt FROM Profiles WHERE PoliticalViews = 'Liberal');
22 |
23 | DROP TABLE IF EXISTS book_liberals;
24 | CREATE TEMPORARY TABLE book_liberals AS
25 | SELECT Book, COUNT(P.ProfileID) AS cnt_libs, COUNT(P.ProfileID)/@liberals AS perc_libs
26 | FROM FavoriteBooks B INNER JOIN Profiles P ON P.ProfileID = B.ProfileID
27 | WHERE P.PoliticalViews = 'Liberal'
28 | GROUP BY Book
29 | ORDER BY cnt_libs DESC;
30 |
31 |
32 | -- And same for the conservatives
33 | SET @conservatives = (SELECT COUNT(*) AS cnt FROM Profiles WHERE PoliticalViews = 'Conservative');
34 |
35 | DROP TABLE IF EXISTS book_conservatives;
36 | CREATE TEMPORARY TABLE book_conservatives AS
37 | SELECT Book, COUNT(P.ProfileID) AS cnt_cons, COUNT(P.ProfileID)/@conservatives AS perc_cons
38 | FROM FavoriteBooks B INNER JOIN Profiles P ON P.ProfileID = B.ProfileID
39 | WHERE P.PoliticalViews = 'Conservative'
40 | GROUP BY Book
41 | ORDER BY cnt_cons DESC;
42 |
43 |
44 | -- Once we have the full list of likes for all books, we can now
45 | -- perform two LEFT JOINS with liberal and conservative likes
46 | -- and have a list of all books. Notice what would have happened
47 | -- if we had used an INNER JOIN instead (we would have missed
48 | -- books without likes in the liberal or in the conservative
49 | -- population)
50 | --
51 | -- Notice also the use of the COALESCE function, which checks
52 | -- if an attribute is NULL; if yes, replaces it with the second
53 | -- argument. In our case, we replace NULLs with 0, as these are the
54 | -- "zero likes" books.
55 | --
56 | -- Finally, notice how "perc_nonlibs" and "perc_noncons" are calculated:
57 | -- each removes the "liberals" (or "conservatives") from its own comparison population
58 | DROP TABLE IF EXISTS book_comparison;
59 | CREATE TEMPORARY TABLE book_comparison AS
60 | SELECT B.Book
61 | , B.cnt
62 | , B.perc AS perc
63 | , COALESCE(L.cnt_libs,0) AS cnt_libs
64 | , COALESCE(L.perc_libs,0) AS perc_libs
65 | , COALESCE(C.cnt_cons,0) AS cnt_cons
66 | , COALESCE(C.perc_cons,0) AS perc_cons
67 | , (B.cnt - COALESCE(L.cnt_libs,0)) / (@all_students - @liberals) AS perc_nonlibs
68 | , (B.cnt - COALESCE(C.cnt_cons,0)) / (@all_students - @conservatives) AS perc_noncons
69 | FROM
70 | book_likes B
71 | LEFT JOIN book_liberals L ON B.Book = L.Book
72 | LEFT JOIN book_conservatives C ON B.Book = C.Book
73 | WHERE B.cnt > 5 -- Only keep books with more than 5 likes overall, to avoid very noisy entries
74 | ORDER BY cnt DESC;
75 |
76 |
77 |
78 | DROP TABLE IF EXISTS book_scores;
79 | CREATE TEMPORARY TABLE book_scores AS
80 | SELECT *
81 | , perc_libs/perc_nonlibs AS lift_libs
82 | , LOG10(perc_libs/perc_nonlibs + 0.001) AS logodds_libs
83 | , perc_cons/perc_noncons AS lift_cons
84 |   , LOG10(perc_cons/perc_noncons + 0.001) AS logodds_cons
85 | FROM book_comparison;
86 |
87 | SELECT * FROM book_scores;
88 |
89 | -- Now that we have the scores for each book, we can try to score individuals
90 | DROP TABLE IF EXISTS user_scores;
91 | CREATE TEMPORARY TABLE user_scores AS
92 | SELECT P.ProfileID, P.PoliticalViews
93 | , AVG(logodds_libs) AS avg_lib
94 | , AVG(logodds_cons) AS avg_cons
95 | , COUNT(*) AS cnt_books
96 | , CASE WHEN AVG(logodds_libs) > AVG(logodds_cons) THEN "Liberal" ELSE "Conservative" END AS Estimate
97 | FROM
98 | Profiles P
99 | JOIN FavoriteBooks B ON P.ProfileID = B.ProfileID
100 | JOIN book_scores S ON B.Book = S.Book
101 | GROUP BY ProfileID, P.PoliticalViews
102 | ORDER BY ProfileID;
103 |
104 | -- Classification Statistics
105 | SELECT PoliticalViews, Estimate, COUNT(*)
106 | FROM user_scores
107 | GROUP BY PoliticalViews, Estimate
108 | ORDER BY PoliticalViews, Estimate;
109 |
110 | --
111 | -- Confusion matrix (rows = actual PoliticalViews, columns = Estimate):
112 | --                    Cons   Libs   Total
113 | -- Actual  Cons        360    166     526
114 | --         Libs        455   3829    4284
115 | --
116 | -- Row-normalized:     Cons          Libs
117 | --         Cons        0.684410646   0.315589354
118 | --         Libs        0.10620915    0.89379085
119 |
--------------------------------------------------------------------------------
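A small numeric illustration of the lift and log-odds scores computed in books_and_political_views.sql above; the percentages are hypothetical, not taken from the dataset. If 3% of liberal students like a book but only 1% of non-liberal students do, the book gets lift_libs = 3 and a positive logodds_libs; a user's avg_lib is simply the average of these log-odds over the books they list.

-- Hypothetical inputs: perc_libs = 0.03, perc_nonlibs = 0.01
SELECT 0.03 / 0.01                AS lift_libs,     -- 3.0
       LOG10(0.03 / 0.01 + 0.001) AS logodds_libs;  -- ~0.477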
/session6/music_recommendations.sql:
--------------------------------------------------------------------------------
1 | USE facebook;
2 |
3 | # Store the number of people that have liked (any) music
4 | SET @allmusicfans = (SELECT COUNT(DISTINCT ProfileID) FROM FavoriteMusic);
5 |
6 | # Create a table/view that stores the preferences across all the population
7 | # We store the band and the percentage of people that liked it
8 | DROP TEMPORARY TABLE IF EXISTS MusicPreferences;
9 | CREATE TEMPORARY TABLE MusicPreferences AS
10 | SELECT Music,
11 | ROUND(COUNT(DISTINCT ProfileID)/@allmusicfans,4) AS perc,
12 | COUNT(DISTINCT ProfileID) AS cnt
13 | FROM FavoriteMusic
14 | GROUP BY Music
15 | ORDER BY perc DESC;
16 |
17 |
18 | # Set the band that we are analyzing
19 | SET @band = 'Bon Jovi';
20 |
21 | # Store the number of people that like the specific band
22 | SET @bandfans = (SELECT cnt FROM MusicPreferences WHERE Music = @band);
23 |
24 |
25 | # Create a table with the percentages across only people that like the band
26 | # that we specified in the variable @band
27 | DROP TEMPORARY TABLE IF EXISTS BandFans;
28 | CREATE TEMPORARY TABLE BandFans AS
29 | SELECT Music,
30 | ROUND(COUNT(DISTINCT ProfileID)/@bandfans,4) AS perc,
31 | COUNT(DISTINCT ProfileID) AS cnt
32 | FROM FavoriteMusic
33 | WHERE ProfileID IN (
34 | SELECT ProfileID
35 | FROM FavoriteMusic
36 | WHERE Music = @band
37 | )
38 | GROUP BY Music
39 | ORDER BY perc DESC;
40 |
41 |
42 | # Join the two tables above to compare the percentages of likes
43 | # in the overall population (T.perc) vs the percentage of likes
44 | # across the population of people that like the @band (R.perc)
45 | # We refer to the ratio of the percentages as the "lift"
46 | SELECT T.Music,
47 | R.perc AS perc_focus, R.cnt AS cnt_focus,
48 | T.perc AS perc_total, T.cnt AS cnt_total,
49 | R.perc/T.perc AS lift_ratio
50 | FROM BandFans R JOIN MusicPreferences T ON R.Music = T.Music
51 | ORDER BY lift_ratio DESC;
52 |
53 | # Improving the details now.
54 | # Below we introduce a few fixes to remove noise and
55 | # make the results more presentable.
56 |
57 |
58 | # To avoid noise, we keep only bands that have more than 100
59 | # likes in the overall population.
60 | # The variable @min_fans is the minimum number of fans
61 | # required for a band to be analyzed.
62 | SET @min_fans = 100;
63 |
64 | # Join the two tables above to compare the percentages of likes
65 | # in the overall population (T.perc) vs the percentage of likes
66 | # across the population of people that like the @band (R.perc)
67 | # We use an OUTER join to keep all the bands from the overall
68 | # population, even if they do not appear in the likes of the
69 | # fans of the target artist.
70 | # The COALESCE function replaces NULL values with 0.0
71 | #
72 | SELECT T.Music,
73 | COALESCE(R.perc,0.0) AS perc_focus, COALESCE(R.cnt,0) AS cnt_focus,
74 | T.perc AS perc_total, T.cnt AS cnt_total,
75 | COALESCE(R.perc/T.perc,0) AS lift_ratio
76 | FROM MusicPreferences T LEFT JOIN BandFans R ON R.Music = T.Music
77 | WHERE T.cnt>@min_fans AND (R.Music IS NULL OR R.Music != @band)
78 | ORDER BY lift_ratio DESC;
79 |
--------------------------------------------------------------------------------
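To reuse music_recommendations.sql above for a different target artist, only the @band variable and the BandFans table that depends on it need to be rebuilt. A minimal sketch, assuming the MusicPreferences temporary table and the @min_fans variable created above are still in place; 'Radiohead' and the LIMIT 10 are illustrative choices, not part of the original script.

SET @band = 'Radiohead';  -- hypothetical target artist
SET @bandfans = (SELECT cnt FROM MusicPreferences WHERE Music = @band);

DROP TEMPORARY TABLE IF EXISTS BandFans;
CREATE TEMPORARY TABLE BandFans AS
SELECT Music,
       ROUND(COUNT(DISTINCT ProfileID)/@bandfans, 4) AS perc,
       COUNT(DISTINCT ProfileID) AS cnt
FROM FavoriteMusic
WHERE ProfileID IN (SELECT ProfileID FROM FavoriteMusic WHERE Music = @band)
GROUP BY Music;

-- Top-10 recommendations for fans of @band, ranked by lift
SELECT T.Music, COALESCE(R.perc / T.perc, 0) AS lift_ratio
FROM MusicPreferences T LEFT JOIN BandFans R ON R.Music = T.Music
WHERE T.cnt > @min_fans AND (R.Music IS NULL OR R.Music != @band)
ORDER BY lift_ratio DESC
LIMIT 10;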
/session6/music_recommendations2.sql:
--------------------------------------------------------------------------------
1 | # We use this number to go from raw number of likes to _percentage_ of people that like a band
2 | SET @allfans = (SELECT COUNT(DISTINCT ProfileID) FROM FavoriteMusic);
3 |
4 | # We will not consider bands with fewer than @min_fans fans
5 | SET @min_fans = 40;
6 |
7 | WITH
8 |
9 | # Calculate the number of people that like a band.
10 | # To avoid noise, we keep only bands with at least @min_fans fans.
11 | MusicPreferences AS (
12 | SELECT Music, COUNT(ProfileID) AS cnt, COUNT(ProfileID)/@allfans AS perc
13 | FROM FavoriteMusic
14 | GROUP BY Music
15 | HAVING cnt >= @min_fans
16 | ),
17 |
18 | # For all pairs of bands, calculate number of people that like both bands
19 | # The M1.Music < M2.Music condition ensures that each pair of bands is counted
20 | # only once, and that a band is never paired with itself.
21 | CommonFans AS (
22 | SELECT M1.Music AS Music1, M2.music AS Music2, COUNT(*) AS common_fans
23 | FROM FavoriteMusic M1
24 | JOIN FavoriteMusic M2 ON M1.ProfileID = M2.ProfileID AND M1.Music < M2.Music
--------------------------------------------------------------------------------
/session6/music_recommendations3.sql:
--------------------------------------------------------------------------------
13 | ),
14 |
15 | # For all pairs of bands, calculate number of people that like both bands
16 | # The M1.Music < M2.Music condition ensures that each pair of bands is counted
17 | # only once, and that a band is never paired with itself.
18 | CommonFans AS (
19 | SELECT M1.Music AS Music1, M2.music AS Music2, COUNT(*) AS common_fans
20 | FROM FavoriteMusic M1
21 | JOIN FavoriteMusic M2 ON M1.ProfileID = M2.ProfileID AND M1.Music < M2.Music
22 | GROUP BY M1.Music, M2.Music
23 | HAVING common_fans >= @min_fans
24 | )
25 |
26 | # Put together data about common fans, and overall fans for each band,
27 | # calculate percentages of fans of band1 that like band2, and vice versa
28 | SELECT M1.Music AS Music1, M1.cnt AS cnt1_overall,
29 | M2.Music AS Music2, M2.cnt AS cnt2_overall,
30 | C.common_fans, C.common_fans/(M1.cnt + M2.cnt - C.common_fans) AS jaccard
31 | FROM CommonFans C
32 | JOIN MusicPreferences M1 ON M1.Music = C.Music1
33 | JOIN MusicPreferences M2 ON M2.Music = C.Music2
34 | ORDER BY jaccard DESC
35 |
36 |
--------------------------------------------------------------------------------
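A small numeric illustration of the Jaccard similarity used in the final query above (the counts are hypothetical, not from the dataset): if band A has 120 fans, band B has 80 fans, and 40 people like both, the coefficient is the overlap divided by the size of the union of the two fan bases.

-- jaccard = common_fans / (cnt_A + cnt_B - common_fans) = 40 / (120 + 80 - 40) = 0.25
SELECT 40 / (120 + 80 - 40) AS jaccard;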
/session7/G-Window_queries.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session7/G-Window_queries.pptx
--------------------------------------------------------------------------------
/session7/README.md:
--------------------------------------------------------------------------------
1 | # Window Functions
2 |
3 | https://antonz.org/sql-window-functions-book/
4 |
5 | https://www.mysqltutorial.org/mysql-window-functions/
6 |
7 | https://www.stratascratch.com/blog/the-ultimate-guide-to-sql-window-functions/
8 |
9 | https://towardsdatascience.com/a-guide-to-advanced-sql-window-functions-f63f2642cbf9
10 |
11 | https://mode.com/sql-tutorial/sql-window-functions/
12 |
13 | https://www.geeksforgeeks.org/window-functions-in-sql/
14 |
15 | https://www.toptal.com/sql/intro-to-sql-windows-functions
16 |
--------------------------------------------------------------------------------
/session7/music_by_gender_rank_example.sql:
--------------------------------------------------------------------------------
1 | USE facebook;
2 |
3 | # We create first a table of popular music across everyone
4 | # For efficiency we only keep music liked by more than 10 people
5 | DROP TEMPORARY TABLE IF EXISTS popular_music;
6 | CREATE TEMPORARY TABLE popular_music AS
7 | SELECT M.Music AS music, COUNT(M.ProfileID) AS cnt
8 | FROM FavoriteMusic M
9 | GROUP BY M.Music
10 | HAVING cnt > 10
11 | ORDER BY cnt DESC;
12 |
13 |
14 | # We now calculate popularity of each music broken down by sex
15 | DROP TEMPORARY TABLE IF EXISTS popular_music_by_sex;
16 | CREATE TEMPORARY TABLE popular_music_by_sex AS
17 | SELECT M.Music AS music, P.Sex AS gender, COUNT(P.ProfileID) AS cnt
18 | FROM FavoriteMusic M JOIN Profiles P ON P.ProfileID = M.ProfileID
19 | WHERE P.Sex IS NOT NULL
20 | GROUP BY M.Music, P.Sex
21 | ORDER BY cnt DESC;
22 |
23 | # We will now create two tables with music rank, one per gender
24 | # In principle, we could also do a window OVER (PARTITION BY gender ORDER BY cnt DESC)
25 | # However, MySQL does not allow a TEMPORARY table to be referenced more than once in the
26 | # same query (so it cannot be joined with itself), so we end up creating one temp table for males and another for females (see the sketch after this file)
27 | DROP TEMPORARY TABLE IF EXISTS chart_male;
28 | CREATE TEMPORARY TABLE chart_male AS
29 | SELECT music, cnt,
30 | RANK() OVER (ORDER BY cnt DESC) AS music_rank
31 | FROM popular_music_by_sex
32 | WHERE gender = 'Male'
33 | ORDER BY music_rank;
34 |
35 | DROP TEMPORARY TABLE IF EXISTS chart_female;
36 | CREATE TEMPORARY TABLE chart_female AS
37 | SELECT music, cnt,
38 | RANK() OVER (ORDER BY cnt DESC) AS music_rank
39 | FROM popular_music_by_sex
40 | WHERE gender = 'Female'
41 | ORDER BY music_rank;
42 |
43 | # Finally we bring everything together.
44 | # Note that we start with popular_music as a reference table
45 | # and we left join the other two tables, as there is no guarantee
46 | # that we will encounter a music in both males and females tables
47 | # To estimate the difference between males and females we take the
48 | # log of the rank, and then the difference; the reason we do that
49 | # is because a difference of 5 between No1 and No6 is very different
50 | # than a difference of 5 between No605 and No610. With LOG we kind
51 | # of estimate difference in "orders of magnitude"
52 | SELECT S.music, S.cnt,
53 | M.cnt AS male_cnt, M.music_rank AS male_rank,
54 | F.cnt AS female_cnt, F.music_rank AS female_rank,
55 | ROUND(-LOG(F.music_rank / M.music_rank),2) AS diff_females
56 | FROM popular_music S
57 |     LEFT JOIN chart_male M ON (S.music = M.music)
58 |     LEFT JOIN chart_female F ON (S.music = F.music)
59 | ORDER BY diff_females DESC;
60 |
--------------------------------------------------------------------------------
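A sketch of the OVER (PARTITION BY ...) alternative mentioned in the comments above, assuming the same popular_music_by_sex temporary table. A single window ranks both gender charts in one pass; the temporary table is referenced only once, so the restriction noted above does not apply, but reshaping the output into side-by-side male/female columns would still require the join step used in the script.

WITH ranked AS (
  SELECT music, gender, cnt,
         RANK() OVER (PARTITION BY gender ORDER BY cnt DESC) AS music_rank
  FROM popular_music_by_sex
)
SELECT music, gender, cnt, music_rank
FROM ranked
ORDER BY gender, music_rank;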