├── .gitattributes ├── .github └── workflows │ └── main.yml ├── .gitignore ├── F-SQL_Misc.pptx ├── MySQL_Install.md ├── README.md ├── TODO.md ├── binder ├── postBuild └── requirements.txt ├── practice_db_restaurants.sql ├── queries1.ipynb ├── queries2.ipynb ├── session1 ├── A1-ER_modeling.pptx ├── A2-ER_modeling-GalleryPainter.pptx ├── A2-ER_modeling-WaterUtility.pptx ├── A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx ├── A4-ER_modeling-CreateTablesInSQL.pptx ├── A5-Inserting_Data_in_MySQL_using_Python.ipynb ├── A7-Citibike.ipynb ├── A8-ERD_No_Future_Records.mwb ├── assignment1.pdf ├── assignment2.md ├── cellular_operator_ER_diagram.PNG ├── practice_questions.md └── practice_questions_solutions.md ├── session2 ├── A-Navigation_Queries.ipynb ├── A-SQL_Intro_Navigating_DB.pptx ├── B-SQL_Selection_Queries.pptx ├── B-Selection_Queries.ipynb ├── C-Schemas.pdf ├── C-Schemas.pptx ├── C1-ERD_Facebook.mwb ├── C2-ERD_IMDB.mwb └── assignment_selection_queries.ipynb ├── session3 ├── B3-Filtering_Queries.ipynb ├── B3-SQL_Filtering.pptx ├── assignment_filtering_queries.ipynb └── practice_questions_filtering.md ├── session4 ├── C-Join_Queries.ipynb ├── C-SQL_Joins.pptx ├── README.md ├── assignment_join_queries.ipynb └── practice_questions_joins_restaurants.md ├── session5 ├── D-SQL_Aggregation_Queries.ipynb ├── D-SQL_Aggregation_Queries.pptx ├── assignment_aggregate_queries.ipynb ├── practice_queries_aggregation.md ├── practice_queries_aggregation_solution.ipynb ├── practice_queries_join_and_aggregation.md └── practice_queries_join_and_aggregation_solutions.ipynb ├── session6 ├── F-SQL_Subqueries-7.1.pptx ├── F-SQL_Subqueries.pptx ├── assignment_combined.ipynb ├── book_vs_gender.sql ├── books_and_political_views.sql ├── music_recommendations.sql ├── music_recommendations2.sql └── music_recommendations3.sql └── session7 ├── G-Window_queries.pptx ├── README.md └── music_by_gender_rank_example.sql /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=nbstripout 2 | 3 | *.ipynb diff=ipynb 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Check that all notebooks work 4 | 5 | # Controls when the action will run. 
Triggers the workflow on push or pull request 6 | # events but only for the master branch 7 | on: 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: [3.8] 22 | 23 | # Steps represent a sequence of tasks that will be executed as part of the job 24 | steps: 25 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 26 | - uses: actions/checkout@v2 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v1 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: Install dependencies and Flake8/Flake8-nb for linting 34 | run: | 35 | python3 -m pip install --upgrade pip 36 | pip3 install -r binder/requirements.txt 37 | - name: Lint with flake8 38 | run: | 39 | # stop the build if there are Python syntax errors or undefined names 40 | # flake8-nb session1/*.ipynb --count --select=E9,F63,F7,F82 --show-source --statistics 41 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 42 | # flake8-nb session1/*.ipynb --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 43 | - name: Run all the notebooks using nbconvert 44 | run: | 45 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session1/A5-Inserting_Data_in_MySQL_using_Python.ipynb 46 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session1/A7-Citibike.ipynb 47 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/A-Navigation_Queries.ipynb 48 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/B-Selection_Queries.ipynb 49 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session3/B3-Filtering_Queries.ipynb 50 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session4/C-Join_Queries.ipynb 51 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 sql_assignment_template.ipynb 52 | 53 | # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session2/assignment_selection_queries.ipynb 54 | # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session3/assignment_filtering_queries.ipynb 55 | # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session4/assignment_join_queries.ipynb 56 | # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/assignment_aggregate_queries.ipynb 57 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/practice_queries_aggregation_solution.ipynb 58 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session5/practice_queries_join_and_aggregation_solutions.ipynb 59 | # jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=600 session6/assignment_combined.ipynb 60 | 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # 
Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /F-SQL_Misc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/F-SQL_Misc.pptx -------------------------------------------------------------------------------- /MySQL_Install.md: -------------------------------------------------------------------------------- 1 | # Install MySQL server 2 | 3 | The instructions below can be used to install MySQL from scratch in any Linux/Ubuntu machine, and to import the databases that we will use as examples. (Most of) these steps have been already completed in the image that is being used for the class, but I keep the instructions here for reference, if a student wants to create a MySQL installation in a new machine. 4 | 5 | ## Install Software on Ubuntu 6 | 7 | Login to the terminal and type: 8 | 9 | `sudo apt-get update` 10 | 11 | and then 12 | 13 | `sudo apt-get -y install mysql-server` 14 | 15 | During installation, you will be prompted to create a password for "root" user. You can use any password you like, but make sure that you remember it. In our own installation, we used the password `dwdstudent2015`. 16 | 17 | After a succesful installation, you will be able to access MySQL server from the console by typing: 18 | 19 | `mysql -u root -p` 20 | 21 | Inside mysql console you can execute SQL commands, for example, the command: 22 | 23 | `mysql> SHOW DATABASES;` 24 | 25 | will show you the databases available. 
The first time you run the command you will see something like: 26 | 27 | 28 | | Database | 29 | |--------------------| 30 | | information_schema | 31 | | mysql | 32 | | performance_schema | 33 | ``` 34 | 3 rows in set (0.00 sec) 35 | ``` 36 | 37 | For now, let's get out of the command-line interface by typing the command `QUIT` 38 | 39 | `mysql> QUIT` 40 | 41 | ### Making MySQL server accessible from host machine 42 | 43 | We need to change a couple of things to make the MySQL database accessible from the host machine. First we need to change the configuration file for MySQL, to allow it to respond to connections from the outside world. 44 | 45 | 1. Make sure that your machine has the port 3306 open in the security settings of your EC2 instance. 46 | 47 | 2. Go and edit the file `/etc/mysql/mysql.conf.d/mysqld.cnf` and find the parameter `bind-address`. By default, the setting is `bind-address = 127.0.0.1`. Change it to `bind-address = 0.0.0.0` in order to allow connections from any machine. 48 | 49 | 3. Connect to MySQL (type `mysql -u root -p` in the shell) and then within MySQL run the following commands: 50 | 51 | `mysql> CREATE USER 'root'@'%' IDENTIFIED BY 'dwdstudent2015';` 52 | 53 | `mysql> GRANT ALL PRIVILEGES ON *.* TO 'root'@'%';` 54 | 55 | `mysql> FLUSH PRIVILEGES;` 56 | 57 | `mysql> exit` 58 | 59 | And then, in the shell: 60 | 61 | `sudo service mysql restart` 62 | 63 | _Note: This is an insecure setup, as it provides admin access to your database to anyone who has the IP address of your machine and the password._ 64 | 65 | ## Import databases 66 | 67 | Now, we are ready to fetch the datasets and store them in the database. 68 | 69 | ### Facebook 70 | 71 | Import a database of the Facebook profiles of the first NYU users (back from 2004-6), before Facebook started paying attention to these annoying issues of privacy and security :-) 72 | 73 | `!zcat data/facebook.sql.gz | mysql -u root -pdwdstudent2015` 74 | 75 | 76 | _Warning_: Importing the Facebook data will take approximately 15-20 minutes, during which the machine will look unresponsive. Please do not stop it. 77 | 78 | ### IMDB 79 | 80 | This database contains a set of tables from the IMDB database. 81 | 82 | `!zcat data/imdb.sql.gz | mysql -u root -pdwdstudent2015` 83 |
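### Verify the imports

To confirm that both databases were loaded, a quick sanity check from the `mysql` console (a minimal sketch; `imdb.actors` and `facebook.Profiles` are tables used later in the class notebooks):

```sql
-- The facebook and imdb databases should now appear in the list
SHOW DATABASES;

-- Spot-check one table from each imported database; both counts should be non-zero
SELECT COUNT(*) FROM imdb.actors;
SELECT COUNT(*) FROM facebook.Profiles;
```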
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ipeirotis/introduction-to-databases/blob/master/) 2 | 3 | # Relational Databases and SQL 4 | 5 | This is a brief introductory module to relational databases and SQL. It mainly targets people who are interested in learning SQL, and does not cover topics such as indexing, transactions, stored procedures, etc. 6 | 7 | ## Videos for the class 8 | 9 | [Videos for the class](https://www.youtube.com/playlist?list=PLqAPn_b_yx0QcOgEvAKQQ5yzplFI-FOQI) 10 | 11 | ## Indicative Schedule 12 | 13 | ### Module 1: Entity-Relationship Model and Relational Databases 14 | 15 | * Entities, Primary Keys, and Attributes 16 | * Relations 17 | * Cardinality: One-to-One, One-to-Many, Many-to-Many 18 | * From ER Diagram to a Relational Schema 19 | * (Optional) SQL Statements for Creating Tables 20 | * (Optional) Populating a Database with Data 21 | * Activity 1: Artist-Gallery-Painting example 22 | * Activity 2: Creating a relational schema from an ER diagram 23 | * Activity 3: From Spreadsheet to a Normalized Database 24 | 25 | 26 | ### Module 2: Selection Queries 27 | 28 | * Understand the design of our example databases 29 | * Navigating a Database: `USE`, `SHOW TABLES`, `DESCRIBE` 30 | * Selection queries: `SELECT *`, `SELECT column`, `column AS` 31 | * Selection queries: `DISTINCT`, `ORDER BY`, `LIMIT` 32 | 33 | ### Module 3: Filtering Queries 34 | 35 | * `WHERE` clause 36 | * Boolean conditions: `AND`, `OR`, `NOT`, `BETWEEN` 37 | * Containment condition: `IN` 38 | * Approximate matches: `LIKE` 39 | * `NULL` values 40 | * `CASE WHEN` clause 41 | * Attribute-level functions: NULL functions, date functions, etc. 42 | * Activity: Find People that Live in "New York" (exploration for data cleaning) 43 | * TODO: Create separate slides for Null Functions, Date Functions, and String functions 44 | * TODO: Create videos for CASE WHEN, Null Functions, Date Functions, String functions 45 | 46 | ### Module 4: JOIN queries 47 | 48 | * Inner Joins 49 | * Self Joins 50 | * Outer Joins 51 | * Antijoins and Semijoins 52 | 53 | ### Module 5: Aggregate queries 54 | 55 | * Aggregation functions (COUNT, COUNT DISTINCT, SUM, AVG, MAX, MIN, STDDEV, GROUP_CONCAT) 56 | * GROUP BY on a single attribute 57 | * GROUP BY on multiple attributes 58 | * HAVING clause 59 | * Integrated JOIN and GROUP BY queries 60 | * TODO: Add an example for GROUP_CONCAT (e.g., for movie genres); see the sketch after this list 61 | * TODO: In video ["Integrated Queries: Statistics on Directors"](https://www.youtube.com/watch?v=aeXWO4xHsTw&list=PLqAPn_b_yx0QcOgEvAKQQ5yzplFI-FOQI&index=42) at sec 37 remove the "For movies from the Year 2000" 62 |
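A minimal sketch of the `GROUP_CONCAT` example mentioned in the TODO above, assuming the class `imdb` database; the `movies_genres` table and the column names used here are assumptions for illustration only:

```sql
-- One row per movie, with its genres collapsed into a single comma-separated string
SELECT m.name,
       GROUP_CONCAT(g.genre ORDER BY g.genre SEPARATOR ', ') AS genres
FROM movies AS m
JOIN movies_genres AS g ON g.movie_id = m.id
GROUP BY m.id, m.name;
```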
63 | ### Module 6: Subqueries 64 | 65 | * Subqueries with single-value results 66 | * Semijoins and Antijoins using subqueries with the IN clause 67 | * Subqueries with derived tables 68 | * Comparison of WITH, temporary tables, views, and tables 69 | * Activity 1: Music recommendations 70 | * Activity 2: Compare Tastes Across Demographic Segments 71 | 72 | ### Module 7: Window queries 73 | 74 | * Window definition: `OVER(ORDER BY)` 75 | * Ranking window functions: `RANK`, `DENSE_RANK`, etc. (see the sketch after this list) 76 | * Aggregation functions and windows `OVER(PARTITION BY ORDER BY)` 77 | * Offset window functions: `LEAD`, `LAG`, etc. 78 | * Aggregation functions and windows 79 | * Frame definitions and rolling aggregations 80 | 81 | 82 |
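A minimal sketch of the kind of ranking window query covered in this module (requires MySQL 8.0+), assuming the class `imdb` database; the column names `name`, `year`, and `rank` are assumptions for illustration only:

```sql
-- Rank movies by their rating within each release year (highest-rated movie gets rank 1)
SELECT name,
       year,
       `rank` AS rating,
       DENSE_RANK() OVER (PARTITION BY year ORDER BY `rank` DESC) AS rank_in_year
FROM movies;
```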
83 | ### Planned modules 84 | 85 | * Temporal data (e.g., following https://www.stratascratch.com/guides/sql-time-and-date-skills) 86 | * Geospatial data 87 | * String functions 88 | 89 | * UNION 90 | * ANY/ALL 91 | * ROLLUP / GROUPING 92 | * EXISTS 93 | 94 | 95 | 96 | ## Additional Resources for Learning SQL 97 | 98 | * [StrataScratch](https://platform.stratascratch.com/coding), [Leetcode](https://leetcode.com/problemset/database/), [DataLemur](https://datalemur.com/): SQL Interview questions for data science positions in many tech companies 99 | * [Mode SQL Tutorial](https://mode.com/sql-tutorial/): A well-written and organized tutorial for SQL, with material starting from the very basics up to very advanced. 100 | * [Introduction to SQL](https://www.khanacademy.org/computing/computer-programming/sql) from Khan Academy. Introductory course, with videos explaining the various aspects of SQL. 101 | * [W3Schools SQL](http://www.w3schools.com/sql/): An introduction to SQL with hands-on examples 102 | * [Learn SQL](https://www.codecademy.com/learn/learn-sql) and [SQL: Analyzing Business Metrics](https://www.codecademy.com/learn/sql-analyzing-business-metrics): Two short, self-directed online courses from Codecademy 103 | * [SQL Tutorial](http://www.w3resource.com/sql/tutorials.php) 104 | * [Learning MySQL](http://shop.oreilly.com/product/9780596008642.do): A useful textbook for those interested in learning more about SQL 105 | * [W3 Resource](https://www.w3resource.com/sql/tutorials.php) and [SQL exercises](https://www.w3resource.com/sql-exercises/) 106 | * [Become a SELECT Star!](https://gumroad.com/l/sql-zine) by Julia Evans: A very useful e-zine that summarizes in a cartoonish way most of the SQL concepts that we cover in class. Worth the $12. 107 | * [How to Teach People SQL](https://dataschool.com/how-to-teach-people-sql/): Great visualizations for the various SQL operations that we learn. 108 | * [SQL Data Manipulation for Data Scientists](https://www.stratascratch.com/guides/): Advanced data manipulation techniques using SQL. 109 | * [Best practices for SQL](https://data36.com/sql-best-practices-data-analysts/): A set of useful guidelines for writing readable SQL statements. 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 2 | * Move UNION earlier as well? 3 | 4 | * COALESCE 5 | 6 | 7 | 8 | 9 | Practice 10 | 11 | Read the "Learning MySQL" textbook, chapter 7. 12 | Work on the online SQL Tutorial by W3Schools 13 | Work on the SQL course on Codecademy 14 | Work on the SQL course on Khan Academy 15 | -------------------------------------------------------------------------------- /binder/postBuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | jupyter contrib nbextension install --user 4 | 5 | # Allow collapsible headers 6 | jupyter nbextension enable collapsible_headings/main 7 | 8 | # Add a code formatter - requires yapf package 9 | jupyter nbextension enable code_prettify/code_prettify 10 | 11 | # Adds the "Solution" hiding option 12 | jupyter nbextension enable exercise2/main 13 | 14 | # Spellchecking 15 | jupyter nbextension enable spellchecker/main 16 | 17 | 18 | # Install nbstripout on git and clean the notebooks 19 | nbstripout --install --attributes .gitattributes 20 | nbstripout notes/*.ipynb 21 | 22 | -------------------------------------------------------------------------------- /binder/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | 3 | tqdm 4 | matplotlib 5 | 6 | PyMySQL 7 | sqlalchemy 8 | sql_magic 9 | 10 | pandas 11 | openpyxl 12 | 13 | flake8 14 | flake8-nb 15 | jupyter 16 | 17 | mistune>=2.0.3 # not directly required, pinned by Snyk to avoid a vulnerability 18 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 19 | -------------------------------------------------------------------------------- /practice_db_restaurants.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE IF EXISTS Restaurants; 2 | 3 | CREATE DATABASE Restaurants; 4 | 5 | USE Restaurants; 6 | 7 | CREATE TABLE Restaurant( 8 | restCode NUMERIC(3) NOT NULL, 9 | restName VARCHAR(30) NOT NULL, 10 | cuisine VARCHAR(25) NOT NULL, 11 | borough VARCHAR(30) NOT NULL, 12 | yearEst SMALLINT NOT NULL CHECK (yearEst >= 1990 AND yearEst <= 2019), 13 | avgPrice NUMERIC(5,2) NOT NULL, 14 | CONSTRAINT Rest_pk PRIMARY KEY (restCode) 15 | ); 16 | 17 | CREATE TABLE Critic( 18 | cID NUMERIC(3) NOT NULL, 19 | firstN VARCHAR(25) NOT NULL, 20 | lastN VARCHAR(30) NOT NULL, 21 | affiliation VARCHAR(25), 22 | CONSTRAINT Critic_pk PRIMARY KEY (cID) 23 | ); 24 | 25 | CREATE TABLE Rating( 26 | code CHAR(4) NOT NULL, 27 | cID NUMERIC(3) NOT NULL, 28 | restCode NUMERIC(3) NOT NULL, 29 | starRating NUMERIC(1) NOT NULL, 30 | ratingDate DATE , 31 | comments VARCHAR(250), 32 | CONSTRAINT Rating_pk PRIMARY KEY (code), 33 | CONSTRAINT Rating_rID_fk FOREIGN KEY(cID) REFERENCES Critic(cID), 34 | CONSTRAINT Rating_restCode_fk FOREIGN KEY(restCode) REFERENCES Restaurant(restCode) 35 | ); 36 | 37 | 38 | ALTER TABLE Rating 39 | ADD CHECK (starRating <= 5); 40 | 41 | INSERT INTO Restaurant VALUES (101, 'Pok Pok', 'Thai', 'Brooklyn', 2005, 100.00); 42 | INSERT INTO Restaurant VALUES (102, 'Kiin Thai', 'Thai', 'Manhattan', 2013, 75.00); 43 | INSERT INTO Restaurant VALUES (103, 'Carbone', 'Italian', 'Manhattan', 2010, 150.00); 44 | INSERT INTO Restaurant VALUES (104, 'Il Mulino', 'Italian', 'Manhattan', 1999, 250.00); 45 | INSERT INTO Restaurant VALUES (105, 'Don Peppe', 'Italian', 'Queens', 1998, 75.00); 46 | INSERT INTO Restaurant VALUES (106, 'Loukoumi Taverna', 'Greek', 'Queens', 1994, 130.00); 47 | INSERT INTO Restaurant VALUES (107, 'Nisi', 'Greek', 'Manhattan', 2014, 100.00); 48 | INSERT INTO Restaurant VALUES (108, 'Ela Taverna', 'Greek', 'Brooklyn', 2015, 150.00); 49 | INSERT INTO Restaurant
VALUES (109, 'Jianbing Company', 'Chinese', 'Brooklyn', 2010, 75.00); 50 | INSERT INTO Restaurant VALUES (110, 'Han Dynasty', 'Chinese', 'Manhattan', 2012, 125.00); 51 | INSERT INTO Restaurant VALUES (111, 'Antonio Trattoria', 'Italian', 'Bronx', 2008, 75.00); 52 | 53 | INSERT INTO Critic VALUES (201,'Sarah', 'Martinez','NYT'); 54 | INSERT INTO Critic VALUES (202,'Daniel', 'Lewis', 'WP'); 55 | INSERT INTO Critic VALUES (203,'Brittany', 'Harris', 'Vogue'); 56 | INSERT INTO Critic VALUES (204,'Mike', 'Anderson',NULL); 57 | INSERT INTO Critic VALUES (205,'Chris', 'Jackson','NYT'); 58 | INSERT INTO Critic VALUES (206,'Elizabeth', 'Thomas','Chronicle'); 59 | INSERT INTO Critic VALUES (207,'James', 'Cameron','NYP'); 60 | INSERT INTO Critic VALUES (208,'Ashley', 'White','NYT'); 61 | INSERT INTO Critic (cID, lastN, firstN) VALUES (209, 'Clarke','George'); 62 | INSERT INTO Critic VALUES (210,'Sean', 'Thompson','NYP'); 63 | 64 | INSERT INTO Rating VALUES ('R1', 201,101,2,'2014-11-13', 'Good food, bad service'); 65 | INSERT INTO Rating VALUES ('R2', 201,101,4,'2017-01-15', 'Amazing deserts, so-so appetizers'); 66 | INSERT INTO Rating VALUES ('R3', 202,106,4,NULL, 'Great atmosphere, friendly staff'); 67 | INSERT INTO Rating VALUES ('R4', 203,103,2,'2015-02-01', 'Disappointed'); 68 | INSERT INTO Rating VALUES ('R5', 203,108,4,'2016-03-01', 'Great fish'); 69 | INSERT INTO Rating VALUES ('R6', 203,108,2,'2018-06-30', 'Not as good as before'); 70 | INSERT INTO Rating VALUES ('R7', 204,101,3,'2017-10-23', NULL); 71 | INSERT INTO Rating VALUES ('R8', 205,103,3,'2012-02-16', NULL); 72 | INSERT INTO Rating (code, cID, restCode,ratingDate,starRating) VALUES ('R9', 205,104,'2000-02-16',2); 73 | INSERT INTO Rating (code, cID, restCode,starRating,comments) VALUES ('R10', 205,108,5, 'Must try fish'); 74 | INSERT INTO Rating VALUES ('R11', 206,107,3,'2016-07-02', 'Great food, rude staff'); 75 | INSERT INTO Rating VALUES ('R12', 206,106,5,'2001-12-21', 'Loved everything'); 76 | INSERT INTO Rating VALUES ('R13', 208,104,3,'2003-06-30', 'Overpriced'); 77 | INSERT INTO Rating VALUES ('R14', 209,104,3,'1005-07-30', NULL); 78 | -------------------------------------------------------------------------------- /session1/A1-ER_modeling.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A1-ER_modeling.pptx -------------------------------------------------------------------------------- /session1/A2-ER_modeling-GalleryPainter.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A2-ER_modeling-GalleryPainter.pptx -------------------------------------------------------------------------------- /session1/A2-ER_modeling-WaterUtility.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A2-ER_modeling-WaterUtility.pptx -------------------------------------------------------------------------------- /session1/A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A3-ER_modeling-DB_vs_Spreadsheets_Normalization.pptx -------------------------------------------------------------------------------- /session1/A4-ER_modeling-CreateTablesInSQL.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A4-ER_modeling-CreateTablesInSQL.pptx -------------------------------------------------------------------------------- /session1/A5-Inserting_Data_in_MySQL_using_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "id": "Zq3aQJmsEt1L", 18 | "outputId": "df0cf6b1-57aa-49e8-86e6-425f96359196", 19 | "colab": { 20 | "base_uri": "https://localhost:8080/" 21 | } 22 | }, 23 | "outputs": [ 24 | { 25 | "output_type": "stream", 26 | "name": "stdout", 27 | "text": [ 28 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.8/44.8 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 29 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m40.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 30 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 31 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.9/121.9 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 32 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.4/93.4 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 33 | "\u001b[?25h" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "!sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "IhpLu2WHEt02" 45 | }, 46 | "source": [ 47 | "## Inserting data in MySQL using Python\n", 48 | "\n", 49 | "First let's start with a basic piece of code that fetches the data that we want to insert in the database. 
For our example, we will get the data about the Citibike stations, using the correspoding API call provided by the Citibike website:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": { 56 | "id": "g-_StVU-Et03" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import requests\n", 61 | "import uuid\n", 62 | "from datetime import date, datetime, timedelta" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "id": "Ues1lO5FEt09" 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# Let's get the data from the Citibike API\n", 74 | "url = \"https://gbfs.citibikenyc.com/gbfs/en/station_information.json\"\n", 75 | "results = requests.get(url).json()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": { 82 | "id": "Jzs_lCvxEt1B" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# We only need a subset of the data in the JSON returned by the Citibike API, so we keep only we need\n", 87 | "data = results[\"data\"][\"stations\"]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": { 94 | "id": "gTbzvvk7J-Fd", 95 | "outputId": "1a85e518-ece5-46f2-d7ec-b0d0963d175c", 96 | "colab": { 97 | "base_uri": "https://localhost:8080/" 98 | } 99 | }, 100 | "outputs": [ 101 | { 102 | "output_type": "execute_result", 103 | "data": { 104 | "text/plain": [ 105 | "{'rental_methods': ['KEY', 'CREDITCARD'],\n", 106 | " 'lat': 40.763604677958625,\n", 107 | " 'external_id': 'b442a648-e9f4-4893-951a-64d258bc0e55',\n", 108 | " 'lon': -73.98917958140373,\n", 109 | " 'capacity': 30,\n", 110 | " 'station_id': 'b442a648-e9f4-4893-951a-64d258bc0e55',\n", 111 | " 'eightd_has_key_dispenser': False,\n", 112 | " 'station_type': 'classic',\n", 113 | " 'region_id': '71',\n", 114 | " 'electric_bike_surcharge_waiver': False,\n", 115 | " 'name': 'W 50 St & 9 Ave',\n", 116 | " 'has_kiosk': True,\n", 117 | " 'short_name': '6854.05',\n", 118 | " 'rental_uris': {'android': 'https://bkn.lft.to/lastmile_qr_scan',\n", 119 | " 'ios': 'https://bkn.lft.to/lastmile_qr_scan'},\n", 120 | " 'eightd_station_services': []}" 121 | ] 122 | }, 123 | "metadata": {}, 124 | "execution_count": 5 125 | } 126 | ], 127 | "source": [ 128 | "data[1]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": { 135 | "id": "jAlZwKq8J0OM", 136 | "outputId": "2773c6fe-9716-49e5-f7b0-b00fac0fc09b", 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | } 140 | }, 141 | "outputs": [ 142 | { 143 | "output_type": "execute_result", 144 | "data": { 145 | "text/plain": [ 146 | "2098" 147 | ] 148 | }, 149 | "metadata": {}, 150 | "execution_count": 6 151 | } 152 | ], 153 | "source": [ 154 | "len(data)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "metadata": { 161 | "id": "KxuM5dGREt1Q" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from sqlalchemy import create_engine\n", 166 | "from sqlalchemy import text\n", 167 | "\n", 168 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/\".format(\n", 169 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\"\n", 170 | ")\n", 171 | "\n", 172 | "engine = create_engine(conn_string)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "VzsnOW_AEt1U" 179 | }, 180 | "source": [ 181 | "Once we have connected successfully, we need to create our database:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 
| "execution_count": 8, 187 | "metadata": { 188 | "id": "TQT4IE2FEt1U" 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "# Query to create a database\n", 193 | "# In this example, we will try to create the (existing) database \"public\"\n", 194 | "# But in general, we can give any name to the database\n", 195 | "db_name = \"public\"\n", 196 | "create_db_query = (\n", 197 | " f\"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'\"\n", 198 | ")\n", 199 | "\n", 200 | "# Create a database\n", 201 | "with engine.connect() as connection:\n", 202 | " connection.execute(text(create_db_query))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "id": "3sa0ArJhEt1Z" 209 | }, 210 | "source": [ 211 | "Then we create the table where we will store our data. For our example, we will just import three fields in the database: station_id, station_name, and number_of_docks" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "metadata": { 218 | "id": "qzWnULWfEt1a", 219 | "outputId": "9ca027be-41d9-4862-905c-d090d6ecb232", 220 | "colab": { 221 | "base_uri": "https://localhost:8080/" 222 | } 223 | }, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "name": "stdout", 228 | "text": [ 229 | "d94ec55c\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "# To avoid conflicts between people writing in the same database, we add a random suffix in the tables\n", 235 | "# We only create the variable once while running the notebook\n", 236 | "if \"suffix\" not in globals():\n", 237 | " suffix = str(uuid.uuid4())[:8]\n", 238 | "print(suffix)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 10, 244 | "metadata": { 245 | "id": "GW0oeuOcEt1e" 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "table_name = f\"Docks_{suffix}\"\n", 250 | "\n", 251 | "# Drop the table if there is one already\n", 252 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{table_name}\"\n", 253 | "with engine.connect() as connection:\n", 254 | " connection.execute(text(drop_table_query))\n", 255 | "\n", 256 | "# Create a table\n", 257 | "create_table_query = f\"\"\"CREATE TABLE IF NOT EXISTS {db_name}.{table_name}\n", 258 | " (station_id varchar(50),\n", 259 | " station_name varchar(50),\n", 260 | " capacity int,\n", 261 | " PRIMARY KEY(station_id)\n", 262 | " )\"\"\"\n", 263 | "\n", 264 | "with engine.connect() as connection:\n", 265 | " connection.execute(text(create_table_query))\n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": { 271 | "id": "hdI9lReyEt1h" 272 | }, 273 | "source": [ 274 | "Finally, we import the data into our table, using the INSERT command. (_Note: The `INSERT IGNORE` directs the database to ignore attempts to insert another tuple with the same primary key. 
In our case, we do not want to allow two entries for the same `station_id`._)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "id": "qQLqOddcEt1i" 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "query_template = f\"\"\"\n", 286 | " INSERT IGNORE INTO\n", 287 | " {db_name}.{table_name}(station_id, station_name, capacity)\n", 288 | " VALUES (:station_id, :station_name, :capacity)\n", 289 | " \"\"\"\n", 290 | "\n", 291 | "# THIS IS PROHIBITED\n", 292 | "# query = \"INSERT INTO citibike.Docks(station_id, station_name, number_of_docks) \" + \\\n", 293 | "# \"VALUES (\"+entry[\"id\"]+\", \"+entry[\"stationName\"]+\", \"+entry[\"totalDocks\"]+\")\"\n", 294 | "\n", 295 | "with engine.connect() as connection:\n", 296 | " for entry in data:\n", 297 | " query_parameters = {\n", 298 | " \"station_id\": entry[\"station_id\"],\n", 299 | " \"station_name\": entry[\"name\"],\n", 300 | " \"capacity\": entry[\"capacity\"]\n", 301 | " }\n", 302 | " print(\"Inserting station\", entry[\"station_id\"], \"at\", entry[\"name\"], \"with\", entry[\"capacity\"], \"docks\")\n", 303 | " connection.execute(text(query_template), query_parameters)\n", 304 | " connection.commit()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "source": [ 310 | "## Query the Database to retrieve the data" 311 | ], 312 | "metadata": { 313 | "id": "0y3Dn5m-DEmn" 314 | } 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": { 319 | "id": "nZPtYSJOEt1s" 320 | }, 321 | "source": [ 322 | "Now let's see how to query the database" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 12, 328 | "metadata": { 329 | "id": "qcdbX7AWEt1t" 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "with engine.connect() as connection:\n", 334 | " results = connection.execute(text(f\"SELECT station_id, station_name, capacity FROM {db_name}.{table_name}\"))\n", 335 | " rows = results.mappings().all()\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 13, 341 | "metadata": { 342 | "id": "BT-lYjnXEt1w", 343 | "outputId": "2f4deaca-3759-48ce-d9b5-9ac9a3ab9563", 344 | "colab": { 345 | "base_uri": "https://localhost:8080/" 346 | } 347 | }, 348 | "outputs": [ 349 | { 350 | "output_type": "stream", 351 | "name": "stdout", 352 | "text": [ 353 | "Number of rows: 2098\n", 354 | "=============================================\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "# Let's check how many data points we got back\n", 360 | "print(f\"Number of rows: {len(rows)}\")\n", 361 | "print(\"=============================================\")" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "source": [ 367 | "# And now let's go over the results\n", 368 | "for row in rows:\n", 369 | " print(\"Station ID:\", row['station_id'])\n", 370 | " print(\"Station Name:\", row['station_name'])\n", 371 | " print(\"Number of Docks:\", row['capacity'])\n", 372 | " print(\"=============================================\")" 373 | ], 374 | "metadata": { 375 | "id": "yc9ZgIoNvXvY" 376 | }, 377 | "execution_count": null, 378 | "outputs": [] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "id": "Ifc_6hwwEt1z" 384 | }, 385 | "source": [ 386 | "Finally, let's clean up and close our database connection." 
387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 15, 392 | "metadata": { 393 | "id": "AWMXngKcEt1z" 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{table_name}\"\n", 398 | "with engine.connect() as connection:\n", 399 | " connection.execute(text(drop_table_query))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": { 405 | "id": "W9fjdQPNLwfh" 406 | }, 407 | "source": [ 408 | "## Exercise\n", 409 | "\n", 410 | "At `https://gbfs.citibikenyc.com/gbfs/en/station_status.json` we can access the live status of all the stations (e.g., bikes available etc). Using the approach outlined above, create a table in the database (using the same table suffix that we created above) and store the data in the database." 411 | ] 412 | } 413 | ], 414 | "metadata": { 415 | "colab": { 416 | "name": "A5-Inserting_Data_in_MySQL_using_Python.ipynb", 417 | "provenance": [], 418 | "include_colab_link": true 419 | }, 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.8.2" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 0 440 | } -------------------------------------------------------------------------------- /session1/A7-Citibike.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic tqdm" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# This gives information for each station that remains stable over time\n", 28 | "url_stations = \"https://gbfs.citibikenyc.com/gbfs/en/station_information.json\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# This gives the live status of all the stations (e.g., bikes available etc)\n", 38 | "url_status = \"https://gbfs.citibikenyc.com/gbfs/en/station_status.json\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# We fetch for now just the time-invariant data\n", 48 | "results = requests.get(url_stations).json()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# We only need a subset of the data in the JSON returned by the Citibike API, so we keep only what we need\n", 58 | "stations = results[\"data\"][\"stations\"]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# We will not be using dataframes for this insertion task. 
(See the A6 notebook if you want to use Pandas)\n", 68 | "# We just put the data in a dataframe to understand what is going on.\n", 69 | "import pandas as pd\n", 70 | "\n", 71 | "df_stations = pd.DataFrame(stations)\n", 72 | "df_stations.head(5)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "import sqlalchemy\n", 82 | "from sqlalchemy import create_engine\n", 83 | "\n", 84 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/\".format(\n", 85 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\"\n", 86 | ")\n", 87 | "\n", 88 | "engine = create_engine(conn_string)\n", 89 | "\n", 90 | "db_name = \"public\"\n", 91 | "create_db_query = (\n", 92 | " f\"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'\"\n", 93 | ")\n", 94 | "\n", 95 | "# Create a database\n", 96 | "engine.execute(create_db_query)\n", 97 | "\n", 98 | "# And lets switch to the database\n", 99 | "engine.execute(f\"USE {db_name}\")\n", 100 | "\n", 101 | "# To avoid conflicts between people writing in the same database, we add a random suffix in the tables\n", 102 | "# We only create the variable once while running the notebook\n", 103 | "import uuid\n", 104 | "\n", 105 | "if \"suffix\" not in globals():\n", 106 | " suffix = str(uuid.uuid4())[:8]\n", 107 | "print(suffix)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Create the two tables. One for storing the time-invariant station data\n", 117 | "# and another table to store the time-varying station status data\n", 118 | "stations_table = f\"Stations_{suffix}\"\n", 119 | "\n", 120 | "sql = f\"\"\"CREATE TABLE IF NOT EXISTS {stations_table}\n", 121 | " (station_id int, \n", 122 | " name varchar(250), \n", 123 | " capacity int,\n", 124 | " lat float,\n", 125 | " lon float,\n", 126 | " region_id int,\n", 127 | " short_name varchar(250),\n", 128 | " rental_url varchar(250),\n", 129 | " eightd_has_key_dispenser bool,\n", 130 | " PRIMARY KEY(station_id)\n", 131 | " )\"\"\"\n", 132 | "engine.execute(sql)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Create the time-varying table\n", 142 | "status_table = f\"Status_{suffix}\"\n", 143 | "sql = f\"\"\"CREATE TABLE IF NOT EXISTS {status_table}\n", 144 | " (station_id int, \n", 145 | " last_reported datetime,\n", 146 | " num_bikes_available int,\n", 147 | " num_ebikes_available int,\n", 148 | " num_bikes_disabled int,\n", 149 | " num_docks_available int,\n", 150 | " num_docks_disabled int,\n", 151 | " is_installed bool,\n", 152 | " is_renting bool,\n", 153 | " is_returning bool,\n", 154 | " PRIMARY KEY(station_id, last_reported)\n", 155 | " )\"\"\"\n", 156 | "engine.execute(sql)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "stations[0]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# We fetch for now just the time-invariant data\n", 175 | "# Notice that we have the INSERT IGNORE so that even when we add the same entry\n", 176 | "# again, we do not get an error that the line exists. 
We do get warnings\n", 177 | "# but this is expected\n", 178 | "\n", 179 | "from sqlalchemy.sql import text\n", 180 | "from tqdm.autonotebook import tqdm\n", 181 | "\n", 182 | "query_template = text(\n", 183 | " f\"\"\"INSERT IGNORE INTO {db_name}.{stations_table}\n", 184 | " (station_id, name, capacity, lat, lon,\n", 185 | " region_id, short_name, rental_url, eightd_has_key_dispenser) \n", 186 | " VALUES (:station_id, :name, :capacity, :lat, :lon, :region_id, \n", 187 | " :short_name, :rental_url, :eightd_has_key_dispenser)\"\"\"\n", 188 | ")\n", 189 | "\n", 190 | "# The tqdm(stations) shows a progress bar\n", 191 | "for entry in tqdm(stations):\n", 192 | "\n", 193 | " query_parameters = {\n", 194 | " \"station_id\": int(entry[\"station_id\"]),\n", 195 | " \"name\": entry.get(\"name\"),\n", 196 | " \"capacity\": entry.get(\"capacity\"),\n", 197 | " \"lat\": entry.get(\"lat\"),\n", 198 | " \"lon\": entry.get(\"lon\"),\n", 199 | " \"region_id\": entry.get(\"region_id\"),\n", 200 | " \"short_name\": entry.get(\"short_name\"),\n", 201 | " \"rental_url\": entry.get(\"rental_url\"),\n", 202 | " \"eightd_has_key_dispenser\": entry.get(\"eightd_has_key_dispenser\"),\n", 203 | " }\n", 204 | "\n", 205 | " engine.execute(query_template, **query_parameters)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "check = pd.read_sql(f\"SELECT * FROM {db_name}.{stations_table}\", con=engine)\n", 215 | "check" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "%matplotlib inline\n", 225 | "check.plot(kind=\"scatter\", x=\"lon\", y=\"lat\", s=1, figsize=(10, 10))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "results = requests.get(url_status).json()\n", 235 | "status = results[\"data\"][\"stations\"]\n", 236 | "status[0]" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Now we fetch the data about the time varying elements of the citibike stations\n", 246 | "from datetime import datetime\n", 247 | "\n", 248 | "query_template = text(\n", 249 | " f\"\"\"INSERT IGNORE INTO {db_name}.{status_table}(station_id, \n", 250 | " num_bikes_available,\n", 251 | " num_ebikes_available,\n", 252 | " num_bikes_disabled,\n", 253 | " num_docks_available,\n", 254 | " num_docks_disabled,\n", 255 | " is_installed,\n", 256 | " is_renting,\n", 257 | " is_returning,\n", 258 | " last_reported) \n", 259 | " VALUES (:station_id, :num_bikes_available, :num_ebikes_available, :num_bikes_disabled,\n", 260 | " :num_docks_available, :num_docks_disabled, :is_installed, :is_renting, :is_returning, :last_reported)\"\"\"\n", 261 | ")\n", 262 | "\n", 263 | "for entry in tqdm(status):\n", 264 | " query_parameters = {\n", 265 | " \"station_id\": int(entry[\"station_id\"]),\n", 266 | " \"num_bikes_available\": entry[\"num_bikes_available\"],\n", 267 | " \"num_bikes_disabled\": entry[\"num_bikes_disabled\"],\n", 268 | " \"num_ebikes_available\": entry[\"num_ebikes_available\"],\n", 269 | " \"num_docks_available\": entry[\"num_docks_available\"],\n", 270 | " \"num_docks_disabled\": entry[\"num_docks_disabled\"],\n", 271 | " \"is_installed\": entry[\"is_installed\"],\n", 272 | " \"is_renting\": entry[\"is_renting\"],\n", 273 | " \"is_returning\": 
entry[\"is_returning\"],\n", 274 | " \"last_reported\": datetime.fromtimestamp(entry[\"last_reported\"]),\n", 275 | " }\n", 276 | "\n", 277 | " engine.execute(query_template, **query_parameters)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "check = pd.read_sql(f\"SELECT * FROM {db_name}.{status_table}\", con=engine)\n", 287 | "check" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{status_table}\"\n", 297 | "print(drop_table_query)\n", 298 | "engine.execute(drop_table_query)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "drop_table_query = f\"DROP TABLE IF EXISTS {db_name}.{stations_table}\"\n", 308 | "print(drop_table_query)\n", 309 | "engine.execute(drop_table_query)" 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 3", 316 | "language": "python", 317 | "name": "python3" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 3 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython3", 329 | "version": "3.8.2" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 2 334 | } 335 | -------------------------------------------------------------------------------- /session1/A8-ERD_No_Future_Records.mwb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/A8-ERD_No_Future_Records.mwb -------------------------------------------------------------------------------- /session1/assignment1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/assignment1.pdf -------------------------------------------------------------------------------- /session1/assignment2.md: -------------------------------------------------------------------------------- 1 | # ER Diagram and Database Design for Time Card Application 2 | 3 | 4 | ## Instructions 5 | 6 | * Create an ER diagram, illustrating the entities, their attributes, the relationship among entities, and the cardinalities of the entities 7 | * Translate the ER diagram into a set of tables, indicating clearly the attributes of each table, the primary key of each table, and the foreign keys that are used to implement the relationships. 8 | * Write the SQL that generates the tables that you created in the step above. 9 | 10 | ## Scenario 11 | 12 | The company you work for wants to digitize their time cards. You are asked to design the database for submitting and approving time cards. 13 | 14 | * Each timecard should have a unique id, hours worked, date submitted, and status, which is either approved, not approved, or pending. 15 | * Each employee has a unique id, name and address, and method of payment: either direct deposit or physical check. 16 | * Each employee submits a time card every pay period (i.e., in 1 year, they will submit multiple time cards). 
17 | * Each manager has a unique id and a name. 18 | * Each employee is associated with exactly one manager; each manager is in charge of multiple employees. 19 | * Each manager approves time cards. The manager may also approve time cards for employees who are not necessarily managed by him/her. 20 | * _Trickier part_: How would you handle the case when each manager is also an employee? 21 | 22 | _A quick reminder: In databases, we can limit the domain of values of a variable in two ways:_ 23 | 24 | * By making the variable a foreign key, pointing to a different table where we store the potential values. 25 | * By making the domain of the variable an "ENUM" variable that lists the (pre-defined) set of values that the variable can take. 26 | 27 | _Option (1) allows for a bit more flexibility, allowing us to modify the domain of values over time. Option (2) is preferred when the set of values is small, pre-determined, and unlikely to change in the future._ 28 | 29 | ## Deliverables 30 | 31 | You can submit your diagram and SQL queries as a Word or PDF file, or any format that we can easily read. 32 | -------------------------------------------------------------------------------- /session1/cellular_operator_ER_diagram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session1/cellular_operator_ER_diagram.PNG -------------------------------------------------------------------------------- /session1/practice_questions.md: -------------------------------------------------------------------------------- 1 | # Cellular Operator Database Questions 2 | 3 | Look at the diagram "Cellular Operator ER diagram" 4 | 5 | ![Cellular Operator ER diagram](cellular_operator_ER_diagram.PNG) 6 | 7 | ## a. Can a customer have an unlimited number of plans? 8 | ## b. Can a customer exist without a plan? 9 | ## c. Is it possible to create a plan without knowing who the customer is? 10 | ## d. Does the operator want to limit the types of handsets that can be linked to a specific plan type? 11 | ## e. Is it possible to maintain data regarding a handset without connecting it to a plan? 12 | ## f. Can a handset be associated with multiple plans? 13 | ## g. Assume a handset type exists that can utilize multiple operating systems. Could this situation be accommodated within the model included in Figure 2-24? 14 | ## h. Is the company able to track a manufacturer without maintaining information about its handsets? 15 | ## i. Can the same operating system be used on multiple handset types? 16 | ## j. There are two relationships between Customer and Plan. Explain how they differ. 17 | ## k. Characterize the degree and the cardinalities of the relationship that connects Customer to itself. Explain its meaning. 18 | ## l. Is it possible to link a handset to a specific customer in a plan with multiple customers? 19 | ## m. Can the company track a handset without identifying its operating system? 20 | -------------------------------------------------------------------------------- /session1/practice_questions_solutions.md: -------------------------------------------------------------------------------- 1 | # Cellular Operator Database Questions 2 | 3 | Look at the diagram "Cellular Operator ER diagram" 4 | 5 | ![Cellular Operator ER diagram](cellular_operator_ER_diagram.PNG) 6 | 7 | Cellular Operator Database Questions: 8 | ## a. Can a customer have an unlimited number of plans?
9 | 10 | Yes. A Customer may be responsible for 0, 1, or many Plans. 11 | 12 | ## b. Can a customer exist without a plan? 13 | Yes. The minimum cardinality of the Belongs relationship from the Customer to the Plan states that a Customer may exist without a Plan (the minimum cardinality is 0). 14 | 15 | ## c. Is it possible to create a plan without knowing who the customer is? 16 | No. The minimum cardinality of both the “responsible for” and “belongs” relationships between Plan and Customer states that at least one Customer must be related to a Plan. 17 | 18 | ## d. Does the operator want to limit the types of handsets that can be linked to a specific plan type? 19 | 20 | Yes, the cellular operator requires that a Handset (that is a particular type and a particular operating system) is linked to one Plan (that is a particular type of plan). This business rule is to be implemented in this design by indirectly requiring that a Plan Type has 0:M Plans, and each Plan is associated with certain Handsets, and each Handset is of some Handset Type. A given Plan Type is related to Handset Type through the intermediary entity types in this design. 21 | 22 | _Alternative interpretation: No, there is nothing in the current model that creates a condition that would limit – in advance – the handset types that can be related to a specific plan type._ 23 | 24 | ## e. Is it possible to maintain data regarding a handset without connecting it to a plan? 25 | 26 | Yes. The minimum cardinality of the Includes relationship between Plan and Handset states that a Handset may be included in 0 or 1 plan. The 0 minimum cardinality means that we can track data about the handset even if it is not connected to a plan; the Handset has optional participation in the Includes relationship with Plan. 27 | 28 | ## f. Can a handset be associated with multiple plans? 29 | 30 | No. The minimum cardinality of the Includes relationship between Plan and Handset states that a Handset may be included in 0 or 1 plan, not multiple plans. 31 | 32 | ## g. Assume a handset type exists that can utilize multiple operating systems. Could this situation be accommodated within the model included in Figure 2-24? 33 | 34 | No. The current model shows that a handset type is associated with one and only one operating system. 35 | 36 | ## h. Is the company able to track a manufacturer without maintaining information about its handsets? 37 | 38 | Yes. The minimum cardinality of the relationship between Manufacturer and Handset Type indicates that we can track data about a Manufacturer even if we have no (or zero) Handset Types in our database. 39 | 40 | ## i. Can the same operating system be used on multiple handset types? 41 | 42 | Yes. The maximum cardinality on the relationship between Operating System and Handset Type indicates that an Operating System may be used on 0, 1, or many Handset types. 43 | 44 | ## j. There are two relationships between Customer and Plan. Explain how they differ. 45 | 46 | The Responsible For relationship is an overall 1:M relationship between Customer and Plan. A Customer can be responsible for 0, 1, or many Plans yet any one Plan will be linked to only 1 Customer for responsibility purposes. The Belongs relationship is an overall M:M relationship that permits the linking of multiple customers to a single plan, as in the case of family members being part of a particular plan or different plans. 47 | 48 | ## k. Characterize the degree and the cardinalities of the relationship that connects Customer to itself. 
Explain its meaning. 49 | 50 | The “Family Member” relationship that connects Customer to itself has a degree of 1 (unary). It permits the tracking of each family member as a Customer. Any Customer may be a Family Member of 0, 1, or many Customer(s); as a Family Member Customer, the Customer may be linked to 0 or 1 Customer. 51 | 52 | ## l. Is it possible to link a handset to a specific customer in a plan with multiple customers? 53 | No, this is not possible according to the current model. However, the current model could be adjusted to create an Associative Entity to track the particular Customer instance with a particular Plan instance, that is then associated with a particular Handset. This suggested extension to the current model also permits a design that will easily extend the database’s ability to track additional data about the particular Customer instance with a particular Plan instance. 54 | 55 | ## m. Can the company track a handset without identifying its operating system? 56 | No. The minimum cardinality of the relationship between Handset Type and Operating System is 1 and only 1; the minimum of 1 is a mandatory participation for the Handset Type with the Operating System. 57 | 58 | -------------------------------------------------------------------------------- /session2/A-Navigation_Queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "QBeMTyL26ULI" 7 | }, 8 | "source": [ 9 | "# SQL Queries: Navigating a database" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "w8oa_JjV7F_h" 16 | }, 17 | "source": [ 18 | "## Setup" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "0SuVR-1d7Jr4" 25 | }, 26 | "source": [ 27 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "I6vMQAK86ipS" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "!pip3 install -U PyMySQL sqlalchemy sql_magic " 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "id": "vOuWkjz36ULS" 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "%reload_ext sql_magic" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "id": "KE_bhLw16ULK" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "from sqlalchemy import create_engine\n", 61 | "\n", 62 | "conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8'.format(\n", 63 | " host = 'db.ipeirotis.org', \n", 64 | " user = 'student',\n", 65 | " password = 'dwdstudent2015',\n", 66 | " encoding = 'utf-8')\n", 67 | "engine = create_engine(conn_string)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "id": "fivqywfX6ULV" 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "%config SQL.conn_name = 'engine'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "id": "dsiY_1S46ULY" 85 | }, 86 | "source": [ 87 | "## Navigation Queries" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "id": "1psXlZbQ6ULZ" 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "%%read_sql\n", 99 | "show databases" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "id": "G64fIWqC6ULc" 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "%%read_sql\n", 111 | "use imdb" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "id": "EwGGEhe_6ULh" 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "%%read_sql\n", 123 | "show tables" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "id": "bFfQ9mqa6ULm" 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "%%read_sql\n", 135 | "describe actors" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "id": "escUGjfpedCj" 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "%%read_sql\n", 147 | "describe movies" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "id": "lQhe8-fm6ULt" 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "%%read_sql\n", 159 | "describe roles" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "id": "0yDQhYOvedCk" 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "%%read_sql\n", 171 | "use facebook" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "id": "5r-1inpjedCk" 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "%%read_sql\n", 183 | "show tables" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "id": "kr5eW5p1edCk" 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "%%read_sql\n", 195 | "describe Profiles" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "id": "lA32DuPfedCl" 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | } 207 | ], 208 | "metadata": { 209 | "colab": { 210 | "name": "A-Navigation_Queries.ipynb", 211 | 
"provenance": [] 212 | }, 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.8.5" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 1 233 | } 234 | -------------------------------------------------------------------------------- /session2/A-SQL_Intro_Navigating_DB.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/A-SQL_Intro_Navigating_DB.pptx -------------------------------------------------------------------------------- /session2/B-SQL_Selection_Queries.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/B-SQL_Selection_Queries.pptx -------------------------------------------------------------------------------- /session2/B-Selection_Queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "QBeMTyL26ULI" 17 | }, 18 | "source": [ 19 | "# SQL: Selection Queries" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "heading_collapsed": true, 26 | "id": "w8oa_JjV7F_h" 27 | }, 28 | "source": [ 29 | "## Setup" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": { 35 | "hidden": true, 36 | "id": "0SuVR-1d7Jr4" 37 | }, 38 | "source": [ 39 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "hidden": true, 47 | "id": "I6vMQAK86ipS" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "!sudo pip3 install -U -q PyMySQL 'sqlalchemy<2.0' sql_magic" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "hidden": true, 59 | "id": "vOuWkjz36ULS" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "%reload_ext sql_magic" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "hidden": true, 71 | "id": "KE_bhLw16ULK" 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "from sqlalchemy import create_engine\n", 76 | "\n", 77 | "conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8'.format(\n", 78 | " host='db.ipeirotis.org', \n", 79 | " user='student',\n", 80 | " password='dwdstudent2015',\n", 81 | " encoding='utf-8')\n", 82 | "engine = create_engine(conn_string).connect()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "hidden": true, 90 | "id": "fivqywfX6ULV" 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "%config SQL.conn_name = 'engine'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "heading_collapsed": true, 101 | "id": "yPvFpNjaxW4C" 102 | }, 103 | "source": [ 104 | "## `SELECT *` " 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "hidden": true, 111 | "id": "IuEncSZuxW4C" 112 | }, 113 | "source": [ 114 | "### IMDb" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "hidden": true, 122 | "id": "DGOuI3fFxW4D" 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "%%read_sql\n", 127 | "USE imdb" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "hidden": true, 134 | "id": "oHodKBJ9xW4D" 135 | }, 136 | "source": [ 137 | "#### Return all movies" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "hidden": true, 145 | "id": "2Zww7GjgxW4D" 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "%%read_sql\n", 150 | "SELECT *\n", 151 | "FROM movies" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "hidden": true, 158 | "id": "gbOg4kbqxW4E" 159 | }, 160 | "source": [ 161 | "#### Return all directors\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "hidden": true, 169 | "id": "QaKgWz2sxW4F" 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "%%read_sql\n", 174 | "SELECT *\n", 175 | "FROM directors" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "hidden": true, 182 | "id": "ju8SsxtwxW4F" 183 | }, 184 | "source": [ 185 | "#### Return all actors\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "hidden": true, 193 | "id": "0c2iftjExW4F" 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "%%read_sql\n", 198 | "SELECT *\n", 199 | "FROM actors" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "hidden": true, 206 | "id": "rFk9FivRxW4F" 207 | }, 208 | "source": [ 209 | "#### Return all roles\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "hidden": true, 217 | "id": "IvsNDS2sxW4G" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | 
"%%read_sql\n", 222 | "SELECT *\n", 223 | "FROM roles" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "hidden": true, 230 | "id": "jGxenkfNxW4G" 231 | }, 232 | "source": [ 233 | "#### Return all genres for the movies" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "hidden": true, 241 | "id": "nTgSFPSLxW4G" 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "%%read_sql\n", 246 | "SELECT *\n", 247 | "FROM movies_genres" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "heading_collapsed": true, 254 | "hidden": true, 255 | "id": "pzXIxBXrxW4G" 256 | }, 257 | "source": [ 258 | "### Facebook" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "hidden": true, 266 | "id": "Wix5cQ16xW4H" 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "%%read_sql\n", 271 | "USE facebook" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "hidden": true, 278 | "id": "lBI1dg24xW4H" 279 | }, 280 | "source": [ 281 | "#### Return all students" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "hidden": true, 289 | "id": "6FLhWtCQxW4H" 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "%%read_sql\n", 294 | "SELECT *\n", 295 | "FROM Profiles" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "hidden": true, 302 | "id": "FJZI_EEsxW4H" 303 | }, 304 | "source": [ 305 | "#### Return the hobbies of all students" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "hidden": true, 313 | "id": "BCtC4H-UxW4I" 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "%%read_sql\n", 318 | "SELECT *\n", 319 | "FROM Hobbies" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "hidden": true, 326 | "id": "ngFHnxf2xW4I" 327 | }, 328 | "source": [ 329 | "#### Return the relationship status for all students" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "hidden": true, 337 | "id": "2Ryic6aSxW4I" 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "%%read_sql\n", 342 | "SELECT *\n", 343 | "FROM Relationship" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": { 349 | "hidden": true, 350 | "id": "fk3nJOgYxW4I" 351 | }, 352 | "source": [ 353 | "#### Return what students are looking for" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "hidden": true, 361 | "id": "3F3rnkRnxW4I" 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "%%read_sql\n", 366 | "SELECT *\n", 367 | "FROM LookingFor" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "hidden": true, 375 | "id": "JSTMaQXJxW4J" 376 | }, 377 | "outputs": [], 378 | "source": [] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "heading_collapsed": true, 384 | "id": "fVwBIvGDxW4J" 385 | }, 386 | "source": [ 387 | "## `SELECT` _attr_" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": { 393 | "heading_collapsed": true, 394 | "hidden": true, 395 | "id": "kQJtzrHYxW4J" 396 | }, 397 | "source": [ 398 | "### IMDb" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "hidden": true, 406 | "id": 
"sv_uYUxcxW4J" 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "%%read_sql\n", 411 | "USE imdb" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "hidden": true, 418 | "id": "ZoKJnhhPxW4J" 419 | }, 420 | "source": [ 421 | "#### Return the first and last names of actors\n", 422 | "\n" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "hidden": true, 430 | "id": "Lc5YDvQtxW4K" 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "%%read_sql\n", 435 | "SELECT first_name, last_name\n", 436 | "FROM actors" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "hidden": true, 443 | "id": "2vrL_r77xW4K" 444 | }, 445 | "source": [ 446 | "#### Return year and ranking for each movie" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "hidden": true, 454 | "id": "jGiKGiu-xW4K" 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "%%read_sql\n", 459 | "SELECT year, rating\n", 460 | "FROM movies " 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "heading_collapsed": true, 467 | "hidden": true, 468 | "id": "psXpcf0pxW4K" 469 | }, 470 | "source": [ 471 | "### Facebook" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "hidden": true, 479 | "id": "azk-jU76xW4L" 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "%%read_sql\n", 484 | "USE facebook" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": { 490 | "hidden": true, 491 | "id": "fP07lq9axW4L" 492 | }, 493 | "source": [ 494 | "#### Return Name, Sex, and Birthday of all students\n", 495 | "\n" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": { 502 | "hidden": true, 503 | "id": "nykOUqP2xW4L" 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "%%read_sql\n", 508 | "SELECT Name, Sex, Birthday\n", 509 | "FROM Profiles" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": { 515 | "hidden": true, 516 | "id": "hXsIb1JFxW4L" 517 | }, 518 | "source": [ 519 | "#### Return Sex, and Political Views of all students\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "hidden": true, 527 | "id": "L9-QRoVRxW4L" 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "%%read_sql\n", 532 | "SELECT Sex, PoliticalViews\n", 533 | "FROM Profiles" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": { 539 | "hidden": true, 540 | "id": "ogmxY8GqxW4M" 541 | }, 542 | "source": [ 543 | "#### Return the Relationship status column" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "hidden": true, 551 | "id": "nZgAtjWExW4M" 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "%%read_sql\n", 556 | "SELECT Status\n", 557 | "FROM Relationship" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "heading_collapsed": true, 564 | "id": "0LKcTbgqxW4M" 565 | }, 566 | "source": [ 567 | "## `SELECT` _attr_ `AS` _alias_" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": { 573 | "heading_collapsed": true, 574 | "hidden": true, 575 | "id": "-dEvRtrgxW4M" 576 | }, 577 | "source": [ 578 | "### IMDb" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": { 585 | "hidden": true, 586 | "id": 
"R9HUKaOxxW4M" 587 | }, 588 | "outputs": [], 589 | "source": [ 590 | "%%read_sql\n", 591 | "USE imdb" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": { 597 | "hidden": true, 598 | "id": "sl9XSy7pxW4N" 599 | }, 600 | "source": [ 601 | "#### Return id, first, and last names of actors. Rename id to “actor_id”\n" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": { 608 | "hidden": true, 609 | "id": "J_DJf9tDxW4N" 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "%%read_sql\n", 614 | "SELECT id AS actor_id, first_name, last_name\n", 615 | "FROM actors" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": { 621 | "hidden": true, 622 | "id": "u-_QtxARxW4N" 623 | }, 624 | "source": [ 625 | "#### Return name, year, and rank for each movie. Rename name to “movie_title”, year to “release_year”, and rank to “rating”\n" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": { 632 | "hidden": true, 633 | "id": "rzMXuNCYxW4O" 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "%%read_sql\n", 638 | "SELECT name AS movie_title, year AS release_year, rating\n", 639 | "FROM movies" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "heading_collapsed": true, 646 | "hidden": true, 647 | "id": "ELd2839yxW4O" 648 | }, 649 | "source": [ 650 | "### Facebook" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "hidden": true, 658 | "id": "dVKrn1eexW4O" 659 | }, 660 | "outputs": [], 661 | "source": [ 662 | "%%read_sql\n", 663 | "USE facebook" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": { 669 | "hidden": true, 670 | "id": "JEMcLdzjxW4O" 671 | }, 672 | "source": [ 673 | "#### Return Sex and Status of all students. 
Rename Sex to Gender and Status to UniversityStatus\n" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": { 680 | "hidden": true, 681 | "id": "AXYi60ujxW4P" 682 | }, 683 | "outputs": [], 684 | "source": [ 685 | "%%read_sql\n", 686 | "SELECT Sex AS Gender, Status AS UniversityStatus\n", 687 | "FROM Profiles" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": { 693 | "heading_collapsed": true, 694 | "id": "saRGTmsCxW4P" 695 | }, 696 | "source": [ 697 | "## `SELECT DISTINCT`" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": { 703 | "hidden": true, 704 | "id": "pyEYz880xW4P" 705 | }, 706 | "source": [ 707 | "### IMDb" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "hidden": true, 715 | "id": "whoxqP7kxW4P" 716 | }, 717 | "outputs": [], 718 | "source": [ 719 | "%%read_sql\n", 720 | "USE imdb" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": { 726 | "hidden": true, 727 | "id": "Azc9xnTaxW4P" 728 | }, 729 | "source": [ 730 | "#### Find all the movie genres\n" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": { 737 | "hidden": true, 738 | "id": "WLjFcGdwxW4P" 739 | }, 740 | "outputs": [], 741 | "source": [ 742 | "%%read_sql\n", 743 | "SELECT DISTINCT genre\n", 744 | "FROM movies_genres" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": { 750 | "hidden": true, 751 | "id": "f8Ga6i3DxW4Q" 752 | }, 753 | "source": [ 754 | "### Facebook" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": { 761 | "hidden": true, 762 | "id": "De8LSi6cxW4Q" 763 | }, 764 | "outputs": [], 765 | "source": [ 766 | "%%read_sql\n", 767 | "USE facebook" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": { 773 | "hidden": true, 774 | "id": "AkIDG3KsxW4Q" 775 | }, 776 | "source": [ 777 | "#### Return the distinct PoliticalViews from the Profiles table\n", 778 | "\n" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": { 785 | "hidden": true, 786 | "id": "b-ymmWVkxW4Q" 787 | }, 788 | "outputs": [], 789 | "source": [ 790 | "%%read_sql\n", 791 | "SELECT DISTINCT PoliticalViews\n", 792 | "FROM Profiles" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": { 798 | "hidden": true, 799 | "id": "kS0nAehsxW4Q" 800 | }, 801 | "source": [ 802 | "#### Return the distinct Sex values from the Profiles table\n" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": { 809 | "hidden": true, 810 | "id": "B9LuPxiKxW4R" 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "%%read_sql\n", 815 | "SELECT DISTINCT Sex\n", 816 | "FROM Profiles" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": { 822 | "hidden": true, 823 | "id": "46-bSulWxW4R" 824 | }, 825 | "source": [ 826 | "#### Find what students are “LookingFor”\n" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": { 833 | "hidden": true, 834 | "id": "cCYsU1QLxW4R" 835 | }, 836 | "outputs": [], 837 | "source": [ 838 | "%%read_sql\n", 839 | "SELECT DISTINCT LookingFor\n", 840 | "FROM LookingFor" 841 | ] 842 | }, 843 | { 844 | "cell_type": "markdown", 845 | "metadata": { 846 | "hidden": true, 847 | "id": "TgSJZ15AxW4R" 848 | }, 849 | "source": [ 850 | "#### Find all possible “Relationship” 
statuses\n" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": { 857 | "hidden": true, 858 | "id": "dxqC8E6WxW4S" 859 | }, 860 | "outputs": [], 861 | "source": [ 862 | "%%read_sql\n", 863 | "SELECT DISTINCT Status\n", 864 | "FROM Relationship" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": { 870 | "hidden": true, 871 | "id": "Qjv2YeSHxW4S" 872 | }, 873 | "source": [ 874 | "#### Find all possible Concentrations" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "metadata": { 881 | "hidden": true, 882 | "id": "FIGzIj1-xW4S" 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "%%read_sql\n", 887 | "SELECT DISTINCT Concentration\n", 888 | "FROM Concentration" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": { 894 | "heading_collapsed": true, 895 | "id": "hjO4AvmvxW4S" 896 | }, 897 | "source": [ 898 | "## `ORDER BY` and `LIMIT`" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": { 904 | "heading_collapsed": true, 905 | "hidden": true, 906 | "id": "FbDgVSSKxW4S" 907 | }, 908 | "source": [ 909 | "### IMDb" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "metadata": { 916 | "hidden": true, 917 | "id": "_skmdZT2xW4T" 918 | }, 919 | "outputs": [], 920 | "source": [ 921 | "%%read_sql\n", 922 | "USE imdb" 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": { 928 | "hidden": true, 929 | "id": "JYZy92-AxW4T" 930 | }, 931 | "source": [ 932 | "#### Find the top-10 ranked movies\n", 933 | "* Rank by “rank” first (descending order)\n", 934 | "* Break ties using “year”\n", 935 | "* Break remaining ties using “name”\n" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": null, 941 | "metadata": { 942 | "hidden": true, 943 | "id": "3tjOeTblxW4T" 944 | }, 945 | "outputs": [], 946 | "source": [ 947 | "%%read_sql\n", 948 | "SELECT *\n", 949 | "FROM movies\n", 950 | "ORDER BY rating DESC, year, name\n", 951 | "LIMIT 10" 952 | ] 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "metadata": { 957 | "hidden": true, 958 | "id": "6aRMgGMMxW4U" 959 | }, 960 | "source": [ 961 | "#### List all the distinct years of the movies, in descending order\n", 962 | "\n" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": null, 968 | "metadata": { 969 | "hidden": true, 970 | "id": "hbtwEevuxW4U" 971 | }, 972 | "outputs": [], 973 | "source": [ 974 | "%%read_sql\n", 975 | "SELECT DISTINCT year\n", 976 | "FROM movies\n", 977 | "ORDER BY year DESC" 978 | ] 979 | }, 980 | { 981 | "cell_type": "markdown", 982 | "metadata": { 983 | "heading_collapsed": true, 984 | "hidden": true, 985 | "id": "SxFeqckTxW4U" 986 | }, 987 | "source": [ 988 | "### Facebook" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": { 995 | "hidden": true, 996 | "id": "26MWAK52xW4U" 997 | }, 998 | "outputs": [], 999 | "source": [ 1000 | "%%read_sql\n", 1001 | "USE facebook" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "markdown", 1006 | "metadata": { 1007 | "hidden": true, 1008 | "id": "IZuvNF0_xW4U" 1009 | }, 1010 | "source": [ 1011 | "#### List the first 50 students that joined Facebook at NYU (use the “MemberSince” attribute)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": { 1018 | "hidden": true, 1019 | "id": "lfYfRjJFxW4V" 1020 | }, 1021 | "outputs": [], 1022 | "source": [ 
1023 | "%%read_sql\n", 1024 | "SELECT *\n", 1025 | "FROM Profiles\n", 1026 | "ORDER BY MemberSince \n", 1027 | "LIMIT 50" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": { 1033 | "hidden": true, 1034 | "id": "vDiaOtTExW4V" 1035 | }, 1036 | "source": [ 1037 | "#### List the 10 students that have not updated their profiles for the longest time (use the “LastUpdate” attribute) – what is the problem?" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": { 1044 | "hidden": true, 1045 | "id": "c7pv49jbxW4V" 1046 | }, 1047 | "outputs": [], 1048 | "source": [ 1049 | "%%read_sql\n", 1050 | "SELECT *\n", 1051 | "FROM Profiles\n", 1052 | "-- WHERE LastUpdate IS NOT NULL -- We need this filtering condition for the query to work as expected\n", 1053 | "ORDER BY LastUpdate\n", 1054 | "LIMIT 10" 1055 | ] 1056 | } 1057 | ], 1058 | "metadata": { 1059 | "colab": { 1060 | "name": "A-Navigation_Queries.ipynb", 1061 | "provenance": [], 1062 | "include_colab_link": true 1063 | }, 1064 | "kernelspec": { 1065 | "display_name": "Python 3", 1066 | "language": "python", 1067 | "name": "python3" 1068 | }, 1069 | "language_info": { 1070 | "codemirror_mode": { 1071 | "name": "ipython", 1072 | "version": 3 1073 | }, 1074 | "file_extension": ".py", 1075 | "mimetype": "text/x-python", 1076 | "name": "python", 1077 | "nbconvert_exporter": "python", 1078 | "pygments_lexer": "ipython3", 1079 | "version": "3.8.2" 1080 | } 1081 | }, 1082 | "nbformat": 4, 1083 | "nbformat_minor": 0 1084 | } -------------------------------------------------------------------------------- /session2/C-Schemas.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C-Schemas.pdf -------------------------------------------------------------------------------- /session2/C-Schemas.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C-Schemas.pptx -------------------------------------------------------------------------------- /session2/C1-ERD_Facebook.mwb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C1-ERD_Facebook.mwb -------------------------------------------------------------------------------- /session2/C2-ERD_IMDB.mwb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session2/C2-ERD_IMDB.mwb -------------------------------------------------------------------------------- /session2/assignment_selection_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "SkZqhwkf8NUg" 18 | }, 19 | "source": [ 20 | "# Session 2: Selection Queries Assignment\n", 21 | "\n", 22 | "\n", 23 | "In this segment we will connect to the *Music* database." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "colab_type": "text", 30 | "id": "5u_6yLTDT6Kn" 31 | }, 32 | "source": [ 33 | "## Setup" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "colab": {}, 41 | "colab_type": "code", 42 | "id": "O9o9NsaO8hMy" 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "colab": {}, 54 | "colab_type": "code", 55 | "id": "EkIL-uRK8NUi" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# This code creates a connection to the database\n", 60 | "from sqlalchemy import create_engine\n", 61 | "\n", 62 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 63 | " host=\"db.ipeirotis.org\",\n", 64 | " user=\"student\",\n", 65 | " db=\"music\",\n", 66 | " password=\"dwdstudent2015\",\n", 67 | " encoding=\"utf8mb4\",\n", 68 | ")\n", 69 | "\n", 70 | "engine = create_engine(conn_string)\n", 71 | "con = engine.connect()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "colab": {}, 79 | "colab_type": "code", 80 | "id": "z7muzQXTUFkU" 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "%reload_ext sql_magic" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "colab": {}, 92 | "colab_type": "code", 93 | "id": "uHRIPxBvUGfC" 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "%config SQL.conn_name = 'engine'" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "This is an example of how you can write an SQL query in the notebook.\n", 105 | "You write your SQL query after the `%%read_sql` line" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "colab": { 113 | "base_uri": "https://localhost:8080/", 114 | "height": 390 115 | }, 116 | "colab_type": "code", 117 | "id": "sWa1Uv_6X9zi", 118 | "outputId": "bbc44d81-02b5-4b8b-b776-42f378ed941a" 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "%%read_sql\n", 123 | "SELECT * \n", 124 | "FROM played" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "colab_type": "text", 131 | "id": "H0hhloRRUJlV" 132 | }, 133 | "source": [ 134 | "## Question 1: Show all the tables that appear in the Music database" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "colab": {}, 142 | "colab_type": "code", 143 | "id": "eL_CnyPRUSGI" 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "%%read_sql\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "colab_type": "text", 154 | "id": "Hz_1yX-EUeBQ" 155 | }, 156 | "source": [ 157 | "## Question 2: Show the attributes available for each artist" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "colab": {}, 165 | "colab_type": "code", 166 | "id": "HXy0Ygy3Uf_m" 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "%%read_sql\n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "colab_type": "text", 177 | "id": "UozymxRTW-wx" 178 | }, 179 | "source": [ 180 | "## Question 3: Show the attributes available for each album" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | 
"metadata": { 187 | "colab": {}, 188 | "colab_type": "code", 189 | "id": "ZwtqsRoGW-wz" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "%%read_sql\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "colab_type": "text", 200 | "id": "8NFSCApmXGZ8" 201 | }, 202 | "source": [ 203 | "## Question 4: Show the attributes available for each track" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "colab": {}, 211 | "colab_type": "code", 212 | "id": "bcr5iccDXGZ9" 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "%%read_sql\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": { 222 | "colab_type": "text", 223 | "id": "nezZleqbUeI_" 224 | }, 225 | "source": [ 226 | "## Question 5: Show all the artists" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "colab": {}, 234 | "colab_type": "code", 235 | "id": "rNfpSzT3UgrM" 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "%%read_sql\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "colab_type": "text", 246 | "id": "_j2bCzADXODe" 247 | }, 248 | "source": [ 249 | "## Question 6: Show all the albums" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "colab": {}, 257 | "colab_type": "code", 258 | "id": "o9h005rlXODf" 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "%%read_sql\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "colab_type": "text", 269 | "id": "HF9cHpSDXaZd" 270 | }, 271 | "source": [ 272 | "## Question 7: Show all the tracks" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "colab": {}, 280 | "colab_type": "code", 281 | "id": "mvLQdbdiXSmw" 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "%%read_sql\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "colab_type": "text", 292 | "id": "ntj5f4n8U3dT" 293 | }, 294 | "source": [ 295 | "## Question 8: List all the names of the artists, without the artist ids, sorted alphabetically " 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "colab": {}, 303 | "colab_type": "code", 304 | "id": "mPCpyhbwU3dV" 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "%%read_sql\n" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "colab_type": "text", 315 | "id": "23bSA8IUU1Og" 316 | }, 317 | "source": [ 318 | "## Question 9: Show all the album names and the corresponding artist id, but do not show the album_id. Rename the album_name attribute to album_title." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "colab": {}, 326 | "colab_type": "code", 327 | "id": "hTD97qjaU1Oi" 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "%%read_sql\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "colab_type": "text", 338 | "id": "eWgdmtHHURaS" 339 | }, 340 | "source": [ 341 | "## Question 10: List the 10 shortest tracks, in terms of playing time" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "colab": {}, 349 | "colab_type": "code", 350 | "id": "ygI_N9gBUhWy" 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "%%read_sql\n" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "anaconda-cloud": {}, 360 | "colab": { 361 | "collapsed_sections": [], 362 | "include_colab_link": true, 363 | "name": "Session2-Assignment", 364 | "provenance": [] 365 | }, 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.8.2" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 1 386 | } 387 | -------------------------------------------------------------------------------- /session3/B3-Filtering_Queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "QBeMTyL26ULI" 8 | }, 9 | "source": [ 10 | "# SQL: Filtering Queries" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "w8oa_JjV7F_h" 18 | }, 19 | "source": [ 20 | "## Setup" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "colab_type": "text", 27 | "id": "0SuVR-1d7Jr4" 28 | }, 29 | "source": [ 30 | "We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "colab": {}, 38 | "colab_type": "code", 39 | "id": "I6vMQAK86ipS" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# !sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "colab": {}, 51 | "colab_type": "code", 52 | "id": "vOuWkjz36ULS" 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "%reload_ext sql_magic" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "colab": {}, 64 | "colab_type": "code", 65 | "id": "KE_bhLw16ULK" 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from sqlalchemy import create_engine\n", 70 | "\n", 71 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/?charset=utf8\".format(\n", 72 | " host=\"db.ipeirotis.org\", user=\"student\", password=\"dwdstudent2015\", encoding=\"utf-8\"\n", 73 | ")\n", 74 | "engine = create_engine(conn_string)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "colab": {}, 82 | "colab_type": "code", 83 | "id": "fivqywfX6ULV" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "%config SQL.conn_name = 'engine'" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "heading_collapsed": true 94 | }, 95 | "source": [ 96 | "## `WHERE`: Equality Conditions" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "hidden": true 103 | }, 104 | "source": [ 105 | "### IMDb" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "hidden": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "%%read_sql\n", 117 | "USE imdb" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "hidden": true 124 | }, 125 | "source": [ 126 | "#### Find the movie entry with id 64729." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "hidden": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "%%read_sql\n", 138 | "SELECT *\n", 139 | "FROM movies\n", 140 | "WHERE id = 64729" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "hidden": true 147 | }, 148 | "source": [ 149 | "#### Find the movie entry with movie title ‘Pulp Fiction’" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "hidden": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "%%read_sql\n", 161 | "SELECT *\n", 162 | "FROM movies\n", 163 | "WHERE name = 'Pulp Fiction'" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "hidden": true 170 | }, 171 | "source": [ 172 | "#### Find the id of the movie “Schindler's List”. 
(Attention to the quote)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "hidden": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "%%read_sql\n", 184 | "SELECT *\n", 185 | "FROM movies\n", 186 | "WHERE name = 'Schindler\\'s List'" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "hidden": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "%%read_sql\n", 198 | "SELECT *\n", 199 | "FROM movies\n", 200 | "WHERE name = \"Schindler's List\"" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "heading_collapsed": true, 207 | "hidden": true 208 | }, 209 | "source": [ 210 | "#### List all the roles for the movie with id 290070. Sort them alphabetically\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "hidden": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "%%read_sql\n", 222 | "SELECT *\n", 223 | "FROM roles\n", 224 | "WHERE movie_id = 290070\n", 225 | "ORDER BY role" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "heading_collapsed": true 232 | }, 233 | "source": [ 234 | "## `WHERE`: Boolean Operators" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "heading_collapsed": true, 241 | "hidden": true 242 | }, 243 | "source": [ 244 | "### IMDb" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "hidden": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "%%read_sql\n", 256 | "USE imdb" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "hidden": true 263 | }, 264 | "source": [ 265 | "#### Fetch all info for actresses (female gender) whose first name is Skyler" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "hidden": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "%%read_sql\n", 277 | "SELECT *\n", 278 | "FROM actors\n", 279 | "WHERE gender = 'F' AND first_name = 'Skyler'" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "hidden": true 286 | }, 287 | "source": [ 288 | "#### Fetch all info for the director Steven Spielberg\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "hidden": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "%%read_sql\n", 300 | "SELECT *\n", 301 | "FROM directors\n", 302 | "WHERE first_name = 'Steven' AND last_name = 'Spielberg'" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "hidden": true 309 | }, 310 | "source": [ 311 | "#### Fetch all info for the directors with last names Scorsese, Polanski, and Spielberg. Use the OR for your Boolean query.\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "hidden": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "%%read_sql\n", 323 | "SELECT *\n", 324 | "FROM directors\n", 325 | "WHERE last_name = 'Scorsese' OR last_name = 'Polanski' Or last_name = 'Spielberg'" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": { 331 | "hidden": true 332 | }, 333 | "source": [ 334 | "#### Fetch all info for the directors Quentin Tarantino, Stanley Kubrick, and Orson Welles." 
335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "hidden": true 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "%%read_sql\n", 346 | "SELECT *\n", 347 | "FROM directors\n", 348 | "WHERE (first_name = 'Quentin' AND last_name = 'Tarantino') OR \n", 349 | " (first_name = 'Stanley' AND last_name = 'Kubrick') OR \n", 350 | " (first_name = 'Orson' AND last_name = 'Welles')" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "heading_collapsed": true 357 | }, 358 | "source": [ 359 | "## `WHERE`: Inequality Queries" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "hidden": true 366 | }, 367 | "source": [ 368 | "### IMDb" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "hidden": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "%%read_sql\n", 380 | "USE imdb" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": { 386 | "hidden": true 387 | }, 388 | "source": [ 389 | "#### Find all information about movies that were released before 1895 (excl)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "hidden": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "%%read_sql\n", 401 | "SELECT *\n", 402 | "FROM movies\n", 403 | "WHERE year<1895" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "hidden": true 410 | }, 411 | "source": [ 412 | "#### Find all information about movies released between 1895 and 1898 (excl) \n", 413 | "\n", 414 | "Try both using Boolean operators and using the BETWEEN operator" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "hidden": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "%%read_sql\n", 426 | "SELECT *\n", 427 | "FROM movies\n", 428 | "WHERE year>1895 AND year<1898" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "hidden": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "%%read_sql\n", 440 | "SELECT *\n", 441 | "FROM movies\n", 442 | "WHERE year BETWEEN 1896 AND 1897 -- notice that BETWEEN is inclusive on both sides" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "hidden": true 449 | }, 450 | "source": [ 451 | "#### Find all information about movies that were released before 1895 and after 2006 (inclusive)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "hidden": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "%%read_sql\n", 463 | "SELECT *\n", 464 | "FROM movies\n", 465 | "WHERE year<=1895 OR year>=2006" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": { 471 | "heading_collapsed": true 472 | }, 473 | "source": [ 474 | "## The `IN` operator" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": { 480 | "heading_collapsed": true, 481 | "hidden": true 482 | }, 483 | "source": [ 484 | "### IMDb" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "hidden": true 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "%%read_sql\n", 496 | "USE imdb" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "hidden": true 503 | }, 504 | "source": [ 505 | "#### Fetch all info for the 
directors with last names Scorsese, Polanski, and Spielberg. Use `IN` for your Boolean query." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "hidden": true 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "%%read_sql\n", 517 | "SELECT * \n", 518 | "FROM directors\n", 519 | "WHERE last_name IN ( 'Scorsese', 'Spielberg', 'Polanski' );" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "hidden": true 526 | }, 527 | "source": [ 528 | "#### Fetch all info for the directors Quentin Tarantino, Stanley Kubrick, and Orson Welles. Use `IN` for your Boolean query.\n" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "hidden": true 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "%%read_sql\n", 540 | "SELECT * FROM directors\n", 541 | "WHERE (first_name, last_name) IN (\n", 542 | " ('Quentin', 'Tarantino'), \n", 543 | " ('Stanley', 'Kubrick'), \n", 544 | " ('Orson', 'Welles') \n", 545 | ")\n" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": { 551 | "heading_collapsed": true 552 | }, 553 | "source": [ 554 | "## The `LIKE` operator for approximate queries" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": { 560 | "heading_collapsed": true, 561 | "hidden": true 562 | }, 563 | "source": [ 564 | "### IMDb" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "hidden": true 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "%%read_sql\n", 576 | "USE imdb" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": { 582 | "hidden": true 583 | }, 584 | "source": [ 585 | "#### Find the entry for Alfred Hitchcock\n", 586 | "\n", 587 | "Hint: Use an approximation for his first name\n" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": { 594 | "hidden": true 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "%%read_sql\n", 599 | "SELECT *\n", 600 | "FROM directors\n", 601 | "WHERE last_name = 'Hitchcock' AND first_name LIKE 'A%%' -- The double %% is only necessary when writing SQL \n", 602 | " -- within Jupyter notebooks. " 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": { 608 | "hidden": true 609 | }, 610 | "source": [ 611 | "#### Find the Godfather movies, released in 1972, 1974, and 1990" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": { 618 | "hidden": true 619 | }, 620 | "outputs": [], 621 | "source": [ 622 | "%%read_sql\n", 623 | "SELECT *\n", 624 | "FROM movies\n", 625 | "WHERE name LIKE 'Godfather%%' AND -- The double %% is only necessary when writing SQL \n", 626 | " year IN (1972, 1974, 1990) -- within Jupyter notebooks. 
" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": { 633 | "hidden": true 634 | }, 635 | "outputs": [], 636 | "source": [] 637 | } 638 | ], 639 | "metadata": { 640 | "colab": { 641 | "name": "A-Navigation_Queries.ipynb", 642 | "provenance": [] 643 | }, 644 | "kernelspec": { 645 | "display_name": "Python 3", 646 | "language": "python", 647 | "name": "python3" 648 | }, 649 | "language_info": { 650 | "codemirror_mode": { 651 | "name": "ipython", 652 | "version": 3 653 | }, 654 | "file_extension": ".py", 655 | "mimetype": "text/x-python", 656 | "name": "python", 657 | "nbconvert_exporter": "python", 658 | "pygments_lexer": "ipython3", 659 | "version": "3.8.5" 660 | } 661 | }, 662 | "nbformat": 4, 663 | "nbformat_minor": 1 664 | } 665 | -------------------------------------------------------------------------------- /session3/B3-SQL_Filtering.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session3/B3-SQL_Filtering.pptx -------------------------------------------------------------------------------- /session3/assignment_filtering_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Session 3 Assignment", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.6" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 3: Filtering Queries Assignment\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Music* database." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "EkIL-uRK8NUi" 77 | }, 78 | "source": [ 79 | "# This code creates a connection to the database\n", 80 | "from sqlalchemy import create_engine\n", 81 | "\n", 82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 83 | " host=\"db.ipeirotis.org\",\n", 84 | " user=\"student\",\n", 85 | " db=\"music\",\n", 86 | " password=\"dwdstudent2015\",\n", 87 | " encoding=\"utf8mb4\",\n", 88 | ")\n", 89 | "\n", 90 | "engine = create_engine(conn_string)\n", 91 | "con = engine.connect()" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "z7muzQXTUFkU" 100 | }, 101 | "source": [ 102 | "%reload_ext sql_magic" 103 | ], 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "uHRIPxBvUGfC" 111 | }, 112 | "source": [ 113 | "%config SQL.conn_name = 'engine'" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "UJmBOzZyLnTI" 122 | }, 123 | "source": [ 124 | "This is an example of how you can write an SQL query in the notebook. You write your SQL query after the `%%read_sql` line" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "id": "sWa1Uv_6X9zi" 131 | }, 132 | "source": [ 133 | "%%read_sql\n", 134 | "SELECT * \n", 135 | "FROM played" 136 | ], 137 | "execution_count": null, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "H0hhloRRUJlV" 144 | }, 145 | "source": [ 146 | "## Question 1: Show the entry for the artist with id equal to 5." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "metadata": { 152 | "id": "eL_CnyPRUSGI" 153 | }, 154 | "source": [ 155 | "%%read_sql\n" 156 | ], 157 | "execution_count": null, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "id": "Hz_1yX-EUeBQ" 164 | }, 165 | "source": [ 166 | "## Question 2: Show the entry for the artist named `The Rolling Stones` " 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "HXy0Ygy3Uf_m" 173 | }, 174 | "source": [ 175 | "%%read_sql\n" 176 | ], 177 | "execution_count": null, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "UozymxRTW-wx" 184 | }, 185 | "source": [ 186 | "## Question 3: Using the `id` of Rolling Stones from Question 2, list all the albums of `The Rolling Stones`" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "ZwtqsRoGW-wz" 193 | }, 194 | "source": [ 195 | "%%read_sql\n" 196 | ], 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "id": "8NFSCApmXGZ8" 204 | }, 205 | "source": [ 206 | "## Question 4: Find the tracks for the artist with id `3`, from the artist's album with id `2`." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "bcr5iccDXGZ9" 213 | }, 214 | "source": [ 215 | "%%read_sql" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "id": "nezZleqbUeI_" 224 | }, 225 | "source": [ 226 | "## Question 5: Find the tracks with names that come earlier alphabetically than (i.e., are less than) `M`.\n", 227 | "\n", 228 | "Note that inequality queries can be used with text and not only with numbers." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "rNfpSzT3UgrM" 235 | }, 236 | "source": [ 237 | "%%read_sql\n" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "id": "_j2bCzADXODe" 246 | }, 247 | "source": [ 248 | "## Question 6: Find all albums with a title that begins with a character greater than `E` (*not* inclusive of albums that start with `E`) but less than `S` (again, *not* inclusive of albums that start with `S`)." 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "o9h005rlXODf" 255 | }, 256 | "source": [ 257 | "%%read_sql\n" 258 | ], 259 | "execution_count": null, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "id": "HF9cHpSDXaZd" 266 | }, 267 | "source": [ 268 | "## Question 7: List the 10 longest tracks in terms of time length." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "id": "mvLQdbdiXSmw" 275 | }, 276 | "source": [ 277 | "%%read_sql\n" 278 | ], 279 | "execution_count": null, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "id": "ntj5f4n8U3dT" 286 | }, 287 | "source": [ 288 | "## Question 8: List all the tracks for the artists with ids 1, 3, and 5. Show two variations of the query. One query using the `OR` boolean condition, and one query using the `IN` operation." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "metadata": { 294 | "id": "mPCpyhbwU3dV" 295 | }, 296 | "source": [ 297 | "%%read_sql\n", 298 | "# Using the OR" 299 | ], 300 | "execution_count": null, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "metadata": { 306 | "id": "kGPCoB8NgZZP" 307 | }, 308 | "source": [ 309 | "%%read_sql\n", 310 | "# Using the IN" 311 | ], 312 | "execution_count": null, 313 | "outputs": [] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "id": "23bSA8IUU1Og" 319 | }, 320 | "source": [ 321 | "## Question 9: Find all the tracks that include the word \"Love\" anywhere in the title. It is fine to include tracks where `love` is part of a bigger word (e.g., `lovebird`).\n", 322 | "\n", 323 | "A bit of a complication: In the Colab environment, you need to enter the `%` character twice in a `LIKE` string, instead of just once as in MySQL Workbench." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "metadata": { 329 | "id": "hTD97qjaU1Oi" 330 | }, 331 | "source": [ 332 | "%%read_sql\n" 333 | ], 334 | "execution_count": null, 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": { 340 | "id": "eWgdmtHHURaS" 341 | }, 342 | "source": [ 343 | "## Question 10: List all the tracks by the artist with id `1` that start with the letter `L`, and order them in descending order of time length."
344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "ygI_N9gBUhWy" 350 | }, 351 | "source": [ 352 | "%%read_sql\n" 353 | ], 354 | "execution_count": null, 355 | "outputs": [] 356 | } 357 | ] 358 | } -------------------------------------------------------------------------------- /session3/practice_questions_filtering.md: -------------------------------------------------------------------------------- 1 | # Filtering practice queries. 2 | 3 | ## Restaurants Database 4 | 5 | 1. Output the names of all the Thai restaurants stored in your DB. 6 | 2. Output the names of all the Brooklyn restaurants stored in your DB that were established prior 7 | to 2012. 8 | 3. Show the list of restaurants together with their cuisine and location with an average price 9 | higher than $120.00 10 | 4. Show the names of all the food critics who work for NYT or NYP together with the corresponding 11 | affiliation. 12 | 5. Output the first and last names of the reviewers who are freelancers and do not have any 13 | affiliation. 14 | 6. Output the first and last names of authors whose last name starts with the letter ‘A’. 15 | 7. Show all the restaurant names whose length is less than 10 characters. 16 | 8. Output all the records from the Rating table for the reviews made by the critic with cID 202. 17 | 9. Output all the records from the Rating table for the reviews made by the critic with cID 210. 18 | 10. Output all the records from the Rating table for the reviews with the starRating greater than 19 | 11. Output the names of all the Italian Manhattan restaurants. 20 | 12. Output the names of all the Bronx restaurants with average prices greater than $100.00 21 | 22 | ## Facebook Database 23 | 24 | 1. Get the names and sex of all liberal students 25 | 2. Get the names, sex, and political views of liberal and very liberal students 26 | 3. Find all students who live in “Weinstein Hall”, independent of their room number 27 | 4. Find all students with first name “Richard” 28 | 5. Find all students with first names starting with P and last names starting with I (e.g.
Panos Ipeirotis) 29 | -------------------------------------------------------------------------------- /session4/C-SQL_Joins.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session4/C-SQL_Joins.pptx -------------------------------------------------------------------------------- /session4/README.md: -------------------------------------------------------------------------------- 1 | A nice explanation of inner vs outer joins: 2 | https://www.stratascratch.com/blog/types-of-pandas-joins-and-how-to-use-them-in-python/ 3 | -------------------------------------------------------------------------------- /session4/assignment_join_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Session 4 Assignment", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.6" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 4: Join Queries Assignment\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Music* database."
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "EkIL-uRK8NUi" 77 | }, 78 | "source": [ 79 | "# This code creates a connection to the database\n", 80 | "from sqlalchemy import create_engine\n", 81 | "\n", 82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 83 | " host=\"db.ipeirotis.org\",\n", 84 | " user=\"student\",\n", 85 | " db=\"music\",\n", 86 | " password=\"dwdstudent2015\",\n", 87 | " encoding=\"utf8mb4\",\n", 88 | ")\n", 89 | "\n", 90 | "engine = create_engine(conn_string)\n", 91 | "con = engine.connect()" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "z7muzQXTUFkU" 100 | }, 101 | "source": [ 102 | "%reload_ext sql_magic" 103 | ], 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "uHRIPxBvUGfC" 111 | }, 112 | "source": [ 113 | "%config SQL.conn_name = 'engine'" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "-7ZZAPj1yfrZ" 122 | }, 123 | "source": [ 124 | "This is an example of how you can write an SQL query in the notebook.\n", 125 | "You write your SQL query after the `%%read_sql` line." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "sWa1Uv_6X9zi" 132 | }, 133 | "source": [ 134 | "%%read_sql\n", 135 | "SELECT * \n", 136 | "FROM played" 137 | ], 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "id": "H0hhloRRUJlV" 145 | }, 146 | "source": [ 147 | "## Question 1: List all the album names by the band `New Order`" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "eL_CnyPRUSGI" 154 | }, 155 | "source": [ 156 | "%%read_sql" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "Hz_1yX-EUeBQ" 165 | }, 166 | "source": [ 167 | "## Question 2: List the tracks for the album `Second Coming`" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "HXy0Ygy3Uf_m" 174 | }, 175 | "source": [ 176 | "%%read_sql\n" 177 | ], 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "UozymxRTW-wx" 185 | }, 186 | "source": [ 187 | "## Question 3: List all the track names, the corresponding album name, and the corresponding artist name" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "ZwtqsRoGW-wz" 194 | }, 195 | "source": [ 196 | "%%read_sql" 197 | ], 198 | "execution_count": null, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "8NFSCApmXGZ8" 205 | }, 206 | "source": [ 207 | "## Question 4: List all all the tracks by the artist `The Stone Roses` and rank them by time length, from shortest to longest" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "bcr5iccDXGZ9" 214 | 
}, 215 | "source": [ 216 | "%%read_sql\n" 217 | ], 218 | "execution_count": null, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "nezZleqbUeI_" 225 | }, 226 | "source": [ 227 | "## Question 5: The table `played` contains the tracks that the user listened to, and the time that they listened to the songs. List the _distinct_ names of the artists that the user has listened to." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "id": "rNfpSzT3UgrM" 234 | }, 235 | "source": [ 236 | "%%read_sql\n" 237 | ], 238 | "execution_count": null, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "id": "_j2bCzADXODe" 245 | }, 246 | "source": [ 247 | "## Question 6: List the name of the artists and albums that have tracks with time length more than 10 minutes." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "metadata": { 253 | "id": "o9h005rlXODf" 254 | }, 255 | "source": [ 256 | "%%read_sql" 257 | ], 258 | "execution_count": null, 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "id": "HF9cHpSDXaZd" 265 | }, 266 | "source": [ 267 | "## Question 7: List the album name, the artist name, and the track names, where both the name of the album and the name of the track contain the string `love` (it is fine if it is part of a longer word)." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "metadata": { 273 | "id": "mvLQdbdiXSmw" 274 | }, 275 | "source": [ 276 | "%%read_sql" 277 | ], 278 | "execution_count": null, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "id": "ntj5f4n8U3dT" 285 | }, 286 | "source": [ 287 | "## Question 8: The table played contains the tracks that the user listened to, and the time that they listened to the songs. List the tracks that are in the database, but which the user has never listened to. (Note: Need an outer join)." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "metadata": { 293 | "id": "mPCpyhbwU3dV" 294 | }, 295 | "source": [ 296 | "%%read_sql\n" 297 | ], 298 | "execution_count": null, 299 | "outputs": [] 300 | } 301 | ] 302 | } -------------------------------------------------------------------------------- /session4/practice_questions_joins_restaurants.md: -------------------------------------------------------------------------------- 1 | # JOIN practice queries. 2 | 3 | ## Restaurants Database 4 | 5 | 6 | 1. Output the names of the restaurants together with the comments written for these restaurants: 7 | 8 | a. Your output should include only those restaurants for which reviews were submitted; do 9 | not output empty comments (NULL values); 10 | 11 | b. Your output should contain the names of all the restaurants from your database; if some 12 | restaurants are not reviewed or their comments are empty, these restaurants should 13 | still be included in your output (with NULL values for the comment attribute); 14 | 15 | c. Your output should contain the names of all the restaurants from your database; if some 16 | restaurants are not reviewed or their comments are empty, these restaurants should 17 | still be included in your output (with NULL values for the comment attribute); make sure 18 | that the output contains only distinct records (e.g., if a restaurant has two reviews with 19 | empty comments, there should be only one record in the output relation corresponding 20 | to this situation). 21 | 22 | 2. 
For every review stored in the database output the review id, first and last names of the critic 23 | together with the comments left by this critic for each of the reviews. 24 | 25 | 3. Output the critic’s first and last names, restaurant name, and the star rating assigned by the 26 | critic to the restaurant for all the reviews where the star rating is greater or equal to 3. 27 | 28 | 4. For all the Manhattan restaurants output the following information regarding all the reviews 29 | submitted for these restaurants: the name of the restaurant, its cuisine, the name of the food 30 | critic, food critic’s affiliation, star rating assigned by the critic, date the review was written and 31 | the comments: 32 | 33 | a. Include in the output only those restaurants for which there are reviews in your DB. 34 | 35 | b. Include in your output all the information about all the Manhattan restaurants. If there 36 | are Manhattan restaurants for which there are no reviews then the fields for the critic 37 | and review should be null. 38 | 39 | 5. Output the rating code, the critic’s name who submitted this rating, the borough of the 40 | restaurant for which the rating was given, and the star rating for all the ratings submitted after 41 | January 1, 2010. 42 | -------------------------------------------------------------------------------- /session5/D-SQL_Aggregation_Queries.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session5/D-SQL_Aggregation_Queries.pptx -------------------------------------------------------------------------------- /session5/assignment_aggregate_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Session 5: Aggregate Queries Assignment", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.6" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 5: Aggregate Queries Assignment\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Music* database." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "\n", 69 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | "id": "EkIL-uRK8NUi" 78 | }, 79 | "source": [ 80 | "# This code creates a connection to the database\n", 81 | "from sqlalchemy import create_engine\n", 82 | "\n", 83 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 84 | " host=\"db.ipeirotis.org\",\n", 85 | " user=\"student\",\n", 86 | " db=\"music\",\n", 87 | " password=\"dwdstudent2015\",\n", 88 | " encoding=\"utf8mb4\",\n", 89 | ")\n", 90 | "\n", 91 | "engine = create_engine(conn_string)\n", 92 | "con = engine.connect()" 93 | ], 94 | "execution_count": null, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "metadata": { 100 | "id": "z7muzQXTUFkU" 101 | }, 102 | "source": [ 103 | "%reload_ext sql_magic" 104 | ], 105 | "execution_count": null, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "uHRIPxBvUGfC" 112 | }, 113 | "source": [ 114 | "%config SQL.conn_name = 'engine'" 115 | ], 116 | "execution_count": null, 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "metadata": { 122 | "id": "sWa1Uv_6X9zi" 123 | }, 124 | "source": [ 125 | "%%read_sql\n", 126 | "SELECT * \n", 127 | "FROM played" 128 | ], 129 | "execution_count": null, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "pJIqxXsU3Ldx" 136 | }, 137 | "source": [ 138 | "## **ATTENTION: Remember that the primary key for an album consists of both the artist id and the album id. 
Similarly, the primary key for the tracks consists of the track id, the album id, and the artist id.**" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "id": "H0hhloRRUJlV" 145 | }, 146 | "source": [ 147 | "## Question 1: Count the number of artists in the database" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "eL_CnyPRUSGI" 154 | }, 155 | "source": [ 156 | "%%read_sql\n" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "McZQXcp__OCu" 165 | }, 166 | "source": [ 167 | "## Question 2: Count the number of tracks in the database" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "W7GayX2v_Nl8" 174 | }, 175 | "source": [ 176 | "" 177 | ], 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "Hz_1yX-EUeBQ" 185 | }, 186 | "source": [ 187 | "## Question 3: Show the average and standard deviation of the track length" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "HXy0Ygy3Uf_m" 194 | }, 195 | "source": [ 196 | "%%read_sql\n" 197 | ], 198 | "execution_count": null, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "UozymxRTW-wx" 205 | }, 206 | "source": [ 207 | "## Question 4: Show the earliest and latest date that the user has played a song" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "ZwtqsRoGW-wz" 214 | }, 215 | "source": [ 216 | "%%read_sql\n" 217 | ], 218 | "execution_count": null, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "8NFSCApmXGZ8" 225 | }, 226 | "source": [ 227 | "## Question 5: For each artist id, count the number of albums in the database. You only need to show the id of the artist, not the name of the artist." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "id": "bcr5iccDXGZ9" 234 | }, 235 | "source": [ 236 | "%%read_sql\n" 237 | ], 238 | "execution_count": null, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "id": "nezZleqbUeI_" 245 | }, 246 | "source": [ 247 | "## Question 6: For each album id, count the number of tracks for that album. You only need to show the id of the album, not its name." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "metadata": { 253 | "id": "rNfpSzT3UgrM" 254 | }, 255 | "source": [ 256 | "%%read_sql\n" 257 | ], 258 | "execution_count": null, 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "id": "_j2bCzADXODe" 265 | }, 266 | "source": [ 267 | "## Question 7: For each album id, show the total length of all the tracks in the album." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "metadata": { 273 | "id": "o9h005rlXODf" 274 | }, 275 | "source": [ 276 | "%%read_sql\n" 277 | ], 278 | "execution_count": null, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "id": "HF9cHpSDXaZd" 285 | }, 286 | "source": [ 287 | "## Question 8: List all the album id's, where the total album length (across all the album's tracks) is longer than 60 minutes." 
288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "metadata": { 293 | "id": "mvLQdbdiXSmw" 294 | }, 295 | "source": [ 296 | "%%read_sql" 297 | ], 298 | "execution_count": null, 299 | "outputs": [] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": { 304 | "id": "ntj5f4n8U3dT" 305 | }, 306 | "source": [ 307 | "## Question 9: Find all track names that appear in more than one album, and show how many artists used the same track name (if any)." 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "metadata": { 313 | "id": "mPCpyhbwU3dV" 314 | }, 315 | "source": [ 316 | "%%read_sql" 317 | ], 318 | "execution_count": null, 319 | "outputs": [] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": { 324 | "id": "23bSA8IUU1Og" 325 | }, 326 | "source": [ 327 | "## Question 10: For each artist id, list the first and the last time that a user listened to a song by this artist." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "metadata": { 333 | "id": "hTD97qjaU1Oi" 334 | }, 335 | "source": [ 336 | "%%read_sql " 337 | ], 338 | "execution_count": null, 339 | "outputs": [] 340 | } 341 | ] 342 | } -------------------------------------------------------------------------------- /session5/practice_queries_aggregation.md: -------------------------------------------------------------------------------- 1 | # Aggregation, practice queries 2 | 3 | ## Restaurant Database 4 | 5 | GROUP BY, aggregation functions (MAX, MIN, COUNT, etc.) 6 | Review also HAVING (condition for the GROUP BY) and ORDER 7 | 8 | 1. How many Manhattan restaurants are listed in your database; 9 | 2. Output the affiliation (or '-' for freelancers) and how many critics are associated with this affiliation; 10 | 3. Output the critic id together with the maximal star rating ever issued by this critic; 11 | 4. Output the critic id and the restaurant code together with the maximal star rating ever issued by this critic for this restaurant; 12 | 5. For every borough, cuisine pair output the minimal price and order the output by borough in the ascending order (consider only the restaurants outside of Manhattan); 13 | 6. For every borough, cuisine pair output the minimal price where the minimal price is greater than $100; 14 | 7. For every borough, cuisine pair output the minimal price where the minimal price is greater than $100 and order the output by the price value in the descending order. 
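15 | 
16 | As a quick illustration of the GROUP BY / HAVING / ORDER BY pattern that these questions exercise, here is a sketch for query 7. It assumes the `Restaurant` table with the `borough`, `cuisine`, and `avgPrice` columns used in the accompanying solutions notebook; adjust the names if your schema differs.
17 | 
18 | ```sql
19 | -- Sketch for query 7: minimal price per (borough, cuisine) pair,
20 | -- keeping only the pairs whose minimal price is greater than $100,
21 | -- ordered by that price in descending order.
22 | SELECT borough, cuisine, MIN(avgPrice) AS minPrice
23 | FROM Restaurant
24 | GROUP BY borough, cuisine
25 | HAVING minPrice > 100
26 | ORDER BY minPrice DESC;
27 | ```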
-------------------------------------------------------------------------------- /session5/practice_queries_aggregation_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Practice Aggregate Queries: Solutions", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.8.2" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 5: Practice Aggregate Queries: Solutions\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Restaurants* database." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "EkIL-uRK8NUi" 77 | }, 78 | "source": [ 79 | "# This code creates a connection to the database\n", 80 | "from sqlalchemy import create_engine\n", 81 | "\n", 82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 83 | " host=\"db.ipeirotis.org\",\n", 84 | " user=\"student\",\n", 85 | " db=\"restaurants\",\n", 86 | " password=\"dwdstudent2015\",\n", 87 | " encoding=\"utf8mb4\",\n", 88 | ")\n", 89 | "\n", 90 | "engine = create_engine(conn_string)\n", 91 | "con = engine.connect()" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "z7muzQXTUFkU" 100 | }, 101 | "source": [ 102 | "%reload_ext sql_magic" 103 | ], 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "uHRIPxBvUGfC" 111 | }, 112 | "source": [ 113 | "%config SQL.conn_name = 'engine'" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "H0hhloRRUJlV" 122 | }, 123 | "source": [ 124 | "## Question 1: How many Manhattan restaurants are listed in your database" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "id": "eL_CnyPRUSGI" 131 | }, 132 | "source": [ 133 | "%%read_sql\n", 134 | "SELECT borough, COUNT(*) AS num_restaurants\n", 135 | "FROM Restaurant\n", 136 | "WHERE borough = 'Manhattan'\n", 137 | "GROUP BY borough" 138 | ], 139 | "execution_count": null, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "McZQXcp__OCu" 146 | }, 147 | "source": [ 148 | "## Question 2: Output the affiliation (or '-' for freelancers) and how many critics are 
associated with this affiliation;" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "W7GayX2v_Nl8" 155 | }, 156 | "source": [ 157 | "%%read_sql\n", 158 | "SELECT affiliation, COUNT(*) AS num_critics\n", 159 | "FROM Critic\n", 160 | "GROUP BY affiliation" 161 | ], 162 | "execution_count": null, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "id": "uhU3fLp9RoiR" 169 | }, 170 | "source": [ 171 | "If we want to replace the NULL value in affiliation with `-`, we use the `COALESCE` command, that replaces NULL with the value that we pass as parameter." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "QSyuhaZZRY-O" 178 | }, 179 | "source": [ 180 | "%%read_sql\n", 181 | "SELECT COALESCE(affiliation, '-') AS affiliation, COUNT(*) AS num_critics\n", 182 | "FROM Critic\n", 183 | "GROUP BY affiliation" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "id": "Hz_1yX-EUeBQ" 192 | }, 193 | "source": [ 194 | "## Question 3: Output the critic id together with the maximal star rating ever issued by this critic;" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "HXy0Ygy3Uf_m" 201 | }, 202 | "source": [ 203 | "%%read_sql\n", 204 | "SELECT cID, MAX(starRating) AS maxRating\n", 205 | "FROM Rating\n", 206 | "GROUP BY cID" 207 | ], 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "id": "UozymxRTW-wx" 215 | }, 216 | "source": [ 217 | "## Question 4: Output the critic id and the restaurant code together with the maximal star rating ever issued by this critic for this restaurant;" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "ZwtqsRoGW-wz" 224 | }, 225 | "source": [ 226 | "%%read_sql\n", 227 | "SELECT cID, restCode, MAX(starRating) AS maxRating\n", 228 | "FROM Rating\n", 229 | "GROUP BY cID, restCode" 230 | ], 231 | "execution_count": null, 232 | "outputs": [] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "8NFSCApmXGZ8" 238 | }, 239 | "source": [ 240 | "## Question 5: For every borough, cuisine pair output the minimal price and order the output by borough in the ascending order (consider only the restaurants outside of Manhattan);" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "metadata": { 246 | "id": "bcr5iccDXGZ9" 247 | }, 248 | "source": [ 249 | "%%read_sql\n", 250 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n", 251 | "FROM Restaurant\n", 252 | "WHERE borough <> \"Manhattan\"\n", 253 | "GROUP BY borough, cuisine" 254 | ], 255 | "execution_count": null, 256 | "outputs": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": { 261 | "id": "nezZleqbUeI_" 262 | }, 263 | "source": [ 264 | "## Question 6: For every borough, cuisine pair output the minimal price where the minimal price is greater than 100" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "rNfpSzT3UgrM" 271 | }, 272 | "source": [ 273 | "%%read_sql\n", 274 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n", 275 | "FROM Restaurant\n", 276 | "GROUP BY borough, cuisine\n", 277 | "HAVING minPrice>100" 278 | ], 279 | "execution_count": null, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "id": "_j2bCzADXODe" 286 | }, 287 | "source": [ 288 | "## Question 7: For 
every borough, cuisine pair output the minimal price where the minimal price is greater than 100 and order the output by the price value in the descending order." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "metadata": { 294 | "id": "o9h005rlXODf" 295 | }, 296 | "source": [ 297 | "%%read_sql\n", 298 | "SELECT borough, cuisine, MIN(avgPrice) AS minPrice\n", 299 | "FROM Restaurant\n", 300 | "GROUP BY borough, cuisine\n", 301 | "HAVING minPrice>100\n", 302 | "ORDER BY minPrice DESC" 303 | ], 304 | "execution_count": null, 305 | "outputs": [] 306 | } 307 | ] 308 | } -------------------------------------------------------------------------------- /session5/practice_queries_join_and_aggregation.md: -------------------------------------------------------------------------------- 1 | # Join and Aggregation 2 | 3 | ## Restaurant Database 4 | 5 | 1. Output the restaurant name together with the number of reviews submitted for this restaurant. 6 | 2. For every Manhattan restaurant output its name and the number of reviews submitted for this 7 | restaurant. 8 | 3. For every restaurant that was reviewed more than once output it name and the number or 9 | reviews submitted for this restaurant. 10 | 4. Output the critic's last name and the restaurant name together with the maximal star rating 11 | ever issued by this critic for this restaurant. 12 | 5. For each cuisine-borough pair, output the number of the corresponding restaurants. 13 | 6. For each NYT reporter, output the number of distinct restaurants this reporter reviewed. 14 | 7. For every news outlet, output the average star rating submitted by all the reviewers of this 15 | outlet. 16 | a. consider only Italian restaurants 17 | b. consider only Italian restaurants outside of Manhattan 18 | 8. For every borough output the max star rating submitted for any restaurant within this borough 19 | (in which borough do you have the best restaurant) -------------------------------------------------------------------------------- /session5/practice_queries_join_and_aggregation_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Practice Aggregate+Join Queries: Solutions", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.6" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 5: Practice Aggregate+Join Queries: Solutions\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Restaurants* database." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "!sudo pip3 install PyMySQL sqlalchemy sql_magic" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "EkIL-uRK8NUi" 77 | }, 78 | "source": [ 79 | "# This code creates a connection to the database\n", 80 | "from sqlalchemy import create_engine\n", 81 | "\n", 82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 83 | " host=\"db.ipeirotis.org\",\n", 84 | " user=\"student\",\n", 85 | " db=\"restaurants\",\n", 86 | " password=\"dwdstudent2015\",\n", 87 | " encoding=\"utf8mb4\",\n", 88 | ")\n", 89 | "\n", 90 | "engine = create_engine(conn_string)\n", 91 | "con = engine.connect()" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "z7muzQXTUFkU" 100 | }, 101 | "source": [ 102 | "%reload_ext sql_magic" 103 | ], 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "uHRIPxBvUGfC" 111 | }, 112 | "source": [ 113 | "%config SQL.conn_name = 'engine'" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "H0hhloRRUJlV" 122 | }, 123 | "source": [ 124 | "## Question 1: Output the restaurant name together with the number of reviews submitted for this restaurant." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "id": "kwraKNb8aZjd" 131 | }, 132 | "source": [ 133 | "Let's run first the join query, and look at the table.\n", 134 | "\n", 135 | "This will be the table on which the GROUP BY query will operate.\n", 136 | "\n", 137 | "We order by `restName` to visually illustrate the groups that will be created." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "metadata": { 143 | "id": "QZjEITHKaSlT" 144 | }, 145 | "source": [ 146 | "%%read_sql\n", 147 | "SELECT *\n", 148 | "FROM Restaurant R INNER JOIN Rating T ON R.restCode=T.restCode\n", 149 | "ORDER BY restName" 150 | ], 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "id": "B3GCj21VcD_9" 158 | }, 159 | "source": [ 160 | "Now notice the output when we use a LEFT JOIN instead of an INNER JOIN. Notice the extra restaurants that appear, which have received no reviews (and therefore the `code` and `cID` are NULL)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "9EtFQf2NcCzq" 167 | }, 168 | "source": [ 169 | "%%read_sql\n", 170 | "SELECT *\n", 171 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n", 172 | "ORDER BY restName" 173 | ], 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "id": "ILA0T03xbT1Z" 181 | }, 182 | "source": [ 183 | "We now execute the GROUP BY. Notice how we use the `COUNT` command to count the total number of reviews, and the reviews with comments. Since this is an `INNER JOIN`, the `COUNT(*)` and `COUNT(code)` return the same values." 
184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "eL_CnyPRUSGI" 190 | }, 191 | "source": [ 192 | "%%read_sql\n", 193 | "SELECT restName, \n", 194 | " COUNT(*) AS cnt, \n", 195 | " COUNT(code) AS num_reviews, \n", 196 | " COUNT(comments) AS num_reviews_with_comments\n", 197 | "FROM Restaurant R INNER JOIN Rating T ON R.restCode=T.restCode\n", 198 | "GROUP BY restName" 199 | ], 200 | "execution_count": null, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "id": "TAX86BZ-cdsn" 207 | }, 208 | "source": [ 209 | "Now, let's switch to a LEFT JOIN. Notice an important change. We cannot rely on `COUNT(*)` anymore to count the number of reviews, and we need to be using the `COUNT(code)`. For example `Nisi` has one review: both the `cnt` and the `num_reviews` column are 1. However, for the `Don Peppe`, which has no reviews, we see that `cnt` is still 1, but `num_reviews` is 0." 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "metadata": { 215 | "id": "uOIAW9jscSzM" 216 | }, 217 | "source": [ 218 | "%%read_sql\n", 219 | "SELECT restName, \n", 220 | " COUNT(*) AS cnt, \n", 221 | " COUNT(code) AS num_reviews, \n", 222 | " COUNT(comments) AS num_reviews_with_comments\n", 223 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n", 224 | "GROUP BY restName" 225 | ], 226 | "execution_count": null, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "metadata": { 232 | "id": "FQB0fIIGvNZu" 233 | }, 234 | "source": [ 235 | "%%read_sql\n", 236 | "SELECT restName, cuisine, borough,\n", 237 | " COUNT(*) AS cnt, \n", 238 | " COUNT(code) AS num_reviews, \n", 239 | " COUNT(comments) AS num_reviews_with_comments\n", 240 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n", 241 | "GROUP BY restName, cuisine, borough\n", 242 | "ORDER BY cuisine, borough" 243 | ], 244 | "execution_count": null, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "id": "McZQXcp__OCu" 251 | }, 252 | "source": [ 253 | "## Question 2: For every Manhattan restaurant output its name and the number of reviews submitted for this restaurant." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "metadata": { 259 | "id": "W7GayX2v_Nl8" 260 | }, 261 | "source": [ 262 | "%%read_sql\n", 263 | "SELECT restName, COUNT(code) AS num_reviews, COUNT(comments) AS num_reviews_with_comments\n", 264 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n", 265 | "WHERE borough = 'Manhattan'\n", 266 | "GROUP BY restName" 267 | ], 268 | "execution_count": null, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "id": "Hz_1yX-EUeBQ" 275 | }, 276 | "source": [ 277 | "## Question 3: For every restaurant that was reviewed more than once output it name and the number or reviews submitted for this restaurant." 
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "metadata": { 283 | "id": "HXy0Ygy3Uf_m" 284 | }, 285 | "source": [ 286 | "%%read_sql\n", 287 | "SELECT restName, COUNT(code) AS num_reviews, COUNT(comments) AS num_reviews_with_comments\n", 288 | "FROM Restaurant R LEFT JOIN Rating T ON R.restCode=T.restCode\n", 289 | "GROUP BY restName\n", 290 | "HAVING num_reviews>1" 291 | ], 292 | "execution_count": null, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "UozymxRTW-wx" 299 | }, 300 | "source": [ 301 | "## Question 4: Output the critic's last name and the restaurant name together with the maximal star rating ever issued by this critic for this restaurant." 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": { 307 | "id": "wAqIzSNGfk9j" 308 | }, 309 | "source": [ 310 | "Again, let's execute first the JOIN of all the tables that we need, so that we can see the data that we will be aggregating. Since we want to organize by critic's last name and restaurant name, we also add the corresponding ORDER BY, so that we can visually group together the rows that we will be aggregating." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "metadata": { 316 | "id": "ZwtqsRoGW-wz" 317 | }, 318 | "source": [ 319 | "%%read_sql\n", 320 | "SELECT *\n", 321 | "FROM Critic C \n", 322 | " INNER JOIN Rating R ON R.cID = C.cID\n", 323 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n", 324 | "ORDER BY restName, lastN" 325 | ], 326 | "execution_count": null, 327 | "outputs": [] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "id": "2KH4AJvzgP_u" 333 | }, 334 | "source": [ 335 | "Now, let's report the number of reviews that a critic wrote for the restaurant, together with the mix and max ratings. Obviously, when there is only one review, the min and max ratings are the same." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "metadata": { 341 | "id": "Uobp11rkfdOk" 342 | }, 343 | "source": [ 344 | "%%read_sql\n", 345 | "SELECT restName, lastN, \n", 346 | " COUNT(R.code) AS num_reviews, \n", 347 | " MAX(R.starRating) AS maxRating, \n", 348 | " MIN(R.starRating) AS minRating \n", 349 | "FROM Critic C \n", 350 | " INNER JOIN Rating R ON R.cID = C.cID\n", 351 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n", 352 | "GROUP BY restName, lastN\n", 353 | "ORDER BY restName, lastN" 354 | ], 355 | "execution_count": null, 356 | "outputs": [] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "id": "8NFSCApmXGZ8" 362 | }, 363 | "source": [ 364 | "## Question 5: For each cuisine-borough pair, output the number of the corresponding restaurants." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "metadata": { 370 | "id": "bcr5iccDXGZ9" 371 | }, 372 | "source": [ 373 | "%%read_sql\n", 374 | "SELECT cuisine, borough, COUNT(*) AS num_restaurants\n", 375 | "FROM Restaurant\n", 376 | "GROUP BY cuisine, borough" 377 | ], 378 | "execution_count": null, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "id": "nezZleqbUeI_" 385 | }, 386 | "source": [ 387 | "## Question 6: For each NYT reporter, output the number of distinct restaurants this reporter reviewed." 
388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "metadata": { 393 | "id": "rNfpSzT3UgrM" 394 | }, 395 | "source": [ 396 | "%%read_sql\n", 397 | "SELECT *\n", 398 | "FROM Critic C\n", 399 | " INNER JOIN Rating R ON R.cID = C.cID\n", 400 | "ORDER BY C.cID, R.restCode" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "id": "6O4EwRToibt0" 409 | }, 410 | "source": [ 411 | "Now let's execute the GROUP BY. Notice that we only GROUP by the `C.cID` which is the primary key for the table `Critic C`. Since the `cID` is the primary key for that table, we can also add in the SELECT clause the other attributes of the critic (which are unique for a given cID) without adding these attributes in the GROUP BY clause." 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "metadata": { 417 | "id": "YWIO7tvQh16s" 418 | }, 419 | "source": [ 420 | "%%read_sql\n", 421 | "SELECT C.cID, \n", 422 | " firstN, lastN, affiliation, \n", 423 | " COUNT(DISTINCT R.restCode) AS num_distinct_restaurants\n", 424 | "FROM Critic C\n", 425 | " INNER JOIN Rating R ON R.cID = C.cID\n", 426 | "GROUP BY C.cID" 427 | ], 428 | "execution_count": null, 429 | "outputs": [] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "metadata": { 434 | "id": "ISuJTDayiTS5" 435 | }, 436 | "source": [ 437 | "%%read_sql\n", 438 | "SELECT C.cID, \n", 439 | " firstN, lastN, affiliation, \n", 440 | " COUNT(DISTINCT R.restCode) AS num_distinct_restaurants\n", 441 | "FROM Critic C\n", 442 | " INNER JOIN Rating R ON R.cID = C.cID\n", 443 | "WHERE affiliation = 'NYT'\n", 444 | "GROUP BY C.cID" 445 | ], 446 | "execution_count": null, 447 | "outputs": [] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "id": "_j2bCzADXODe" 453 | }, 454 | "source": [ 455 | "## Question 7: For every news outlet, output the average star rating submitted by all the reviewers of this outlet. a. consider only Italian restaurants b. consider only Italian restaurants outside of Manhattan" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "metadata": { 461 | "id": "o9h005rlXODf" 462 | }, 463 | "source": [ 464 | "%%read_sql\n", 465 | "SELECT *\n", 466 | "FROM Critic C\n", 467 | " INNER JOIN Rating R ON R.cID = C.cID\n", 468 | " INNER JOIN Restaurant T ON T.restCode = R.restCode\n", 469 | "ORDER BY C.affiliation" 470 | ], 471 | "execution_count": null, 472 | "outputs": [] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "id": "OjtmFW39vk8U" 478 | }, 479 | "source": [ 480 | "Here is the same join as above, but now limited to Italian restaurants" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "metadata": { 486 | "id": "XHXY6wEkuGES" 487 | }, 488 | "source": [ 489 | "%%read_sql\n", 490 | "SELECT *\n", 491 | "FROM Critic C\n", 492 | " INNER JOIN Rating R ON R.cID = C.cID\n", 493 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n", 494 | "WHERE cuisine = 'Italian'\n", 495 | "ORDER BY C.affiliation" 496 | ], 497 | "execution_count": null, 498 | "outputs": [] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "id": "0SOsIIepaAcd" 504 | }, 505 | "source": [ 506 | "If we limit to Italian restaurants outside Manhattan, you will see that we get nothing back, as there are no reviews for Italian restaurants outside Manhattan. 
(Notice that there *are* Italian restaurants outside Manhattan, but no reviews for these restaurants.)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "metadata": { 512 | "id": "nAHmBW2luLDY" 513 | }, 514 | "source": [ 515 | "%%read_sql\n", 516 | "SELECT *\n", 517 | "FROM Critic C\n", 518 | " INNER JOIN Rating R ON R.cID = C.cID\n", 519 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n", 520 | "WHERE cuisine = 'Italian' AND borough <> 'Manhattan'\n", 521 | "ORDER BY C.affiliation" 522 | ], 523 | "execution_count": null, 524 | "outputs": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "metadata": { 529 | "id": "Fuh0ziUB5f2k" 530 | }, 531 | "source": [ 532 | "%%read_sql\n", 533 | "SELECT C.affiliation, COUNT(*) AS num_reviews, AVG(starRating) AS avgRating\n", 534 | "FROM Critic C\n", 535 | " INNER JOIN Rating R ON R.cID = C.cID\n", 536 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n", 537 | "WHERE cuisine = 'Italian' \n", 538 | "GROUP BY C.affiliation\n", 539 | "ORDER BY C.affiliation" 540 | ], 541 | "execution_count": null, 542 | "outputs": [] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "metadata": { 547 | "id": "vfpkBEmPvdk0" 548 | }, 549 | "source": [ 550 | "%%read_sql\n", 551 | "SELECT C.affiliation, COUNT(*) AS num_reviews, AVG(starRating) AS avgRating\n", 552 | "FROM Critic C\n", 553 | " INNER JOIN Rating R ON R.cID = C.cID\n", 554 | " INNER JOIN Restaurant T ON T.restCode = R.restCode \n", 555 | "WHERE cuisine = 'Italian' AND borough <> 'Manhattan'\n", 556 | "GROUP BY C.affiliation\n", 557 | "ORDER BY C.affiliation" 558 | ], 559 | "execution_count": null, 560 | "outputs": [] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": { 565 | "id": "dFYmhG1DX3Ob" 566 | }, 567 | "source": [ 568 | "## Question 8: For every borough output the max star rating submitted for any restaurant within this borough (in which borough do you have the best restaurant)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "metadata": { 574 | "id": "F3nEhrb8FBjm" 575 | }, 576 | "source": [ 577 | "%%read_sql\n", 578 | "SELECT *\n", 579 | "FROM Restaurant R \n", 580 | " INNER JOIN Rating T ON T.restCode = R.restCode\n", 581 | "ORDER BY borough" 582 | ], 583 | "execution_count": null, 584 | "outputs": [] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "metadata": { 589 | "id": "CToQxJjeFDyJ" 590 | }, 591 | "source": [ 592 | "%%read_sql\n", 593 | "SELECT borough, MAX(starRating) AS maxRating\n", 594 | "FROM Restaurant R \n", 595 | " INNER JOIN Rating T ON T.restCode = R.restCode\n", 596 | "GROUP BY borough\n", 597 | "ORDER BY borough" 598 | ], 599 | "execution_count": null, 600 | "outputs": [] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "metadata": { 605 | "id": "oza36RHjX7Ij" 606 | }, 607 | "source": [ 608 | "%%read_sql\n", 609 | "SELECT *\n", 610 | "FROM Restaurant R \n", 611 | " LEFT JOIN Rating T ON T.restCode = R.restCode\n", 612 | "ORDER BY borough" 613 | ], 614 | "execution_count": null, 615 | "outputs": [] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "metadata": { 620 | "id": "48I66Fp4DTcP" 621 | }, 622 | "source": [ 623 | "%%read_sql\n", 624 | "SELECT borough, MAX(starRating) AS maxRating\n", 625 | "FROM Restaurant R \n", 626 | " LEFT JOIN Rating T ON T.restCode = R.restCode\n", 627 | "GROUP BY borough\n", 628 | "ORDER BY borough" 629 | ], 630 | "execution_count": null, 631 | "outputs": [] 632 | } 633 | ] 634 | } -------------------------------------------------------------------------------- /session6/F-SQL_Subqueries-7.1.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session6/F-SQL_Subqueries-7.1.pptx -------------------------------------------------------------------------------- /session6/F-SQL_Subqueries.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session6/F-SQL_Subqueries.pptx -------------------------------------------------------------------------------- /session6/assignment_combined.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "colab": { 7 | "name": "Session 6: Putting All Together Queries", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.6" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "SkZqhwkf8NUg" 45 | }, 46 | "source": [ 47 | "# Session 6: Putting All Together Assignment\n", 48 | "\n", 49 | "\n", 50 | "In this segment we will connect to the *Music* database." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "5u_6yLTDT6Kn" 57 | }, 58 | "source": [ 59 | "## Setup" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "O9o9NsaO8hMy" 66 | }, 67 | "source": [ 68 | "# !sudo pip3 install PyMySQL sqlalchemy sql_magic" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "EkIL-uRK8NUi" 77 | }, 78 | "source": [ 79 | "# This code creates a connection to the database\n", 80 | "from sqlalchemy import create_engine\n", 81 | "\n", 82 | "conn_string = \"mysql+pymysql://{user}:{password}@{host}/{db}?charset={encoding}\".format(\n", 83 | " host=\"db.ipeirotis.org\",\n", 84 | " user=\"student\",\n", 85 | " db=\"music\",\n", 86 | " password=\"dwdstudent2015\",\n", 87 | " encoding=\"utf8mb4\",\n", 88 | ")\n", 89 | "\n", 90 | "engine = create_engine(conn_string)\n", 91 | "con = engine.connect()" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "z7muzQXTUFkU" 100 | }, 101 | "source": [ 102 | "%reload_ext sql_magic" 103 | ], 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "uHRIPxBvUGfC" 111 | }, 112 | "source": [ 113 | "%config SQL.conn_name = 'engine'" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "NdgXvtmRLayC" 122 | }, 123 | "source": [ 124 | "This is an example of how you can write an SQL query in the notebook.\n", 125 | " You write your SQL query after the `%%read_sql` line." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "sWa1Uv_6X9zi" 132 | }, 133 | "source": [ 134 | "%%read_sql\n", 135 | "SELECT * \n", 136 | "FROM played" 137 | ], 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "id": "H0hhloRRUJlV" 145 | }, 146 | "source": [ 147 | "## Question 1: Show the name of the artist and the number of albums for each artist in the database. Name the column that shows the number of albums as `num_albums`." 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "eL_CnyPRUSGI" 154 | }, 155 | "source": [ 156 | "%%read_sql\n" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "Hz_1yX-EUeBQ" 165 | }, 166 | "source": [ 167 | "## Question 2: Show the name of the album, the number of tracks in the album, and the total length of the album. Name as `num_tracks` the columns that shows the number of tracks, and name as `length_minutes` the columns that shows the time length of the album." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "HXy0Ygy3Uf_m" 174 | }, 175 | "source": [ 176 | "%%read_sql\n", 177 | "\n" 178 | ], 179 | "execution_count": null, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "id": "UozymxRTW-wx" 186 | }, 187 | "source": [ 188 | "## Question 3: Expand query 2, to show not only the name of the album but also the name of the artist, _in addition_ to the information already shown in query 2 (i.e., name of the album, number of tracks, and album length)." 
189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "metadata": { 194 | "id": "ZwtqsRoGW-wz" 195 | }, 196 | "source": [ 197 | "%%read_sql\n" 198 | ], 199 | "execution_count": null, 200 | "outputs": [] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "id": "8NFSCApmXGZ8" 206 | }, 207 | "source": [ 208 | "## Question 4: For every artist, show the average length of the tracks, and the standard deviation for the track length.\n", 209 | "\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "metadata": { 215 | "id": "bcr5iccDXGZ9" 216 | }, 217 | "source": [ 218 | "%%read_sql\n" 219 | ], 220 | "execution_count": null, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "nezZleqbUeI_" 227 | }, 228 | "source": [ 229 | "## Question 5: Analyze the songs that the user has played and, for each artist, show the first and last time that the user has listened to the artist." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "metadata": { 235 | "id": "rNfpSzT3UgrM" 236 | }, 237 | "source": [ 238 | "%%read_sql\n" 239 | ], 240 | "execution_count": null, 241 | "outputs": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "id": "_j2bCzADXODe" 247 | }, 248 | "source": [ 249 | "## Question 6: Show the name of each track and the time it was played. Include tracks that were not played at all by the user. (Hint: You need an outer join.)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "metadata": { 255 | "id": "o9h005rlXODf" 256 | }, 257 | "source": [ 258 | "%%read_sql\n" 259 | ], 260 | "execution_count": null, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "HF9cHpSDXaZd" 267 | }, 268 | "source": [ 269 | "## Question 7: List all the names of the albums and the album length (across all the album's tracks), where the total album length is longer than 60 minutes." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "metadata": { 275 | "id": "mvLQdbdiXSmw" 276 | }, 277 | "source": [ 278 | "%%read_sql\n" 279 | ], 280 | "execution_count": null, 281 | "outputs": [] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "id": "ntj5f4n8U3dT" 287 | }, 288 | "source": [ 289 | "## Question 8: Show the names of the artists that have more than 12 tracks in the database." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "mPCpyhbwU3dV" 296 | }, 297 | "source": [ 298 | "%%read_sql" 299 | ], 300 | "execution_count": null, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "23bSA8IUU1Og" 307 | }, 308 | "source": [ 309 | "## Question 9: Show how many tracks are on each of the albums by the artist `New Order`. List the name of the album and the number of tracks in the output." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "metadata": { 315 | "id": "hTD97qjaU1Oi" 316 | }, 317 | "source": [ 318 | "%%read_sql\n" 319 | ], 320 | "execution_count": null, 321 | "outputs": [] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "id": "eWgdmtHHURaS" 327 | }, 328 | "source": [ 329 | "## Question 10: By analyzing the songs that the user has played and, for each album, show the first and last time that the user has listened to the album. Include albums that the user has *not* listened at all. List also the artist name for the album. (Hint: you need an *outer* join for this.) 
" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "metadata": { 335 | "id": "ygI_N9gBUhWy" 336 | }, 337 | "source": [ 338 | "%%read_sql\n" 339 | ], 340 | "execution_count": null, 341 | "outputs": [] 342 | } 343 | ] 344 | } -------------------------------------------------------------------------------- /session6/book_vs_gender.sql: -------------------------------------------------------------------------------- 1 | # Total number of users 2 | SELECT COUNT(*) 3 | FROM Profiles; 4 | 5 | SET @allprofiles = (SELECT COUNT(*) FROM Profiles); 6 | # 25784 7 | 8 | SELECT @allprofiles; 9 | 10 | # Break down the user by sex 11 | SELECT Sex,COUNT(*) 12 | FROM Profiles 13 | GROUP BY Sex; 14 | 15 | # Female 12311 16 | # Male 8975 17 | # NULL 4498 18 | 19 | 20 | # Instead of counting all users, lets focus only on users 21 | # that have listed books they like in their profiles 22 | SELECT P.Sex, COUNT(DISTINCT B.ProfileID) AS num_profiles 23 | FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID 24 | GROUP BY P.Sex; 25 | 26 | # 643 27 | # Female 8753 28 | # Male 5974 29 | 30 | SET @males = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID AND P.Sex='Male'); 31 | SET @females = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID AND P.Sex='Female'); 32 | SET @everyone = (SELECT COUNT(DISTINCT B.ProfileID) FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID); 33 | 34 | 35 | # We will only consider books that are liked by a reasonable number 36 | # of users. We will put the threshold at 10, but we we change it 37 | # We will also save the results in a temporary table. 38 | # We divide with the total number of people that like books 39 | DROP TABLE IF EXISTS popular_books; 40 | CREATE TEMPORARY TABLE popular_books AS 41 | SELECT Book, COUNT(*) AS cnt, COUNT(*)/@everyone AS perc 42 | FROM FavoriteBooks B JOIN Profiles P ON P.ProfileID = B.ProfileID 43 | GROUP BY Book 44 | HAVING cnt >= 10 45 | ORDER BY cnt DESC; 46 | 47 | SELECT * FROM popular_books; 48 | 49 | # We now calculate the number of men / women that like each of the popular books 50 | # It is absolutely crucial here to use a LEFT JOIN so that we can keep 51 | # the list of all popular books, even if no men / no women liked that book. 52 | # 53 | # *** There are a lot of nuanced things in this join. *** 54 | # 55 | # a. Notice that we have the condition P.Sex = 'Male' in the JOIN condition 56 | # If we put the condition in the WHERE clause, the WHERE clause is 57 | # executed after the LEFT JOIN, and eliminates all the non-matched Books 58 | # 59 | # b. Notice that we do a COUNT of the P.ProfileID. If we do a count of B.ProfileID 60 | # the results will be completely different (and wrong). That may seem as a 61 | # headscratcher, but you need to remember the behavior of LEFT JOINS for 62 | # unmatched rows. Try executing the LEFT JOIN without the GROUP BY / aggregations 63 | # to understand what is going on before the GROUP BY aggregation. Select both 64 | # the B.ProfileID and the P.ProfileID, which superficially seem to be the same 65 | # as we have the equality condition P.ProfileID = B.ProfileID in the JOIN clause 66 | # 67 | # c. We use a bit of "smoothing" and add 0.5 to the nominator and 1 to the denominator 68 | # when we calculation the percentage. 
That is to avoid zeros, as we 69 | # will be dividing with perc_men and perc_women in the next query 70 | # 71 | DROP TABLE IF EXISTS book_men ; 72 | CREATE TEMPORARY TABLE book_men AS 73 | SELECT B.Book, 74 | COUNT(DISTINCT P.ProfileID) AS cnt_men, 75 | (COUNT(DISTINCT P.ProfileID)+0.5)/(@males+1) AS perc_men 76 | FROM popular_books B 77 | LEFT JOIN FavoriteBooks F ON B.Book = F.Book 78 | LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID AND P.Sex = 'Male' 79 | GROUP BY B.Book 80 | ORDER BY perc_men DESC; 81 | 82 | 83 | 84 | 85 | # We repeat the process for women. Same nuances apply here as in the join just above. 86 | DROP TABLE IF EXISTS book_women; 87 | CREATE TEMPORARY TABLE book_women AS 88 | SELECT B.Book, 89 | COUNT(DISTINCT P.ProfileID) AS cnt_women, 90 | (COUNT(DISTINCT P.ProfileID)+0.5)/(@females+1) AS perc_women 91 | FROM popular_books B 92 | LEFT JOIN FavoriteBooks F ON B.Book = F.Book 93 | LEFT JOIN Profiles P ON P.ProfileID = F.ProfileID AND P.Sex = 'Female' 94 | GROUP BY B.Book 95 | ORDER BY perc_women DESC; 96 | 97 | 98 | 99 | 100 | # Once we have our subqueries in place, we join the two tables and calculate the 101 | # "lift". The lift is defined as the "probability of seeing something in one population" 102 | # divided by the "probability of seeing something in a contrasting population". 103 | # In this case, we compare percentages (~probabilities) in the populations of men vs women 104 | # 105 | # Notice here that we calculate the lift by dividing with perc_men (or perc_women later) 106 | # hence the need to have a non-zero value for perc_men when creating the book_men table. 107 | # 108 | # We could have done it with women vs the overall population as well, but the "overall" 109 | # population includes women as well, so the contrast is not great. 110 | # 111 | # Alternatively, we could have done women vs rest; and men vs rest. We leave that 112 | # calculation as an exercise for the interested reader. 113 | SELECT B.Book, B.cnt, B.perc, 114 | M.cnt_men, M.perc_men, 115 | F.cnt_women, F.perc_women, 116 | perc_men /perc_women AS lift_men_vs_women, 117 | perc_women / perc_men AS lift_women_vs_men 118 | FROM popular_books B 119 | LEFT JOIN book_men M ON M.Book = B.Book 120 | LEFT JOIN book_women F ON F.Book = B.Book 121 | ORDER BY lift_women_vs_men DESC, cnt_women DESC; 122 | 123 | -------------------------------------------------------------------------------- /session6/books_and_political_views.sql: -------------------------------------------------------------------------------- 1 | USE facebook; 2 | 3 | -- We introduce variables to avoid hardcoding 4 | SET @all_students = (SELECT COUNT(*) FROM Profiles); 5 | 6 | -- This is our master table. Contains the total likes for each book. 
7 | -- It also contains the "percentage" which normalizes the number of likes 8 | -- with the population that is available to like a book 9 | DROP TABLE IF EXISTS book_likes; 10 | CREATE TEMPORARY TABLE book_likes AS 11 | SELECT Book, COUNT(ProfileID) AS cnt, COUNT(ProfileID)/@all_students AS perc 12 | FROM FavoriteBooks 13 | GROUP BY Book 14 | ORDER BY cnt DESC; 15 | 16 | 17 | 18 | -- We now introduce the table that stores the likes for the liberal population, 19 | -- together with the normalized percentage, after dividing with the number of 20 | -- liberal students 21 | SET @liberals = (SELECT COUNT(*) AS cnt FROM Profiles WHERE PoliticalViews = 'Liberal'); 22 | 23 | DROP TABLE IF EXISTS book_liberals; 24 | CREATE TEMPORARY TABLE book_liberals AS 25 | SELECT Book, COUNT(P.ProfileID) AS cnt_libs, COUNT(P.ProfileID)/@liberals AS perc_libs 26 | FROM FavoriteBooks B INNER JOIN Profiles P ON P.ProfileID = B.ProfileID 27 | WHERE P.PoliticalViews = 'Liberal' 28 | GROUP BY Book 29 | ORDER BY cnt_libs DESC; 30 | 31 | 32 | -- And same for the conservatives 33 | SET @conservatives = (SELECT COUNT(*) AS cnt FROM Profiles WHERE PoliticalViews = 'Conservative'); 34 | 35 | DROP TABLE IF EXISTS book_conservatives; 36 | CREATE TEMPORARY TABLE book_conservatives AS 37 | SELECT Book, COUNT(P.ProfileID) AS cnt_cons, COUNT(P.ProfileID)/@conservatives AS perc_cons 38 | FROM FavoriteBooks B INNER JOIN Profiles P ON P.ProfileID = B.ProfileID 39 | WHERE P.PoliticalViews = 'Conservative' 40 | GROUP BY Book 41 | ORDER BY cnt_cons DESC; 42 | 43 | 44 | -- Once we have the full list of likes for all books, we can now 45 | -- perform two LEFT JOINS with liberal and conservative likes 46 | -- and have a list of all books. Notice what would have happened 47 | -- if we had used an INNER JOIN instead (we would have missed 48 | -- books without likes in the liberal or in the conservative 49 | -- population) 50 | -- 51 | -- Notice also the use of the COALESCE function, which checks 52 | -- if an attribute is NULL; if yes, replaces it with the second 53 | -- argument. In our case, we replace NULLs with 0, as these are the 54 | -- "zero likes" books. 
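-- A minimal illustration of how COALESCE behaves (the literal values below are
-- hypothetical, used purely for demonstration):
--   SELECT COALESCE(NULL, 0);  -- returns 0, i.e., a "zero likes" book
--   SELECT COALESCE(17, 0);    -- returns 17, an existing count is kept as-is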
55 | -- 56 | -- Finally, notice the calculation of "perc_nonlibs" and "perc_noncons", 57 | -- which computes the percentages over the non-liberal and non-conservative populations 58 | DROP TABLE IF EXISTS book_comparison; 59 | CREATE TEMPORARY TABLE book_comparison AS 60 | SELECT B.Book 61 | , B.cnt 62 | , B.perc AS perc 63 | , COALESCE(L.cnt_libs,0) AS cnt_libs 64 | , COALESCE(L.perc_libs,0) AS perc_libs 65 | , COALESCE(C.cnt_cons,0) AS cnt_cons 66 | , COALESCE(C.perc_cons,0) AS perc_cons 67 | , (B.cnt - COALESCE(L.cnt_libs,0)) / (@all_students - @liberals) AS perc_nonlibs 68 | , (B.cnt - COALESCE(C.cnt_cons,0)) / (@all_students - @conservatives) AS perc_noncons 69 | FROM 70 | book_likes B 71 | LEFT JOIN book_liberals L ON B.Book = L.Book 72 | LEFT JOIN book_conservatives C ON B.Book = C.Book 73 | WHERE B.cnt > 5 -- Only keep books with more than 5 likes overall, to avoid very noisy entries 74 | ORDER BY cnt DESC; 75 | 76 | 77 | 78 | DROP TABLE IF EXISTS book_scores; 79 | CREATE TEMPORARY TABLE book_scores AS 80 | SELECT * 81 | , perc_libs/perc_nonlibs AS lift_libs 82 | , LOG10(perc_libs/perc_nonlibs + 0.001) AS logodds_libs 83 | , perc_cons/perc_noncons AS lift_cons 84 | , LOG10(perc_cons/perc_noncons + 0.001) AS logodds_cons 85 | FROM book_comparison; 86 | 87 | SELECT * FROM book_scores; 88 | 89 | -- Now that we have the scores for each book, we can try to score individuals 90 | DROP TABLE IF EXISTS user_scores; 91 | CREATE TEMPORARY TABLE user_scores AS 92 | SELECT P.ProfileID, P.PoliticalViews 93 | , AVG(logodds_libs) AS avg_lib 94 | , AVG(logodds_cons) AS avg_cons 95 | , COUNT(*) AS cnt_books 96 | , CASE WHEN AVG(logodds_libs) > AVG(logodds_cons) THEN "Liberal" ELSE "Conservative" END AS Estimate 97 | FROM 98 | Profiles P 99 | JOIN FavoriteBooks B ON P.ProfileID = B.ProfileID 100 | JOIN book_scores S ON B.Book = S.Book 101 | GROUP BY ProfileID, P.PoliticalViews 102 | ORDER BY ProfileID; 103 | 104 | -- Classification Statistics 105 | SELECT PoliticalViews, Estimate, COUNT(*) 106 | FROM user_scores 107 | GROUP BY PoliticalViews, Estimate 108 | ORDER BY PoliticalViews, Estimate; 109 | 110 | -- 111 | -- Classified 112 | -- Cons Libs 113 | -- Correct Cons 360 166 526 114 | -- Libs 455 3829 4284 115 | -- 116 | -- Cons Libs 117 | -- Cons 0.684410646 0.315589354 118 | -- Libs 0.10620915 0.89379085 119 | -------------------------------------------------------------------------------- /session6/music_recommendations.sql: -------------------------------------------------------------------------------- 1 | USE facebook; 2 | 3 | # Store the number of people that have liked (any) music 4 | SET @allmusicfans = (SELECT COUNT(DISTINCT ProfileID) FROM FavoriteMusic); 5 | 6 | # Create a table/view that stores the preferences across the whole population 7 | # We store the band and the percentage of people that liked it 8 | DROP TEMPORARY TABLE IF EXISTS MusicPreferences; 9 | CREATE TEMPORARY TABLE MusicPreferences AS 10 | SELECT Music, 11 | ROUND(COUNT(DISTINCT ProfileID)/@allmusicfans,4) AS perc, 12 | COUNT(DISTINCT ProfileID) AS cnt 13 | FROM FavoriteMusic 14 | GROUP BY Music 15 | ORDER BY perc DESC; 16 | 17 | 18 | # Set the band that we are analyzing 19 | SET @band = 'Bon Jovi'; 20 | 21 | # Store the number of people that like the specific band 22 | SET @bandfans = (SELECT cnt FROM MusicPreferences WHERE Music = @band); 23 | 24 | 25 | # Create a table with the percentages across only people that like the band 26 | # that we specified in the variable @band 27 | DROP TEMPORARY TABLE IF EXISTS BandFans; 28 | CREATE TEMPORARY
TABLE BandFans AS 29 | SELECT Music, 30 | ROUND(COUNT(DISTINCT ProfileID)/@bandfans,4) AS perc, 31 | COUNT(DISTINCT ProfileID) AS cnt 32 | FROM FavoriteMusic 33 | WHERE ProfileID IN ( 34 | SELECT ProfileID 35 | FROM FavoriteMusic 36 | WHERE Music = @band 37 | ) 38 | GROUP BY Music 39 | ORDER BY perc DESC; 40 | 41 | 42 | # Join the two tables above to compare the percentages of likes 43 | # in the overall population (T.perc) vs the percentage of likes 44 | # across the population of people that like the @band (R.perc) 45 | # We call the ratio of the two percentages the "lift" 46 | SELECT T.Music, 47 | R.perc AS perc_focus, R.cnt AS cnt_focus, 48 | T.perc AS perc_total, T.cnt AS cnt_total, 49 | R.perc/T.perc AS lift_ratio 50 | FROM BandFans R JOIN MusicPreferences T ON R.Music = T.Music 51 | ORDER BY lift_ratio DESC; 52 | 53 | # Improving the details now. 54 | # Below we introduce a few fixes to remove noise and 55 | # make the results more presentable. 56 | 57 | 58 | # To avoid noise, we keep only bands that have at least 100 59 | # likes in the overall population. 60 | # The variable @min_fans is the minimum number of fans 61 | # required for a band to be analyzed 62 | SET @min_fans = 100; 63 | 64 | # Join the two tables above to compare the percentages of likes 65 | # in the overall population (T.perc) vs the percentage of likes 66 | # across the population of people that like the @band (R.perc) 67 | # We use an OUTER join to keep all the bands from the overall 68 | # population, even if they do not appear in the likes of the 69 | # fans of the target artist. 70 | # The COALESCE function replaces NULL values with 0.0 71 | # 72 | SELECT T.Music, 73 | COALESCE(R.perc,0.0) AS perc_focus, COALESCE(R.cnt,0) AS cnt_focus, 74 | T.perc AS perc_total, T.cnt AS cnt_total, 75 | COALESCE(R.perc/T.perc,0) AS lift_ratio 76 | FROM MusicPreferences T LEFT JOIN BandFans R ON R.Music = T.Music 77 | WHERE T.cnt>@min_fans AND (R.Music IS NULL OR R.Music != @band) 78 | ORDER BY lift_ratio DESC; 79 | -------------------------------------------------------------------------------- /session6/music_recommendations2.sql: -------------------------------------------------------------------------------- 1 | # We use this number to go from raw number of likes to _percentage_ of people that like a band 2 | SET @allfans = (SELECT COUNT(DISTINCT ProfileID) FROM FavoriteMusic); 3 | 4 | # We will not consider bands with fewer than @min_fans fans 5 | SET @min_fans = 40; 6 | 7 | WITH 8 | 9 | # Calculate the number of people that like a band 10 | # To avoid noise, we keep only bands with at least @min_fans fans 11 | MusicPreferences AS ( 12 | SELECT Music, COUNT(ProfileID) AS cnt, COUNT(ProfileID)/@allfans AS perc 13 | FROM FavoriteMusic 14 | GROUP BY Music 15 | HAVING cnt >= @min_fans 16 | ), 17 | 18 | # For all pairs of bands, calculate the number of people that like both bands 19 | # The M1.Music < M2.Music condition ensures that we count each pair of bands only once 20 | # and that we do not pair a band with itself 21 | CommonFans AS ( 22 | SELECT M1.Music AS Music1, M2.Music AS Music2, COUNT(*) AS common_fans 23 | FROM FavoriteMusic M1 24 | JOIN FavoriteMusic M2 ON M1.ProfileID = M2.ProfileID AND M1.Music < M2.Music -------------------------------------------------------------------------------- /session6/music_recommendations3.sql: -------------------------------------------------------------------------------- 12 | HAVING cnt >= @min_fans 13 | ), 14 | 15 | # For all pairs of bands, calculate the number of people that like both bands 16 | # The M1.Music < M2.Music condition ensures that we count each pair of bands only once 17 | # and that we do not pair a band with itself 18 | CommonFans AS ( 19 | SELECT M1.Music AS Music1, M2.Music AS Music2, COUNT(*) AS common_fans 20 | FROM FavoriteMusic M1 21 | JOIN FavoriteMusic M2 ON M1.ProfileID = M2.ProfileID AND M1.Music < M2.Music 22 | GROUP BY M1.Music, M2.Music 23 | HAVING common_fans >= @min_fans 24 | ) 25 | 26 | # Put together data about common fans, and overall fans for each band, 27 | # calculate percentages of fans of
band1 that like band2, and vice versa 28 | SELECT M1.Music AS Music1, M1.cnt AS cnt1_overall, 29 | M2.Music AS Music2, M2.cnt AS cnt2_overall, 30 | C.common_fans, C.common_fans/(M1.cnt + M2.cnt - C.common_fans) AS jaccard 31 | FROM CommonFans C 32 | JOIN MusicPreferences M1 ON M1.Music = C.Music1 33 | JOIN MusicPreferences M2 ON M2.Music = C.Music2 34 | ORDER BY jaccard DESC 35 | 36 | -------------------------------------------------------------------------------- /session7/G-Window_queries.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipeirotis/introduction-to-databases/429f66f9dd31413fac754c5832f40e0bab8e6aac/session7/G-Window_queries.pptx -------------------------------------------------------------------------------- /session7/README.md: -------------------------------------------------------------------------------- 1 | # Window Functions 2 | 3 | https://antonz.org/sql-window-functions-book/ 4 | 5 | https://www.mysqltutorial.org/mysql-window-functions/ 6 | 7 | https://www.stratascratch.com/blog/the-ultimate-guide-to-sql-window-functions/ 8 | 9 | https://towardsdatascience.com/a-guide-to-advanced-sql-window-functions-f63f2642cbf9 10 | 11 | https://mode.com/sql-tutorial/sql-window-functions/ 12 | 13 | https://www.geeksforgeeks.org/window-functions-in-sql/ 14 | 15 | https://www.toptal.com/sql/intro-to-sql-windows-functions 16 | -------------------------------------------------------------------------------- /session7/music_by_gender_rank_example.sql: -------------------------------------------------------------------------------- 1 | USE facebook; 2 | 3 | # We create first a table of popular music across everyone 4 | # For efficiency we only keep music liked by more than 10 people 5 | DROP TEMPORARY TABLE IF EXISTS popular_music; 6 | CREATE TEMPORARY TABLE popular_music AS 7 | SELECT M.Music AS music, COUNT(M.ProfileID) AS cnt 8 | FROM FavoriteMusic M 9 | GROUP BY M.Music 10 | HAVING cnt > 10 11 | ORDER BY cnt DESC; 12 | 13 | 14 | # We now calculate popularity of each music broken down by sex 15 | DROP TEMPORARY TABLE IF EXISTS popular_music_by_sex; 16 | CREATE TEMPORARY TABLE popular_music_by_sex AS 17 | SELECT M.Music AS music, P.Sex AS gender, COUNT(P.ProfileID) AS cnt 18 | FROM FavoriteMusic M JOIN Profiles P ON P.ProfileID = M.ProfileID 19 | WHERE P.Sex IS NOT NULL 20 | GROUP BY M.Music, P.Sex 21 | ORDER BY cnt DESC; 22 | 23 | # We will now create two tables with music rank, one per gender 24 | # In principle, we could also do a window OVER (PARTITION BY gender ORDER BY cnt DESC) 25 | # However, MySQL has a bug that does not allow temporary tables to 26 | # join with itself, so we end up creating one temp table for males and another for females 27 | DROP TEMPORARY TABLE IF EXISTS chart_male; 28 | CREATE TEMPORARY TABLE chart_male AS 29 | SELECT music, cnt, 30 | RANK() OVER (ORDER BY cnt DESC) AS music_rank 31 | FROM popular_music_by_sex 32 | WHERE gender = 'Male' 33 | ORDER BY music_rank; 34 | 35 | DROP TEMPORARY TABLE IF EXISTS chart_female; 36 | CREATE TEMPORARY TABLE chart_female AS 37 | SELECT music, cnt, 38 | RANK() OVER (ORDER BY cnt DESC) AS music_rank 39 | FROM popular_music_by_sex 40 | WHERE gender = 'Female' 41 | ORDER BY music_rank; 42 | 43 | # Finally we bring everything together. 
44 | # Note that we start with popular_music as a reference table 45 | # and we LEFT JOIN the other two tables, as there is no guarantee 46 | # that every music entry appears in both the male and the female charts. 47 | # To estimate the difference between males and females we take the 48 | # log of the ranks and then their difference; the reason we do that 49 | # is that a difference of 5 between No1 and No6 is very different 50 | # from a difference of 5 between No605 and No610. With LOG we roughly 51 | # estimate the difference in "orders of magnitude" 52 | SELECT S.music, S.cnt, 53 | M.cnt AS male_cnt, M.music_rank AS male_rank, 54 | F.cnt AS female_cnt, F.music_rank AS female_rank, 55 | ROUND(-LOG(F.music_rank / M.music_rank),2) AS diff_females 56 | FROM popular_music S 57 | LEFT JOIN chart_male M ON (S.music = M.music) 58 | LEFT JOIN chart_female F ON (S.music = F.music) 59 | ORDER BY diff_females DESC; 60 | --------------------------------------------------------------------------------
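# A sketch of the window-function alternative mentioned in the comments of
# music_by_gender_rank_example.sql above. It assumes the same facebook schema
# (FavoriteMusic, Profiles) and a MySQL 8+ server; ranking inside a single CTE
# with OVER (PARTITION BY ...) avoids the two temporary tables and the
# temporary-table self-join limitation. It is a simplified illustration: it
# keeps only music that appears in both gender charts and skips the
# "more than 10 likes overall" popularity filter of the original script.
WITH ranked AS (
  SELECT M.Music AS music,
         P.Sex AS gender,
         COUNT(P.ProfileID) AS cnt,
         RANK() OVER (PARTITION BY P.Sex ORDER BY COUNT(P.ProfileID) DESC) AS music_rank
  FROM FavoriteMusic M
  JOIN Profiles P ON P.ProfileID = M.ProfileID
  WHERE P.Sex IS NOT NULL
  GROUP BY M.Music, P.Sex
)
SELECT M.music,
       M.cnt AS male_cnt,   M.music_rank AS male_rank,
       F.cnt AS female_cnt, F.music_rank AS female_rank,
       # e.g., LOG(6/1) is about 1.79 while LOG(610/605) is about 0.01, which is
       # why the log of the rank ratio captures "orders of magnitude" differences
       ROUND(-LOG(F.music_rank / M.music_rank), 2) AS diff_females
FROM ranked M
JOIN ranked F ON F.music = M.music AND F.gender = 'Female'
WHERE M.gender = 'Male'
ORDER BY diff_females DESC;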