├── .gitignore ├── convert_actions.py ├── create_actions.sql ├── customers.tsv ├── data.json ├── drop.sql ├── generate_actions.py ├── generate_actions_dt.py ├── generate_users.py ├── ordered_items.tsv ├── small_actions.csv ├── spark.py ├── spark2.py └── users.csv /.gitignore: -------------------------------------------------------------------------------- 1 | big*_actions.csv 2 | the_database.db 3 | *.pyc 4 | -------------------------------------------------------------------------------- /convert_actions.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | import json 8 | 9 | with open('big_actions.json', 'w') as f: 10 | for line in open('big_actions.csv'): 11 | r = line.split(',') 12 | j = { 13 | 'name': r[0], 14 | 'product': r[1], 15 | 'action': r[2], 16 | 'price': r[3], 17 | } 18 | f.write("%s\n" % json.dumps(j)) 19 | -------------------------------------------------------------------------------- /create_actions.sql: -------------------------------------------------------------------------------- 1 | -- This file was made for the class: 2 | -- SQL for Marekters: Dominate data analytics, data science, and big data 3 | 4 | -- It can be found at: 5 | -- https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | CREATE TABLE user_actions(name TEXT, product TEXT, action TEXT, price REAL); 8 | -------------------------------------------------------------------------------- /customers.tsv: -------------------------------------------------------------------------------- 1 | customerid firstname lastname city state 2 | 10101 John Gray Lynden Washington 3 | 10298 Leroy Brown Pinetop Arizona 4 | 10299 Elroy Keller Snoqualmie Washington 5 | 10315 Lisa Jones Oshkosh Wisconsin 6 | 10325 Ginger Schultz Pocatello Idaho 7 | 10329 Kelly Mendoza Kailua Hawaii 8 | 10330 Shawn Dalton Cannon Beach Oregon 9 | 10338 Michael Howell Tillamook Oregon 10 | 10339 Anthony Sanchez Winslow Arizona 11 | 10408 Elroy Cleaver Globe Arizona 12 | 10410 Mary Ann Howell Charleston South Carolina 13 | 10413 Donald Davids Gila Bend Arizona 14 | 10419 Linda Sakahara Nogales Arizona 15 | 10429 Sarah Graham Greensboro North Carolina 16 | 10438 Kevin Smith Durango Colorado 17 | 10439 Conrad Giles Telluride Colorado 18 | 10449 Isabela Moore Yuma Arizona -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | {"name":"Bob","age":20} 2 | {"name":"Jane","age":25} -------------------------------------------------------------------------------- /drop.sql: -------------------------------------------------------------------------------- 1 | -- Example of dropping a column (new in SQLite 3.35.0) 2 | 3 | -- create table 4 | create table mytable (first_name text, last_name text, email text); 5 | 6 | -- check schema 7 | -- .schema mytable 8 | 9 | -- insert dummy data 10 | insert into mytable (first_name, last_name, email) values ('Alice', 'In Chains', 'alice@gmail.com'); 11 | insert into mytable (first_name, last_name, email) values ('Bob', 'Baker', 'bob@gmail.com'); 12 | 13 | -- format nicely 14 | -- .mode table 15 | 16 | -- look at the data 17 | select * from mytable; 18 | 19 | -- drop column 20 | alter table mytable drop column email; 21 | 22 | -- look at the data again (email is gone) 23 | select * from mytable; 24 | 25 | -- check schema again 26 | -- .schema mytable 27 | 28 | -- clean up table 29 | drop table mytable; -------------------------------------------------------------------------------- /generate_actions.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | from __future__ import print_function, division 8 | from builtins import range 9 | # Note: you may need to update your version of future 10 | # sudo pip install -U future 11 | 12 | import sys 13 | import random 14 | 15 | NAMES = ['Alice', 'Bob', 'Carol', 'Dave', 'Emily', 'Frank', 'Gina'] 16 | PRODUCTS = ['Apple', 'Orange', 'Banana', 'Blueberry', 'Raspberry', 'Apricot', 'Cherry', 'Grape', 'Mango'] 17 | ACTIONS = ['view', 'addtocart', 'purchase'] 18 | 19 | def generate(N, fn): 20 | with open(fn, 'w') as f: 21 | i = 0 22 | while i < N: 23 | name = random.choice(NAMES) 24 | product = random.choice(PRODUCTS) 25 | # action = random.choice(ACTIONS) 26 | price = str(0.99) 27 | 28 | # make sure every purchase has an addtocart and view 29 | # make sure every addtocart has a view 30 | a = random.randint(1, 3) 31 | for j in range(a): 32 | action = ACTIONS[j] 33 | f.write("%s,%s,%s,%s\n" % (name, product, action, price)) 34 | i += 1 35 | 36 | 37 | if __name__ == '__main__': 38 | generate(int(sys.argv[1]), sys.argv[2]) 39 | -------------------------------------------------------------------------------- /generate_actions_dt.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | import sys 8 | import random 9 | # from datetime import datetime 10 | 11 | NAMES = ['Alice', 'Bob', 'Carol', 'Dave', 'Emily', 'Frank', 'Gina'] 12 | PRODUCTS = ['Apple', 'Orange', 'Banana', 'Blueberry', 'Raspberry', 'Apricot', 'Cherry', 'Grape', 'Mango'] 13 | 14 | def generate(N, fn): 15 | with open(fn, 'w') as f: 16 | i = 0 17 | while i < N: 18 | name = random.choice(NAMES) 19 | product = random.choice(PRODUCTS) 20 | price = str(0.99) 21 | 22 | year = random.choice(['2014', '2015']) 23 | month = str(random.choice(range(12)) + 1) 24 | if len(month) == 1: 25 | month = "0" + month 26 | day = str(random.choice(range(28)) + 1) 27 | if len(day) == 1: 28 | day = "0" + day 29 | dt = "%s-%s-%s 00:00:00" % (year, month, day) 30 | 31 | f.write("%s,%s,%s,%s,%s\n" % (name, product, 'purchase', price, dt)) 32 | i += 1 33 | 34 | 35 | if __name__ == '__main__': 36 | generate(1000000, 'dt_actions.csv') 37 | -------------------------------------------------------------------------------- /generate_users.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | import sys 8 | import random 9 | 10 | from generate_actions import NAMES 11 | 12 | LOCATIONS = ['Los Angeles', 'New York', 'Chicago', 'Las Vegas'] 13 | 14 | def generate(): 15 | with open('users.csv', 'w') as f: 16 | for name in NAMES: 17 | location = random.choice(LOCATIONS) 18 | age = random.randint(18, 65) 19 | f.write("%s,%s,%s\n" % (name, str(age), location)) 20 | 21 | 22 | if __name__ == '__main__': 23 | generate() 24 | -------------------------------------------------------------------------------- /ordered_items.tsv: -------------------------------------------------------------------------------- 1 | customerid order_date item quantity price 2 | 10330 30-Jun-1999 Pogo stick 1 28.00 3 | 10101 30-Jun-1999 Raft 1 58.00 4 | 10298 01-Jul-1999 Skateboard 1 33.00 5 | 10101 01-Jul-1999 Life Vest 4 125.00 6 | 10299 06-Jul-1999 Parachute 1 1250.00 7 | 10339 27-Jul-1999 Umbrella 1 4.50 8 | 10449 13-Aug-1999 Unicycle 1 180.79 9 | 10439 14-Aug-1999 Ski Poles 2 25.50 10 | 10101 18-Aug-1999 Rain Coat 1 18.30 11 | 10449 01-Sep-1999 Snow Shoes 1 45.00 12 | 10439 18-Sep-1999 Tent 1 88.00 13 | 10298 19-Sep-1999 Lantern 2 29.00 14 | 10410 28-Oct-1999 Sleeping Bag 1 89.22 15 | 10438 01-Nov-1999 Umbrella 1 6.75 16 | 10438 02-Nov-1999 Pillow 1 8.50 17 | 10298 01-Dec-1999 Helmet 1 22.00 18 | 10449 15-Dec-1999 Bicycle 1 380.50 19 | 10449 22-Dec-1999 Canoe 1 280.00 20 | 10101 30-Dec-1999 Hoola Hoop 3 14.75 21 | 10330 01-Jan-2000 Flashlight 4 28.00 22 | 10101 02-Jan-2000 Lantern 1 16.00 23 | 10299 18-Jan-2000 Inflatable Mattress 1 38.00 24 | 10438 18-Jan-2000 Tent 1 79.99 25 | 10413 19-Jan-2000 Lawnchair 4 32.00 26 | 10410 30-Jan-2000 Unicycle 1 192.50 27 | 10315 2-Feb-2000 Compass 1 8.00 28 | 10449 29-Feb-2000 Flashlight 1 4.50 29 | 10101 08-Mar-2000 Sleeping Bag 2 88.70 30 | 10298 18-Mar-2000 Pocket Knife 1 22.38 31 | 10449 19-Mar-2000 Canoe paddle 2 40.00 32 | 10298 01-Apr-2000 Ear Muffs 1 12.50 33 | 10330 19-Apr-2000 Shovel 1 16.75 -------------------------------------------------------------------------------- /small_actions.csv: -------------------------------------------------------------------------------- 1 | Gina,Orange,view,0.99 2 | Dave,Apricot,addtocart,0.99 3 | Gina,Mango,purchase,0.99 4 | Carol,Banana,purchase,0.99 5 | Gina,Grape,view,0.99 6 | Gina,Banana,view,0.99 7 | Alice,Cherry,addtocart,0.99 8 | Carol,Mango,addtocart,0.99 9 | Alice,Apricot,purchase,0.99 10 | Gina,Cherry,purchase,0.99 11 | -------------------------------------------------------------------------------- /spark.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | from pyspark import SparkContext 8 | from pyspark.sql import SQLContext 9 | 10 | sc = SparkContext("local", "Simple App") 11 | sqlContext = SQLContext(sc) 12 | 13 | df = sqlContext.read.json("data.json") 14 | 15 | # Displays the content of the DataFrame to stdout 16 | df.show() -------------------------------------------------------------------------------- /spark2.py: -------------------------------------------------------------------------------- 1 | # This file was made for the class: 2 | # SQL for Marekters: Dominate data analytics, data science, and big data 3 | # 4 | # It can be found at: 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data 6 | 7 | from pyspark import SparkContext 8 | from pyspark.sql import SQLContext 9 | 10 | sc = SparkContext("local", "Simple App") 11 | sqlContext = SQLContext(sc) 12 | 13 | df = sqlContext.read.json("big_actions.json") 14 | df.registerTempTable("user_actions") 15 | 16 | df2 = sqlContext.sql("SELECT COUNT(*), product FROM user_actions WHERE action = 'purchase' GROUP BY product") 17 | 18 | df2.show() 19 | -------------------------------------------------------------------------------- /users.csv: -------------------------------------------------------------------------------- 1 | Alice,29,Chicago 2 | Bob,24,New York 3 | Carol,39,New York 4 | Dave,27,New York 5 | Emily,30,Los Angeles 6 | Frank,36,Los Angeles 7 | Gina,25,Las Vegas 8 | --------------------------------------------------------------------------------