├── .gitignore
├── convert_actions.py
├── create_actions.sql
├── customers.tsv
├── data.json
├── drop.sql
├── generate_actions.py
├── generate_actions_dt.py
├── generate_users.py
├── ordered_items.tsv
├── small_actions.csv
├── spark.py
├── spark2.py
└── users.csv


/.gitignore:
--------------------------------------------------------------------------------
1 | big*_actions.csv
2 | the_database.db
3 | *.pyc
4 | 


--------------------------------------------------------------------------------
/convert_actions.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | import json
 8 | 
 9 | with open('big_actions.json', 'w') as f:
10 |     for line in open('big_actions.csv'):
11 |         r = line.split(',')
12 |         j = {
13 |             'name': r[0],
14 |             'product': r[1],
15 |             'action': r[2],
16 |             'price': r[3],
17 |         }
18 |         f.write("%s\n" % json.dumps(j))
19 | 


--------------------------------------------------------------------------------
/create_actions.sql:
--------------------------------------------------------------------------------
1 | -- This file was made for the class:
2 | -- SQL for Marketers: Dominate data analytics, data science, and big data
3 | 
4 | -- It can be found at:
5 | -- https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
6 | 
7 | -- One row per user event; column order matches the CSV rows written by generate_actions.py
8 | CREATE TABLE user_actions(name TEXT, product TEXT, action TEXT, price REAL);
9 | 


--------------------------------------------------------------------------------
/customers.tsv:
--------------------------------------------------------------------------------
 1 | customerid	firstname	lastname	city	state
 2 | 10101	John	Gray	Lynden	Washington
 3 | 10298	Leroy	Brown	Pinetop	Arizona
 4 | 10299	Elroy	Keller	Snoqualmie	Washington
 5 | 10315	Lisa	Jones	Oshkosh	Wisconsin
 6 | 10325	Ginger	Schultz	Pocatello	Idaho
 7 | 10329	Kelly	Mendoza	Kailua	Hawaii
 8 | 10330	Shawn	Dalton	Cannon Beach	Oregon
 9 | 10338	Michael	Howell	Tillamook	Oregon
10 | 10339	Anthony	Sanchez	Winslow	Arizona
11 | 10408	Elroy	Cleaver	Globe	Arizona
12 | 10410	Mary Ann	Howell	Charleston	South Carolina
13 | 10413	Donald	Davids	Gila Bend	Arizona
14 | 10419	Linda	Sakahara	Nogales	Arizona
15 | 10429	Sarah	Graham	Greensboro	North Carolina
16 | 10438	Kevin	Smith	Durango	Colorado
17 | 10439	Conrad	Giles	Telluride	Colorado
18 | 10449	Isabela	Moore	Yuma	Arizona


--------------------------------------------------------------------------------
/data.json:
--------------------------------------------------------------------------------
1 | {"name":"Bob","age":20}
2 | {"name":"Jane","age":25}


--------------------------------------------------------------------------------
/drop.sql:
--------------------------------------------------------------------------------
 1 | -- Demo: removing a column with ALTER TABLE ... DROP COLUMN (added in SQLite 3.35.0)
 2 | 
 3 | -- Set up a scratch table
 4 | CREATE TABLE mytable (first_name TEXT, last_name TEXT, email TEXT);
 5 | 
 6 | -- Inspect the schema (sqlite3 shell command)
 7 | -- .schema mytable
 8 | 
 9 | -- Add two sample rows
10 | INSERT INTO mytable (first_name, last_name, email)
11 |     VALUES ('Alice', 'In Chains', 'alice@gmail.com');
12 | INSERT INTO mytable (first_name, last_name, email)
13 |     VALUES ('Bob', 'Baker', 'bob@gmail.com');
14 | 
15 | -- Pretty-print query results (sqlite3 shell command)
16 | -- .mode table
17 | 
18 | -- Show the rows, email column included
19 | SELECT * FROM mytable;
20 | 
21 | -- Remove the email column
22 | ALTER TABLE mytable DROP COLUMN email;
23 | 
24 | -- Show the rows again: email is gone
25 | SELECT * FROM mytable;
26 | 
27 | -- Inspect the schema once more
28 | -- .schema mytable
29 | 
30 | -- Remove the scratch table
31 | DROP TABLE mytable;


--------------------------------------------------------------------------------
/generate_actions.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | from __future__ import print_function, division
 8 | from builtins import range
 9 | # Note: you may need to update your version of future
10 | # sudo pip install -U future
11 | 
12 | import sys
13 | import random
14 | 
15 | NAMES = ['Alice', 'Bob', 'Carol', 'Dave', 'Emily', 'Frank', 'Gina']
16 | PRODUCTS = ['Apple', 'Orange', 'Banana', 'Blueberry', 'Raspberry', 'Apricot', 'Cherry', 'Grape', 'Mango']
17 | ACTIONS = ['view', 'addtocart', 'purchase']
18 | 
19 | def generate(N, fn):
20 |     with open(fn, 'w') as f:
21 |         i = 0
22 |         while i < N:
23 |             name = random.choice(NAMES)
24 |             product = random.choice(PRODUCTS)
25 |             # action = random.choice(ACTIONS)
26 |             price = str(0.99)
27 | 
28 |             # make sure every purchase has an addtocart and view
29 |             # make sure every addtocart has a view
30 |             a = random.randint(1, 3)
31 |             for j in range(a):
32 |                 action = ACTIONS[j]
33 |                 f.write("%s,%s,%s,%s\n" % (name, product, action, price))
34 |                 i += 1
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     generate(int(sys.argv[1]), sys.argv[2])
39 | 


--------------------------------------------------------------------------------
/generate_actions_dt.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | import sys
 8 | import random
 9 | # from datetime import datetime
10 | 
11 | NAMES = ['Alice', 'Bob', 'Carol', 'Dave', 'Emily', 'Frank', 'Gina']
12 | PRODUCTS = ['Apple', 'Orange', 'Banana', 'Blueberry', 'Raspberry', 'Apricot', 'Cherry', 'Grape', 'Mango']
13 | 
14 | def generate(N, fn):
15 |     with open(fn, 'w') as f:
16 |         i = 0
17 |         while i < N:
18 |             name = random.choice(NAMES)
19 |             product = random.choice(PRODUCTS)
20 |             price = str(0.99)
21 | 
22 |             year = random.choice(['2014', '2015'])
23 |             month = str(random.choice(range(12)) + 1)
24 |             if len(month) == 1:
25 |                 month = "0" + month
26 |             day = str(random.choice(range(28)) + 1)
27 |             if len(day) == 1:
28 |                 day = "0" + day
29 |             dt = "%s-%s-%s 00:00:00" % (year, month, day)
30 | 
31 |             f.write("%s,%s,%s,%s,%s\n" % (name, product, 'purchase', price, dt))
32 |             i += 1
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     generate(1000000, 'dt_actions.csv')
37 | 


--------------------------------------------------------------------------------
/generate_users.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | import sys
 8 | import random
 9 | 
10 | from generate_actions import NAMES
11 | 
12 | LOCATIONS = ['Los Angeles', 'New York', 'Chicago', 'Las Vegas']
13 | 
14 | def generate():
15 |     with open('users.csv', 'w') as f:
16 |         for name in NAMES:
17 |             location = random.choice(LOCATIONS)
18 |             age = random.randint(18, 65)
19 |             f.write("%s,%s,%s\n" % (name, str(age), location))
20 | 
21 | 
22 | if __name__ == '__main__':
23 |     generate()
24 | 


--------------------------------------------------------------------------------
/ordered_items.tsv:
--------------------------------------------------------------------------------
 1 | customerid	order_date	item	quantity	price
 2 | 10330	30-Jun-1999	Pogo stick	1	28.00
 3 | 10101	30-Jun-1999	Raft	1	58.00
 4 | 10298	01-Jul-1999	Skateboard	1	33.00
 5 | 10101	01-Jul-1999	Life Vest	4	125.00
 6 | 10299	06-Jul-1999	Parachute	1	1250.00
 7 | 10339	27-Jul-1999	Umbrella	1	4.50
 8 | 10449	13-Aug-1999	Unicycle	1	180.79
 9 | 10439	14-Aug-1999	Ski Poles	2	25.50
10 | 10101	18-Aug-1999	Rain Coat	1	18.30
11 | 10449	01-Sep-1999	Snow Shoes	1	45.00
12 | 10439	18-Sep-1999	Tent	1	88.00
13 | 10298	19-Sep-1999	Lantern	2	29.00
14 | 10410	28-Oct-1999	Sleeping Bag	1	89.22
15 | 10438	01-Nov-1999	Umbrella	1	6.75
16 | 10438	02-Nov-1999	Pillow	1	8.50
17 | 10298	01-Dec-1999	Helmet	1	22.00
18 | 10449	15-Dec-1999	Bicycle	1	380.50
19 | 10449	22-Dec-1999	Canoe	1	280.00
20 | 10101	30-Dec-1999	Hoola Hoop	3	14.75
21 | 10330	01-Jan-2000	Flashlight	4	28.00
22 | 10101	02-Jan-2000	Lantern	1	16.00
23 | 10299	18-Jan-2000	Inflatable Mattress	1	38.00
24 | 10438	18-Jan-2000	Tent	1	79.99
25 | 10413	19-Jan-2000	Lawnchair	4	32.00
26 | 10410	30-Jan-2000	Unicycle	1	192.50
27 | 10315	2-Feb-2000	Compass	1	8.00
28 | 10449	29-Feb-2000	Flashlight	1	4.50
29 | 10101	08-Mar-2000	Sleeping Bag	2	88.70
30 | 10298	18-Mar-2000	Pocket Knife	1	22.38
31 | 10449	19-Mar-2000	Canoe paddle	2	40.00
32 | 10298	01-Apr-2000	Ear Muffs	1	12.50
33 | 10330	19-Apr-2000	Shovel	1	16.75


--------------------------------------------------------------------------------
/small_actions.csv:
--------------------------------------------------------------------------------
 1 | Gina,Orange,view,0.99
 2 | Dave,Apricot,addtocart,0.99
 3 | Gina,Mango,purchase,0.99
 4 | Carol,Banana,purchase,0.99
 5 | Gina,Grape,view,0.99
 6 | Gina,Banana,view,0.99
 7 | Alice,Cherry,addtocart,0.99
 8 | Carol,Mango,addtocart,0.99
 9 | Alice,Apricot,purchase,0.99
10 | Gina,Cherry,purchase,0.99
11 | 


--------------------------------------------------------------------------------
/spark.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | from pyspark import SparkContext
 8 | from pyspark.sql import SQLContext
 9 | 
10 | sc = SparkContext("local", "Simple App")
11 | sqlContext = SQLContext(sc)
12 | 
13 | df = sqlContext.read.json("data.json")
14 | 
15 | # Displays the content of the DataFrame to stdout
16 | df.show()


--------------------------------------------------------------------------------
/spark2.py:
--------------------------------------------------------------------------------
 1 | # This file was made for the class:
 2 | # SQL for Marketers: Dominate data analytics, data science, and big data
 3 | #
 4 | # It can be found at:
 5 | # https://udemy.com/sql-for-marketers-data-analytics-data-science-big-data
 6 | 
 7 | from pyspark import SparkContext
 8 | from pyspark.sql import SQLContext
 9 | 
10 | sc = SparkContext("local", "Simple App")
11 | sqlContext = SQLContext(sc)
12 | 
13 | df = sqlContext.read.json("big_actions.json")
14 | df.registerTempTable("user_actions")
15 | 
16 | df2 = sqlContext.sql("SELECT COUNT(*), product FROM user_actions WHERE action = 'purchase' GROUP BY product")
17 | 
18 | df2.show()
19 | 


--------------------------------------------------------------------------------
/users.csv:
--------------------------------------------------------------------------------
1 | Alice,29,Chicago
2 | Bob,24,New York
3 | Carol,39,New York
4 | Dave,27,New York
5 | Emily,30,Los Angeles
6 | Frank,36,Los Angeles
7 | Gina,25,Las Vegas
8 | 


--------------------------------------------------------------------------------