├── 10 ├── images │ ├── Complex_NoProc_V3.jpg │ └── cnn_arch.png ├── requirements.txt ├── tutorial_credit_scoring.ipynb └── tutorial_medical_expenditure.ipynb ├── .gitignore ├── 01 ├── git-setup.txt ├── java-maven-setup.txt ├── python-example │ ├── digits.py │ ├── instructions.txt │ └── requirements.txt └── spark-example │ ├── instructions.txt │ ├── pom.xml │ └── src │ └── main │ └── scala │ └── edu │ └── nyu │ └── deml │ └── RunSparkLocal.scala ├── 02 ├── adult-sample.csv ├── instructions.txt ├── requirements.txt └── sklearn-pipelines.ipynb ├── 03 ├── data │ ├── eval │ │ └── data.csv │ ├── serving │ │ └── data.csv │ └── train │ │ └── data.csv ├── google-tfx.ipynb └── requirements.txt ├── 04 ├── Apache-Beam-Intro.ipynb ├── kinglear.txt └── requirements.txt ├── 05 ├── requirements.txt └── simple.ipynb ├── 06 ├── census.ipynb ├── oop.ipynb └── requirements.txt ├── 07 ├── Serving_REST_simple.ipynb └── requirements.txt ├── 08 ├── keras_train.ipynb ├── requirements.txt └── sklearn_train.ipynb ├── 09 ├── datawig.ipynb ├── dirty_debs2015_1k.csv ├── openrefine.txt ├── products.csv └── requirements.txt ├── assignment1 ├── adult-sample.csv ├── components │ ├── constraints.py │ ├── learned_imputer.py │ └── trainer.py ├── instructions.md ├── products.csv ├── requirements.txt ├── task1.py ├── task2.py └── task3.py ├── assignment2 ├── README.md ├── components │ ├── beam_job.py │ ├── linear_regression.py │ ├── mapreduce.py │ └── schema_validation.py ├── data.zip ├── data │ ├── products-data-0.tsv │ ├── products-data-1.tsv │ ├── products-data-2.tsv │ ├── products-data-3.tsv │ ├── ratings-0.tsv │ ├── ratings-1.tsv │ ├── ratings-2.tsv │ └── ratings-3.tsv ├── regression.png ├── requirements-catalina.txt ├── requirements.txt ├── task1.py ├── task1_colab.ipynb ├── task2.py ├── task2_colab.ipynb ├── task3.py ├── task4.py └── utils.py ├── assignment3 ├── README.md ├── adult-sample.csv ├── components │ ├── graph.py │ └── neuralnetwork.py ├── graphs.png ├── moon.png ├── network.png ├── 
requirements.txt ├── task1.py ├── task2.py ├── task3.py └── task4.py ├── extra-assignment └── README.md ├── project-paper ├── acmart.cls ├── projectpaper.pdf └── projectpaper.tex └── project-resources ├── README.md └── partitioned-data.zip /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.iml 2 | **/target/ 3 | **/.idea 4 | **/venv 5 | **/.ipynb_checkpoints/ 6 | **/__pycache__ 7 | **/*.out 8 | **/*.blg 9 | **/*.swp 10 | **/*.bbl 11 | **/*.fdb_latexmk 12 | **/*.log 13 | **/*.fls 14 | **/*.synctex.gz 15 | **/*.aux 16 | 17 | -------------------------------------------------------------------------------- /01/git-setup.txt: -------------------------------------------------------------------------------- 1 | # If you use SSH to authenticate with github 2 | git clone git@github.com:schelterlabs/deml-.git 3 | # Otherwise 4 | git clone https://github.com/schelterlabs/deml-.git 5 | 6 | # Add the tasks and assignments repository as remote 7 | cd deml- 8 | 9 | # If you use SSH to authenticate with github 10 | git remote add lab-tasks-repo git@github.com:schelterlabs/deml-lab.git 11 | # Otherwise 12 | git remote add lab-tasks-repo https://github.com/schelterlabs/deml-lab.git 13 | 14 | git remote set-url lab-tasks-repo --push DISABLE 15 | 16 | # Setup the repository 17 | git fetch lab-tasks-repo 18 | git checkout -b lab-tasks lab-tasks-repo/master 19 | git checkout master 20 | git push -u origin 21 | -------------------------------------------------------------------------------- /01/java-maven-setup.txt: -------------------------------------------------------------------------------- 1 | # Check if installed 2 | mvn -version 3 | java -version 4 | 5 | # If not, download from maven, extract it and move it from downloads to application 6 | mv apache-maven-3.5.4 /Applications/ 7 | 8 | # For JAVA, download from Google Drive and follow instructions 9 | 10 | 11 | # for Mac users. 
Return to your home user directory, 12 | # and use your favorite editor to open .bash_profile. 13 | # If there isn't one, create one. 14 | 15 | vi .bash_profile 16 | 17 | ## add the following JAVA path to your bash file. 18 | export JAVA_HOME=$(/usr/libexec/java_home) 19 | 20 | ## add the following MAVEN path to your bash file. 21 | export M2_HOME=/Applications/apache-maven-3.6.2 22 | export PATH=$PATH:$M2_HOME/bin 23 | 24 | # source the bash file. 25 | source .bash_profile 26 | 27 | # test that the path is successfully set. 28 | echo $JAVA_HOME 29 | 30 | # or for Maven 31 | echo $M2_HOME 32 | 33 | # something like this should be returned 34 | /Library/Java/JavaVirtualMachines/jdk1.8.0_221.jdk/Contents/Home 35 | -------------------------------------------------------------------------------- /01/python-example/digits.py: -------------------------------------------------------------------------------- 1 | # Import datasets, classifiers and performance metrics 2 | from sklearn import datasets, svm, metrics 3 | 4 | # The digits dataset 5 | digits = datasets.load_digits() 6 | 7 | # To apply a classifier on this data, we need to flatten the image, to 8 | # turn the data in a (samples, feature) matrix: 9 | n_samples = len(digits.images) 10 | data = digits.images.reshape((n_samples, -1)) 11 | 12 | # Create a classifier: a support vector classifier 13 | classifier = svm.SVC(gamma=0.001) 14 | 15 | # We learn the digits on the first half of the digits 16 | classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2]) 17 | 18 | # Now predict the value of the digit on the second half: 19 | expected = digits.target[n_samples // 2:] 20 | predicted = classifier.predict(data[n_samples // 2:]) 21 | 22 | print("Classification report for classifier %s:\n%s\n" 23 | % (classifier, metrics.classification_report(expected, predicted))) 24 | print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) 25 | 26 | 
-------------------------------------------------------------------------------- /01/python-example/instructions.txt: -------------------------------------------------------------------------------- 1 | python3.6 -m venv venv 2 | source venv/bin/activate 3 | pip install -r requirements.txt 4 | python digits.py 5 | 6 | -------------------------------------------------------------------------------- /01/python-example/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn == 0.21.3 2 | 3 | -------------------------------------------------------------------------------- /01/spark-example/instructions.txt: -------------------------------------------------------------------------------- 1 | mvn scala:run -DmainClass=edu.nyu.deml.RunSparkLocal 2 | 3 | -------------------------------------------------------------------------------- /01/spark-example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | edu.nyu.deml 8 | lab01 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 1.8 13 | 1.8 14 | UTF-8 15 | 2.11 16 | ${scala.major.version}.5 17 | 3.4.4 18 | 19 | 20 | 21 | 22 | org.scala-lang 23 | scala-library 24 | ${scala.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_${scala.major.version} 30 | 2.2.2 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-sql_${scala.major.version} 36 | 2.2.2 37 | 38 | 39 | 40 | 41 | src/main/scala 42 | 43 | 44 | net.alchim31.maven 45 | scala-maven-plugin 46 | ${scala-maven-plugin.version} 47 | 48 | ${scala.major.version} 49 | ${scala.version} 50 | 51 | 52 | 53 | scala-compile-first 54 | process-resources 55 | 56 | add-source 57 | compile 58 | 59 | 60 | 61 | scala-test-compile 62 | process-test-resources 63 | 64 | testCompile 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-surefire-plugin 74 | 2.7 75 | 76 | true 77 | 78 | 79 | 80 | 81 | 
-------------------------------------------------------------------------------- /01/spark-example/src/main/scala/edu/nyu/deml/RunSparkLocal.scala: -------------------------------------------------------------------------------- 1 | package edu.nyu.deml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RunSparkLocal extends App { 6 | 7 | case class Item(id: Long, productName: String, description: String, priority: String, numViews: Long) 8 | 9 | withSpark { session => 10 | 11 | val rdd = session.sparkContext.parallelize(Seq( 12 | Item(1, "Thingy A", "awesome thing.", "high", 0), 13 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 14 | Item(3, null, null, "low", 5), 15 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 16 | Item(5, "Thingy E", null, "high", 12)), 17 | numSlices = 2) 18 | 19 | val data = session.createDataFrame(rdd) 20 | 21 | val count = data.count() 22 | 23 | println(s"$count items found.") 24 | } 25 | 26 | 27 | 28 | 29 | 30 | def withSpark(func: SparkSession => Unit): Unit = { 31 | 32 | val session = SparkSession.builder() 33 | .master("local") 34 | .appName("test") 35 | .config("spark.ui.enabled", "false") 36 | .config("spark.sql.shuffle.partitions", 2.toString) 37 | .getOrCreate() 38 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir")) 39 | 40 | try { 41 | func(session) 42 | } finally { 43 | session.stop() 44 | System.clearProperty("spark.driver.port") 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /02/adult-sample.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year 2 | 28,Private,273269,Some-college,10,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,40,United-States,<=50K 3 | 
58,State-gov,123329,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,16,United-States,<=50K 4 | 34,Private,79637,Bachelors,13,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,0,0,40,United-States,<=50K 5 | 71,Private,97870,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K 6 | 20,State-gov,41103,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 7 | 46,Private,125492,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Black,Female,0,0,40,United-States,<=50K 8 | 31,Private,467579,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,1887,40,United-States,>50K 9 | 24,Private,376393,Assoc-voc,11,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 10 | 21,Private,56582,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,50,United-States,<=50K 11 | 38,Private,76317,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 12 | 43,Federal-gov,144778,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,>50K 13 | 47,Private,454989,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K 14 | 23,Private,278254,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,45,United-States,<=50K 15 | 38,Private,111499,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,50,United-States,>50K 16 | 31,Private,168521,Bachelors,13,Never-married,Exec-managerial,Unmarried,White,Female,0,0,50,United-States,<=50K 17 | 36,Private,749636,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 18 | 27,Private,167405,HS-grad,9,Married-spouse-absent,Farming-fishing,Own-child,White,Female,0,0,40,Mexico,<=50K 19 | 32,Private,317378,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,10520,0,40,United-States,>50K 20 | 
55,State-gov,71630,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,1617,40,United-States,<=50K 21 | 33,Private,182401,10th,6,Never-married,Adm-clerical,Not-in-family,Black,Male,0,0,40,United-States,<=50K 22 | 21,Private,33616,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,25,United-States,<=50K 23 | 25,Private,362912,Some-college,10,Never-married,Craft-repair,Own-child,White,Female,0,0,50,United-States,<=50K 24 | 28,Private,34335,HS-grad,9,Divorced,Sales,Not-in-family,Amer-Indian-Eskimo,Male,14084,0,40,United-States,>50K 25 | 51,Private,305147,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 26 | 26,Private,50103,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 27 | 62,State-gov,221558,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,24,?,<=50K 28 | 37,Private,138940,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 29 | 55,Self-emp-not-inc,52888,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,10,United-States,<=50K 30 | 46,Local-gov,125457,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,38,United-States,>50K 31 | 47,Private,102771,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Portugal,<=50K 32 | 60,?,41517,11th,7,Married-spouse-absent,?,Unmarried,Black,Female,0,0,20,United-States,<=50K 33 | 34,Private,153614,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K 34 | 32,Local-gov,157887,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 35 | 35,Private,308691,Masters,14,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States,<=50K 36 | 48,Self-emp-inc,238966,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K 37 | 
67,Private,123393,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 38 | 48,Private,25468,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,50,United-States,>50K 39 | 30,Private,117393,HS-grad,9,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 40 | 40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 41 | 58,Private,259014,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,20,United-States,<=50K 42 | 19,?,134974,Some-college,10,Never-married,?,Own-child,White,Female,0,0,20,United-States,<=50K 43 | 25,Private,49092,Bachelors,13,Never-married,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K 44 | 33,Local-gov,224185,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 45 | 43,Private,136721,12th,8,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 46 | 37,Private,314963,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 47 | 21,State-gov,337766,Some-college,10,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States,<=50K 48 | 51,Self-emp-not-inc,111939,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,35,United-States,>50K 49 | 43,Private,151089,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,<=50K 50 | 49,Private,120629,Bachelors,13,Divorced,Exec-managerial,Not-in-family,Black,Female,27828,0,60,United-States,>50K 51 | 38,Local-gov,201410,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K 52 | 49,Private,61307,7th-8th,4,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,38,United-States,<=50K 53 | 36,Private,135289,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States,>50K 54 | 
36,Self-emp-not-inc,89622,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,80,United-States,>50K 55 | 21,Private,216070,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Amer-Indian-Eskimo,Female,0,0,46,United-States,>50K 56 | 42,Private,138662,Some-college,10,Separated,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K 57 | 35,Private,385847,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 58 | 20,Private,189148,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,48,United-States,<=50K 59 | 22,Private,252355,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,27,United-States,<=50K 60 | 46,Private,243743,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K 61 | 33,Private,290763,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 62 | 23,?,99399,Some-college,10,Never-married,?,Unmarried,Amer-Indian-Eskimo,Female,0,0,25,United-States,<=50K 63 | 44,Private,160829,Bachelors,13,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,20,United-States,>50K 64 | 46,Local-gov,329752,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,30,United-States,<=50K 65 | 52,Private,117674,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 66 | 20,?,150084,Some-college,10,Never-married,?,Own-child,White,Male,0,0,25,United-States,<=50K 67 | 49,State-gov,203039,11th,7,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 68 | 38,Private,210438,7th-8th,4,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K 69 | 29,Private,163265,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 70 | 18,Private,43272,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K 71 | 
54,Self-emp-not-inc,103179,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K 72 | 48,Private,449354,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,4386,0,45,United-States,>50K 73 | 29,Private,297544,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 74 | 30,Private,161690,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K 75 | 31,Local-gov,219883,HS-grad,9,Never-married,Protective-serv,Not-in-family,Black,Male,0,0,40,United-States,<=50K 76 | 40,Federal-gov,121012,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298,0,48,United-States,>50K 77 | 32,Private,207172,Some-college,10,Never-married,Sales,Other-relative,White,Female,0,0,40,United-States,<=50K 78 | 47,Private,148995,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,2415,60,United-States,>50K 79 | 19,Private,292590,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 80 | 45,Private,274657,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,?,<=50K 81 | 49,Private,189498,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 82 | 18,Private,25837,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,25,United-States,<=50K 83 | 33,State-gov,306309,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,<=50K 84 | 48,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 85 | 30,Local-gov,289442,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K 86 | 55,Private,89690,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,>50K 87 | 47,Self-emp-not-inc,237731,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,2829,0,65,United-States,<=50K 88 | 
72,?,402306,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,32,Canada,<=50K 89 | 27,Private,119793,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 90 | 44,Private,116358,Bachelors,13,Married-civ-spouse,Sales,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K 91 | 23,Private,55215,Bachelors,13,Never-married,Sales,Own-child,White,Male,0,0,55,United-States,<=50K 92 | 33,Private,184784,10th,6,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K 93 | 43,Private,269015,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,40,Germany,>50K 94 | 46,Private,146919,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K 95 | 90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K 96 | 34,Private,19847,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 97 | 32,Private,108116,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,60,United-States,>50K 98 | 42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K 99 | 18,Private,333611,5th-6th,3,Never-married,Other-service,Other-relative,White,Male,0,0,54,Mexico,<=50K 100 | 25,Private,50053,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,Japan,<=50K 101 | 28,Private,119287,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,28,United-States,>50K 102 | -------------------------------------------------------------------------------- /02/instructions.txt: -------------------------------------------------------------------------------- 1 | # Update your repo 2 | git checkout lab-tasks 3 | git pull 4 | git checkout master 5 | git merge lab-tasks 6 | 7 | # Start the jupyter notebook 8 | cd 02 9 | python3.6 -m venv venv 10 | source venv/bin/activate 11 | pip install -r requirements.txt 12 | jupyter notebook 13 | 14 | 
15 | -------------------------------------------------------------------------------- /02/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | 5 | -------------------------------------------------------------------------------- /03/data/serving/data.csv: -------------------------------------------------------------------------------- 1 | pickup_community_area,fare,trip_start_month,trip_start_hour,trip_start_day,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_miles,pickup_census_tract,dropoff_census_tract,payment_type,company,trip_seconds,dropoff_community_area 2 | 8,6.45,9,17,4,1441213200,41.892072635,-87.628874157,41.880994471,-87.632746489,0.9,,17031839100,Credit Card,Taxi Affiliation Services,540,32 3 | 8,8.05,10,20,7,1414269000,41.899155613,-87.626210532,41.880994471,-87.632746489,0.0,,17031839100,Cash,Taxi Affiliation Services,840,32 4 | 32,7.65,3,20,2,1395087300,41.880994471,-87.632746489,41.877406123,-87.621971652,1.9,,17031320400,Cash,,480,32 5 | 8,36.05,6,17,7,1370713500,41.890922026,-87.618868355,41.97907082,-87.903039661,17.4,,17031980000,Cash,Dispatch Taxi Affiliation,2280,76 6 | 32,4.45,4,12,3,1398775500,41.880994471,-87.632746489,41.880994471,-87.632746489,0.3,,17031839100,Cash,,240,32 7 | 8,36.45,10,8,6,1413532800,41.892507781,-87.626214906,41.97907082,-87.903039661,17.5,,17031980000,Credit Card,,2040,76 8 | 76,37.05,3,23,4,1394667000,41.97907082,-87.903039661,41.898331794,-87.620762865,0.0,,17031081300,Cash,Choice Taxi Association,1680,8 9 | 28,6.25,10,9,5,1380791700,41.879255084,-87.642648998,41.880994471,-87.632746489,0.9,,17031839100,Cash,,480,32 10 | 8,8.44,10,8,1,1382862600,41.899602111,-87.633308037,41.944226601,-87.655998182,2.8,,,Cash,,360,6 11 | 32,8.85,7,19,5,1406834100,41.884987192,-87.620992913,41.851017824,-87.635091856,0.0,,17031841100,Cash,Taxi Affiliation Services,720,34 12 | 
28,7.25,11,20,4,1448481600,41.874005383,-87.66351755,41.874005383,-87.66351755,1.9,,,Cash,Taxi Affiliation Services,480,28 13 | 8,10.5,4,1,7,1461979800,41.90749193,-87.63576009,41.884987192,-87.620992913,2.7,,17031320100,Credit Card,,720,32 14 | 32,3.65,8,10,1,1407060900,41.880994471,-87.632746489,41.880994471,-87.632746489,0.0,,17031839100,Cash,,60,32 15 | 32,12.5,10,19,4,1477511100,41.880994471,-87.632746489,41.899155613,-87.626210532,1.8,,17031081201,Credit Card,Dispatch Taxi Affiliation,1440,8 16 | 8,10.45,5,18,2,1399919400,41.892042136,-87.63186395,41.884987192,-87.620992913,0.1,,17031320100,Cash,Taxi Affiliation Services,960,32 17 | 28,4.65,2,16,3,1360686600,41.879255084,-87.642648998,,,0.65,,,Cash,,240, 18 | 76,40.05,11,17,5,1384451100,41.97907082,-87.903039661,41.880994471,-87.632746489,1.0,,17031839100,Cash,Taxi Affiliation Services,4440,32 19 | 28,3.25,11,12,6,1383308100,41.88528132,-87.6572332,41.88528132,-87.6572332,0.0,,17031833000,Cash,,0,28 20 | 8,7.05,12,15,6,1417793400,41.900221297,-87.629105186,41.892072635,-87.628874157,1.0,,17031081600,Cash,Dispatch Taxi Affiliation,660,8 21 | 32,5.05,1,19,3,1421781300,41.880994471,-87.632746489,41.879255084,-87.642648998,0.0,,17031281900,Credit Card,Blue Ribbon Taxi Association Inc.,300,28 22 | 8,21.45,9,20,5,1441310400,41.899602111,-87.633308037,41.954027649,-87.763399032,0.5,,,Credit Card,Taxi Affiliation Services,1080,15 23 | 3,7.85,5,22,7,1433023200,41.972437081,-87.671109526,41.946294536,-87.654298084,1.9,,17031832000,Cash,Dispatch Taxi Affiliation,600,6 24 | 32,24.85,9,11,6,1443181500,41.880994471,-87.632746489,41.785998518,-87.750934289,11.0,,17031980100,Credit Card,,1440,56 25 | 32,7.25,11,1,1,1446339600,41.877406123,-87.621971652,41.89321636,-87.63784421,0.0,,17031081800,Credit Card,Taxi Affiliation Services,540,8 26 | 32,6.65,11,18,2,1383590700,41.880994471,-87.632746489,41.899155613,-87.626210532,1.1,,17031081201,Credit Card,Northwest Management LLC,540,8 27 | 
34,15.05,1,22,2,1420495200,41.842076117,-87.633973422,41.775928827,-87.666596265,6.1,,,Unknown,Taxi Affiliation Services,1200,67 28 | 32,36.65,10,14,6,1445610600,41.880994471,-87.632746489,41.97907082,-87.903039661,17.3,,17031980000,Credit Card,Taxi Affiliation Services,2640,76 29 | 8,5.45,3,19,1,1364758200,41.899155613,-87.626210532,41.907412816,-87.640901525,0.5,,17031080400,Credit Card,Dispatch Taxi Affiliation,300,8 30 | 28,35.25,1,16,3,1420560900,41.885300022,-87.642808466,41.97907082,-87.903039661,17.1,,17031980000,Credit Card,Taxi Affiliation Services,1800,76 31 | 8,5.65,5,20,6,1399060800,41.892507781,-87.626214906,41.880994471,-87.632746489,0.4,,17031839100,Cash,Dispatch Taxi Affiliation,480,32 32 | 24,8.05,6,21,4,1433972700,41.901206994,-87.676355989,41.878865584,-87.625192142,2.13,,,Credit Card,,480,32 33 | 28,5.65,1,11,3,1358249400,41.879255084,-87.642648998,41.877406123,-87.621971652,0.0,,17031320400,Cash,Dispatch Taxi Affiliation,360,32 34 | 33,9.05,8,11,6,1438947900,41.859349715,-87.617358006,41.892042136,-87.63186395,2.3,,17031081700,Credit Card,,660,8 35 | 32,7.25,10,15,5,1446131700,41.880994471,-87.632746489,41.89503345,-87.619710672,1.4,,17031081401,Cash,Choice Taxi Association,540,8 36 | 6,6.25,10,2,2,1413167400,41.944226601,-87.655998182,41.922686284,-87.649488729,0.0,,,Cash,Taxi Affiliation Services,360,7 37 | 8,7.0,2,9,1,1454835600,41.899602111,-87.633308037,41.899602111,-87.633308037,0.0,,,Cash,Choice Taxi Association,360,8 38 | 28,4.65,5,19,4,1401303600,41.885300022,-87.642808466,41.879255084,-87.642648998,0.6,,17031281900,Cash,Taxi Affiliation Services,180,28 39 | 7,3.25,6,20,3,1435090500,41.922082541,-87.634156093,,,0.03,,,Cash,,0, 40 | 6,5.45,9,2,7,1443234600,41.944226601,-87.655998182,41.944226601,-87.655998182,0.0,,,Credit Card,Blue Ribbon Taxi Association Inc.,300,6 41 | 8,6.0,6,17,7,1465062300,41.89321636,-87.63784421,41.877406123,-87.621971652,0.0,,17031320400,Cash,Blue Ribbon Taxi Association Inc.,300,32 42 | 
56,26.85,4,22,1,1365976800,41.785998518,-87.750934289,41.892072635,-87.628874157,0.0,,17031081600,Cash,Choice Taxi Association,1380,8 43 | 32,7.85,2,13,6,1424437200,41.880994471,-87.632746489,41.898331794,-87.620762865,2.1,,17031081300,Cash,,540,8 44 | 32,12.85,12,22,3,1450217700,41.878865584,-87.625192142,41.944226601,-87.655998182,4.7,,,Cash,,780,6 45 | 76,28.65,6,13,5,1434029400,41.980264315,-87.913624596,41.944226601,-87.655998182,0.0,,,Credit Card,Taxi Affiliation Services,2100,6 46 | 4,11.65,4,19,3,1396380600,41.975170943,-87.687515515,41.922686284,-87.649488729,0.2,,,Credit Card,Taxi Affiliation Services,1080,7 47 | 8,8.05,5,11,3,1401188400,41.89321636,-87.63784421,41.880994471,-87.632746489,1.8,,17031839100,Cash,,660,32 48 | 28,4.65,9,17,2,1378746900,41.885300022,-87.642808466,41.879255084,-87.642648998,0.0,,17031281900,Cash,Taxi Affiliation Services,240,28 49 | 3,14.05,2,6,3,1360046700,41.96581197,-87.655878786,41.899602111,-87.633308037,0.0,,,Unknown,Taxi Affiliation Services,540,8 50 | 13,3.25,1,15,7,1421509500,41.983636307,-87.723583185,41.983636307,-87.723583185,0.0,,,Cash,,0,13 51 | 8,34.65,3,19,1,1396206000,41.890922026,-87.618868355,41.97907082,-87.903039661,17.4,,17031980000,Cash,,1380,76 52 | 8,5.05,12,2,1,1450576800,41.902788048,-87.62614559,41.892507781,-87.626214906,0.0,,17031081500,Credit Card,Taxi Affiliation Services,300,8 53 | 8,4.25,4,20,4,1398285900,41.892072635,-87.628874157,41.898331794,-87.620762865,0.0,,17031081300,Credit Card,Blue Ribbon Taxi Association Inc.,120,8 54 | 77,11.45,9,14,4,1378908000,41.9867118,-87.663416405,41.944226601,-87.655998182,0.0,,,No Charge,Northwest Management LLC,540,6 55 | 8,9.45,10,23,5,1382657400,41.892042136,-87.63186395,41.89830587,-87.653613982,0.0,,17031842300,Cash,Taxi Affiliation Services,720,24 56 | 7,7.45,5,20,5,1400791500,41.914616286,-87.631717366,41.928967266,-87.656156831,1.6,,17031070400,Cash,,540,7 57 | 
32,16.25,3,20,2,1364244300,41.870607372,-87.622172937,41.928967266,-87.656156831,0.0,,17031070400,Cash,Choice Taxi Association,1440,7 58 | 8,5.85,11,15,2,1416238200,41.892507781,-87.626214906,41.880994471,-87.632746489,1.2,,17031839100,Credit Card,Dispatch Taxi Affiliation,300,32 59 | 8,5.5,1,10,2,1451901600,41.892507781,-87.626214906,41.89503345,-87.619710672,0.5,,17031081401,Cash,KOAM Taxi Association,300,8 60 | 32,5.25,6,13,6,1435325400,41.880994471,-87.632746489,41.880994471,-87.632746489,0.5,,17031839100,Credit Card,KOAM Taxi Association,300,32 61 | 6,5.45,7,1,7,1373677200,41.942691844,-87.651770507,41.936237179,-87.656411531,0.8,,17031062900,Credit Card,Choice Taxi Association,360,6 62 | 32,4.65,8,17,6,1408727700,41.880994471,-87.632746489,41.892042136,-87.63186395,0.6,,17031081700,Cash,,180,8 63 | 32,4.05,1,12,3,1390305600,41.884987192,-87.620992913,41.884987192,-87.620992913,0.4,,17031320100,Cash,,120,32 64 | 6,10.75,8,1,1,1471138200,41.945282331,-87.661545096,41.936237179,-87.656411531,2.8,,17031062900,Credit Card,,780,6 65 | 28,8.45,8,0,7,1438993800,41.885300022,-87.642808466,41.902788048,-87.62614559,2.1,,17031081202,Cash,Taxi Affiliation Services,720,8 66 | 7,5.25,9,19,7,1442690100,41.914747305,-87.654007029,41.929046937,-87.651310877,0.9,,17031070300,Cash,Taxi Affiliation Services,300,7 67 | 6,9.25,1,0,7,1422060300,41.944226601,-87.655998182,41.899602111,-87.633308037,0.1,,,Cash,Taxi Affiliation Services,480,8 68 | 5,11.05,4,0,7,1398470400,41.947791586,-87.683834942,41.983636307,-87.723583185,0.2,,,Cash,Taxi Affiliation Services,840,13 69 | 32,5.85,10,8,2,1382343300,41.880994471,-87.632746489,41.89503345,-87.619710672,1.1,,17031081401,Credit Card,,300,8 70 | 32,13.05,8,23,4,1408576500,41.884987192,-87.620992913,41.942577185,-87.647078509,0.3,,17031062000,Cash,Taxi Affiliation Services,660,6 71 | 33,15.45,9,20,3,1410899400,41.859349715,-87.617358006,41.93057857,-87.642206313,5.54,,17031070102,Cash,,900,7 72 | 
76,45.85,5,15,2,1401118200,41.97907082,-87.903039661,41.880994471,-87.632746489,19.7,,17031839100,Credit Card,Taxi Affiliation Services,4200,32 73 | 32,17.05,3,18,4,1425492000,41.880994471,-87.632746489,41.938391258,-87.63857492,0.0,,17031063200,Credit Card,Taxi Affiliation Services,1440,6 74 | 6,19.45,4,19,5,1366916400,41.944226601,-87.655998182,41.850266366,-87.667569312,7.2,,,Cash,,1680,31 75 | 8,4.65,5,21,3,1431467100,41.892507781,-87.626214906,41.884987192,-87.620992913,0.59,,17031320100,Credit Card,,180,32 76 | 32,7.25,8,11,3,1408448700,41.884987192,-87.620992913,41.879255084,-87.642648998,1.0,,17031281900,Cash,,660,28 77 | 8,12.05,5,21,4,1401311700,41.891971508,-87.612945414,41.936237179,-87.656411531,0.2,,17031062900,Cash,Taxi Affiliation Services,720,6 78 | 24,6.85,6,11,5,1372330800,41.901206994,-87.676355989,41.878865584,-87.625192142,0.0,,,Credit Card,Taxi Affiliation Services,420,32 79 | 7,8.85,9,11,4,1378899000,41.914747305,-87.654007029,41.879255084,-87.642648998,2.5,,17031281900,Cash,3201 - CID Cab Co Inc,600,28 80 | 32,6.25,3,15,5,1395933300,41.880994471,-87.632746489,41.89321636,-87.63784421,1.1,,17031081800,Credit Card,,420,8 81 | 8,7.05,9,9,2,1410773400,41.89321636,-87.63784421,41.880994471,-87.632746489,1.3,,17031839100,Cash,KOAM Taxi Association,480,32 82 | 7,8.65,2,13,7,1361022300,41.922082541,-87.634156093,41.890922026,-87.618868355,0.0,,17031081403,Cash,Taxi Affiliation Services,540,8 83 | 28,12.65,9,20,7,1379189700,41.874005383,-87.66351755,41.92276062,-87.699155343,0.0,,,No Charge,Choice Taxi Association,1140,22 84 | 7,3.25,5,18,4,1432144800,41.928967266,-87.656156831,41.928967266,-87.656156831,0.0,,17031070400,Cash,Blue Ribbon Taxi Association Inc.,60,7 85 | 8,4.45,8,21,2,1439847900,41.898331794,-87.620762865,41.892507781,-87.626214906,0.5,,17031081500,Cash,Taxi Affiliation Services,240,8 86 | 7,7.45,5,2,1,1431224100,41.914616286,-87.631717366,41.916005274,-87.675095116,2.2,,17031831000,Cash,Taxi Affiliation Services,420,22 87 | 
7,10.05,3,13,1,1394975700,41.914616286,-87.631717366,41.879255084,-87.642648998,3.0,,17031281900,Cash,Choice Taxi Association,840,28 88 | 28,9.25,5,14,6,1400856300,41.874005383,-87.66351755,41.901206994,-87.676355989,2.4,,,Cash,Taxi Affiliation Services,720,24 89 | 32,4.85,5,8,3,1400573700,41.880994471,-87.632746489,41.892507781,-87.626214906,0.7,,17031081500,Cash,Dispatch Taxi Affiliation,240,8 90 | 8,4.05,10,2,2,1444615200,41.892072635,-87.628874157,41.89321636,-87.63784421,0.6,,17031081800,Cash,Dispatch Taxi Affiliation,120,8 91 | 8,7.5,10,23,7,1475969400,41.904935302,-87.649907226,41.926811182,-87.642605247,1.6,,17031070103,Cash,Taxi Affiliation Services,540,7 92 | 76,36.85,1,22,1,1422223200,41.97907082,-87.903039661,41.884987192,-87.620992913,18.1,,17031320100,Credit Card,,1560,32 93 | 8,37.45,2,10,1,1393150500,41.898331794,-87.620762865,41.97907082,-87.903039661,0.0,,17031980000,Credit Card,Taxi Affiliation Services,1560,76 94 | 32,6.0,12,23,6,1481325300,41.880994471,-87.632746489,41.880994471,-87.632746489,0.7,,17031839100,Cash,,360,32 95 | 8,5.05,1,11,4,1389179700,41.900265687,-87.63210922,41.898331794,-87.620762865,0.06,,17031081300,Cash,,0,8 96 | 8,12.85,8,22,1,1407708000,41.892042136,-87.63186395,41.943237122,-87.643470956,0.49,,17031061901,Cash,,660,6 97 | 13,30.65,10,18,4,1413396000,41.983636307,-87.723583185,41.922686284,-87.649488729,0.6,,,Cash,Taxi Affiliation Services,3300,7 98 | 32,10.45,1,7,5,1389252600,41.878865584,-87.625192142,41.874005383,-87.66351755,3.6,,,Cash,,600,28 99 | 7,7.25,12,7,1,1387698300,41.922686284,-87.649488729,41.901206994,-87.676355989,0.1,,,Cash,Taxi Affiliation Services,420,24 100 | 32,6.65,5,18,5,1401386400,41.880994471,-87.632746489,41.880994471,-87.632746489,0.61,,17031839100,Cash,,540,32 101 | 32,10.5,1,1,3,1452561300,41.878865584,-87.625192142,41.922686284,-87.649488729,2.9,,,Cash,KOAM Taxi Association,600,7 102 | -------------------------------------------------------------------------------- /03/requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-data-validation==0.13.0 5 | jupyter==1.0.0 6 | 7 | -------------------------------------------------------------------------------- /04/Apache-Beam-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Beam word count example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 9, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from __future__ import absolute_import\n", 17 | "\n", 18 | "import apache_beam as beam\n", 19 | "from apache_beam.io import ReadFromText\n", 20 | "from apache_beam.io import WriteToText\n", 21 | "import re\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "A text file `Read` transform is applied to the `Pipeline` object itself, and produces a `PCollection` as output. Each element in the output PCollection represents one line of text from the input file.\n", 30 | "\n", 31 | "This transform splits the lines in `PCollection`, where each element is an individual word in Shakespeare’s collected texts. 
" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 6, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "'DONE'" 43 | ] 44 | }, 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "p = beam.Pipeline()\n", 52 | "\n", 53 | "lines = p | 'read' >> ReadFromText(\"kinglear.txt\")\n", 54 | "lines | 'write' >> WriteToText(\"copy-of-kinglear.txt\")\n", 55 | "\n", 56 | "result = p.run()\n", 57 | "result.wait_until_finish()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "class ExtractWordsDoFn(beam.DoFn):\n", 67 | " def process(self, element):\n", 68 | " text_line = element.strip()\n", 69 | " words = re.findall(r'[\\w\\']+', text_line, re.UNICODE)\n", 70 | " return words\n", 71 | " \n", 72 | "\n", 73 | "# Count the occurrences of each word.\n", 74 | "def count_ones(word_ones):\n", 75 | " (word, ones) = word_ones\n", 76 | " return (word, sum(ones)) \n", 77 | "\n", 78 | "# Format the counts into a PCollection of strings.\n", 79 | "def format_result(word_count):\n", 80 | " (word, count) = word_count\n", 81 | " return '%s\\t%d' % (word, count)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "'DONE'" 93 | ] 94 | }, 95 | "execution_count": 8, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "# Creating a pipeline\n", 102 | "p = beam.Pipeline()\n", 103 | "\n", 104 | "lines = p | 'read' >> ReadFromText(\"kinglear.txt\")\n", 105 | "\n", 106 | "counts = (lines\n", 107 | " | 'split' >> (beam.ParDo(ExtractWordsDoFn()))\n", 108 | " | 'pair_with_one' >> beam.Map(lambda x: (x, 1))\n", 109 | " | 'group' >> beam.GroupByKey()\n", 110 | " | 'count' >> beam.Map(count_ones))\n", 111 | "\n", 112 | "output = counts | 
'format' >> beam.Map(format_result)\n", 113 | "output | 'write' >> WriteToText(\"counts.txt\")\n", 114 | "\n", 115 | "result = p.run()\n", 116 | "result.wait_until_finish()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 11, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
01
016061
1THE7
2TRAGEDY1
3OF16
4KING1
.........
4902journey1
4903weight1
4904ought1
4905oldest1
4906END1
\n", 207 | "

4907 rows × 2 columns

\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " 0 1\n", 212 | "0 1606 1\n", 213 | "1 THE 7\n", 214 | "2 TRAGEDY 1\n", 215 | "3 OF 16\n", 216 | "4 KING 1\n", 217 | "... ... ..\n", 218 | "4902 journey 1\n", 219 | "4903 weight 1\n", 220 | "4904 ought 1\n", 221 | "4905 oldest 1\n", 222 | "4906 END 1\n", 223 | "\n", 224 | "[4907 rows x 2 columns]" 225 | ] 226 | }, 227 | "execution_count": 11, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "data = pd.read_csv('counts.txt-00000-of-00001', sep = '\\t', header = None)\n", 234 | "data" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "
01
128the705
119I620
122and587
13of456
27to430
.........
2576outface1
2578persecutions1
2579sky1
2581precedent1
4906END1
\n", 325 | "

4907 rows × 2 columns

\n", 326 | "
" 327 | ], 328 | "text/plain": [ 329 | " 0 1\n", 330 | "128 the 705\n", 331 | "119 I 620\n", 332 | "122 and 587\n", 333 | "13 of 456\n", 334 | "27 to 430\n", 335 | "... ... ...\n", 336 | "2576 outface 1\n", 337 | "2578 persecutions 1\n", 338 | "2579 sky 1\n", 339 | "2581 precedent 1\n", 340 | "4906 END 1\n", 341 | "\n", 342 | "[4907 rows x 2 columns]" 343 | ] 344 | }, 345 | "execution_count": 15, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "data.sort_values(by=[1], ascending=False)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## Task\n", 359 | "\n", 360 | "Copy and adjust the beam job above so that it ignores a set of stop words\n", 361 | "\n", 362 | "Use the filename `counts-nostop.txt` for the output\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "stopwords = set(['the', 'and', 'of', 'to'])\n", 372 | "\n", 373 | "# TODO copy and adjust beam job" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "data = pd.read_csv('counts-nostop.txt-00000-of-00001', sep = '\\t', header = None)\n", 383 | "data.sort_values(by=[1], ascending=False)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [] 392 | } 393 | ], 394 | "metadata": { 395 | "kernelspec": { 396 | "display_name": "Python 3", 397 | "language": "python", 398 | "name": "python3" 399 | }, 400 | "language_info": { 401 | "codemirror_mode": { 402 | "name": "ipython", 403 | "version": 3 404 | }, 405 | "file_extension": ".py", 406 | "mimetype": "text/x-python", 407 | "name": "python", 408 | "nbconvert_exporter": "python", 409 | "pygments_lexer": "ipython3", 410 | "version": "3.6.9" 411 | } 412 | }, 413 | "nbformat": 
4, 414 | "nbformat_minor": 2 415 | } 416 | -------------------------------------------------------------------------------- /04/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.15.0 5 | 6 | -------------------------------------------------------------------------------- /05/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-data-validation==0.14.0 5 | jupyter==1.0.0 6 | 7 | -------------------------------------------------------------------------------- /05/simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "Ifon2ScEAsJO" 8 | }, 9 | "source": [ 10 | "" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "colab_type": "text", 24 | "id": "tghWegsjhpkt" 25 | }, 26 | "source": [ 27 | "##### Copyright © 2019 Google Inc." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "colab": {}, 35 | "colab_type": "code", 36 | "id": "rSGJWC5biBiG" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n", 41 | "# you may not use this file except in compliance with the License.\n", 42 | "# You may obtain a copy of the License at\n", 43 | "#\n", 44 | "# https://www.apache.org/licenses/LICENSE-2.0\n", 45 | "#\n", 46 | "# Unless required by applicable law or agreed to in writing, software\n", 47 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 48 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 49 | "# See the License for the specific language governing permissions and\n", 50 | "# limitations under the License." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "colab_type": "text", 57 | "id": "mPt5BHTwy_0F" 58 | }, 59 | "source": [ 60 | "# Preprocess data with TensorFlow Transform\n", 61 | "***The Feature Engineering Component of TensorFlow Extended (TFX)***\n", 62 | "\n", 63 | "This example colab notebook provides a very simple example of how TensorFlow Transform (tf.Transform) can be used to preprocess data using exactly the same code for both training a model and serving inferences in production.\n", 64 | "\n", 65 | "TensorFlow Transform is a library for preprocessing input data for TensorFlow, including creating features that require a full pass over the training dataset. 
For example, using TensorFlow Transform you could:\n", 66 | "\n", 67 | "* Normalize an input value by using the mean and standard deviation\n", 68 | "* Convert strings to integers by generating a vocabulary over all of the input values\n", 69 | "* Convert floats to integers by assigning them to buckets, based on the observed data distribution\n", 70 | "\n", 71 | "TensorFlow has built-in support for manipulations on a single example or a batch of examples. `tf.Transform` extends these capabilities to support full passes over the entire training dataset.\n", 72 | "\n", 73 | "The output of `tf.Transform` is exported as a TensorFlow graph which you can use for both training and serving. Using the same graph for both training and serving can prevent skew, since the same transformations are applied in both stages." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "colab_type": "text", 80 | "id": "RptgLn2RYuK3" 81 | }, 82 | "source": [ 83 | "## Python check and imports\n", 84 | "First, we'll make sure that we're using Python 3. Then, we'll go ahead and install and import the stuff we need." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": { 91 | "colab": {}, 92 | "colab_type": "code", 93 | "id": "tFcdSuXTidhH" 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "import sys, os\n", 106 | "# Confirm that we're using Python 3\n", 107 | "assert sys.version_info.major is 3, 'Oops, not running Python 3'\n", 108 | "print(sys.version_info)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 2, 114 | "metadata": { 115 | "colab": {}, 116 | "colab_type": "code", 117 | "id": "K4QXVIM7iglN" 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "import pprint\n", 122 | "import tempfile\n", 123 | "import warnings; warnings.simplefilter('ignore')\n", 124 | "\n", 125 | "import tensorflow as tf\n", 126 | "import tensorflow_transform as tft\n", 127 | "import tensorflow_transform.beam.impl as tft_beam\n", 128 | "from tensorflow_transform.tf_metadata import dataset_metadata\n", 129 | "from tensorflow_transform.tf_metadata import dataset_schema\n", 130 | "tf.logging.set_verbosity(tf.logging.ERROR)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "colab_type": "text", 137 | "id": "CxOxaaOYRfl7" 138 | }, 139 | "source": [ 140 | "## Data: Create some dummy data\n", 141 | "We'll create some simple dummy data for our simple example:\n", 142 | "\n", 143 | "* `raw_data` is the initial raw data that we're going to preprocess\n", 144 | "* `raw_data_metadata` contains the schema that tells us the types of each of the columns in `raw_data`. In this case, it's very simple." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": { 151 | "colab": {}, 152 | "colab_type": "code", 153 | "id": "-R236Tkf_ON3" 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "raw_data = [\n", 158 | " {'x': 1, 'y': 1, 's': 'hello'},\n", 159 | " {'x': 2, 'y': 2, 's': 'world'},\n", 160 | " {'x': 3, 'y': 3, 's': 'hello'}\n", 161 | " ]\n", 162 | "\n", 163 | "raw_data_metadata = dataset_metadata.DatasetMetadata(\n", 164 | " dataset_schema.from_feature_spec({\n", 165 | " 'y': tf.FixedLenFeature([], tf.float32),\n", 166 | " 'x': tf.FixedLenFeature([], tf.float32),\n", 167 | " 's': tf.FixedLenFeature([], tf.string),\n", 168 | " }))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "colab_type": "text", 175 | "id": "Zadh6MXLS3eD" 176 | }, 177 | "source": [ 178 | "## Transform: Create a preprocessing function\n", 179 | "The _preprocessing function_ is the most important concept of tf.Transform. A preprocessing function is where the transformation of the dataset really happens. It accepts and returns a dictionary of tensors, where a tensor means a Tensor or SparseTensor. There are two main groups of API calls that typically form the heart of a preprocessing function:\n", 180 | "\n", 181 | "1. **TensorFlow Ops:** Any function that accepts and returns tensors, which usually means TensorFlow ops. These add TensorFlow operations to the graph that transforms raw data into transformed data one feature vector at a time. These will run for every example, during both training and serving.\n", 182 | "2. **TensorFlow Transform Analyzers:** Any of the analyzers provided by tf.Transform. Analyzers also accept and return tensors, but unlike TensorFlow ops they only run once, during training, and typically make a full pass over the entire training dataset. They create tensor constants, which are added to your graph. For example, `tft.min` computes the minimum of a tensor over the training dataset. 
tf.Transform provides a fixed set of analyzers, but this will be extended in future versions.\n", 183 | "\n", 184 | "Caution: When you apply your preprocessing function to serving inferences, the constants that were created by analyzers during training do not change. If your data has trend or seasonality components, plan accordingly." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 4, 190 | "metadata": { 191 | "colab": {}, 192 | "colab_type": "code", 193 | "id": "H2wANNF_2dCR" 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "def preprocessing_fn(inputs):\n", 198 | " \"\"\"Preprocess input columns into transformed columns.\"\"\"\n", 199 | " x = inputs['x']\n", 200 | " y = inputs['y']\n", 201 | " s = inputs['s']\n", 202 | " x_centered = x - tft.mean(x)\n", 203 | " y_normalized = tft.scale_to_0_1(y)\n", 204 | " s_integerized = tft.compute_and_apply_vocabulary(s)\n", 205 | " return {\n", 206 | " 'x_centered': x_centered,\n", 207 | " 'y_normalized': y_normalized,\n", 208 | " 's_integerized': s_integerized,\n", 209 | " }" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "colab_type": "text", 216 | "id": "cSl9qyTCbBKR" 217 | }, 218 | "source": [ 219 | "## Putting it all together\n", 220 | "Now we're ready to transform our data. We'll use Apache Beam with a direct runner, and supply three inputs:\n", 221 | "1. `raw_data` - The raw input data that we created above\n", 222 | "2. `raw_data_metadata` - The schema for the raw data\n", 223 | "3. 
`preprocessing_fn` - The function that we created to do our transformation\n", 224 | "\n", 225 | "" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 5, 241 | "metadata": { 242 | "colab": {}, 243 | "colab_type": "code", 244 | "id": "mAF9w7RTZU7c" 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "\n", 252 | "Raw data:\n", 253 | "[{'s': 'hello', 'x': 1, 'y': 1},\n", 254 | " {'s': 'world', 'x': 2, 'y': 2},\n", 255 | " {'s': 'hello', 'x': 3, 'y': 3}]\n", 256 | "\n", 257 | "Transformed data:\n", 258 | "[{'s_integerized': 0, 'x_centered': -1.0, 'y_normalized': 0.0},\n", 259 | " {'s_integerized': 1, 'x_centered': 0.0, 'y_normalized': 0.5},\n", 260 | " {'s_integerized': 0, 'x_centered': 1.0, 'y_normalized': 1.0}]\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "def main():\n", 266 | " # Ignore the warnings\n", 267 | " with tft_beam.Context(temp_dir=tempfile.mkdtemp()):\n", 268 | " transformed_dataset, transform_fn = ( # pylint: disable=unused-variable\n", 269 | " (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(\n", 270 | " preprocessing_fn))\n", 271 | "\n", 272 | " transformed_data, transformed_metadata = transformed_dataset # pylint: disable=unused-variable\n", 273 | "\n", 274 | " print('\\nRaw data:\\n{}\\n'.format(pprint.pformat(raw_data)))\n", 275 | " print('Transformed data:\\n{}'.format(pprint.pformat(transformed_data)))\n", 276 | "\n", 277 | "if __name__ == '__main__':\n", 278 | " main()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "colab_type": "text", 285 | "id": "NO6LyTneNndy" 286 | }, 287 | "source": [ 288 | "## Is this the right answer?\n", 289 | "Previously, we used `tf.Transform` to do this:\n", 290 | "```\n", 291 | "x_centered = x - tft.mean(x)\n", 292 | "y_normalized = tft.scale_to_0_1(y)\n", 293 | "s_integerized = tft.compute_and_apply_vocabulary(s)\n", 294 | "x_centered_times_y_normalized = (x_centered * 
y_normalized)\n", 295 | "```\n", 296 | "\n", 297 | "### x_centered\n", 298 | "With input of `[1, 2, 3]` the mean of x is 2, and we subtract it from x to center our x values at 0. So our result of `[-1.0, 0.0, 1.0]` is correct.\n", 299 | "### y_normalized\n", 300 | "We wanted to scale our y values between 0 and 1. Our input was `[1, 2, 3]` so our result of `[0.0, 0.5, 1.0]` is correct.\n", 301 | "### s_integerized\n", 302 | "We wanted to map our strings to indexes in a vocabulary, and there were only 2 words in our vocabulary (\"hello\" and \"world\"). So with input of `[\"hello\", \"world\", \"hello\"]` our result of `[0, 1, 0]` is correct." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "colab": {}, 309 | "colab_type": "code", 310 | "id": "YpsQHsMtekQo" 311 | }, 312 | "source": [ 313 | "## Task: \n", 314 | "Modify your `preprocessing_fn` to perform feature engineering on the data (e.g. return \"x_centered_times_y_normalized\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "colab": { 327 | "collapsed_sections": [ 328 | "tghWegsjhpkt" 329 | ], 330 | "name": "simple.ipynb", 331 | "private_outputs": true, 332 | "provenance": [], 333 | "toc_visible": true 334 | }, 335 | "kernelspec": { 336 | "display_name": "Python 3", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.7.3" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 1 355 | } 356 | -------------------------------------------------------------------------------- /06/oop.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Object oriented programming in Python\n", 8 | "### Class structure\n", 9 | "the `__init__` method and objects" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Blu is a bird\n", 22 | "Woo is also a bird\n", 23 | "Blu is 10 years old\n", 24 | "Woo is 15 years old\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "class Parrot:\n", 30 | "\n", 31 | " # class attribute\n", 32 | " species = \"bird\"\n", 33 | "\n", 34 | " # instance attribute\n", 35 | " def __init__(self, name, age):\n", 36 | " self.name = name\n", 37 | " self.age = age\n", 38 | "\n", 39 | "# instantiate the Parrot class\n", 40 | "blu = Parrot(\"Blu\", 10)\n", 41 | "woo = Parrot(\"Woo\", 15)\n", 42 | "\n", 43 | "# access the class attributes\n", 44 | "print(\"Blu is a {}\".format(blu.__class__.species))\n", 45 | "print(\"Woo is also a {}\".format(woo.__class__.species))\n", 46 | "\n", 47 | "# access the instance attributes\n", 48 | "print(\"{} is {} years old\".format( blu.name, blu.age))\n", 49 | "print(\"{} is {} years old\".format( woo.name, woo.age))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Methods\n", 57 | "Now, we look at creating additional methods" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Blu sings 'Happy'\n", 70 | "Blu is now dancing\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "class Parrot:\n", 76 | " \n", 77 | " # instance attributes\n", 78 | " def __init__(self, name, age):\n", 79 | " self.name = name\n", 80 | " self.age = age\n", 81 | " \n", 82 | " # instance 
method\n", 83 | " def sing(self, song):\n", 84 | " return \"{} sings {}\".format(self.name, song)\n", 85 | "\n", 86 | " def dance(self):\n", 87 | " return \"{} is now dancing\".format(self.name)\n", 88 | "\n", 89 | "# instantiate the object\n", 90 | "blu = Parrot(\"Blu\", 10)\n", 91 | "\n", 92 | "# call our instance methods\n", 93 | "print(blu.sing(\"'Happy'\"))\n", 94 | "print(blu.dance())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Inheritance\n", 102 | "parent and child class" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Bird is ready\n", 115 | "Penguin is ready\n", 116 | "Penguin\n", 117 | "Swim faster\n", 118 | "Run faster\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# parent class\n", 124 | "class Bird:\n", 125 | " \n", 126 | " def __init__(self):\n", 127 | " print(\"Bird is ready\")\n", 128 | "\n", 129 | " def whoisThis(self):\n", 130 | " print(\"Bird\")\n", 131 | "\n", 132 | " def swim(self):\n", 133 | " print(\"Swim faster\")\n", 134 | "\n", 135 | "# child class\n", 136 | "class Penguin(Bird):\n", 137 | "\n", 138 | " def __init__(self):\n", 139 | " # call super() function\n", 140 | " super().__init__()\n", 141 | " print(\"Penguin is ready\")\n", 142 | "\n", 143 | " def whoisThis(self):\n", 144 | " print(\"Penguin\")\n", 145 | "\n", 146 | " def run(self):\n", 147 | " print(\"Run faster\")\n", 148 | "\n", 149 | "peggy = Penguin()\n", 150 | "peggy.whoisThis()\n", 151 | "peggy.swim()\n", 152 | "peggy.run()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## Encapsulation\n", 160 | "Using OOP in Python, we can restrict access to methods and variables. This prevent data from direct modification which is called encapsulation. 
In Python, we denote private attribute using underscore as prefix i.e single “ _ “ or double “ __“." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Selling Price: 900\n", 173 | "Selling Price: 900\n", 174 | "Selling Price: 1000\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "class Computer:\n", 180 | "\n", 181 | " def __init__(self):\n", 182 | " self.__maxprice = 900\n", 183 | "\n", 184 | " def sell(self):\n", 185 | " print(\"Selling Price: {}\".format(self.__maxprice))\n", 186 | "\n", 187 | " def setMaxPrice(self, price):\n", 188 | " self.__maxprice = price\n", 189 | "\n", 190 | "c = Computer()\n", 191 | "c.sell()\n", 192 | "\n", 193 | "# change the price\n", 194 | "c.__maxprice = 1000\n", 195 | "c.sell()\n", 196 | "\n", 197 | "# using setter function\n", 198 | "c.setMaxPrice(1000)\n", 199 | "c.sell()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Polymorphism\n", 207 | "Polymorphism is an ability (in OOP) to use common interface for multiple form (data types).\n", 208 | "\n", 209 | "Suppose, we need to color a shape, there are multiple shape option (rectangle, square, circle). However we could use same method to color any shape. This concept is called Polymorphism." 
210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Parrot can fly\n", 222 | "Penguin can't fly\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "class Parrot:\n", 228 | "\n", 229 | " def fly(self):\n", 230 | " print(\"Parrot can fly\")\n", 231 | " \n", 232 | " def swim(self):\n", 233 | " print(\"Parrot can't swim\")\n", 234 | "\n", 235 | "class Penguin:\n", 236 | "\n", 237 | " def fly(self):\n", 238 | " print(\"Penguin can't fly\")\n", 239 | " \n", 240 | " def swim(self):\n", 241 | " print(\"Penguin can swim\")\n", 242 | "\n", 243 | "# common interface\n", 244 | "def flying_test(bird):\n", 245 | " bird.fly()\n", 246 | "\n", 247 | "#instantiate objects\n", 248 | "blu = Parrot()\n", 249 | "peggy = Penguin()\n", 250 | "\n", 251 | "# passing the object\n", 252 | "flying_test(blu)\n", 253 | "flying_test(peggy)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.7.3" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /06/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-transform==0.14.0 5 | jupyter==1.0.0 
-------------------------------------------------------------------------------- /07/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | jupyter==1.0.0 -------------------------------------------------------------------------------- /08/keras_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "'''Trains and evaluate a simple MLP\n", 18 | "on the Reuters newswire topic classification task.\n", 19 | "'''\n", 20 | "from __future__ import print_function\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "import keras\n", 24 | "from keras.datasets import reuters\n", 25 | "from keras.models import Sequential\n", 26 | "from keras.layers import Dense, Dropout, Activation\n", 27 | "from keras.preprocessing.text import Tokenizer\n", 28 | "\n", 29 | "# The following import and function call are the only additions to code required\n", 30 | "# to automatically log metrics and parameters to MLflow.\n", 31 | "import mlflow.keras" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Loading data...\n", 44 | "8982 train sequences\n", 45 | "2246 test sequences\n", 46 | "46 classes\n", 47 | "Vectorizing sequence data...\n", 48 | "x_train shape: (8982, 1000)\n", 49 | "x_test shape: (2246, 1000)\n", 50 | "Convert class vector to binary class matrix (for use with categorical_crossentropy)\n", 51 | "y_train shape: (8982, 46)\n", 52 | "y_test shape: (2246, 46)\n", 53 | "Building model...\n" 54 | ] 55 | }, 56 | { 57 | "name": "stderr", 58 
| "output_type": "stream", 59 | "text": [ 60 | "/Users/yuconghu/github/deml-lab/08/venv/lib/python3.7/site-packages/keras/engine/training_utils.py:811: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 61 | " if isinstance(loss, collections.Mapping):\n", 62 | "/Users/yuconghu/github/deml-lab/08/venv/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:339: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 63 | " if not isinstance(values, collections.Sequence):\n" 64 | ] 65 | }, 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Train on 8083 samples, validate on 899 samples\n", 71 | "Epoch 1/5\n", 72 | "8083/8083 [==============================] - 2s 199us/step - loss: 1.4177 - accuracy: 0.6787 - val_loss: 1.0814 - val_accuracy: 0.7620\n", 73 | "Epoch 2/5\n", 74 | "8083/8083 [==============================] - 1s 168us/step - loss: 0.7874 - accuracy: 0.8152 - val_loss: 0.9548 - val_accuracy: 0.7887\n", 75 | "Epoch 3/5\n", 76 | "8083/8083 [==============================] - 1s 156us/step - loss: 0.5516 - accuracy: 0.8639 - val_loss: 0.8593 - val_accuracy: 0.7909\n", 77 | "Epoch 4/5\n", 78 | "8083/8083 [==============================] - 1s 147us/step - loss: 0.4129 - accuracy: 0.8969 - val_loss: 0.8881 - val_accuracy: 0.8076\n", 79 | "Epoch 5/5\n", 80 | "8083/8083 [==============================] - 1s 146us/step - loss: 0.3248 - accuracy: 0.9223 - val_loss: 0.8753 - val_accuracy: 0.8209\n", 81 | "2246/2246 [==============================] - 0s 49us/step\n", 82 | "Test score: 0.8929576534091206\n", 83 | "Test accuracy: 0.7898486256599426\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "mlflow.keras.autolog()\n", 89 | "\n", 90 | "max_words = 1000\n", 91 | "batch_size = 32\n", 92 | "epochs = 5\n", 93 | "\n", 94 | "# 
save np.load\n", 95 | "np_load_old = np.load\n", 96 | "\n", 97 | "# modify the default parameters of np.load\n", 98 | "np.load = lambda *a,**k: np_load_old(*a, **k)\n", 99 | "\n", 100 | "print('Loading data...')\n", 101 | "(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,\n", 102 | " test_split=0.2)\n", 103 | "\n", 104 | "# restore np.load for future normal usage\n", 105 | "np.load = np_load_old\n", 106 | "\n", 107 | "print(len(x_train), 'train sequences')\n", 108 | "print(len(x_test), 'test sequences')\n", 109 | "\n", 110 | "num_classes = np.max(y_train) + 1\n", 111 | "print(num_classes, 'classes')\n", 112 | "\n", 113 | "print('Vectorizing sequence data...')\n", 114 | "tokenizer = Tokenizer(num_words=max_words)\n", 115 | "x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')\n", 116 | "x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')\n", 117 | "print('x_train shape:', x_train.shape)\n", 118 | "print('x_test shape:', x_test.shape)\n", 119 | "\n", 120 | "print('Convert class vector to binary class matrix '\n", 121 | " '(for use with categorical_crossentropy)')\n", 122 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 123 | "y_test = keras.utils.to_categorical(y_test, num_classes)\n", 124 | "print('y_train shape:', y_train.shape)\n", 125 | "print('y_test shape:', y_test.shape)\n", 126 | "\n", 127 | "print('Building model...')\n", 128 | "model = Sequential()\n", 129 | "model.add(Dense(512, input_shape=(max_words,)))\n", 130 | "model.add(Activation('relu'))\n", 131 | "model.add(Dropout(0.5))\n", 132 | "model.add(Dense(num_classes))\n", 133 | "model.add(Activation('softmax'))\n", 134 | "\n", 135 | "model.compile(loss='categorical_crossentropy',\n", 136 | " optimizer='adam',\n", 137 | " metrics=['accuracy'])\n", 138 | "\n", 139 | "history = model.fit(x_train, y_train,\n", 140 | " batch_size=batch_size,\n", 141 | " epochs=epochs,\n", 142 | " verbose=1,\n", 143 | " validation_split=0.1)\n", 144 | "score = 
model.evaluate(x_test, y_test,\n", 145 | " batch_size=batch_size, verbose=1)\n", 146 | "print('Test score:', score[0])\n", 147 | "print('Test accuracy:', score[1])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.7.3" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /08/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | mlflow==1.3.0 4 | scikit-learn==0.21.3 5 | tensorflow==1.15.2 6 | keras==2.3.1 7 | jupyter==1.0.0 -------------------------------------------------------------------------------- /08/sklearn_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import print_function\n", 12 | "\n", 13 | "import numpy as np\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "\n", 16 | "import mlflow\n", 17 | "import mlflow.sklearn" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "if __name__ == \"__main__\":\n", 27 | " X = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)\n", 28 | " y = np.array([0, 0, 1, 1, 1, 0])\n", 
29 | " lr = LogisticRegression()\n", 30 | " lr.fit(X, y)\n", 31 | " score = lr.score(X, y)\n", 32 | " print(\"Score: %s\" % score)\n", 33 | " mlflow.log_metric(\"score\", score)\n", 34 | " mlflow.sklearn.log_model(lr, \"model\")\n", 35 | " print(\"Model saved in run %s\" % mlflow.active_run().info.run_uuid)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Exercise:\n", 43 | "Wrap MLFlow around `GridSearchCV` using `sklearn`." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Automatically created module for IPython interactive environment\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "from sklearn import datasets\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "from sklearn.model_selection import GridSearchCV\n", 63 | "from sklearn.metrics import classification_report\n", 64 | "from sklearn.svm import SVC\n", 65 | "\n", 66 | "print(__doc__)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Loading the Digits dataset\n", 76 | "digits = datasets.load_digits()\n", 77 | "\n", 78 | "# To apply an classifier on this data, we need to flatten the image, to\n", 79 | "# turn the data in a (samples, feature) matrix:\n", 80 | "n_samples = len(digits.images)\n", 81 | "X = digits.images.reshape((n_samples, -1))\n", 82 | "y = digits.target\n", 83 | "\n", 84 | "# Split the dataset in two equal parts\n", 85 | "X_train, X_test, y_train, y_test = train_test_split(\n", 86 | " X, y, test_size=0.5, random_state=0)\n", 87 | "\n", 88 | "# Set the parameters by cross-validation\n", 89 | "tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n", 90 | " 'C': [1, 10, 100, 1000]},\n", 91 | " {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]" 92 | ] 93 | }, 94 | { 95 | 
"cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.7.3" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /09/datawig.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "import datawig" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Imputing categorical values" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "df = datawig.utils.generate_df_string(\n", 29 | " num_samples=200, \n", 30 | " data_column_name='sentences', \n", 31 | " label_column_name='label')\n", 32 | "\n", 33 | "df_train, df_test = datawig.utils.random_split(df)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df_train" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "#Initialize a SimpleImputer model\n", 52 | "imputer = datawig.SimpleImputer(\n", 53 | " input_columns=['sentences'], # 
column(s) containing information about the column we want to impute\n", 54 | " output_column='label', # the column we'd like to impute values for\n", 55 | " output_path = 'imputer_model' # stores model data and metrics\n", 56 | ")\n", 57 | "\n", 58 | "#Fit an imputer model on the train data\n", 59 | "imputer.fit(train_df=df_train)\n", 60 | "\n", 61 | "#Impute missing values and return original dataframe with predictions\n", 62 | "imputed = imputer.predict(df_test)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "imputed" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Imputing numerical values" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "df_numeric = datawig.utils.generate_df_numeric(\n", 88 | " num_samples=200, \n", 89 | " data_column_name='x', \n", 90 | " label_column_name='y') \n", 91 | "\n", 92 | "df_train_numeric, df_test_numeric = datawig.utils.random_split(df_numeric)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "df_train_numeric" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "#Initialize a SimpleImputer model\n", 111 | "imputer_numeric = datawig.SimpleImputer(\n", 112 | " input_columns=['x'], # column(s) containing information about the column we want to impute\n", 113 | " output_column='y', # the column we'd like to impute values for\n", 114 | " output_path = 'imputer_model_numeric' # stores model data and metrics\n", 115 | ")\n", 116 | "\n", 117 | "#Fit an imputer model on the train data\n", 118 | "imputer_numeric.fit(train_df=df_train_numeric, num_epochs=50)\n", 119 | "\n", 120 | "#Impute missing values and return 
original dataframe with predictions\n", 121 | "imputed_numeric = imputer_numeric.predict(df_test_numeric)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "imputed_numeric" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Imputing missing values on real world data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "product_data = pd.read_csv('products.csv', sep='\\t')\n", 147 | "training_products, test_products = train_test_split(product_data, test_size=0.2, random_state=42)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Train an imputation model for the 'category' column and measure how good the imputation works" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 3", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 3 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython3", 181 | "version": "3.6.9" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /09/openrefine.txt: -------------------------------------------------------------------------------- 1 | https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-linux-3.2.tar.gz 2 | https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-mac-3.2.dmg 3 | 
https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-win-3.2.zip 4 | 5 | -------------------------------------------------------------------------------- /09/requirements.txt: -------------------------------------------------------------------------------- 1 | datawig 2 | scikit-learn==0.21.3 3 | pandas==0.25.1 4 | jupyter==1.0.0 5 | 6 | -------------------------------------------------------------------------------- /10/images/Complex_NoProc_V3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/10/images/Complex_NoProc_V3.jpg -------------------------------------------------------------------------------- /10/images/cnn_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/10/images/cnn_arch.png -------------------------------------------------------------------------------- /10/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | scikit-learn==0.21.3 4 | matplotlib==3.1.1 5 | aif360==0.2.2 6 | lime==0.1.1.36 7 | jupyter==1.0.0 -------------------------------------------------------------------------------- /10/tutorial_credit_scoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Detecting and mitigating age bias on credit decisions \n", 8 | "\n", 9 | "The goal of this tutorial is to introduce the basic functionality of AI Fairness 360 to an interested developer who may not have a background in bias detection and mitigation.\n", 10 | "\n", 11 | "### Biases and Machine Learning\n", 12 | "A machine learning model makes predictions of an outcome for a particular 
instance. (Given an instance of a loan application, predict if the applicant will repay the loan.) The model makes these predictions based on a training dataset, where many other instances (other loan applications) and actual outcomes (whether they repaid) are provided. Thus, a machine learning algorithm will attempt to find patterns, or generalizations, in the training dataset to use when a prediction for a new instance is needed. (For example, one pattern it might discover is \"if a person has salary > USD 40K and has outstanding debt < USD 5, they will repay the loan\".) In many domains this technique, called supervised machine learning, has worked very well.\n", 13 | "\n", 14 | "However, sometimes the patterns that are found may not be desirable or may even be illegal. For example, a loan repay model may determine that age plays a significant role in the prediction of repayment because the training dataset happened to have better repayment for one age group than for another. This raises two problems: 1) the training dataset may not be representative of the true population of people of all age groups, and 2) even if it is representative, it is illegal to base any decision on a applicant's age, regardless of whether this is a good prediction based on historical data.\n", 15 | "\n", 16 | "AI Fairness 360 is designed to help address this problem with _fairness metrics_ and _bias mitigators_. Fairness metrics can be used to check for bias in machine learning workflows. Bias mitigators can be used to overcome bias in the workflow to produce a more fair outcome. \n", 17 | "\n", 18 | "The loan scenario describes an intuitive example of illegal bias. However, not all undesirable bias in machine learning is illegal it may also exist in more subtle ways. For example, a loan company may want a diverse portfolio of customers across all income levels, and thus, will deem it undesirable if they are making more loans to high income levels over low income levels. 
Although this is not illegal or unethical, it is undesirable for the company's strategy.\n", 19 | "\n", 20 | "As these two examples illustrate, a bias detection and/or mitigation toolkit needs to be tailored to the particular bias of interest. More specifically, it needs to know the attribute or attributes, called _protected attributes_, that are of interest: race is one example of a _protected attribute_ and age is a second.\n", 21 | "\n", 22 | "### The Machine Learning Workflow\n", 23 | "To understand how bias can enter a machine learning model, we first review the basics of how a model is created in a supervised machine learning process. \n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "![image](images/Complex_NoProc_V3.jpg)\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "\n", 33 | "\n", 34 | "\n", 35 | "\n", 36 | "First, the process starts with a _training dataset_, which contains a sequence of instances, where each instance has two components: the features and the correct prediction for those features. Next, a machine learning algorithm is trained on this training dataset to produce a machine learning model. This generated model can be used to make a prediction when given a new instance. A second dataset with features and correct predictions, called a _test dataset_, is used to assess the accuracy of the model.\n", 37 | "Since this test dataset is the same format as the training dataset, a set of instances of features and prediction pairs, often these two datasets derive from the same initial dataset. A random partitioning algorithm is used to split the initial dataset into training and test datasets.\n", 38 | "\n", 39 | "Bias can enter the system in any of the three steps above. The training data set may be biased in that its outcomes may be biased towards particular kinds of instances. The algorithm that creates the model may be biased in that it may generate models that are weighted towards particular features in the input. 
The test data set may be biased in that it has expectations on correct answers that may be biased. These three points in the machine learning process represent points for testing and mitigating bias. In AI Fairness 360 codebase, we call these points _pre-processing_, _in-processing_, and _post-processing_. \n", 40 | "\n", 41 | "### AI Fairness 360\n", 42 | "We are now ready to utilize AI Fairness 360 (`aif360`) to detect and mitigate bias. We will use the German credit dataset, splitting it into a training and test dataset. We will look for bias in the creation of a machine learning model to predict if an applicant should be given credit based on various features from a typical credit application. The protected attribute will be \"Age\", with \"1\" (older than or equal to 25) and \"0\" (younger than 25) being the values for the privileged and unprivileged groups, respectively.\n", 43 | "For this first tutorial, we will check for bias in the initial training data, mitigate the bias, and recheck. More sophisticated machine learning workflows are given in the author tutorials and demo notebooks in the codebase.\n", 44 | "\n", 45 | "Here are the steps involved\n", 46 | "#### Step 1: Write import statements\n", 47 | "#### Step 2: Set bias detection options, load dataset, and split between train and test\n", 48 | "#### Step 3: Compute fairness metric on original training dataset\n", 49 | "#### Step 4: Mitigate bias by transforming the original dataset\n", 50 | "#### Step 5: Compute fairness metric on transformed training dataset\n", 51 | "\n", 52 | "### Step 1 Import Statements\n", 53 | "As with any python program, the first step will be to import the necessary packages. Below we import several components from the `aif360` package. We import the GermanDataset, metrics to check for bias, and classes related to the algorithm we will use to mitigate bias." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Load all necessary packages\n", 63 | "import sys\n", 64 | "sys.path.insert(1, \"../\") \n", 65 | "\n", 66 | "import numpy as np\n", 67 | "np.random.seed(0)\n", 68 | "\n", 69 | "from aif360.datasets import GermanDataset\n", 70 | "from aif360.metrics import BinaryLabelDatasetMetric\n", 71 | "from aif360.algorithms.preprocessing import Reweighing\n", 72 | "\n", 73 | "from IPython.display import Markdown, display" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Step 2 Load dataset, specifying protected attribute, and split dataset into train and test\n", 81 | "In Step 2 we load the initial dataset, setting the protected attribute to be age. We then splits the original dataset into training and testing datasets. Although we will use only the training dataset in this tutorial, a normal workflow would also use a test dataset for assessing the efficacy (accuracy, fairness, etc.) during the development of a machine learning model. Finally, we set two variables (to be used in Step 3) for the privileged (1) and unprivileged (0) values for the age attribute. These are key inputs for detecting and mitigating bias, which will be Step 3 and Step 4. 
" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "dataset_orig = GermanDataset(\n", 91 | " protected_attribute_names=['age'], # this dataset also contains protected\n", 92 | " # attribute for \"sex\" which we do not\n", 93 | " # consider in this evaluation\n", 94 | " privileged_classes=[lambda x: x >= 25], # age >=25 is considered privileged\n", 95 | " features_to_drop=['personal_status', 'sex'] # ignore sex-related attributes\n", 96 | ")\n", 97 | "\n", 98 | "dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)\n", 99 | "\n", 100 | "privileged_groups = [{'age': 1}]\n", 101 | "unprivileged_groups = [{'age': 0}]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Step 3 Compute fairness metric on original training dataset\n", 109 | "Now that we've identified the protected attribute 'age' and defined privileged and unprivileged values, we can use aif360 to detect bias in the dataset. One simple test is to compare the percentage of favorable results for the privileged and unprivileged groups, subtracting the former percentage from the latter. A negative value indicates less favorable outcomes for the unprivileged groups. This is implemented in the method called mean_difference on the BinaryLabelDatasetMetric class. The code below performs this check and displays the output, showing that the difference is -0.169905." 
110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/markdown": [ 120 | "#### Original training dataset" 121 | ], 122 | "text/plain": [ 123 | "" 124 | ] 125 | }, 126 | "metadata": {}, 127 | "output_type": "display_data" 128 | }, 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Difference in mean outcomes between unprivileged and privileged groups = -0.169905\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", 139 | " unprivileged_groups=unprivileged_groups,\n", 140 | " privileged_groups=privileged_groups)\n", 141 | "display(Markdown(\"#### Original training dataset\"))\n", 142 | "print(\"Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_train.mean_difference())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Step 4 Mitigate bias by transforming the original dataset\n", 150 | "The previous step showed that the privileged group was getting 17% more positive outcomes in the training dataset. Since this is not desirable, we are going to try to mitigate this bias in the training dataset. As stated above, this is called _pre-processing_ mitigation because it happens before the creation of the model. \n", 151 | "\n", 152 | "AI Fairness 360 implements several pre-processing mitigation algorithms. We will choose the Reweighing algorithm [1], which is implemented in the `Reweighing` class in the `aif360.algorithms.preprocessing` package. 
This algorithm will transform the dataset to have more equity in positive outcomes on the protected attribute for the privileged and unprivileged groups.\n", 153 | "\n", 154 | "We then call the fit and transform methods to perform the transformation, producing a newly transformed training dataset (dataset_transf_train).\n", 155 | "\n", 156 | "`[1] F. Kamiran and T. Calders, \"Data Preprocessing Techniques for Classification without Discrimination,\" Knowledge and Information Systems, 2012.`" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "RW = Reweighing(unprivileged_groups=unprivileged_groups,\n", 166 | " privileged_groups=privileged_groups)\n", 167 | "dataset_transf_train = RW.fit_transform(dataset_orig_train)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### Step 5 Compute fairness metric on transformed dataset\n", 175 | "Now that we have a transformed dataset, we can check how effective it was in removing bias by using the same metric we used for the original training dataset in Step 3. Once again, we use the function mean_difference in the BinaryLabelDatasetMetric class. We see the mitigation step was very effective, the difference in mean outcomes is now 0.0. So we went from a 17% advantage for the privileged group to equality in terms of mean outcome." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 5, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/markdown": [ 186 | "#### Transformed training dataset" 187 | ], 188 | "text/plain": [ 189 | "" 190 | ] 191 | }, 192 | "metadata": {}, 193 | "output_type": "display_data" 194 | }, 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "Difference in mean outcomes between unprivileged and privileged groups = 0.000000\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "metric_transf_train = BinaryLabelDatasetMetric(dataset_transf_train, \n", 205 | " unprivileged_groups=unprivileged_groups,\n", 206 | " privileged_groups=privileged_groups)\n", 207 | "display(Markdown(\"#### Transformed training dataset\"))\n", 208 | "print(\"Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_transf_train.mean_difference())" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "### Summary\n", 216 | "The purpose of this tutorial is to give a new user to bias detection and mitigation a gentle introduction to some of the functionality of AI Fairness 360. A more complete use case would take the next step and see how the transformed dataset impacts the accuracy and fairness of a trained model. This is implemented in the demo notebook in the examples directory of toolkit, called demo_reweighing_preproc.ipynb. I highly encourage readers to view that notebook as it is generalization and extension of this simple tutorial.\n", 217 | "\n", 218 | "There are many metrics one can use to detect the presence of bias. AI Fairness 360 provides many of them for your use. Since it is not clear which of these metrics to use, we also provide some guidance. Likewise, there are many different bias mitigation algorithms one can employ, many of which are in AI Fairness 360. 
Other tutorials will demonstrate the use of some of these metrics and mitigation algorithms.\n", 219 | "\n", 220 | "As mentioned earlier, both fairness metrics and mitigation algorithms can be performed at various stages of the machine learning pipeline. We recommend checking for bias as often as possible, using as many metrics as are relevant for the application domain. We also recommend incorporating bias detection in an automated continuous integration pipeline to ensure bias awareness as a software project evolves.
| # TODO Implement 33 | 34 | 35 | class And(Constraint): 36 | def __init__(self, *constraints): 37 | pass 38 | 39 | # TODO Implement 40 | 41 | 42 | class Or(Constraint): 43 | def __init__(self, *constraints): 44 | pass 45 | 46 | # TODO Implement 47 | -------------------------------------------------------------------------------- /assignment1/components/learned_imputer.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class LearnedImputer(BaseEstimator, TransformerMixin): 5 | 6 | def __init__(self, target_column): 7 | pass 8 | # TODO Implement 9 | 10 | def fit(self, dataframe): 11 | # TODO Implement 12 | 13 | return self 14 | 15 | def transform(self, dataframe): 16 | pass 17 | # TODO Implement 18 | -------------------------------------------------------------------------------- /assignment1/components/trainer.py: -------------------------------------------------------------------------------- 1 | 2 | def create_pipeline(task): 3 | pass 4 | # TODO Implement 5 | 6 | 7 | def create_label_encoder(task, training_data): 8 | pass 9 | # TODO Implement 10 | 11 | 12 | def train_model_with_crossvalidation(task, pipeline, label_encoder, training_data, seed): 13 | pass 14 | # TODO Implement 15 | -------------------------------------------------------------------------------- /assignment1/instructions.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | Update your git repository and setup the virtual environment for this assignment analogously to how you did it for the lab exercises. 4 | 5 | The assignment consists of three independent tasks. Each task features a python file to execute. Note that the execution might result in errors if you did not implement the require code yet. In order to fulfill the assignment, you have to **implement python code in the files in the [components](components/) folder**. 
**Do not edit other files, especially the task files**. 6 | 7 | ## Task 1: Data Validation 8 | 9 | You can execute this task via ```python task1.py```. The goal of this task is to implement a few constraints for data validation defined in the file [components/constraints.py](components/constraints.py). Each constraint applies to a pandas dataframe and tests certain conditions on the contained data (or a specific column) of some [demographic income data](adult-sample.csv). 10 | 11 | * `HasAtLeastNumRecords`: checks that the dataframe has at least a given number of records 12 | * `NotNull`: checks that a column contains no null values 13 | * `HasNumDistinctValues`: checks that a column's number of distinct values is in a given range (including the min/max) 14 | * `IsInRange`: checks that a column's values are in a given range (including the min/max) 15 | * `And`: checks that all of the supplied constraints are satisfied 16 | * `Or`: checks that at least one of the supplied constraints is satisfied 17 | 18 | 19 | ## Task 2: A custom Estimator/Transformer for Missing Value Imputation 20 | 21 | You can execute this task via ```python task2.py```. The goal of this task is to implement an ML-based missing value imputer as an estimator/transformer in scikit learn in the file [components/learned_imputer.py](components/learned_imputer.py). 22 | 23 | You will work on a pandas dataframe, and your goal is to learn a model to impute missing values for the `target_column` given the values from the other columns of the dataframe. The choice of features and model is up to you. 24 | 25 | The model will be learned in the `fit` method of the class. For the sake of simplicity, you can assume that the dataframe only contains text columns. In the `transform` method of the class, your learned imputer should fill in missing values for the `target_column`. 
26 | 27 | As an example, we will impute the category of a product given its `review` and `title` with data taken from [products.csv](products.csv). You can run the tests for your imputer via ```python task2.py```; you should try to achieve an accuracy greater than 0.75. 28 | 29 | ## Task 3: Declarative Model Training 30 | 31 | You can execute this task via ```python task3.py```. In this task, we will implement a very simple "AutoML" system for supervised learning. You are given data in the form of a pandas dataframe, as well as a [`LearningTask`](task3.py#L9) which describes which column values we want to predict (`target_column`) and which columns we want to use as features (`categorical_columns`, `numeric_columns`, `textual_columns`). Additionally, you are provided with the `learning_algorithm` to use, as well as with the number of folds `num_folds` and a hyperparameter grid `hyperparam_grid` for grid search. 32 | 33 | The goal is to implement the following methods in [components/trainer.py](components/trainer.py) to conduct the model training invoked by ```python task3.py```: 34 | 35 | * `create_pipeline(task)`: Generate a sklearn pipeline for training a model corresponding to the task 36 | * `create_label_encoder(task, training_data)`: return a fitted [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) for the `target_column` 37 | * `train_model_with_crossvalidation(task, pipeline, label_encoder, training_data, seed)`: train the model (e.g. the pipeline defined earlier) using k-fold cross-validation 38 | 39 | The task file defines different tasks to predict certain columns of a sample of [income data](adult-sample.csv) and should result in an AUC of about 0.8 in most of the cases. 
40 | -------------------------------------------------------------------------------- /assignment1/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | 5 | -------------------------------------------------------------------------------- /assignment1/task1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from components.constraints import NotNull, HasNumDistinctValues, IsInRange, And, Or, HasAtLeastNumRecords 3 | 4 | # We load a pandas dataframe with missing values 5 | data = pd.read_csv('adult-sample.csv', na_values='?') 6 | 7 | # We evaluate constraints on the number of contained records 8 | assert HasAtLeastNumRecords(500).is_satisfied(data) 9 | assert not HasAtLeastNumRecords(501).is_satisfied(data) 10 | 11 | # We evaluate constraints on the range of values in the 'age' column 12 | assert IsInRange('age', 17, 90).is_satisfied(data) 13 | assert IsInRange('age', 10, 100).is_satisfied(data) 14 | assert not IsInRange('age', 50, 100).is_satisfied(data) 15 | 16 | # We evaluate constraints on the completeness of the 'age' and 'workclass' column 17 | assert NotNull('age').is_satisfied(data) 18 | assert not NotNull('workclass').is_satisfied(data) 19 | 20 | # We evaluate constraints on the completeness of the 'workclass' and 'education' column 21 | assert HasNumDistinctValues('workclass', 8, 8).is_satisfied(data) 22 | assert HasNumDistinctValues('workclass', 0, 10).is_satisfied(data) 23 | assert not HasNumDistinctValues('workclass', 10, 20).is_satisfied(data) 24 | assert HasNumDistinctValues('education', 16, 16).is_satisfied(data) 25 | 26 | # We evaluate boolean expressions built from of constraints 27 | assert And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 90)).is_satisfied(data) 28 | assert And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 
90)).is_satisfied(data) 29 | assert Or(NotNull('age'), NotNull('workclass')).is_satisfied(data) 30 | 31 | constraint = And( 32 | And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 90)), 33 | Or(NotNull('age'), NotNull('workclass'))) 34 | 35 | assert constraint.is_satisfied(data) 36 | 37 | -------------------------------------------------------------------------------- /assignment1/task2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from components.learned_imputer import LearnedImputer 5 | 6 | # We load product data for evaluation 7 | all_products = pd.read_csv('products.csv', sep='\t') 8 | 9 | # We run several experiments with different random seeds 10 | for seed in [42, 129, 788, 555, 123456]: 11 | data_for_run = all_products.copy(deep=True) 12 | 13 | # We define the names of the column to impute 14 | column_to_impute = 'category' 15 | 16 | # In some tests, we will change the column names, the imputer should be general enough to handle that 17 | if seed % 2 == 1: 18 | column_to_impute = 'Kategorie' 19 | data_for_run = data_for_run.rename(columns={"category": "Kategorie", "title": "Titel"}) 20 | 21 | # We split the data into train and test set 22 | train_data, test_data = train_test_split(data_for_run, test_size=0.2, random_state=seed) 23 | 24 | # We fit the imputer on the training data, it should learn its internal imputation model now 25 | imputer = LearnedImputer(column_to_impute) 26 | 27 | imputer.fit(train_data) 28 | 29 | # We create a copy with of the test data and remove the column values to impute 30 | to_impute = test_data.copy(deep=True) 31 | to_impute.category = np.nan 32 | 33 | # We have the imputer fill in the missing values 34 | imputed = imputer.transform(to_impute) 35 | 36 | # We compute the accuracy of the imputer by comparing the filled in values to the correct ones 37 | 
imputed['correct__'] = test_data[column_to_impute] 38 | correctly_imputed = imputed[(imputed['correct__'] == imputed[column_to_impute])] 39 | 40 | accuracy = float(len(correctly_imputed)) / len(imputed) 41 | 42 | print("Accuracy:", accuracy) 43 | assert accuracy >= 0.75 44 | -------------------------------------------------------------------------------- /assignment1/task3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.linear_model import SGDClassifier 4 | from sklearn.ensemble import RandomForestClassifier 5 | from components.trainer import create_pipeline, create_label_encoder, train_model_with_crossvalidation 6 | from sklearn.metrics import roc_auc_score 7 | 8 | # This task defines how we want to train a supervised model for some given data 9 | class LearningTask: 10 | def __init__(self, 11 | target_column, 12 | categorical_columns, 13 | numeric_columns, 14 | textual_columns, 15 | learning_algorithm, 16 | num_folds, 17 | hyperparam_grid): 18 | self.target_column = target_column 19 | self.categorical_columns = categorical_columns 20 | self.numeric_columns = numeric_columns 21 | self.textual_columns = textual_columns 22 | self.learning_algorithm = learning_algorithm 23 | self.num_folds = num_folds 24 | self.hyperparam_grid = hyperparam_grid 25 | 26 | 27 | # We execute the learning tasks using the methods defined in trainer.py 28 | def execute_task(task, data, current_seed): 29 | 30 | # We split 31 | train_data, test_data = train_test_split(data, test_size=0.2, random_state=current_seed) 32 | 33 | # We create the pipeline for our task 34 | pipeline = create_pipeline(task) 35 | 36 | # We obtain the fitted label encoder 37 | label_encoder = create_label_encoder(task, train_data) 38 | 39 | # We invoke the model training 40 | model = train_model_with_crossvalidation(task, pipeline, label_encoder, train_data, current_seed) 41 | 42 | # 
We compute the AUC of the model 43 | y_true = label_encoder.transform(test_data[task.target_column]) 44 | y_pred = model.predict_proba(test_data)[:, 0] 45 | auc_score = roc_auc_score(y_true, y_pred) 46 | 47 | if auc_score < 0.5: 48 | auc_score = 1.0 - auc_score 49 | 50 | return auc_score 51 | 52 | 53 | # We evaluate our model on income data 54 | income_data = pd.read_csv('adult-sample.csv') 55 | 56 | # We run several experiments with different random seeds 57 | for seed in [42, 12345]: 58 | 59 | task1 = LearningTask( 60 | target_column='income-per-year', 61 | categorical_columns=['workclass', 'occupation', 'marital-status'], 62 | numeric_columns=['age', 'capital-gain', 'capital-loss'], 63 | textual_columns=[], 64 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 65 | num_folds=4, 66 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 67 | 68 | auc1 = execute_task(task1, income_data, seed) 69 | print("AUC %s for task1 with seed %s" % (auc1, seed)) 70 | 71 | task2 = LearningTask( 72 | target_column='income-per-year', 73 | categorical_columns=['occupation'], 74 | numeric_columns=['age'], 75 | textual_columns=[], 76 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 77 | num_folds=4, 78 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 79 | 80 | auc2 = execute_task(task2, income_data, seed) 81 | print("AUC %s for task2 with seed %s" % (auc2, seed)) 82 | 83 | task3 = LearningTask( 84 | target_column='income-per-year', 85 | categorical_columns=['occupation'], 86 | numeric_columns=['age'], 87 | textual_columns=[], 88 | learning_algorithm=RandomForestClassifier(random_state=seed), 89 | num_folds=4, 90 | hyperparam_grid={'n_estimators': [10, 100, 1000]}) 91 | 92 | auc3 = execute_task(task3, income_data, seed) 93 | print("AUC %s for task3 with seed %s" % (auc3, seed)) 94 | 95 | task4 = LearningTask( 96 | target_column='sex', 97 | categorical_columns=['workclass', 'occupation', 
'marital-status', 'income-per-year'], 98 | numeric_columns=['age', 'capital-gain', 'capital-loss'], 99 | textual_columns=[], 100 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 101 | num_folds=4, 102 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 103 | 104 | auc4 = execute_task(task4, income_data, seed) 105 | print("AUC %s for task4 with seed %s" % (auc4, seed)) 106 | 107 | -------------------------------------------------------------------------------- /assignment2/README.md: -------------------------------------------------------------------------------- 1 | ## Task 1: Data Validation with Google TFX 2 | 3 | The goal of this task is to use [Tensorflow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv) to validate two sets of data files about products and ratings in the [data](data/) folder. 4 | 5 | We assume that all products data files are valid except for `products-data-3.tsv`. Additionally, we assume that `ratings-2.tsv` and `ratings-3.tsv` have anomalies. 6 | 7 | Eyeballing the data should help you identify differences between the individual files. We ask you use **tfdv** to infer a schema from the data, adjust the schema if necessary and ensure that your code correctly identifies the files with data anomalies. Use `python task1.py` to execute this task and implement your solution in [components/schema_validation.py](components/schema_validation.py). Looking at this [test code](https://github.com/tensorflow/data-validation/blob/80809cd738fd1178f6c0334b0e4f4e644f445139/tensorflow_data_validation/anomalies/schema_test.cc) from Tensorflow might help you identify schema constraints that are helpful for this task. 8 | 9 | 10 | ## Task 2: Parallel Data Processing with Apache Beam 11 | 12 | Next, you will have implement a parallel data preprocessing job using [Apache Beam](https://beam.apache.org). 
The input data consists of product descriptions in the file [products-data-0.tsv](data/products-data-0.tsv) with the schema `identifier, category, description` and of product ratings in the file [ratings-0.tsv](data/ratings-0.tsv) with the schema `identifier, rating`. 13 | 14 | Your Beam job should conduct the following operations: 15 | 1. Conduct an equi-join on both inputs using the join key `identifier` 16 | 1. Filter the join result to only retain records that have (a) 'Kitchen' as `category` and a `rating` of at least 4 or (b) 'Jewelry' as `category` and a `rating` of 5 17 | 1. Group the join results by `category` 18 | 1. Compute the number of records per group 19 | 1. Write the categories and counts tab separated into a file `category_counts.tsv-00000-of-00001`; a line of this file could look like `Kitchen 123` for example. 20 | 21 | Use `python task2.py` to execute this task and implement your solution in [components/beam_job.py](components/beam_job.py). 22 | 23 | 24 | ## Task 3: Implement your own MapReduce Engine 25 | 26 | In this task, you have to implement your own simple MapReduce engine. Note that instead of the typical case of implementing the functions `f_m` and `f_r` that are executed by the MapReduce engine, this task has a different setup. The functions `f_m` and `f_r` are given for a simple wordcount algorithm and you have to implement the underlying runtime that applies them to the input data, according to the MapReduce paradigm. 27 | 28 | Use `python task3.py` to execute this task and implement your solution in [components/mapreduce.py](components/mapreduce.py). You have to implement methods to run the typical three phases of a MapReduce job: 29 | 30 | 1. Run the [map-phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L28): for each partition, transform each input record with `f_m` 31 | 2. 
Run the [shuffle-phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L33): create as many output partitions as we have reducers, and ensure that all records with the same key are put into the same group in the same partition 32 | 3. Run the [reduce phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L37): apply `f_r` to every group in every partition 33 | 34 | Note that your implementation does not need to run in parallel or be efficient or be able to handle large datasets. 35 | 36 | ## Task 4: Parallel Linear Regression with MapReduce 37 | 38 | In this final task, you will use your MapReduce engine from the previous task to implement the parallel linear regression example from [Map-Reduce for Machine Learning on Multicore](https://papers.nips.cc/paper/3150-map-reduce-for-machine-learning-on-multicore.pdf). 39 | 40 | Here is the description of the approach: 41 | ![](regression.png) 42 | 43 | The input data for this task is a simple regression problem generated by scikit-learn's [make_regression](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html). Use `python task4.py` to execute this task and implement your solution in [components/linear_regression.py](components/linear_regression.py). Use the output of `result_key()` as the key for the model in the final result. 
44 | -------------------------------------------------------------------------------- /assignment2/components/beam_job.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | import apache_beam as beam 5 | 6 | def create_and_run_beam_job(path_to_products_file, path_to_ratings_file): 7 | # TODO implement 8 | pass -------------------------------------------------------------------------------- /assignment2/components/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def f_m(key, value): 5 | # TODO Implement 6 | pass 7 | 8 | 9 | def f_r(key, values): 10 | # TODO Implement 11 | pass 12 | 13 | def result_key(): 14 | return 'dummy_key' -------------------------------------------------------------------------------- /assignment2/components/mapreduce.py: -------------------------------------------------------------------------------- 1 | 2 | class MapReduceEngine: 3 | 4 | def __init__(self, f_m, f_r, num_reducers): 5 | self.f_m = f_m 6 | self.f_r = f_r 7 | self.num_reducers = num_reducers 8 | 9 | def execute(self, partitions): 10 | # Run the map-phase: for each partition, transform each input record with f_r 11 | map_outputs = [] 12 | for partition in partitions: 13 | map_output_for_partition = self.map_partition(partition) 14 | map_outputs.extend(flatten(map_output_for_partition)) 15 | 16 | # Run the shuffle-phase: create as many output partitions as we have reducers, all records with the same 17 | # key must land in the same group in the same partition 18 | shuffle_outputs = self.shuffle(map_outputs) 19 | 20 | # Run the reduce phase: apply f_r to every group in every partition 21 | reduce_outputs = {} 22 | for partition in shuffle_outputs: 23 | reduce_output_for_partition = self.reduce_partition(partition) 24 | 
reduce_outputs.update(reduce_output_for_partition) 25 | 26 | return reduce_outputs 27 | 28 | def map_partition(self, partition): 29 | # TODO implement 30 | pass 31 | 32 | 33 | def shuffle(self, map_outputs): 34 | # TODO implement 35 | pass 36 | 37 | def reduce_partition(self, partition): 38 | # TODO implement 39 | pass 40 | 41 | 42 | def flatten(nested_list): 43 | return [item for sublist in nested_list for item in sublist] -------------------------------------------------------------------------------- /assignment2/components/schema_validation.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import tensorflow_data_validation as tfdv 3 | 4 | CSV_DELIMITER = '\t' 5 | 6 | def infer_schema_from_csv(csv_file, column_names): 7 | # TODO implement 8 | pass 9 | 10 | 11 | def has_anomalies(csv_file, schema): 12 | # TODO implement 13 | pass 14 | 15 | 16 | def adjust_product_schema(schema): 17 | # TODO implement 18 | pass 19 | 20 | def adjust_rating_schema(schema): 21 | # TODO implement 22 | pass -------------------------------------------------------------------------------- /assignment2/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment2/data.zip -------------------------------------------------------------------------------- /assignment2/data/ratings-0.tsv: -------------------------------------------------------------------------------- 1 | ab602aca-9bad-4aa9-bd42-6ce24cdf8680 5 2 | f98cd8d3-d6fe-4ee3-8c9f-a18c1690f7a6 5 3 | 396f73e3-e3ef-4c93-a4b7-e8b04c6a2223 3 4 | 329ed5e3-8b47-4e78-961b-18b89e7da808 1 5 | e61dca8b-3577-4c88-a9de-47db3bc3cc99 4 6 | 39191b63-8e02-4292-8d11-67c6f2bb1ae3 5 7 | b9d72513-97db-4121-baa1-f9665b5dc1cd 5 8 | a6260267-9a39-454f-80fa-5232909049e0 5 9 | e3167452-e087-47e6-a77b-d0efb0a1b997 5 10 | 
93163305-6046-41dc-b64b-e85b24e61269 5 11 | 34fe8d10-3554-4b0c-99a6-a000bdc47b9b 5 12 | f768c63e-616c-4578-9b60-d2c66972992c 5 13 | 5e60bd1b-28ca-4119-bc43-c4201cef0247 5 14 | acc327b7-d749-4900-86d2-5dea1ebb4193 5 15 | ef3ac854-c6a0-4dd4-8342-4c20d15edac7 4 16 | b1c07c7e-4f95-4a1e-a25f-df169de46011 5 17 | 1ad593c0-52c8-4c82-b682-21db37f862cc 1 18 | 7e51e537-6617-4450-a65b-4e5abee27b43 5 19 | ffbd5589-5aac-4344-a591-8a4887dd1dc7 1 20 | 57c9c17b-98fd-4286-85d4-f0539e81a0d3 5 21 | a14eb4b5-7763-464f-919b-e72804270f9a 5 22 | 2eb0961b-9bb1-4c1d-8b71-4180e706534f 5 23 | afd402e6-1bba-43d4-9ef7-b3706409e38c 4 24 | 34ac55e7-6e90-45b5-a586-e3e8cde05e34 5 25 | 1cf19399-953a-4444-baa1-c99285443d32 5 26 | 66491d95-37ce-4f40-9058-d4cb725f2afc 3 27 | 6621caa1-2a47-4c18-8dd2-d15c4e4312b4 1 28 | f9c56e5a-2663-4777-8076-76ee5e6309f7 5 29 | ab09b9c4-bcd2-44ed-9873-79fb0ecd67ac 1 30 | 1d0f80ab-0fa6-42ec-907e-95e271358b9b 5 31 | efbc939c-3ad3-4bd7-b2ef-5e34aabd3ef7 5 32 | a76c0826-b5c9-4c83-b678-74dcdf92d8b5 4 33 | e9e4fddb-f5d7-4fad-a2ca-84c659a2fec1 4 34 | cc9d4991-a063-4a56-882e-2bdf2fe24be4 5 35 | dbd8ada8-2755-4867-8dc2-8046a235c46a 5 36 | ef884d81-0309-491e-bb76-fbf13173fdfd 5 37 | bd8f77d8-af0d-4928-9f91-c155044a48cc 5 38 | 4d736ae6-5950-4241-9d70-d7acd2d0bd09 5 39 | baa90425-1179-4c11-aad7-bbe646fa2556 1 40 | 0d3d2799-aaf1-4c31-9533-ee16ec4c3fb3 5 41 | 714b4282-7773-416f-9031-50784580dc45 5 42 | dcaab57e-2ae1-4662-96f3-40629b0ec8e5 5 43 | 77398786-8b44-4f8e-a35e-b87071fe6e3c 5 44 | cd3b254e-1398-4001-9642-eba75d67dfad 5 45 | 7026f0cc-8370-4abf-a2d8-1f59244f0700 5 46 | e8f86669-aa81-4ecc-9711-6f1d7eb6c07f 4 47 | 36bb9b53-cc30-44d4-8408-23c19b9d6564 5 48 | 70128d47-b744-4421-96ff-a52d3b0d52cf 5 49 | 19d3c205-8dfb-473d-ad21-30514e05e2f5 4 50 | bf4f3700-b6e3-4000-a88d-acf081884641 5 51 | 1c6417db-3cda-4a02-bb43-71104c2f5f7a 5 52 | b0e6bb50-a31e-44e5-a68d-caaeec6470ad 5 53 | 87be9f94-ac75-41d3-bfd1-9c95a2bd65f5 5 54 | 280256b4-49cd-4f87-a3ca-aa6eea48376e 5 55 | 
2d653c2a-7b5c-4789-82ac-df4e0951131f 1 56 | 01a0027e-f261-4402-8802-c69ec62eae4c 5 57 | a71591fc-91c4-47c0-ad0a-dd33c244738e 4 58 | 033c543c-da1e-4b3c-a11b-261aef50d201 5 59 | 17b8efe6-e0b2-4e49-a5fe-24bc8568d742 5 60 | edea43cb-e1b7-453b-9e9b-e0efd1592231 3 61 | 64a5234d-f9ac-4baa-85d4-22680004effd 3 62 | 6f406526-b139-426e-bd70-be38de9736f9 5 63 | 064d6b5e-68da-4a34-9ba8-f521d61ac27f 5 64 | 45e740ca-473d-47ae-84e1-badd8b3f74b3 5 65 | d97b6acb-c430-46ca-9751-4a1752af34a7 1 66 | 3585c0df-540f-4d8c-bb5a-59ee19480cda 5 67 | b8599cbe-1ec2-4884-8584-7418e848c255 5 68 | dd623e94-9422-460f-8c20-190d0d5432f4 5 69 | 5197ffce-4b62-43a6-907c-3d7e3515b1d6 5 70 | ac66c1cf-e87f-47ab-a391-26acb7797d58 5 71 | e0e0053b-f9f3-4f7d-9513-d3e1625ceaf0 5 72 | b007e2a8-cd1f-4ed2-b912-1bd1153d4dd1 5 73 | 9a5c6344-e6a6-486a-ae0d-134ed7de3e1d 5 74 | 3df801cb-2325-4e31-bc41-ee71ae851664 5 75 | 8d9d81bf-1be2-4226-a5bc-0af09506df22 4 76 | 2cd0cb45-48b7-4742-aeea-df2f991cac53 4 77 | d9c1203c-f961-438f-9e59-ac61df27e038 5 78 | 43bf23a7-6a98-4a05-b49e-b25e84c46935 3 79 | 8adce774-d906-4774-a207-82fa7258e6e7 5 80 | df28a8ea-508b-4a22-aad0-51c6b986b703 3 81 | 0f7a8aca-c7d5-4139-8ecd-5d8501d61088 5 82 | c9dc0726-b779-44d0-a4f4-7e83baa3cca4 2 83 | f72a5f00-b9fd-4f8d-9092-2dfdc6d4485f 5 84 | df7e80d4-0708-4d7c-a09d-684b46478dcc 3 85 | bc9ecd60-708d-4452-8198-1214edf344c4 4 86 | d9f56461-5547-4ba8-8ba4-a4fdf81de20b 5 87 | 71d871c8-2a59-42de-96f5-3423ee5d4784 4 88 | 78a62ae0-7c26-4284-81fb-30a716dd2411 4 89 | f638499c-6670-496e-a0f6-3e8ad1944343 5 90 | 00552178-6074-4f0d-8351-6cb508a9c208 1 91 | 233528b9-514c-4ff9-91f0-3f2517ffac67 5 92 | 3dfab41d-99be-4109-8721-5a3343cd930d 1 93 | d8d4ae14-d838-4395-a039-fd2a3b1b5e28 4 94 | 0e75b933-ded8-4164-865a-4983b7ba0cc4 4 95 | 81ea7456-0733-4f91-b401-2e9e46c19a3d 5 96 | c59903d2-a6a1-4b07-ad6b-ba3af90b9f4f 2 97 | 02845cd0-2518-4b78-820b-106add23787d 1 98 | 00645388-2228-452f-9b5c-7d1a0df1e9d1 5 99 | 47e073a8-8bcd-4b50-9d9f-dc3ddd150568 2 100 | 
947989d2-3588-4e7c-bb8c-b163cc8d5386 5 101 | ffa40a7d-e690-4369-a98a-f996a465d134 3 102 | 5371f5cb-856d-4a33-a361-cf542fdf8890 3 103 | ebf5111b-fb31-4d62-b42c-b2eaa15a86b1 5 104 | 40902631-7afd-4a9d-aea0-db4f3f1cbb9c 2 105 | 8f4b3c22-5bc8-475b-a428-b167f7995800 3 106 | 1a291ad0-c8e2-4a87-b6ad-88e49e500685 5 107 | d64d7daf-a942-482c-80b1-9d97e0665d8e 5 108 | c2f59659-9fd3-43b4-95ed-c09d19b788dc 5 109 | e9791924-15c3-4197-8c38-6281c4a864ec 5 110 | 7d6206f8-742d-4c01-9c70-2192d3453089 5 111 | b3f807c6-0116-4361-850d-f02b6b840274 5 112 | 881b11fc-641b-42f0-9973-68438b222fe4 5 113 | a73f6147-5013-4fb8-ab0b-caefa2ce5284 3 114 | d4dab584-bc5d-4f21-8fc7-1249388c31c1 5 115 | 3254142e-9931-488e-95a4-27e4f784bbbf 3 116 | 1d92debc-ce16-46dd-9c1b-e8dbb354ecc2 1 117 | fd296019-e728-458e-bdf2-c8b3eabf7c93 5 118 | 712529d5-fca5-49a4-b1fa-824f6c2ff099 5 119 | 7941512a-79e7-4a39-a360-0c24d0cadab1 5 120 | 4527a9ef-3b6c-45dc-956f-10dad7c952b2 5 121 | 5f74113f-aaa3-43ab-97b6-d34e56bc14f2 5 122 | d48625d0-c533-49f3-b57a-d97b18316343 5 123 | 1b171dfe-b13e-4c71-8337-12e102268f99 5 124 | 4e1faad1-d049-4fef-9385-ecf1b0e54d48 4 125 | de02cfdb-b10a-484f-957d-b0cf11cbdac5 5 126 | c5564131-2e9f-4dcb-86af-b9e96f9bf647 5 127 | c8d1378e-ca50-4ad8-99d3-3ebe5283933d 5 128 | b0170384-c254-4720-a79a-63edbd9b8ed8 5 129 | 00965ac4-99f8-4feb-93db-1ff71ce1e134 5 130 | 45d3698e-2f0d-416c-9401-f3995eb13234 2 131 | 779112f6-073a-40bb-908e-514829d4ae1f 5 132 | 063a89ac-dd7b-416c-885d-03e55c89d0aa 5 133 | ce603435-ce39-4ae8-bce2-c2a510fdee00 5 134 | ae0fa17c-9ffb-4071-9fc6-956fc9876005 2 135 | dc03bcf1-46c0-4ea9-8da1-36cfe52070aa 5 136 | 5af384ff-31f8-4b20-96f1-d09c200e8877 5 137 | fbb206f2-47b2-4ff6-b069-8428465d71fa 1 138 | a1023de8-b11f-4d96-9b0a-ace772ad6649 5 139 | fc49957a-cae4-4324-8dac-e2a722880011 3 140 | 0f2e9567-3cbb-4faf-8911-48dd972d9da9 5 141 | 18cbcb38-a63c-4d87-8014-82d5c71fad84 5 142 | fd1f13fa-dd4a-467c-bdec-0e21fd60c1b2 4 143 | 79dde4d8-3aa1-4b82-9a33-302e2be542f0 5 144 | 
69918764-0326-4f9a-a6bc-62ce4c4fc40d 5 145 | c5fed535-af63-465f-bdce-116446304c75 5 146 | 1d11ce64-6fa7-4f3d-9d54-750211e4c4fe 5 147 | 3928466a-f794-4952-803b-57687685b694 5 148 | 4066d298-8da2-4d3d-900d-af3606c834a6 5 149 | 9bd5b50f-b2ea-47d9-8ae8-351aefbac7f4 5 150 | 0170b1d0-af6c-43f5-a9c7-e368f84582d6 5 151 | 8acba203-c023-4681-8161-ef6eadff9146 2 152 | 66b030d6-eb0a-4409-aea2-31e474c3740b 5 153 | b6c622ea-ade6-4305-b6ab-8174bd47b0fd 5 154 | 57898d01-775a-4431-ab89-6d597a4f4f0b 1 155 | ae61e9c7-000f-4e3a-9460-a9312bc152eb 1 156 | 30e4b35b-9129-47fc-afec-973862bf1a9b 5 157 | a6a0db46-fb69-47a3-846a-c7800c05b035 5 158 | 18ddc4bc-f7a3-4a99-bf63-80454b150f33 4 159 | 60a48004-f59c-4c3f-aabe-4857317fdaf2 5 160 | c2c30f6a-1a33-42a0-b72e-912931a97da5 5 161 | b795dfcd-76bf-4cec-8c00-2a6d8f95ea44 5 162 | c37e9681-022b-4923-a838-3746dc30827f 5 163 | cf9e68d0-7a62-4d0b-b051-886cca76d093 5 164 | 031fd19f-48a8-4923-b886-bca54d643465 2 165 | 24566674-58a7-422d-9033-b005bd86594e 5 166 | 0a7e6948-85fc-4292-9f8d-df22ccb541a2 2 167 | 052fdb81-f4a9-4b0a-a913-2763be6546b2 1 168 | 6b6a4006-f1c0-4648-acf7-6d8f73f5c3bc 5 169 | 8694795b-258f-47e1-bf54-c12d1c8629d1 5 170 | 05f10bb4-aea1-4808-8c9f-ca1feaa6687e 2 171 | 508381f2-d295-46fd-87da-ec16cb5c2f1a 5 172 | 2069df2f-d8a4-4da6-9c41-80b3d34ae07e 5 173 | 22f8fabd-7884-4dca-88d0-fa1b8bfbc55e 5 174 | cfdd48a7-8ff3-4de9-85ce-394109134144 5 175 | b3e45891-65b2-40d6-84a7-7adc26b5d5b5 3 176 | 75c68af6-bfde-4f38-9bc6-fe65e6b0921c 3 177 | 3729140e-8b82-409b-beda-709d66d3a5ee 5 178 | 0876e7a7-8ede-4451-8ae1-c8e10b20dcdd 4 179 | 888d4ae7-b04e-4584-a302-76e2dcaa1cb6 5 180 | fed7b9e3-172e-48a5-9315-b7b8d2adf095 5 181 | 7d71af97-8409-4851-ab0e-7dd7ec53e0e4 5 182 | b751a1be-2473-4144-853e-ce1f867c1189 5 183 | 677cfb18-46c6-4770-b481-a60eb2b49dc1 5 184 | 53490736-3f8b-4212-9698-afc6a9819c58 5 185 | e9430d61-117e-4578-a6e6-3037e9112fef 5 186 | 9ceb1b30-f0f2-4c2b-9b4d-654f8ddb4574 1 187 | b6fe871a-2e58-4e56-a9dd-656a072f6106 5 188 | 
17cd7ef6-e12f-40c5-8797-11105f97c57a 5 189 | dddf4365-a160-49d4-b229-d7bde498db4e 5 190 | 69a36c2b-7250-4308-9661-3776076d7353 5 191 | c35beead-a5ea-43d8-b183-5c4c3fa0d995 4 192 | dcbb3511-b3b0-415f-96d5-34bb80c4c70c 4 193 | 59dc9143-4328-41c1-a518-ce947c442884 2 194 | ab476964-1b81-4aa5-91fd-5af1eaea0a42 4 195 | fb09b323-5b04-4bb3-9f08-30c6f8e63e9c 5 196 | 7a8408f1-8234-4372-b1e6-bbcc9bb744c9 5 197 | 1e7a9da5-e0bd-4830-9637-99799c54ea2b 5 198 | 6b21f9c0-6d36-4f2d-aac5-7bdc2731c36d 1 199 | 7c705f28-80aa-4d3a-a4c5-1a6a9d0f1384 5 200 | bc51bfc6-c3e1-478b-ab29-1c34c576c8db 4 201 | 2e06b8bc-fd6d-4dc0-a364-82ab4abb2689 4 202 | aaae85ef-83ee-4deb-b6f6-f72be1b4b612 5 203 | a5d78c4b-a48b-4efb-9dac-bdc2659f501e 5 204 | d291f359-d8c1-47a1-a745-ddecc3b27dc6 5 205 | 174856b8-e940-488f-8b60-338d181f144b 4 206 | 3fee491b-b845-4d7c-9301-80d5fb854db8 5 207 | d8237ff3-05d0-4a4d-acc4-5690b11a0b74 4 208 | 3a99b893-e7b0-4602-876a-5b31edc3bb02 5 209 | 8dc0b64c-7f2c-4576-b129-4f1a867bf3be 4 210 | 81ca1b53-c347-4741-a230-9571465d9b11 4 211 | 9bb2c221-e16f-4d78-bad8-9048db4e62a7 5 212 | 9d3a40ef-994c-4cb3-b49c-10292583a639 5 213 | 3195b911-42af-4b26-a965-4f017a8ee566 2 214 | 52170b3d-1355-4143-8e56-60a6a9769743 5 215 | 90bade87-e249-4490-8698-4f5ee028f709 5 216 | e9c5886f-3436-4ca4-b90a-d943fd856654 4 217 | e87b1b1b-0a57-4c9a-9982-ab060bdaa975 5 218 | f282686b-f210-467d-b7db-46b2e950e957 5 219 | 29f72764-6d3e-49af-bada-a6bb9a445e22 5 220 | 7a10a71a-8b06-4eb7-b33b-a88d6b3e92c5 5 221 | 7bbc7e4c-dcfc-429d-97de-df378cc2d354 5 222 | d01d3dff-aa01-4200-b262-83be531533a0 4 223 | e20198f2-00a3-4bb2-8c47-9a028ae1c00c 5 224 | dc1899d8-d879-4a5b-b729-b9f711e01a36 1 225 | d4f9b6d1-5d01-45c5-a279-74a47faa4bd7 5 226 | f12a7e5d-2266-49cd-a684-835044185bcf 5 227 | 031d882a-b949-4b4e-8b99-ff4857d387c6 5 228 | 14b9303d-f778-4a07-99aa-57558145e08e 5 229 | 142b538f-388f-4e8e-a9fd-608e3d622790 3 230 | bd86820f-0927-48f4-9180-ea6a83b8ae34 4 231 | 7775b562-d669-413a-9057-08d9ab152c6d 5 232 | 
e67d74c1-86f1-40ee-b233-d8f70307b55e 1 233 | 4e696a07-2b61-405e-8ff9-b11a5361dff3 5 234 | f8fca6de-c39d-477f-a4fd-b4da20c6db32 5 235 | b82de19b-725c-4b21-ac97-510cd21ff62e 5 236 | 8dc8b0d1-f093-425c-b697-ab56fdd17683 5 237 | 011a4841-6338-4a19-8acc-a6ccbdefc409 5 238 | 53564ce9-c101-4ddb-ad4e-e8a7e7c41734 1 239 | f0f3dcb2-aad7-4729-b5fb-3888dc932441 5 240 | 95421457-f542-434e-aff0-cdd634437e7a 2 241 | e27e5df6-15d5-4633-a495-12c18161ba9e 5 242 | 29449dfb-fbbb-4001-bef6-8fff102f53f7 5 243 | aa06e4ef-4809-4f3f-bcee-7b730344f4a6 5 244 | 83d4a69b-28a6-4178-bb8d-cec15cf28564 5 245 | 605497c8-cb86-47cc-ae22-a00913446d88 1 246 | e15cbf0f-be0f-4c6c-8ffe-7460e1f3cfa9 5 247 | b8a82a12-7ea1-4c3f-946c-e27c9edb5295 5 248 | 110d2ecb-6887-4113-b13a-038fb60d7c07 5 249 | b5688331-6972-4ddc-8b18-8469ed2c76e1 5 250 | 6e3328b3-6458-49d3-b643-d0744f3bd949 5 251 | 18377530-2a79-4ec8-aaf3-41b04560be89 4 252 | 8cb374ae-c8b9-4d12-913f-96f6499e8113 5 253 | 80557652-ac83-48b5-b0bb-2e48a89ae0c6 5 254 | 88e66a00-273c-4e9a-aa8f-67fe0d9b3782 5 255 | 45dd273c-5ac4-43dc-8ed6-d30d108eb293 1 256 | 6a1dad68-0f5b-4189-9281-74a7f8d21ff8 5 257 | 84d4c7ab-04a4-4138-b21f-a5cbe2bbdbeb 3 258 | 16ecc692-3d65-4b40-898f-563df4cd2bfd 4 259 | b5ca790c-b8f1-4daf-9bf4-7463bc4544a9 5 260 | a60c3e9d-79b8-4ffe-9a97-97d2e00b15ff 5 261 | c7e8149a-8310-4d5f-b22d-ec5cd0f64047 5 262 | 7912e691-ce7e-4492-a127-795716e8fd4d 1 263 | 1f647b89-0044-4c1f-a1ea-61ec37b3a535 3 264 | 0c515d16-798d-4e48-8eca-8acc13ad1658 5 265 | 2c4d326e-4c7a-4c10-87c4-3e2e428aef33 1 266 | 9609ec6f-211b-448a-a373-4bc78098560f 5 267 | 0a5409ad-a8c0-4606-bf8d-6faefa57fd05 3 268 | 3bcf812f-fe88-4b61-883d-fa600f3ee484 1 269 | f9a0dfd2-4032-4858-80ac-63d53496e48c 5 270 | 9e7d6cc6-c62d-49ad-b96f-b4e30d198a2e 5 271 | 2ea0ca0b-dac8-4a08-a749-a3fa040960ec 5 272 | bd54b369-254a-4602-afa3-150994297432 3 273 | 9cebb8f0-d861-4e6b-bb59-07b89d6da906 1 274 | 685a2b73-3aec-4d49-9f89-8bc4077f5924 3 275 | 35b9d104-3db6-48df-b3bb-10bc54fbd7cf 4 276 | 
7fe242ac-e844-4dfe-bfce-bde7f20b0c59 4 277 | 9ff710f7-896c-44b8-b379-9c2db5ea42e8 4 278 | afadf913-abd4-436a-8ca8-270cb7eb404d 1 279 | de48d136-7df1-4dc8-8410-18f0a97813cd 5 280 | 281fb66f-8395-4cf1-9072-8064bc58d9cc 4 281 | 0c967875-3a6f-4aeb-b922-42c5a185aa94 5 282 | 71b6e072-1597-4368-9ae9-f9204ce3acdb 5 283 | 0d7a1277-c653-4ebb-8524-2467712357ea 4 284 | 6247ac9f-ccf5-4df9-beac-e9af54d86ac0 5 285 | 13daf93c-ce61-4004-85ab-6c88c9f30279 5 286 | 58306131-9591-4793-b05f-eb98b7daa3a9 5 287 | 022ec213-2d13-41d0-b98c-0baf04d8e3d5 5 288 | 06517fdf-b7b0-4055-bee5-e45bd8c6bfef 5 289 | e8a5850d-9849-48c8-a44f-9585a6923aa9 4 290 | a96ad8d7-01f2-4c1a-b429-2dd9c6b06476 5 291 | d17a38d6-058c-4861-b6d3-934517c7c909 5 292 | eef6fbba-dcd7-4c51-8ce4-cd205c323924 5 293 | 26e053b5-910f-40ff-8231-a45162718084 5 294 | 124a8ac2-6ab3-4fee-8200-e0cf484d04ae 5 295 | a0321e36-efcc-4073-9f85-69092180e9ed 1 296 | 2019d0bf-b31d-4ad4-acb8-2657ff6db3c5 5 297 | 4dba88e1-ac67-42c4-80f6-8dd3d449dd71 5 298 | 163c0493-3b32-4bd1-8eb1-367c0d0e3039 5 299 | ad89d9af-1227-436f-aec6-ddc963d8db4e 5 300 | e8dc5827-49b5-490d-9259-3f77425d2b2f 5 301 | 39052c68-6238-41a0-9085-201949e459ff 5 302 | 676a773f-c41c-4e59-bfec-9f7c15118007 5 303 | cd1fc09b-fb28-43a2-bc72-b3329b2a63ff 5 304 | 0e418a70-92ab-4310-9fee-d33efb795849 5 305 | 60327006-34da-4c88-9b22-038b5b286eec 5 306 | 94a47aff-7a93-494f-8f40-2eab54454e95 5 307 | 802f6d9c-614b-4f16-838a-73c7d98dfa21 2 308 | 36fff17d-3edb-4c54-8687-2070b05510d2 5 309 | f5c79205-9269-4fcd-86b5-4267ae7af93c 5 310 | 14ff5559-eb57-4f93-bd91-4aeba852f312 5 311 | a4b96feb-5a11-4209-bc7c-5f088dfc3a4b 1 312 | 7712a1ec-0fbc-45b4-98ad-a8874f775fc9 5 313 | 037329a0-0954-427a-a707-6dfd58bb5044 5 314 | 1de3492d-25a8-47ba-8160-c191fe671881 5 315 | 9805d0d9-1519-4222-8af2-4df43d7d7b27 5 316 | 7be2d9b9-c7e1-48c3-aa23-83950cb2129e 5 317 | 6ba33425-261a-4008-ba50-63a3643cb3c4 2 318 | 56c4600d-cced-4aef-bf15-a23ddec388f1 4 319 | 1d8d6ae5-fb0d-4dbc-b74c-94ffa839d779 1 320 | 
d1e92bcd-d3d2-4d8b-97f4-c171d85ac5df 5 321 | 6f7ba9e7-60c9-4c6f-b672-3265d93310de 5 322 | d2d3e3ed-e6ff-4829-85d5-20ad40ae9663 5 323 | abb75a7e-86b8-43be-9eb2-6f8e5ba25c3f 3 324 | 19b910d9-3253-4376-9f73-b1aec56ac263 4 325 | 393b2f3c-5158-420a-a2b1-23c1f08e966a 5 326 | fdf56656-1016-485f-a3bf-79da1aeafe6a 1 327 | faf7f15e-80c8-45b2-9b69-4e466894676e 5 328 | cfbb8e61-d818-4bf6-9fc6-861068084631 5 329 | d3f57ceb-38c8-471f-82fd-31d7f31579fc 3 330 | 3de731d5-4854-42c2-b00e-37b50ef2b232 4 331 | 46fdf467-f36d-4f3a-a4bf-3ba42aba50e8 5 332 | 2b87b0cf-1f13-4574-b8a4-83773396ea5e 5 333 | 88f2e073-a32f-4c0f-b5a2-fcbed747a016 4 334 | f76f7b8d-de62-4776-a77f-06352cf7acc0 5 335 | 7bdca966-7a40-4190-ae6c-e878f088f048 5 336 | 7e6eaffe-4e0a-4fcd-be65-528d18737ca1 5 337 | 752b0d2c-adb6-4079-9af1-0fd9493090e3 3 338 | 3fe82b84-1230-45e1-9e70-71408b111458 5 339 | 95ed642c-6cb1-483f-99ae-a6937cb2b3fd 5 340 | 7cf32b45-0d0c-4e97-8063-3b2c4af73ffe 5 341 | 266fc420-775c-48c9-b13d-ed455a51fc2f 5 342 | 149b3fcc-7fcc-455a-9161-3167a9bd9f1f 4 343 | 6d593b63-17d9-47d8-82c9-43de8072fae1 4 344 | ea3ed858-3342-4128-91cb-7724f2348772 5 345 | 4252f1c2-c976-4ba4-966d-30d2eabda9e1 5 346 | ebeb26cb-756a-4c60-a862-0d1f073832b5 5 347 | 54491e98-240f-4f17-aa4e-9278d32e83c0 1 348 | 4bf73b45-efdf-4626-952a-12441dde9de8 4 349 | 0e114a73-5d8f-4aaf-980f-c0012acc3fd9 4 350 | 5a0926db-e238-48e6-8d69-617a5d824c01 1 351 | 5b6718e1-2602-4daa-bced-c38b25005e19 5 352 | c0ccd086-6c02-4126-adfc-ce752a58115f 1 353 | 7966cd84-00c1-42b1-b48c-c7c3369b489b 2 354 | 078f148e-1916-44fd-a7bf-e085af105329 5 355 | 8ff6533f-0dd9-44e4-a3b3-980abfb85edc 2 356 | 63db975b-931e-430d-94fe-5bfda6bf2b96 5 357 | a4f82b2f-c7a5-48e0-aa60-83c58161745c 5 358 | 57dd90e3-bb58-4bce-a6de-d840095bccc2 5 359 | c21b34f1-16c7-4bba-9fa0-619061a791c8 4 360 | e61740ef-1e2d-44f8-a836-07cb14410de0 4 361 | 89300dbe-3d0a-48fa-8f15-4a26bc5aded1 5 362 | 51632de9-8e43-4ae3-82f5-dd9460bda197 1 363 | 8622060e-bc75-4cbf-8910-6cc23bdb8fdf 5 364 | 
dd1c9b6c-c794-4987-b9d6-dc32d3772db4 5 365 | e03711d9-e0f9-4fe3-9dfd-abbb67c690c1 5 366 | 8b7ddba1-5ad6-4cce-b86f-6436388d2ab9 4 367 | 0cf5e80a-8251-4169-a1fb-a06aa2251e2d 5 368 | 4ae031b9-7226-475b-9e27-9e069f35b7f8 5 369 | 1518a8e6-721b-4a81-8140-e8e4532ffa02 4 370 | 3b7157b3-1aa9-4611-96e4-67b86349876e 5 371 | 9c7577da-c2be-45f3-a75d-0dbfe6111969 1 372 | 5a7236c9-9beb-4718-9397-68a6fca49260 5 373 | 71b6dd38-b9bb-49bb-a099-3d8f3d0aa07f 5 374 | ec1a3edf-76d3-4ec0-8659-9ec2be5e66c0 5 375 | 76fed137-afc9-4fa1-a24b-8bd6c9086244 5 376 | 2e79591c-0df4-475c-8e96-c7cae176c7e6 5 377 | c75989c7-7a46-4c92-a034-eda43f20dfa3 5 378 | e1956382-2e2b-46cc-998e-5576e5d6ec61 5 379 | 1e5de174-a45c-47f3-b4ca-8d4cfc8a90dc 5 380 | 42f0d511-9afb-4874-84a1-1660e08c2c5a 5 381 | 2c15e478-7764-43c7-8378-50f903c28b88 5 382 | 6616ac79-b0df-470c-ba2e-9ab48fc03287 4 383 | 9b8c4f91-8683-4924-a10b-2bcf9cd31e7c 5 384 | df4a557c-7be7-4cc1-ad94-4cbf5b4f1824 5 385 | 262612a3-7693-4291-b815-7534a8b2f832 5 386 | 6dbf90e0-518d-4694-a41f-efce8cc67724 5 387 | 2ee86bb4-bdde-4b09-a5c6-33776de29a05 4 388 | 1ef31326-7983-43fd-87b4-5b51a3c00765 5 389 | 6ee1cda0-bbec-4709-8031-a107e8bb4434 5 390 | e0a9c139-5739-4d7c-815e-4b75488374a9 1 391 | 2c59cc9c-7a86-4df0-8b7c-a64027eba4c5 5 392 | b95c9b15-d55a-4610-8e45-c4a2c6130356 5 393 | 4742e54c-6f9e-4059-b334-a7b1e6f967e1 5 394 | 1ba88839-00a4-47c2-87b3-aff9fbe6449d 5 395 | ad6e14a5-face-4281-a2f6-9d241276746d 3 396 | d5af638b-78a8-470b-ab77-cf66afeb8dab 5 397 | 770450a8-1f52-4f43-b143-6de0748f4369 5 398 | a1a95bf4-4786-421a-8b0b-a1b367b0174e 5 399 | 08f39e61-0914-4e3b-8344-684d02379680 2 400 | 28bccd02-2272-47c2-b666-9c1af9b605c9 5 401 | 872c6cd8-974b-417a-ae44-b085151359f2 5 402 | 771d3df9-74bb-4afa-9fd1-f356f3a98495 5 403 | 5f0999d7-7ef3-45ff-a8cd-ae77d448b104 4 404 | 866d4f6d-6471-478a-a32f-791750041f39 1 405 | c3eb3913-fe60-476e-b265-bee406c7ec69 3 406 | da6778ea-e6e4-4b17-b7ec-5aa554f7dd2f 5 407 | 29f9e887-f401-45e2-9b28-c1d2b7d384b8 2 408 | 
150bd722-1fac-4517-99c6-df25e3eb0b10 5 409 | 3a71095f-5b80-42d4-9352-35dc90328ee9 5 410 | 222e92eb-0fe5-47f9-8684-4e8bc9c91999 5 411 | 2ee6efe6-a41f-40a4-bf51-4c654906dd2e 5 412 | 78a7b74e-dd4e-4ef4-9818-2ec1f25b149d 5 413 | 195c681f-d101-460d-8714-dda45eb4a4ef 5 414 | 5a415c97-1263-49d6-a511-66672090a32f 1 415 | f4f47c3e-bd82-4fe6-a8fa-27e72f0d4d10 5 416 | cc547c9b-2aa9-4687-9a47-c6b2e652a5aa 1 417 | c36bbb95-a45c-4748-96bf-62ec0f1846e5 4 418 | 9b38ce05-d944-467f-b881-7c5af8c52376 1 419 | 4e2e0f47-7237-4a78-9726-ebb220ba9e25 5 420 | 91e068a8-bbde-423a-9d2c-8b10bd2ae3e3 5 421 | 03fa3a18-1846-4e77-ab3f-bdcdcf9e4513 4 422 | f823b09f-e6a5-40c2-ab6f-8409e8bd5c6b 5 423 | 0a7c3e45-a112-42b6-a827-f77f5ad91ae9 1 424 | b269d473-151f-48d7-b9d1-5a755a327403 5 425 | ae174371-0ac0-4a15-8470-b4fcf4ff9ab8 4 426 | c25e523f-0b6e-4724-9660-2d954767ad3c 5 427 | 9f797c06-e168-4e6d-b77a-10ce1ceb73bc 2 428 | 65312d76-18f4-4157-b7d9-2bf795fc298b 5 429 | eadc7bee-77e9-46cf-b46a-21606e58a6ed 5 430 | 5b999de3-7248-4395-aaca-d56f42cef495 5 431 | 4b3243ea-c27d-4d9e-9c6f-d219cd266d31 5 432 | 88aaf95c-9ee6-4c5f-acbc-cec905f851c0 1 433 | 8059a81f-a5d5-4d59-a8cb-f365ca7a2f00 5 434 | 7fa25888-6a85-4eda-8510-ca0b8094b9bf 5 435 | d1d4442b-8f6e-479d-a780-eb800a0a6fda 3 436 | 1fbfbbf9-bf62-4ee9-9083-c9808a85ac7d 5 437 | 700664e4-d8ef-4a07-98d4-cd8e75535d97 4 438 | 1f3f51e2-b970-4360-831b-64e052c1f03b 5 439 | 3b9136af-e6f4-4fe8-bee1-a3a64ca054e7 1 440 | d4778c2e-9cea-4443-b0fd-615f13e4a5ec 3 441 | f8ff7ad0-0eb8-4d69-9fdd-d87aecbb0d27 3 442 | 10327f41-cc0a-4c2e-ba62-203be668a4e3 5 443 | 4104232d-98fa-428c-b565-dd7f85792121 5 444 | d560b147-9920-4cd3-aa4d-299a3514431b 3 445 | c10ed89e-4a99-4fc7-b82d-b0b400d6014d 5 446 | bed0ab71-557d-40d5-b4a7-8a63cdd30661 2 447 | 8cf0b389-3033-49e3-88e8-79fc5c3aa13a 5 448 | f3f0bd20-5ed9-4220-b838-b28d7df0c7f1 5 449 | 9b8b611b-05c6-4012-9c37-7fd6bf02e8fb 3 450 | 569af6de-d51f-419b-be01-45c9bfc29294 4 451 | c70cf489-22bf-4912-9fd9-a387cf15e0f0 1 452 | 
c910b932-fbd4-4a5d-beaf-89054c3cc5df 5 453 | 067fda12-a898-4d12-846c-0a98d42734b6 5 454 | c2273fa4-8a03-4fa3-b97e-c1ef73e976c4 5 455 | 8cb52bf5-2c86-4e17-a063-6c85207ba922 4 456 | 69068ffd-f995-4f4b-baf9-1698fc4b9a12 5 457 | ae255001-97fb-4959-93f6-fc05d2d1b1e0 5 458 | 6b1c35c1-83f9-425f-8c11-3df32bd7de43 4 459 | 802f4ee7-b017-4c39-b9d5-7321496f07a6 5 460 | 90659100-f58e-415e-a389-9d8fa84204b5 5 461 | 3e1d6e5a-992d-4c0b-b361-ef2a9df6d688 5 462 | 02048d7b-5d70-437e-a23c-ca5cbcfbd357 5 463 | 883b7378-f733-4f73-9910-c6a82170d882 4 464 | ccfec946-c11e-43f4-ae0b-cb2fa616eaa2 5 465 | aa0101e5-7d3c-4560-82f5-12c047f4651b 5 466 | 5420f885-e52f-4cde-8b11-876110299ae5 5 467 | 7982ffce-b3c8-4dab-a1a2-e3cd429fea78 5 468 | 6ba86d0a-d5f4-4660-98c1-4d31f5dbe817 5 469 | 0a8e8485-dee6-4cbf-9803-7413ace20c97 5 470 | 3e78b304-b02f-426c-9e2a-287593aea1e8 5 471 | a95739cd-aa4f-4cbd-b752-b6c1f583f62b 5 472 | ab4d76ae-ed22-4630-9afc-d0db02949c03 5 473 | 73c6f830-7d7c-4ed5-9805-15baa3de97fe 5 474 | cbb67e27-5c16-43f1-a725-a94e52922264 1 475 | bbda2255-009f-4e54-ba4a-7f0df689a3a9 4 476 | 6a119ac0-8a06-4cd1-85bb-46dc64c388b7 5 477 | 7aedf762-f413-4b5c-820e-20965e99e9da 5 478 | 838a9902-f7af-4414-ac03-0ec2da33a174 5 479 | 1685a6a2-a482-4d5c-bbfa-beff6f31a36c 5 480 | 7cd5891e-b4a7-4765-8879-5bddfc1eae40 5 481 | 28cc0d9d-2ff0-45ee-ade0-c182b4ec5e1a 3 482 | af1cb938-5a85-44d6-929e-7d0683086f1a 4 483 | c78044fe-d5d7-4132-a1ed-af4359b2a6d1 5 484 | 2b8c9707-0b88-4ca5-b5c6-5c286e6d001f 3 485 | 661302d8-c7de-418b-9d03-3fc0217b46b2 4 486 | 4605d3bb-a3ef-4c14-a5d8-8ee49634cc0e 4 487 | bc7e479f-c4c0-40f6-973c-4eaa5ae680a9 1 488 | df4fb301-6c31-4c99-8977-0a8c7801bc19 5 489 | c7f3f86a-1de7-4ee4-8b75-96228b9d6bba 4 490 | 3a9eaa75-41fe-4140-b7bb-c8f70badfcf7 5 491 | 002c7a2c-d07b-40e4-83d2-140d18db5387 3 492 | fc151034-9330-40d6-8dce-8207ec78c3bf 2 493 | ca944bb8-fe2b-444f-92dc-4726e434c197 4 494 | 737082ee-9363-4c80-98ef-1f2cd25a27d1 2 495 | b7856c82-500f-4cbc-a790-4a931a20f8c5 5 496 | 
877bf80b-f1fc-42f9-9d8b-8b7f523d426d 5 497 | 212ef621-603f-4911-9c36-8d0f369250c7 5 498 | c0eddca4-8ad8-4ca5-9ab1-404050dde240 5 499 | d8969393-3567-4cc7-a979-2373c8725738 5 500 | -------------------------------------------------------------------------------- /assignment2/data/ratings-3.tsv: -------------------------------------------------------------------------------- 1 | 09d08220-7f1b-43f5-ab9c-9d70333bd831 5000 2 | b2802cae-3ff9-461b-9fe1-546b5287ae15 2 3 | 9455018c-a41c-4d57-8f52-6672e23fd879 5 4 | cfc66795-e9e3-4fa3-b59e-ad90bb113719 2 5 | d2d88575-d803-4df9-9c7a-f581c343bac8 5 6 | a2364dad-c4a9-4740-b033-575ac28ca327 1 7 | 4918d18b-2c32-45e6-947e-04328d1fc016 5 8 | ddd7c9cc-7099-407a-9de3-8123b5b38062 4 9 | 4c634bc8-98fb-4690-9220-c4896e5458f5 4 10 | ec6f0f08-abda-4974-95a7-9da0fdbd0e42 5 11 | b56b07dc-968c-41f1-bb5f-9d57a236f949 1 12 | de68a76f-a65d-4daf-a37c-ad5db06b2667 3000 13 | 4ed81c48-aec4-4ba8-8ddd-5e36e040e7e0 4 14 | 77633fc9-ca35-4113-b334-42930f3baa32 5 15 | b4be6022-72de-469a-9793-90100d06dbc0 5 16 | 44b4a847-a128-4e20-9d1e-c32a369276f4 5 17 | fb63019e-8753-40f5-890e-bab67949fd03 2 18 | c5263109-1501-4963-82e9-05bfdf0b1f80 1000 19 | 3066c74e-a9f1-4e3a-af08-d6a06b65fbef 5 20 | de9f055f-56ba-47c6-9632-6a0d2f41c3fe 5 21 | abb062e5-a118-40a1-af4b-70ac42525033 5 22 | 6d9f9d48-8984-4594-8357-f256903c1d24 1 23 | 30d0579d-7d1e-4420-b4af-7820f26d07e7 4 24 | 12687240-b315-42a5-8797-12ddd5800cd0 10000 25 | ae51fb44-156a-4ed2-8b4d-88da1acf1f2b 1 26 | 605c2638-f159-462b-a3fc-6ad1d93920cb 5 27 | 00e8b63e-a6f5-4fa5-a05b-b2e91e9c8ea5 5 28 | f7de7d41-ce58-45f6-b283-0a5047de7667 5 29 | 64600f8e-784b-4833-9b3d-df8ad9059b00 5 30 | 80044598-caf1-4e2d-bca6-7d5472ae03ca 5 31 | 324c3799-75ea-49a0-a0b5-61a1a67f5d1b 5 32 | d154587e-e185-4f22-b299-6c92dc3f29c2 5 33 | 1f7dcbd0-631f-49af-aae5-9d902119b652 5 34 | b0e2fa0c-e1a8-4f46-91a3-e178bc338ae9 5 35 | 903d0d47-9ee1-465e-a882-9896b58d4b53 5 36 | 5ad05cb0-4750-4a59-a743-9ba1e86deac6 5 37 | 
27e1d531-071f-4cdf-b09e-a1a7201f7c83 5 38 | e3b7788d-1ffe-4f3d-8261-61c1e634cd6e 1 39 | fba382f6-92e8-4d83-b192-d8bb1d43db97 5 40 | 863f5a5d-b8d4-412c-9f56-9354d3fc4ac3 4 41 | 23a30c1d-11cc-40f8-8b4a-d47ce0ae7e25 5000 42 | ebbb33e4-8c54-4081-ad53-ab2a3fd6ab55 5 43 | 2f4659d7-4b98-4cbb-995a-56a6e708a211 5 44 | d7e1e702-6bd4-490a-91ca-521dfb0396b3 5 45 | 53f6695b-061f-4cd9-9f97-8f46ffa2d14f 5 46 | ff108c8e-4b0c-406c-bc3c-28264913e4bf 1 47 | ad6a5541-71d4-4dc8-81e5-28b4c6d90a60 5 48 | 2c43e76d-db1c-4056-a311-1f70501dbce3 5 49 | 83b7ba0f-afa5-41a6-9096-357fc1f85821 5 50 | a7fad65e-bd4a-41dc-a726-1f550051b23a 2 51 | 34b00498-ff09-4175-a101-95dcd39ad8e7 4 52 | f5dc753f-5ff0-4ea6-8b5d-5a0ff7aea486 5 53 | 201be5e1-02de-495e-9c63-1af4ded62b08 50000 54 | ee9f9b4b-3a9d-4911-a076-e9a9f56a71c8 5 55 | 12b08cae-da43-4954-84df-9297fb304fdc 5 56 | 475ea613-4d38-4597-b6fe-a358c8f12786 5 57 | bab3be6e-f3ef-4a7d-846e-a6f59dc0e085 1 58 | 371a5173-683e-48a0-b71b-58c7847ea917 2 59 | 4da8b371-4135-49f2-a0f9-5f91dc900caa 4 60 | 9b9213fe-cebe-4eac-a3ac-bd79fcf4bb2e 2 61 | 8db565b9-b625-46ca-a04b-fd48f4c1c726 5 62 | f550bb80-a6af-4b36-b2ec-5940bf0055f7 3 63 | 3ab485f1-8c4c-4fe6-9022-50548eddc78e 1 64 | d5c17875-a2fe-476a-8b18-b481bb40b569 4 65 | 80262769-8bb3-41b5-806c-eda6d5cad6ae 4 66 | 4d3dfd9e-1ea7-4136-81c7-0e55f1f6f1fb 4 67 | 953dcbaa-2364-44ec-8542-b35d1f49ce4b 5 68 | 1de45dc9-2e85-4e7d-aefc-35cbd36c79e3 5 69 | 118c00df-8b08-4794-8921-15463f361e91 4 70 | 2d4ea92a-5a30-48ca-9c83-dad54bcca288 5 71 | bb9c7649-379d-4c84-aadc-475a5293cea6 5 72 | 812a49d1-1503-4292-84a7-bce09f387969 5 73 | 56629e68-c383-4991-ab5e-f50a02c54985 5000 74 | 737159fe-af6a-4b1e-bd14-7ffc9a23fe2f 5 75 | 06defb3f-eaea-43a2-b7c1-73d88497d394 3 76 | 8e9ae965-0378-454f-a80b-94a9c58434aa 1 77 | 5159ad9c-111d-4244-afe7-34f94235e75c 5 78 | 7dd93199-4b31-4b7c-aea5-4c8cc18b9a9c 5 79 | 6979a062-06f4-42ec-b2f1-eba7ca41eeb4 2 80 | cf9515cc-9c1c-4ac8-93b0-2a250bdd64ce 1 81 | 0eb7f81f-33fe-4c58-ba02-0641dc8a224e 5 82 | 
e8a71495-e12f-45b9-852f-6e5db9738b16 5 83 | 66f5a06c-dda5-48be-a885-1d36f9810a04 5 84 | 926ead60-c6ba-4a9d-9954-8f907973934c 5 85 | 19fcbf03-dd2f-4765-a9ae-69b8d5a26c18 5 86 | a36095fa-6c76-4805-9686-87fd24015cf9 5 87 | 4ffa4495-4537-4a24-89f5-119e8089e110 1 88 | c1e47ebd-68d9-455e-8399-14303ad9baa0 5 89 | 67b1abe4-94c1-4bd3-a54c-c2daa4fc69b8 5 90 | b78ba370-dbc4-4cf1-bb86-82a0565e890b 4 91 | 710af073-dda5-46d7-af7a-cf989916fb49 4 92 | 463c58fc-70da-469c-85c0-0e3ab7fe5c75 4 93 | 8d180f7a-6afb-4d29-8b30-e0db86fe9abb 5 94 | 613c00ca-6923-42b8-ad0e-d25d084e749f 5 95 | cc93aeef-5039-4e2f-ab6f-cbe40b9ee659 5 96 | bfc43db3-53be-42eb-82cd-182f84d41bf5 5 97 | 0d27de38-b45d-4348-b996-d0c61ed42381 5 98 | ecd51f89-4f99-424f-bebe-bec79ec28c73 5 99 | cc367014-35f1-4358-becb-e8ca7957cb23 4 100 | 09b8bcea-6d0c-4709-bc67-8fc78087d646 5 101 | 232ac5ca-5b6b-4bb3-b63e-854b9f44e4bd 5 102 | 06745f5c-d06f-41e6-bbd9-83caccc3be1a 1 103 | 874f17fa-eca0-4ab8-bbee-a49c70145646 5 104 | adc59d80-303c-46d7-aba6-b104f4a4f934 5 105 | 1ea479ee-ca57-4941-b9b7-f60fd0e6430f 5 106 | 6af71cf1-953a-4877-9cb2-b2719531c977 1 107 | f1d03854-3502-4c9f-ba4e-762a61f3b180 1 108 | e36a5f9e-2f9b-4564-9827-b3d9f55453ce 5 109 | 25b033ca-7f99-406a-9cfe-25746c56fc6f 5 110 | bbb39e5f-0f30-42be-b028-9c84733f546e 5 111 | c82e02dc-bd25-4059-8d9f-876c9724107e 5 112 | 1b656987-a7f8-4453-ba3b-f7caa90321ca 5 113 | be83b9bc-02f6-419d-9efd-47ce1671a32e 5 114 | a5d20722-a98a-4031-9612-01160649054d 4 115 | b4417be2-2e7d-4de6-9dcf-443d6fe34b27 5 116 | 027db866-fc96-4aed-bc33-1dcf88cf6b6f 5 117 | ee68ec35-412b-4722-bd16-74370a289985 5 118 | b193216a-7af2-4937-b796-8417b6182c52 5 119 | 6681efd3-8424-4b99-b1b6-3c5e9faf70c4 5 120 | 27d2d9c9-78aa-4f3d-bc4b-5dd363607f12 3 121 | 4ff8461a-83d2-416e-869e-834c8606f1bb 2 122 | 9bed45ef-f7b2-4b6a-b502-6fcd6b116a55 5 123 | 0b32a899-b61e-4b47-95da-315a156a0a3e 5 124 | 50a50ecf-8e50-44b9-baca-64287ae79277 5 125 | a77f272b-ae66-449d-a135-37ca56ee3e24 5 126 | dda7992b-5ad2-4238-bf52-d8b2fec36057 
5 127 | c07b056f-fab0-4ba7-a2aa-d05bf861a8ff 3 128 | caea3a80-8937-47a8-b16f-20aee31deeb5 3 129 | 7c0a527d-ece2-4681-9afe-140f1e9c0ed3 5 130 | d582c666-e9d6-41b0-93e9-dbaa2b816759 3 131 | 72132676-ad55-4901-b102-b30fa50b1592 4 132 | b922160c-7331-4433-9068-54e54f375821 4 133 | c0312efa-995d-4232-85ca-1f1b170272b3 4 134 | 34d6f900-6261-470e-9fcf-a353157c8dac 5 135 | 433a695c-838c-45bb-b398-82d0c3c866bc 5 136 | beea6ce0-fbcf-4d6a-8fd3-2a2d8988e75d 5 137 | f77c24b9-a880-45e4-be10-10c57bf6dacc 1 138 | 421e76ce-595a-4ba0-9c4d-d2b420fc4486 5 139 | ad6b33cc-d083-4a79-9a38-36747172bf8b 5 140 | f001fb8c-96a9-49f5-84a0-beb2682a18ba 1 141 | d4c8cd0a-9fe3-491e-a245-4af8aeeafbc8 5 142 | c9456ff9-61f5-4739-9b46-37d8a8ec6f49 2 143 | 35fcda98-9deb-47ae-bec7-9dbbc753d310 5 144 | 57982772-2029-48dc-9da2-cb595024b77e 4 145 | 63511115-001f-44c3-a3ee-1191ce59e20c 5 146 | f472e28b-0c17-42c6-aad1-8f7e70d7b073 5 147 | 95e9d4a9-e598-4d13-8819-a6cab4cf9772 1 148 | 19f78658-0540-4bab-89f9-a22dcb27c023 5 149 | b523452f-5974-44c0-ad1a-a052b011e34d 5 150 | a0814b13-654b-4735-805f-0db842c6414b 5 151 | dbf19161-12fb-4cfd-a3c6-3a999f23485f 5 152 | 414af967-9c18-4e1c-a6ca-f6404c6e1e75 5 153 | 75dd45f4-cd91-45f2-acf6-1c24764c4e6f 5 154 | 21f5ff9a-726c-4e74-b6c7-8b747bac575c 5 155 | ff4a01b6-242e-4775-b31a-3f4f5ca27898 5 156 | 12c03c0a-43b7-48d8-909a-467254e2e635 5 157 | a0af171e-5a6d-4e8a-a8ff-b50419c428bf 2 158 | 9026e8df-fc2d-437b-b1b1-225eeeaac5e4 5 159 | 7def10ce-101d-4117-bfd2-44aeb80bb73a 5 160 | adcc1f23-9130-4e28-97dd-95c06b03f138 2 161 | 42f1a61b-2703-498f-8afc-43aac659554c 2 162 | 7eb48d51-e780-4f7e-a429-ceb397672262 5 163 | 73399229-02b3-4017-a0bd-43577188acb0 5 164 | 13998d8e-7f81-4faa-b722-5f2560275d84 4 165 | c653e9e2-d746-40ab-add5-3842a68d18c7 4 166 | 6650b472-0f70-4049-b582-d4fd94f1e84c 5 167 | 43581635-b6d6-4bcf-a883-646db6a241ce 4 168 | 65fb3482-ba65-487c-9b0f-6cf41869257a 5 169 | 7997635d-7c2d-47f6-a8e1-477deaf34deb 5 170 | 26dd3f25-668d-4ccc-b2ad-2b63e8d4a902 5 171 | 
0a3b69ba-0041-4467-8eb7-4603e4e41c4c 1 172 | 6f2f2af0-0c96-44d6-9e04-c82436a14dcb 3 173 | f0204266-2b55-428b-80ca-2d01d6ad7f82 4 174 | f39c382f-e9c3-4ed1-822a-1c3b41d9a1c1 4 175 | fb716b0a-344b-48f5-90f4-92db51c00dde 5 176 | 21f8760b-a8dc-42c2-af8e-df110e0b53b4 5 177 | db849e5b-7d47-4e1d-999d-2b39a6c0a79e 5 178 | 7f1205bf-33a5-4f7d-b030-32c6b9333c20 5 179 | 02b2181a-5c46-4769-bb18-0062d25c1da3 5 180 | 300d14db-dbc1-42c5-83c8-26799e9ee9ca 3 181 | 41f50d5b-42b1-4985-927d-268ae7b63668 5 182 | 639cedc1-b14f-49e1-a39c-226cd7f1d394 4 183 | 93d2a244-a8d8-432f-b60b-0869ac4a7fe5 5 184 | 4305771d-22ce-44da-86d4-0859db6cf7b9 4 185 | b034d5d0-b10e-425f-93aa-45f7e353132c 4 186 | e524f22f-b873-439c-ac70-8de5ddf89d35 1 187 | 5e80cc7b-7875-4596-9b12-6055203b9a10 3 188 | 3fcfc5d0-8c10-43ae-9ef5-50427556c6ed 5 189 | d96a0442-a1a6-4ea8-974f-4b11e3a674b8 3 190 | a4542301-9253-4089-857f-8542dba492f1 5 191 | 1504de06-54de-48cb-a5bd-2e0f5ab66a9a 5 192 | 8cc29b24-9861-4857-8542-e8ba332d6db2 5 193 | b51aaadf-18ef-4e91-af08-69d6ed915a2c 5 194 | 11a515c5-2fb2-40d2-b780-a5efb9cfcef3 5 195 | 295fe272-0b87-43db-8f6f-c39db455185b 5 196 | c254b9e5-80a4-40f3-895c-618e51a2800b 5 197 | 357c8a7e-f6d4-4bf7-8051-1e7880337896 5 198 | e9ec6880-0443-4db8-bd33-d4629c821cdd 4 199 | d764e032-e8ae-4fa7-adcc-773415342a4b 5 200 | c9f54bc9-6103-48b3-9031-96543c412e47 3 201 | 02c0fbd0-795d-4da2-8f4f-58e0930df727 5 202 | d482d3ed-055a-40df-90b3-5f196c2891e0 4 203 | f414b8e5-6307-4d9f-82c8-b4770fc3c5f6 1 204 | 4944ccb5-d0cc-4a97-a2aa-092019e9d571 5 205 | 0decf9f2-5828-488f-bb3a-6d3ea4bf440f 5 206 | 58c13fe9-147b-4542-8678-3c295e5b3a22 4 207 | af824988-667c-4208-9810-407bbde6ad02 5 208 | 5b272de5-2034-4cd3-bef3-72a2f77d4495 5 209 | e5abb4a5-ff0a-4da9-b820-e81b1bd28d53 5 210 | 92fca88e-0f91-4170-8650-4780cbd4f080 5 211 | 5085f036-0861-4986-9c48-8e1f5518b7da 5 212 | 8c866c49-eb7e-419e-89b9-7fa922f18f27 4 213 | cd78c10e-7d26-478a-ae0b-bd999ccfcdde 5 214 | 797be056-fd8b-4979-8a5a-6a78eb486f40 3 215 | 
6807bda5-4df8-49e4-a99a-c13354739dec 5 216 | b8e52b05-5b4c-4bf2-bc86-f8981200442f 5 217 | 7a429f6f-0c61-47bc-bedc-a3a9fc9d7d91 5 218 | 59aabd81-771b-47c0-b117-ededfa318040 5 219 | 2741c230-14b8-49ce-8dd5-cde428a9fa10 5 220 | aa8f5dfa-e9f3-4b47-acae-7f238b2c3f8f 5 221 | 52e8b098-a074-4bcc-b73d-90c8c1ec1f63 5 222 | 768aa12e-d10e-4d7b-841a-c65fb24e52cf 5 223 | c6da3828-da52-4111-af4d-7670b40d1ba8 3 224 | 7a6faaee-a05d-4671-a773-9c2cac07aafd 5 225 | ee216f6e-3a00-4ee8-9ddf-3247ddc819cc 4 226 | 7791834b-64c0-4bc0-97db-06b62eb16599 5 227 | f83d88d0-3e97-417b-9b74-55d6262e58b2 5 228 | 79f06680-3022-4e50-be92-9410de37ba54 5 229 | 877fabff-dcbe-440a-9934-f295dd05b155 5 230 | 404c178b-9ed5-4271-aadf-d0826c492b5a 5 231 | 75e59a5f-4df6-455e-b92b-182ee52b6a65 5 232 | e58ff289-8aef-457d-b2f9-bf7be836eb05 1 233 | 030b59b8-85a1-4b2f-ad81-a977bb111222 5 234 | bf731dbb-7087-499a-84f0-e8a6aafcc03e 5 235 | 765dbf23-dd38-4287-878b-2f9ed5cd6c05 5 236 | b6a65567-3b5d-4042-a059-a807461fdcfb 5 237 | 835f64b8-38ae-4a35-9ae3-bd80e8a52eb2 3 238 | 7eb23580-1029-431a-ad06-f9a64b17be73 4 239 | 2de6eda1-0ee6-4c29-8554-752f89295874 2 240 | 15d2620b-4cd6-48a9-9b20-bda00424f136 5 241 | ec86db96-3e5b-4de8-88cc-060911299d5b 3 242 | e57a07d9-c419-4f17-bc74-97d51f6e7389 5 243 | 3e37dff6-711e-4c72-8fc7-8131df880454 4 244 | eef18854-1a1d-48e8-8780-76ebb1c268a5 5 245 | c22abb05-2b91-4941-8b6b-ec72801f99fb 5 246 | b927dd4c-5196-41d4-8bd6-b54aafe7faa1 5 247 | 02a7d169-8925-4d3a-9dd8-253c1ef37e5b 4 248 | def6de43-649a-47c8-9b79-2dd94f21689d 5 249 | a2b774d3-5a75-47bb-91d8-59c8e3769f51 2 250 | 0e6c70dd-44d6-40e5-b635-360302acc275 5 251 | 1df86592-7c6b-4194-ae09-7a2ea30093a0 5 252 | d0876e7b-0267-4ac6-ba5d-e955bd2ffda1 5 253 | -------------------------------------------------------------------------------- /assignment2/regression.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment2/regression.png -------------------------------------------------------------------------------- /assignment2/requirements-catalina.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.14 5 | tensorflow==1.15.2 6 | tensorflow-data-validation==0.14.1 7 | -------------------------------------------------------------------------------- /assignment2/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.15 5 | tensorflow==1.15.2 6 | tensorflow-data-validation==0.13.0 7 | 8 | -------------------------------------------------------------------------------- /assignment2/task1.py: -------------------------------------------------------------------------------- 1 | from components.schema_validation import infer_schema_from_csv, has_anomalies, adjust_product_schema, adjust_rating_schema 2 | 3 | # We infer the schema from the first data file 4 | product_schema = infer_schema_from_csv('data/products-data-0.tsv', column_names=['id', 'category', 'description']) 5 | 6 | # We adjust the schema with some constraint that the automatic inference might not have captured 7 | adjust_product_schema(product_schema) 8 | 9 | # We use the schema to check for anomalies in subsequent data files 10 | assert not has_anomalies('data/products-data-0.tsv', product_schema) 11 | assert not has_anomalies('data/products-data-1.tsv', product_schema) 12 | assert not has_anomalies('data/products-data-2.tsv', product_schema) 13 | assert has_anomalies('data/products-data-3.tsv', product_schema) 14 | 15 | # We infer the schema from the first data file 16 | rating_schema = infer_schema_from_csv('data/ratings-0.tsv', column_names=['id', 'rating']) 17 | 18 | # We 
adjust the schema with some constraint that the automatic inference might not have captured 19 | adjust_rating_schema(rating_schema) 20 | 21 | # We use the schema to check for anomalies in subsequent data files 22 | assert not has_anomalies('data/ratings-0.tsv', rating_schema) 23 | assert not has_anomalies('data/ratings-1.tsv', rating_schema) 24 | assert has_anomalies('data/ratings-2.tsv', rating_schema) 25 | assert has_anomalies('data/ratings-3.tsv', rating_schema) -------------------------------------------------------------------------------- /assignment2/task2.py: -------------------------------------------------------------------------------- 1 | from components.beam_job import create_and_run_beam_job 2 | from os import path 3 | 4 | # Invoke a beam job to join products and ratings, filter them according to the instructions, 5 | # and count the number of entries per category 6 | create_and_run_beam_job(path_to_products_file='data/products-data-0.tsv', path_to_ratings_file='data/ratings-0.tsv') 7 | 8 | 9 | # Validate the outputs 10 | assert path.exists('category_counts.tsv-00000-of-00001') 11 | 12 | counts = {} 13 | with open('category_counts.tsv-00000-of-00001') as results_file: 14 | for line in results_file: 15 | category, count = line.strip().split('\t') 16 | counts[category] = int(count) 17 | 18 | assert 'Kitchen' in counts 19 | assert counts['Kitchen'] == 217 20 | 21 | assert 'Jewelry' in counts 22 | assert counts['Jewelry'] == 148 -------------------------------------------------------------------------------- /assignment2/task2_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "task2-todo.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | } 13 | }, 14 | "cells": [ 15 | { 16 | "cell_type": "code", 17 | "metadata": { 18 | "id": "ejZtKRkQWHEh", 19 | 
"colab_type": "code", 20 | "colab": {} 21 | }, 22 | "source": [ 23 | "import argparse\n", 24 | "import os\n", 25 | "import pprint\n", 26 | "import tempfile\n", 27 | "import urllib.request\n", 28 | "import zipfile\n", 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "\n", 31 | "temp = tempfile.gettempdir()\n", 32 | "zip, headers = urllib.request.urlretrieve('https://raw.githubusercontent.com/schelterlabs/deml-lab/master/assignment2/data.zip')\n", 33 | "zipfile.ZipFile(zip).extractall(temp)\n", 34 | "zipfile.ZipFile(zip).close()\n", 35 | "urllib.request.urlcleanup()" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "Sx_D-0a2axzu", 44 | "colab_type": "code", 45 | "colab": { 46 | "base_uri": "https://localhost:8080/", 47 | "height": 1000 48 | }, 49 | "outputId": "10d1956d-da67-4aa6-80e1-f05e12e7099a" 50 | }, 51 | "source": [ 52 | "!pip install apache-beam" 53 | ], 54 | "execution_count": 3, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "text": [ 59 | "Collecting apache-beam\n", 60 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/82/b3/b6dcbd94bf8a5ae6a0be5fc988bdfb0b0dfb87ea37e788dc4dcc039a3aee/apache_beam-2.16.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", 61 | "\u001b[K |████████████████████████████████| 3.0MB 5.1MB/s \n", 62 | "\u001b[?25hCollecting mock<3.0.0,>=1.0.1 (from apache-beam)\n", 63 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl (56kB)\n", 64 | "\u001b[K |████████████████████████████████| 61kB 28.4MB/s \n", 65 | "\u001b[?25hRequirement already satisfied: protobuf<4,>=3.5.0.post1 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.7.1)\n", 66 | "Requirement already satisfied: pytz>=2018.3 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (2018.9)\n", 67 | "Requirement already satisfied: 
pyarrow<0.15.0,>=0.11.1; python_version >= \"3.0\" or platform_system != \"Windows\" in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.14.1)\n", 68 | "Collecting dill<0.3.1,>=0.3.0 (from apache-beam)\n", 69 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/39/7a/70803635c850e351257029089d38748516a280864c97cbc73087afef6d51/dill-0.3.0.tar.gz (151kB)\n", 70 | "\u001b[K |████████████████████████████████| 153kB 49.2MB/s \n", 71 | "\u001b[?25hCollecting avro-python3<2.0.0,>=1.8.1; python_version >= \"3.0\" (from apache-beam)\n", 72 | " Downloading https://files.pythonhosted.org/packages/76/b2/98a736a31213d3e281a62bcae5572cf297d2546bc429accf36f9ee1604bf/avro-python3-1.9.1.tar.gz\n", 73 | "Requirement already satisfied: httplib2<=0.12.0,>=0.8 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.11.3)\n", 74 | "Collecting hdfs<3.0.0,>=2.1.0 (from apache-beam)\n", 75 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/82/39/2c0879b1bcfd1f6ad078eb210d09dbce21072386a3997074ee91e60ddc5a/hdfs-2.5.8.tar.gz (41kB)\n", 76 | "\u001b[K |████████████████████████████████| 51kB 25.2MB/s \n", 77 | "\u001b[?25hRequirement already satisfied: grpcio<2,>=1.12.1 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.15.0)\n", 78 | "Collecting oauth2client<4,>=2.0.1 (from apache-beam)\n", 79 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c0/7b/bc893e35d6ca46a72faa4b9eaac25c687ce60e1fbe978993fe2de1b0ff0d/oauth2client-3.0.0.tar.gz (77kB)\n", 80 | "\u001b[K |████████████████████████████████| 81kB 30.4MB/s \n", 81 | "\u001b[?25hRequirement already satisfied: pydot<2,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.3.0)\n", 82 | "Requirement already satisfied: crcmod<2.0,>=1.7 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.7)\n", 83 | "Requirement already satisfied: pymongo<4.0.0,>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.9.0)\n", 84 | 
"Collecting fastavro<0.22,>=0.21.4 (from apache-beam)\n", 85 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e1/28/0206330c0002b1e28e21473117d0dc813defbd5891562d27af5c68c93899/fastavro-0.21.24-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)\n", 86 | "\u001b[K |████████████████████████████████| 1.2MB 30.7MB/s \n", 87 | "\u001b[?25hCollecting python-dateutil<3,>=2.8.0 (from apache-beam)\n", 88 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl (226kB)\n", 89 | "\u001b[K |████████████████████████████████| 235kB 55.1MB/s \n", 90 | "\u001b[?25hRequirement already satisfied: pyyaml<4.0.0,>=3.12 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.13)\n", 91 | "Requirement already satisfied: future<1.0.0,>=0.16.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.16.0)\n", 92 | "Collecting pbr>=0.11 (from mock<3.0.0,>=1.0.1->apache-beam)\n", 93 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/46/a4/d5c83831a3452713e4b4f126149bc4fbda170f7cb16a86a00ce57ce0e9ad/pbr-5.4.3-py2.py3-none-any.whl (110kB)\n", 94 | "\u001b[K |████████████████████████████████| 112kB 51.3MB/s \n", 95 | "\u001b[?25hRequirement already satisfied: six>=1.9 in /usr/local/lib/python3.6/dist-packages (from mock<3.0.0,>=1.0.1->apache-beam) (1.12.0)\n", 96 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf<4,>=3.5.0.post1->apache-beam) (41.2.0)\n", 97 | "Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.6/dist-packages (from pyarrow<0.15.0,>=0.11.1; python_version >= \"3.0\" or platform_system != \"Windows\"->apache-beam) (1.16.5)\n", 98 | "Requirement already satisfied: docopt in /usr/local/lib/python3.6/dist-packages (from hdfs<3.0.0,>=2.1.0->apache-beam) (0.6.2)\n", 99 | "Requirement already satisfied: requests>=2.7.0 in /usr/local/lib/python3.6/dist-packages 
(from hdfs<3.0.0,>=2.1.0->apache-beam) (2.21.0)\n", 100 | "Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (0.4.7)\n", 101 | "Requirement already satisfied: pyasn1-modules>=0.0.5 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (0.2.6)\n", 102 | "Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (4.0)\n", 103 | "Requirement already satisfied: pyparsing>=2.1.4 in /usr/local/lib/python3.6/dist-packages (from pydot<2,>=1.2.0->apache-beam) (2.4.2)\n", 104 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (3.0.4)\n", 105 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (2.8)\n", 106 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (2019.9.11)\n", 107 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (1.24.3)\n", 108 | "Building wheels for collected packages: dill, avro-python3, hdfs, oauth2client\n", 109 | " Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 110 | " Created wheel for dill: filename=dill-0.3.0-cp36-none-any.whl size=77513 sha256=e6dfbeb0c7e7fbd0bd6d8d837ccca6a2b6e855d83616aa6909015337e47dcd62\n", 111 | " Stored in directory: /root/.cache/pip/wheels/c9/de/a4/a91eec4eea652104d8c81b633f32ead5eb57d1b294eab24167\n", 112 | " Building wheel for avro-python3 (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 113 | " Created wheel for avro-python3: filename=avro_python3-1.9.1-cp36-none-any.whl size=43199 sha256=232b7eb9d62dfdd8f81172a1f1897fc423b39aa4aa1dbf9be51871ef084bf44d\n", 114 | " Stored in directory: /root/.cache/pip/wheels/94/54/6f/a5df680fd3224aa45145686f3b1b02a878a90ea769fcf9daaf\n", 115 | " Building wheel for hdfs (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 116 | " Created wheel for hdfs: filename=hdfs-2.5.8-cp36-none-any.whl size=33214 sha256=ae5926ae12eee4e2f531509699a68794abfb0ad0c69e78f607c10b1e9cd72df8\n", 117 | " Stored in directory: /root/.cache/pip/wheels/fe/a7/05/23e3699975fc20f8a30e00ac1e515ab8c61168e982abe4ce70\n", 118 | " Building wheel for oauth2client (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 119 | " Created wheel for oauth2client: filename=oauth2client-3.0.0-cp36-none-any.whl size=106382 sha256=928061fecdba926e8924bdd25cfd436fc965e9752fd28fb37d948b6d73783338\n", 120 | " Stored in directory: /root/.cache/pip/wheels/48/f7/87/b932f09c6335dbcf45d916937105a372ab14f353a9ca431d7d\n", 121 | "Successfully built dill avro-python3 hdfs oauth2client\n", 122 | "\u001b[31mERROR: pydrive 1.3.1 has requirement oauth2client>=4.0.0, but you'll have oauth2client 3.0.0 which is incompatible.\u001b[0m\n", 123 | "\u001b[31mERROR: multiprocess 0.70.9 has requirement dill>=0.3.1, but you'll have dill 0.3.0 which is incompatible.\u001b[0m\n", 124 | "\u001b[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.\u001b[0m\n", 125 | "Installing collected packages: pbr, mock, dill, avro-python3, hdfs, oauth2client, fastavro, python-dateutil, apache-beam\n", 126 | " Found existing installation: dill 0.3.1.1\n", 127 | " Uninstalling dill-0.3.1.1:\n", 128 | " Successfully uninstalled dill-0.3.1.1\n", 129 | " Found existing installation: oauth2client 4.1.3\n", 130 | " Uninstalling oauth2client-4.1.3:\n", 131 | " Successfully uninstalled oauth2client-4.1.3\n", 132 | " Found 
existing installation: python-dateutil 2.5.3\n", 133 | " Uninstalling python-dateutil-2.5.3:\n", 134 | " Successfully uninstalled python-dateutil-2.5.3\n", 135 | "Successfully installed apache-beam-2.16.0 avro-python3-1.9.1 dill-0.3.0 fastavro-0.21.24 hdfs-2.5.8 mock-2.0.0 oauth2client-3.0.0 pbr-5.4.3 python-dateutil-2.8.0\n" 136 | ], 137 | "name": "stdout" 138 | }, 139 | { 140 | "output_type": "display_data", 141 | "data": { 142 | "application/vnd.colab-display-data+json": { 143 | "pip_warning": { 144 | "packages": [ 145 | "dateutil" 146 | ] 147 | } 148 | } 149 | }, 150 | "metadata": { 151 | "tags": [] 152 | } 153 | } 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "6u3tv7XLarX8", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "from __future__ import absolute_import\n", 165 | "\n", 166 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 167 | "import apache_beam as beam\n", 168 | "\n", 169 | "\n", 170 | "def create_and_run_beam_job(path_to_products_file, path_to_ratings_file):\n", 171 | " pass" 172 | ], 173 | "execution_count": 0, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "metadata": { 179 | "id": "6cL4oHrObJP6", 180 | "colab_type": "code", 181 | "colab": {} 182 | }, 183 | "source": [ 184 | "def path_to_file(file):\n", 185 | " return os.path.join(temp, file)" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "oIVWS2rpa_ef", 194 | "colab_type": "code", 195 | "colab": { 196 | "base_uri": "https://localhost:8080/", 197 | "height": 89 198 | }, 199 | "outputId": "62c7fa90-35ed-4d56-dacf-4ac9285292e2" 200 | }, 201 | "source": [ 202 | "from os import path\n", 203 | "\n", 204 | "create_and_run_beam_job(\n", 205 | " path_to_products_file=path_to_file('data/products-data-0.tsv'), \n", 206 | " path_to_ratings_file=path_to_file('data/ratings-0.tsv'))\n", 207 | "\n", 208 | "\n", 
def tokenize_document(_key, document):
    """Map function: split *document* on single spaces and emit one
    (lowercased token, 1) pair per token. The record key is ignored."""
    pairs = []
    for token in document.split(" "):
        pairs.append((token.lower(), 1))
    return pairs

def count_per_word(word, counts):
    """Reduce function: collapse the per-occurrence counts of *word*
    into a single (word, total) pair."""
    total = sum(counts)
    return word, total
def to_partitions(X, y, num_partitions, num_records_per_partition):
    """Split (X, y) row-wise into `num_partitions` partitions of
    `num_records_per_partition` records each.

    Each record is a keyed tuple ``(row_index, (X[row_index, :], y[row_index]))``,
    so downstream map/reduce code can recover both the features and the
    target of every row. Rows are assigned to partitions in order.
    """
    def record_at(row):
        # Pair the global row index with its feature vector and target.
        return (row, (X[row, :], y[row]))

    return [
        [record_at(p * num_records_per_partition + r)
         for r in range(num_records_per_partition)]
        for p in range(num_partitions)
    ]
of this [task](task1.py) is to complete the implementation of a neural network for classifying points from a synthetically generated dataset: 4 | 5 | ![](moon.png) 6 | 7 | You only have to implement the forward pass; the backward pass and weight updates are already given. The network is defined as follows: 8 | 9 | ![](network.png) 10 | 11 | Implement the forward pass in the [first fully connected layer, which applies a `tanh` non-linearity](components/neuralnetwork.py#L65), in the [second fully connected layer](components/neuralnetwork.py#L42) and in the [softmax output](components/neuralnetwork.py#L87). Finally, invoke your implemented methods to conduct the [full forward pass through the network](components/neuralnetwork.py#L9) and return the computed probabilities. You can execute this task via `python task1.py` 12 | 13 | ## Task 2, 3 & 4: Translating Scikit-learn Pipelines to Dataflow Graphs 14 | 15 | In the remaining three tasks, you have to implement [a method to convert scikit-learn pipelines into a dataflow representation](components/graph.py#L48). Given a pipeline, your code has to inspect it and generate a list of connected [DataflowVertex](components/graph.py#L1) objects, which represent the operations and dataflow in the pipeline. 16 | 17 | A vertex in this graph is defined as follows, where the `name` refers to the step name in the pipeline, the `operation` is the name of the transformer class which executes the step and the `parent_vertices` are operations that the current vertex depends on. 18 | 19 | ```python 20 | class DataFlowVertex: 21 | def __init__(self, parent_vertices, name, operation): 22 | self.parent_vertices = parent_vertices 23 | self.name = name 24 | self.operation = operation 25 | ``` 26 | Your method has to handle four different pipelines with growing complexity. 
The resulting graphs should look as follows: 27 | 28 | ![](graphs.png) 29 | -------------------------------------------------------------------------------- /assignment3/adult-sample.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year 2 | 28,Private,273269,Some-college,10,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,40,United-States,<=50K 3 | 58,State-gov,123329,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,16,United-States,<=50K 4 | 34,Private,79637,Bachelors,13,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,0,0,40,United-States,<=50K 5 | 71,Private,97870,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K 6 | 20,State-gov,41103,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 7 | 46,Private,125492,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Black,Female,0,0,40,United-States,<=50K 8 | 31,Private,467579,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,1887,40,United-States,>50K 9 | 24,Private,376393,Assoc-voc,11,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 10 | 21,Private,56582,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,50,United-States,<=50K 11 | 38,Private,76317,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 12 | 43,Federal-gov,144778,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,>50K 13 | 47,Private,454989,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K 14 | 23,Private,278254,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,45,United-States,<=50K 15 | 
38,Private,111499,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,50,United-States,>50K 16 | 31,Private,168521,Bachelors,13,Never-married,Exec-managerial,Unmarried,White,Female,0,0,50,United-States,<=50K 17 | 36,Private,749636,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 18 | 27,Private,167405,HS-grad,9,Married-spouse-absent,Farming-fishing,Own-child,White,Female,0,0,40,Mexico,<=50K 19 | 32,Private,317378,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,10520,0,40,United-States,>50K 20 | 55,State-gov,71630,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,1617,40,United-States,<=50K 21 | 33,Private,182401,10th,6,Never-married,Adm-clerical,Not-in-family,Black,Male,0,0,40,United-States,<=50K 22 | 21,Private,33616,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,25,United-States,<=50K 23 | 25,Private,362912,Some-college,10,Never-married,Craft-repair,Own-child,White,Female,0,0,50,United-States,<=50K 24 | 28,Private,34335,HS-grad,9,Divorced,Sales,Not-in-family,Amer-Indian-Eskimo,Male,14084,0,40,United-States,>50K 25 | 51,Private,305147,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 26 | 26,Private,50103,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 27 | 62,State-gov,221558,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,24,?,<=50K 28 | 37,Private,138940,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 29 | 55,Self-emp-not-inc,52888,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,10,United-States,<=50K 30 | 46,Local-gov,125457,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,38,United-States,>50K 31 | 47,Private,102771,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Portugal,<=50K 32 | 
60,?,41517,11th,7,Married-spouse-absent,?,Unmarried,Black,Female,0,0,20,United-States,<=50K 33 | 34,Private,153614,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K 34 | 32,Local-gov,157887,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 35 | 35,Private,308691,Masters,14,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States,<=50K 36 | 48,Self-emp-inc,238966,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K 37 | 67,Private,123393,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 38 | 48,Private,25468,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,50,United-States,>50K 39 | 30,Private,117393,HS-grad,9,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 40 | 40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 41 | 58,Private,259014,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,20,United-States,<=50K 42 | 19,?,134974,Some-college,10,Never-married,?,Own-child,White,Female,0,0,20,United-States,<=50K 43 | 25,Private,49092,Bachelors,13,Never-married,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K 44 | 33,Local-gov,224185,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 45 | 43,Private,136721,12th,8,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 46 | 37,Private,314963,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 47 | 21,State-gov,337766,Some-college,10,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States,<=50K 48 | 51,Self-emp-not-inc,111939,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,35,United-States,>50K 49 | 
43,Private,151089,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,<=50K 50 | 49,Private,120629,Bachelors,13,Divorced,Exec-managerial,Not-in-family,Black,Female,27828,0,60,United-States,>50K 51 | 38,Local-gov,201410,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K 52 | 49,Private,61307,7th-8th,4,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,38,United-States,<=50K 53 | 36,Private,135289,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States,>50K 54 | 36,Self-emp-not-inc,89622,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,80,United-States,>50K 55 | 21,Private,216070,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Amer-Indian-Eskimo,Female,0,0,46,United-States,>50K 56 | 42,Private,138662,Some-college,10,Separated,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K 57 | 35,Private,385847,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 58 | 20,Private,189148,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,48,United-States,<=50K 59 | 22,Private,252355,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,27,United-States,<=50K 60 | 46,Private,243743,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K 61 | 33,Private,290763,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 62 | 23,?,99399,Some-college,10,Never-married,?,Unmarried,Amer-Indian-Eskimo,Female,0,0,25,United-States,<=50K 63 | 44,Private,160829,Bachelors,13,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,20,United-States,>50K 64 | 46,Local-gov,329752,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,30,United-States,<=50K 65 | 52,Private,117674,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 66 | 
20,?,150084,Some-college,10,Never-married,?,Own-child,White,Male,0,0,25,United-States,<=50K 67 | 49,State-gov,203039,11th,7,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 68 | 38,Private,210438,7th-8th,4,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K 69 | 29,Private,163265,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 70 | 18,Private,43272,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K 71 | 54,Self-emp-not-inc,103179,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K 72 | 48,Private,449354,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,4386,0,45,United-States,>50K 73 | 29,Private,297544,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 74 | 30,Private,161690,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K 75 | 31,Local-gov,219883,HS-grad,9,Never-married,Protective-serv,Not-in-family,Black,Male,0,0,40,United-States,<=50K 76 | 40,Federal-gov,121012,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298,0,48,United-States,>50K 77 | 32,Private,207172,Some-college,10,Never-married,Sales,Other-relative,White,Female,0,0,40,United-States,<=50K 78 | 47,Private,148995,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,2415,60,United-States,>50K 79 | 19,Private,292590,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 80 | 45,Private,274657,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,?,<=50K 81 | 49,Private,189498,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 82 | 18,Private,25837,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,25,United-States,<=50K 83 | 
33,State-gov,306309,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,<=50K 84 | 48,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 85 | 30,Local-gov,289442,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K 86 | 55,Private,89690,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,>50K 87 | 47,Self-emp-not-inc,237731,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,2829,0,65,United-States,<=50K 88 | 72,?,402306,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,32,Canada,<=50K 89 | 27,Private,119793,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 90 | 44,Private,116358,Bachelors,13,Married-civ-spouse,Sales,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K 91 | 23,Private,55215,Bachelors,13,Never-married,Sales,Own-child,White,Male,0,0,55,United-States,<=50K 92 | 33,Private,184784,10th,6,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K 93 | 43,Private,269015,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,40,Germany,>50K 94 | 46,Private,146919,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K 95 | 90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K 96 | 34,Private,19847,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 97 | 32,Private,108116,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,60,United-States,>50K 98 | 42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K 99 | 18,Private,333611,5th-6th,3,Never-married,Other-service,Other-relative,White,Male,0,0,54,Mexico,<=50K 100 | 25,Private,50053,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,Japan,<=50K 101 | 
def pipeline_to_dataflow_graph(pipeline, name_prefix='', parent_vertices=None):
    """Translate a fitted scikit-learn pipeline into a dataflow graph.

    Parameters
    ----------
    pipeline : fitted sklearn Pipeline (or nested transformer) to inspect.
    name_prefix : prefix prepended to generated vertex names; used when
        recursing into nested pipelines / column transformers.
    parent_vertices : list of DataFlowVertex that the first step of this
        (sub-)pipeline depends on; defaults to no parents.

    Returns
    -------
    list of DataFlowVertex describing the pipeline's operations and the
    dataflow between them.
    """
    # Bug fix: the parameter previously defaulted to a mutable list ([]).
    # Default arguments are evaluated once, so a single shared list would
    # persist across calls and any implementation that appends to it would
    # leak state between translations. Use None and build a fresh list.
    if parent_vertices is None:
        parent_vertices = []

    graph = []

    # TODO Implement translation of the pipeline into a list of DataFlowVertex objects

    return graph
# Helper function to evaluate the total loss on the dataset
def calculate_loss(network, X, y):
    """Average cross-entropy loss of *network* on the labeled data (X, y).

    Runs a forward pass to obtain per-class probabilities, picks the
    probability assigned to each example's true class via fancy indexing,
    and returns the mean negative log-likelihood.
    """
    n = len(X)
    probs = network.forward(X)
    # Probability the network assigned to the correct class of each example.
    true_class_probs = probs[range(n), y]
    return np.sum(-np.log(true_class_probs)) / n
/assignment3/moon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment3/moon.png -------------------------------------------------------------------------------- /assignment3/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment3/network.png -------------------------------------------------------------------------------- /assignment3/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | numpy==1.17 4 | jupyter==1.0.0 5 | 6 | -------------------------------------------------------------------------------- /assignment3/task1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets 3 | 4 | from components.neuralnetwork import * 5 | 6 | np.random.seed(0) 7 | 8 | X, y = datasets.make_moons(200, noise=0.20) 9 | 10 | input_dimensions = 2 11 | num_classes = 2 12 | num_training_examples = len(X) 13 | 14 | size_of_hidden_layer = 3 15 | epsilon = 0.01 # learning rate for gradient descent 16 | reg_lambda = 0.01 # regularization strength 17 | 18 | network = NeuralNetwork([ 19 | FullyConnectedLayerWithActivation(input_dimensions, size_of_hidden_layer), 20 | FullyConnectedLayer(size_of_hidden_layer, num_classes), 21 | SoftMax(num_training_examples) 22 | ]) 23 | 24 | # Train the network with batch gradient descent 25 | for iteration in range(0, 20000): 26 | 27 | # Forward pass 28 | network.forward(X) 29 | # Backward pass 30 | network.backward(y) 31 | # Parameter updates 32 | network.update_weights(X, reg_lambda, epsilon) 33 | 34 | if iteration % 1000 == 0: 35 | print("Loss after iteration %i: %f" % (iteration, 
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

from components.graph import pipeline_to_dataflow_graph

# EXAMPLE 1: a one-step pipeline must translate into a single
# parentless vertex named after its pipeline step.
data = [[13.0, 0.0, 1.0],
        [27.0, 1.0, 0.0]]

binarizer_pipeline = Pipeline([
    ('binarization', Binarizer(threshold=5.0))
])

binarizer_model = binarizer_pipeline.fit(data)
binarizer_graph = pipeline_to_dataflow_graph(binarizer_model)

assert len(binarizer_graph) == 1

sole_vertex = binarizer_graph[0]

assert not sole_vertex.parent_vertices
assert sole_vertex.name == 'binarization'
assert sole_vertex.operation == 'Binarizer'


# EXAMPLE 2: a two-step pipeline must become a chain of two vertices,
# where the classifier depends on the scaler.
iris_dataset = load_iris()

iris_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())])

iris_model = iris_pipeline.fit(iris_dataset.data, iris_dataset.target)
iris_graph = pipeline_to_dataflow_graph(iris_model, parent_vertices=[])

assert len(iris_graph) == 2

for node in iris_graph:
    if node.name == 'scaler':
        assert not node.parent_vertices
    if node.name == 'classifier':
        assert len(node.parent_vertices) == 1
        assert node.parent_vertices[0].operation == 'StandardScaler'
from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize 5 | from sklearn.compose import ColumnTransformer 6 | 7 | from components.graph import pipeline_to_dataflow_graph 8 | 9 | raw_data = pd.read_csv('adult-sample.csv', na_values='?') 10 | data = raw_data.dropna() 11 | 12 | labels = label_binarize(data['income-per-year'], ['>50K', '<=50K']) 13 | 14 | feature_transformation = ColumnTransformer(transformers=[ 15 | ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']), 16 | ('numeric', StandardScaler(), ['age', 'hours-per-week']) 17 | ]) 18 | 19 | income_pipeline = Pipeline([ 20 | ('features', feature_transformation), 21 | ('classifier', DecisionTreeClassifier())]) 22 | 23 | income_model = income_pipeline.fit(data, labels) 24 | 25 | income_graph = pipeline_to_dataflow_graph(income_model) 26 | 27 | 28 | assert len(income_graph) == 5 29 | 30 | steps_without_parent = set(['features__categorical__education', 31 | 'features__categorical__workclass', 32 | 'features__numeric__age', 33 | 'features__numeric__hours-per-week']) 34 | 35 | for vertex in income_graph: 36 | if vertex.name in steps_without_parent: 37 | assert len(vertex.parent_vertices) == 0 38 | if 'categorical' in vertex.name: 39 | assert vertex.operation == 'OneHotEncoder' 40 | else: 41 | assert vertex.operation == 'StandardScaler' 42 | else: 43 | assert len(vertex.parent_vertices) == 4 44 | assert vertex.name == 'classifier' 45 | -------------------------------------------------------------------------------- /assignment3/task4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize 6 | from sklearn.compose import ColumnTransformer 7 | from sklearn.impute 
import SimpleImputer 8 | 9 | from components.graph import pipeline_to_dataflow_graph 10 | 11 | raw_data = pd.read_csv('adult-sample.csv', na_values='?') 12 | data = raw_data.dropna() 13 | 14 | labels = label_binarize(data['income-per-year'], ['>50K', '<=50K']) 15 | 16 | nested_categorical_feature_transformation = Pipeline(steps=[ 17 | ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 18 | ('encode', OneHotEncoder(handle_unknown='ignore')) 19 | ]) 20 | 21 | nested_feature_transformation = ColumnTransformer(transformers=[ 22 | ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']), 23 | ('numeric', StandardScaler(), ['age', 'hours-per-week']) 24 | ]) 25 | 26 | nested_pipeline = Pipeline([ 27 | ('features', nested_feature_transformation), 28 | ('classifier', DecisionTreeClassifier())]) 29 | 30 | nested_model = nested_pipeline.fit(data, labels) 31 | 32 | nested_graph = pipeline_to_dataflow_graph(nested_model) 33 | 34 | assert len(nested_graph) == 7 35 | 36 | vertices_by_name = {vertex.name:vertex for vertex in nested_graph} 37 | 38 | assert 'features__numeric__age' in vertices_by_name.keys() 39 | assert vertices_by_name['features__numeric__age'].parent_vertices == [] 40 | 41 | assert 'classifier' in vertices_by_name.keys() 42 | assert len(vertices_by_name['classifier'].parent_vertices) == 4 43 | 44 | assert 'features__categorical__education__encode' in vertices_by_name.keys() 45 | 46 | vertex_to_inspect = vertices_by_name['features__categorical__education__encode'] 47 | 48 | assert len(vertex_to_inspect.parent_vertices) == 1 49 | assert vertex_to_inspect.parent_vertices[0].name == 'features__categorical__education__impute' 50 | -------------------------------------------------------------------------------- /extra-assignment/README.md: -------------------------------------------------------------------------------- 1 | ## Optional assignment for extra credits (max. 
10 points) 2 | 3 | **Task**: Create a **tutorial notebook for an open source project relevant to the course**, using **publicly available data**, similar to the example notebooks we worked on in the labs. 4 | 5 | You should create a **single jupyter notebook with comments and stepwise instructions** to apply one of 6 | the following open source libraries on **a publicly available small dataset**: 7 | 8 | * [Apache Beam](https://beam.apache.org/) 9 | * [DataWig](https://github.com/awslabs/datawig) 10 | * [mlflow](https://mlflow.org/) 11 | * [Tensorflow Data Validation](https://www.tensorflow.org/tfx/data_validation/get_started) 12 | * [Lime](https://github.com/marcotcr/lime) 13 | * [Weld](https://github.com/weld-project/weld) 14 | 15 | The size / length of your notebook should be roughly the same as the notebooks used during the lab. Please also provide a link to the dataset and a requirements.txt file specifying the dependencies. 16 | 17 | Note that your solution must be different from the existing examples for these projects and 18 | from the notebooks which we used for the labs. 
19 | -------------------------------------------------------------------------------- /project-paper/projectpaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/project-paper/projectpaper.pdf -------------------------------------------------------------------------------- /project-paper/projectpaper.tex: -------------------------------------------------------------------------------- 1 | \documentclass[sigconf,10pt]{acmart} 2 | 3 | \usepackage{booktabs} % For formal tables 4 | 5 | \usepackage[utf8]{inputenc} 6 | \usepackage{enumitem} 7 | \usepackage{xcolor} 8 | \usepackage{amsmath} 9 | %\usepackage[ruled,linesnumbered]{algorithm2e} 10 | \usepackage{subfigure} 11 | \usepackage{amssymb} 12 | \usepackage{listings} 13 | 14 | \usepackage{algorithm} 15 | \usepackage{algpseudocode} 16 | \usepackage{blindtext} 17 | 18 | \renewcommand{\algorithmicrequire}{\textbf{Input:}} 19 | 20 | \usepackage{listings} 21 | % Python style for highlighting 22 | \definecolor{darkgray}{rgb}{0.33, 0.33, 0.33} 23 | 24 | 25 | \lstnewenvironment{Python}[1][] 26 | {\lstset{language=Python, 27 | basicstyle = \footnotesize\ttfamily, 28 | keywordstyle = \color{blue}, 29 | keywordstyle = [2] \color{teal}, % just to check that it works 30 | stringstyle = \color{magenta}, 31 | literate={ü}{{\"u}}1 {ö}{{\"o}}1 {É}{{\'E}}1 {œ}{{\oe}}1, 32 | commentstyle = \color{darkgray}\ttfamily, 33 | morekeywords=as, 34 | morekeywords=with, 35 | #1}% 36 | } 37 | {} 38 | 39 | \lstset{} 40 | 41 | \usepackage{tcolorbox} 42 | \usepackage{soul} 43 | 44 | \newcommand{\todo}[1]{\textcolor{magenta}{[#1]}} 45 | 46 | 47 | %\newcommand{\revcmt}[1]{\begin{tcolorbox}[boxrule=1pt, boxsep=4pt,left=3pt,right=3pt,top=2pt,bottom=2pt]\noindent\textit{#1}\end{tcolorbox}} 48 | 49 | \definecolor{charcoal}{rgb}{0.21, 0.27, 0.31} 50 | \newcommand{\revcmt}[1]{\noindent\textit{\textcolor{charcoal}{#1}}} 51 | 52 
| \DeclareTextFontCommand{\texttt}{\ttfamily\hyphenchar\font=45\relax} 53 | 54 | % Copyright 55 | %\setcopyright{none} 56 | %\setcopyright{acmcopyright} 57 | %\setcopyright{acmlicensed} 58 | \setcopyright{rightsretained} 59 | %\setcopyright{usgov} 60 | %\setcopyright{usgovmixed} 61 | %\setcopyright{cagov} 62 | %\setcopyright{cagovmixed} 63 | 64 | 65 | % DOI 66 | \acmDOI{10.475/123_4} 67 | 68 | % ISBN 69 | \acmISBN{123-4567-24-567/08/06} 70 | 71 | %Conference 72 | \acmConference[Data Engineering for Machine Learning Course]{ACM SIGMOD}{2019}{NYU} 73 | \acmYear{2019} 74 | \copyrightyear{2019} 75 | 76 | 77 | \acmArticle{4} 78 | \acmPrice{15.00} 79 | 80 | \settopmatter{printacmref=false} 81 | \renewcommand\footnotetextcopyrightpermission[1]{} 82 | 83 | \title{Team X: Name of the Project} 84 | 85 | \author{Student~1, Student~2, Student~3, Student~4} 86 | \affiliation{% 87 | \institution{New York University} 88 | } 89 | \email{{netid1,netid2,netid3,netid4}@nyu.edu} 90 | 91 | %\renewcommand{\shortauthors}{Schelter et al.} 92 | 93 | \begin{document} 94 | 95 | \begin{abstract} 96 | \todo{Summarize your project paper in about a quarter of a page} \blindtext 97 | \end{abstract} 98 | 99 | \maketitle 100 | 101 | \section{Introduction} 102 | 103 | \todo{Describe your project, why it is important, why it is difficult and summarize how you approached it and which final results you got. 
The introduction should fill up the first page.} 104 | 105 | \blindtext 106 | 107 | \blindtext 108 | 109 | \blindtext 110 | 111 | \blindtext 112 | 113 | \todo{Summarize three achievements of your project} 114 | \begin{itemize} 115 | \item \todo{Achievement 1} 116 | \item \todo{Achievement 2} 117 | \item \todo{Achievement 3} 118 | \end{itemize} 119 | 120 | \newpage 121 | 122 | \section{Problem Statement \& Approach} 123 | 124 | \todo{While the introduction gives the high-level view, this section should go into details and state the problem and the approach (modeling decisions, algorithms, system design, etc) that you took.} 125 | 126 | \subsection{Problem Statement} 127 | 128 | \todo{Try to briefly and concisely describe the problem that you are trying to solve} 129 | 130 | \blindtext 131 | 132 | \blindtext 133 | 134 | \subsection{Approach} 135 | 136 | \todo{Try to briefly and concisely describe the approach that you took to solve your project problem. Try to be generic here. Feel free to use diagrams and figures here.} 137 | 138 | \blindtext 139 | 140 | \blindtext 141 | 142 | \blindtext 143 | 144 | \section{Implementation} 145 | 146 | \todo{Describe in detail how you implemented your solution, here you can talk about software libraries, implementation details, etc.} 147 | 148 | \blindtext 149 | 150 | \blindtext 151 | 152 | \blindtext 153 | 154 | \section{Evaluation} 155 | 156 | \subsection{Experimental Setup} 157 | 158 | \todo{Describe which infrastructure (machine, operating system, library versions) you used for your experiments} 159 | 160 | \blindtext 161 | 162 | \subsection{Datasets} 163 | 164 | \todo{Describe which datasets you used for your experiments} 165 | 166 | \blindtext 167 | 168 | 169 | \subsection{Results} 170 | 171 | \todo{Describe which experiments you ran, which baselines you used, create tables or figures for the results and discuss your findings.} 172 | 173 | \blindtext 174 | 175 | \blindtext 176 | 177 | \blindtext 178 | 179 | 180 | 
\section{Discussion} 181 | 182 | \todo{Summarize your project and the outcome. What went well? What were unexpected difficulties? What would be the next steps to take if you had more time for the project?} 183 | 184 | \blindtext 185 | 186 | \blindtext 187 | 188 | 189 | \section{Detailed Contributions} 190 | 191 | \todo{Summarize the contributions of every student to the project. Give pointers to the other parts of the paper. For example, explain who implemented which parts of the software, who collected and prepared data, who tried different algorithms, etc.} 192 | 193 | \subsection{Student 1} 194 | 195 | \blindtext 196 | 197 | \subsection{Student 2} 198 | 199 | \blindtext 200 | 201 | \subsection{Student 3} 202 | 203 | \blindtext 204 | 205 | \subsection{Student 4} 206 | 207 | \blindtext 208 | 209 | 210 | \end{document} 211 | -------------------------------------------------------------------------------- /project-resources/README.md: -------------------------------------------------------------------------------- 1 | ## Pointers and resources for the group projects 2 | 3 | ### (2) Evaluation of HoloClean 4 | 5 | [HoloClean website](http://holoclean.io) 6 | 7 | Papers: 8 | * [HoloClean: Holistic Data Repairs with Probabilistic Inference](http://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) 9 | * [HoloDetect: Few-Shot Learning for Error Detection](https://arxiv.org/pdf/1904.02285) 10 | 11 | ### (3) Unsupervised Data Quality Validation 12 | 13 | Compressed file with the [partitioned datasets](partitioned-data.zip) for the flights, taxi and posts data. 14 | 15 | ### (6) Missing Data and Fairness 16 | 17 | [IBM AIF360](https://github.com/IBM/AIF360), a Python package with a comprehensive set of fairness metrics for datasets and machine learning models, explanations for these metrics, and algorithms to mitigate bias in datasets and models. 
18 | 19 | 20 | ### (7) Fair AutoML 21 | 22 | [IBM AIF360](https://github.com/IBM/AIF360), a Python package with a comprehensive set of fairness metrics for datasets and machine learning models, explanations for these metrics, and algorithms to mitigate bias in datasets and models. 23 | 24 | [Auto-Sklearn](https://automl.github.io/auto-sklearn/master/), an AutoML library based on scikit-learn. 25 | 26 | [Google AutoML Tables](https://cloud.google.com/automl-tables/) as an example of an industrial-scale AutoML service. 27 | 28 | ### (8) Fairness Labels 29 | 30 | Please talk to Ke for details. 31 | 32 | ### (9) Data Loading for Breast-Cancer Screening 33 | 34 | Please talk to Jason for details. 35 | 36 | ### (10) Web-frontend for the "Amnesia" Recommender System 37 | 38 | * [Short paper](https://drive.google.com/file/d/17M6k_b94stLyPB6LHOq9td2pRDAmREr8/view) on the approach 39 | * [Slides](https://drive.google.com/file/d/1FU4svEaLb6a5v8CI4tl4YQpv4Cr4DUei/view) and [Video](https://www.youtube.com/watch?v=tRyX-aFjUEU) presenting the topic 40 | 41 | Please talk to me for details. 42 | 43 | 44 | -------------------------------------------------------------------------------- /project-resources/partitioned-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/project-resources/partitioned-data.zip --------------------------------------------------------------------------------