├── 10 ├── images │ ├── Complex_NoProc_V3.jpg │ └── cnn_arch.png ├── requirements.txt ├── tutorial_credit_scoring.ipynb └── tutorial_medical_expenditure.ipynb ├── .gitignore ├── 01 ├── git-setup.txt ├── java-maven-setup.txt ├── python-example │ ├── digits.py │ ├── instructions.txt │ └── requirements.txt └── spark-example │ ├── instructions.txt │ ├── pom.xml │ └── src │ └── main │ └── scala │ └── edu │ └── nyu │ └── deml │ └── RunSparkLocal.scala ├── 02 ├── adult-sample.csv ├── instructions.txt ├── requirements.txt └── sklearn-pipelines.ipynb ├── 03 ├── data │ ├── eval │ │ └── data.csv │ ├── serving │ │ └── data.csv │ └── train │ │ └── data.csv ├── google-tfx.ipynb └── requirements.txt ├── 04 ├── Apache-Beam-Intro.ipynb ├── kinglear.txt └── requirements.txt ├── 05 ├── requirements.txt └── simple.ipynb ├── 06 ├── census.ipynb ├── oop.ipynb └── requirements.txt ├── 07 ├── Serving_REST_simple.ipynb └── requirements.txt ├── 08 ├── keras_train.ipynb ├── requirements.txt └── sklearn_train.ipynb ├── 09 ├── datawig.ipynb ├── dirty_debs2015_1k.csv ├── openrefine.txt ├── products.csv └── requirements.txt ├── assignment1 ├── adult-sample.csv ├── components │ ├── constraints.py │ ├── learned_imputer.py │ └── trainer.py ├── instructions.md ├── products.csv ├── requirements.txt ├── task1.py ├── task2.py └── task3.py ├── assignment2 ├── README.md ├── components │ ├── beam_job.py │ ├── linear_regression.py │ ├── mapreduce.py │ └── schema_validation.py ├── data.zip ├── data │ ├── products-data-0.tsv │ ├── products-data-1.tsv │ ├── products-data-2.tsv │ ├── products-data-3.tsv │ ├── ratings-0.tsv │ ├── ratings-1.tsv │ ├── ratings-2.tsv │ └── ratings-3.tsv ├── regression.png ├── requirements-catalina.txt ├── requirements.txt ├── task1.py ├── task1_colab.ipynb ├── task2.py ├── task2_colab.ipynb ├── task3.py ├── task4.py └── utils.py ├── assignment3 ├── README.md ├── adult-sample.csv ├── components │ ├── graph.py │ └── neuralnetwork.py ├── graphs.png ├── moon.png ├── network.png ├── 
requirements.txt ├── task1.py ├── task2.py ├── task3.py └── task4.py ├── extra-assignment └── README.md ├── project-paper ├── acmart.cls ├── projectpaper.pdf └── projectpaper.tex └── project-resources ├── README.md └── partitioned-data.zip /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.iml 2 | **/target/ 3 | **/.idea 4 | **/venv 5 | **/.ipynb_checkpoints/ 6 | **/__pycache__ 7 | **/*.out 8 | **/*.blg 9 | **/*.swp 10 | **/*.bbl 11 | **/*.fdb_latexmk 12 | **/*.log 13 | **/*.fls 14 | **/*.synctex.gz 15 | **/*.aux 16 | 17 | -------------------------------------------------------------------------------- /01/git-setup.txt: -------------------------------------------------------------------------------- 1 | # If you use SSH to authenticate with github 2 | git clone git@github.com:schelterlabs/deml-.git 3 | # Otherwise 4 | git clone https://github.com/schelterlabs/deml-.git 5 | 6 | # Add the tasks and assignments repository as remote 7 | cd deml- 8 | 9 | # If you use SSH to authenticate with github 10 | git remote add lab-tasks-repo git@github.com:schelterlabs/deml-lab.git 11 | # Otherwise 12 | git remote add lab-tasks-repo https://github.com/schelterlabs/deml-lab.git 13 | 14 | git remote set-url lab-tasks-repo --push DISABLE 15 | 16 | # Setup the repository 17 | git fetch lab-tasks-repo 18 | git checkout -b lab-tasks lab-tasks-repo/master 19 | git checkout master 20 | git push -u origin 21 | -------------------------------------------------------------------------------- /01/java-maven-setup.txt: -------------------------------------------------------------------------------- 1 | # Check if installed 2 | mvn -version 3 | java -version 4 | 5 | # If not, download from maven, extract it and move it from downloads to application 6 | mv apache-maven-3.5.4 /Applications/ 7 | 8 | # For JAVA, download from Google Drive and follow instructions 9 | 10 | 11 | # for Mac users. 
Return to your home user directory, 12 | # and use your favorite editor to open .bash_profile. 13 | # If there isn't one, create one. 14 | 15 | vi .bash_profile 16 | 17 | ## add the following JAVA path to your bash file. 18 | export JAVA_HOME=$(/usr/libexec/java_home) 19 | 20 | ## add the following MAVEN path to your bash file. 21 | export M2_HOME=/Applications/apache-maven-3.6.2 22 | export PATH=$PATH:$M2_HOME/bin 23 | 24 | # source the bash file. 25 | source .bash_profile 26 | 27 | # test that the path is successfully set. 28 | echo $JAVA_HOME 29 | 30 | # or for Maven 31 | echo $M2_HOME 32 | 33 | # something like this should be returned 34 | /Library/Java/JavaVirtualMachines/jdk1.8.0_221.jdk/Contents/Home 35 | -------------------------------------------------------------------------------- /01/python-example/digits.py: -------------------------------------------------------------------------------- 1 | # Import datasets, classifiers and performance metrics 2 | from sklearn import datasets, svm, metrics 3 | 4 | # The digits dataset 5 | digits = datasets.load_digits() 6 | 7 | # To apply a classifier on this data, we need to flatten the image, to 8 | # turn the data in a (samples, feature) matrix: 9 | n_samples = len(digits.images) 10 | data = digits.images.reshape((n_samples, -1)) 11 | 12 | # Create a classifier: a support vector classifier 13 | classifier = svm.SVC(gamma=0.001) 14 | 15 | # We learn the digits on the first half of the digits 16 | classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2]) 17 | 18 | # Now predict the value of the digit on the second half: 19 | expected = digits.target[n_samples // 2:] 20 | predicted = classifier.predict(data[n_samples // 2:]) 21 | 22 | print("Classification report for classifier %s:\n%s\n" 23 | % (classifier, metrics.classification_report(expected, predicted))) 24 | print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) 25 | 26 | 
-------------------------------------------------------------------------------- /01/python-example/instructions.txt: -------------------------------------------------------------------------------- 1 | python3.6 -m venv venv 2 | source venv/bin/activate 3 | pip install -r requirements.txt 4 | python digits.py 5 | 6 | -------------------------------------------------------------------------------- /01/python-example/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn == 0.21.3 2 | 3 | -------------------------------------------------------------------------------- /01/spark-example/instructions.txt: -------------------------------------------------------------------------------- 1 | mvn scala:run -DmainClass=edu.nyu.deml.RunSparkLocal 2 | 3 | -------------------------------------------------------------------------------- /01/spark-example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | edu.nyu.deml 8 | lab01 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 1.8 13 | 1.8 14 | UTF-8 15 | 2.11 16 | ${scala.major.version}.5 17 | 3.4.4 18 | 19 | 20 | 21 | 22 | org.scala-lang 23 | scala-library 24 | ${scala.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_${scala.major.version} 30 | 2.2.2 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-sql_${scala.major.version} 36 | 2.2.2 37 | 38 | 39 | 40 | 41 | src/main/scala 42 | 43 | 44 | net.alchim31.maven 45 | scala-maven-plugin 46 | ${scala-maven-plugin.version} 47 | 48 | ${scala.major.version} 49 | ${scala.version} 50 | 51 | 52 | 53 | scala-compile-first 54 | process-resources 55 | 56 | add-source 57 | compile 58 | 59 | 60 | 61 | scala-test-compile 62 | process-test-resources 63 | 64 | testCompile 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-surefire-plugin 74 | 2.7 75 | 76 | true 77 | 78 | 79 | 80 | 81 | 
-------------------------------------------------------------------------------- /01/spark-example/src/main/scala/edu/nyu/deml/RunSparkLocal.scala: -------------------------------------------------------------------------------- 1 | package edu.nyu.deml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RunSparkLocal extends App { 6 | 7 | case class Item(id: Long, productName: String, description: String, priority: String, numViews: Long) 8 | 9 | withSpark { session => 10 | 11 | val rdd = session.sparkContext.parallelize(Seq( 12 | Item(1, "Thingy A", "awesome thing.", "high", 0), 13 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 14 | Item(3, null, null, "low", 5), 15 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 16 | Item(5, "Thingy E", null, "high", 12)), 17 | numSlices = 2) 18 | 19 | val data = session.createDataFrame(rdd) 20 | 21 | val count = data.count() 22 | 23 | println(s"$count items found.") 24 | } 25 | 26 | 27 | 28 | 29 | 30 | def withSpark(func: SparkSession => Unit): Unit = { 31 | 32 | val session = SparkSession.builder() 33 | .master("local") 34 | .appName("test") 35 | .config("spark.ui.enabled", "false") 36 | .config("spark.sql.shuffle.partitions", 2.toString) 37 | .getOrCreate() 38 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir")) 39 | 40 | try { 41 | func(session) 42 | } finally { 43 | session.stop() 44 | System.clearProperty("spark.driver.port") 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /02/adult-sample.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year 2 | 28,Private,273269,Some-college,10,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,40,United-States,<=50K 3 | 
58,State-gov,123329,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,16,United-States,<=50K 4 | 34,Private,79637,Bachelors,13,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,0,0,40,United-States,<=50K 5 | 71,Private,97870,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K 6 | 20,State-gov,41103,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 7 | 46,Private,125492,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Black,Female,0,0,40,United-States,<=50K 8 | 31,Private,467579,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,1887,40,United-States,>50K 9 | 24,Private,376393,Assoc-voc,11,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 10 | 21,Private,56582,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,50,United-States,<=50K 11 | 38,Private,76317,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 12 | 43,Federal-gov,144778,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,>50K 13 | 47,Private,454989,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K 14 | 23,Private,278254,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,45,United-States,<=50K 15 | 38,Private,111499,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,50,United-States,>50K 16 | 31,Private,168521,Bachelors,13,Never-married,Exec-managerial,Unmarried,White,Female,0,0,50,United-States,<=50K 17 | 36,Private,749636,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 18 | 27,Private,167405,HS-grad,9,Married-spouse-absent,Farming-fishing,Own-child,White,Female,0,0,40,Mexico,<=50K 19 | 32,Private,317378,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,10520,0,40,United-States,>50K 20 | 
55,State-gov,71630,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,1617,40,United-States,<=50K 21 | 33,Private,182401,10th,6,Never-married,Adm-clerical,Not-in-family,Black,Male,0,0,40,United-States,<=50K 22 | 21,Private,33616,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,25,United-States,<=50K 23 | 25,Private,362912,Some-college,10,Never-married,Craft-repair,Own-child,White,Female,0,0,50,United-States,<=50K 24 | 28,Private,34335,HS-grad,9,Divorced,Sales,Not-in-family,Amer-Indian-Eskimo,Male,14084,0,40,United-States,>50K 25 | 51,Private,305147,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 26 | 26,Private,50103,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 27 | 62,State-gov,221558,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,24,?,<=50K 28 | 37,Private,138940,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 29 | 55,Self-emp-not-inc,52888,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,10,United-States,<=50K 30 | 46,Local-gov,125457,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,38,United-States,>50K 31 | 47,Private,102771,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Portugal,<=50K 32 | 60,?,41517,11th,7,Married-spouse-absent,?,Unmarried,Black,Female,0,0,20,United-States,<=50K 33 | 34,Private,153614,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K 34 | 32,Local-gov,157887,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 35 | 35,Private,308691,Masters,14,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States,<=50K 36 | 48,Self-emp-inc,238966,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K 37 | 
67,Private,123393,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 38 | 48,Private,25468,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,50,United-States,>50K 39 | 30,Private,117393,HS-grad,9,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 40 | 40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 41 | 58,Private,259014,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,20,United-States,<=50K 42 | 19,?,134974,Some-college,10,Never-married,?,Own-child,White,Female,0,0,20,United-States,<=50K 43 | 25,Private,49092,Bachelors,13,Never-married,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K 44 | 33,Local-gov,224185,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 45 | 43,Private,136721,12th,8,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 46 | 37,Private,314963,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 47 | 21,State-gov,337766,Some-college,10,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States,<=50K 48 | 51,Self-emp-not-inc,111939,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,35,United-States,>50K 49 | 43,Private,151089,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,<=50K 50 | 49,Private,120629,Bachelors,13,Divorced,Exec-managerial,Not-in-family,Black,Female,27828,0,60,United-States,>50K 51 | 38,Local-gov,201410,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K 52 | 49,Private,61307,7th-8th,4,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,38,United-States,<=50K 53 | 36,Private,135289,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States,>50K 54 | 
36,Self-emp-not-inc,89622,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,80,United-States,>50K 55 | 21,Private,216070,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Amer-Indian-Eskimo,Female,0,0,46,United-States,>50K 56 | 42,Private,138662,Some-college,10,Separated,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K 57 | 35,Private,385847,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 58 | 20,Private,189148,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,48,United-States,<=50K 59 | 22,Private,252355,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,27,United-States,<=50K 60 | 46,Private,243743,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K 61 | 33,Private,290763,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 62 | 23,?,99399,Some-college,10,Never-married,?,Unmarried,Amer-Indian-Eskimo,Female,0,0,25,United-States,<=50K 63 | 44,Private,160829,Bachelors,13,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,20,United-States,>50K 64 | 46,Local-gov,329752,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,30,United-States,<=50K 65 | 52,Private,117674,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 66 | 20,?,150084,Some-college,10,Never-married,?,Own-child,White,Male,0,0,25,United-States,<=50K 67 | 49,State-gov,203039,11th,7,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 68 | 38,Private,210438,7th-8th,4,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K 69 | 29,Private,163265,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 70 | 18,Private,43272,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K 71 | 
54,Self-emp-not-inc,103179,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K 72 | 48,Private,449354,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,4386,0,45,United-States,>50K 73 | 29,Private,297544,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 74 | 30,Private,161690,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K 75 | 31,Local-gov,219883,HS-grad,9,Never-married,Protective-serv,Not-in-family,Black,Male,0,0,40,United-States,<=50K 76 | 40,Federal-gov,121012,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298,0,48,United-States,>50K 77 | 32,Private,207172,Some-college,10,Never-married,Sales,Other-relative,White,Female,0,0,40,United-States,<=50K 78 | 47,Private,148995,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,2415,60,United-States,>50K 79 | 19,Private,292590,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 80 | 45,Private,274657,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,?,<=50K 81 | 49,Private,189498,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 82 | 18,Private,25837,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,25,United-States,<=50K 83 | 33,State-gov,306309,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,<=50K 84 | 48,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 85 | 30,Local-gov,289442,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K 86 | 55,Private,89690,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,>50K 87 | 47,Self-emp-not-inc,237731,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,2829,0,65,United-States,<=50K 88 | 
72,?,402306,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,32,Canada,<=50K 89 | 27,Private,119793,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 90 | 44,Private,116358,Bachelors,13,Married-civ-spouse,Sales,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K 91 | 23,Private,55215,Bachelors,13,Never-married,Sales,Own-child,White,Male,0,0,55,United-States,<=50K 92 | 33,Private,184784,10th,6,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K 93 | 43,Private,269015,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,40,Germany,>50K 94 | 46,Private,146919,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K 95 | 90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K 96 | 34,Private,19847,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 97 | 32,Private,108116,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,60,United-States,>50K 98 | 42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K 99 | 18,Private,333611,5th-6th,3,Never-married,Other-service,Other-relative,White,Male,0,0,54,Mexico,<=50K 100 | 25,Private,50053,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,Japan,<=50K 101 | 28,Private,119287,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,28,United-States,>50K 102 | -------------------------------------------------------------------------------- /02/instructions.txt: -------------------------------------------------------------------------------- 1 | # Update your repo 2 | git checkout lab-tasks 3 | git pull 4 | git checkout master 5 | git merge lab-tasks 6 | 7 | # Start the jupyter notebook 8 | cd 02 9 | python3.6 -m venv venv 10 | source venv/bin/activate 11 | pip install -r requirements.txt 12 | jupyter notebook 13 | 14 | 
15 | -------------------------------------------------------------------------------- /02/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | 5 | -------------------------------------------------------------------------------- /03/data/serving/data.csv: -------------------------------------------------------------------------------- 1 | pickup_community_area,fare,trip_start_month,trip_start_hour,trip_start_day,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_miles,pickup_census_tract,dropoff_census_tract,payment_type,company,trip_seconds,dropoff_community_area 2 | 8,6.45,9,17,4,1441213200,41.892072635,-87.628874157,41.880994471,-87.632746489,0.9,,17031839100,Credit Card,Taxi Affiliation Services,540,32 3 | 8,8.05,10,20,7,1414269000,41.899155613,-87.626210532,41.880994471,-87.632746489,0.0,,17031839100,Cash,Taxi Affiliation Services,840,32 4 | 32,7.65,3,20,2,1395087300,41.880994471,-87.632746489,41.877406123,-87.621971652,1.9,,17031320400,Cash,,480,32 5 | 8,36.05,6,17,7,1370713500,41.890922026,-87.618868355,41.97907082,-87.903039661,17.4,,17031980000,Cash,Dispatch Taxi Affiliation,2280,76 6 | 32,4.45,4,12,3,1398775500,41.880994471,-87.632746489,41.880994471,-87.632746489,0.3,,17031839100,Cash,,240,32 7 | 8,36.45,10,8,6,1413532800,41.892507781,-87.626214906,41.97907082,-87.903039661,17.5,,17031980000,Credit Card,,2040,76 8 | 76,37.05,3,23,4,1394667000,41.97907082,-87.903039661,41.898331794,-87.620762865,0.0,,17031081300,Cash,Choice Taxi Association,1680,8 9 | 28,6.25,10,9,5,1380791700,41.879255084,-87.642648998,41.880994471,-87.632746489,0.9,,17031839100,Cash,,480,32 10 | 8,8.44,10,8,1,1382862600,41.899602111,-87.633308037,41.944226601,-87.655998182,2.8,,,Cash,,360,6 11 | 32,8.85,7,19,5,1406834100,41.884987192,-87.620992913,41.851017824,-87.635091856,0.0,,17031841100,Cash,Taxi Affiliation Services,720,34 12 | 
28,7.25,11,20,4,1448481600,41.874005383,-87.66351755,41.874005383,-87.66351755,1.9,,,Cash,Taxi Affiliation Services,480,28 13 | 8,10.5,4,1,7,1461979800,41.90749193,-87.63576009,41.884987192,-87.620992913,2.7,,17031320100,Credit Card,,720,32 14 | 32,3.65,8,10,1,1407060900,41.880994471,-87.632746489,41.880994471,-87.632746489,0.0,,17031839100,Cash,,60,32 15 | 32,12.5,10,19,4,1477511100,41.880994471,-87.632746489,41.899155613,-87.626210532,1.8,,17031081201,Credit Card,Dispatch Taxi Affiliation,1440,8 16 | 8,10.45,5,18,2,1399919400,41.892042136,-87.63186395,41.884987192,-87.620992913,0.1,,17031320100,Cash,Taxi Affiliation Services,960,32 17 | 28,4.65,2,16,3,1360686600,41.879255084,-87.642648998,,,0.65,,,Cash,,240, 18 | 76,40.05,11,17,5,1384451100,41.97907082,-87.903039661,41.880994471,-87.632746489,1.0,,17031839100,Cash,Taxi Affiliation Services,4440,32 19 | 28,3.25,11,12,6,1383308100,41.88528132,-87.6572332,41.88528132,-87.6572332,0.0,,17031833000,Cash,,0,28 20 | 8,7.05,12,15,6,1417793400,41.900221297,-87.629105186,41.892072635,-87.628874157,1.0,,17031081600,Cash,Dispatch Taxi Affiliation,660,8 21 | 32,5.05,1,19,3,1421781300,41.880994471,-87.632746489,41.879255084,-87.642648998,0.0,,17031281900,Credit Card,Blue Ribbon Taxi Association Inc.,300,28 22 | 8,21.45,9,20,5,1441310400,41.899602111,-87.633308037,41.954027649,-87.763399032,0.5,,,Credit Card,Taxi Affiliation Services,1080,15 23 | 3,7.85,5,22,7,1433023200,41.972437081,-87.671109526,41.946294536,-87.654298084,1.9,,17031832000,Cash,Dispatch Taxi Affiliation,600,6 24 | 32,24.85,9,11,6,1443181500,41.880994471,-87.632746489,41.785998518,-87.750934289,11.0,,17031980100,Credit Card,,1440,56 25 | 32,7.25,11,1,1,1446339600,41.877406123,-87.621971652,41.89321636,-87.63784421,0.0,,17031081800,Credit Card,Taxi Affiliation Services,540,8 26 | 32,6.65,11,18,2,1383590700,41.880994471,-87.632746489,41.899155613,-87.626210532,1.1,,17031081201,Credit Card,Northwest Management LLC,540,8 27 | 
34,15.05,1,22,2,1420495200,41.842076117,-87.633973422,41.775928827,-87.666596265,6.1,,,Unknown,Taxi Affiliation Services,1200,67 28 | 32,36.65,10,14,6,1445610600,41.880994471,-87.632746489,41.97907082,-87.903039661,17.3,,17031980000,Credit Card,Taxi Affiliation Services,2640,76 29 | 8,5.45,3,19,1,1364758200,41.899155613,-87.626210532,41.907412816,-87.640901525,0.5,,17031080400,Credit Card,Dispatch Taxi Affiliation,300,8 30 | 28,35.25,1,16,3,1420560900,41.885300022,-87.642808466,41.97907082,-87.903039661,17.1,,17031980000,Credit Card,Taxi Affiliation Services,1800,76 31 | 8,5.65,5,20,6,1399060800,41.892507781,-87.626214906,41.880994471,-87.632746489,0.4,,17031839100,Cash,Dispatch Taxi Affiliation,480,32 32 | 24,8.05,6,21,4,1433972700,41.901206994,-87.676355989,41.878865584,-87.625192142,2.13,,,Credit Card,,480,32 33 | 28,5.65,1,11,3,1358249400,41.879255084,-87.642648998,41.877406123,-87.621971652,0.0,,17031320400,Cash,Dispatch Taxi Affiliation,360,32 34 | 33,9.05,8,11,6,1438947900,41.859349715,-87.617358006,41.892042136,-87.63186395,2.3,,17031081700,Credit Card,,660,8 35 | 32,7.25,10,15,5,1446131700,41.880994471,-87.632746489,41.89503345,-87.619710672,1.4,,17031081401,Cash,Choice Taxi Association,540,8 36 | 6,6.25,10,2,2,1413167400,41.944226601,-87.655998182,41.922686284,-87.649488729,0.0,,,Cash,Taxi Affiliation Services,360,7 37 | 8,7.0,2,9,1,1454835600,41.899602111,-87.633308037,41.899602111,-87.633308037,0.0,,,Cash,Choice Taxi Association,360,8 38 | 28,4.65,5,19,4,1401303600,41.885300022,-87.642808466,41.879255084,-87.642648998,0.6,,17031281900,Cash,Taxi Affiliation Services,180,28 39 | 7,3.25,6,20,3,1435090500,41.922082541,-87.634156093,,,0.03,,,Cash,,0, 40 | 6,5.45,9,2,7,1443234600,41.944226601,-87.655998182,41.944226601,-87.655998182,0.0,,,Credit Card,Blue Ribbon Taxi Association Inc.,300,6 41 | 8,6.0,6,17,7,1465062300,41.89321636,-87.63784421,41.877406123,-87.621971652,0.0,,17031320400,Cash,Blue Ribbon Taxi Association Inc.,300,32 42 | 
56,26.85,4,22,1,1365976800,41.785998518,-87.750934289,41.892072635,-87.628874157,0.0,,17031081600,Cash,Choice Taxi Association,1380,8 43 | 32,7.85,2,13,6,1424437200,41.880994471,-87.632746489,41.898331794,-87.620762865,2.1,,17031081300,Cash,,540,8 44 | 32,12.85,12,22,3,1450217700,41.878865584,-87.625192142,41.944226601,-87.655998182,4.7,,,Cash,,780,6 45 | 76,28.65,6,13,5,1434029400,41.980264315,-87.913624596,41.944226601,-87.655998182,0.0,,,Credit Card,Taxi Affiliation Services,2100,6 46 | 4,11.65,4,19,3,1396380600,41.975170943,-87.687515515,41.922686284,-87.649488729,0.2,,,Credit Card,Taxi Affiliation Services,1080,7 47 | 8,8.05,5,11,3,1401188400,41.89321636,-87.63784421,41.880994471,-87.632746489,1.8,,17031839100,Cash,,660,32 48 | 28,4.65,9,17,2,1378746900,41.885300022,-87.642808466,41.879255084,-87.642648998,0.0,,17031281900,Cash,Taxi Affiliation Services,240,28 49 | 3,14.05,2,6,3,1360046700,41.96581197,-87.655878786,41.899602111,-87.633308037,0.0,,,Unknown,Taxi Affiliation Services,540,8 50 | 13,3.25,1,15,7,1421509500,41.983636307,-87.723583185,41.983636307,-87.723583185,0.0,,,Cash,,0,13 51 | 8,34.65,3,19,1,1396206000,41.890922026,-87.618868355,41.97907082,-87.903039661,17.4,,17031980000,Cash,,1380,76 52 | 8,5.05,12,2,1,1450576800,41.902788048,-87.62614559,41.892507781,-87.626214906,0.0,,17031081500,Credit Card,Taxi Affiliation Services,300,8 53 | 8,4.25,4,20,4,1398285900,41.892072635,-87.628874157,41.898331794,-87.620762865,0.0,,17031081300,Credit Card,Blue Ribbon Taxi Association Inc.,120,8 54 | 77,11.45,9,14,4,1378908000,41.9867118,-87.663416405,41.944226601,-87.655998182,0.0,,,No Charge,Northwest Management LLC,540,6 55 | 8,9.45,10,23,5,1382657400,41.892042136,-87.63186395,41.89830587,-87.653613982,0.0,,17031842300,Cash,Taxi Affiliation Services,720,24 56 | 7,7.45,5,20,5,1400791500,41.914616286,-87.631717366,41.928967266,-87.656156831,1.6,,17031070400,Cash,,540,7 57 | 
32,16.25,3,20,2,1364244300,41.870607372,-87.622172937,41.928967266,-87.656156831,0.0,,17031070400,Cash,Choice Taxi Association,1440,7 58 | 8,5.85,11,15,2,1416238200,41.892507781,-87.626214906,41.880994471,-87.632746489,1.2,,17031839100,Credit Card,Dispatch Taxi Affiliation,300,32 59 | 8,5.5,1,10,2,1451901600,41.892507781,-87.626214906,41.89503345,-87.619710672,0.5,,17031081401,Cash,KOAM Taxi Association,300,8 60 | 32,5.25,6,13,6,1435325400,41.880994471,-87.632746489,41.880994471,-87.632746489,0.5,,17031839100,Credit Card,KOAM Taxi Association,300,32 61 | 6,5.45,7,1,7,1373677200,41.942691844,-87.651770507,41.936237179,-87.656411531,0.8,,17031062900,Credit Card,Choice Taxi Association,360,6 62 | 32,4.65,8,17,6,1408727700,41.880994471,-87.632746489,41.892042136,-87.63186395,0.6,,17031081700,Cash,,180,8 63 | 32,4.05,1,12,3,1390305600,41.884987192,-87.620992913,41.884987192,-87.620992913,0.4,,17031320100,Cash,,120,32 64 | 6,10.75,8,1,1,1471138200,41.945282331,-87.661545096,41.936237179,-87.656411531,2.8,,17031062900,Credit Card,,780,6 65 | 28,8.45,8,0,7,1438993800,41.885300022,-87.642808466,41.902788048,-87.62614559,2.1,,17031081202,Cash,Taxi Affiliation Services,720,8 66 | 7,5.25,9,19,7,1442690100,41.914747305,-87.654007029,41.929046937,-87.651310877,0.9,,17031070300,Cash,Taxi Affiliation Services,300,7 67 | 6,9.25,1,0,7,1422060300,41.944226601,-87.655998182,41.899602111,-87.633308037,0.1,,,Cash,Taxi Affiliation Services,480,8 68 | 5,11.05,4,0,7,1398470400,41.947791586,-87.683834942,41.983636307,-87.723583185,0.2,,,Cash,Taxi Affiliation Services,840,13 69 | 32,5.85,10,8,2,1382343300,41.880994471,-87.632746489,41.89503345,-87.619710672,1.1,,17031081401,Credit Card,,300,8 70 | 32,13.05,8,23,4,1408576500,41.884987192,-87.620992913,41.942577185,-87.647078509,0.3,,17031062000,Cash,Taxi Affiliation Services,660,6 71 | 33,15.45,9,20,3,1410899400,41.859349715,-87.617358006,41.93057857,-87.642206313,5.54,,17031070102,Cash,,900,7 72 | 
76,45.85,5,15,2,1401118200,41.97907082,-87.903039661,41.880994471,-87.632746489,19.7,,17031839100,Credit Card,Taxi Affiliation Services,4200,32 73 | 32,17.05,3,18,4,1425492000,41.880994471,-87.632746489,41.938391258,-87.63857492,0.0,,17031063200,Credit Card,Taxi Affiliation Services,1440,6 74 | 6,19.45,4,19,5,1366916400,41.944226601,-87.655998182,41.850266366,-87.667569312,7.2,,,Cash,,1680,31 75 | 8,4.65,5,21,3,1431467100,41.892507781,-87.626214906,41.884987192,-87.620992913,0.59,,17031320100,Credit Card,,180,32 76 | 32,7.25,8,11,3,1408448700,41.884987192,-87.620992913,41.879255084,-87.642648998,1.0,,17031281900,Cash,,660,28 77 | 8,12.05,5,21,4,1401311700,41.891971508,-87.612945414,41.936237179,-87.656411531,0.2,,17031062900,Cash,Taxi Affiliation Services,720,6 78 | 24,6.85,6,11,5,1372330800,41.901206994,-87.676355989,41.878865584,-87.625192142,0.0,,,Credit Card,Taxi Affiliation Services,420,32 79 | 7,8.85,9,11,4,1378899000,41.914747305,-87.654007029,41.879255084,-87.642648998,2.5,,17031281900,Cash,3201 - CID Cab Co Inc,600,28 80 | 32,6.25,3,15,5,1395933300,41.880994471,-87.632746489,41.89321636,-87.63784421,1.1,,17031081800,Credit Card,,420,8 81 | 8,7.05,9,9,2,1410773400,41.89321636,-87.63784421,41.880994471,-87.632746489,1.3,,17031839100,Cash,KOAM Taxi Association,480,32 82 | 7,8.65,2,13,7,1361022300,41.922082541,-87.634156093,41.890922026,-87.618868355,0.0,,17031081403,Cash,Taxi Affiliation Services,540,8 83 | 28,12.65,9,20,7,1379189700,41.874005383,-87.66351755,41.92276062,-87.699155343,0.0,,,No Charge,Choice Taxi Association,1140,22 84 | 7,3.25,5,18,4,1432144800,41.928967266,-87.656156831,41.928967266,-87.656156831,0.0,,17031070400,Cash,Blue Ribbon Taxi Association Inc.,60,7 85 | 8,4.45,8,21,2,1439847900,41.898331794,-87.620762865,41.892507781,-87.626214906,0.5,,17031081500,Cash,Taxi Affiliation Services,240,8 86 | 7,7.45,5,2,1,1431224100,41.914616286,-87.631717366,41.916005274,-87.675095116,2.2,,17031831000,Cash,Taxi Affiliation Services,420,22 87 | 
7,10.05,3,13,1,1394975700,41.914616286,-87.631717366,41.879255084,-87.642648998,3.0,,17031281900,Cash,Choice Taxi Association,840,28 88 | 28,9.25,5,14,6,1400856300,41.874005383,-87.66351755,41.901206994,-87.676355989,2.4,,,Cash,Taxi Affiliation Services,720,24 89 | 32,4.85,5,8,3,1400573700,41.880994471,-87.632746489,41.892507781,-87.626214906,0.7,,17031081500,Cash,Dispatch Taxi Affiliation,240,8 90 | 8,4.05,10,2,2,1444615200,41.892072635,-87.628874157,41.89321636,-87.63784421,0.6,,17031081800,Cash,Dispatch Taxi Affiliation,120,8 91 | 8,7.5,10,23,7,1475969400,41.904935302,-87.649907226,41.926811182,-87.642605247,1.6,,17031070103,Cash,Taxi Affiliation Services,540,7 92 | 76,36.85,1,22,1,1422223200,41.97907082,-87.903039661,41.884987192,-87.620992913,18.1,,17031320100,Credit Card,,1560,32 93 | 8,37.45,2,10,1,1393150500,41.898331794,-87.620762865,41.97907082,-87.903039661,0.0,,17031980000,Credit Card,Taxi Affiliation Services,1560,76 94 | 32,6.0,12,23,6,1481325300,41.880994471,-87.632746489,41.880994471,-87.632746489,0.7,,17031839100,Cash,,360,32 95 | 8,5.05,1,11,4,1389179700,41.900265687,-87.63210922,41.898331794,-87.620762865,0.06,,17031081300,Cash,,0,8 96 | 8,12.85,8,22,1,1407708000,41.892042136,-87.63186395,41.943237122,-87.643470956,0.49,,17031061901,Cash,,660,6 97 | 13,30.65,10,18,4,1413396000,41.983636307,-87.723583185,41.922686284,-87.649488729,0.6,,,Cash,Taxi Affiliation Services,3300,7 98 | 32,10.45,1,7,5,1389252600,41.878865584,-87.625192142,41.874005383,-87.66351755,3.6,,,Cash,,600,28 99 | 7,7.25,12,7,1,1387698300,41.922686284,-87.649488729,41.901206994,-87.676355989,0.1,,,Cash,Taxi Affiliation Services,420,24 100 | 32,6.65,5,18,5,1401386400,41.880994471,-87.632746489,41.880994471,-87.632746489,0.61,,17031839100,Cash,,540,32 101 | 32,10.5,1,1,3,1452561300,41.878865584,-87.625192142,41.922686284,-87.649488729,2.9,,,Cash,KOAM Taxi Association,600,7 102 | -------------------------------------------------------------------------------- /03/requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-data-validation==0.13.0 5 | jupyter==1.0.0 6 | 7 | -------------------------------------------------------------------------------- /04/Apache-Beam-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Beam word count example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 9, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from __future__ import absolute_import\n", 17 | "\n", 18 | "import apache_beam as beam\n", 19 | "from apache_beam.io import ReadFromText\n", 20 | "from apache_beam.io import WriteToText\n", 21 | "import re\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "A text file `Read` transform is applied to the `Pipeline` object itself, and produces a `PCollection` as output. Each element in the output PCollection represents one line of text from the input file.\n", 30 | "\n", 31 | "This transform splits the lines in `PCollection`, where each element is an individual word in Shakespeare’s collected texts. 
" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 6, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "'DONE'" 43 | ] 44 | }, 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "p = beam.Pipeline()\n", 52 | "\n", 53 | "lines = p | 'read' >> ReadFromText(\"kinglear.txt\")\n", 54 | "lines | 'write' >> WriteToText(\"copy-of-kinglear.txt\")\n", 55 | "\n", 56 | "result = p.run()\n", 57 | "result.wait_until_finish()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "class ExtractWordsDoFn(beam.DoFn):\n", 67 | " def process(self, element):\n", 68 | " text_line = element.strip()\n", 69 | " words = re.findall(r'[\\w\\']+', text_line, re.UNICODE)\n", 70 | " return words\n", 71 | " \n", 72 | "\n", 73 | "# Count the occurrences of each word.\n", 74 | "def count_ones(word_ones):\n", 75 | " (word, ones) = word_ones\n", 76 | " return (word, sum(ones)) \n", 77 | "\n", 78 | "# Format the counts into a PCollection of strings.\n", 79 | "def format_result(word_count):\n", 80 | " (word, count) = word_count\n", 81 | " return '%s\\t%d' % (word, count)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "'DONE'" 93 | ] 94 | }, 95 | "execution_count": 8, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "# Creating a pipeline\n", 102 | "p = beam.Pipeline()\n", 103 | "\n", 104 | "lines = p | 'read' >> ReadFromText(\"kinglear.txt\")\n", 105 | "\n", 106 | "counts = (lines\n", 107 | " | 'split' >> (beam.ParDo(ExtractWordsDoFn()))\n", 108 | " | 'pair_with_one' >> beam.Map(lambda x: (x, 1))\n", 109 | " | 'group' >> beam.GroupByKey()\n", 110 | " | 'count' >> beam.Map(count_ones))\n", 111 | "\n", 112 | "output = counts | 
'format' >> beam.Map(format_result)\n", 113 | "output | 'write' >> WriteToText(\"counts.txt\")\n", 114 | "\n", 115 | "result = p.run()\n", 116 | "result.wait_until_finish()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 11, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
01
016061
1THE7
2TRAGEDY1
3OF16
4KING1
.........
4902journey1
4903weight1
4904ought1
4905oldest1
4906END1
\n", 207 | "

4907 rows × 2 columns

\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " 0 1\n", 212 | "0 1606 1\n", 213 | "1 THE 7\n", 214 | "2 TRAGEDY 1\n", 215 | "3 OF 16\n", 216 | "4 KING 1\n", 217 | "... ... ..\n", 218 | "4902 journey 1\n", 219 | "4903 weight 1\n", 220 | "4904 ought 1\n", 221 | "4905 oldest 1\n", 222 | "4906 END 1\n", 223 | "\n", 224 | "[4907 rows x 2 columns]" 225 | ] 226 | }, 227 | "execution_count": 11, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "data = pd.read_csv('counts.txt-00000-of-00001', sep = '\\t', header = None)\n", 234 | "data" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "
01
128the705
119I620
122and587
13of456
27to430
.........
2576outface1
2578persecutions1
2579sky1
2581precedent1
4906END1
\n", 325 | "

4907 rows × 2 columns

\n", 326 | "
" 327 | ], 328 | "text/plain": [ 329 | " 0 1\n", 330 | "128 the 705\n", 331 | "119 I 620\n", 332 | "122 and 587\n", 333 | "13 of 456\n", 334 | "27 to 430\n", 335 | "... ... ...\n", 336 | "2576 outface 1\n", 337 | "2578 persecutions 1\n", 338 | "2579 sky 1\n", 339 | "2581 precedent 1\n", 340 | "4906 END 1\n", 341 | "\n", 342 | "[4907 rows x 2 columns]" 343 | ] 344 | }, 345 | "execution_count": 15, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "data.sort_values(by=[1], ascending=False)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## Task\n", 359 | "\n", 360 | "Copy and adjust the beam job above so that it ignores a set of stop words\n", 361 | "\n", 362 | "Use the filename `counts-nostop.txt` for the output\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "stopwords = set(['the', 'and', 'of', 'to'])\n", 372 | "\n", 373 | "# TODO copy and adjust beam job" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "data = pd.read_csv('counts-nostop.txt-00000-of-00001', sep = '\\t', header = None)\n", 383 | "data.sort_values(by=[1], ascending=False)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [] 392 | } 393 | ], 394 | "metadata": { 395 | "kernelspec": { 396 | "display_name": "Python 3", 397 | "language": "python", 398 | "name": "python3" 399 | }, 400 | "language_info": { 401 | "codemirror_mode": { 402 | "name": "ipython", 403 | "version": 3 404 | }, 405 | "file_extension": ".py", 406 | "mimetype": "text/x-python", 407 | "name": "python", 408 | "nbconvert_exporter": "python", 409 | "pygments_lexer": "ipython3", 410 | "version": "3.6.9" 411 | } 412 | }, 413 | "nbformat": 
4, 414 | "nbformat_minor": 2 415 | } 416 | -------------------------------------------------------------------------------- /04/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.15.0 5 | 6 | -------------------------------------------------------------------------------- /05/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-data-validation==0.14.0 5 | jupyter==1.0.0 6 | 7 | -------------------------------------------------------------------------------- /05/simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "Ifon2ScEAsJO" 8 | }, 9 | "source": [ 10 | "" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "colab_type": "text", 24 | "id": "tghWegsjhpkt" 25 | }, 26 | "source": [ 27 | "##### Copyright © 2019 Google Inc." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "colab": {}, 35 | "colab_type": "code", 36 | "id": "rSGJWC5biBiG" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n", 41 | "# you may not use this file except in compliance with the License.\n", 42 | "# You may obtain a copy of the License at\n", 43 | "#\n", 44 | "# https://www.apache.org/licenses/LICENSE-2.0\n", 45 | "#\n", 46 | "# Unless required by applicable law or agreed to in writing, software\n", 47 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 48 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 49 | "# See the License for the specific language governing permissions and\n", 50 | "# limitations under the License." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "colab_type": "text", 57 | "id": "mPt5BHTwy_0F" 58 | }, 59 | "source": [ 60 | "# Preprocess data with TensorFlow Transform\n", 61 | "***The Feature Engineering Component of TensorFlow Extended (TFX)***\n", 62 | "\n", 63 | "This example colab notebook provides a very simple example of how TensorFlow Transform (tf.Transform) can be used to preprocess data using exactly the same code for both training a model and serving inferences in production.\n", 64 | "\n", 65 | "TensorFlow Transform is a library for preprocessing input data for TensorFlow, including creating features that require a full pass over the training dataset. 
For example, using TensorFlow Transform you could:\n", 66 | "\n", 67 | "* Normalize an input value by using the mean and standard deviation\n", 68 | "* Convert strings to integers by generating a vocabulary over all of the input values\n", 69 | "* Convert floats to integers by assigning them to buckets, based on the observed data distribution\n", 70 | "\n", 71 | "TensorFlow has built-in support for manipulations on a single example or a batch of examples. `tf.Transform` extends these capabilities to support full passes over the entire training dataset.\n", 72 | "\n", 73 | "The output of `tf.Transform` is exported as a TensorFlow graph which you can use for both training and serving. Using the same graph for both training and serving can prevent skew, since the same transformations are applied in both stages." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "colab_type": "text", 80 | "id": "RptgLn2RYuK3" 81 | }, 82 | "source": [ 83 | "## Python check and imports\n", 84 | "First, we'll make sure that we're using Python 3. Then, we'll go ahead and install and import the stuff we need." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": { 91 | "colab": {}, 92 | "colab_type": "code", 93 | "id": "tFcdSuXTidhH" 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "import sys, os\n", 106 | "# Confirm that we're using Python 3\n", 107 | "assert sys.version_info.major is 3, 'Oops, not running Python 3'\n", 108 | "print(sys.version_info)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 2, 114 | "metadata": { 115 | "colab": {}, 116 | "colab_type": "code", 117 | "id": "K4QXVIM7iglN" 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "import pprint\n", 122 | "import tempfile\n", 123 | "import warnings; warnings.simplefilter('ignore')\n", 124 | "\n", 125 | "import tensorflow as tf\n", 126 | "import tensorflow_transform as tft\n", 127 | "import tensorflow_transform.beam.impl as tft_beam\n", 128 | "from tensorflow_transform.tf_metadata import dataset_metadata\n", 129 | "from tensorflow_transform.tf_metadata import dataset_schema\n", 130 | "tf.logging.set_verbosity(tf.logging.ERROR)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "colab_type": "text", 137 | "id": "CxOxaaOYRfl7" 138 | }, 139 | "source": [ 140 | "## Data: Create some dummy data\n", 141 | "We'll create some simple dummy data for our simple example:\n", 142 | "\n", 143 | "* `raw_data` is the initial raw data that we're going to preprocess\n", 144 | "* `raw_data_metadata` contains the schema that tells us the types of each of the columns in `raw_data`. In this case, it's very simple." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": { 151 | "colab": {}, 152 | "colab_type": "code", 153 | "id": "-R236Tkf_ON3" 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "raw_data = [\n", 158 | " {'x': 1, 'y': 1, 's': 'hello'},\n", 159 | " {'x': 2, 'y': 2, 's': 'world'},\n", 160 | " {'x': 3, 'y': 3, 's': 'hello'}\n", 161 | " ]\n", 162 | "\n", 163 | "raw_data_metadata = dataset_metadata.DatasetMetadata(\n", 164 | " dataset_schema.from_feature_spec({\n", 165 | " 'y': tf.FixedLenFeature([], tf.float32),\n", 166 | " 'x': tf.FixedLenFeature([], tf.float32),\n", 167 | " 's': tf.FixedLenFeature([], tf.string),\n", 168 | " }))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "colab_type": "text", 175 | "id": "Zadh6MXLS3eD" 176 | }, 177 | "source": [ 178 | "## Transform: Create a preprocessing function\n", 179 | "The _preprocessing function_ is the most important concept of tf.Transform. A preprocessing function is where the transformation of the dataset really happens. It accepts and returns a dictionary of tensors, where a tensor means a Tensor or SparseTensor. There are two main groups of API calls that typically form the heart of a preprocessing function:\n", 180 | "\n", 181 | "1. **TensorFlow Ops:** Any function that accepts and returns tensors, which usually means TensorFlow ops. These add TensorFlow operations to the graph that transforms raw data into transformed data one feature vector at a time. These will run for every example, during both training and serving.\n", 182 | "2. **TensorFlow Transform Analyzers:** Any of the analyzers provided by tf.Transform. Analyzers also accept and return tensors, but unlike TensorFlow ops they only run once, during training, and typically make a full pass over the entire training dataset. They create tensor constants, which are added to your graph. For example, `tft.min` computes the minimum of a tensor over the training dataset. 
tf.Transform provides a fixed set of analyzers, but this will be extended in future versions.\n", 183 | "\n", 184 | "Caution: When you apply your preprocessing function to serving inferences, the constants that were created by analyzers during training do not change. If your data has trend or seasonality components, plan accordingly." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 4, 190 | "metadata": { 191 | "colab": {}, 192 | "colab_type": "code", 193 | "id": "H2wANNF_2dCR" 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "def preprocessing_fn(inputs):\n", 198 | " \"\"\"Preprocess input columns into transformed columns.\"\"\"\n", 199 | " x = inputs['x']\n", 200 | " y = inputs['y']\n", 201 | " s = inputs['s']\n", 202 | " x_centered = x - tft.mean(x)\n", 203 | " y_normalized = tft.scale_to_0_1(y)\n", 204 | " s_integerized = tft.compute_and_apply_vocabulary(s)\n", 205 | " return {\n", 206 | " 'x_centered': x_centered,\n", 207 | " 'y_normalized': y_normalized,\n", 208 | " 's_integerized': s_integerized,\n", 209 | " }" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "colab_type": "text", 216 | "id": "cSl9qyTCbBKR" 217 | }, 218 | "source": [ 219 | "## Putting it all together\n", 220 | "Now we're ready to transform our data. We'll use Apache Beam with a direct runner, and supply three inputs:\n", 221 | "1. `raw_data` - The raw input data that we created above\n", 222 | "2. `raw_data_metadata` - The schema for the raw data\n", 223 | "3. 
`preprocessing_fn` - The function that we created to do our transformation\n", 224 | "\n", 225 | "" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 5, 241 | "metadata": { 242 | "colab": {}, 243 | "colab_type": "code", 244 | "id": "mAF9w7RTZU7c" 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "\n", 252 | "Raw data:\n", 253 | "[{'s': 'hello', 'x': 1, 'y': 1},\n", 254 | " {'s': 'world', 'x': 2, 'y': 2},\n", 255 | " {'s': 'hello', 'x': 3, 'y': 3}]\n", 256 | "\n", 257 | "Transformed data:\n", 258 | "[{'s_integerized': 0, 'x_centered': -1.0, 'y_normalized': 0.0},\n", 259 | " {'s_integerized': 1, 'x_centered': 0.0, 'y_normalized': 0.5},\n", 260 | " {'s_integerized': 0, 'x_centered': 1.0, 'y_normalized': 1.0}]\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "def main():\n", 266 | " # Ignore the warnings\n", 267 | " with tft_beam.Context(temp_dir=tempfile.mkdtemp()):\n", 268 | " transformed_dataset, transform_fn = ( # pylint: disable=unused-variable\n", 269 | " (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(\n", 270 | " preprocessing_fn))\n", 271 | "\n", 272 | " transformed_data, transformed_metadata = transformed_dataset # pylint: disable=unused-variable\n", 273 | "\n", 274 | " print('\\nRaw data:\\n{}\\n'.format(pprint.pformat(raw_data)))\n", 275 | " print('Transformed data:\\n{}'.format(pprint.pformat(transformed_data)))\n", 276 | "\n", 277 | "if __name__ == '__main__':\n", 278 | " main()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "colab_type": "text", 285 | "id": "NO6LyTneNndy" 286 | }, 287 | "source": [ 288 | "## Is this the right answer?\n", 289 | "Previously, we used `tf.Transform` to do this:\n", 290 | "```\n", 291 | "x_centered = x - tft.mean(x)\n", 292 | "y_normalized = tft.scale_to_0_1(y)\n", 293 | "s_integerized = tft.compute_and_apply_vocabulary(s)\n", 294 | "x_centered_times_y_normalized = (x_centered * 
y_normalized)\n", 295 | "```\n", 296 | "\n", 297 | "### x_centered\n", 298 | "With input of `[1, 2, 3]` the mean of x is 2, and we subtract it from x to center our x values at 0. So our result of `[-1.0, 0.0, 1.0]` is correct.\n", 299 | "### y_normalized\n", 300 | "We wanted to scale our y values between 0 and 1. Our input was `[1, 2, 3]` so our result of `[0.0, 0.5, 1.0]` is correct.\n", 301 | "### s_integerized\n", 302 | "We wanted to map our strings to indexes in a vocabulary, and there were only 2 words in our vocabulary (\"hello\" and \"world\"). So with input of `[\"hello\", \"world\", \"hello\"]` our result of `[0, 1, 0]` is correct." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "colab": {}, 309 | "colab_type": "code", 310 | "id": "YpsQHsMtekQo" 311 | }, 312 | "source": [ 313 | "## Task: \n", 314 | "Modify your `preprocessing_fn` to perform feature engineering on the data (e.g. return \"x_centered_times_y_normalized\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "colab": { 327 | "collapsed_sections": [ 328 | "tghWegsjhpkt" 329 | ], 330 | "name": "simple.ipynb", 331 | "private_outputs": true, 332 | "provenance": [], 333 | "toc_visible": true 334 | }, 335 | "kernelspec": { 336 | "display_name": "Python 3", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.7.3" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 1 355 | } 356 | -------------------------------------------------------------------------------- /06/oop.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Object oriented programming in Python\n", 8 | "### Class structure\n", 9 | "the `__init__` method and objects" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Blu is a bird\n", 22 | "Woo is also a bird\n", 23 | "Blu is 10 years old\n", 24 | "Woo is 15 years old\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "class Parrot:\n", 30 | "\n", 31 | " # class attribute\n", 32 | " species = \"bird\"\n", 33 | "\n", 34 | " # instance attribute\n", 35 | " def __init__(self, name, age):\n", 36 | " self.name = name\n", 37 | " self.age = age\n", 38 | "\n", 39 | "# instantiate the Parrot class\n", 40 | "blu = Parrot(\"Blu\", 10)\n", 41 | "woo = Parrot(\"Woo\", 15)\n", 42 | "\n", 43 | "# access the class attributes\n", 44 | "print(\"Blu is a {}\".format(blu.__class__.species))\n", 45 | "print(\"Woo is also a {}\".format(woo.__class__.species))\n", 46 | "\n", 47 | "# access the instance attributes\n", 48 | "print(\"{} is {} years old\".format( blu.name, blu.age))\n", 49 | "print(\"{} is {} years old\".format( woo.name, woo.age))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Methods\n", 57 | "Now, we look at creating additional methods" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Blu sings 'Happy'\n", 70 | "Blu is now dancing\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "class Parrot:\n", 76 | " \n", 77 | " # instance attributes\n", 78 | " def __init__(self, name, age):\n", 79 | " self.name = name\n", 80 | " self.age = age\n", 81 | " \n", 82 | " # instance 
method\n", 83 | " def sing(self, song):\n", 84 | " return \"{} sings {}\".format(self.name, song)\n", 85 | "\n", 86 | " def dance(self):\n", 87 | " return \"{} is now dancing\".format(self.name)\n", 88 | "\n", 89 | "# instantiate the object\n", 90 | "blu = Parrot(\"Blu\", 10)\n", 91 | "\n", 92 | "# call our instance methods\n", 93 | "print(blu.sing(\"'Happy'\"))\n", 94 | "print(blu.dance())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Inheritance\n", 102 | "parent and child class" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Bird is ready\n", 115 | "Penguin is ready\n", 116 | "Penguin\n", 117 | "Swim faster\n", 118 | "Run faster\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# parent class\n", 124 | "class Bird:\n", 125 | " \n", 126 | " def __init__(self):\n", 127 | " print(\"Bird is ready\")\n", 128 | "\n", 129 | " def whoisThis(self):\n", 130 | " print(\"Bird\")\n", 131 | "\n", 132 | " def swim(self):\n", 133 | " print(\"Swim faster\")\n", 134 | "\n", 135 | "# child class\n", 136 | "class Penguin(Bird):\n", 137 | "\n", 138 | " def __init__(self):\n", 139 | " # call super() function\n", 140 | " super().__init__()\n", 141 | " print(\"Penguin is ready\")\n", 142 | "\n", 143 | " def whoisThis(self):\n", 144 | " print(\"Penguin\")\n", 145 | "\n", 146 | " def run(self):\n", 147 | " print(\"Run faster\")\n", 148 | "\n", 149 | "peggy = Penguin()\n", 150 | "peggy.whoisThis()\n", 151 | "peggy.swim()\n", 152 | "peggy.run()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## Encapsulation\n", 160 | "Using OOP in Python, we can restrict access to methods and variables. This prevent data from direct modification which is called encapsulation. 
In Python, we denote private attribute using underscore as prefix i.e single “ _ “ or double “ __“." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Selling Price: 900\n", 173 | "Selling Price: 900\n", 174 | "Selling Price: 1000\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "class Computer:\n", 180 | "\n", 181 | " def __init__(self):\n", 182 | " self.__maxprice = 900\n", 183 | "\n", 184 | " def sell(self):\n", 185 | " print(\"Selling Price: {}\".format(self.__maxprice))\n", 186 | "\n", 187 | " def setMaxPrice(self, price):\n", 188 | " self.__maxprice = price\n", 189 | "\n", 190 | "c = Computer()\n", 191 | "c.sell()\n", 192 | "\n", 193 | "# change the price\n", 194 | "c.__maxprice = 1000\n", 195 | "c.sell()\n", 196 | "\n", 197 | "# using setter function\n", 198 | "c.setMaxPrice(1000)\n", 199 | "c.sell()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Polymorphism\n", 207 | "Polymorphism is an ability (in OOP) to use common interface for multiple form (data types).\n", 208 | "\n", 209 | "Suppose, we need to color a shape, there are multiple shape option (rectangle, square, circle). However we could use same method to color any shape. This concept is called Polymorphism." 
210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Parrot can fly\n", 222 | "Penguin can't fly\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "class Parrot:\n", 228 | "\n", 229 | " def fly(self):\n", 230 | " print(\"Parrot can fly\")\n", 231 | " \n", 232 | " def swim(self):\n", 233 | " print(\"Parrot can't swim\")\n", 234 | "\n", 235 | "class Penguin:\n", 236 | "\n", 237 | " def fly(self):\n", 238 | " print(\"Penguin can't fly\")\n", 239 | " \n", 240 | " def swim(self):\n", 241 | " print(\"Penguin can swim\")\n", 242 | "\n", 243 | "# common interface\n", 244 | "def flying_test(bird):\n", 245 | " bird.fly()\n", 246 | "\n", 247 | "#instantiate objects\n", 248 | "blu = Parrot()\n", 249 | "peggy = Penguin()\n", 250 | "\n", 251 | "# passing the object\n", 252 | "flying_test(blu)\n", 253 | "flying_test(peggy)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.7.3" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /06/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | tensorflow-transform==0.14.0 5 | jupyter==1.0.0 
-------------------------------------------------------------------------------- /07/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | tensorflow==1.15.2 4 | jupyter==1.0.0 -------------------------------------------------------------------------------- /08/keras_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "'''Trains and evaluate a simple MLP\n", 18 | "on the Reuters newswire topic classification task.\n", 19 | "'''\n", 20 | "from __future__ import print_function\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "import keras\n", 24 | "from keras.datasets import reuters\n", 25 | "from keras.models import Sequential\n", 26 | "from keras.layers import Dense, Dropout, Activation\n", 27 | "from keras.preprocessing.text import Tokenizer\n", 28 | "\n", 29 | "# The following import and function call are the only additions to code required\n", 30 | "# to automatically log metrics and parameters to MLflow.\n", 31 | "import mlflow.keras" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Loading data...\n", 44 | "8982 train sequences\n", 45 | "2246 test sequences\n", 46 | "46 classes\n", 47 | "Vectorizing sequence data...\n", 48 | "x_train shape: (8982, 1000)\n", 49 | "x_test shape: (2246, 1000)\n", 50 | "Convert class vector to binary class matrix (for use with categorical_crossentropy)\n", 51 | "y_train shape: (8982, 46)\n", 52 | "y_test shape: (2246, 46)\n", 53 | "Building model...\n" 54 | ] 55 | }, 56 | { 57 | "name": "stderr", 58 
| "output_type": "stream", 59 | "text": [ 60 | "/Users/yuconghu/github/deml-lab/08/venv/lib/python3.7/site-packages/keras/engine/training_utils.py:811: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 61 | " if isinstance(loss, collections.Mapping):\n", 62 | "/Users/yuconghu/github/deml-lab/08/venv/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:339: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 63 | " if not isinstance(values, collections.Sequence):\n" 64 | ] 65 | }, 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Train on 8083 samples, validate on 899 samples\n", 71 | "Epoch 1/5\n", 72 | "8083/8083 [==============================] - 2s 199us/step - loss: 1.4177 - accuracy: 0.6787 - val_loss: 1.0814 - val_accuracy: 0.7620\n", 73 | "Epoch 2/5\n", 74 | "8083/8083 [==============================] - 1s 168us/step - loss: 0.7874 - accuracy: 0.8152 - val_loss: 0.9548 - val_accuracy: 0.7887\n", 75 | "Epoch 3/5\n", 76 | "8083/8083 [==============================] - 1s 156us/step - loss: 0.5516 - accuracy: 0.8639 - val_loss: 0.8593 - val_accuracy: 0.7909\n", 77 | "Epoch 4/5\n", 78 | "8083/8083 [==============================] - 1s 147us/step - loss: 0.4129 - accuracy: 0.8969 - val_loss: 0.8881 - val_accuracy: 0.8076\n", 79 | "Epoch 5/5\n", 80 | "8083/8083 [==============================] - 1s 146us/step - loss: 0.3248 - accuracy: 0.9223 - val_loss: 0.8753 - val_accuracy: 0.8209\n", 81 | "2246/2246 [==============================] - 0s 49us/step\n", 82 | "Test score: 0.8929576534091206\n", 83 | "Test accuracy: 0.7898486256599426\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "mlflow.keras.autolog()\n", 89 | "\n", 90 | "max_words = 1000\n", 91 | "batch_size = 32\n", 92 | "epochs = 5\n", 93 | "\n", 94 | "# 
save np.load\n", 95 | "np_load_old = np.load\n", 96 | "\n", 97 | "# modify the default parameters of np.load\n", 98 | "np.load = lambda *a,**k: np_load_old(*a, **k)\n", 99 | "\n", 100 | "print('Loading data...')\n", 101 | "(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,\n", 102 | " test_split=0.2)\n", 103 | "\n", 104 | "# restore np.load for future normal usage\n", 105 | "np.load = np_load_old\n", 106 | "\n", 107 | "print(len(x_train), 'train sequences')\n", 108 | "print(len(x_test), 'test sequences')\n", 109 | "\n", 110 | "num_classes = np.max(y_train) + 1\n", 111 | "print(num_classes, 'classes')\n", 112 | "\n", 113 | "print('Vectorizing sequence data...')\n", 114 | "tokenizer = Tokenizer(num_words=max_words)\n", 115 | "x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')\n", 116 | "x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')\n", 117 | "print('x_train shape:', x_train.shape)\n", 118 | "print('x_test shape:', x_test.shape)\n", 119 | "\n", 120 | "print('Convert class vector to binary class matrix '\n", 121 | " '(for use with categorical_crossentropy)')\n", 122 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 123 | "y_test = keras.utils.to_categorical(y_test, num_classes)\n", 124 | "print('y_train shape:', y_train.shape)\n", 125 | "print('y_test shape:', y_test.shape)\n", 126 | "\n", 127 | "print('Building model...')\n", 128 | "model = Sequential()\n", 129 | "model.add(Dense(512, input_shape=(max_words,)))\n", 130 | "model.add(Activation('relu'))\n", 131 | "model.add(Dropout(0.5))\n", 132 | "model.add(Dense(num_classes))\n", 133 | "model.add(Activation('softmax'))\n", 134 | "\n", 135 | "model.compile(loss='categorical_crossentropy',\n", 136 | " optimizer='adam',\n", 137 | " metrics=['accuracy'])\n", 138 | "\n", 139 | "history = model.fit(x_train, y_train,\n", 140 | " batch_size=batch_size,\n", 141 | " epochs=epochs,\n", 142 | " verbose=1,\n", 143 | " validation_split=0.1)\n", 144 | "score = 
model.evaluate(x_test, y_test,\n", 145 | " batch_size=batch_size, verbose=1)\n", 146 | "print('Test score:', score[0])\n", 147 | "print('Test accuracy:', score[1])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.7.3" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /08/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | mlflow==1.3.0 4 | scikit-learn==0.21.3 5 | tensorflow==1.15.2 6 | keras==2.3.1 7 | jupyter==1.0.0 -------------------------------------------------------------------------------- /08/sklearn_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import print_function\n", 12 | "\n", 13 | "import numpy as np\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "\n", 16 | "import mlflow\n", 17 | "import mlflow.sklearn" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "if __name__ == \"__main__\":\n", 27 | " X = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)\n", 28 | " y = np.array([0, 0, 1, 1, 1, 0])\n", 
29 | " lr = LogisticRegression()\n", 30 | " lr.fit(X, y)\n", 31 | " score = lr.score(X, y)\n", 32 | " print(\"Score: %s\" % score)\n", 33 | " mlflow.log_metric(\"score\", score)\n", 34 | " mlflow.sklearn.log_model(lr, \"model\")\n", 35 | " print(\"Model saved in run %s\" % mlflow.active_run().info.run_uuid)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Exercise:\n", 43 | "Wrap MLFlow around `GridSearchCV` using `sklearn`." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Automatically created module for IPython interactive environment\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "from sklearn import datasets\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "from sklearn.model_selection import GridSearchCV\n", 63 | "from sklearn.metrics import classification_report\n", 64 | "from sklearn.svm import SVC\n", 65 | "\n", 66 | "print(__doc__)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Loading the Digits dataset\n", 76 | "digits = datasets.load_digits()\n", 77 | "\n", 78 | "# To apply an classifier on this data, we need to flatten the image, to\n", 79 | "# turn the data in a (samples, feature) matrix:\n", 80 | "n_samples = len(digits.images)\n", 81 | "X = digits.images.reshape((n_samples, -1))\n", 82 | "y = digits.target\n", 83 | "\n", 84 | "# Split the dataset in two equal parts\n", 85 | "X_train, X_test, y_train, y_test = train_test_split(\n", 86 | " X, y, test_size=0.5, random_state=0)\n", 87 | "\n", 88 | "# Set the parameters by cross-validation\n", 89 | "tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n", 90 | " 'C': [1, 10, 100, 1000]},\n", 91 | " {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]" 92 | ] 93 | }, 94 | { 95 | 
"cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.7.3" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /09/datawig.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "import datawig" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Imputing categorical values" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "df = datawig.utils.generate_df_string(\n", 29 | " num_samples=200, \n", 30 | " data_column_name='sentences', \n", 31 | " label_column_name='label')\n", 32 | "\n", 33 | "df_train, df_test = datawig.utils.random_split(df)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df_train" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "#Initialize a SimpleImputer model\n", 52 | "imputer = datawig.SimpleImputer(\n", 53 | " input_columns=['sentences'], # 
column(s) containing information about the column we want to impute\n", 54 | " output_column='label', # the column we'd like to impute values for\n", 55 | " output_path = 'imputer_model' # stores model data and metrics\n", 56 | ")\n", 57 | "\n", 58 | "#Fit an imputer model on the train data\n", 59 | "imputer.fit(train_df=df_train)\n", 60 | "\n", 61 | "#Impute missing values and return original dataframe with predictions\n", 62 | "imputed = imputer.predict(df_test)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "imputed" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Imputing numerical values" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "df_numeric = datawig.utils.generate_df_numeric(\n", 88 | " num_samples=200, \n", 89 | " data_column_name='x', \n", 90 | " label_column_name='y') \n", 91 | "\n", 92 | "df_train_numeric, df_test_numeric = datawig.utils.random_split(df_numeric)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "df_train_numeric" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "#Initialize a SimpleImputer model\n", 111 | "imputer_numeric = datawig.SimpleImputer(\n", 112 | " input_columns=['x'], # column(s) containing information about the column we want to impute\n", 113 | " output_column='y', # the column we'd like to impute values for\n", 114 | " output_path = 'imputer_model_numeric' # stores model data and metrics\n", 115 | ")\n", 116 | "\n", 117 | "#Fit an imputer model on the train data\n", 118 | "imputer_numeric.fit(train_df=df_train_numeric, num_epochs=50)\n", 119 | "\n", 120 | "#Impute missing values and return 
original dataframe with predictions\n", 121 | "imputed_numeric = imputer_numeric.predict(df_test_numeric)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "imputed_numeric" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Imputing missing values on real world data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "product_data = pd.read_csv('products.csv', sep='\\t')\n", 147 | "training_products, test_products = train_test_split(product_data, test_size=0.2, random_state=42)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Train an imputation model for the 'category' column and measure how good the imputation works" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 3", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 3 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython3", 181 | "version": "3.6.9" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /09/openrefine.txt: -------------------------------------------------------------------------------- 1 | https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-linux-3.2.tar.gz 2 | https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-mac-3.2.dmg 3 | 
https://github.com/OpenRefine/OpenRefine/releases/download/3.2/openrefine-win-3.2.zip 4 | 5 | -------------------------------------------------------------------------------- /09/requirements.txt: -------------------------------------------------------------------------------- 1 | datawig 2 | scikit-learn==0.21.3 3 | pandas==0.25.1 4 | jupyter==1.0.0 5 | 6 | -------------------------------------------------------------------------------- /10/images/Complex_NoProc_V3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/10/images/Complex_NoProc_V3.jpg -------------------------------------------------------------------------------- /10/images/cnn_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/10/images/cnn_arch.png -------------------------------------------------------------------------------- /10/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.17.2 2 | pandas==0.25.1 3 | scikit-learn==0.21.3 4 | matplotlib==3.1.1 5 | aif360==0.2.2 6 | lime==0.1.1.36 7 | jupyter==1.0.0 -------------------------------------------------------------------------------- /10/tutorial_credit_scoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Detecting and mitigating age bias on credit decisions \n", 8 | "\n", 9 | "The goal of this tutorial is to introduce the basic functionality of AI Fairness 360 to an interested developer who may not have a background in bias detection and mitigation.\n", 10 | "\n", 11 | "### Biases and Machine Learning\n", 12 | "A machine learning model makes predictions of an outcome for a particular 
instance. (Given an instance of a loan application, predict if the applicant will repay the loan.) The model makes these predictions based on a training dataset, where many other instances (other loan applications) and actual outcomes (whether they repaid) are provided. Thus, a machine learning algorithm will attempt to find patterns, or generalizations, in the training dataset to use when a prediction for a new instance is needed. (For example, one pattern it might discover is \"if a person has salary > USD 40K and has outstanding debt < USD 5, they will repay the loan\".) In many domains this technique, called supervised machine learning, has worked very well.\n", 13 | "\n", 14 | "However, sometimes the patterns that are found may not be desirable or may even be illegal. For example, a loan repay model may determine that age plays a significant role in the prediction of repayment because the training dataset happened to have better repayment for one age group than for another. This raises two problems: 1) the training dataset may not be representative of the true population of people of all age groups, and 2) even if it is representative, it is illegal to base any decision on a applicant's age, regardless of whether this is a good prediction based on historical data.\n", 15 | "\n", 16 | "AI Fairness 360 is designed to help address this problem with _fairness metrics_ and _bias mitigators_. Fairness metrics can be used to check for bias in machine learning workflows. Bias mitigators can be used to overcome bias in the workflow to produce a more fair outcome. \n", 17 | "\n", 18 | "The loan scenario describes an intuitive example of illegal bias. However, not all undesirable bias in machine learning is illegal it may also exist in more subtle ways. For example, a loan company may want a diverse portfolio of customers across all income levels, and thus, will deem it undesirable if they are making more loans to high income levels over low income levels. 
Although this is not illegal or unethical, it is undesirable for the company's strategy.\n", 19 | "\n", 20 | "As these two examples illustrate, a bias detection and/or mitigation toolkit needs to be tailored to the particular bias of interest. More specifically, it needs to know the attribute or attributes, called _protected attributes_, that are of interest: race is one example of a _protected attribute_ and age is a second.\n", 21 | "\n", 22 | "### The Machine Learning Workflow\n", 23 | "To understand how bias can enter a machine learning model, we first review the basics of how a model is created in a supervised machine learning process. \n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "![image](images/Complex_NoProc_V3.jpg)\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "\n", 33 | "\n", 34 | "\n", 35 | "\n", 36 | "First, the process starts with a _training dataset_, which contains a sequence of instances, where each instance has two components: the features and the correct prediction for those features. Next, a machine learning algorithm is trained on this training dataset to produce a machine learning model. This generated model can be used to make a prediction when given a new instance. A second dataset with features and correct predictions, called a _test dataset_, is used to assess the accuracy of the model.\n", 37 | "Since this test dataset is the same format as the training dataset, a set of instances of features and prediction pairs, often these two datasets derive from the same initial dataset. A random partitioning algorithm is used to split the initial dataset into training and test datasets.\n", 38 | "\n", 39 | "Bias can enter the system in any of the three steps above. The training data set may be biased in that its outcomes may be biased towards particular kinds of instances. The algorithm that creates the model may be biased in that it may generate models that are weighted towards particular features in the input. 
The test data set may be biased in that it has expectations on correct answers that may be biased. These three points in the machine learning process represent points for testing and mitigating bias. In AI Fairness 360 codebase, we call these points _pre-processing_, _in-processing_, and _post-processing_. \n", 40 | "\n", 41 | "### AI Fairness 360\n", 42 | "We are now ready to utilize AI Fairness 360 (`aif360`) to detect and mitigate bias. We will use the German credit dataset, splitting it into a training and test dataset. We will look for bias in the creation of a machine learning model to predict if an applicant should be given credit based on various features from a typical credit application. The protected attribute will be \"Age\", with \"1\" (older than or equal to 25) and \"0\" (younger than 25) being the values for the privileged and unprivileged groups, respectively.\n", 43 | "For this first tutorial, we will check for bias in the initial training data, mitigate the bias, and recheck. More sophisticated machine learning workflows are given in the author tutorials and demo notebooks in the codebase.\n", 44 | "\n", 45 | "Here are the steps involved\n", 46 | "#### Step 1: Write import statements\n", 47 | "#### Step 2: Set bias detection options, load dataset, and split between train and test\n", 48 | "#### Step 3: Compute fairness metric on original training dataset\n", 49 | "#### Step 4: Mitigate bias by transforming the original dataset\n", 50 | "#### Step 5: Compute fairness metric on transformed training dataset\n", 51 | "\n", 52 | "### Step 1 Import Statements\n", 53 | "As with any python program, the first step will be to import the necessary packages. Below we import several components from the `aif360` package. We import the GermanDataset, metrics to check for bias, and classes related to the algorithm we will use to mitigate bias." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Load all necessary packages\n", 63 | "import sys\n", 64 | "sys.path.insert(1, \"../\") \n", 65 | "\n", 66 | "import numpy as np\n", 67 | "np.random.seed(0)\n", 68 | "\n", 69 | "from aif360.datasets import GermanDataset\n", 70 | "from aif360.metrics import BinaryLabelDatasetMetric\n", 71 | "from aif360.algorithms.preprocessing import Reweighing\n", 72 | "\n", 73 | "from IPython.display import Markdown, display" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Step 2 Load dataset, specifying protected attribute, and split dataset into train and test\n", 81 | "In Step 2 we load the initial dataset, setting the protected attribute to be age. We then splits the original dataset into training and testing datasets. Although we will use only the training dataset in this tutorial, a normal workflow would also use a test dataset for assessing the efficacy (accuracy, fairness, etc.) during the development of a machine learning model. Finally, we set two variables (to be used in Step 3) for the privileged (1) and unprivileged (0) values for the age attribute. These are key inputs for detecting and mitigating bias, which will be Step 3 and Step 4. 
" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "dataset_orig = GermanDataset(\n", 91 | " protected_attribute_names=['age'], # this dataset also contains protected\n", 92 | " # attribute for \"sex\" which we do not\n", 93 | " # consider in this evaluation\n", 94 | " privileged_classes=[lambda x: x >= 25], # age >=25 is considered privileged\n", 95 | " features_to_drop=['personal_status', 'sex'] # ignore sex-related attributes\n", 96 | ")\n", 97 | "\n", 98 | "dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)\n", 99 | "\n", 100 | "privileged_groups = [{'age': 1}]\n", 101 | "unprivileged_groups = [{'age': 0}]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Step 3 Compute fairness metric on original training dataset\n", 109 | "Now that we've identified the protected attribute 'age' and defined privileged and unprivileged values, we can use aif360 to detect bias in the dataset. One simple test is to compare the percentage of favorable results for the privileged and unprivileged groups, subtracting the former percentage from the latter. A negative value indicates less favorable outcomes for the unprivileged groups. This is implemented in the method called mean_difference on the BinaryLabelDatasetMetric class. The code below performs this check and displays the output, showing that the difference is -0.169905." 
110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/markdown": [ 120 | "#### Original training dataset" 121 | ], 122 | "text/plain": [ 123 | "" 124 | ] 125 | }, 126 | "metadata": {}, 127 | "output_type": "display_data" 128 | }, 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Difference in mean outcomes between unprivileged and privileged groups = -0.169905\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", 139 | " unprivileged_groups=unprivileged_groups,\n", 140 | " privileged_groups=privileged_groups)\n", 141 | "display(Markdown(\"#### Original training dataset\"))\n", 142 | "print(\"Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_train.mean_difference())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Step 4 Mitigate bias by transforming the original dataset\n", 150 | "The previous step showed that the privileged group was getting 17% more positive outcomes in the training dataset. Since this is not desirable, we are going to try to mitigate this bias in the training dataset. As stated above, this is called _pre-processing_ mitigation because it happens before the creation of the model. \n", 151 | "\n", 152 | "AI Fairness 360 implements several pre-processing mitigation algorithms. We will choose the Reweighing algorithm [1], which is implemented in the `Reweighing` class in the `aif360.algorithms.preprocessing` package. 
This algorithm will transform the dataset to have more equity in positive outcomes on the protected attribute for the privileged and unprivileged groups.\n", 153 | "\n", 154 | "We then call the fit and transform methods to perform the transformation, producing a newly transformed training dataset (dataset_transf_train).\n", 155 | "\n", 156 | "`[1] F. Kamiran and T. Calders, \"Data Preprocessing Techniques for Classification without Discrimination,\" Knowledge and Information Systems, 2012.`" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "RW = Reweighing(unprivileged_groups=unprivileged_groups,\n", 166 | " privileged_groups=privileged_groups)\n", 167 | "dataset_transf_train = RW.fit_transform(dataset_orig_train)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### Step 5 Compute fairness metric on transformed dataset\n", 175 | "Now that we have a transformed dataset, we can check how effective it was in removing bias by using the same metric we used for the original training dataset in Step 3. Once again, we use the function mean_difference in the BinaryLabelDatasetMetric class. We see the mitigation step was very effective, the difference in mean outcomes is now 0.0. So we went from a 17% advantage for the privileged group to equality in terms of mean outcome." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 5, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/markdown": [ 186 | "#### Transformed training dataset" 187 | ], 188 | "text/plain": [ 189 | "" 190 | ] 191 | }, 192 | "metadata": {}, 193 | "output_type": "display_data" 194 | }, 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "Difference in mean outcomes between unprivileged and privileged groups = 0.000000\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "metric_transf_train = BinaryLabelDatasetMetric(dataset_transf_train, \n", 205 | " unprivileged_groups=unprivileged_groups,\n", 206 | " privileged_groups=privileged_groups)\n", 207 | "display(Markdown(\"#### Transformed training dataset\"))\n", 208 | "print(\"Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_transf_train.mean_difference())" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "### Summary\n", 216 | "The purpose of this tutorial is to give a new user to bias detection and mitigation a gentle introduction to some of the functionality of AI Fairness 360. A more complete use case would take the next step and see how the transformed dataset impacts the accuracy and fairness of a trained model. This is implemented in the demo notebook in the examples directory of toolkit, called demo_reweighing_preproc.ipynb. I highly encourage readers to view that notebook as it is generalization and extension of this simple tutorial.\n", 217 | "\n", 218 | "There are many metrics one can use to detect the presence of bias. AI Fairness 360 provides many of them for your use. Since it is not clear which of these metrics to use, we also provide some guidance. Likewise, there are many different bias mitigation algorithms one can employ, many of which are in AI Fairness 360. 
Other tutorials will demonstrate the use of some of these metrics and mitigation algorithms.\n", 219 | "\n", 220 | "As mentioned earlier, both fairness metrics and mitigation algorithms can be performed at various stages of the machine learning pipeline. We recommend checking for bias as often as possible, using as many metrics as are relevant for the application domain. We also recommend incorporating bias detection in an automated continuous integration pipeline to ensure bias awareness as a software project evolves.
| # TODO Implement 33 | 34 | 35 | class And(Constraint): 36 | def __init__(self, *constraints): 37 | pass 38 | 39 | # TODO Implement 40 | 41 | 42 | class Or(Constraint): 43 | def __init__(self, *constraints): 44 | pass 45 | 46 | # TODO Implement 47 | -------------------------------------------------------------------------------- /assignment1/components/learned_imputer.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class LearnedImputer(BaseEstimator, TransformerMixin): 5 | 6 | def __init__(self, target_column): 7 | pass 8 | # TODO Implement 9 | 10 | def fit(self, dataframe): 11 | # TODO Implement 12 | 13 | return self 14 | 15 | def transform(self, dataframe): 16 | pass 17 | # TODO Implement 18 | -------------------------------------------------------------------------------- /assignment1/components/trainer.py: -------------------------------------------------------------------------------- 1 | 2 | def create_pipeline(task): 3 | pass 4 | # TODO Implement 5 | 6 | 7 | def create_label_encoder(task, training_data): 8 | pass 9 | # TODO Implement 10 | 11 | 12 | def train_model_with_crossvalidation(task, pipeline, label_encoder, training_data, seed): 13 | pass 14 | # TODO Implement 15 | -------------------------------------------------------------------------------- /assignment1/instructions.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | Update your git repository and setup the virtual environment for this assignment analogously to how you did it for the lab exercises. 4 | 5 | The assignment consists of three independent tasks. Each task features a python file to execute. Note that the execution might result in errors if you did not implement the require code yet. In order to fulfill the assignment, you have to **implement python code in the files in the [components](components/) folder**. 
**Do not edit other files, especially the task files**. 6 | 7 | ## Task 1: Data Validation 8 | 9 | You can execute this task via ```python task1.py```. The goal of this task is to implement a few constraints for data validation defined in the file [components/constraints.py](components/constraints.py). Each constraint applies to a pandas dataframe and tests certain conditions on the contained data (or a specific column) of some [demographic income data](adult-sample.csv). 10 | 11 | * `HasAtLeastNumRecords`: checks that the dataframe has at least a given number of records 12 | * `NotNull`: checks that a column contains no null values 13 | * `HasNumDistinctValues`: checks that a column's number of distinct values is in a given range (including the min/max) 14 | * `IsInRange`: checks that a column's values are in a given range (including the min/max) 15 | * `And`: checks that all of the supplied constraints are satisfied 16 | * `Or`: checks that at least one of the supplied constraints is satisfied 17 | 18 | 19 | ## Task 2: A custom Estimator/Transformer for Missing Value Imputation 20 | 21 | You can execute this task via ```python task2.py```. The goal of this task is to implement an ML-based missing value imputer as an estimator/transformer in scikit learn in the file [components/learned_imputer.py](components/learned_imputer.py). 22 | 23 | You will work on a pandas dataframe, and your goal is to learn a model to impute missing values for the `target_column` given the values from the other columns of the dataframe. The choice of features and model is up to you. 24 | 25 | The model will be learned in the `fit` method of the class. For the sake of simplicity, you can assume that the dataframe only contains text columns. In the `transform` method of the class, your learned imputer should fill in missing values for the `target_column`. 
26 | 27 | As an example, we will impute the category of a product given its `review` and `title` with data taken from [products.csv](products.csv). You can run the tests for your imputer via ```python task2.py```; you should try to achieve an accuracy greater than 0.75. 28 | 29 | ## Task 3: Declarative Model Training 30 | 31 | You can execute this task via ```python task3.py```. In this task, we will implement a very simple "AutoML" system for supervised learning. You are given data in the form of a pandas dataframe, as well as a [`LearningTask`](task3.py#L9) which describes which column values we want to predict (`target_column`) and which columns we want to use as features (`categorical_columns`, `numeric_columns`, `textual_columns`). Additionally, you are provided with the `learning_algorithm` to use, as well as with the number of folds `num_folds` and a hyperparameter grid `hyperparam_grid` for grid search. 32 | 33 | The goal is to implement the following methods in [components/trainer.py](components/trainer.py) to conduct the model training invoked by ```python task3.py```: 34 | 35 | * `create_pipeline(task)`: Generate a sklearn pipeline for training a model corresponding to the task 36 | * `create_label_encoder(task, training_data)`: return a fitted [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) for the `target_column` 37 | * `train_model_with_crossvalidation(task, pipeline, label_encoder, training_data, seed)`: train the model (e.g. the pipeline defined earlier) using k-fold cross-validation 38 | 39 | The task file defines different tasks to predict certain columns of a sample of [income data](adult-sample.csv) and should result in an AUC of about 0.8 in most of the cases. 
40 | -------------------------------------------------------------------------------- /assignment1/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | 5 | -------------------------------------------------------------------------------- /assignment1/task1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from components.constraints import NotNull, HasNumDistinctValues, IsInRange, And, Or, HasAtLeastNumRecords 3 | 4 | # We load a pandas dataframe with missing values 5 | data = pd.read_csv('adult-sample.csv', na_values='?') 6 | 7 | # We evaluate constraints on the number of contained records 8 | assert HasAtLeastNumRecords(500).is_satisfied(data) 9 | assert not HasAtLeastNumRecords(501).is_satisfied(data) 10 | 11 | # We evaluate constraints on the range of values in the 'age' column 12 | assert IsInRange('age', 17, 90).is_satisfied(data) 13 | assert IsInRange('age', 10, 100).is_satisfied(data) 14 | assert not IsInRange('age', 50, 100).is_satisfied(data) 15 | 16 | # We evaluate constraints on the completeness of the 'age' and 'workclass' column 17 | assert NotNull('age').is_satisfied(data) 18 | assert not NotNull('workclass').is_satisfied(data) 19 | 20 | # We evaluate constraints on the completeness of the 'workclass' and 'education' column 21 | assert HasNumDistinctValues('workclass', 8, 8).is_satisfied(data) 22 | assert HasNumDistinctValues('workclass', 0, 10).is_satisfied(data) 23 | assert not HasNumDistinctValues('workclass', 10, 20).is_satisfied(data) 24 | assert HasNumDistinctValues('education', 16, 16).is_satisfied(data) 25 | 26 | # We evaluate boolean expressions built from of constraints 27 | assert And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 90)).is_satisfied(data) 28 | assert And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 
90)).is_satisfied(data) 29 | assert Or(NotNull('age'), NotNull('workclass')).is_satisfied(data) 30 | 31 | constraint = And( 32 | And(HasAtLeastNumRecords(500), NotNull('age'), IsInRange('age', 17, 90)), 33 | Or(NotNull('age'), NotNull('workclass'))) 34 | 35 | assert constraint.is_satisfied(data) 36 | 37 | -------------------------------------------------------------------------------- /assignment1/task2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from components.learned_imputer import LearnedImputer 5 | 6 | # We load product data for evaluation 7 | all_products = pd.read_csv('products.csv', sep='\t') 8 | 9 | # We run several experiments with different random seeds 10 | for seed in [42, 129, 788, 555, 123456]: 11 | data_for_run = all_products.copy(deep=True) 12 | 13 | # We define the names of the column to impute 14 | column_to_impute = 'category' 15 | 16 | # In some tests, we will change the column names, the imputer should be general enough to handle that 17 | if seed % 2 == 1: 18 | column_to_impute = 'Kategorie' 19 | data_for_run = data_for_run.rename(columns={"category": "Kategorie", "title": "Titel"}) 20 | 21 | # We split the data into train and test set 22 | train_data, test_data = train_test_split(data_for_run, test_size=0.2, random_state=seed) 23 | 24 | # We fit the imputer on the training data, it should learn its internal imputation model now 25 | imputer = LearnedImputer(column_to_impute) 26 | 27 | imputer.fit(train_data) 28 | 29 | # We create a copy with of the test data and remove the column values to impute 30 | to_impute = test_data.copy(deep=True) 31 | to_impute.category = np.nan 32 | 33 | # We have the imputer fill in the missing values 34 | imputed = imputer.transform(to_impute) 35 | 36 | # We compute the accuracy of the imputer by comparing the filled in values to the correct ones 37 | 
imputed['correct__'] = test_data[column_to_impute] 38 | correctly_imputed = imputed[(imputed['correct__'] == imputed[column_to_impute])] 39 | 40 | accuracy = float(len(correctly_imputed)) / len(imputed) 41 | 42 | print("Accuracy:", accuracy) 43 | assert accuracy >= 0.75 44 | -------------------------------------------------------------------------------- /assignment1/task3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.linear_model import SGDClassifier 4 | from sklearn.ensemble import RandomForestClassifier 5 | from components.trainer import create_pipeline, create_label_encoder, train_model_with_crossvalidation 6 | from sklearn.metrics import roc_auc_score 7 | 8 | # This task defines how we want to train a supervised model for some given data 9 | class LearningTask: 10 | def __init__(self, 11 | target_column, 12 | categorical_columns, 13 | numeric_columns, 14 | textual_columns, 15 | learning_algorithm, 16 | num_folds, 17 | hyperparam_grid): 18 | self.target_column = target_column 19 | self.categorical_columns = categorical_columns 20 | self.numeric_columns = numeric_columns 21 | self.textual_columns = textual_columns 22 | self.learning_algorithm = learning_algorithm 23 | self.num_folds = num_folds 24 | self.hyperparam_grid = hyperparam_grid 25 | 26 | 27 | # We execute the learning tasks using the methods defined in trainer.py 28 | def execute_task(task, data, current_seed): 29 | 30 | # We split 31 | train_data, test_data = train_test_split(data, test_size=0.2, random_state=current_seed) 32 | 33 | # We create the pipeline for our task 34 | pipeline = create_pipeline(task) 35 | 36 | # We obtain the fitted label encoder 37 | label_encoder = create_label_encoder(task, train_data) 38 | 39 | # We invoke the model training 40 | model = train_model_with_crossvalidation(task, pipeline, label_encoder, train_data, current_seed) 41 | 42 | # 
We compute the AUC of the model 43 | y_true = label_encoder.transform(test_data[task.target_column]) 44 | y_pred = model.predict_proba(test_data)[:, 0] 45 | auc_score = roc_auc_score(y_true, y_pred) 46 | 47 | if auc_score < 0.5: 48 | auc_score = 1.0 - auc_score 49 | 50 | return auc_score 51 | 52 | 53 | # We evaluate our model on income data 54 | income_data = pd.read_csv('adult-sample.csv') 55 | 56 | # We run several experiments with different random seeds 57 | for seed in [42, 12345]: 58 | 59 | task1 = LearningTask( 60 | target_column='income-per-year', 61 | categorical_columns=['workclass', 'occupation', 'marital-status'], 62 | numeric_columns=['age', 'capital-gain', 'capital-loss'], 63 | textual_columns=[], 64 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 65 | num_folds=4, 66 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 67 | 68 | auc1 = execute_task(task1, income_data, seed) 69 | print("AUC %s for task1 with seed %s" % (auc1, seed)) 70 | 71 | task2 = LearningTask( 72 | target_column='income-per-year', 73 | categorical_columns=['occupation'], 74 | numeric_columns=['age'], 75 | textual_columns=[], 76 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 77 | num_folds=4, 78 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 79 | 80 | auc2 = execute_task(task2, income_data, seed) 81 | print("AUC %s for task2 with seed %s" % (auc2, seed)) 82 | 83 | task3 = LearningTask( 84 | target_column='income-per-year', 85 | categorical_columns=['occupation'], 86 | numeric_columns=['age'], 87 | textual_columns=[], 88 | learning_algorithm=RandomForestClassifier(random_state=seed), 89 | num_folds=4, 90 | hyperparam_grid={'n_estimators': [10, 100, 1000]}) 91 | 92 | auc3 = execute_task(task3, income_data, seed) 93 | print("AUC %s for task3 with seed %s" % (auc3, seed)) 94 | 95 | task4 = LearningTask( 96 | target_column='sex', 97 | categorical_columns=['workclass', 'occupation', 
'marital-status', 'income-per-year'], 98 | numeric_columns=['age', 'capital-gain', 'capital-loss'], 99 | textual_columns=[], 100 | learning_algorithm=SGDClassifier(loss='log', random_state=seed), 101 | num_folds=4, 102 | hyperparam_grid={'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.001, 0.0001]}) 103 | 104 | auc4 = execute_task(task4, income_data, seed) 105 | print("AUC %s for task4 with seed %s" % (auc4, seed)) 106 | 107 | -------------------------------------------------------------------------------- /assignment2/README.md: -------------------------------------------------------------------------------- 1 | ## Task 1: Data Validation with Google TFX 2 | 3 | The goal of this task is to use [Tensorflow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv) to validate two sets of data files about products and ratings in the [data](data/) folder. 4 | 5 | We assume that all products data files are valid except for `products-data-3.tsv`. Additionally, we assume that `ratings-2.tsv` and `ratings-3.tsv` have anomalies. 6 | 7 | Eyeballing the data should help you identify differences between the individual files. We ask you use **tfdv** to infer a schema from the data, adjust the schema if necessary and ensure that your code correctly identifies the files with data anomalies. Use `python task1.py` to execute this task and implement your solution in [components/schema_validation.py](components/schema_validation.py). Looking at this [test code](https://github.com/tensorflow/data-validation/blob/80809cd738fd1178f6c0334b0e4f4e644f445139/tensorflow_data_validation/anomalies/schema_test.cc) from Tensorflow might help you identify schema constraints that are helpful for this task. 8 | 9 | 10 | ## Task 2: Parallel Data Processing with Apache Beam 11 | 12 | Next, you will have implement a parallel data preprocessing job using [Apache Beam](https://beam.apache.org). 
The input data consists of product descriptions in the file [products-data-0.tsv](data/products-data-0.tsv) with the schema `identifier, category, description` and of product ratings in the file [ratings-0.tsv](data/ratings-0.tsv) with the schema `identifier, rating`. 13 | 14 | Your Beam job should conduct the following operations: 15 | 1. Conduct an equi-join on both inputs using the join key `identifier` 16 | 1. Filter the join result to only retain records that have (a) 'Kitchen' as `category` and a `rating` of at least 4 or (b) 'Jewelry' as `category` and a `rating` of 5 17 | 1. Group the join results by `category` 18 | 1. Compute the number of records per group 19 | 1. Write the categories and counts tab separated into a file `category_counts.tsv-00000-of-00001`; a line of this file could look like `Kitchen 123` for example. 20 | 21 | Use `python task2.py` to execute this task and implement your solution in [components/beam_job.py](components/beam_job.py). 22 | 23 | 24 | ## Task 3: Implement your own MapReduce Engine 25 | 26 | In this task, you have to implement your own simple MapReduce engine. Note that instead of the typical case of implementing the functions `f_m` and `f_r` that are executed by the MapReduce engine, this task has a different setup. The functions `f_m` and `f_r` are given for a simple wordcount algorithm and you have to implement the underlying runtime that applies them to the input data, according to the MapReduce paradigm. 27 | 28 | Use `python task3.py` to execute this task and implement your solution in [components/mapreduce.py](components/mapreduce.py). You have to implement methods to run the typical three phases of a MapReduce job: 29 | 30 | 1. Run the [map-phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L28): for each partition, transform each input record with `f_m` 31 | 2. 
Run the [shuffle-phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L33): create as many output partitions as we have reducers, and ensure that all records with the same key are put into the same group in the same partition 32 | 3. Run the [reduce phase](https://github.com/schelterlabs/deml-lab/blob/master/assignment2/components/mapreduce.py#L37): apply `f_r` to every group in every partition 33 | 34 | Note that your implementation does not need to run in parallel or be efficient or be able to handle large datasets. 35 | 36 | ## Task 4: Parallel Linear Regression with MapReduce 37 | 38 | In this final task, you will use your MapReduce engine from the previous task to implement the parallel linear regression example from [Map-Reduce for Machine Learning on Multicore](https://papers.nips.cc/paper/3150-map-reduce-for-machine-learning-on-multicore.pdf). 39 | 40 | Here is the description of the approach: 41 | ![](regression.png) 42 | 43 | The input data for this task is a simple regression problem generated by scikit-learn's [make_regression](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html). Use `python task4.py` to execute this task and implement your solution in [components/linear_regression.py](components/linear_regression.py). Use the output of `result_key()` as the key for the model in the final result. 
44 | -------------------------------------------------------------------------------- /assignment2/components/beam_job.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | import apache_beam as beam 5 | 6 | def create_and_run_beam_job(path_to_products_file, path_to_ratings_file): 7 | # TODO implement 8 | pass -------------------------------------------------------------------------------- /assignment2/components/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def f_m(key, value): 5 | # TODO Implement 6 | pass 7 | 8 | 9 | def f_r(key, values): 10 | # TODO Implement 11 | pass 12 | 13 | def result_key(): 14 | return 'dummy_key' -------------------------------------------------------------------------------- /assignment2/components/mapreduce.py: -------------------------------------------------------------------------------- 1 | 2 | class MapReduceEngine: 3 | 4 | def __init__(self, f_m, f_r, num_reducers): 5 | self.f_m = f_m 6 | self.f_r = f_r 7 | self.num_reducers = num_reducers 8 | 9 | def execute(self, partitions): 10 | # Run the map-phase: for each partition, transform each input record with f_r 11 | map_outputs = [] 12 | for partition in partitions: 13 | map_output_for_partition = self.map_partition(partition) 14 | map_outputs.extend(flatten(map_output_for_partition)) 15 | 16 | # Run the shuffle-phase: create as many output partitions as we have reducers, all records with the same 17 | # key must land in the same group in the same partition 18 | shuffle_outputs = self.shuffle(map_outputs) 19 | 20 | # Run the reduce phase: apply f_r to every group in every partition 21 | reduce_outputs = {} 22 | for partition in shuffle_outputs: 23 | reduce_output_for_partition = self.reduce_partition(partition) 24 | 
reduce_outputs.update(reduce_output_for_partition) 25 | 26 | return reduce_outputs 27 | 28 | def map_partition(self, partition): 29 | # TODO implement 30 | pass 31 | 32 | 33 | def shuffle(self, map_outputs): 34 | # TODO implement 35 | pass 36 | 37 | def reduce_partition(self, partition): 38 | # TODO implement 39 | pass 40 | 41 | 42 | def flatten(nested_list): 43 | return [item for sublist in nested_list for item in sublist] -------------------------------------------------------------------------------- /assignment2/components/schema_validation.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import tensorflow_data_validation as tfdv 3 | 4 | CSV_DELIMITER = '\t' 5 | 6 | def infer_schema_from_csv(csv_file, column_names): 7 | # TODO implement 8 | pass 9 | 10 | 11 | def has_anomalies(csv_file, schema): 12 | # TODO implement 13 | pass 14 | 15 | 16 | def adjust_product_schema(schema): 17 | # TODO implement 18 | pass 19 | 20 | def adjust_rating_schema(schema): 21 | # TODO implement 22 | pass -------------------------------------------------------------------------------- /assignment2/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment2/data.zip -------------------------------------------------------------------------------- /assignment2/data/ratings-0.tsv: -------------------------------------------------------------------------------- 1 | ab602aca-9bad-4aa9-bd42-6ce24cdf8680 5 2 | f98cd8d3-d6fe-4ee3-8c9f-a18c1690f7a6 5 3 | 396f73e3-e3ef-4c93-a4b7-e8b04c6a2223 3 4 | 329ed5e3-8b47-4e78-961b-18b89e7da808 1 5 | e61dca8b-3577-4c88-a9de-47db3bc3cc99 4 6 | 39191b63-8e02-4292-8d11-67c6f2bb1ae3 5 7 | b9d72513-97db-4121-baa1-f9665b5dc1cd 5 8 | a6260267-9a39-454f-80fa-5232909049e0 5 9 | e3167452-e087-47e6-a77b-d0efb0a1b997 5 10 | 
93163305-6046-41dc-b64b-e85b24e61269 5 11 | 34fe8d10-3554-4b0c-99a6-a000bdc47b9b 5 12 | f768c63e-616c-4578-9b60-d2c66972992c 5 13 | 5e60bd1b-28ca-4119-bc43-c4201cef0247 5 14 | acc327b7-d749-4900-86d2-5dea1ebb4193 5 15 | ef3ac854-c6a0-4dd4-8342-4c20d15edac7 4 16 | b1c07c7e-4f95-4a1e-a25f-df169de46011 5 17 | 1ad593c0-52c8-4c82-b682-21db37f862cc 1 18 | 7e51e537-6617-4450-a65b-4e5abee27b43 5 19 | ffbd5589-5aac-4344-a591-8a4887dd1dc7 1 20 | 57c9c17b-98fd-4286-85d4-f0539e81a0d3 5 21 | a14eb4b5-7763-464f-919b-e72804270f9a 5 22 | 2eb0961b-9bb1-4c1d-8b71-4180e706534f 5 23 | afd402e6-1bba-43d4-9ef7-b3706409e38c 4 24 | 34ac55e7-6e90-45b5-a586-e3e8cde05e34 5 25 | 1cf19399-953a-4444-baa1-c99285443d32 5 26 | 66491d95-37ce-4f40-9058-d4cb725f2afc 3 27 | 6621caa1-2a47-4c18-8dd2-d15c4e4312b4 1 28 | f9c56e5a-2663-4777-8076-76ee5e6309f7 5 29 | ab09b9c4-bcd2-44ed-9873-79fb0ecd67ac 1 30 | 1d0f80ab-0fa6-42ec-907e-95e271358b9b 5 31 | efbc939c-3ad3-4bd7-b2ef-5e34aabd3ef7 5 32 | a76c0826-b5c9-4c83-b678-74dcdf92d8b5 4 33 | e9e4fddb-f5d7-4fad-a2ca-84c659a2fec1 4 34 | cc9d4991-a063-4a56-882e-2bdf2fe24be4 5 35 | dbd8ada8-2755-4867-8dc2-8046a235c46a 5 36 | ef884d81-0309-491e-bb76-fbf13173fdfd 5 37 | bd8f77d8-af0d-4928-9f91-c155044a48cc 5 38 | 4d736ae6-5950-4241-9d70-d7acd2d0bd09 5 39 | baa90425-1179-4c11-aad7-bbe646fa2556 1 40 | 0d3d2799-aaf1-4c31-9533-ee16ec4c3fb3 5 41 | 714b4282-7773-416f-9031-50784580dc45 5 42 | dcaab57e-2ae1-4662-96f3-40629b0ec8e5 5 43 | 77398786-8b44-4f8e-a35e-b87071fe6e3c 5 44 | cd3b254e-1398-4001-9642-eba75d67dfad 5 45 | 7026f0cc-8370-4abf-a2d8-1f59244f0700 5 46 | e8f86669-aa81-4ecc-9711-6f1d7eb6c07f 4 47 | 36bb9b53-cc30-44d4-8408-23c19b9d6564 5 48 | 70128d47-b744-4421-96ff-a52d3b0d52cf 5 49 | 19d3c205-8dfb-473d-ad21-30514e05e2f5 4 50 | bf4f3700-b6e3-4000-a88d-acf081884641 5 51 | 1c6417db-3cda-4a02-bb43-71104c2f5f7a 5 52 | b0e6bb50-a31e-44e5-a68d-caaeec6470ad 5 53 | 87be9f94-ac75-41d3-bfd1-9c95a2bd65f5 5 54 | 280256b4-49cd-4f87-a3ca-aa6eea48376e 5 55 | 
2d653c2a-7b5c-4789-82ac-df4e0951131f 1 56 | 01a0027e-f261-4402-8802-c69ec62eae4c 5 57 | a71591fc-91c4-47c0-ad0a-dd33c244738e 4 58 | 033c543c-da1e-4b3c-a11b-261aef50d201 5 59 | 17b8efe6-e0b2-4e49-a5fe-24bc8568d742 5 60 | edea43cb-e1b7-453b-9e9b-e0efd1592231 3 61 | 64a5234d-f9ac-4baa-85d4-22680004effd 3 62 | 6f406526-b139-426e-bd70-be38de9736f9 5 63 | 064d6b5e-68da-4a34-9ba8-f521d61ac27f 5 64 | 45e740ca-473d-47ae-84e1-badd8b3f74b3 5 65 | d97b6acb-c430-46ca-9751-4a1752af34a7 1 66 | 3585c0df-540f-4d8c-bb5a-59ee19480cda 5 67 | b8599cbe-1ec2-4884-8584-7418e848c255 5 68 | dd623e94-9422-460f-8c20-190d0d5432f4 5 69 | 5197ffce-4b62-43a6-907c-3d7e3515b1d6 5 70 | ac66c1cf-e87f-47ab-a391-26acb7797d58 5 71 | e0e0053b-f9f3-4f7d-9513-d3e1625ceaf0 5 72 | b007e2a8-cd1f-4ed2-b912-1bd1153d4dd1 5 73 | 9a5c6344-e6a6-486a-ae0d-134ed7de3e1d 5 74 | 3df801cb-2325-4e31-bc41-ee71ae851664 5 75 | 8d9d81bf-1be2-4226-a5bc-0af09506df22 4 76 | 2cd0cb45-48b7-4742-aeea-df2f991cac53 4 77 | d9c1203c-f961-438f-9e59-ac61df27e038 5 78 | 43bf23a7-6a98-4a05-b49e-b25e84c46935 3 79 | 8adce774-d906-4774-a207-82fa7258e6e7 5 80 | df28a8ea-508b-4a22-aad0-51c6b986b703 3 81 | 0f7a8aca-c7d5-4139-8ecd-5d8501d61088 5 82 | c9dc0726-b779-44d0-a4f4-7e83baa3cca4 2 83 | f72a5f00-b9fd-4f8d-9092-2dfdc6d4485f 5 84 | df7e80d4-0708-4d7c-a09d-684b46478dcc 3 85 | bc9ecd60-708d-4452-8198-1214edf344c4 4 86 | d9f56461-5547-4ba8-8ba4-a4fdf81de20b 5 87 | 71d871c8-2a59-42de-96f5-3423ee5d4784 4 88 | 78a62ae0-7c26-4284-81fb-30a716dd2411 4 89 | f638499c-6670-496e-a0f6-3e8ad1944343 5 90 | 00552178-6074-4f0d-8351-6cb508a9c208 1 91 | 233528b9-514c-4ff9-91f0-3f2517ffac67 5 92 | 3dfab41d-99be-4109-8721-5a3343cd930d 1 93 | d8d4ae14-d838-4395-a039-fd2a3b1b5e28 4 94 | 0e75b933-ded8-4164-865a-4983b7ba0cc4 4 95 | 81ea7456-0733-4f91-b401-2e9e46c19a3d 5 96 | c59903d2-a6a1-4b07-ad6b-ba3af90b9f4f 2 97 | 02845cd0-2518-4b78-820b-106add23787d 1 98 | 00645388-2228-452f-9b5c-7d1a0df1e9d1 5 99 | 47e073a8-8bcd-4b50-9d9f-dc3ddd150568 2 100 | 
947989d2-3588-4e7c-bb8c-b163cc8d5386 5 101 | ffa40a7d-e690-4369-a98a-f996a465d134 3 102 | 5371f5cb-856d-4a33-a361-cf542fdf8890 3 103 | ebf5111b-fb31-4d62-b42c-b2eaa15a86b1 5 104 | 40902631-7afd-4a9d-aea0-db4f3f1cbb9c 2 105 | 8f4b3c22-5bc8-475b-a428-b167f7995800 3 106 | 1a291ad0-c8e2-4a87-b6ad-88e49e500685 5 107 | d64d7daf-a942-482c-80b1-9d97e0665d8e 5 108 | c2f59659-9fd3-43b4-95ed-c09d19b788dc 5 109 | e9791924-15c3-4197-8c38-6281c4a864ec 5 110 | 7d6206f8-742d-4c01-9c70-2192d3453089 5 111 | b3f807c6-0116-4361-850d-f02b6b840274 5 112 | 881b11fc-641b-42f0-9973-68438b222fe4 5 113 | a73f6147-5013-4fb8-ab0b-caefa2ce5284 3 114 | d4dab584-bc5d-4f21-8fc7-1249388c31c1 5 115 | 3254142e-9931-488e-95a4-27e4f784bbbf 3 116 | 1d92debc-ce16-46dd-9c1b-e8dbb354ecc2 1 117 | fd296019-e728-458e-bdf2-c8b3eabf7c93 5 118 | 712529d5-fca5-49a4-b1fa-824f6c2ff099 5 119 | 7941512a-79e7-4a39-a360-0c24d0cadab1 5 120 | 4527a9ef-3b6c-45dc-956f-10dad7c952b2 5 121 | 5f74113f-aaa3-43ab-97b6-d34e56bc14f2 5 122 | d48625d0-c533-49f3-b57a-d97b18316343 5 123 | 1b171dfe-b13e-4c71-8337-12e102268f99 5 124 | 4e1faad1-d049-4fef-9385-ecf1b0e54d48 4 125 | de02cfdb-b10a-484f-957d-b0cf11cbdac5 5 126 | c5564131-2e9f-4dcb-86af-b9e96f9bf647 5 127 | c8d1378e-ca50-4ad8-99d3-3ebe5283933d 5 128 | b0170384-c254-4720-a79a-63edbd9b8ed8 5 129 | 00965ac4-99f8-4feb-93db-1ff71ce1e134 5 130 | 45d3698e-2f0d-416c-9401-f3995eb13234 2 131 | 779112f6-073a-40bb-908e-514829d4ae1f 5 132 | 063a89ac-dd7b-416c-885d-03e55c89d0aa 5 133 | ce603435-ce39-4ae8-bce2-c2a510fdee00 5 134 | ae0fa17c-9ffb-4071-9fc6-956fc9876005 2 135 | dc03bcf1-46c0-4ea9-8da1-36cfe52070aa 5 136 | 5af384ff-31f8-4b20-96f1-d09c200e8877 5 137 | fbb206f2-47b2-4ff6-b069-8428465d71fa 1 138 | a1023de8-b11f-4d96-9b0a-ace772ad6649 5 139 | fc49957a-cae4-4324-8dac-e2a722880011 3 140 | 0f2e9567-3cbb-4faf-8911-48dd972d9da9 5 141 | 18cbcb38-a63c-4d87-8014-82d5c71fad84 5 142 | fd1f13fa-dd4a-467c-bdec-0e21fd60c1b2 4 143 | 79dde4d8-3aa1-4b82-9a33-302e2be542f0 5 144 | 
69918764-0326-4f9a-a6bc-62ce4c4fc40d 5 145 | c5fed535-af63-465f-bdce-116446304c75 5 146 | 1d11ce64-6fa7-4f3d-9d54-750211e4c4fe 5 147 | 3928466a-f794-4952-803b-57687685b694 5 148 | 4066d298-8da2-4d3d-900d-af3606c834a6 5 149 | 9bd5b50f-b2ea-47d9-8ae8-351aefbac7f4 5 150 | 0170b1d0-af6c-43f5-a9c7-e368f84582d6 5 151 | 8acba203-c023-4681-8161-ef6eadff9146 2 152 | 66b030d6-eb0a-4409-aea2-31e474c3740b 5 153 | b6c622ea-ade6-4305-b6ab-8174bd47b0fd 5 154 | 57898d01-775a-4431-ab89-6d597a4f4f0b 1 155 | ae61e9c7-000f-4e3a-9460-a9312bc152eb 1 156 | 30e4b35b-9129-47fc-afec-973862bf1a9b 5 157 | a6a0db46-fb69-47a3-846a-c7800c05b035 5 158 | 18ddc4bc-f7a3-4a99-bf63-80454b150f33 4 159 | 60a48004-f59c-4c3f-aabe-4857317fdaf2 5 160 | c2c30f6a-1a33-42a0-b72e-912931a97da5 5 161 | b795dfcd-76bf-4cec-8c00-2a6d8f95ea44 5 162 | c37e9681-022b-4923-a838-3746dc30827f 5 163 | cf9e68d0-7a62-4d0b-b051-886cca76d093 5 164 | 031fd19f-48a8-4923-b886-bca54d643465 2 165 | 24566674-58a7-422d-9033-b005bd86594e 5 166 | 0a7e6948-85fc-4292-9f8d-df22ccb541a2 2 167 | 052fdb81-f4a9-4b0a-a913-2763be6546b2 1 168 | 6b6a4006-f1c0-4648-acf7-6d8f73f5c3bc 5 169 | 8694795b-258f-47e1-bf54-c12d1c8629d1 5 170 | 05f10bb4-aea1-4808-8c9f-ca1feaa6687e 2 171 | 508381f2-d295-46fd-87da-ec16cb5c2f1a 5 172 | 2069df2f-d8a4-4da6-9c41-80b3d34ae07e 5 173 | 22f8fabd-7884-4dca-88d0-fa1b8bfbc55e 5 174 | cfdd48a7-8ff3-4de9-85ce-394109134144 5 175 | b3e45891-65b2-40d6-84a7-7adc26b5d5b5 3 176 | 75c68af6-bfde-4f38-9bc6-fe65e6b0921c 3 177 | 3729140e-8b82-409b-beda-709d66d3a5ee 5 178 | 0876e7a7-8ede-4451-8ae1-c8e10b20dcdd 4 179 | 888d4ae7-b04e-4584-a302-76e2dcaa1cb6 5 180 | fed7b9e3-172e-48a5-9315-b7b8d2adf095 5 181 | 7d71af97-8409-4851-ab0e-7dd7ec53e0e4 5 182 | b751a1be-2473-4144-853e-ce1f867c1189 5 183 | 677cfb18-46c6-4770-b481-a60eb2b49dc1 5 184 | 53490736-3f8b-4212-9698-afc6a9819c58 5 185 | e9430d61-117e-4578-a6e6-3037e9112fef 5 186 | 9ceb1b30-f0f2-4c2b-9b4d-654f8ddb4574 1 187 | b6fe871a-2e58-4e56-a9dd-656a072f6106 5 188 | 
17cd7ef6-e12f-40c5-8797-11105f97c57a 5 189 | dddf4365-a160-49d4-b229-d7bde498db4e 5 190 | 69a36c2b-7250-4308-9661-3776076d7353 5 191 | c35beead-a5ea-43d8-b183-5c4c3fa0d995 4 192 | dcbb3511-b3b0-415f-96d5-34bb80c4c70c 4 193 | 59dc9143-4328-41c1-a518-ce947c442884 2 194 | ab476964-1b81-4aa5-91fd-5af1eaea0a42 4 195 | fb09b323-5b04-4bb3-9f08-30c6f8e63e9c 5 196 | 7a8408f1-8234-4372-b1e6-bbcc9bb744c9 5 197 | 1e7a9da5-e0bd-4830-9637-99799c54ea2b 5 198 | 6b21f9c0-6d36-4f2d-aac5-7bdc2731c36d 1 199 | 7c705f28-80aa-4d3a-a4c5-1a6a9d0f1384 5 200 | bc51bfc6-c3e1-478b-ab29-1c34c576c8db 4 201 | 2e06b8bc-fd6d-4dc0-a364-82ab4abb2689 4 202 | aaae85ef-83ee-4deb-b6f6-f72be1b4b612 5 203 | a5d78c4b-a48b-4efb-9dac-bdc2659f501e 5 204 | d291f359-d8c1-47a1-a745-ddecc3b27dc6 5 205 | 174856b8-e940-488f-8b60-338d181f144b 4 206 | 3fee491b-b845-4d7c-9301-80d5fb854db8 5 207 | d8237ff3-05d0-4a4d-acc4-5690b11a0b74 4 208 | 3a99b893-e7b0-4602-876a-5b31edc3bb02 5 209 | 8dc0b64c-7f2c-4576-b129-4f1a867bf3be 4 210 | 81ca1b53-c347-4741-a230-9571465d9b11 4 211 | 9bb2c221-e16f-4d78-bad8-9048db4e62a7 5 212 | 9d3a40ef-994c-4cb3-b49c-10292583a639 5 213 | 3195b911-42af-4b26-a965-4f017a8ee566 2 214 | 52170b3d-1355-4143-8e56-60a6a9769743 5 215 | 90bade87-e249-4490-8698-4f5ee028f709 5 216 | e9c5886f-3436-4ca4-b90a-d943fd856654 4 217 | e87b1b1b-0a57-4c9a-9982-ab060bdaa975 5 218 | f282686b-f210-467d-b7db-46b2e950e957 5 219 | 29f72764-6d3e-49af-bada-a6bb9a445e22 5 220 | 7a10a71a-8b06-4eb7-b33b-a88d6b3e92c5 5 221 | 7bbc7e4c-dcfc-429d-97de-df378cc2d354 5 222 | d01d3dff-aa01-4200-b262-83be531533a0 4 223 | e20198f2-00a3-4bb2-8c47-9a028ae1c00c 5 224 | dc1899d8-d879-4a5b-b729-b9f711e01a36 1 225 | d4f9b6d1-5d01-45c5-a279-74a47faa4bd7 5 226 | f12a7e5d-2266-49cd-a684-835044185bcf 5 227 | 031d882a-b949-4b4e-8b99-ff4857d387c6 5 228 | 14b9303d-f778-4a07-99aa-57558145e08e 5 229 | 142b538f-388f-4e8e-a9fd-608e3d622790 3 230 | bd86820f-0927-48f4-9180-ea6a83b8ae34 4 231 | 7775b562-d669-413a-9057-08d9ab152c6d 5 232 | 
e67d74c1-86f1-40ee-b233-d8f70307b55e 1 233 | 4e696a07-2b61-405e-8ff9-b11a5361dff3 5 234 | f8fca6de-c39d-477f-a4fd-b4da20c6db32 5 235 | b82de19b-725c-4b21-ac97-510cd21ff62e 5 236 | 8dc8b0d1-f093-425c-b697-ab56fdd17683 5 237 | 011a4841-6338-4a19-8acc-a6ccbdefc409 5 238 | 53564ce9-c101-4ddb-ad4e-e8a7e7c41734 1 239 | f0f3dcb2-aad7-4729-b5fb-3888dc932441 5 240 | 95421457-f542-434e-aff0-cdd634437e7a 2 241 | e27e5df6-15d5-4633-a495-12c18161ba9e 5 242 | 29449dfb-fbbb-4001-bef6-8fff102f53f7 5 243 | aa06e4ef-4809-4f3f-bcee-7b730344f4a6 5 244 | 83d4a69b-28a6-4178-bb8d-cec15cf28564 5 245 | 605497c8-cb86-47cc-ae22-a00913446d88 1 246 | e15cbf0f-be0f-4c6c-8ffe-7460e1f3cfa9 5 247 | b8a82a12-7ea1-4c3f-946c-e27c9edb5295 5 248 | 110d2ecb-6887-4113-b13a-038fb60d7c07 5 249 | b5688331-6972-4ddc-8b18-8469ed2c76e1 5 250 | 6e3328b3-6458-49d3-b643-d0744f3bd949 5 251 | 18377530-2a79-4ec8-aaf3-41b04560be89 4 252 | 8cb374ae-c8b9-4d12-913f-96f6499e8113 5 253 | 80557652-ac83-48b5-b0bb-2e48a89ae0c6 5 254 | 88e66a00-273c-4e9a-aa8f-67fe0d9b3782 5 255 | 45dd273c-5ac4-43dc-8ed6-d30d108eb293 1 256 | 6a1dad68-0f5b-4189-9281-74a7f8d21ff8 5 257 | 84d4c7ab-04a4-4138-b21f-a5cbe2bbdbeb 3 258 | 16ecc692-3d65-4b40-898f-563df4cd2bfd 4 259 | b5ca790c-b8f1-4daf-9bf4-7463bc4544a9 5 260 | a60c3e9d-79b8-4ffe-9a97-97d2e00b15ff 5 261 | c7e8149a-8310-4d5f-b22d-ec5cd0f64047 5 262 | 7912e691-ce7e-4492-a127-795716e8fd4d 1 263 | 1f647b89-0044-4c1f-a1ea-61ec37b3a535 3 264 | 0c515d16-798d-4e48-8eca-8acc13ad1658 5 265 | 2c4d326e-4c7a-4c10-87c4-3e2e428aef33 1 266 | 9609ec6f-211b-448a-a373-4bc78098560f 5 267 | 0a5409ad-a8c0-4606-bf8d-6faefa57fd05 3 268 | 3bcf812f-fe88-4b61-883d-fa600f3ee484 1 269 | f9a0dfd2-4032-4858-80ac-63d53496e48c 5 270 | 9e7d6cc6-c62d-49ad-b96f-b4e30d198a2e 5 271 | 2ea0ca0b-dac8-4a08-a749-a3fa040960ec 5 272 | bd54b369-254a-4602-afa3-150994297432 3 273 | 9cebb8f0-d861-4e6b-bb59-07b89d6da906 1 274 | 685a2b73-3aec-4d49-9f89-8bc4077f5924 3 275 | 35b9d104-3db6-48df-b3bb-10bc54fbd7cf 4 276 | 
7fe242ac-e844-4dfe-bfce-bde7f20b0c59 4 277 | 9ff710f7-896c-44b8-b379-9c2db5ea42e8 4 278 | afadf913-abd4-436a-8ca8-270cb7eb404d 1 279 | de48d136-7df1-4dc8-8410-18f0a97813cd 5 280 | 281fb66f-8395-4cf1-9072-8064bc58d9cc 4 281 | 0c967875-3a6f-4aeb-b922-42c5a185aa94 5 282 | 71b6e072-1597-4368-9ae9-f9204ce3acdb 5 283 | 0d7a1277-c653-4ebb-8524-2467712357ea 4 284 | 6247ac9f-ccf5-4df9-beac-e9af54d86ac0 5 285 | 13daf93c-ce61-4004-85ab-6c88c9f30279 5 286 | 58306131-9591-4793-b05f-eb98b7daa3a9 5 287 | 022ec213-2d13-41d0-b98c-0baf04d8e3d5 5 288 | 06517fdf-b7b0-4055-bee5-e45bd8c6bfef 5 289 | e8a5850d-9849-48c8-a44f-9585a6923aa9 4 290 | a96ad8d7-01f2-4c1a-b429-2dd9c6b06476 5 291 | d17a38d6-058c-4861-b6d3-934517c7c909 5 292 | eef6fbba-dcd7-4c51-8ce4-cd205c323924 5 293 | 26e053b5-910f-40ff-8231-a45162718084 5 294 | 124a8ac2-6ab3-4fee-8200-e0cf484d04ae 5 295 | a0321e36-efcc-4073-9f85-69092180e9ed 1 296 | 2019d0bf-b31d-4ad4-acb8-2657ff6db3c5 5 297 | 4dba88e1-ac67-42c4-80f6-8dd3d449dd71 5 298 | 163c0493-3b32-4bd1-8eb1-367c0d0e3039 5 299 | ad89d9af-1227-436f-aec6-ddc963d8db4e 5 300 | e8dc5827-49b5-490d-9259-3f77425d2b2f 5 301 | 39052c68-6238-41a0-9085-201949e459ff 5 302 | 676a773f-c41c-4e59-bfec-9f7c15118007 5 303 | cd1fc09b-fb28-43a2-bc72-b3329b2a63ff 5 304 | 0e418a70-92ab-4310-9fee-d33efb795849 5 305 | 60327006-34da-4c88-9b22-038b5b286eec 5 306 | 94a47aff-7a93-494f-8f40-2eab54454e95 5 307 | 802f6d9c-614b-4f16-838a-73c7d98dfa21 2 308 | 36fff17d-3edb-4c54-8687-2070b05510d2 5 309 | f5c79205-9269-4fcd-86b5-4267ae7af93c 5 310 | 14ff5559-eb57-4f93-bd91-4aeba852f312 5 311 | a4b96feb-5a11-4209-bc7c-5f088dfc3a4b 1 312 | 7712a1ec-0fbc-45b4-98ad-a8874f775fc9 5 313 | 037329a0-0954-427a-a707-6dfd58bb5044 5 314 | 1de3492d-25a8-47ba-8160-c191fe671881 5 315 | 9805d0d9-1519-4222-8af2-4df43d7d7b27 5 316 | 7be2d9b9-c7e1-48c3-aa23-83950cb2129e 5 317 | 6ba33425-261a-4008-ba50-63a3643cb3c4 2 318 | 56c4600d-cced-4aef-bf15-a23ddec388f1 4 319 | 1d8d6ae5-fb0d-4dbc-b74c-94ffa839d779 1 320 | 
d1e92bcd-d3d2-4d8b-97f4-c171d85ac5df 5 321 | 6f7ba9e7-60c9-4c6f-b672-3265d93310de 5 322 | d2d3e3ed-e6ff-4829-85d5-20ad40ae9663 5 323 | abb75a7e-86b8-43be-9eb2-6f8e5ba25c3f 3 324 | 19b910d9-3253-4376-9f73-b1aec56ac263 4 325 | 393b2f3c-5158-420a-a2b1-23c1f08e966a 5 326 | fdf56656-1016-485f-a3bf-79da1aeafe6a 1 327 | faf7f15e-80c8-45b2-9b69-4e466894676e 5 328 | cfbb8e61-d818-4bf6-9fc6-861068084631 5 329 | d3f57ceb-38c8-471f-82fd-31d7f31579fc 3 330 | 3de731d5-4854-42c2-b00e-37b50ef2b232 4 331 | 46fdf467-f36d-4f3a-a4bf-3ba42aba50e8 5 332 | 2b87b0cf-1f13-4574-b8a4-83773396ea5e 5 333 | 88f2e073-a32f-4c0f-b5a2-fcbed747a016 4 334 | f76f7b8d-de62-4776-a77f-06352cf7acc0 5 335 | 7bdca966-7a40-4190-ae6c-e878f088f048 5 336 | 7e6eaffe-4e0a-4fcd-be65-528d18737ca1 5 337 | 752b0d2c-adb6-4079-9af1-0fd9493090e3 3 338 | 3fe82b84-1230-45e1-9e70-71408b111458 5 339 | 95ed642c-6cb1-483f-99ae-a6937cb2b3fd 5 340 | 7cf32b45-0d0c-4e97-8063-3b2c4af73ffe 5 341 | 266fc420-775c-48c9-b13d-ed455a51fc2f 5 342 | 149b3fcc-7fcc-455a-9161-3167a9bd9f1f 4 343 | 6d593b63-17d9-47d8-82c9-43de8072fae1 4 344 | ea3ed858-3342-4128-91cb-7724f2348772 5 345 | 4252f1c2-c976-4ba4-966d-30d2eabda9e1 5 346 | ebeb26cb-756a-4c60-a862-0d1f073832b5 5 347 | 54491e98-240f-4f17-aa4e-9278d32e83c0 1 348 | 4bf73b45-efdf-4626-952a-12441dde9de8 4 349 | 0e114a73-5d8f-4aaf-980f-c0012acc3fd9 4 350 | 5a0926db-e238-48e6-8d69-617a5d824c01 1 351 | 5b6718e1-2602-4daa-bced-c38b25005e19 5 352 | c0ccd086-6c02-4126-adfc-ce752a58115f 1 353 | 7966cd84-00c1-42b1-b48c-c7c3369b489b 2 354 | 078f148e-1916-44fd-a7bf-e085af105329 5 355 | 8ff6533f-0dd9-44e4-a3b3-980abfb85edc 2 356 | 63db975b-931e-430d-94fe-5bfda6bf2b96 5 357 | a4f82b2f-c7a5-48e0-aa60-83c58161745c 5 358 | 57dd90e3-bb58-4bce-a6de-d840095bccc2 5 359 | c21b34f1-16c7-4bba-9fa0-619061a791c8 4 360 | e61740ef-1e2d-44f8-a836-07cb14410de0 4 361 | 89300dbe-3d0a-48fa-8f15-4a26bc5aded1 5 362 | 51632de9-8e43-4ae3-82f5-dd9460bda197 1 363 | 8622060e-bc75-4cbf-8910-6cc23bdb8fdf 5 364 | 
dd1c9b6c-c794-4987-b9d6-dc32d3772db4 5 365 | e03711d9-e0f9-4fe3-9dfd-abbb67c690c1 5 366 | 8b7ddba1-5ad6-4cce-b86f-6436388d2ab9 4 367 | 0cf5e80a-8251-4169-a1fb-a06aa2251e2d 5 368 | 4ae031b9-7226-475b-9e27-9e069f35b7f8 5 369 | 1518a8e6-721b-4a81-8140-e8e4532ffa02 4 370 | 3b7157b3-1aa9-4611-96e4-67b86349876e 5 371 | 9c7577da-c2be-45f3-a75d-0dbfe6111969 1 372 | 5a7236c9-9beb-4718-9397-68a6fca49260 5 373 | 71b6dd38-b9bb-49bb-a099-3d8f3d0aa07f 5 374 | ec1a3edf-76d3-4ec0-8659-9ec2be5e66c0 5 375 | 76fed137-afc9-4fa1-a24b-8bd6c9086244 5 376 | 2e79591c-0df4-475c-8e96-c7cae176c7e6 5 377 | c75989c7-7a46-4c92-a034-eda43f20dfa3 5 378 | e1956382-2e2b-46cc-998e-5576e5d6ec61 5 379 | 1e5de174-a45c-47f3-b4ca-8d4cfc8a90dc 5 380 | 42f0d511-9afb-4874-84a1-1660e08c2c5a 5 381 | 2c15e478-7764-43c7-8378-50f903c28b88 5 382 | 6616ac79-b0df-470c-ba2e-9ab48fc03287 4 383 | 9b8c4f91-8683-4924-a10b-2bcf9cd31e7c 5 384 | df4a557c-7be7-4cc1-ad94-4cbf5b4f1824 5 385 | 262612a3-7693-4291-b815-7534a8b2f832 5 386 | 6dbf90e0-518d-4694-a41f-efce8cc67724 5 387 | 2ee86bb4-bdde-4b09-a5c6-33776de29a05 4 388 | 1ef31326-7983-43fd-87b4-5b51a3c00765 5 389 | 6ee1cda0-bbec-4709-8031-a107e8bb4434 5 390 | e0a9c139-5739-4d7c-815e-4b75488374a9 1 391 | 2c59cc9c-7a86-4df0-8b7c-a64027eba4c5 5 392 | b95c9b15-d55a-4610-8e45-c4a2c6130356 5 393 | 4742e54c-6f9e-4059-b334-a7b1e6f967e1 5 394 | 1ba88839-00a4-47c2-87b3-aff9fbe6449d 5 395 | ad6e14a5-face-4281-a2f6-9d241276746d 3 396 | d5af638b-78a8-470b-ab77-cf66afeb8dab 5 397 | 770450a8-1f52-4f43-b143-6de0748f4369 5 398 | a1a95bf4-4786-421a-8b0b-a1b367b0174e 5 399 | 08f39e61-0914-4e3b-8344-684d02379680 2 400 | 28bccd02-2272-47c2-b666-9c1af9b605c9 5 401 | 872c6cd8-974b-417a-ae44-b085151359f2 5 402 | 771d3df9-74bb-4afa-9fd1-f356f3a98495 5 403 | 5f0999d7-7ef3-45ff-a8cd-ae77d448b104 4 404 | 866d4f6d-6471-478a-a32f-791750041f39 1 405 | c3eb3913-fe60-476e-b265-bee406c7ec69 3 406 | da6778ea-e6e4-4b17-b7ec-5aa554f7dd2f 5 407 | 29f9e887-f401-45e2-9b28-c1d2b7d384b8 2 408 | 
150bd722-1fac-4517-99c6-df25e3eb0b10 5 409 | 3a71095f-5b80-42d4-9352-35dc90328ee9 5 410 | 222e92eb-0fe5-47f9-8684-4e8bc9c91999 5 411 | 2ee6efe6-a41f-40a4-bf51-4c654906dd2e 5 412 | 78a7b74e-dd4e-4ef4-9818-2ec1f25b149d 5 413 | 195c681f-d101-460d-8714-dda45eb4a4ef 5 414 | 5a415c97-1263-49d6-a511-66672090a32f 1 415 | f4f47c3e-bd82-4fe6-a8fa-27e72f0d4d10 5 416 | cc547c9b-2aa9-4687-9a47-c6b2e652a5aa 1 417 | c36bbb95-a45c-4748-96bf-62ec0f1846e5 4 418 | 9b38ce05-d944-467f-b881-7c5af8c52376 1 419 | 4e2e0f47-7237-4a78-9726-ebb220ba9e25 5 420 | 91e068a8-bbde-423a-9d2c-8b10bd2ae3e3 5 421 | 03fa3a18-1846-4e77-ab3f-bdcdcf9e4513 4 422 | f823b09f-e6a5-40c2-ab6f-8409e8bd5c6b 5 423 | 0a7c3e45-a112-42b6-a827-f77f5ad91ae9 1 424 | b269d473-151f-48d7-b9d1-5a755a327403 5 425 | ae174371-0ac0-4a15-8470-b4fcf4ff9ab8 4 426 | c25e523f-0b6e-4724-9660-2d954767ad3c 5 427 | 9f797c06-e168-4e6d-b77a-10ce1ceb73bc 2 428 | 65312d76-18f4-4157-b7d9-2bf795fc298b 5 429 | eadc7bee-77e9-46cf-b46a-21606e58a6ed 5 430 | 5b999de3-7248-4395-aaca-d56f42cef495 5 431 | 4b3243ea-c27d-4d9e-9c6f-d219cd266d31 5 432 | 88aaf95c-9ee6-4c5f-acbc-cec905f851c0 1 433 | 8059a81f-a5d5-4d59-a8cb-f365ca7a2f00 5 434 | 7fa25888-6a85-4eda-8510-ca0b8094b9bf 5 435 | d1d4442b-8f6e-479d-a780-eb800a0a6fda 3 436 | 1fbfbbf9-bf62-4ee9-9083-c9808a85ac7d 5 437 | 700664e4-d8ef-4a07-98d4-cd8e75535d97 4 438 | 1f3f51e2-b970-4360-831b-64e052c1f03b 5 439 | 3b9136af-e6f4-4fe8-bee1-a3a64ca054e7 1 440 | d4778c2e-9cea-4443-b0fd-615f13e4a5ec 3 441 | f8ff7ad0-0eb8-4d69-9fdd-d87aecbb0d27 3 442 | 10327f41-cc0a-4c2e-ba62-203be668a4e3 5 443 | 4104232d-98fa-428c-b565-dd7f85792121 5 444 | d560b147-9920-4cd3-aa4d-299a3514431b 3 445 | c10ed89e-4a99-4fc7-b82d-b0b400d6014d 5 446 | bed0ab71-557d-40d5-b4a7-8a63cdd30661 2 447 | 8cf0b389-3033-49e3-88e8-79fc5c3aa13a 5 448 | f3f0bd20-5ed9-4220-b838-b28d7df0c7f1 5 449 | 9b8b611b-05c6-4012-9c37-7fd6bf02e8fb 3 450 | 569af6de-d51f-419b-be01-45c9bfc29294 4 451 | c70cf489-22bf-4912-9fd9-a387cf15e0f0 1 452 | 
c910b932-fbd4-4a5d-beaf-89054c3cc5df 5 453 | 067fda12-a898-4d12-846c-0a98d42734b6 5 454 | c2273fa4-8a03-4fa3-b97e-c1ef73e976c4 5 455 | 8cb52bf5-2c86-4e17-a063-6c85207ba922 4 456 | 69068ffd-f995-4f4b-baf9-1698fc4b9a12 5 457 | ae255001-97fb-4959-93f6-fc05d2d1b1e0 5 458 | 6b1c35c1-83f9-425f-8c11-3df32bd7de43 4 459 | 802f4ee7-b017-4c39-b9d5-7321496f07a6 5 460 | 90659100-f58e-415e-a389-9d8fa84204b5 5 461 | 3e1d6e5a-992d-4c0b-b361-ef2a9df6d688 5 462 | 02048d7b-5d70-437e-a23c-ca5cbcfbd357 5 463 | 883b7378-f733-4f73-9910-c6a82170d882 4 464 | ccfec946-c11e-43f4-ae0b-cb2fa616eaa2 5 465 | aa0101e5-7d3c-4560-82f5-12c047f4651b 5 466 | 5420f885-e52f-4cde-8b11-876110299ae5 5 467 | 7982ffce-b3c8-4dab-a1a2-e3cd429fea78 5 468 | 6ba86d0a-d5f4-4660-98c1-4d31f5dbe817 5 469 | 0a8e8485-dee6-4cbf-9803-7413ace20c97 5 470 | 3e78b304-b02f-426c-9e2a-287593aea1e8 5 471 | a95739cd-aa4f-4cbd-b752-b6c1f583f62b 5 472 | ab4d76ae-ed22-4630-9afc-d0db02949c03 5 473 | 73c6f830-7d7c-4ed5-9805-15baa3de97fe 5 474 | cbb67e27-5c16-43f1-a725-a94e52922264 1 475 | bbda2255-009f-4e54-ba4a-7f0df689a3a9 4 476 | 6a119ac0-8a06-4cd1-85bb-46dc64c388b7 5 477 | 7aedf762-f413-4b5c-820e-20965e99e9da 5 478 | 838a9902-f7af-4414-ac03-0ec2da33a174 5 479 | 1685a6a2-a482-4d5c-bbfa-beff6f31a36c 5 480 | 7cd5891e-b4a7-4765-8879-5bddfc1eae40 5 481 | 28cc0d9d-2ff0-45ee-ade0-c182b4ec5e1a 3 482 | af1cb938-5a85-44d6-929e-7d0683086f1a 4 483 | c78044fe-d5d7-4132-a1ed-af4359b2a6d1 5 484 | 2b8c9707-0b88-4ca5-b5c6-5c286e6d001f 3 485 | 661302d8-c7de-418b-9d03-3fc0217b46b2 4 486 | 4605d3bb-a3ef-4c14-a5d8-8ee49634cc0e 4 487 | bc7e479f-c4c0-40f6-973c-4eaa5ae680a9 1 488 | df4fb301-6c31-4c99-8977-0a8c7801bc19 5 489 | c7f3f86a-1de7-4ee4-8b75-96228b9d6bba 4 490 | 3a9eaa75-41fe-4140-b7bb-c8f70badfcf7 5 491 | 002c7a2c-d07b-40e4-83d2-140d18db5387 3 492 | fc151034-9330-40d6-8dce-8207ec78c3bf 2 493 | ca944bb8-fe2b-444f-92dc-4726e434c197 4 494 | 737082ee-9363-4c80-98ef-1f2cd25a27d1 2 495 | b7856c82-500f-4cbc-a790-4a931a20f8c5 5 496 | 
877bf80b-f1fc-42f9-9d8b-8b7f523d426d 5 497 | 212ef621-603f-4911-9c36-8d0f369250c7 5 498 | c0eddca4-8ad8-4ca5-9ab1-404050dde240 5 499 | d8969393-3567-4cc7-a979-2373c8725738 5 500 | -------------------------------------------------------------------------------- /assignment2/data/ratings-3.tsv: -------------------------------------------------------------------------------- 1 | 09d08220-7f1b-43f5-ab9c-9d70333bd831 5000 2 | b2802cae-3ff9-461b-9fe1-546b5287ae15 2 3 | 9455018c-a41c-4d57-8f52-6672e23fd879 5 4 | cfc66795-e9e3-4fa3-b59e-ad90bb113719 2 5 | d2d88575-d803-4df9-9c7a-f581c343bac8 5 6 | a2364dad-c4a9-4740-b033-575ac28ca327 1 7 | 4918d18b-2c32-45e6-947e-04328d1fc016 5 8 | ddd7c9cc-7099-407a-9de3-8123b5b38062 4 9 | 4c634bc8-98fb-4690-9220-c4896e5458f5 4 10 | ec6f0f08-abda-4974-95a7-9da0fdbd0e42 5 11 | b56b07dc-968c-41f1-bb5f-9d57a236f949 1 12 | de68a76f-a65d-4daf-a37c-ad5db06b2667 3000 13 | 4ed81c48-aec4-4ba8-8ddd-5e36e040e7e0 4 14 | 77633fc9-ca35-4113-b334-42930f3baa32 5 15 | b4be6022-72de-469a-9793-90100d06dbc0 5 16 | 44b4a847-a128-4e20-9d1e-c32a369276f4 5 17 | fb63019e-8753-40f5-890e-bab67949fd03 2 18 | c5263109-1501-4963-82e9-05bfdf0b1f80 1000 19 | 3066c74e-a9f1-4e3a-af08-d6a06b65fbef 5 20 | de9f055f-56ba-47c6-9632-6a0d2f41c3fe 5 21 | abb062e5-a118-40a1-af4b-70ac42525033 5 22 | 6d9f9d48-8984-4594-8357-f256903c1d24 1 23 | 30d0579d-7d1e-4420-b4af-7820f26d07e7 4 24 | 12687240-b315-42a5-8797-12ddd5800cd0 10000 25 | ae51fb44-156a-4ed2-8b4d-88da1acf1f2b 1 26 | 605c2638-f159-462b-a3fc-6ad1d93920cb 5 27 | 00e8b63e-a6f5-4fa5-a05b-b2e91e9c8ea5 5 28 | f7de7d41-ce58-45f6-b283-0a5047de7667 5 29 | 64600f8e-784b-4833-9b3d-df8ad9059b00 5 30 | 80044598-caf1-4e2d-bca6-7d5472ae03ca 5 31 | 324c3799-75ea-49a0-a0b5-61a1a67f5d1b 5 32 | d154587e-e185-4f22-b299-6c92dc3f29c2 5 33 | 1f7dcbd0-631f-49af-aae5-9d902119b652 5 34 | b0e2fa0c-e1a8-4f46-91a3-e178bc338ae9 5 35 | 903d0d47-9ee1-465e-a882-9896b58d4b53 5 36 | 5ad05cb0-4750-4a59-a743-9ba1e86deac6 5 37 | 
27e1d531-071f-4cdf-b09e-a1a7201f7c83 5 38 | e3b7788d-1ffe-4f3d-8261-61c1e634cd6e 1 39 | fba382f6-92e8-4d83-b192-d8bb1d43db97 5 40 | 863f5a5d-b8d4-412c-9f56-9354d3fc4ac3 4 41 | 23a30c1d-11cc-40f8-8b4a-d47ce0ae7e25 5000 42 | ebbb33e4-8c54-4081-ad53-ab2a3fd6ab55 5 43 | 2f4659d7-4b98-4cbb-995a-56a6e708a211 5 44 | d7e1e702-6bd4-490a-91ca-521dfb0396b3 5 45 | 53f6695b-061f-4cd9-9f97-8f46ffa2d14f 5 46 | ff108c8e-4b0c-406c-bc3c-28264913e4bf 1 47 | ad6a5541-71d4-4dc8-81e5-28b4c6d90a60 5 48 | 2c43e76d-db1c-4056-a311-1f70501dbce3 5 49 | 83b7ba0f-afa5-41a6-9096-357fc1f85821 5 50 | a7fad65e-bd4a-41dc-a726-1f550051b23a 2 51 | 34b00498-ff09-4175-a101-95dcd39ad8e7 4 52 | f5dc753f-5ff0-4ea6-8b5d-5a0ff7aea486 5 53 | 201be5e1-02de-495e-9c63-1af4ded62b08 50000 54 | ee9f9b4b-3a9d-4911-a076-e9a9f56a71c8 5 55 | 12b08cae-da43-4954-84df-9297fb304fdc 5 56 | 475ea613-4d38-4597-b6fe-a358c8f12786 5 57 | bab3be6e-f3ef-4a7d-846e-a6f59dc0e085 1 58 | 371a5173-683e-48a0-b71b-58c7847ea917 2 59 | 4da8b371-4135-49f2-a0f9-5f91dc900caa 4 60 | 9b9213fe-cebe-4eac-a3ac-bd79fcf4bb2e 2 61 | 8db565b9-b625-46ca-a04b-fd48f4c1c726 5 62 | f550bb80-a6af-4b36-b2ec-5940bf0055f7 3 63 | 3ab485f1-8c4c-4fe6-9022-50548eddc78e 1 64 | d5c17875-a2fe-476a-8b18-b481bb40b569 4 65 | 80262769-8bb3-41b5-806c-eda6d5cad6ae 4 66 | 4d3dfd9e-1ea7-4136-81c7-0e55f1f6f1fb 4 67 | 953dcbaa-2364-44ec-8542-b35d1f49ce4b 5 68 | 1de45dc9-2e85-4e7d-aefc-35cbd36c79e3 5 69 | 118c00df-8b08-4794-8921-15463f361e91 4 70 | 2d4ea92a-5a30-48ca-9c83-dad54bcca288 5 71 | bb9c7649-379d-4c84-aadc-475a5293cea6 5 72 | 812a49d1-1503-4292-84a7-bce09f387969 5 73 | 56629e68-c383-4991-ab5e-f50a02c54985 5000 74 | 737159fe-af6a-4b1e-bd14-7ffc9a23fe2f 5 75 | 06defb3f-eaea-43a2-b7c1-73d88497d394 3 76 | 8e9ae965-0378-454f-a80b-94a9c58434aa 1 77 | 5159ad9c-111d-4244-afe7-34f94235e75c 5 78 | 7dd93199-4b31-4b7c-aea5-4c8cc18b9a9c 5 79 | 6979a062-06f4-42ec-b2f1-eba7ca41eeb4 2 80 | cf9515cc-9c1c-4ac8-93b0-2a250bdd64ce 1 81 | 0eb7f81f-33fe-4c58-ba02-0641dc8a224e 5 82 | 
e8a71495-e12f-45b9-852f-6e5db9738b16 5 83 | 66f5a06c-dda5-48be-a885-1d36f9810a04 5 84 | 926ead60-c6ba-4a9d-9954-8f907973934c 5 85 | 19fcbf03-dd2f-4765-a9ae-69b8d5a26c18 5 86 | a36095fa-6c76-4805-9686-87fd24015cf9 5 87 | 4ffa4495-4537-4a24-89f5-119e8089e110 1 88 | c1e47ebd-68d9-455e-8399-14303ad9baa0 5 89 | 67b1abe4-94c1-4bd3-a54c-c2daa4fc69b8 5 90 | b78ba370-dbc4-4cf1-bb86-82a0565e890b 4 91 | 710af073-dda5-46d7-af7a-cf989916fb49 4 92 | 463c58fc-70da-469c-85c0-0e3ab7fe5c75 4 93 | 8d180f7a-6afb-4d29-8b30-e0db86fe9abb 5 94 | 613c00ca-6923-42b8-ad0e-d25d084e749f 5 95 | cc93aeef-5039-4e2f-ab6f-cbe40b9ee659 5 96 | bfc43db3-53be-42eb-82cd-182f84d41bf5 5 97 | 0d27de38-b45d-4348-b996-d0c61ed42381 5 98 | ecd51f89-4f99-424f-bebe-bec79ec28c73 5 99 | cc367014-35f1-4358-becb-e8ca7957cb23 4 100 | 09b8bcea-6d0c-4709-bc67-8fc78087d646 5 101 | 232ac5ca-5b6b-4bb3-b63e-854b9f44e4bd 5 102 | 06745f5c-d06f-41e6-bbd9-83caccc3be1a 1 103 | 874f17fa-eca0-4ab8-bbee-a49c70145646 5 104 | adc59d80-303c-46d7-aba6-b104f4a4f934 5 105 | 1ea479ee-ca57-4941-b9b7-f60fd0e6430f 5 106 | 6af71cf1-953a-4877-9cb2-b2719531c977 1 107 | f1d03854-3502-4c9f-ba4e-762a61f3b180 1 108 | e36a5f9e-2f9b-4564-9827-b3d9f55453ce 5 109 | 25b033ca-7f99-406a-9cfe-25746c56fc6f 5 110 | bbb39e5f-0f30-42be-b028-9c84733f546e 5 111 | c82e02dc-bd25-4059-8d9f-876c9724107e 5 112 | 1b656987-a7f8-4453-ba3b-f7caa90321ca 5 113 | be83b9bc-02f6-419d-9efd-47ce1671a32e 5 114 | a5d20722-a98a-4031-9612-01160649054d 4 115 | b4417be2-2e7d-4de6-9dcf-443d6fe34b27 5 116 | 027db866-fc96-4aed-bc33-1dcf88cf6b6f 5 117 | ee68ec35-412b-4722-bd16-74370a289985 5 118 | b193216a-7af2-4937-b796-8417b6182c52 5 119 | 6681efd3-8424-4b99-b1b6-3c5e9faf70c4 5 120 | 27d2d9c9-78aa-4f3d-bc4b-5dd363607f12 3 121 | 4ff8461a-83d2-416e-869e-834c8606f1bb 2 122 | 9bed45ef-f7b2-4b6a-b502-6fcd6b116a55 5 123 | 0b32a899-b61e-4b47-95da-315a156a0a3e 5 124 | 50a50ecf-8e50-44b9-baca-64287ae79277 5 125 | a77f272b-ae66-449d-a135-37ca56ee3e24 5 126 | dda7992b-5ad2-4238-bf52-d8b2fec36057 
5 127 | c07b056f-fab0-4ba7-a2aa-d05bf861a8ff 3 128 | caea3a80-8937-47a8-b16f-20aee31deeb5 3 129 | 7c0a527d-ece2-4681-9afe-140f1e9c0ed3 5 130 | d582c666-e9d6-41b0-93e9-dbaa2b816759 3 131 | 72132676-ad55-4901-b102-b30fa50b1592 4 132 | b922160c-7331-4433-9068-54e54f375821 4 133 | c0312efa-995d-4232-85ca-1f1b170272b3 4 134 | 34d6f900-6261-470e-9fcf-a353157c8dac 5 135 | 433a695c-838c-45bb-b398-82d0c3c866bc 5 136 | beea6ce0-fbcf-4d6a-8fd3-2a2d8988e75d 5 137 | f77c24b9-a880-45e4-be10-10c57bf6dacc 1 138 | 421e76ce-595a-4ba0-9c4d-d2b420fc4486 5 139 | ad6b33cc-d083-4a79-9a38-36747172bf8b 5 140 | f001fb8c-96a9-49f5-84a0-beb2682a18ba 1 141 | d4c8cd0a-9fe3-491e-a245-4af8aeeafbc8 5 142 | c9456ff9-61f5-4739-9b46-37d8a8ec6f49 2 143 | 35fcda98-9deb-47ae-bec7-9dbbc753d310 5 144 | 57982772-2029-48dc-9da2-cb595024b77e 4 145 | 63511115-001f-44c3-a3ee-1191ce59e20c 5 146 | f472e28b-0c17-42c6-aad1-8f7e70d7b073 5 147 | 95e9d4a9-e598-4d13-8819-a6cab4cf9772 1 148 | 19f78658-0540-4bab-89f9-a22dcb27c023 5 149 | b523452f-5974-44c0-ad1a-a052b011e34d 5 150 | a0814b13-654b-4735-805f-0db842c6414b 5 151 | dbf19161-12fb-4cfd-a3c6-3a999f23485f 5 152 | 414af967-9c18-4e1c-a6ca-f6404c6e1e75 5 153 | 75dd45f4-cd91-45f2-acf6-1c24764c4e6f 5 154 | 21f5ff9a-726c-4e74-b6c7-8b747bac575c 5 155 | ff4a01b6-242e-4775-b31a-3f4f5ca27898 5 156 | 12c03c0a-43b7-48d8-909a-467254e2e635 5 157 | a0af171e-5a6d-4e8a-a8ff-b50419c428bf 2 158 | 9026e8df-fc2d-437b-b1b1-225eeeaac5e4 5 159 | 7def10ce-101d-4117-bfd2-44aeb80bb73a 5 160 | adcc1f23-9130-4e28-97dd-95c06b03f138 2 161 | 42f1a61b-2703-498f-8afc-43aac659554c 2 162 | 7eb48d51-e780-4f7e-a429-ceb397672262 5 163 | 73399229-02b3-4017-a0bd-43577188acb0 5 164 | 13998d8e-7f81-4faa-b722-5f2560275d84 4 165 | c653e9e2-d746-40ab-add5-3842a68d18c7 4 166 | 6650b472-0f70-4049-b582-d4fd94f1e84c 5 167 | 43581635-b6d6-4bcf-a883-646db6a241ce 4 168 | 65fb3482-ba65-487c-9b0f-6cf41869257a 5 169 | 7997635d-7c2d-47f6-a8e1-477deaf34deb 5 170 | 26dd3f25-668d-4ccc-b2ad-2b63e8d4a902 5 171 | 
0a3b69ba-0041-4467-8eb7-4603e4e41c4c 1 172 | 6f2f2af0-0c96-44d6-9e04-c82436a14dcb 3 173 | f0204266-2b55-428b-80ca-2d01d6ad7f82 4 174 | f39c382f-e9c3-4ed1-822a-1c3b41d9a1c1 4 175 | fb716b0a-344b-48f5-90f4-92db51c00dde 5 176 | 21f8760b-a8dc-42c2-af8e-df110e0b53b4 5 177 | db849e5b-7d47-4e1d-999d-2b39a6c0a79e 5 178 | 7f1205bf-33a5-4f7d-b030-32c6b9333c20 5 179 | 02b2181a-5c46-4769-bb18-0062d25c1da3 5 180 | 300d14db-dbc1-42c5-83c8-26799e9ee9ca 3 181 | 41f50d5b-42b1-4985-927d-268ae7b63668 5 182 | 639cedc1-b14f-49e1-a39c-226cd7f1d394 4 183 | 93d2a244-a8d8-432f-b60b-0869ac4a7fe5 5 184 | 4305771d-22ce-44da-86d4-0859db6cf7b9 4 185 | b034d5d0-b10e-425f-93aa-45f7e353132c 4 186 | e524f22f-b873-439c-ac70-8de5ddf89d35 1 187 | 5e80cc7b-7875-4596-9b12-6055203b9a10 3 188 | 3fcfc5d0-8c10-43ae-9ef5-50427556c6ed 5 189 | d96a0442-a1a6-4ea8-974f-4b11e3a674b8 3 190 | a4542301-9253-4089-857f-8542dba492f1 5 191 | 1504de06-54de-48cb-a5bd-2e0f5ab66a9a 5 192 | 8cc29b24-9861-4857-8542-e8ba332d6db2 5 193 | b51aaadf-18ef-4e91-af08-69d6ed915a2c 5 194 | 11a515c5-2fb2-40d2-b780-a5efb9cfcef3 5 195 | 295fe272-0b87-43db-8f6f-c39db455185b 5 196 | c254b9e5-80a4-40f3-895c-618e51a2800b 5 197 | 357c8a7e-f6d4-4bf7-8051-1e7880337896 5 198 | e9ec6880-0443-4db8-bd33-d4629c821cdd 4 199 | d764e032-e8ae-4fa7-adcc-773415342a4b 5 200 | c9f54bc9-6103-48b3-9031-96543c412e47 3 201 | 02c0fbd0-795d-4da2-8f4f-58e0930df727 5 202 | d482d3ed-055a-40df-90b3-5f196c2891e0 4 203 | f414b8e5-6307-4d9f-82c8-b4770fc3c5f6 1 204 | 4944ccb5-d0cc-4a97-a2aa-092019e9d571 5 205 | 0decf9f2-5828-488f-bb3a-6d3ea4bf440f 5 206 | 58c13fe9-147b-4542-8678-3c295e5b3a22 4 207 | af824988-667c-4208-9810-407bbde6ad02 5 208 | 5b272de5-2034-4cd3-bef3-72a2f77d4495 5 209 | e5abb4a5-ff0a-4da9-b820-e81b1bd28d53 5 210 | 92fca88e-0f91-4170-8650-4780cbd4f080 5 211 | 5085f036-0861-4986-9c48-8e1f5518b7da 5 212 | 8c866c49-eb7e-419e-89b9-7fa922f18f27 4 213 | cd78c10e-7d26-478a-ae0b-bd999ccfcdde 5 214 | 797be056-fd8b-4979-8a5a-6a78eb486f40 3 215 | 
6807bda5-4df8-49e4-a99a-c13354739dec 5 216 | b8e52b05-5b4c-4bf2-bc86-f8981200442f 5 217 | 7a429f6f-0c61-47bc-bedc-a3a9fc9d7d91 5 218 | 59aabd81-771b-47c0-b117-ededfa318040 5 219 | 2741c230-14b8-49ce-8dd5-cde428a9fa10 5 220 | aa8f5dfa-e9f3-4b47-acae-7f238b2c3f8f 5 221 | 52e8b098-a074-4bcc-b73d-90c8c1ec1f63 5 222 | 768aa12e-d10e-4d7b-841a-c65fb24e52cf 5 223 | c6da3828-da52-4111-af4d-7670b40d1ba8 3 224 | 7a6faaee-a05d-4671-a773-9c2cac07aafd 5 225 | ee216f6e-3a00-4ee8-9ddf-3247ddc819cc 4 226 | 7791834b-64c0-4bc0-97db-06b62eb16599 5 227 | f83d88d0-3e97-417b-9b74-55d6262e58b2 5 228 | 79f06680-3022-4e50-be92-9410de37ba54 5 229 | 877fabff-dcbe-440a-9934-f295dd05b155 5 230 | 404c178b-9ed5-4271-aadf-d0826c492b5a 5 231 | 75e59a5f-4df6-455e-b92b-182ee52b6a65 5 232 | e58ff289-8aef-457d-b2f9-bf7be836eb05 1 233 | 030b59b8-85a1-4b2f-ad81-a977bb111222 5 234 | bf731dbb-7087-499a-84f0-e8a6aafcc03e 5 235 | 765dbf23-dd38-4287-878b-2f9ed5cd6c05 5 236 | b6a65567-3b5d-4042-a059-a807461fdcfb 5 237 | 835f64b8-38ae-4a35-9ae3-bd80e8a52eb2 3 238 | 7eb23580-1029-431a-ad06-f9a64b17be73 4 239 | 2de6eda1-0ee6-4c29-8554-752f89295874 2 240 | 15d2620b-4cd6-48a9-9b20-bda00424f136 5 241 | ec86db96-3e5b-4de8-88cc-060911299d5b 3 242 | e57a07d9-c419-4f17-bc74-97d51f6e7389 5 243 | 3e37dff6-711e-4c72-8fc7-8131df880454 4 244 | eef18854-1a1d-48e8-8780-76ebb1c268a5 5 245 | c22abb05-2b91-4941-8b6b-ec72801f99fb 5 246 | b927dd4c-5196-41d4-8bd6-b54aafe7faa1 5 247 | 02a7d169-8925-4d3a-9dd8-253c1ef37e5b 4 248 | def6de43-649a-47c8-9b79-2dd94f21689d 5 249 | a2b774d3-5a75-47bb-91d8-59c8e3769f51 2 250 | 0e6c70dd-44d6-40e5-b635-360302acc275 5 251 | 1df86592-7c6b-4194-ae09-7a2ea30093a0 5 252 | d0876e7b-0267-4ac6-ba5d-e955bd2ffda1 5 253 | -------------------------------------------------------------------------------- /assignment2/regression.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment2/regression.png -------------------------------------------------------------------------------- /assignment2/requirements-catalina.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.14 5 | tensorflow==1.15.2 6 | tensorflow-data-validation==0.14.1 7 | -------------------------------------------------------------------------------- /assignment2/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | jupyter==1.0.0 4 | apache-beam==2.15 5 | tensorflow==1.15.2 6 | tensorflow-data-validation==0.13.0 7 | 8 | -------------------------------------------------------------------------------- /assignment2/task1.py: -------------------------------------------------------------------------------- 1 | from components.schema_validation import infer_schema_from_csv, has_anomalies, adjust_product_schema, adjust_rating_schema 2 | 3 | # We infer the schema from the first data file 4 | product_schema = infer_schema_from_csv('data/products-data-0.tsv', column_names=['id', 'category', 'description']) 5 | 6 | # We adjust the schema with some constraint that the automatic inference might not have captured 7 | adjust_product_schema(product_schema) 8 | 9 | # We use the schema to check for anomalies in subsequent data files 10 | assert not has_anomalies('data/products-data-0.tsv', product_schema) 11 | assert not has_anomalies('data/products-data-1.tsv', product_schema) 12 | assert not has_anomalies('data/products-data-2.tsv', product_schema) 13 | assert has_anomalies('data/products-data-3.tsv', product_schema) 14 | 15 | # We infer the schema from the first data file 16 | rating_schema = infer_schema_from_csv('data/ratings-0.tsv', column_names=['id', 'rating']) 17 | 18 | # We 
adjust the schema with some constraint that the automatic inference might not have captured 19 | adjust_rating_schema(rating_schema) 20 | 21 | # We use the schema to check for anomalies in subsequent data files 22 | assert not has_anomalies('data/ratings-0.tsv', rating_schema) 23 | assert not has_anomalies('data/ratings-1.tsv', rating_schema) 24 | assert has_anomalies('data/ratings-2.tsv', rating_schema) 25 | assert has_anomalies('data/ratings-3.tsv', rating_schema) -------------------------------------------------------------------------------- /assignment2/task2.py: -------------------------------------------------------------------------------- 1 | from components.beam_job import create_and_run_beam_job 2 | from os import path 3 | 4 | # Invoke a beam job to join products and ratings, filter them according to the instructions, 5 | # and count the number of entries per category 6 | create_and_run_beam_job(path_to_products_file='data/products-data-0.tsv', path_to_ratings_file='data/ratings-0.tsv') 7 | 8 | 9 | # Validate the outputs 10 | assert path.exists('category_counts.tsv-00000-of-00001') 11 | 12 | counts = {} 13 | with open('category_counts.tsv-00000-of-00001') as results_file: 14 | for line in results_file: 15 | category, count = line.strip().split('\t') 16 | counts[category] = int(count) 17 | 18 | assert 'Kitchen' in counts 19 | assert counts['Kitchen'] == 217 20 | 21 | assert 'Jewelry' in counts 22 | assert counts['Jewelry'] == 148 -------------------------------------------------------------------------------- /assignment2/task2_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "task2-todo.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | } 13 | }, 14 | "cells": [ 15 | { 16 | "cell_type": "code", 17 | "metadata": { 18 | "id": "ejZtKRkQWHEh", 19 | 
"colab_type": "code", 20 | "colab": {} 21 | }, 22 | "source": [ 23 | "import argparse\n", 24 | "import os\n", 25 | "import pprint\n", 26 | "import tempfile\n", 27 | "import urllib.request\n", 28 | "import zipfile\n", 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "\n", 31 | "temp = tempfile.gettempdir()\n", 32 | "zip, headers = urllib.request.urlretrieve('https://raw.githubusercontent.com/schelterlabs/deml-lab/master/assignment2/data.zip')\n", 33 | "zipfile.ZipFile(zip).extractall(temp)\n", 34 | "zipfile.ZipFile(zip).close()\n", 35 | "urllib.request.urlcleanup()" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "Sx_D-0a2axzu", 44 | "colab_type": "code", 45 | "colab": { 46 | "base_uri": "https://localhost:8080/", 47 | "height": 1000 48 | }, 49 | "outputId": "10d1956d-da67-4aa6-80e1-f05e12e7099a" 50 | }, 51 | "source": [ 52 | "!pip install apache-beam" 53 | ], 54 | "execution_count": 3, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "text": [ 59 | "Collecting apache-beam\n", 60 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/82/b3/b6dcbd94bf8a5ae6a0be5fc988bdfb0b0dfb87ea37e788dc4dcc039a3aee/apache_beam-2.16.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", 61 | "\u001b[K |████████████████████████████████| 3.0MB 5.1MB/s \n", 62 | "\u001b[?25hCollecting mock<3.0.0,>=1.0.1 (from apache-beam)\n", 63 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl (56kB)\n", 64 | "\u001b[K |████████████████████████████████| 61kB 28.4MB/s \n", 65 | "\u001b[?25hRequirement already satisfied: protobuf<4,>=3.5.0.post1 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.7.1)\n", 66 | "Requirement already satisfied: pytz>=2018.3 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (2018.9)\n", 67 | "Requirement already satisfied: 
pyarrow<0.15.0,>=0.11.1; python_version >= \"3.0\" or platform_system != \"Windows\" in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.14.1)\n", 68 | "Collecting dill<0.3.1,>=0.3.0 (from apache-beam)\n", 69 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/39/7a/70803635c850e351257029089d38748516a280864c97cbc73087afef6d51/dill-0.3.0.tar.gz (151kB)\n", 70 | "\u001b[K |████████████████████████████████| 153kB 49.2MB/s \n", 71 | "\u001b[?25hCollecting avro-python3<2.0.0,>=1.8.1; python_version >= \"3.0\" (from apache-beam)\n", 72 | " Downloading https://files.pythonhosted.org/packages/76/b2/98a736a31213d3e281a62bcae5572cf297d2546bc429accf36f9ee1604bf/avro-python3-1.9.1.tar.gz\n", 73 | "Requirement already satisfied: httplib2<=0.12.0,>=0.8 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.11.3)\n", 74 | "Collecting hdfs<3.0.0,>=2.1.0 (from apache-beam)\n", 75 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/82/39/2c0879b1bcfd1f6ad078eb210d09dbce21072386a3997074ee91e60ddc5a/hdfs-2.5.8.tar.gz (41kB)\n", 76 | "\u001b[K |████████████████████████████████| 51kB 25.2MB/s \n", 77 | "\u001b[?25hRequirement already satisfied: grpcio<2,>=1.12.1 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.15.0)\n", 78 | "Collecting oauth2client<4,>=2.0.1 (from apache-beam)\n", 79 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c0/7b/bc893e35d6ca46a72faa4b9eaac25c687ce60e1fbe978993fe2de1b0ff0d/oauth2client-3.0.0.tar.gz (77kB)\n", 80 | "\u001b[K |████████████████████████████████| 81kB 30.4MB/s \n", 81 | "\u001b[?25hRequirement already satisfied: pydot<2,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.3.0)\n", 82 | "Requirement already satisfied: crcmod<2.0,>=1.7 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (1.7)\n", 83 | "Requirement already satisfied: pymongo<4.0.0,>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.9.0)\n", 84 | 
"Collecting fastavro<0.22,>=0.21.4 (from apache-beam)\n", 85 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e1/28/0206330c0002b1e28e21473117d0dc813defbd5891562d27af5c68c93899/fastavro-0.21.24-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)\n", 86 | "\u001b[K |████████████████████████████████| 1.2MB 30.7MB/s \n", 87 | "\u001b[?25hCollecting python-dateutil<3,>=2.8.0 (from apache-beam)\n", 88 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl (226kB)\n", 89 | "\u001b[K |████████████████████████████████| 235kB 55.1MB/s \n", 90 | "\u001b[?25hRequirement already satisfied: pyyaml<4.0.0,>=3.12 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (3.13)\n", 91 | "Requirement already satisfied: future<1.0.0,>=0.16.0 in /usr/local/lib/python3.6/dist-packages (from apache-beam) (0.16.0)\n", 92 | "Collecting pbr>=0.11 (from mock<3.0.0,>=1.0.1->apache-beam)\n", 93 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/46/a4/d5c83831a3452713e4b4f126149bc4fbda170f7cb16a86a00ce57ce0e9ad/pbr-5.4.3-py2.py3-none-any.whl (110kB)\n", 94 | "\u001b[K |████████████████████████████████| 112kB 51.3MB/s \n", 95 | "\u001b[?25hRequirement already satisfied: six>=1.9 in /usr/local/lib/python3.6/dist-packages (from mock<3.0.0,>=1.0.1->apache-beam) (1.12.0)\n", 96 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf<4,>=3.5.0.post1->apache-beam) (41.2.0)\n", 97 | "Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.6/dist-packages (from pyarrow<0.15.0,>=0.11.1; python_version >= \"3.0\" or platform_system != \"Windows\"->apache-beam) (1.16.5)\n", 98 | "Requirement already satisfied: docopt in /usr/local/lib/python3.6/dist-packages (from hdfs<3.0.0,>=2.1.0->apache-beam) (0.6.2)\n", 99 | "Requirement already satisfied: requests>=2.7.0 in /usr/local/lib/python3.6/dist-packages 
(from hdfs<3.0.0,>=2.1.0->apache-beam) (2.21.0)\n", 100 | "Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (0.4.7)\n", 101 | "Requirement already satisfied: pyasn1-modules>=0.0.5 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (0.2.6)\n", 102 | "Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from oauth2client<4,>=2.0.1->apache-beam) (4.0)\n", 103 | "Requirement already satisfied: pyparsing>=2.1.4 in /usr/local/lib/python3.6/dist-packages (from pydot<2,>=1.2.0->apache-beam) (2.4.2)\n", 104 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (3.0.4)\n", 105 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (2.8)\n", 106 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (2019.9.11)\n", 107 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam) (1.24.3)\n", 108 | "Building wheels for collected packages: dill, avro-python3, hdfs, oauth2client\n", 109 | " Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 110 | " Created wheel for dill: filename=dill-0.3.0-cp36-none-any.whl size=77513 sha256=e6dfbeb0c7e7fbd0bd6d8d837ccca6a2b6e855d83616aa6909015337e47dcd62\n", 111 | " Stored in directory: /root/.cache/pip/wheels/c9/de/a4/a91eec4eea652104d8c81b633f32ead5eb57d1b294eab24167\n", 112 | " Building wheel for avro-python3 (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 113 | " Created wheel for avro-python3: filename=avro_python3-1.9.1-cp36-none-any.whl size=43199 sha256=232b7eb9d62dfdd8f81172a1f1897fc423b39aa4aa1dbf9be51871ef084bf44d\n", 114 | " Stored in directory: /root/.cache/pip/wheels/94/54/6f/a5df680fd3224aa45145686f3b1b02a878a90ea769fcf9daaf\n", 115 | " Building wheel for hdfs (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 116 | " Created wheel for hdfs: filename=hdfs-2.5.8-cp36-none-any.whl size=33214 sha256=ae5926ae12eee4e2f531509699a68794abfb0ad0c69e78f607c10b1e9cd72df8\n", 117 | " Stored in directory: /root/.cache/pip/wheels/fe/a7/05/23e3699975fc20f8a30e00ac1e515ab8c61168e982abe4ce70\n", 118 | " Building wheel for oauth2client (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 119 | " Created wheel for oauth2client: filename=oauth2client-3.0.0-cp36-none-any.whl size=106382 sha256=928061fecdba926e8924bdd25cfd436fc965e9752fd28fb37d948b6d73783338\n", 120 | " Stored in directory: /root/.cache/pip/wheels/48/f7/87/b932f09c6335dbcf45d916937105a372ab14f353a9ca431d7d\n", 121 | "Successfully built dill avro-python3 hdfs oauth2client\n", 122 | "\u001b[31mERROR: pydrive 1.3.1 has requirement oauth2client>=4.0.0, but you'll have oauth2client 3.0.0 which is incompatible.\u001b[0m\n", 123 | "\u001b[31mERROR: multiprocess 0.70.9 has requirement dill>=0.3.1, but you'll have dill 0.3.0 which is incompatible.\u001b[0m\n", 124 | "\u001b[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.\u001b[0m\n", 125 | "Installing collected packages: pbr, mock, dill, avro-python3, hdfs, oauth2client, fastavro, python-dateutil, apache-beam\n", 126 | " Found existing installation: dill 0.3.1.1\n", 127 | " Uninstalling dill-0.3.1.1:\n", 128 | " Successfully uninstalled dill-0.3.1.1\n", 129 | " Found existing installation: oauth2client 4.1.3\n", 130 | " Uninstalling oauth2client-4.1.3:\n", 131 | " Successfully uninstalled oauth2client-4.1.3\n", 132 | " Found 
existing installation: python-dateutil 2.5.3\n", 133 | " Uninstalling python-dateutil-2.5.3:\n", 134 | " Successfully uninstalled python-dateutil-2.5.3\n", 135 | "Successfully installed apache-beam-2.16.0 avro-python3-1.9.1 dill-0.3.0 fastavro-0.21.24 hdfs-2.5.8 mock-2.0.0 oauth2client-3.0.0 pbr-5.4.3 python-dateutil-2.8.0\n" 136 | ], 137 | "name": "stdout" 138 | }, 139 | { 140 | "output_type": "display_data", 141 | "data": { 142 | "application/vnd.colab-display-data+json": { 143 | "pip_warning": { 144 | "packages": [ 145 | "dateutil" 146 | ] 147 | } 148 | } 149 | }, 150 | "metadata": { 151 | "tags": [] 152 | } 153 | } 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "6u3tv7XLarX8", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "from __future__ import absolute_import\n", 165 | "\n", 166 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 167 | "import apache_beam as beam\n", 168 | "\n", 169 | "\n", 170 | "def create_and_run_beam_job(path_to_products_file, path_to_ratings_file):\n", 171 | " pass" 172 | ], 173 | "execution_count": 0, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "metadata": { 179 | "id": "6cL4oHrObJP6", 180 | "colab_type": "code", 181 | "colab": {} 182 | }, 183 | "source": [ 184 | "def path_to_file(file):\n", 185 | " return os.path.join(temp, file)" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "oIVWS2rpa_ef", 194 | "colab_type": "code", 195 | "colab": { 196 | "base_uri": "https://localhost:8080/", 197 | "height": 89 198 | }, 199 | "outputId": "62c7fa90-35ed-4d56-dacf-4ac9285292e2" 200 | }, 201 | "source": [ 202 | "from os import path\n", 203 | "\n", 204 | "create_and_run_beam_job(\n", 205 | " path_to_products_file=path_to_file('data/products-data-0.tsv'), \n", 206 | " path_to_ratings_file=path_to_file('data/ratings-0.tsv'))\n", 207 | "\n", 208 | "\n", 
def tokenize_document(_key, document):
    """Map function: split *document* on single spaces and emit one
    (lowercased token, 1) pair per token. The record key is ignored."""
    pairs = []
    for token in document.split(" "):
        pairs.append((token.lower(), 1))
    return pairs

def count_per_word(word, counts):
    """Reduce function: collapse the per-occurrence counts of *word*
    into a single (word, total) pair."""
    total = sum(counts)
    return word, total
def to_partitions(X, y, num_partitions, num_records_per_partition):
    """Split (X, y) row-wise into `num_partitions` partitions of
    `num_records_per_partition` records each.

    Each record is a keyed tuple ``(row_index, (X[row_index, :], y[row_index]))``,
    so downstream map/reduce code can recover both the features and the
    target of every row. Rows are assigned to partitions in order.
    """
    def record_at(row):
        # Pair the global row index with its feature vector and target.
        return (row, (X[row, :], y[row]))

    return [
        [record_at(p * num_records_per_partition + r)
         for r in range(num_records_per_partition)]
        for p in range(num_partitions)
    ]
of this [task](task1.py) is to complete the implementation of a neural network for classifying points from a synthetically generated dataset: 4 | 5 | ![](moon.png) 6 | 7 | You only have to implement the forward pass; the backward pass and weight updates are already given. The network is defined as follows: 8 | 9 | ![](network.png) 10 | 11 | Implement the forward pass in the [first fully connected layer, which applies a `tanh` non-linearity](components/neuralnetwork.py#L65), in the [second fully connected layer](components/neuralnetwork.py#L42) and in the [softmax output](components/neuralnetwork.py#L87). Finally, invoke your implemented methods to conduct the [full forward pass through the network](components/neuralnetwork.py#L9) and return the computed probabilities. You can execute this task via `python task1.py` 12 | 13 | ## Task 2, 3 & 4: Translating Scikit-learn Pipelines to Dataflow Graphs 14 | 15 | In the remaining three tasks, you have to implement [a method to convert scikit-learn pipelines into a dataflow representation](components/graph.py#L48). Given a pipeline, your code has to inspect it and generate a list of connected [DataflowVertex](components/graph.py#L1) objects, which represent the operations and dataflow in the pipeline. 16 | 17 | A vertex in this graph is defined as follows, where the `name` refers to the step name in the pipeline, the `operation` is the name of the transformer class which executes the step and the `parent_vertices` are operations that the current vertex depends on. 18 | 19 | ```python 20 | class DataFlowVertex: 21 | def __init__(self, parent_vertices, name, operation): 22 | self.parent_vertices = parent_vertices 23 | self.name = name 24 | self.operation = operation 25 | ``` 26 | Your method has to handle four different pipelines with growing complexity. 
The resulting graphs should look as follows: 27 | 28 | ![](graphs.png) 29 | -------------------------------------------------------------------------------- /assignment3/adult-sample.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year 2 | 28,Private,273269,Some-college,10,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,40,United-States,<=50K 3 | 58,State-gov,123329,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,16,United-States,<=50K 4 | 34,Private,79637,Bachelors,13,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,0,0,40,United-States,<=50K 5 | 71,Private,97870,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K 6 | 20,State-gov,41103,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 7 | 46,Private,125492,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Black,Female,0,0,40,United-States,<=50K 8 | 31,Private,467579,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,1887,40,United-States,>50K 9 | 24,Private,376393,Assoc-voc,11,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 10 | 21,Private,56582,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,50,United-States,<=50K 11 | 38,Private,76317,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 12 | 43,Federal-gov,144778,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,>50K 13 | 47,Private,454989,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K 14 | 23,Private,278254,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,45,United-States,<=50K 15 | 
38,Private,111499,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,50,United-States,>50K 16 | 31,Private,168521,Bachelors,13,Never-married,Exec-managerial,Unmarried,White,Female,0,0,50,United-States,<=50K 17 | 36,Private,749636,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 18 | 27,Private,167405,HS-grad,9,Married-spouse-absent,Farming-fishing,Own-child,White,Female,0,0,40,Mexico,<=50K 19 | 32,Private,317378,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,10520,0,40,United-States,>50K 20 | 55,State-gov,71630,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,1617,40,United-States,<=50K 21 | 33,Private,182401,10th,6,Never-married,Adm-clerical,Not-in-family,Black,Male,0,0,40,United-States,<=50K 22 | 21,Private,33616,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,25,United-States,<=50K 23 | 25,Private,362912,Some-college,10,Never-married,Craft-repair,Own-child,White,Female,0,0,50,United-States,<=50K 24 | 28,Private,34335,HS-grad,9,Divorced,Sales,Not-in-family,Amer-Indian-Eskimo,Male,14084,0,40,United-States,>50K 25 | 51,Private,305147,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 26 | 26,Private,50103,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 27 | 62,State-gov,221558,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,24,?,<=50K 28 | 37,Private,138940,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 29 | 55,Self-emp-not-inc,52888,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,10,United-States,<=50K 30 | 46,Local-gov,125457,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,38,United-States,>50K 31 | 47,Private,102771,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Portugal,<=50K 32 | 
60,?,41517,11th,7,Married-spouse-absent,?,Unmarried,Black,Female,0,0,20,United-States,<=50K 33 | 34,Private,153614,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K 34 | 32,Local-gov,157887,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 35 | 35,Private,308691,Masters,14,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States,<=50K 36 | 48,Self-emp-inc,238966,Some-college,10,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K 37 | 67,Private,123393,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 38 | 48,Private,25468,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,50,United-States,>50K 39 | 30,Private,117393,HS-grad,9,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 40 | 40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 41 | 58,Private,259014,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,20,United-States,<=50K 42 | 19,?,134974,Some-college,10,Never-married,?,Own-child,White,Female,0,0,20,United-States,<=50K 43 | 25,Private,49092,Bachelors,13,Never-married,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K 44 | 33,Local-gov,224185,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 45 | 43,Private,136721,12th,8,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 46 | 37,Private,314963,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 47 | 21,State-gov,337766,Some-college,10,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States,<=50K 48 | 51,Self-emp-not-inc,111939,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,35,United-States,>50K 49 | 
43,Private,151089,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,<=50K 50 | 49,Private,120629,Bachelors,13,Divorced,Exec-managerial,Not-in-family,Black,Female,27828,0,60,United-States,>50K 51 | 38,Local-gov,201410,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K 52 | 49,Private,61307,7th-8th,4,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,38,United-States,<=50K 53 | 36,Private,135289,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States,>50K 54 | 36,Self-emp-not-inc,89622,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,80,United-States,>50K 55 | 21,Private,216070,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Amer-Indian-Eskimo,Female,0,0,46,United-States,>50K 56 | 42,Private,138662,Some-college,10,Separated,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K 57 | 35,Private,385847,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 58 | 20,Private,189148,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,48,United-States,<=50K 59 | 22,Private,252355,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,27,United-States,<=50K 60 | 46,Private,243743,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K 61 | 33,Private,290763,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 62 | 23,?,99399,Some-college,10,Never-married,?,Unmarried,Amer-Indian-Eskimo,Female,0,0,25,United-States,<=50K 63 | 44,Private,160829,Bachelors,13,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,20,United-States,>50K 64 | 46,Local-gov,329752,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,30,United-States,<=50K 65 | 52,Private,117674,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 66 | 
20,?,150084,Some-college,10,Never-married,?,Own-child,White,Male,0,0,25,United-States,<=50K 67 | 49,State-gov,203039,11th,7,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 68 | 38,Private,210438,7th-8th,4,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K 69 | 29,Private,163265,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 70 | 18,Private,43272,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K 71 | 54,Self-emp-not-inc,103179,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K 72 | 48,Private,449354,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,4386,0,45,United-States,>50K 73 | 29,Private,297544,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 74 | 30,Private,161690,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K 75 | 31,Local-gov,219883,HS-grad,9,Never-married,Protective-serv,Not-in-family,Black,Male,0,0,40,United-States,<=50K 76 | 40,Federal-gov,121012,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298,0,48,United-States,>50K 77 | 32,Private,207172,Some-college,10,Never-married,Sales,Other-relative,White,Female,0,0,40,United-States,<=50K 78 | 47,Private,148995,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,2415,60,United-States,>50K 79 | 19,Private,292590,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K 80 | 45,Private,274657,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,?,<=50K 81 | 49,Private,189498,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 82 | 18,Private,25837,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,25,United-States,<=50K 83 | 
33,State-gov,306309,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,<=50K 84 | 48,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 85 | 30,Local-gov,289442,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K 86 | 55,Private,89690,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,>50K 87 | 47,Self-emp-not-inc,237731,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,2829,0,65,United-States,<=50K 88 | 72,?,402306,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,32,Canada,<=50K 89 | 27,Private,119793,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 90 | 44,Private,116358,Bachelors,13,Married-civ-spouse,Sales,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K 91 | 23,Private,55215,Bachelors,13,Never-married,Sales,Own-child,White,Male,0,0,55,United-States,<=50K 92 | 33,Private,184784,10th,6,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K 93 | 43,Private,269015,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0,0,40,Germany,>50K 94 | 46,Private,146919,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K 95 | 90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K 96 | 34,Private,19847,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 97 | 32,Private,108116,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,60,United-States,>50K 98 | 42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K 99 | 18,Private,333611,5th-6th,3,Never-married,Other-service,Other-relative,White,Male,0,0,54,Mexico,<=50K 100 | 25,Private,50053,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,Japan,<=50K 101 | 
def pipeline_to_dataflow_graph(pipeline, name_prefix='', parent_vertices=None):
    """Translate a fitted scikit-learn pipeline into a dataflow graph.

    Parameters
    ----------
    pipeline : fitted sklearn Pipeline (or nested transformer) to inspect.
    name_prefix : prefix prepended to generated vertex names; used when
        recursing into nested pipelines / column transformers.
    parent_vertices : list of DataFlowVertex that the first step of this
        (sub-)pipeline depends on; defaults to no parents.

    Returns
    -------
    list of DataFlowVertex describing the pipeline's operations and the
    dataflow between them.
    """
    # Bug fix: the parameter previously defaulted to a mutable list ([]).
    # Default arguments are evaluated once, so a single shared list would
    # persist across calls and any implementation that appends to it would
    # leak state between translations. Use None and build a fresh list.
    if parent_vertices is None:
        parent_vertices = []

    graph = []

    # TODO Implement translation of the pipeline into a list of DataFlowVertex objects

    return graph
# Helper function to evaluate the total loss on the dataset
def calculate_loss(network, X, y):
    """Average cross-entropy loss of *network* on the labeled data (X, y).

    Runs a forward pass to obtain per-class probabilities, picks the
    probability assigned to each example's true class via fancy indexing,
    and returns the mean negative log-likelihood.
    """
    n = len(X)
    probs = network.forward(X)
    # Probability the network assigned to the correct class of each example.
    true_class_probs = probs[range(n), y]
    return np.sum(-np.log(true_class_probs)) / n
/assignment3/moon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment3/moon.png -------------------------------------------------------------------------------- /assignment3/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/assignment3/network.png -------------------------------------------------------------------------------- /assignment3/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.3 2 | pandas==0.25.1 3 | numpy==1.17 4 | jupyter==1.0.0 5 | 6 | -------------------------------------------------------------------------------- /assignment3/task1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets 3 | 4 | from components.neuralnetwork import * 5 | 6 | np.random.seed(0) 7 | 8 | X, y = datasets.make_moons(200, noise=0.20) 9 | 10 | input_dimensions = 2 11 | num_classes = 2 12 | num_training_examples = len(X) 13 | 14 | size_of_hidden_layer = 3 15 | epsilon = 0.01 # learning rate for gradient descent 16 | reg_lambda = 0.01 # regularization strength 17 | 18 | network = NeuralNetwork([ 19 | FullyConnectedLayerWithActivation(input_dimensions, size_of_hidden_layer), 20 | FullyConnectedLayer(size_of_hidden_layer, num_classes), 21 | SoftMax(num_training_examples) 22 | ]) 23 | 24 | # Train the network with batch gradient descent 25 | for iteration in range(0, 20000): 26 | 27 | # Forward pass 28 | network.forward(X) 29 | # Backward pass 30 | network.backward(y) 31 | # Parameter updates 32 | network.update_weights(X, reg_lambda, epsilon) 33 | 34 | if iteration % 1000 == 0: 35 | print("Loss after iteration %i: %f" % (iteration, 
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

from components.graph import pipeline_to_dataflow_graph

# EXAMPLE 1: a one-step pipeline must translate into a single
# parentless vertex named after its pipeline step.
data = [[13.0, 0.0, 1.0],
        [27.0, 1.0, 0.0]]

binarizer_pipeline = Pipeline([
    ('binarization', Binarizer(threshold=5.0))
])

binarizer_model = binarizer_pipeline.fit(data)
binarizer_graph = pipeline_to_dataflow_graph(binarizer_model)

assert len(binarizer_graph) == 1

sole_vertex = binarizer_graph[0]

assert not sole_vertex.parent_vertices
assert sole_vertex.name == 'binarization'
assert sole_vertex.operation == 'Binarizer'


# EXAMPLE 2: a two-step pipeline must become a chain of two vertices,
# where the classifier depends on the scaler.
iris_dataset = load_iris()

iris_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())])

iris_model = iris_pipeline.fit(iris_dataset.data, iris_dataset.target)
iris_graph = pipeline_to_dataflow_graph(iris_model, parent_vertices=[])

assert len(iris_graph) == 2

for node in iris_graph:
    if node.name == 'scaler':
        assert not node.parent_vertices
    if node.name == 'classifier':
        assert len(node.parent_vertices) == 1
        assert node.parent_vertices[0].operation == 'StandardScaler'
from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize 5 | from sklearn.compose import ColumnTransformer 6 | 7 | from components.graph import pipeline_to_dataflow_graph 8 | 9 | raw_data = pd.read_csv('adult-sample.csv', na_values='?') 10 | data = raw_data.dropna() 11 | 12 | labels = label_binarize(data['income-per-year'], ['>50K', '<=50K']) 13 | 14 | feature_transformation = ColumnTransformer(transformers=[ 15 | ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']), 16 | ('numeric', StandardScaler(), ['age', 'hours-per-week']) 17 | ]) 18 | 19 | income_pipeline = Pipeline([ 20 | ('features', feature_transformation), 21 | ('classifier', DecisionTreeClassifier())]) 22 | 23 | income_model = income_pipeline.fit(data, labels) 24 | 25 | income_graph = pipeline_to_dataflow_graph(income_model) 26 | 27 | 28 | assert len(income_graph) == 5 29 | 30 | steps_without_parent = set(['features__categorical__education', 31 | 'features__categorical__workclass', 32 | 'features__numeric__age', 33 | 'features__numeric__hours-per-week']) 34 | 35 | for vertex in income_graph: 36 | if vertex.name in steps_without_parent: 37 | assert len(vertex.parent_vertices) == 0 38 | if 'categorical' in vertex.name: 39 | assert vertex.operation == 'OneHotEncoder' 40 | else: 41 | assert vertex.operation == 'StandardScaler' 42 | else: 43 | assert len(vertex.parent_vertices) == 4 44 | assert vertex.name == 'classifier' 45 | -------------------------------------------------------------------------------- /assignment3/task4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize 6 | from sklearn.compose import ColumnTransformer 7 | from sklearn.impute 
import SimpleImputer 8 | 9 | from components.graph import pipeline_to_dataflow_graph 10 | 11 | raw_data = pd.read_csv('adult-sample.csv', na_values='?') 12 | data = raw_data.dropna() 13 | 14 | labels = label_binarize(data['income-per-year'], ['>50K', '<=50K']) 15 | 16 | nested_categorical_feature_transformation = Pipeline(steps=[ 17 | ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 18 | ('encode', OneHotEncoder(handle_unknown='ignore')) 19 | ]) 20 | 21 | nested_feature_transformation = ColumnTransformer(transformers=[ 22 | ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']), 23 | ('numeric', StandardScaler(), ['age', 'hours-per-week']) 24 | ]) 25 | 26 | nested_pipeline = Pipeline([ 27 | ('features', nested_feature_transformation), 28 | ('classifier', DecisionTreeClassifier())]) 29 | 30 | nested_model = nested_pipeline.fit(data, labels) 31 | 32 | nested_graph = pipeline_to_dataflow_graph(nested_model) 33 | 34 | assert len(nested_graph) == 7 35 | 36 | vertices_by_name = {vertex.name:vertex for vertex in nested_graph} 37 | 38 | assert 'features__numeric__age' in vertices_by_name.keys() 39 | assert vertices_by_name['features__numeric__age'].parent_vertices == [] 40 | 41 | assert 'classifier' in vertices_by_name.keys() 42 | assert len(vertices_by_name['classifier'].parent_vertices) == 4 43 | 44 | assert 'features__categorical__education__encode' in vertices_by_name.keys() 45 | 46 | vertex_to_inspect = vertices_by_name['features__categorical__education__encode'] 47 | 48 | assert len(vertex_to_inspect.parent_vertices) == 1 49 | assert vertex_to_inspect.parent_vertices[0].name == 'features__categorical__education__impute' 50 | -------------------------------------------------------------------------------- /extra-assignment/README.md: -------------------------------------------------------------------------------- 1 | ## Optional assignment for extra credits (max. 
10 points) 2 | 3 | **Task**: Create a **tutorial notebook for an open source project relevant to the course**, using **publicly available data**, similar to the example notebooks we worked on in the labs. 4 | 5 | You should create a **single jupyter notebook with comments and stepwise instructions** to apply one of 6 | the following open source libraries on **a publicly available small dataset**: 7 | 8 | * [Apache Beam](https://beam.apache.org/) 9 | * [DataWig](https://github.com/awslabs/datawig) 10 | * [mlflow](https://mlflow.org/) 11 | * [Tensorflow Data Validation](https://www.tensorflow.org/tfx/data_validation/get_started) 12 | * [Lime](https://github.com/marcotcr/lime) 13 | * [Weld](https://github.com/weld-project/weld) 14 | 15 | The size / length of your notebook should be roughly the same as the notebooks used during the lab. Please also provide a link to the dataset and a requirements.txt file specifying the dependencies. 16 | 17 | Note that your solution must be different from the existing examples for these projects and 18 | from the notebooks which we used for the labs. 
19 | -------------------------------------------------------------------------------- /project-paper/projectpaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/project-paper/projectpaper.pdf -------------------------------------------------------------------------------- /project-paper/projectpaper.tex: -------------------------------------------------------------------------------- 1 | \documentclass[sigconf,10pt]{acmart} 2 | 3 | \usepackage{booktabs} % For formal tables 4 | 5 | \usepackage[utf8]{inputenc} 6 | \usepackage{enumitem} 7 | \usepackage{xcolor} 8 | \usepackage{amsmath} 9 | %\usepackage[ruled,linesnumbered]{algorithm2e} 10 | \usepackage{subfigure} 11 | \usepackage{amssymb} 12 | \usepackage{listings} 13 | 14 | \usepackage{algorithm} 15 | \usepackage{algpseudocode} 16 | \usepackage{blindtext} 17 | 18 | \renewcommand{\algorithmicrequire}{\textbf{Input:}} 19 | 20 | \usepackage{listings} 21 | % Python style for highlighting 22 | \definecolor{darkgray}{rgb}{0.33, 0.33, 0.33} 23 | 24 | 25 | \lstnewenvironment{Python}[1][] 26 | {\lstset{language=Python, 27 | basicstyle = \footnotesize\ttfamily, 28 | keywordstyle = \color{blue}, 29 | keywordstyle = [2] \color{teal}, % just to check that it works 30 | stringstyle = \color{magenta}, 31 | literate={ü}{{\"u}}1 {ö}{{\"o}}1 {É}{{\'E}}1 {œ}{{\oe}}1, 32 | commentstyle = \color{darkgray}\ttfamily, 33 | morekeywords=as, 34 | morekeywords=with, 35 | #1}% 36 | } 37 | {} 38 | 39 | \lstset{} 40 | 41 | \usepackage{tcolorbox} 42 | \usepackage{soul} 43 | 44 | \newcommand{\todo}[1]{\textcolor{magenta}{[#1]}} 45 | 46 | 47 | %\newcommand{\revcmt}[1]{\begin{tcolorbox}[boxrule=1pt, boxsep=4pt,left=3pt,right=3pt,top=2pt,bottom=2pt]\noindent\textit{#1}\end{tcolorbox}} 48 | 49 | \definecolor{charcoal}{rgb}{0.21, 0.27, 0.31} 50 | \newcommand{\revcmt}[1]{\noindent\textit{\textcolor{charcoal}{#1}}} 51 | 52 
| \DeclareTextFontCommand{\texttt}{\ttfamily\hyphenchar\font=45\relax} 53 | 54 | % Copyright 55 | %\setcopyright{none} 56 | %\setcopyright{acmcopyright} 57 | %\setcopyright{acmlicensed} 58 | \setcopyright{rightsretained} 59 | %\setcopyright{usgov} 60 | %\setcopyright{usgovmixed} 61 | %\setcopyright{cagov} 62 | %\setcopyright{cagovmixed} 63 | 64 | 65 | % DOI 66 | \acmDOI{10.475/123_4} 67 | 68 | % ISBN 69 | \acmISBN{123-4567-24-567/08/06} 70 | 71 | %Conference 72 | \acmConference[Data Engineering for Machine Learning Course]{ACM SIGMOD}{2019}{NYU} 73 | \acmYear{2019} 74 | \copyrightyear{2019} 75 | 76 | 77 | \acmArticle{4} 78 | \acmPrice{15.00} 79 | 80 | \settopmatter{printacmref=false} 81 | \renewcommand\footnotetextcopyrightpermission[1]{} 82 | 83 | \title{Team X: Name of the Project} 84 | 85 | \author{Student~1, Student~2, Student~3, Student~4} 86 | \affiliation{% 87 | \institution{New York University} 88 | } 89 | \email{{netid1,netid2,netid3,netid4}@nyu.edu} 90 | 91 | %\renewcommand{\shortauthors}{Schelter et al.} 92 | 93 | \begin{document} 94 | 95 | \begin{abstract} 96 | \todo{Summarize your project paper in about a quarter of a page} \blindtext 97 | \end{abstract} 98 | 99 | \maketitle 100 | 101 | \section{Introduction} 102 | 103 | \todo{Describe your project, why it is important, why it is difficult and summarize how you approached it and which final results you got. 
The introduction should fill up the first page.} 104 | 105 | \blindtext 106 | 107 | \blindtext 108 | 109 | \blindtext 110 | 111 | \blindtext 112 | 113 | \todo{Summarize three achievements of your project} 114 | \begin{itemize} 115 | \item \todo{Achievement 1} 116 | \item \todo{Achievement 2} 117 | \item \todo{Achievement 3} 118 | \end{itemize} 119 | 120 | \newpage 121 | 122 | \section{Problem Statement \& Approach} 123 | 124 | \todo{While the introduction gives the high-level view, this section should go into details and state the problem and the approach (modeling decisions, algorithms, system design, etc) that you took.} 125 | 126 | \subsection{Problem Statement} 127 | 128 | \todo{Try to briefly and concisely describe the problem that you are trying to solve} 129 | 130 | \blindtext 131 | 132 | \blindtext 133 | 134 | \subsection{Approach} 135 | 136 | \todo{Try to briefly and concisely describe the approach that you took to solve your project problem. Try to be generic here. Feel free to use diagrams and figures here.} 137 | 138 | \blindtext 139 | 140 | \blindtext 141 | 142 | \blindtext 143 | 144 | \section{Implementation} 145 | 146 | \todo{Describe in detail how you implemented your solution, here you can talk about software libraries, implementation details, etc.} 147 | 148 | \blindtext 149 | 150 | \blindtext 151 | 152 | \blindtext 153 | 154 | \section{Evaluation} 155 | 156 | \subsection{Experimental Setup} 157 | 158 | \todo{Describe which infrastructure (machine, operating system, library versions) you used for your experiments} 159 | 160 | \blindtext 161 | 162 | \subsection{Datasets} 163 | 164 | \todo{Describe which datasets you used for your experiments} 165 | 166 | \blindtext 167 | 168 | 169 | \subsection{Results} 170 | 171 | \todo{Describe which experiments you ran, which baselines you used, create tables or figures for the results and discuss your findings.} 172 | 173 | \blindtext 174 | 175 | \blindtext 176 | 177 | \blindtext 178 | 179 | 180 | 
\section{Discussion} 181 | 182 | \todo{Summarize your project and the outcome. What went well? What were unexpected difficulties? What would be the next steps to take if you had more time for the project?} 183 | 184 | \blindtext 185 | 186 | \blindtext 187 | 188 | 189 | \section{Detailed Contributions} 190 | 191 | \todo{Summarize the contributions of every student to the project. Give pointers to the other parts of the paper. For example, explain who implemented which parts of the software, who collected and prepared data, who tried different algorithms, etc.} 192 | 193 | \subsection{Student 1} 194 | 195 | \blindtext 196 | 197 | \subsection{Student 2} 198 | 199 | \blindtext 200 | 201 | \subsection{Student 3} 202 | 203 | \blindtext 204 | 205 | \subsection{Student 4} 206 | 207 | \blindtext 208 | 209 | 210 | \end{document} 211 | -------------------------------------------------------------------------------- /project-resources/README.md: -------------------------------------------------------------------------------- 1 | ## Pointers and resources for the group projects 2 | 3 | ### (2) Evaluation of HoloClean 4 | 5 | [HoloClean website](http://holoclean.io) 6 | 7 | Papers: 8 | * [HoloClean: Holistic Data Repairs with Probabilistic Inference](http://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) 9 | * [HoloDetect: Few-Shot Learning for Error Detection](https://arxiv.org/pdf/1904.02285) 10 | 11 | ### (3) Unsupervised Data Quality Validation 12 | 13 | Compressed file with the [partitioned datasets](partitioned-data.zip) for the flights, taxi and posts data. 14 | 15 | ### (6) Missing Data and Fairness 16 | 17 | [IBM AIF360](https://github.com/IBM/AIF360), a Python package with a comprehensive set of fairness metrics for datasets and machine learning models, explanations for these metrics, and algorithms to mitigate bias in datasets and models. 
18 | 19 | 20 | ### (7) Fair AutoML 21 | 22 | [IBM AIF360](https://github.com/IBM/AIF360), a Python package with a comprehensive set of fairness metrics for datasets and machine learning models, explanations for these metrics, and algorithms to mitigate bias in datasets and models. 23 | 24 | [Auto-Sklearn](https://automl.github.io/auto-sklearn/master/), an AutoML library based on scikit-learn. 25 | 26 | [Google AutoML Tables](https://cloud.google.com/automl-tables/) as an example of an industrial-scale AutoML service. 27 | 28 | ### (8) Fairness Labels 29 | 30 | Please talk to Ke for details. 31 | 32 | ### (9) Data Loading for Breast-Cancer Screening 33 | 34 | Please talk to Jason for details. 35 | 36 | ### (10) Web-frontend for the "Amnesia" Recommender System 37 | 38 | * [Short paper](https://drive.google.com/file/d/17M6k_b94stLyPB6LHOq9td2pRDAmREr8/view) on the approach 39 | * [Slides](https://drive.google.com/file/d/1FU4svEaLb6a5v8CI4tl4YQpv4Cr4DUei/view) and [Video](https://www.youtube.com/watch?v=tRyX-aFjUEU) presenting the topic 40 | 41 | Please talk to me for details. 42 | 43 | 44 | -------------------------------------------------------------------------------- /project-resources/partitioned-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schelterlabs/deml-lab/553ae32961ed1cb73d8d9590422c96ecabc81c39/project-resources/partitioned-data.zip --------------------------------------------------------------------------------