├── .README.md.swp ├── .gitignore ├── README.md └── models └── tutorials └── mllib.json /.README.md.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS12/example-personal-repo/master/.README.md.swp -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | ### Introduction 4 | This is your private repository in the DS12 organization. Only you and your instructors will have access to the code in this repo. This is where you will keep all of your solutions for problem sets, tutorials, and labs unless otherwise instructed. 5 | 6 | ### Structure 7 | 8 | We'd like you to organize your repo as follows: 9 | 10 | ``` 11 | |- Methods 12 | |-- Tutorials 13 | |-- Labs 14 | |- Models 15 | |-- Problem Sets 16 | |-- Tutorials 17 | |- Scala 18 | |-- Tutorials 19 | |-- Labs 20 | |- Misc 21 | |-- Tutorials 22 | ``` 23 | -------------------------------------------------------------------------------- /models/tutorials/mllib.json: -------------------------------------------------------------------------------- 1 | {"paragraphs":[{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465926544880_-1098522306","id":"20160614-174904_1826300551","dateCreated":"Jun 14, 2016 5:49:04 PM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:20","text":"\nimport org.apache.spark.mllib.linalg._\nimport org.apache.spark.mllib.regression._\nimport org.apache.spark.mllib.evaluation._\nimport org.apache.spark.mllib.tree._\nimport org.apache.spark.mllib.tree.model._\nimport org.apache.spark.rdd._\n\nval rawData = sc.textFile(\"covtype.data.gz\")","dateUpdated":"Jun 14, 2016 6:41:04 PM","dateFinished":"Jun 14, 2016 6:41:09 PM","dateStarted":"Jun 14, 2016 6:41:04 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"import org.apache.spark.mllib.linalg._\nimport org.apache.spark.mllib.regression._\nimport org.apache.spark.mllib.evaluation._\nimport org.apache.spark.mllib.tree._\nimport org.apache.spark.mllib.tree.model._\nimport org.apache.spark.rdd._\nrawData: org.apache.spark.rdd.RDD[String] = covtype.data.gz MapPartitionsRDD[50] at textFile at :117\n"},"focus":true},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465926572474_-937515020","id":"20160614-174932_1369027418","dateCreated":"Jun 14, 2016 5:49:32 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:67","dateUpdated":"Jun 14, 2016 6:21:41 PM","dateFinished":"Jun 14, 2016 6:21:49 PM","dateStarted":"Jun 14, 2016 6:21:41 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"res33: Long = 581012\n"},"text":"rawData.count"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465926577647_-1797028738","id":"20160614-174937_1424179639","dateCreated":"Jun 14, 2016 5:49:37 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:87","dateUpdated":"Jun 14, 2016 6:21:53 PM","dateFinished":"Jun 14, 2016 6:21:58 PM","dateStarted":"Jun 14, 2016 6:21:53 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"(1,29884)\n(0,551128)\n"},"text":"rawData.map(_.split(',')(11)).countByValue()\n.toSeq.sortBy(_._2).foreach(println)"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465928242959_2088741396","id":"20160614-181722_648535303","dateCreated":"Jun 14, 2016 6:17:22 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:121","dateUpdated":"Jun 14, 2016 6:18:00 PM","dateFinished":"Jun 14, 2016 6:18:00 PM","dateStarted":"Jun 14, 2016 6:18:00 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[34] at map at :58\n"},"text":"val data = rawData.map { line =>\nval values = line.split(',').map(_.toDouble)\nval featureVector = Vectors.dense(values.init)\nval label = values.last - 1\nLabeledPoint(label, featureVector)\n}"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465928280334_-2092078071","id":"20160614-181800_1350385226","dateCreated":"Jun 14, 2016 6:18:00 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:141","dateUpdated":"Jun 14, 2016 6:18:19 PM","dateFinished":"Jun 14, 2016 6:18:20 PM","dateStarted":"Jun 14, 2016 6:18:19 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[35] at randomSplit at :60, MapPartitionsRDD[36] at randomSplit at :60)\ntrainData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[35] at randomSplit at :60\ntestData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[36] at randomSplit at :60\n"},"text":"val splits = data.randomSplit(Array(0.7, 0.3))\nval (trainData, testData) = (splits(0), splits(1))"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465928299291_792415082","id":"20160614-181819_1695893611","dateCreated":"Jun 14, 2016 6:18:19 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:161","dateUpdated":"Jun 14, 2016 6:41:28 PM","dateFinished":"Jun 14, 2016 6:41:30 PM","dateStarted":"Jun 14, 2016 6:41:28 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"numClasses: Int = 7\ncategoricalFeaturesInfo: scala.collection.immutable.Map[Int,Int] = Map()\nimpurity: String = gini\nmaxDepth: Int = 3\nmaxBins: Int = 32\n"},"text":"val numClasses = 7\nval categoricalFeaturesInfo = Map[Int, Int]()\nval impurity = \"gini\"\nval maxDepth = 3\nval maxBins = 32"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465929530256_1832149461","id":"20160614-183850_1341415599","dateCreated":"Jun 14, 2016 6:38:50 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:187","dateUpdated":"Jun 14, 2016 6:41:35 PM","dateFinished":"Jun 14, 2016 6:42:06 PM","dateStarted":"Jun 14, 2016 6:41:35 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 3 with 15 nodes\n"},"text":"val model = DecisionTree.trainClassifier(trainData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465929549605_-1861024583","id":"20160614-183909_1385343868","dateCreated":"Jun 14, 2016 6:39:09 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:207","dateUpdated":"Jun 14, 2016 6:51:52 PM","dateFinished":"Jun 14, 2016 6:52:04 PM","dateStarted":"Jun 14, 2016 6:51:52 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"labelAndPreds: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[80] at map at :130\ntestErr: Double = 0.32530853567533435\nTest Error = 0.32530853567533435\n"},"text":"val labelAndPreds = testData.map { point =>\n val prediction = model.predict(point.features)\n (prediction, point.label)\n}\n\nval testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()\nprintln(\"Test Error = \" + testErr)"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465929741114_1746056008","id":"20160614-184221_2012087483","dateCreated":"Jun 14, 2016 6:42:21 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:254","dateUpdated":"Jun 14, 2016 6:52:07 PM","dateFinished":"Jun 14, 2016 6:52:26 PM","dateStarted":"Jun 14, 2016 6:52:07 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@12a7175f\nres62: Array[Double] = Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0)\nres63: org.apache.spark.mllib.linalg.Matrix = \n42525.0 18247.0 35.0 0.0 0.0 0.0 2443.0 \n19622.0 63807.0 1675.0 0.0 0.0 0.0 204.0 \n0.0 2422.0 8351.0 0.0 0.0 0.0 0.0 \n0.0 0.0 791.0 0.0 0.0 0.0 0.0 \n0.0 2796.0 22.0 0.0 0.0 0.0 0.0 \n0.0 1147.0 4084.0 0.0 0.0 0.0 0.0 \n3169.0 15.0 0.0 0.0 0.0 0.0 2855.0 \nres64: Double = 0.6746914643246656\n"},"text":"val metrics = new MulticlassMetrics(labelAndPreds)\nmetrics.labels\nmetrics.confusionMatrix\nmetrics.precision"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465930502854_1191323594","id":"20160614-185502_2111116373","dateCreated":"Jun 14, 2016 6:55:02 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:326","dateUpdated":"Jun 14, 2016 7:02:23 PM","dateFinished":"Jun 14, 2016 7:02:25 PM","dateStarted":"Jun 14, 2016 7:02:23 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[95] at randomSplit at :120, MapPartitionsRDD[96] at randomSplit at :120, MapPartitionsRDD[97] at randomSplit at :120)\ntrainData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[95] at randomSplit at :120\ncvData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[96] at randomSplit at :120\ntestData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[97] at randomSplit at :120\n"},"text":"val splits = data.randomSplit(Array(0.7, 0.1, 0.2))\nval (trainData, cvData, testData) = (splits(0), splits(1), splits(2))"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465930933478_1936442651","id":"20160614-190213_797820460","dateCreated":"Jun 14, 2016 7:02:13 PM","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:355","dateUpdated":"Jun 14, 2016 7:03:31 PM","dateFinished":"Jun 14, 2016 7:10:27 PM","dateStarted":"Jun 14, 2016 7:03:31 PM","result":{"code":"SUCCESS","type":"TEXT","msg":"evaluations: Array[((String, Int, Int), Double)] = Array(((gini,1,10),0.635763990813807), ((gini,1,300),0.6349524286429644), ((gini,20,10),0.8879871531435084), ((gini,20,300),0.902750677740749), ((entropy,1,10),0.488577694127398), ((entropy,1,300),0.488577694127398), ((entropy,20,10),0.8907671852606496), ((entropy,20,300),0.9081035346122632))\n((entropy,20,300),0.9081035346122632)\n((gini,20,300),0.902750677740749)\n((entropy,20,10),0.8907671852606496)\n((gini,20,10),0.8879871531435084)\n((gini,1,10),0.635763990813807)\n((gini,1,300),0.6349524286429644)\n((entropy,1,300),0.488577694127398)\n((entropy,1,10),0.488577694127398)\n"},"text":"val evaluations =\nfor (impurity <- Array(\"gini\", \"entropy\");\n depth <- Array(1, 20);\n bins <- Array(10, 300))\nyield {\n val model = DecisionTree.trainClassifier(\n trainData, numClasses, categoricalFeaturesInfo, impurity, depth, bins)\n val predictionsAndLabels = cvData.map(example => (model.predict(example.features), example.label))\nval accuracy =\nnew MulticlassMetrics(predictionsAndLabels).precision\n ((impurity, depth, bins), accuracy)\n }\nevaluations.sortBy(_._2).reverse.foreach(println)"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1465930965532_1291080828","id":"20160614-190245_61101456","dateCreated":"Jun 14, 2016 7:02:45 PM","status":"READY","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:377"}],"name":"Models-Tutorial-1","id":"2BPJZ81AZ","angularObjects":{"2B44YVSN1":[],"2AJXGMUUJ":[],"2AK8P7CPX":[],"2AM1YV5CU":[],"2AKK3QQXU":[],"2ANGGHHMQ":[]},"config":{"looknfeel":"default"},"info":{}} --------------------------------------------------------------------------------