├── reports
│   └── images
│       ├── roc_all.png
│       ├── roc_curve.png
│       ├── varImprt.png
│       ├── roc_all_zoom.png
│       ├── roc_curve_zoom.png
│       ├── breastCancerWisconsinDataSet_MachineLearning_19_0.png
│       ├── breastCancerWisconsinDataSet_MachineLearning_22_1.png
│       ├── breastCancerWisconsinDataSet_MachineLearning_25_0.png
│       └── breastCancerWisconsinDataSet_MachineLearning_34_0.png
├── models
│   └── pickle_models
│       ├── model_nn.pkl
│       ├── model_rf.pkl
│       └── model_knn.pkl
├── src
│   ├── r
│   │   ├── .Rprofile
│   │   ├── README.md
│   │   ├── r.Rproj
│   │   ├── packrat
│   │   │   ├── packrat.opts
│   │   │   ├── init.R
│   │   │   └── packrat.lock
│   │   ├── random_forest.R
│   │   ├── breastCancer.R
│   │   └── breast_cancer.Rmd
│   ├── pyspark
│   │   ├── breast_cancer_neural_networks.scala
│   │   ├── breast_cancer_rdd.py
│   │   ├── breast_cancer_df.py
│   │   └── breast_cancer_zeppelin_notebook.json
│   └── python
│       ├── produce_model_metrics.py
│       ├── exploratory_analysis.py
│       ├── data_extraction.py
│       ├── neural_networks.py
│       ├── knn.py
│       ├── random_forest.py
│       ├── model_eval.py
│       └── helper_functions.py
├── notebooks
│   └── random_forest_files
│       ├── output_36_1.png
│       ├── output_50_0.png
│       ├── output_58_0.png
│       ├── output_65_0.png
│       └── output_67_0.png
├── dash_dashboard
│   ├── dash_breast_cancer.css
│   ├── README.md
│   ├── global_vars.py
│   └── app.py
├── LICENSE.md
├── requirements.txt
├── .gitignore
└── README.md

/reports/images/roc_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/roc_all.png
--------------------------------------------------------------------------------
/reports/images/roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/roc_curve.png
--------------------------------------------------------------------------------
/reports/images/varImprt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/varImprt.png
--------------------------------------------------------------------------------
/models/pickle_models/model_nn.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/models/pickle_models/model_nn.pkl
--------------------------------------------------------------------------------
/models/pickle_models/model_rf.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/models/pickle_models/model_rf.pkl
--------------------------------------------------------------------------------
/reports/images/roc_all_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/roc_all_zoom.png
--------------------------------------------------------------------------------
/reports/images/roc_curve_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/roc_curve_zoom.png
--------------------------------------------------------------------------------
/src/r/.Rprofile:
--------------------------------------------------------------------------------
1 | #### -- Packrat Autoloader (version 0.4.8-1) -- ####
2 | source("packrat/init.R")
3 | #### -- End Packrat Autoloader -- ####
4 | 
--------------------------------------------------------------------------------
/models/pickle_models/model_knn.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/models/pickle_models/model_knn.pkl
--------------------------------------------------------------------------------
/notebooks/random_forest_files/output_36_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/notebooks/random_forest_files/output_36_1.png
--------------------------------------------------------------------------------
/notebooks/random_forest_files/output_50_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/notebooks/random_forest_files/output_50_0.png
--------------------------------------------------------------------------------
/notebooks/random_forest_files/output_58_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/notebooks/random_forest_files/output_58_0.png
--------------------------------------------------------------------------------
/notebooks/random_forest_files/output_65_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/notebooks/random_forest_files/output_65_0.png
--------------------------------------------------------------------------------
/notebooks/random_forest_files/output_67_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/notebooks/random_forest_files/output_67_0.png
--------------------------------------------------------------------------------
/src/r/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Techniques on Breast Cancer Wisconsin Data Set
2 | 
3 | This serves as the sub-directory for the breast cancer project containing all of the R-related code. More details to come.
4 | 
--------------------------------------------------------------------------------
/reports/images/breastCancerWisconsinDataSet_MachineLearning_19_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/breastCancerWisconsinDataSet_MachineLearning_19_0.png
--------------------------------------------------------------------------------
/reports/images/breastCancerWisconsinDataSet_MachineLearning_22_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/breastCancerWisconsinDataSet_MachineLearning_22_1.png
--------------------------------------------------------------------------------
/reports/images/breastCancerWisconsinDataSet_MachineLearning_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/breastCancerWisconsinDataSet_MachineLearning_25_0.png
--------------------------------------------------------------------------------
/reports/images/breastCancerWisconsinDataSet_MachineLearning_34_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raviolli77/machineLearning_breastCancer_Python/HEAD/reports/images/breastCancerWisconsinDataSet_MachineLearning_34_0.png
--------------------------------------------------------------------------------
/src/r/r.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | 
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 | 
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
--------------------------------------------------------------------------------
/src/r/packrat/packrat.opts:
--------------------------------------------------------------------------------
1 | auto.snapshot: TRUE
2 | use.cache: FALSE
3 | print.banner.on.startup: auto
4 | vcs.ignore.lib: TRUE
5 | vcs.ignore.src: FALSE
6 | external.packages: 
7 | local.repos: 
8 | load.external.packages.on.startup: TRUE
9 | ignored.packages: 
10 | quiet.package.installation: TRUE
11 | snapshot.recommended.packages: FALSE
12 | snapshot.fields: 
13 |     Imports
14 |     Depends
15 |     LinkingTo
16 | 
--------------------------------------------------------------------------------
/dash_dashboard/dash_breast_cancer.css:
--------------------------------------------------------------------------------
1 | .banner {
2 |     height: 75px;
3 |     margin: 0px -10px 10px;
4 |     background-color: #00878d;
5 |     border-radius: 2px;
6 | }
7 | 
8 | .banner h2 {
9 |     color: white;
10 |     padding-top: 10px;
11 |     margin-left: 2%;
12 |     display: inline-block;
13 | }
14 | 
15 | table, td, th {
16 |     border: 1px solid #ddd;
17 |     text-align: left;
18 | }
19 | 
20 | table {
21 |     border-collapse: collapse;
22 |     width: 100%;
23 | }
24 | 
25 | th, td {
26 |     padding: 15px;
27 | }
28 | 
29 | 
30 | h1, h2, h3, h4, h5, h6 {
31 |     color: #24515d;
32 |     font-family: "Courier New", Courier;
33 | }
34 | 
35 | p {
36 |     font-family: "Courier New", Courier;
37 | }
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Inertia7
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/dash_dashboard/README.md:
--------------------------------------------------------------------------------
1 | # Dash Dashboard
2 | 
3 | This document serves as a `readme` for the subdirectory containing the code for the interactive dashboard built with [Dash](https://plot.ly/products/dash/), made available by [plotly](https://plot.ly/).
4 | 
5 | To run the application, run `app.py`; your web browser will open the dashboard.
6 | 
7 | ## Exploratory Analysis
8 | 
9 | This section explores three-variable interactions with a 3D scatter plot that showcases the relationship between the variables of your choice, along with histograms showing the distribution of the data separated by diagnosis.
10 | 
11 | 
12 | 
13 | ## Machine Learning
14 | 
15 | This section showcases the machine learning section of the project.
16 | The metrics are as follows:
17 | 
18 | + ROC Curves
19 | + Interactive Confusion Matrix
20 | + Classification Report - outputs the `classification_report` function from `sklearn` in an html table.
21 | 
22 | 
23 | 
24 | Any questions or suggestions, please let me know!
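
For reference, a typical launch looks like this (a minimal sketch; it assumes the packages in `requirements.txt` are installed and that `app.py` serves on Dash's default port, 8050):

    $ python3 app.py
    # then browse to http://127.0.0.1:8050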
25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | appnope==0.1.0 3 | bleach==3.1.1 4 | certifi==2018.1.18 5 | chardet==3.0.4 6 | click==6.7 7 | cycler==0.10.0 8 | dash==0.19.0 9 | dash-core-components==0.16.0 10 | dash-html-components==0.8.0 11 | dash-renderer==0.11.1 12 | decorator==4.2.1 13 | entrypoints==0.2.2 14 | Flask==1.0 15 | Flask-Compress==1.4.0 16 | html5lib==0.999999999 17 | idna==2.6 18 | ipykernel==4.6.1 19 | ipython==6.0.0 20 | ipython-genutils==0.2.0 21 | ipywidgets==6.0.0 22 | itsdangerous==0.24 23 | jedi==0.10.2 24 | Jinja2==2.9.6 25 | jsonschema==2.6.0 26 | jupyter==1.0.0 27 | jupyter-client==5.2.3 28 | jupyter-console==5.2.0 29 | jupyter-core==4.4.0 30 | kiwisolver==1.0.1 31 | MarkupSafe==1.0 32 | matplotlib==2.2.2 33 | mistune==0.8.1 34 | nbconvert==5.1.1 35 | nbformat==4.4.0 36 | notebook==5.7.8 37 | numpy==1.14.5 38 | packaging==16.8 39 | pandas==0.23.3 40 | pandocfilters==1.4.1 41 | pexpect==4.2.1 42 | pickleshare==0.7.4 43 | plotly==3.1.0 44 | prometheus-client==0.3.0 45 | prompt-toolkit==1.0.14 46 | ptyprocess==0.5.1 47 | Pygments==2.2.0 48 | pyparsing==2.1.4 49 | python-dateutil==2.6.1 50 | pytz==2017.3 51 | pyzmq==17.1.0 52 | qtconsole==4.3.1 53 | requests==2.20.0 54 | retrying==1.3.3 55 | scikit-learn==0.19.2 56 | scipy==1.1.0 57 | seaborn==0.9.0 58 | Send2Trash==1.5.0 59 | simplegeneric==0.8.1 60 | six==1.11.0 61 | sklearn==0.0 62 | terminado==0.8.1 63 | terminaltables==3.1.0 64 | testpath==0.3 65 | tornado==4.5.1 66 | traitlets==4.3.2 67 | urllib3==1.25.3 68 | virtualenv==15.1.0 69 | wcwidth==0.1.7 70 | webencodings==0.5.1 71 | Werkzeug==0.15.3 72 | widgetsnbextension==2.0.0 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_store 2 | 3 | # Rstuff 4 | src/r/packrat/lib*/ 5 | src/r/packrat/src/* 6 | 7 | rsconnect/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *.pyc 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Icebox stuff 19 | ice_box/ 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | .static_storage/ 68 | .media/ 69 | local_settings.py 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # Environments 97 | .env 98 | .venv 99 | env/ 100 | venv/ 101 | ENV/ 102 | env.bak/ 103 | venv.bak/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | # R Stuff 118 | .Rhistory 119 | .Rproj.user 120 | -------------------------------------------------------------------------------- /src/pyspark/breast_cancer_neural_networks.scala: -------------------------------------------------------------------------------- 1 | // Load appropriate packages 2 | // Neural Networks 3 | // Compatible with Apache Zeppelin 4 | import org.apache.spark.ml.classification.MultilayerPerceptronClassifier 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.MinMaxScaler 7 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 8 | import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} 9 | 10 | // Read in file 11 | val data = spark.read.format("libsvm") 12 | .load("data/data.txt") 13 | 14 | data.collect() 15 | 16 | // Pre-processing 17 | val scaler = new MinMaxScaler() 18 | .setInputCol("features") 19 | .setOutputCol("scaledFeatures") 20 | 21 | val scalerModel = scaler.fit(data) 22 | 23 | val scaledData = scalerModel.transform(data) 24 | println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]") 25 | scaledData.select("features", "scaledFeatures").show() 26 | 27 | // Changing RDD files variable names to get accurate predictions 28 | val newNames = Seq("label", "oldFeatures", "features") 29 | val data2 = scaledData.toDF(newNames: _*) 30 | 31 | val splits = data2.randomSplit(Array(0.7, 0.3), seed = 1234L) 32 | val trainingSet = splits(0) 33 | val testSet = splits(1) 34 | 35 | trainingSet.select("label", "features").show(25) 36 | 37 | // Neural Networks 38 | val layers = Array[Int](30, 5, 4, 2) 39 | 40 | // Train the Network 41 | val trainer = new MultilayerPerceptronClassifier() 42 | .setLayers(layers) 43 | .setBlockSize(128) 44 | .setSeed(1234L) 45 | .setMaxIter(100) 46 | 47 | val fitNN = trainer.fit(trainingSet) 48 | 49 | // Predict the Test set 50 | val results = fitNN.transform(testSet) 51 | val predictionAndLabelsNN = results.select("prediction", "label") 52 | val evaluator = new MulticlassClassificationEvaluator() 53 | .setMetricName("accuracy") 54 | 55 | println("Test error rate = " + (1 - evaluator.evaluate(predictionAndLabelsNN))) 56 | 57 | println("Test set accuracy = " + evaluator.evaluate(predictionAndLabelsNN)) -------------------------------------------------------------------------------- 
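
For reference, the Scala script above can also be run outside of Zeppelin by loading it into an interactive Spark shell. A minimal sketch, assuming a local Spark installation and that `data/data.txt` exists relative to the working directory:

    $ spark-shell -i src/pyspark/breast_cancer_neural_networks.scala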
/src/python/produce_model_metrics.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from sklearn.metrics import roc_curve
3 | from sklearn.metrics import auc
4 | 
5 | # Function for All Models to produce Metrics ---------------------
6 | 
7 | def produce_model_metrics(fit, test_set, test_class_set, estimator):
8 |     """
9 |     Purpose
10 |     ----------
11 |     Function that will return predictions and probability metrics for said
12 |     predictions.
13 | 
14 |     Parameters
15 |     ----------
16 |     * fit: Fitted model containing the attribute feature_importances_
17 |     * test_set: dataframe/array containing the test set values
18 |     * test_class_set: array containing the target values for the test set
19 |     * estimator: String representation of appropriate model, can only contain the
20 |     following: ['knn', 'rf', 'nn']
21 | 
22 |     Returns
23 |     ----------
24 |     Dictionary containing the test-set predictions, accuracy, fpr, tpr, and auc
25 |     """
26 |     my_estimators = {
27 |     'rf': 'estimators_',
28 |     'nn': 'out_activation_',
29 |     'knn': '_fit_method'
30 |     }
31 |     try:
32 |         # Captures whether first parameter is a model
33 |         if not hasattr(fit, 'fit'):
34 |             return print("'{0}' is not an instantiated model from scikit-learn".format(fit))
35 | 
36 |         # Captures whether the model has been trained
37 |         if not vars(fit)[my_estimators[estimator]]:
38 |             return print("Model does not appear to be trained.")
39 | 
40 |     except KeyError as e:
41 |         raise KeyError("'{0}' does not correspond with the appropriate key inside the estimators dictionary. \
42 |         Please refer to function to check `my_estimators` dictionary.".format(estimator))
43 | 
44 | 
45 |     # Outputting predictions and prediction probability
46 |     # for test set
47 |     predictions = fit.predict(test_set)
48 |     accuracy = fit.score(test_set, test_class_set)
49 |     # We grab the second array from the output, which corresponds
50 |     # to the predicted probabilities of the positive class
51 |     # Ordered wrt fit.classes_, in our case [0, 1] where 1 is our positive class
52 |     predictions_prob = fit.predict_proba(test_set)[:, 1]
53 |     # ROC Curve stuff
54 |     fpr, tpr, _ = roc_curve(test_class_set,
55 |         predictions_prob,
56 |         pos_label = 1)
57 |     auc_fit = auc(fpr, tpr)
58 |     return {'predictions': predictions,
59 |         'accuracy': accuracy,
60 |         'fpr': fpr,
61 |         'tpr': tpr,
62 |         'auc': auc_fit}
63 | 
--------------------------------------------------------------------------------
/src/pyspark/breast_cancer_rdd.py:
--------------------------------------------------------------------------------
1 | # LOAD APPROPRIATE PACKAGES
2 | import numpy as np
3 | from pyspark.context import SparkContext
4 | from pyspark.mllib.util import MLUtils
5 | from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
6 | from pyspark.mllib.tree import RandomForest, RandomForestModel
7 | from pyspark.mllib.evaluation import BinaryClassificationMetrics
8 | 
9 | sc = SparkContext.getOrCreate()
10 | data = MLUtils.loadLibSVMFile(sc, 'data/dataLibSVM.txt')
11 | print(data)
12 | # NEXT LET'S CREATE THE APPROPRIATE TRAINING AND TEST SETS
13 | # WE'LL BE SETTING THEM AS 70-30, ALONG WITH SETTING A
14 | # RANDOM SEED GENERATOR TO MAKE MY RESULTS REPRODUCIBLE
15 | 
16 | (trainingSet, testSet) = data.randomSplit([0.7, 0.3], seed = 7)
17 | 
18 | ##################
19 | # DECISION TREES #
20 | ##################
21 | 
22 | fitDT = DecisionTree.trainClassifier(trainingSet,
23 |     numClasses=2,
24 |     categoricalFeaturesInfo={},
25 |     impurity='gini',
26 |     maxDepth=3,
27 |     maxBins=32)
28 | 
29 | print(fitDT.toDebugString())
30 | 
31 | predictionsDT = fitDT.predict(testSet.map(lambda x: x.features))
32 | 
33 | labelsAndPredictionsDT = testSet.map(lambda lp: lp.label).zip(predictionsDT)
34 | 
35 | # Test Error Rate Evaluations
36 | # (lambda tuple unpacking is Python 2 only, so the
37 | # (label, prediction) pair is indexed explicitly here)
38 | testErrDT = labelsAndPredictionsDT.filter(lambda vp: vp[0] != vp[1]).count() / float(testSet.count())
39 | 
40 | print('Test Error = {0}'.format(testErrDT))
41 | 
42 | # Instantiate metrics object
43 | metricsDT = BinaryClassificationMetrics(labelsAndPredictionsDT)
44 | 
45 | # Area under ROC curve
46 | print("Area under ROC = {0}".format(metricsDT.areaUnderROC))
47 | 
48 | #################
49 | # RANDOM FOREST #
50 | #################
51 | 
52 | fitRF = RandomForest.trainClassifier(trainingSet,
53 |     numClasses = 2,
54 |     categoricalFeaturesInfo = {},
55 |     numTrees = 500,
56 |     featureSubsetStrategy="auto",
57 |     impurity = 'gini', # USING GINI INDEX FOR OUR RANDOM FOREST MODEL
58 |     maxDepth = 4,
59 |     maxBins = 100)
60 | 
61 | predictionsRF = fitRF.predict(testSet.map(lambda x: x.features))
62 | 
63 | labelsAndPredictions = testSet.map(lambda lp: lp.label).zip(predictionsRF)
64 | 
65 | 
66 | testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testSet.count())
67 | 
68 | print('Test Error = {0}'.format(testErr))
69 | print('Learned classification forest model:')
70 | print(fitRF.toDebugString())
71 | 
72 | # Instantiate metrics object
73 | metricsRF = BinaryClassificationMetrics(labelsAndPredictions)
74 | 
75 | # Area under ROC curve
76 | print("Area under ROC = {0}".format(metricsRF.areaUnderROC))
77 | 
78 | ###################
79 | # NEURAL NETWORKS #
80 | ###################
81 | 
82 | # See Scala Script
--------------------------------------------------------------------------------
/src/python/exploratory_analysis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | #####################################################
4 | ##    WISCONSIN BREAST CANCER MACHINE LEARNING     ##
5 | #####################################################
6 | #
7 | # Project by Raul Eulogio
8 | #
9 | # Project found at: https://www.inertia7.com/projects/3
10 | # NOTE: Better viewed in Jupyter Notebook format
11 | 
12 | """
13 | Exploratory Analysis
14 | """
15 | import helper_functions as hf
16 | from data_extraction import breast_cancer
17 | import matplotlib.pyplot as plt
18 | import seaborn as sns
19 | 
20 | print('''
21 | ########################################
22 | ##    DATA FRAME SHAPE AND DTYPES     ##
23 | ########################################
24 | ''')
25 | 
26 | print("Here are the dimensions of our data frame:\n",
27 |     breast_cancer.shape)
28 | 
29 | print("Here are the data types of our columns:\n",
30 |     breast_cancer.dtypes)
31 | 
32 | print("Some more statistics for our data frame: \n",
33 |     breast_cancer.describe())
34 | 
35 | print('''
36 | ##########################################
37 | ##       STATISTICS RELATING TO DX      ##
38 | ##########################################
39 | ''')
40 | 
41 | # Next let's use the helper function to show distribution
42 | # of our data frame
43 | hf.print_target_perc(breast_cancer, 'diagnosis')
44 | 
45 | 
46 | # Scatterplot Matrix
47 | # Variables chosen from Random Forest modeling.
48 | 49 | cols = ['concave_points_worst', 'concavity_mean', 50 | 'perimeter_worst', 'radius_worst', 51 | 'area_worst', 'diagnosis'] 52 | 53 | sns.pairplot(breast_cancer, 54 | x_vars = cols, 55 | y_vars = cols, 56 | hue = 'diagnosis', 57 | palette = ('Red', '#875FDB'), 58 | markers=["o", "D"]) 59 | 60 | plt.title('Scatterplot Matrix') 61 | plt.show() 62 | plt.close() 63 | 64 | # Pearson Correlation Matrix 65 | corr = breast_cancer.corr(method = 'pearson') # Correlation Matrix 66 | f, ax = plt.subplots(figsize=(11, 9)) 67 | 68 | # Generate a custom diverging colormap 69 | cmap = sns.diverging_palette(10, 275, as_cmap=True) 70 | 71 | # Draw the heatmap with the mask and correct aspect ratio 72 | sns.heatmap(corr, 73 | cmap=cmap, 74 | square=True, 75 | xticklabels=True, 76 | yticklabels=True, 77 | linewidths=.5, 78 | cbar_kws={"shrink": .5}, 79 | ax=ax) 80 | 81 | plt.title("Pearson Correlation Matrix") 82 | plt.yticks(rotation = 0) 83 | plt.xticks(rotation = 270) 84 | plt.show() 85 | plt.close() 86 | 87 | # BoxPlot 88 | hf.plot_box_plot(breast_cancer, 'Pre-Processed', (-.05, 50)) 89 | 90 | # Normalizing data 91 | breast_cancer_norm = hf.normalize_data_frame(breast_cancer) 92 | 93 | # Visuals relating to normalized data to show significant difference 94 | print(''' 95 | ################################# 96 | ## Transformed Data Statistics ## 97 | ################################# 98 | ''') 99 | 100 | print(breast_cancer_norm.describe()) 101 | 102 | hf.plot_box_plot(breast_cancer_norm, 'Transformed', (-.05, 1.05)) 103 | -------------------------------------------------------------------------------- /src/python/data_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ##################################################### 4 | ## WISCONSIN BREAST CANCER MACHINE LEARNING ## 5 | ##################################################### 6 | # 7 | # Project by Raul Eulogio 8 | # 9 | # Project found at: https://www.inertia7.com/projects/3 10 | # 11 | 12 | # Import Packages ----------------------------------------------- 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.preprocessing import MinMaxScaler 16 | from sklearn.model_selection import train_test_split 17 | from urllib.request import urlopen 18 | 19 | # Loading data ------------------------------ 20 | UCI_data_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases\ 21 | /breast-cancer-wisconsin/wdbc.data' 22 | 23 | names = ['id_number', 'diagnosis', 'radius_mean', 24 | 'texture_mean', 'perimeter_mean', 'area_mean', 25 | 'smoothness_mean', 'compactness_mean', 26 | 'concavity_mean','concave_points_mean', 27 | 'symmetry_mean', 'fractal_dimension_mean', 28 | 'radius_se', 'texture_se', 'perimeter_se', 29 | 'area_se', 'smoothness_se', 'compactness_se', 30 | 'concavity_se', 'concave_points_se', 31 | 'symmetry_se', 'fractal_dimension_se', 32 | 'radius_worst', 'texture_worst', 33 | 'perimeter_worst', 'area_worst', 34 | 'smoothness_worst', 'compactness_worst', 35 | 'concavity_worst', 'concave_points_worst', 36 | 'symmetry_worst', 'fractal_dimension_worst'] 37 | 38 | dx = ['Malignant', 'Benign'] 39 | 40 | breast_cancer = pd.read_csv(urlopen(UCI_data_URL), names=names) 41 | 42 | # Setting 'id_number' as our index 43 | breast_cancer.set_index(['id_number'], inplace = True) 44 | 45 | # Converted to binary to help later on with models and plots 46 | breast_cancer['diagnosis'] = breast_cancer['diagnosis'].map({'M':1, 'B':0}) 47 | 48 | for col in breast_cancer: 
49 |     breast_cancer[col] = pd.to_numeric(breast_cancer[col], errors='coerce')
50 | 
51 | # For later use in CART models
52 | names_index = names[2:]
53 | 
54 | # Create Training and Test Set ----------------------------------
55 | feature_space = breast_cancer.iloc[:,
56 |     breast_cancer.columns != 'diagnosis']
57 | feature_class = breast_cancer.iloc[:,
58 |     breast_cancer.columns == 'diagnosis']
59 | 
60 | 
61 | training_set, test_set, class_set, test_class_set = train_test_split(feature_space,
62 |     feature_class,
63 |     test_size = 0.20,
64 |     random_state = 42)
65 | 
66 | # Cleaning test sets to avoid future warning messages
67 | class_set = class_set.values.ravel()
68 | test_class_set = test_class_set.values.ravel()
69 | 
70 | # Scaling dataframe
71 | scaler = MinMaxScaler()
72 | 
73 | scaler.fit(training_set)
74 | 
75 | training_set_scaled = scaler.transform(training_set)
76 | test_set_scaled = scaler.transform(test_set)
77 | 
--------------------------------------------------------------------------------
/src/python/neural_networks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | #####################################################
4 | ##    WISCONSIN BREAST CANCER MACHINE LEARNING     ##
5 | #####################################################
6 | #
7 | # Project by Raul Eulogio
8 | #
9 | # Project found at: https://www.inertia7.com/projects/3
10 | #
11 | 
12 | """
13 | Neural Networks Classification
14 | """
15 | # Import Packages -----------------------------------------------
16 | import sys, os
17 | import pandas as pd
18 | import helper_functions as hf
19 | from data_extraction import training_set_scaled, class_set
20 | from data_extraction import test_set_scaled, test_class_set
21 | from sklearn.neural_network import MLPClassifier
22 | from produce_model_metrics import produce_model_metrics
23 | 
24 | # Fitting Neural Network ----------------------------------------
25 | # Fit model
26 | fit_nn = MLPClassifier(solver='lbfgs',
27 |     hidden_layer_sizes = (12, ),
28 |     activation='tanh',
29 |     learning_rate_init=0.05,
30 |     random_state=42)
31 | 
32 | # Train model on training set
33 | fit_nn.fit(training_set_scaled,
34 |     class_set)
35 | 
36 | if __name__ == '__main__':
37 |     # Print model parameters ------------------------------------
38 |     print(fit_nn, '\n')
39 | 
40 |     # Initialize function for metrics ---------------------------
41 |     fit_dict_nn = produce_model_metrics(fit_nn, test_set_scaled,
42 |         test_class_set, 'nn')
43 |     # Extract each piece from dictionary
44 |     predictions_nn = fit_dict_nn['predictions']
45 |     accuracy_nn = fit_dict_nn['accuracy']
46 |     auc_nn = fit_dict_nn['auc']
47 | 
48 | 
49 |     print("Hyperparameter Optimization:")
50 |     print("chosen parameters: \n \
51 |     {'hidden_layer_sizes': 12, \n \
52 |     'activation': 'tanh', \n \
53 |     'learning_rate_init': 0.05}")
54 |     print("Note: Remove commented code to see this section \n")
55 | 
56 |     # from sklearn.model_selection import GridSearchCV
57 |     # import time
58 |     # start = time.time()
59 |     # gs = GridSearchCV(fit_nn, cv = 10,
60 |     #     param_grid={
61 |     #     'learning_rate_init': [0.05, 0.01, 0.005, 0.001],
62 |     #     'hidden_layer_sizes': [4, 8, 12],
63 |     #     'activation': ["relu", "identity", "tanh", "logistic"]})
64 |     # gs.fit(training_set_scaled, class_set)
65 |     # print(gs.best_params_)
66 |     # end = time.time()
67 |     # print(end - start)
68 | 
69 |     # Test Set Calculations -------------------------------------
70 |     # Test error rate
71 |     test_error_rate_nn = 1 - accuracy_nn
72 | 
73 |     # Confusion Matrix
74 |     test_crosstb 
= hf.create_conf_mat(test_class_set, 75 | predictions_nn) 76 | 77 | # Cross validation 78 | print("Cross Validation:") 79 | 80 | hf.cross_val_metrics(fit_nn, 81 | training_set_scaled, 82 | class_set, 83 | 'nn', 84 | print_results = True) 85 | 86 | print('Confusion Matrix:') 87 | print(test_crosstb, '\n') 88 | 89 | print("Here is our mean accuracy on the test set:\n {0: .3f}"\ 90 | .format(accuracy_nn)) 91 | 92 | print("The test error rate for our model is:\n {0: .3f}"\ 93 | .format(test_error_rate_nn)) 94 | -------------------------------------------------------------------------------- /src/python/knn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ##################################################### 4 | ## WISCONSIN BREAST CANCER MACHINE LEARNING ## 5 | ##################################################### 6 | # 7 | # Project by Raul Eulogio 8 | # 9 | # Project found at: https://www.inertia7.com/projects/3 10 | # 11 | 12 | """ 13 | Kth Nearest Neighbor Classification 14 | """ 15 | # Import Packages ----------------------------------------------- 16 | import sys, os 17 | import pandas as pd 18 | import helper_functions as hf 19 | from data_extraction import training_set, class_set 20 | from data_extraction import test_set, test_class_set 21 | from sklearn.neighbors import KNeighborsClassifier 22 | from sklearn.model_selection import cross_val_score 23 | from produce_model_metrics import produce_model_metrics 24 | 25 | # Fitting model 26 | fit_knn = KNeighborsClassifier(n_neighbors=3) 27 | 28 | # Training model 29 | fit_knn.fit(training_set, 30 | class_set) 31 | # --------------------------------------------------------------- 32 | if __name__ == '__main__': 33 | # Print model parameters ------------------------------------ 34 | print(fit_knn, '\n') 35 | 36 | # Optimal K ------------------------------------------------- 37 | # Inspired by: 38 | # https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/ 39 | 40 | myKs = [] 41 | for i in range(0, 50): 42 | if (i % 2 != 0): 43 | myKs.append(i) 44 | 45 | cross_vals = [] 46 | for k in myKs: 47 | knn = KNeighborsClassifier(n_neighbors=k) 48 | scores = cross_val_score(knn, 49 | training_set, 50 | class_set, 51 | cv = 10, 52 | scoring='accuracy') 53 | cross_vals.append(scores.mean()) 54 | 55 | MSE = [1 - x for x in cross_vals] 56 | optimal_k = myKs[MSE.index(min(MSE))] 57 | print("Optimal K is {0}".format(optimal_k), '\n') 58 | 59 | # Initialize function for metrics --------------------------- 60 | fit_dict_knn = produce_model_metrics(fit_knn, 61 | test_set, 62 | test_class_set, 63 | 'knn') 64 | # Extract each piece from dictionary 65 | predictions_knn = fit_dict_knn['predictions'] 66 | accuracy_knn = fit_dict_knn['accuracy'] 67 | auc_knn = fit_dict_knn['auc'] 68 | 69 | # Test Set Calculations ------------------------------------- 70 | # Test error rate 71 | test_error_rate_knn = 1 - accuracy_knn 72 | 73 | # Confusion Matrix 74 | test_crosstb = hf.create_conf_mat(test_class_set, 75 | predictions_knn) 76 | 77 | print('Cross Validation:') 78 | hf.cross_val_metrics(fit_knn, 79 | training_set, 80 | class_set, 81 | 'knn', 82 | print_results = True) 83 | 84 | print('Confusion Matrix:') 85 | print(test_crosstb, '\n') 86 | 87 | print("Here is our accuracy for our test set:\n {0: .3f}"\ 88 | .format(accuracy_knn)) 89 | 90 | print("The test error rate for our model is:\n {0: .3f}"\ 91 | .format(test_error_rate_knn)) 92 | 
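# Side note on the optimal-K search above: building the odd-K candidate list
# with a loop over range(0, 50) works, but a stepped range is the more
# idiomatic one-liner (an equivalent alternative, not what the script above
# actually executes):
#
#     myKs = list(range(1, 50, 2))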
-------------------------------------------------------------------------------- /dash_dashboard/global_vars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import pandas as pd 4 | from sklearn.externals import joblib 5 | from urllib.request import urlopen 6 | from io import StringIO 7 | 8 | # Importing src python scripts ---------------------- 9 | sys.path.insert(0, '../src/python/') 10 | from knn import fit_knn 11 | from random_forest import fit_rf 12 | from neural_networks import fit_nn 13 | from data_extraction import test_set_scaled 14 | from data_extraction import test_set, test_class_set 15 | from helper_functions import create_conf_mat 16 | from produce_model_metrics import produce_model_metrics 17 | sys.path.pop(0) 18 | 19 | # Calling up metrics from the model scripts 20 | # KNN ----------------------------------------------- 21 | metrics_knn = produce_model_metrics(fit_knn, test_set, 22 | test_class_set, 'knn') 23 | # Call each value from dictionary 24 | predictions_knn = metrics_knn['predictions'] 25 | accuracy_knn = metrics_knn['accuracy'] 26 | fpr = metrics_knn['fpr'] 27 | tpr = metrics_knn['tpr'] 28 | auc_knn = metrics_knn['auc'] 29 | 30 | test_error_rate_knn = 1 - accuracy_knn 31 | 32 | # Confusion Matrix 33 | cross_tab_knn = create_conf_mat(test_class_set, 34 | predictions_knn) 35 | 36 | # RF ------------------------------------------------ 37 | metrics_rf = produce_model_metrics(fit_rf, test_set, 38 | test_class_set, 'rf') 39 | # Call each value from dictionary 40 | predictions_rf = metrics_rf['predictions'] 41 | accuracy_rf = metrics_rf['accuracy'] 42 | fpr2 = metrics_rf['fpr'] 43 | tpr2 = metrics_rf['tpr'] 44 | auc_rf = metrics_rf['auc'] 45 | 46 | test_error_rate_rf = 1 - accuracy_rf 47 | 48 | cross_tab_rf = create_conf_mat(test_class_set, 49 | predictions_rf) 50 | 51 | # NN ---------------------------------------- 52 | metrics_nn = produce_model_metrics(fit_nn, test_set_scaled, 53 | test_class_set, 'nn') 54 | 55 | # Call each value from dictionary 56 | predictions_nn = metrics_nn['predictions'] 57 | accuracy_nn = metrics_nn['accuracy'] 58 | fpr3 = metrics_nn['fpr'] 59 | tpr3 = metrics_nn['tpr'] 60 | auc_nn = metrics_nn['auc'] 61 | 62 | test_error_rate_nn = 1 - accuracy_nn 63 | 64 | cross_tab_nn = create_conf_mat(test_class_set, 65 | predictions_nn) 66 | 67 | # Classification Report Stuff 68 | def create_class_report(class_report_string): 69 | class_report_mod = StringIO(class_report_string) 70 | class_report = pd.read_csv(class_report_mod, ',') 71 | return class_report 72 | 73 | 74 | class_rep_knn_str = """ 75 | Class, Precision, Recall, F1-score, Support 76 | Benign, 0.96, 0.93, 0.94, 73 77 | Malignant, 0.88, 0.93, 0.90, 41 78 | Avg/Total, 0.93, 0.93, 0.93, 114 79 | """ 80 | 81 | class_rep_knn = create_class_report(class_rep_knn_str) 82 | 83 | class_rep_rf_str = """ 84 | Class, Precision, Recall, F1-score, Support 85 | Benign, 0.99, 0.96, 0.97, 73 86 | Malignant, 0.93, 0.98, 0.95, 41 87 | Avg/Total, 0.97, 0.96, 0.97, 114 88 | """ 89 | 90 | class_rep_rf = create_class_report(class_rep_rf_str) 91 | 92 | class_rep_nn_str = """ 93 | Class, Precision, Recall, F1-score, Support 94 | Benign , 0.99, 0.97, 0.98, 73 95 | Malignant, 0.95, 0.98, 0.96, 41 96 | Avg/Total, 0.97, 0.97, 0.97, 114 97 | """ 98 | 99 | class_rep_nn = create_class_report(class_rep_nn_str) 100 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Machine Learning Techniques on Breast Cancer Wisconsin Data Set
2 | 
3 | **Contributor**:
4 | + Raul Eulogio
5 | 
6 | I created this repo as a way to get better acquainted with **Python** as a language and as a tool for data analysis. But it eventually became an exercise in utilizing various programming languages for machine learning applications.
7 | 
8 | I employed three **Machine Learning** techniques:
9 | + **Kth Nearest Neighbor**
10 | + **Random Forest**
11 | + **Neural Networks**
12 | 
13 | A walk-through of the analysis is available on [inertia7](https://www.inertia7.com/projects/3), including runnable code as well as explanations of the exploratory analysis. This [project](https://www.inertia7.com/projects/95) contains an overview of *random forest*; explanations for the other algorithms are in the works.
14 | 
15 | The repository includes the *src* folder, which contains scripts in these programming languages (ordered from most to least detailed):
16 | + *Python*
17 | + *R*
18 | + *PySpark*
19 | 
20 | This repo is primarily concerned with the *python* iteration.
21 | 
22 | The *python* analysis is broken into 5 sections (one script per section), in the following order:
23 | + **Exploratory Analysis**
24 | + **Kth Nearest Neighbors**
25 | + **Random Forest**
26 | + **Neural Networks**
27 | + **Comparing Models**
28 | 
29 | **NOTE**: The files `data_extraction.py`, `helper_functions.py`, and `produce_model_metrics.py` abstract shared functions to make the code easier to read. These files do a lot of the work, so if you are interested in how the scripts function, definitely check them out.
30 | 
31 | ## Running .py Script
32 | A `virtualenv` is recommended; inside it, install the necessary packages from `requirements.txt` using:
33 | 
34 |     pip3 install -r requirements.txt
35 | 
36 | Once this is done, you can run the scripts using the usual terminal command:
37 | 
38 |     $ python3 exploratory_analysis.py
39 | 
40 | **NOTE**: You can also run a script by making it executable:
41 | 
42 |     $ chmod +x exploratory_analysis.py
43 | 
44 | 
45 | **Remember**: You must have a *shebang* for this to run, i.e. this must be at the very beginning of your script:
46 | 
47 |     #!/usr/bin/env python3
48 | 
49 | Then simply run it (using **Random Forest** as an example):
50 | 
51 |     $ ./random_forest.py
52 | 
53 | ## Conclusions
54 | Once all these methods were employed, I compared the models using various machine learning metrics. Each model provided valuable insight: *Kth Nearest Neighbor* created a baseline model against which to compare the more complex models; *Random Forest* shows which variables were important across the bootstrapped decision trees; and *Neural Networks* produced the fewest false negatives, the error that matters most here, since a false negative is a malignant tumor classified as benign, which in this context can mean death.
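
For reference, here is how the false-negative counts reported in the table below can be read off a `sklearn` confusion matrix (a minimal sketch; `test_class_set` and `predictions_rf` are the names used in the model scripts, here using **Random Forest**):

    from sklearn.metrics import confusion_matrix

    # Rows are true classes, columns are predicted classes; with
    # labels [0, 1] (benign, malignant) the 2x2 layout ravels to
    # (tn, fp, fn, tp)
    tn, fp, fn, tp = confusion_matrix(test_class_set, predictions_rf).ravel()
    print("False negatives (malignant predicted as benign):", fn)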
55 | 
56 | ### Diagnostics for Data Set
57 | 
58 | 
59 | | Model/Algorithm | Test Error Rate | False Negative for Test Set | Area under the Curve for ROC | Cross Validation Score | Hyperparameter Optimization |
60 | |----------------------|-----------------|-----------------------------|------------------------------|-------------------------------|-----------------------|
61 | | Kth Nearest Neighbor | 0.07 | 5 | 0.980 | Accuracy: 0.925 (+/- 0.025) | Optimal *K* is 3 |
62 | | Random Forest | 0.035 | 3 | 0.996 | Accuracy: 0.963 (+/- 0.013) | {'max_features': 'log2', 'max_depth': 3, 'bootstrap': True, 'criterion': 'gini'} |
63 | | Neural Networks | 0.035 | 1 | 0.982 | Accuracy: 0.967 (+/- 0.011) | {'hidden_layer_sizes': 12, 'activation': 'tanh', 'learning_rate_init': 0.05} |
64 | 
65 | 
66 | 
67 | #### ROC Curves for Data Set
68 | 
69 | 
70 | #### ROC Curves zoomed in
71 | 
72 | 
73 | The ROC curves suggest that **Random Forest** is the strongest model for prediction.
74 | 
75 | Any feedback is welcome!
76 | 
77 | Things to do:
78 | + Create **Jupyter Notebook** for *KNN* and *NN* (1/25/2018)
79 | + Unit test scripts
80 | 
--------------------------------------------------------------------------------
/src/python/random_forest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | #####################################################
4 | ##   WISCONSIN BREAST CANCER - MACHINE LEARNING    ##
5 | #####################################################
6 | #
7 | # Project by Raul Eulogio
8 | #
9 | # Project found at: https://www.inertia7.com/projects/3
10 | #
11 | 
12 | """
13 | Random Forest Classification
14 | """
15 | # Import Packages -----------------------------------------------
16 | import time
17 | import sys
18 | from numpy import argsort
19 | import pandas as pd
20 | import helper_functions as hf
21 | from data_extraction import names_index
22 | from data_extraction import training_set, class_set
23 | from data_extraction import test_set, test_class_set
24 | from sklearn.ensemble import RandomForestClassifier
25 | from produce_model_metrics import produce_model_metrics
26 | 
27 | # Fitting Random Forest -----------------------------------------
28 | # Set the random state for reproducibility
29 | fit_rf = RandomForestClassifier(random_state=42)
30 | 
31 | ## Set best parameters given by grid search
32 | fit_rf.set_params(criterion = 'gini',
33 |     max_features = 'log2',
34 |     max_depth = 3,
35 |     n_estimators=400)
36 | 
37 | # Fit model on training data
38 | fit_rf.fit(training_set,
39 |     class_set)
40 | 
41 | # Tree Specific -------------------------------------------------
42 | 
43 | # Extracting feature importance
44 | var_imp_rf = hf.variable_importance(fit_rf)
45 | 
46 | importances_rf = var_imp_rf['importance']
47 | 
48 | indices_rf = var_imp_rf['index']
49 | 
50 | if __name__=='__main__':
51 |     # Print model parameters ------------------------------------
52 |     print(fit_rf, '\n')
53 | 
54 |     # Initialize function for metrics ---------------------------
55 |     fit_dict_rf = produce_model_metrics(fit_rf,
56 |         test_set,
57 |         test_class_set,
58 |         'rf')
59 | 
60 |     # Extract each piece from dictionary
61 |     predictions_rf = fit_dict_rf['predictions']
62 |     accuracy_rf = fit_dict_rf['accuracy']
63 |     auc_rf = fit_dict_rf['auc']
64 | 
65 |     print("Hyperparameter Optimization:")
66 |     print("chosen parameters: \n \
67 |     {'max_features': 'log2', \n \
68 |     'max_depth': 3, \n \
69 |     'bootstrap': True, \n \
70 |     'criterion': 'gini'}")
71 |     print("Note: Remove commented code to see this section \n")
72 |     # from sklearn.model_selection import GridSearchCV
73 |     # np.random.seed(42)
74 |     # start = time.time()
75 |     # param_dist = {'max_depth': [2, 3, 4],
76 |     #     'bootstrap': [True, False],
77 |     #     'max_features': ['auto', 'sqrt',
78 |     #     'log2', None],
79 |     #     'criterion': ['gini', 'entropy']}
80 |     # cv_rf = GridSearchCV(fit_rf, cv = 10,
81 |     #     param_grid=param_dist,
82 |     #     n_jobs = 3)
83 |     # cv_rf.fit(training_set, class_set)
84 |     # print('Best Parameters using grid search: \n',
85 |     #     cv_rf.best_params_)
86 |     # end = time.time()
87 |     # print('Time taken in grid search: {0: .2f}'\
88 |     #.format(end - start))
89 | 
90 |     # Test Set Calculations -------------------------------------
91 |     # Test error rate
92 |     test_error_rate_rf = 1 - accuracy_rf
93 | 
94 |     # Confusion Matrix
95 |     test_crosstb = hf.create_conf_mat(test_class_set,
96 |         predictions_rf)
97 | 
98 |     # Print Variable Importance
99 |     hf.print_var_importance(importances_rf, indices_rf, names_index)
100 | 
101 |     # Cross validation
102 |     print('Cross Validation:')
103 |     hf.cross_val_metrics(fit_rf,
104 |         training_set,
105 |         class_set,
106 |         'rf',
107 |         print_results = True)
108 | 
109 |     print('Confusion Matrix:')
110 |     print(test_crosstb, '\n')
111 | 
112 |     print("Here is our mean accuracy on the test set:\n {0: 0.3f}"\
113 |         .format(accuracy_rf))
114 | 
115 |     print("The test error rate for our model is:\n {0: .3f}"\
116 |         .format(test_error_rate_rf))
117 | 
--------------------------------------------------------------------------------
/src/r/random_forest.R:
--------------------------------------------------------------------------------
1 | 
2 | # Load Packages
3 | suppressWarnings(library(tidyverse))
4 | suppressWarnings(library(caret))
5 | suppressWarnings(library(ggcorrplot))
6 | suppressWarnings(library(GGally))
7 | suppressWarnings(library(randomForest))
8 | suppressWarnings(library(e1071))
9 | suppressWarnings(library(ROCR))
10 | suppressWarnings(library(pROC))
11 | suppressWarnings(library(RCurl))
12 | library(here)
13 | 
14 | # Load Data
15 | UCI_data_URL <- getURL('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
16 | names <- c('id_number', 'diagnosis', 'radius_mean',
17 |     'texture_mean', 'perimeter_mean', 'area_mean',
18 |     'smoothness_mean', 'compactness_mean',
19 |     'concavity_mean','concave_points_mean',
20 |     'symmetry_mean', 'fractal_dimension_mean',
21 |     'radius_se', 'texture_se', 'perimeter_se',
22 |     'area_se', 'smoothness_se', 'compactness_se',
23 |     'concavity_se', 'concave_points_se',
24 |     'symmetry_se', 'fractal_dimension_se',
25 |     'radius_worst', 'texture_worst',
26 |     'perimeter_worst', 'area_worst',
27 |     'smoothness_worst', 'compactness_worst',
28 |     'concavity_worst', 'concave_points_worst',
29 |     'symmetry_worst', 'fractal_dimension_worst')
30 | breast_cancer <- read.table(textConnection(UCI_data_URL), sep = ',', col.names = names)
31 | 
32 | breast_cancer$id_number <- NULL
33 | 
34 | # Preview Data
35 | head(breast_cancer)
36 | 
37 | # Structure of data
38 | breast_cancer %>%
39 |     dim()
40 | breast_cancer %>%
41 |     str()
42 | 
43 | # Check distribution of Class
44 | breast_cancer %>%
45 |     count(diagnosis) %>%
46 |     group_by(diagnosis) %>%
47 |     summarize(perc_dx = round((n / 569)* 100, 2))
48 | 
49 | summary(breast_cancer)
50 | 
51 | # Create Training and Test Set
52 | set.seed(42)
53 | trainIndex <- createDataPartition(breast_cancer$diagnosis,
54 |     p = .8,
55 |     list = FALSE,
56 |     times = 1)
57 | training_set <- breast_cancer[ trainIndex, ]
58 | test_set <- 
breast_cancer[ -trainIndex, ]
59 | 
60 | # Custom grid search
61 | # From https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/
62 | customRF <- list(type = "Classification", library = "randomForest", loop = NULL)
63 | customRF$parameters <- data.frame(parameter = c("mtry", "ntree", "nodesize"), class = rep("numeric", 3), label = c("mtry", "ntree", "nodesize"))
64 | customRF$grid <- function(x, y, len = NULL, search = "grid") {}
65 | customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
66 |     randomForest(x, y, mtry = param$mtry, ntree=param$ntree, nodesize=param$nodesize, ...)
67 | }
68 | customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
69 |     predict(modelFit, newdata)
70 | customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
71 |     predict(modelFit, newdata, type = "prob")
72 | customRF$sort <- function(x) x[order(x[,1]),]
73 | customRF$levels <- function(x) x$classes
74 | 
75 | # Fitting Model
76 | fitControl <- trainControl(## 3-fold CV
77 |     method = "repeatedcv",
78 |     number = 3,
79 |     ## repeated ten times
80 |     repeats = 10)
81 | 
82 | grid <- expand.grid(.mtry=c(floor(sqrt(ncol(training_set))), (ncol(training_set) - 1), floor(log(ncol(training_set)))),
83 |     .ntree = c(100, 300, 500, 1000),
84 |     .nodesize =c(1:4))
85 | set.seed(42)
86 | fit_rf <- train(as.factor(diagnosis) ~ .,
87 |     data = training_set,
88 |     method = customRF,
89 |     metric = "Accuracy",
90 |     tuneGrid= grid,
91 |     trControl = fitControl)
92 | 
93 | # Final Model
94 | fit_rf$finalModel
95 | 
96 | # Diagnostic Plots
97 | fit_rf
98 | 
99 | suppressWarnings(ggplot(fit_rf) +
100 |     theme_bw() +
101 |     ggtitle('Line plot for Random Forest'))
102 | 
103 | # Variable Importance
104 | varImportance <- varImp(fit_rf, scale = FALSE)
105 | 
106 | varImportanceScores <- data.frame(varImportance$importance)
107 | 
108 | varImportanceScores <- data.frame(names = row.names(varImportanceScores), var_imp_scores = varImportanceScores$B)
109 | 
110 | varImportanceScores
111 | 
112 | # Visual
113 | ggplot(varImportanceScores,
114 |     aes(reorder(names, var_imp_scores), var_imp_scores)) +
115 |     geom_bar(stat='identity',
116 |         fill = '#875FDB') +
117 |     theme(panel.background = element_rect(fill = '#fafafa')) +
118 |     coord_flip() +
119 |     labs(x = 'Feature', y = 'Importance') +
120 |     ggtitle('Feature Importance for Random Forest Model')
121 | 
122 | # Out of Bag Error Rate (first 100 trees)
123 | oob_error <- data.frame(trees = 1:100, oob = fit_rf$finalModel$err.rate[1:100, 'OOB'])
124 | 
125 | paste0('Out of Bag Error Rate for model is: ', round(oob_error[100, 2], 4))
126 | 
127 | ggplot(oob_error, aes(trees, oob)) +
128 |     geom_line(colour = 'red') +
129 |     theme_minimal() +
130 |     ggtitle('OOB Error Rate across 100 trees') +
131 |     labs(y = 'OOB Error Rate')
132 | 
133 | # Test Set Predictions
134 | predict_values <- predict(fit_rf, newdata = test_set)
--------------------------------------------------------------------------------
/src/r/breastCancer.R:
--------------------------------------------------------------------------------
1 | # LOAD PACKAGES
2 | setwd('/home/rxe/myProjects/breastCancer/scripts/r')
3 | 
4 | require(tidyverse)
5 | 
require(caret)
6 | require(ggcorrplot)
7 | require(GGally)
8 | require(class)
9 | require(randomForest)
10 | require(nnet)
11 | require(e1071)
12 | require(pROC)
13 | require(data.table) # used for data.table() below
14 | 
15 | # EXPLORATORY ANALYSIS
16 | breastCancer <- read_csv('wdbc.data.txt')
17 | breast_simp <- read_csv('breast_cancer.txt')
18 | 
19 | View(inner_join(breastCancer, breast_simp, by = "id_number"))
20 | 
21 | head(breastCancer)
22 | dim(breastCancer)
23 | 
24 | # REMOVING 'id_number'
25 | breastCancer$id_number <- NULL
26 | 
27 | table(breastCancer$diagnosis)
28 | summary(breastCancer)
29 | 
30 | # Scatterplot Matrix
31 | p <- ggpairs(data = breastCancer,
32 |     columns = c('concave_points_worst', 'concavity_mean',
33 |         'perimeter_worst', 'radius_worst',
34 |         'area_worst', 'diagnosis'),
35 |     mapping = aes(color = diagnosis)) +
36 |     theme(panel.background = element_rect(fill = '#fafafa')) +
37 |     ggtitle('Scatter Plot Matrix')
38 | 
39 | # MANUALLY CHANGING COLORS OF PLOT
40 | # BORROWED FROM: https://stackoverflow.com/questions/34740210/how-to-change-the-color-palette-for-ggallyggpairs
41 | for(i in 1:p$nrow) {
42 |     for(j in 1:p$ncol){
43 |         p[i,j] <- p[i,j] +
44 |             scale_fill_manual(values=c("red", "#875FDB")) +
45 |             scale_color_manual(values=c("red", "#875FDB"))
46 |     }
47 | }
48 | 
49 | p
50 | # Pearson Correlation
51 | corr <- round(cor(breastCancer[, 2:31]), 2)
52 | ggcorrplot(corr,
53 |     colors = c('red', 'white', '#875FDB')) +
54 |     ggtitle('Pearson Correlation Matrix')
55 | 
56 | # Box Plot
57 | ggplot(data = stack(breastCancer),
58 |     aes(x = ind, y = values)) +
59 |     geom_boxplot() +
60 |     coord_flip(ylim = c(-.05, 50)) +
61 |     theme(panel.background = element_rect(fill = '#fafafa')) +
62 |     ggtitle('Box Plot of Unprocessed Data')
63 | 
64 | # NORMALIZING
65 | preprocessparams <- preProcess(breastCancer[, 3:31], method=c('range'))
66 | 
67 | breastCancerNorm <- predict(preprocessparams, breastCancer[, 3:31])
68 | 
69 | breastCancerNorm <- data.table(breastCancerNorm, diagnosis = breastCancer$diagnosis)
70 | 
71 | summary(breastCancerNorm)
72 | # Box Plot of Normalized data
73 | ggplot(data = stack(breastCancerNorm),
74 |     aes(x = ind, y = values)) +
75 |     geom_boxplot() +
76 |     coord_flip(ylim = c(-.05, 1.05)) +
77 |     theme(panel.background = element_rect(fill = '#fafafa')) +
78 |     ggtitle('Box Plot of Normalized Data')
79 | 
80 | # TRAINING AND TEST SET
81 | breastCancer$diagnosis <- gsub('M', 1, breastCancer$diagnosis)
82 | breastCancer$diagnosis <- gsub('B', 0, breastCancer$diagnosis)
83 | 
84 | breastCancer$diagnosis <- as.numeric(breastCancer$diagnosis)
85 | 
86 | set.seed(42)
87 | trainIndex <- createDataPartition(breastCancer$diagnosis,
88 |     p = .8,
89 |     list = FALSE,
90 |     times = 1)
91 | 
92 | training_set <- breastCancer[ trainIndex, ]
93 | test_set <- breastCancer[ -trainIndex, ]
94 | ## Kth Nearest Neighbor
95 | 
96 | # TRAINING AND TEST SETS ARE SET UP DIFFERENTLY FOR KNN
97 | # SO HERE WE'RE DOING THAT
98 | # Initialize a class set as vector
99 | class_set <- as.vector(training_set$diagnosis)
100 | 
101 | test_set_knn <- test_set
102 | training_set_knn <- training_set
103 | test_set_knn$diagnosis <- NULL
104 | training_set_knn$diagnosis <- NULL
105 | 
106 | head(test_set_knn)
107 | 
108 | # FITTING MODEL
109 | fit_knn <- knn(training_set_knn, test_set_knn, class_set, k = 7)
110 | 
111 | # TEST SET EVALUATIONS
112 | table(test_set$diagnosis, fit_knn)
113 | 
114 | # TEST ERROR RATE: 0.063
115 | 
116 | ## RANDOM FOREST
117 | # FITTING MODEL
118 | fitControl <- trainControl(## 10-fold CV
119 |     method = "repeatedcv",
120 |     number = 10,
121 |     ## repeated ten times
122 |     repeats = 10)
123 | 
124 | set.seed(42)
125 | fit_rf <- train(as.factor(diagnosis) ~ .,
126 |     data = training_set,
127 |     method = "rf",
128 |     trControl = fitControl)
129 | 
130 | fit_rf$finalModel
131 | 
132 | # VARIABLE IMPORTANCE
133 | varImportance <- varImp(fit_rf, scale = FALSE)
134 | 
135 | varImportanceScores <- data.table(varImportance$importance, names = colnames(breastCancer[, 2:31]))
136 | 
137 | varImportanceScores
138 | 
139 | # VISUAL OF VARIABLE IMPORTANCE
140 | ggplot(varImportanceScores,
141 |     aes(reorder(names, Overall), Overall)) +
142 |     geom_bar(stat='identity',
143 |         fill = '#875FDB') +
144 |     theme(panel.background = element_rect(fill = '#fafafa')) +
145 |     coord_flip() +
146 |     labs(x = 'Feature', y = 'Importance') +
147 |     ggtitle('Feature Importance for Random Forest Model')
148 | 
149 | # TEST SET EVALUATIONS
150 | predict_values <- predict(fit_rf, newdata = test_set)
151 | 
152 | table(predict_values, test_set$diagnosis)
153 | 
154 | # TEST ERROR RATE: 0.027
155 | 
156 | # NEURAL NETWORKS
157 | 
158 | # CREATING NORMALIZED TRAINING AND TEST SET
159 | set.seed(42)
160 | trainIndex_norm <- createDataPartition(breastCancerNorm$diagnosis,
161 |     p = .8,
162 |     list = FALSE,
163 |     times = 1)
164 | 
165 | training_set_norm <- breastCancerNorm[ trainIndex_norm, ]
166 | test_set_norm <- breastCancerNorm[ -trainIndex_norm, ]
167 | 
168 | training_set_norm
169 | 
170 | fit_nn <- train(as.factor(diagnosis) ~ .,
171 |     data = training_set_norm,
172 |     method = "nnet",
173 |     hidden = 3,
174 |     algorithm = 'backprop')
175 | 
176 | fit_nn$finalModel
177 | plot(fit_nn)
178 | predict_val_nn <- predict(fit_nn, newdata = test_set_norm)
179 | 
180 | table(predict_val_nn, test_set_norm$diagnosis)
181 | 
182 | # TEST ERROR RATE: 0.035
--------------------------------------------------------------------------------
/src/pyspark/breast_cancer_df.py:
--------------------------------------------------------------------------------
1 | # Load packages
2 | from pyspark.sql.functions import col
3 | from pyspark.ml.classification import RandomForestClassifier
4 | from pyspark.ml.classification import DecisionTreeClassifier
5 | from pyspark.ml.classification import MultilayerPerceptronClassifier
6 | from pyspark.ml.feature import StringIndexer
7 | from pyspark.ml.feature import MinMaxScaler
8 | from pyspark.ml.feature import VectorAssembler
9 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
10 | # (the SparkContext `sc` below is provided by the pyspark shell or Zeppelin)
11 | 
12 | 
13 | rdd = sc.textFile('data/data.txt').map(lambda lines: lines.split(" "))
14 | 
15 | df = rdd.toDF()
16 | 
17 | data = df.selectExpr('_1 as label', '_2 as radius_mean',
18 |     '_3 as texture_mean', '_4 as perimeter_mean',
19 |     '_5 as area_mean', '_6 as smoothness_mean',
20 |     '_7 as compactness_mean', '_8 as concavity_mean',
21 |     '_9 as concave_points_mean', '_10 as symmetry_mean',
22 |     '_11 as fractal_dimension_mean', '_12 as radius_se',
23 |     '_13 as texture_se', '_14 as perimeter_se',
24 |     '_15 as area_se', '_16 as smoothness_se',
25 |     '_17 as compactness_se', '_18 as concavity_se',
26 |     '_19 as concave_points_se', '_20 as symmetry_se',
27 |     '_21 as fractal_dimension_se', '_22 as radius_worst',
28 |     '_23 as texture_worst', '_24 as perimeter_worst',
29 |     '_25 as area_worst', '_26 as smoothness_worst',
30 |     '_27 as compactness_worst', '_28 as concavity_worst',
31 |     '_29 as concave_points_worst', '_30 as symmetry_worst',
32 |     '_31 as fractal_dimension_worst')
33 | 
34 | 
35 | # Converting to correct data types
36 | newData = data.select([col(c).cast('float') if c != 'label' else col(c).cast('int') for c in data.columns ]) 37 | 38 | # For loops to output the describe functionality neatly 39 | mylist = [] 40 | mylist2 = [] 41 | for i in range(0, 31): 42 | if (i % 2 != 0): 43 | mylist.append(newData.columns[i]) 44 | else: 45 | mylist2.append(newData.columns[i]) 46 | 47 | # Now we use the newly created lists that have even and odd columns respectively 48 | # to see some basic statistics for our dataset 49 | for i in range(0, 15): 50 | newData.describe(mylist[i], mylist2[i]).show() 51 | 52 | # Important meta-data inputting for when I start running models! 53 | # Meta-data for the feature space 54 | featureIndexer = VectorAssembler( 55 | inputCols = [x for x in newData.columns if x != 'label'], 56 | outputCol = 'features') 57 | 58 | df = featureIndexer.transform(newData) 59 | 60 | # Some tests to see if things came out properly 61 | df.select(df['features']).show() 62 | df.select(df['label']).show() 63 | 64 | # Creating training and test sets 65 | (trainingSet, testSet) = df.randomSplit([0.7, 0.3]) 66 | 67 | #################### 68 | ## DECISION TREES ## 69 | #################### 70 | 71 | # Creating training and test sets 72 | 73 | dt = DecisionTreeClassifier(labelCol="label", 74 | featuresCol = "features") 75 | 76 | #pipeline_dt = Pipeline(stages=[labelIndexer0, featureIndexer0, dt]) 77 | 78 | model_dt = dt.fit(trainingSet) 79 | 80 | predictions_dt = model_dt.transform(testSet) 81 | 82 | # Select example rows to display. 83 | predictions_dt.select("prediction", 84 | "label", 85 | "features").show(5) 86 | 87 | # Select (prediction, true label) and compute test error 88 | evaluator_dt = MulticlassClassificationEvaluator( 89 | labelCol="label", 90 | predictionCol="prediction", 91 | metricName="accuracy") 92 | 93 | accuracy_dt = evaluator_dt.evaluate(predictions_dt) 94 | 95 | print("Test Error = %g " % (1.0 - accuracy_dt)) 96 | ''' 97 | Test Error = 0.0697674 98 | ''' 99 | 100 | ######################### 101 | ## Random Forest Model ## 102 | ######################### 103 | 104 | rf = RandomForestClassifier(labelCol='label', 105 | maxDepth=4, 106 | impurity="gini", 107 | numTrees=500, 108 | seed=42) 109 | 110 | model_rf = rf.fit(trainingSet) 111 | 112 | predictions_rf = model_rf.transform(testSet) 113 | 114 | predictions_rf.select("prediction", "label", "features").show(10) 115 | 116 | ''' 117 | +----------+-----+--------------------+ 118 | |prediction|label| features| 119 | +----------+-----+--------------------+ 120 | | 0.0| 0|[0.0,0.1258031932...| 121 | | 0.0| 0|[0.05859245005374...| 122 | | 0.0| 0|[0.07652986773450...| 123 | | 0.0| 0|[0.07747645570059...| 124 | | 0.0| 0|[0.07998483256627...| 125 | | 0.0| 0|[0.09025507729212...| 126 | | 0.0| 0|[0.09318944582402...| 127 | | 0.0| 0|[0.11756354432107...| 128 | | 0.0| 0|[0.11940932766481...| 129 | | 0.0| 0|[0.13280324046146...| 130 | +----------+-----+--------------------+ 131 | only showing top 10 rows 132 | ''' 133 | 134 | evaluator_rf = MulticlassClassificationEvaluator(labelCol="label", 135 | predictionCol="prediction", 136 | metricName="accuracy") 137 | 138 | accuracy_rf = evaluator_rf.evaluate(predictions_rf) 139 | print("Test Error = %g" % (1.0 - accuracy_rf)) 140 | ''' 141 | Test Error = 0.0223 142 | ''' 143 | 144 | ##################### 145 | ## NEURAL NETWORKS ## 146 | ##################### 147 | 148 | ######################## 149 | ## RESCALING DATA SET ## 150 | ######################## 151 | # Typically for Neural Networks to perform better 
152 | # a lot of preprocessing has to go into the data 153 | # So I scaled the feature space to have min = 0 and max = 1 154 | 155 | scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures') 156 | 157 | scalerModel = scaler.fit(df) 158 | 159 | scaledData = scalerModel.transform(df) 160 | 161 | print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax())) 162 | 163 | scaledData.select("features", "scaledFeatures").show() 164 | 165 | new_df = scaledData.selectExpr("label", "radius_mean", "texture_mean", 166 | "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", 167 | "concavity_mean", "concave_points_mean", "symmetry_mean", 168 | "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", 169 | "area_se", "smoothness_se", "compactness_se", "concavity_se", 170 | "concave_points_se", "symmetry_se", "fractal_dimension_se", 171 | "radius_worst", "texture_worst", "perimeter_worst", 172 | "area_worst", "smoothness_worst", "compactness_worst", 173 | "concavity_worst", "concave_points_worst", "symmetry_worst", 174 | "fractal_dimension_worst","features as oldFeature", 175 | "scaledFeatures as features") 176 | 177 | # Creating training and test sets 178 | (trainingSet_scaled, testSet_scaled) = new_df\ 179 | .randomSplit([0.7, 0.3]) 180 | 181 | layers = [30, 5, 4, 2] 182 | 183 | trainer = MultilayerPerceptronClassifier(maxIter=100, 184 | layers=layers, 185 | blockSize=128, 186 | seed=1234) 187 | 188 | model_nn = trainer.fit(trainingSet_scaled) 189 | 190 | result_nn = model_nn.transform(testSet_scaled) 191 | predictions_nn = result_nn.select("prediction", "label") 192 | evaluator_nn = MulticlassClassificationEvaluator(metricName="accuracy") 193 | 194 | accuracy_nn = evaluator_nn.evaluate(predictions_nn) 195 | 196 | print("Test Error = %g" % (1.0 - accuracy_nn)) 197 | ''' 198 | Test Error = 0.0314465 199 | ''' -------------------------------------------------------------------------------- /src/python/model_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ##################################################### 4 | ## WISCONSIN BREAST CANCER MACHINE LEARNING ## 5 | ##################################################### 6 | 7 | # Project by Raul Eulogio 8 | 9 | # Project found at: https://www.inertia7.com/projects/3 10 | 11 | 12 | """ 13 | Model Evaluation 14 | """ 15 | # Import Packages ----------------------------------------------- 16 | import matplotlib.pyplot as plt 17 | from knn import fit_knn 18 | from random_forest import fit_rf 19 | from neural_networks import fit_nn 20 | from data_extraction import training_set, class_set 21 | from data_extraction import test_set, test_class_set 22 | from data_extraction import training_set_scaled, test_set_scaled 23 | from helper_functions import cross_val_metrics 24 | from produce_model_metrics import produce_model_metrics 25 | from terminaltables import AsciiTable 26 | from sklearn.metrics import classification_report 27 | 28 | 29 | 30 | # Calling up metrics from the model scripts 31 | # KNN ----------------------------------------------------------- 32 | metrics_knn = produce_model_metrics(fit_knn, test_set, 33 | test_class_set, 'knn') 34 | # Call each value from dictionary 35 | predictions_knn = metrics_knn['predictions'] 36 | accuracy_knn = metrics_knn['accuracy'] 37 | fpr = metrics_knn['fpr'] 38 | tpr = metrics_knn['tpr'] 39 | auc_knn = metrics_knn['auc'] 40 | 41 | # Test Error Rate 42 | test_error_rate_knn = 1 - accuracy_knn 
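# The false-negative counts in the summary table further down are hard-coded
# literals (5, 3 and 1). A sketch of how they could be derived instead,
# assuming the malignant class is encoded as 1 in test_class_set:
#
#     from sklearn.metrics import confusion_matrix
#     tn, fp, fn, tp = confusion_matrix(test_class_set, predictions_knn).ravel()
#
# fn would then replace the hard-coded value for the KNN row.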
43 | 
44 | # Cross Validated Score
45 | mean_cv_knn, std_error_knn = cross_val_metrics(fit_knn,
46 | training_set,
47 | class_set,
48 | 'knn',
49 | print_results = False)
50 | 
51 | # RF ------------------------------------------------------------
52 | metrics_rf = produce_model_metrics(fit_rf, test_set,
53 | test_class_set, 'rf')
54 | # Call each value from dictionary
55 | predictions_rf = metrics_rf['predictions']
56 | accuracy_rf = metrics_rf['accuracy']
57 | fpr2 = metrics_rf['fpr']
58 | tpr2 = metrics_rf['tpr']
59 | auc_rf = metrics_rf['auc']
60 | 
61 | # Test Error Rate
62 | test_error_rate_rf = 1 - accuracy_rf
63 | 
64 | # Cross Validated Score
65 | mean_cv_rf, std_error_rf = cross_val_metrics(fit_rf,
66 | training_set,
67 | class_set,
68 | 'rf',
69 | print_results = False)
70 | 
71 | # NN ------------------------------------------------------------
72 | metrics_nn = produce_model_metrics(fit_nn, test_set_scaled,
73 | test_class_set, 'nn')
74 | 
75 | # Call each value from dictionary
76 | predictions_nn = metrics_nn['predictions']
77 | accuracy_nn = metrics_nn['accuracy']
78 | fpr3 = metrics_nn['fpr']
79 | tpr3 = metrics_nn['tpr']
80 | auc_nn = metrics_nn['auc']
81 | 
82 | # Test Error Rate
83 | test_error_rate_nn = 1 - accuracy_nn
84 | 
85 | # Cross Validated Score
86 | mean_cv_nn, std_error_nn = cross_val_metrics(fit_nn,
87 | training_set_scaled,
88 | class_set,
89 | 'nn',
90 | print_results = False)
91 | 
92 | # Main ----------------------------------------------------------
93 | if __name__ == '__main__':
94 | # Populate list for human readable table from terminal line
95 | table_data = [[ 'Model/Algorithm', 'Test Error Rate',
96 | 'False Negative for Test Set', 'Area under the Curve for ROC',
97 | 'Cross Validation Score'],
98 | ['Kth Nearest Neighbor',
99 | round(test_error_rate_knn, 3),
100 | 5,
101 | round(auc_knn, 3),
102 | "Accuracy: {0: 0.3f} (+/- {1: 0.3f})"\
103 | .format(mean_cv_knn, std_error_knn)],
104 | [ 'Random Forest',
105 | round(test_error_rate_rf, 3),
106 | 3,
107 | round(auc_rf, 3),
108 | "Accuracy: {0: 0.3f} (+/- {1: 0.3f})"\
109 | .format(mean_cv_rf, std_error_rf)],
110 | [ 'Neural Networks' ,
111 | round(test_error_rate_nn, 3),
112 | 1,
113 | round(auc_nn, 3),
114 | "Accuracy: {0: 0.3f} (+/- {1: 0.3f})"\
115 | .format(mean_cv_nn, std_error_nn)]]
116 | 
117 | # convert to AsciiTable from terminaltables package
118 | table = AsciiTable(table_data)
119 | 
120 | target_names = ['Benign', 'Malignant']
121 | 
122 | print('Classification Report for Kth Nearest Neighbor:')
123 | print(classification_report(test_class_set,
124 | predictions_knn,
125 | target_names = target_names))
126 | 
127 | print('Classification Report for Random Forest:')
128 | print(classification_report(test_class_set,
129 | predictions_rf,
130 | target_names = target_names))
131 | 
132 | print('Classification Report for Neural Networks:')
133 | print(classification_report(test_class_set,
134 | predictions_nn,
135 | target_names = target_names))
136 | 
137 | print("Comparison of different metrics relating to model evaluation:")
138 | print(table.table)
139 | 
140 | # Plotting ROC Curves----------------------------------------
141 | f, ax = plt.subplots(figsize=(10, 10))
142 | 
143 | plt.plot(fpr, tpr, label='Kth Nearest Neighbor ROC Curve (area = {0: .3f})'\
144 | .format(auc_knn),
145 | color = 'deeppink',
146 | linewidth=1)
147 | plt.plot(fpr2, tpr2,label='Random Forest ROC Curve (area = {0: .3f})'\
148 | .format(auc_rf),
149 | color = 'red',
150 | linestyle=':',
151 | linewidth=2)
152 | plt.plot(fpr3, 
tpr3,label='Neural Networks ROC Curve (area = {0: .3f})'\ 153 | .format(auc_nn), 154 | color = 'purple', 155 | linestyle=':', 156 | linewidth=3) 157 | 158 | ax.set_axis_bgcolor('#fafafa') 159 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 160 | plt.plot([0, 0], [1, 0], 'k--', lw=2, color = 'black') 161 | plt.plot([1, 0], [1, 1], 'k--', lw=2, color = 'black') 162 | plt.xlim([-0.01, 1.0]) 163 | plt.ylim([0.0, 1.05]) 164 | plt.xlabel('False Positive Rate') 165 | plt.ylabel('True Positive Rate') 166 | plt.title('ROC Curve Comparison For All Models') 167 | plt.legend(loc="lower right") 168 | plt.show() 169 | 170 | # Zoomed in 171 | f, ax = plt.subplots(figsize=(10, 10)) 172 | plt.plot(fpr, tpr, label='Kth Nearest Neighbor ROC Curve (area = {0: .3f})'\ 173 | .format(auc_knn), 174 | color = 'deeppink', 175 | linewidth=1) 176 | plt.plot(fpr2, tpr2,label='Random Forest ROC Curve (area = {0: .3f})'\ 177 | .format(auc_rf), 178 | color = 'red', 179 | linestyle=':', 180 | linewidth=3) 181 | plt.plot(fpr3, tpr3,label='Neural Networks ROC Curve (area = {0: .3f})'\ 182 | .format(auc_nn), 183 | color = 'purple', 184 | linestyle=':', 185 | linewidth=3) 186 | 187 | ax.set_axis_bgcolor('#fafafa') 188 | plt.plot([0, 1], [0, 1], 'k--', lw=2) # Add Diagonal line 189 | plt.plot([0, 0], [1, 0], 'k--', lw=2, color = 'black') 190 | plt.plot([1, 0], [1, 1], 'k--', lw=2, color = 'black') 191 | plt.xlim([-0.001, 0.2]) 192 | plt.ylim([0.7, 1.05]) 193 | plt.xlabel('False Positive Rate') 194 | plt.ylabel('True Positive Rate') 195 | plt.title('ROC Curve Comparison For All Models (Zoomed)') 196 | plt.legend(loc="lower right") 197 | plt.show() 198 | 199 | print('fin \n:)') 200 | -------------------------------------------------------------------------------- /src/r/packrat/init.R: -------------------------------------------------------------------------------- 1 | local({ 2 | 3 | ## Helper function to get the path to the library directory for a 4 | ## given packrat project. 5 | getPackratLibDir <- function(projDir = NULL) { 6 | path <- file.path("packrat", "lib", R.version$platform, getRversion()) 7 | 8 | if (!is.null(projDir)) { 9 | 10 | ## Strip trailing slashes if necessary 11 | projDir <- sub("/+$", "", projDir) 12 | 13 | ## Only prepend path if different from current working dir 14 | if (!identical(normalizePath(projDir), normalizePath(getwd()))) 15 | path <- file.path(projDir, path) 16 | } 17 | 18 | path 19 | } 20 | 21 | ## Ensure that we set the packrat library directory relative to the 22 | ## project directory. Normally, this should be the working directory, 23 | ## but we also use '.rs.getProjectDirectory()' if necessary (e.g. we're 24 | ## rebuilding a project while within a separate directory) 25 | libDir <- if (exists(".rs.getProjectDirectory")) 26 | getPackratLibDir(.rs.getProjectDirectory()) 27 | else 28 | getPackratLibDir() 29 | 30 | ## Unload packrat in case it's loaded -- this ensures packrat _must_ be 31 | ## loaded from the private library. Note that `requireNamespace` will 32 | ## succeed if the package is already loaded, regardless of lib.loc! 
33 | if ("packrat" %in% loadedNamespaces()) 34 | try(unloadNamespace("packrat"), silent = TRUE) 35 | 36 | if (suppressWarnings(requireNamespace("packrat", quietly = TRUE, lib.loc = libDir))) { 37 | 38 | # Check 'print.banner.on.startup' -- when NA and RStudio, don't print 39 | print.banner <- packrat::get_opts("print.banner.on.startup") 40 | if (print.banner == "auto" && is.na(Sys.getenv("RSTUDIO", unset = NA))) { 41 | print.banner <- TRUE 42 | } else { 43 | print.banner <- FALSE 44 | } 45 | return(packrat::on(print.banner = print.banner)) 46 | } 47 | 48 | ## Escape hatch to allow RStudio to handle bootstrapping. This 49 | ## enables RStudio to provide print output when automagically 50 | ## restoring a project from a bundle on load. 51 | if (!is.na(Sys.getenv("RSTUDIO", unset = NA)) && 52 | is.na(Sys.getenv("RSTUDIO_PACKRAT_BOOTSTRAP", unset = NA))) { 53 | Sys.setenv("RSTUDIO_PACKRAT_BOOTSTRAP" = "1") 54 | setHook("rstudio.sessionInit", function(...) { 55 | # Ensure that, on sourcing 'packrat/init.R', we are 56 | # within the project root directory 57 | if (exists(".rs.getProjectDirectory")) { 58 | owd <- getwd() 59 | setwd(.rs.getProjectDirectory()) 60 | on.exit(setwd(owd), add = TRUE) 61 | } 62 | source("packrat/init.R") 63 | }) 64 | return(invisible(NULL)) 65 | } 66 | 67 | ## Bootstrapping -- only performed in interactive contexts, 68 | ## or when explicitly asked for on the command line 69 | if (interactive() || "--bootstrap-packrat" %in% commandArgs(TRUE)) { 70 | 71 | message("Packrat is not installed in the local library -- ", 72 | "attempting to bootstrap an installation...") 73 | 74 | ## We need utils for the following to succeed -- there are calls to functions 75 | ## in 'restore' that are contained within utils. utils gets loaded at the 76 | ## end of start-up anyhow, so this should be fine 77 | library("utils", character.only = TRUE) 78 | 79 | ## Install packrat into local project library 80 | packratSrcPath <- list.files(full.names = TRUE, 81 | file.path("packrat", "src", "packrat") 82 | ) 83 | 84 | ## No packrat tarballs available locally -- try some other means of installation 85 | if (!length(packratSrcPath)) { 86 | 87 | message("> No source tarball of packrat available locally") 88 | 89 | ## There are no packrat sources available -- try using a version of 90 | ## packrat installed in the user library to bootstrap 91 | if (requireNamespace("packrat", quietly = TRUE) && packageVersion("packrat") >= "0.2.0.99") { 92 | message("> Using user-library packrat (", 93 | packageVersion("packrat"), 94 | ") to bootstrap this project") 95 | } 96 | 97 | ## Couldn't find a user-local packrat -- try finding and using devtools 98 | ## to install 99 | else if (requireNamespace("devtools", quietly = TRUE)) { 100 | message("> Attempting to use devtools::install_github to install ", 101 | "a temporary version of packrat") 102 | library(stats) ## for setNames 103 | devtools::install_github("rstudio/packrat") 104 | } 105 | 106 | ## Try downloading packrat from CRAN if available 107 | else if ("packrat" %in% rownames(available.packages())) { 108 | message("> Installing packrat from CRAN") 109 | install.packages("packrat") 110 | } 111 | 112 | ## Fail -- couldn't find an appropriate means of installing packrat 113 | else { 114 | stop("Could not automatically bootstrap packrat -- try running ", 115 | "\"'install.packages('devtools'); devtools::install_github('rstudio/packrat')\"", 116 | "and restarting R to bootstrap packrat.") 117 | } 118 | 119 | # Restore the project, unload the temporary packrat, 
and load the private packrat 120 | packrat::restore(prompt = FALSE, restart = TRUE) 121 | 122 | ## This code path only reached if we didn't restart earlier 123 | unloadNamespace("packrat") 124 | requireNamespace("packrat", lib.loc = libDir, quietly = TRUE) 125 | return(packrat::on()) 126 | 127 | } 128 | 129 | ## Multiple packrat tarballs available locally -- try to choose one 130 | ## TODO: read lock file and infer most appropriate from there; low priority because 131 | ## after bootstrapping packrat a restore should do the right thing 132 | if (length(packratSrcPath) > 1) { 133 | warning("Multiple versions of packrat available in the source directory;", 134 | "using packrat source:\n- ", shQuote(packratSrcPath)) 135 | packratSrcPath <- packratSrcPath[[1]] 136 | } 137 | 138 | 139 | lib <- file.path("packrat", "lib", R.version$platform, getRversion()) 140 | if (!file.exists(lib)) { 141 | dir.create(lib, recursive = TRUE) 142 | } 143 | lib <- normalizePath(lib, winslash = "/") 144 | 145 | message("> Installing packrat into project private library:") 146 | message("- ", shQuote(lib)) 147 | 148 | surround <- function(x, with) { 149 | if (!length(x)) return(character()) 150 | paste0(with, x, with) 151 | } 152 | 153 | ## The following is performed because a regular install.packages call can fail 154 | peq <- function(x, y) paste(x, y, sep = " = ") 155 | installArgs <- c( 156 | peq("pkgs", surround(packratSrcPath, with = "'")), 157 | peq("lib", surround(lib, with = "'")), 158 | peq("repos", "NULL"), 159 | peq("type", surround("source", with = "'")) 160 | ) 161 | installCmd <- paste(sep = "", 162 | "utils::install.packages(", 163 | paste(installArgs, collapse = ", "), 164 | ")") 165 | 166 | fullCmd <- paste( 167 | surround(file.path(R.home("bin"), "R"), with = "\""), 168 | "--vanilla", 169 | "--slave", 170 | "-e", 171 | surround(installCmd, with = "\"") 172 | ) 173 | system(fullCmd) 174 | 175 | ## Tag the installed packrat so we know it's managed by packrat 176 | ## TODO: should this be taking information from the lockfile? this is a bit awkward 177 | ## because we're taking an un-annotated packrat source tarball and simply assuming it's now 178 | ## an 'installed from source' version 179 | 180 | ## -- InstallAgent -- ## 181 | installAgent <- 'InstallAgent: packrat 0.4.8-1' 182 | 183 | ## -- InstallSource -- ## 184 | installSource <- 'InstallSource: source' 185 | 186 | packratDescPath <- file.path(lib, "packrat", "DESCRIPTION") 187 | DESCRIPTION <- readLines(packratDescPath) 188 | DESCRIPTION <- c(DESCRIPTION, installAgent, installSource) 189 | cat(DESCRIPTION, file = packratDescPath, sep = "\n") 190 | 191 | # Otherwise, continue on as normal 192 | message("> Attaching packrat") 193 | library("packrat", character.only = TRUE, lib.loc = lib) 194 | 195 | message("> Restoring library") 196 | restore(restart = FALSE) 197 | 198 | # If the environment allows us to restart, do so with a call to restore 199 | restart <- getOption("restart") 200 | if (!is.null(restart)) { 201 | message("> Packrat bootstrap successfully completed. ", 202 | "Restarting R and entering packrat mode...") 203 | return(restart()) 204 | } 205 | 206 | # Callers (source-erers) can define this hidden variable to make sure we don't enter packrat mode 207 | # Primarily useful for testing 208 | if (!exists(".__DONT_ENTER_PACKRAT_MODE__.") && interactive()) { 209 | message("> Packrat bootstrap successfully completed. 
Entering packrat mode...") 210 | packrat::on() 211 | } 212 | 213 | Sys.unsetenv("RSTUDIO_PACKRAT_BOOTSTRAP") 214 | 215 | } 216 | 217 | }) 218 | -------------------------------------------------------------------------------- /dash_dashboard/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import numpy as np 5 | import dash 6 | import dash_core_components as dcc 7 | import dash_html_components as html 8 | import plotly.graph_objs as go 9 | import global_vars as gv 10 | import pandas as pd 11 | 12 | sys.path.insert(0, '../src/python/') 13 | from data_extraction import breast_cancer, names 14 | sys.path.pop(0) 15 | 16 | # Test set metrics 17 | cross_tab_knn = gv.cross_tab_knn 18 | cross_tab_rf = gv.cross_tab_rf 19 | cross_tab_nn = gv.cross_tab_nn 20 | 21 | # Classification Reports 22 | class_rep_knn = gv.class_rep_knn 23 | class_rep_rf = gv.class_rep_rf 24 | class_rep_nn = gv.class_rep_nn 25 | 26 | def generate_table(dataframe, max_rows=10): 27 | return html.Table( 28 | # Header 29 | [html.Tr([html.Th(col) for col in dataframe.columns])] + 30 | 31 | # Body 32 | [html.Tr([ 33 | html.Td(dataframe.iloc[i][col]) for col in dataframe.columns 34 | ]) for i in range(min(len(dataframe), max_rows))] 35 | ) 36 | 37 | app = dash.Dash() 38 | 39 | app.layout = html.Div([ 40 | html.Div([ 41 | html.H2("Breast Cancer Dashboard"), 42 | ], className='banner'), 43 | html.H2(children = ''' 44 | An interactive dashboard created by Raul Eulogio 45 | ''', 46 | style={ 47 | 'padding': '0px 30px 15px 30px'}), 48 | html.Div([ 49 | html.H3(children = ''' 50 | Exploratory Analysis 51 | ''', 52 | style={ 53 | 'padding': '0px 30px 15px 30px'}) 54 | ]), 55 | html.Div([ 56 | html.Div([ 57 | html.P(""" 58 | Move the multi-select options to see the 3d scatter plot and histograms change respectively. 59 | And play with the interactive 3d scatter plot to see how variables interact! 
60 | 
61 | """),
62 | html.Label('Choose the different parameters'),
63 | dcc.Dropdown(
64 | id='first_input',
65 | options=[
66 | {'label': i, 'value': i} for i in names[2:]
67 | ],
68 | value = 'area_worst'
69 | ),
70 | dcc.Dropdown(
71 | id='second_input',
72 | options=[
73 | {'label': i, 'value': i} for i in names[2:]
74 | ],
75 | value = 'perimeter_worst'
76 | ),
77 | dcc.Dropdown(
78 | id='third_input',
79 | options=[
80 | {'label': i, 'value': i} for i in names[2:]
81 | ],
82 | value = 'concave_points_worst'
83 | ),
84 | dcc.Graph(
85 | id='scatter_plot_3d'),
86 | html.Div(html.P(' .')),
87 | html.Div([
88 | html.H3("""
89 | Machine Learning
90 | """),
91 | dcc.Markdown('Here are some metrics relating to how well each model did.'),
92 | dcc.Markdown('+ See [this article](https://lukeoakdenrayner.wordpress.com/2017/12/06/do-machines-actually-beat-doctors-roc-curves-and-performance-metrics/) for more information about *ROC Curves* '),
93 | html.Label('Choose a Machine Learning Model'),
94 | dcc.Dropdown(
95 | id='machine_learning',
96 | options=[
97 | {'label': 'Kth Nearest Neighbor', 'value': 'knn'},
98 | {'label': 'Random Forest', 'value': 'rf'},
99 | {'label': 'Neural Network', 'value': 'nn'}
100 | ],
101 | value = 'knn'
102 | ),
103 | dcc.Graph(
104 | id='roc_curve')
105 | ])
106 | ],
107 | style={'width': '40%',
108 | 'height': '50%',
109 | 'float': 'left',
110 | 'padding': '0px 40px 40px 40px'}),
111 | # End Left Side Div
112 | # Right Side Div
113 | html.Div([
114 | dcc.Graph(
115 | id='hist_first_var',
116 | style={'height': '12%'}
117 | ),
118 | dcc.Graph(
119 | id='hist_sec_var',
120 | style={'height': '12%'}
121 | ),
122 | dcc.Graph(
123 | id='hist_third_var',
124 | style={'height': '12%'}
125 | ),
126 | html.Div(html.P(' .')),
127 | html.Div(html.P(' .')),
128 | html.Div(html.P(' .')),
129 | html.Div(html.P(' .')),
130 | html.Div(
131 | html.H4("""
132 | Test Set Metrics
133 | """
134 | )
135 | ),
136 | dcc.Markdown("+ See [Test Set Metrics Section of inertia7 project](https://www.inertia7.com/projects/95#test_set_met) for more information."),
137 | html.Div(
138 | dcc.Graph(
139 | id="conf_mat",
140 | style={'height': '10%'}
141 | )
142 | ),
143 | html.Div(
144 | html.H4("""
145 | Classification Report
146 | """
147 | )),
148 | dcc.Markdown("+ See [Classification Report Section of inertia7 project](https://www.inertia7.com/projects/95) for more information. 
"), 149 | html.Div([html.Div(id='table_class_rep') 150 | ], 151 | style={'width': '100%'}) 152 | ], 153 | style={'width': '40%', 154 | 'float': 'right', 155 | 'padding': '0px 40px 40px 40px'}, 156 | ) 157 | # End Right Side Div 158 | ], 159 | style={'width': '100%', 160 | 'height': '100%', 161 | 'display': 'flex'}), 162 | ]) 163 | 164 | @app.callback( 165 | dash.dependencies.Output('scatter_plot_3d', 'figure'), 166 | [dash.dependencies.Input('first_input', 'value'), 167 | dash.dependencies.Input('second_input', 'value'), 168 | dash.dependencies.Input('third_input', 'value'),] 169 | ) 170 | 171 | def update_figure(first_input_name, second_input_name, third_input_name): 172 | traces = [] 173 | for i in breast_cancer.diagnosis.unique(): 174 | breast_cancer_dx = breast_cancer[breast_cancer['diagnosis'] == i] 175 | if (i == 0): 176 | traces.append(go.Scatter3d( 177 | x=breast_cancer_dx[first_input_name], 178 | y=breast_cancer_dx[second_input_name], 179 | z=breast_cancer_dx[third_input_name], 180 | text=breast_cancer_dx['diagnosis'], 181 | mode='markers', 182 | opacity=0.7, 183 | marker={ 184 | 'size': 15, 185 | 'line': {'width': 0.5, 'color': 'white'}, 186 | 'color': 'red' 187 | }, 188 | name='Malignant' 189 | )) 190 | 191 | else: 192 | traces.append(go.Scatter3d( 193 | x=breast_cancer_dx[first_input_name], 194 | y=breast_cancer_dx[second_input_name], 195 | z=breast_cancer_dx[third_input_name], 196 | text=breast_cancer_dx['diagnosis'], 197 | mode='markers', 198 | opacity=0.7, 199 | marker={ 200 | 'size': 15, 201 | 'line': {'width': 0.5, 'color': 'white'}, 202 | 'color': '#875FDB' 203 | }, 204 | name='Benign' 205 | )) 206 | return { 207 | 'data': traces, 208 | 'layout': go.Layout( 209 | xaxis={'title': first_input_name}, 210 | yaxis={'title': second_input_name}, 211 | margin={'l': 40, 'b': 40, 't': 10, 'r': 10}, 212 | legend={'x': 0, 'y': 1}, 213 | hovermode='closest' 214 | ) 215 | } 216 | 217 | 218 | @app.callback( 219 | dash.dependencies.Output('hist_first_var', 'figure'), 220 | [dash.dependencies.Input('first_input', 'value')] 221 | ) 222 | def update_hist_1(first_input_name): 223 | traces_hist = [] 224 | for i in breast_cancer.diagnosis.unique(): 225 | breast_cancer_dx = breast_cancer[breast_cancer['diagnosis'] == i] 226 | if (i == 0): 227 | traces_hist.append(go.Histogram( 228 | x = breast_cancer_dx[first_input_name], 229 | opacity=0.60, 230 | marker={ 231 | 'color': 'red' 232 | }, 233 | name='Malignant' 234 | )) 235 | else: 236 | traces_hist.append(go.Histogram( 237 | x = breast_cancer_dx[first_input_name], 238 | opacity=0.60, 239 | marker={ 240 | 'color': '#875FDB' 241 | }, 242 | name='Benign', 243 | )) 244 | return { 245 | 'data': traces_hist, 246 | 'layout': go.Layout( 247 | xaxis={'title': first_input_name}, 248 | margin={'l': 50, 'b': 40, 't': 10, 'r': 10}, 249 | legend={'x': 0, 'y': 1}, 250 | hovermode='closest', 251 | barmode='overlay' 252 | ) 253 | } 254 | 255 | @app.callback( 256 | dash.dependencies.Output('hist_sec_var', 'figure'), 257 | [dash.dependencies.Input('second_input', 'value')] 258 | ) 259 | def update_hist_2(second_input): 260 | traces_hist = [] 261 | for i in breast_cancer.diagnosis.unique(): 262 | breast_cancer_dx = breast_cancer[breast_cancer['diagnosis'] == i] 263 | if (i == 0): 264 | traces_hist.append(go.Histogram( 265 | x = breast_cancer_dx[second_input], 266 | opacity=0.60, 267 | marker={ 268 | 'color': 'red' 269 | }, 270 | name='Malignant' 271 | )) 272 | else: 273 | traces_hist.append(go.Histogram( 274 | x = breast_cancer_dx[second_input], 275 | opacity=0.60, 
276 | marker={ 277 | 'color': '#875FDB' 278 | }, 279 | name='Benign', 280 | )) 281 | return { 282 | 'data': traces_hist, 283 | 'layout': go.Layout( 284 | xaxis={'title': second_input}, 285 | margin={'l': 50, 'b': 40, 't': 10, 'r': 10}, 286 | legend={'x': 0, 'y': 1}, 287 | hovermode='closest', 288 | barmode='overlay' 289 | ) 290 | } 291 | 292 | @app.callback( 293 | dash.dependencies.Output('hist_third_var', 'figure'), 294 | [dash.dependencies.Input('third_input', 'value')] 295 | ) 296 | def update_hist_3(third_input): 297 | traces_hist = [] 298 | for i in breast_cancer.diagnosis.unique(): 299 | breast_cancer_dx = breast_cancer[breast_cancer['diagnosis'] == i] 300 | if (i == 0): 301 | traces_hist.append(go.Histogram( 302 | x = breast_cancer_dx[third_input], 303 | opacity=0.60, 304 | marker={ 305 | 'color': 'red' 306 | }, 307 | name='Malignant' 308 | )) 309 | else: 310 | traces_hist.append(go.Histogram( 311 | x = breast_cancer_dx[third_input], 312 | opacity=0.60, 313 | marker={ 314 | 'color': '#875FDB' 315 | }, 316 | name='Benign', 317 | )) 318 | return { 319 | 'data': traces_hist, 320 | 'layout': go.Layout( 321 | xaxis={'title': third_input}, 322 | margin={'l': 50, 'b': 40, 't': 10, 'r': 10}, 323 | legend={'x': 0, 'y': 1}, 324 | hovermode='closest', 325 | barmode='overlay' 326 | ) 327 | } 328 | 329 | 330 | @app.callback( 331 | dash.dependencies.Output('roc_curve', 'figure'), 332 | [dash.dependencies.Input('machine_learning', 'value') 333 | ]) 334 | 335 | def update_roc(machine_learning): 336 | lw = 2 337 | if (machine_learning == 'knn'): 338 | trace1 = go.Scatter( 339 | x = gv.fpr, y = gv.tpr, 340 | mode='lines', 341 | line=dict(color='deeppink', width=lw), 342 | name='ROC curve (AUC = {0: 0.3f})'.format(gv.auc_knn)) 343 | if (machine_learning == 'rf'): 344 | trace1 = go.Scatter( 345 | x = gv.fpr2, y = gv.tpr2, 346 | mode='lines', 347 | line=dict(color='red', width=lw), 348 | name='ROC curve (AUC = {0: 0.3f})'.format(gv.auc_rf)) 349 | if (machine_learning == 'nn'): 350 | trace1 = go.Scatter( 351 | x = gv.fpr3, y = gv.tpr3, 352 | mode='lines', 353 | line=dict(color='purple', width=lw), 354 | name='ROC curve (AUC = {0: 0.3f})'.format(gv.auc_nn)) 355 | trace2 = go.Scatter(x=[0, 1], y=[0, 1], 356 | mode='lines', 357 | line=dict(color='black', width=lw, dash='dash'), 358 | showlegend=False) 359 | trace3 = go.Scatter(x=[0, 0], y=[1, 0], 360 | mode='lines', 361 | line=dict(color='black', width=lw, dash='dash'), 362 | showlegend=False) 363 | trace4 = go.Scatter(x=[1, 0], y=[1, 1], 364 | mode='lines', 365 | line=dict(color='black', width=lw, dash='dash'), 366 | showlegend=False) 367 | return { 368 | 'data': [trace1, trace2, trace3, trace4], 369 | 'layout': go.Layout( 370 | title='Receiver Operating Characteristic Plot', 371 | xaxis={'title': 'False Positive Rate'}, 372 | yaxis={'title': 'True Positive Rate'}, 373 | legend={'x': 0.7, 'y': 0.15}, 374 | #height=400 375 | ) 376 | } 377 | 378 | @app.callback( 379 | dash.dependencies.Output('conf_mat', 'figure'), 380 | [dash.dependencies.Input('machine_learning', 'value') 381 | ]) 382 | 383 | def update_conf_mat(machine_learning): 384 | lw = 2 385 | if (machine_learning == 'knn'): 386 | trace1 = go.Heatmap( 387 | z = np.roll(cross_tab_knn, 388 | 1, axis=0)) 389 | if (machine_learning == 'rf'): 390 | trace1 = go.Heatmap( 391 | z = np.roll(cross_tab_rf, 392 | 1, axis=0)) 393 | if (machine_learning == 'nn'): 394 | trace1 = go.Heatmap( 395 | z = np.roll(cross_tab_nn, 396 | 1, axis=0)) 397 | return { 398 | 'data': [trace1], 399 | 'layout': go.Layout( 400 | 
title='Confusion Matrix', 401 | xaxis={'title': 'Predicted Values'}, 402 | yaxis={'title': 'Actual Values'} 403 | ) 404 | } 405 | 406 | #################################### 407 | # 408 | # 409 | # 410 | #def update_table(machine_learning): 411 | #final_cross_tab = pd.DataFrame() 412 | #if (machine_learning == 'knn'): 413 | #final_cross_tab = cross_tab_knn 414 | #if (machine_learning == 'rf'): 415 | #final_cross_tab = cross_tab_rf 416 | #if (machine_learning == 'nn'): 417 | #final_cross_tab = cross_tab_nn 418 | #return generate_table(dataframe = final_cross_tab) 419 | 420 | 421 | @app.callback( 422 | dash.dependencies.Output('table_class_rep', 'children'), 423 | [dash.dependencies.Input('machine_learning', 'value') 424 | ]) 425 | def update_table(machine_learning): 426 | final_cross_tab = pd.DataFrame() 427 | if (machine_learning == 'knn'): 428 | final_cross_tab = class_rep_knn 429 | if (machine_learning == 'rf'): 430 | final_cross_tab = class_rep_rf 431 | if (machine_learning == 'nn'): 432 | final_cross_tab = class_rep_nn 433 | return generate_table(dataframe = final_cross_tab) 434 | 435 | 436 | # Append externally hosted CSS Stylesheet 437 | my_css_urls = [ 438 | # For dev: 439 | 'https://rawgit.com/raviolli77/machineLearning_breastCancer_Python/master/dash_dashboard/dash_breast_cancer.css', 440 | # For prod 441 | #'https://cdn.rawgit.com/raviolli77/machineLearning_breastCancer_Python/master/dash_dashboard/dash_breast_cancer.css' 442 | ] 443 | 444 | app.css.append_css({ 445 | 'external_url': my_css_urls 446 | }) 447 | 448 | if __name__ == '__main__': 449 | app.run_server(debug=True) 450 | -------------------------------------------------------------------------------- /src/r/packrat/packrat.lock: -------------------------------------------------------------------------------- 1 | PackratFormat: 1.4 2 | PackratVersion: 0.4.8.1 3 | RVersion: 3.2.3 4 | Repos: CRAN=https://cran.rstudio.com/ 5 | 6 | Package: BH 7 | Source: CRAN 8 | Version: 1.62.0-1 9 | Hash: 14dfb3e8ffe20996118306ff4de1fab2 10 | 11 | Package: CVST 12 | Source: CRAN 13 | Version: 0.2-1 14 | Hash: 1e50c7789a11bc9523238fcf16ee8a71 15 | Requires: kernlab 16 | 17 | Package: DEoptimR 18 | Source: CRAN 19 | Version: 1.0-8 20 | Hash: adc74e88e85eabe6c7d73db6a86fe6cf 21 | 22 | Package: DRR 23 | Source: CRAN 24 | Version: 0.0.2 25 | Hash: cd79854854a03ad0c8979b36b414d2c0 26 | Requires: CVST, kernlab 27 | 28 | Package: GGally 29 | Source: CRAN 30 | Version: 1.3.2 31 | Hash: 27e95068f899e4ab58472b1776254d9e 32 | Requires: RColorBrewer, ggplot2, gtable, plyr, progress, reshape 33 | 34 | Package: ModelMetrics 35 | Source: CRAN 36 | Version: 1.1.0 37 | Hash: 325ea8f510f9e8c2e7e774b78b0f376a 38 | Requires: Rcpp 39 | 40 | Package: PKI 41 | Source: CRAN 42 | Version: 0.1-5.1 43 | Hash: 8c194fb34ebaab38a13e43ce84feedee 44 | Requires: base64enc 45 | 46 | Package: R6 47 | Source: CRAN 48 | Version: 2.2.2 49 | Hash: b2366cd9d2f3851a5704b4e192b985c2 50 | 51 | Package: RColorBrewer 52 | Source: CRAN 53 | Version: 1.1-2 54 | Hash: c0d56cd15034f395874c870141870c25 55 | 56 | Package: RCurl 57 | Source: CRAN 58 | Version: 1.95-4.10 59 | Hash: 06af5153f969a90c6cd6c87ee57baa44 60 | Requires: bitops 61 | 62 | Package: RJSONIO 63 | Source: CRAN 64 | Version: 1.3-0 65 | Hash: fb672e20eb6f3010a3639f855d8ef6de 66 | 67 | Package: ROCR 68 | Source: CRAN 69 | Version: 1.0-7 70 | Hash: 086f78987ebf3c55b01013ee64a5e1e2 71 | Requires: gplots 72 | 73 | Package: Rcpp 74 | Source: CRAN 75 | Version: 0.12.12 76 | Hash: 8b3d5ebb9a9a4ab5c86b3a81b0cfb774 77 | 78 | Package: 
RcppRoll 79 | Source: CRAN 80 | Version: 0.2.2 81 | Hash: 13af7f0bc94b9252d1203421f00e30af 82 | Requires: Rcpp 83 | 84 | Package: assertthat 85 | Source: CRAN 86 | Version: 0.2.0 87 | Hash: e8805df54c65ac96d50235c44a82615c 88 | 89 | Package: backports 90 | Source: CRAN 91 | Version: 1.1.2 92 | Hash: 5ae7b3466e529e4400951ca18c137e40 93 | 94 | Package: base64enc 95 | Source: CRAN 96 | Version: 0.1-3 97 | Hash: c590d29e555926af053055e23ee79efb 98 | 99 | Package: bindr 100 | Source: CRAN 101 | Version: 0.1 102 | Hash: e3a02070cf705d3ad1c5af1635a515a3 103 | 104 | Package: bindrcpp 105 | Source: CRAN 106 | Version: 0.2 107 | Hash: 8ab2dbf7ea120cf2d31183e0bf388485 108 | Requires: Rcpp, bindr, plogr 109 | 110 | Package: bitops 111 | Source: CRAN 112 | Version: 1.0-6 113 | Hash: 67d0775189fd0041d95abca618c5c07e 114 | 115 | Package: broom 116 | Source: CRAN 117 | Version: 0.4.2 118 | Hash: 7ebcffa46afb467e3f3c5687946f6e1a 119 | Requires: dplyr, plyr, psych, reshape2, stringr, tidyr 120 | 121 | Package: caTools 122 | Source: CRAN 123 | Version: 1.17.1 124 | Hash: 97cb6f6293cd18d17df77a6383cc6763 125 | Requires: bitops 126 | 127 | Package: caret 128 | Source: CRAN 129 | Version: 6.0-77 130 | Hash: f5b47c8d7244b7e157f75641b212392f 131 | Requires: ModelMetrics, foreach, ggplot2, plyr, recipes, reshape2, 132 | withr 133 | 134 | Package: cellranger 135 | Source: CRAN 136 | Version: 1.1.0 137 | Hash: 4e1ef4d099b0c5fd531a3938cf4624bd 138 | Requires: rematch, tibble 139 | 140 | Package: colorspace 141 | Source: CRAN 142 | Version: 1.3-2 143 | Hash: 0bf8618b585fa98eb23414cd3ab95118 144 | 145 | Package: curl 146 | Source: CRAN 147 | Version: 2.7 148 | Hash: 1d97b529645be4e502fad3db22415e66 149 | 150 | Package: ddalpha 151 | Source: CRAN 152 | Version: 1.3.1 153 | Hash: 7ed2f9a3cdc72836fe74e62aa1c18853 154 | Requires: BH, Rcpp, robustbase, sfsmisc 155 | 156 | Package: dichromat 157 | Source: CRAN 158 | Version: 2.0-0 159 | Hash: 08eed0c80510af29bb15f840ccfe37ce 160 | 161 | Package: digest 162 | Source: CRAN 163 | Version: 0.6.12 164 | Hash: e53fb8c58673df868183697e39a6a4d6 165 | 166 | Package: dimRed 167 | Source: CRAN 168 | Version: 0.1.0 169 | Hash: 648bd80f3187f8e807f996e3e0866c7c 170 | Requires: DRR 171 | 172 | Package: dplyr 173 | Source: CRAN 174 | Version: 0.7.1 175 | Hash: 669f4d38aaac878ede74800b408b09fa 176 | Requires: BH, R6, Rcpp, assertthat, bindrcpp, glue, magrittr, 177 | pkgconfig, plogr, rlang, tibble 178 | 179 | Package: e1071 180 | Source: CRAN 181 | Version: 1.6-8 182 | Hash: 20320ec66d4dc608654769145b7c624a 183 | 184 | Package: evaluate 185 | Source: CRAN 186 | Version: 0.10.1 187 | Hash: 54d95f4ec6d0300100413ed0127d89ae 188 | Requires: stringr 189 | 190 | Package: forcats 191 | Source: CRAN 192 | Version: 0.2.0 193 | Hash: e5a3b0b96a39f5581467b0c6366f7408 194 | Requires: magrittr, tibble 195 | 196 | Package: foreach 197 | Source: CRAN 198 | Version: 1.4.3 199 | Hash: cd53ef4cf29dc59ce3f8c5c1af735fd1 200 | Requires: iterators 201 | 202 | Package: gdata 203 | Source: CRAN 204 | Version: 2.18.0 205 | Hash: 62797fafa287d1845a014c615d46e50c 206 | Requires: gtools 207 | 208 | Package: ggcorrplot 209 | Source: CRAN 210 | Version: 0.1.1 211 | Hash: 1f6cc9c3899518a73b83bc02d14c3759 212 | Requires: ggplot2, reshape2 213 | 214 | Package: ggplot2 215 | Source: CRAN 216 | Version: 2.2.1 217 | Hash: 46e5cb78836848aa44655e577433f54b 218 | Requires: digest, gtable, lazyeval, plyr, reshape2, scales, tibble 219 | 220 | Package: glue 221 | Source: CRAN 222 | Version: 1.1.1 223 | Hash: 
dfd5a27768175ae51d08dc6beba1ef11 224 | 225 | Package: gower 226 | Source: CRAN 227 | Version: 0.1.2 228 | Hash: 77a20b3ef7f9a1a7ed19457b36978605 229 | 230 | Package: gplots 231 | Source: CRAN 232 | Version: 3.0.1 233 | Hash: b7abe122479c203aa236499b7fc4b816 234 | Requires: caTools, gdata, gtools 235 | 236 | Package: gtable 237 | Source: CRAN 238 | Version: 0.2.0 239 | Hash: cd78381a9d3fea966ac39bd0daaf5554 240 | 241 | Package: gtools 242 | Source: CRAN 243 | Version: 3.5.0 244 | Hash: 471b2e2452dfb30fdc1dd6f1b567925a 245 | 246 | Package: haven 247 | Source: CRAN 248 | Version: 1.1.0 249 | Hash: d91bd77c2b46f513b36976866239bb62 250 | Requires: Rcpp, forcats, hms, readr, tibble 251 | 252 | Package: here 253 | Source: CRAN 254 | Version: 0.1 255 | Hash: 90e1a97508a0d7383b0eeb11e397e763 256 | Requires: rprojroot 257 | 258 | Package: highr 259 | Source: CRAN 260 | Version: 0.6 261 | Hash: aa3d5b7912b5fed4b546ed5cd2a1760b 262 | 263 | Package: hms 264 | Source: CRAN 265 | Version: 0.3 266 | Hash: 3fca8a1c97e6cfb297fe3f4690f82c58 267 | 268 | Package: htmltools 269 | Source: CRAN 270 | Version: 0.3.6 271 | Hash: 5b070a04ef8df1953544873db1c5896e 272 | Requires: Rcpp, digest 273 | 274 | Package: httr 275 | Source: CRAN 276 | Version: 1.2.1 277 | Hash: 7de1f8f760441881804af7c1ff324340 278 | Requires: R6, curl, jsonlite, mime, openssl 279 | 280 | Package: ipred 281 | Source: CRAN 282 | Version: 0.9-6 283 | Hash: 2fd946bce1622291262c12515d27e780 284 | Requires: prodlim 285 | 286 | Package: iterators 287 | Source: CRAN 288 | Version: 1.0.8 289 | Hash: 488b93c2a4166db0d15f1e8d882cb1d4 290 | 291 | Package: jsonlite 292 | Source: CRAN 293 | Version: 1.5 294 | Hash: 9c51936d8dd00b2f1d4fe9d10499694c 295 | 296 | Package: kernlab 297 | Source: CRAN 298 | Version: 0.9-25 299 | Hash: bf60122a2e1f073661edb69651a682c2 300 | 301 | Package: knitr 302 | Source: CRAN 303 | Version: 1.18 304 | Hash: 5be8a90c6aac24e7e0a4f18b829cc6e2 305 | Requires: digest, evaluate, highr, markdown, stringr, yaml 306 | 307 | Package: labeling 308 | Source: CRAN 309 | Version: 0.3 310 | Hash: ecf589b42cd284b03a4beb9665482d3e 311 | 312 | Package: lava 313 | Source: CRAN 314 | Version: 1.5.1 315 | Hash: a7626c3f7e753f7401e070a144ecd315 316 | Requires: numDeriv 317 | 318 | Package: lazyeval 319 | Source: CRAN 320 | Version: 0.2.0 321 | Hash: 3d6e7608e65bbf5cb170dab1e3c9ed8b 322 | 323 | Package: lubridate 324 | Source: CRAN 325 | Version: 1.6.0 326 | Hash: b90f4cbefe0b3c545dd68b22c66a8a12 327 | Requires: stringr 328 | 329 | Package: magrittr 330 | Source: CRAN 331 | Version: 1.5 332 | Hash: bdc4d48c3135e8f3b399536ddf160df4 333 | 334 | Package: markdown 335 | Source: CRAN 336 | Version: 0.8 337 | Hash: 045d7c594d503b41f1c28946d076c8aa 338 | Requires: mime 339 | 340 | Package: mime 341 | Source: CRAN 342 | Version: 0.5 343 | Hash: 463550cf44fb6f0a2359368f42eebe62 344 | 345 | Package: mnormt 346 | Source: CRAN 347 | Version: 1.5-5 348 | Hash: d0d5efbb1fb26d2dc5f9394c223084b5 349 | 350 | Package: modelr 351 | Source: CRAN 352 | Version: 0.1.0 353 | Hash: 7c9848bf4d734f38b8ce91022d8de949 354 | Requires: broom, dplyr, lazyeval, magrittr, purrr, tibble, tidyr 355 | 356 | Package: munsell 357 | Source: CRAN 358 | Version: 0.4.3 359 | Hash: f96d896947fcaf9b6d0074002e9f4f9d 360 | Requires: colorspace 361 | 362 | Package: numDeriv 363 | Source: CRAN 364 | Version: 2016.8-1 365 | Hash: 3a9d0fc99ba2f6aaa500b3d584962be2 366 | 367 | Package: openssl 368 | Source: CRAN 369 | Version: 0.9.6 370 | Hash: 5f4711e142a44655dfea4d64fcf2f641 371 | 372 | Package: 
pROC 373 | Source: CRAN 374 | Version: 1.10.0 375 | Hash: 538c2f9710cb24d6a6193ea89444c859 376 | Requires: Rcpp, ggplot2, plyr 377 | 378 | Package: packrat 379 | Source: CRAN 380 | Version: 0.4.8-1 381 | Hash: 6ad605ba7b4b476d84be6632393f5765 382 | 383 | Package: pkgconfig 384 | Source: CRAN 385 | Version: 2.0.1 386 | Hash: 0dda4a2654a22b36a715c2b0b6fbacac 387 | 388 | Package: plogr 389 | Source: CRAN 390 | Version: 0.1-1 391 | Hash: fb19215402e2d9f1c7f803dcaa806fc2 392 | 393 | Package: plyr 394 | Source: CRAN 395 | Version: 1.8.4 396 | Hash: 8fbaff6962e3421b5c9652eebae36159 397 | Requires: Rcpp 398 | 399 | Package: prettyunits 400 | Source: CRAN 401 | Version: 1.0.2 402 | Hash: 49286102a855640daaa38eafe8b1ec30 403 | Requires: assertthat, magrittr 404 | 405 | Package: prodlim 406 | Source: CRAN 407 | Version: 1.6.1 408 | Hash: a293698cbc0bfdc90d0ac23b988bb055 409 | Requires: Rcpp, lava 410 | 411 | Package: progress 412 | Source: CRAN 413 | Version: 1.1.2 414 | Hash: ceef88c244d792a874bdacf72b6a30da 415 | Requires: R6, prettyunits 416 | 417 | Package: psych 418 | Source: CRAN 419 | Version: 1.7.5 420 | Hash: 0c076a96de916d0d26d866e83909d961 421 | Requires: mnormt 422 | 423 | Package: purrr 424 | Source: CRAN 425 | Version: 0.2.2.2 426 | Hash: faada139260184912fea03f3fea13842 427 | Requires: Rcpp, lazyeval, magrittr, tibble 428 | 429 | Package: randomForest 430 | Source: CRAN 431 | Version: 4.6-12 432 | Hash: b37274857316c7b9431cc7f72aaffb77 433 | 434 | Package: readr 435 | Source: CRAN 436 | Version: 1.1.1 437 | Hash: c9044cbc275e63bf00dd3af329290fa9 438 | Requires: BH, R6, Rcpp, hms, tibble 439 | 440 | Package: readxl 441 | Source: CRAN 442 | Version: 1.0.0 443 | Hash: 83bc4a5b41d247b40ce7161ade89baf3 444 | Requires: Rcpp, cellranger, tibble 445 | 446 | Package: recipes 447 | Source: CRAN 448 | Version: 0.1.1 449 | Hash: 14c05a96da97c12ff93c3e18c3918d45 450 | Requires: RcppRoll, broom, ddalpha, dimRed, dplyr, gower, ipred, 451 | lubridate, magrittr, purrr, rlang, tibble, tidyselect, timeDate 452 | 453 | Package: rematch 454 | Source: CRAN 455 | Version: 1.0.1 456 | Hash: ad4faf59e7611117ff165817074c50c7 457 | 458 | Package: reshape 459 | Source: CRAN 460 | Version: 0.8.7 461 | Hash: f026a2928c05063a8d0b2e29a129f9a0 462 | Requires: plyr 463 | 464 | Package: reshape2 465 | Source: CRAN 466 | Version: 1.4.2 467 | Hash: df8d1de05444abd99e423c1e3b84c9b0 468 | Requires: Rcpp, plyr, stringr 469 | 470 | Package: rlang 471 | Source: CRAN 472 | Version: 0.1.1 473 | Hash: 86c53487ce7f82f0a7cc11c816060910 474 | 475 | Package: rmarkdown 476 | Source: CRAN 477 | Version: 1.8 478 | Hash: 2a7842e3cee62a79dd737d17e9e9d86b 479 | Requires: base64enc, evaluate, htmltools, jsonlite, knitr, mime, 480 | rprojroot, stringr, yaml 481 | 482 | Package: robustbase 483 | Source: CRAN 484 | Version: 0.92-8 485 | Hash: 47f671cf700fbaa2015bb61701e6f7f4 486 | Requires: DEoptimR 487 | 488 | Package: rprojroot 489 | Source: CRAN 490 | Version: 1.3-2 491 | Hash: a25c3f70c166fb3fbabc410eb32b6366 492 | Requires: backports 493 | 494 | Package: rsconnect 495 | Source: CRAN 496 | Version: 0.8.5 497 | Hash: eeb742b99cb0b2b98545b0582d43a4b2 498 | Requires: PKI, RCurl, RJSONIO, digest, packrat, rstudioapi, yaml 499 | 500 | Package: rstudioapi 501 | Source: CRAN 502 | Version: 0.7 503 | Hash: e2ebaff8160aff3e6b32e6e78a693c2d 504 | 505 | Package: rvest 506 | Source: CRAN 507 | Version: 0.3.2 508 | Hash: c69f7526520bad66fd2111ebe8b1364b 509 | Requires: httr, magrittr, selectr, xml2 510 | 511 | Package: scales 512 | Source: CRAN 513 | 
Version: 0.4.1 514 | Hash: c23bc27bbba87e4039706edf29d8eb68 515 | Requires: RColorBrewer, Rcpp, dichromat, labeling, munsell, plyr 516 | 517 | Package: selectr 518 | Source: CRAN 519 | Version: 0.3-1 520 | Hash: 367275e3dcdd208339e131c7a41bec56 521 | Requires: stringr 522 | 523 | Package: sfsmisc 524 | Source: CRAN 525 | Version: 1.1-1 526 | Hash: 00af82c1c08f9a5fb278ca3469b6eaf4 527 | 528 | Package: stringi 529 | Source: CRAN 530 | Version: 1.1.5 531 | Hash: b6308e49357a0b475f433599e0d8b5eb 532 | 533 | Package: stringr 534 | Source: CRAN 535 | Version: 1.2.0 536 | Hash: 25a86d7f410513ebb7c0bc6a5e16bdc3 537 | Requires: magrittr, stringi 538 | 539 | Package: tibble 540 | Source: CRAN 541 | Version: 1.3.3 542 | Hash: 6a18f6da2887d2c4c4a6554027161483 543 | Requires: Rcpp, rlang 544 | 545 | Package: tidyr 546 | Source: CRAN 547 | Version: 0.6.3 548 | Hash: ab001782aeb1a20618d240e91188d23a 549 | Requires: Rcpp, dplyr, lazyeval, magrittr, stringi, tibble 550 | 551 | Package: tidyselect 552 | Source: CRAN 553 | Version: 0.2.3 554 | Hash: 4db6d5baad622f56ae5783a25c7e5fc3 555 | Requires: Rcpp, glue, purrr, rlang 556 | 557 | Package: tidyverse 558 | Source: CRAN 559 | Version: 1.1.1 560 | Hash: 72d5fada870c90b835bbdfc281283c99 561 | Requires: broom, dplyr, forcats, ggplot2, haven, hms, httr, jsonlite, 562 | lubridate, magrittr, modelr, purrr, readr, readxl, rvest, stringr, 563 | tibble, tidyr, xml2 564 | 565 | Package: timeDate 566 | Source: CRAN 567 | Version: 3012.100 568 | Hash: 78876c125c98033cd093fa0283469637 569 | 570 | Package: withr 571 | Source: CRAN 572 | Version: 2.1.0 573 | Hash: 097f730987c2dc13d421b65bf01ddf08 574 | 575 | Package: xml2 576 | Source: CRAN 577 | Version: 1.1.1 578 | Hash: b326a762ddb04eef605cc88987fa71fb 579 | Requires: BH, Rcpp 580 | 581 | Package: yaml 582 | Source: CRAN 583 | Version: 2.1.16 584 | Hash: 784ea5d8302d4a81f166a32a33c10711 585 | -------------------------------------------------------------------------------- /src/python/helper_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ##################################################### 4 | ## WISCONSIN BREAST CANCER MACHINE LEARNING ## 5 | ##################################################### 6 | # 7 | # Project by Raul Eulogio 8 | # 9 | # Project found at: https://www.inertia7.com/projects/3 10 | # 11 | 12 | """ 13 | Helper Functions Script 14 | """ 15 | # Import Packages ----------------------------------------------- 16 | import numpy as np 17 | import pandas as pd 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | from data_extraction import names_index 21 | from sklearn.model_selection import KFold 22 | from sklearn.model_selection import cross_val_score 23 | 24 | def print_target_perc(data_frame, col): 25 | """Function used to print class distribution for our data set""" 26 | try: 27 | # If the number of unique instances in column exceeds 20 print warning 28 | if data_frame[col].nunique() > 20: 29 | return print('Warning: there are {0} values in `{1}` column which exceed the max of 20 for this function. \ 30 | Please try a column with lower value counts!' 
31 | .format(data_frame[col].nunique(), col)) 32 | # Stores value counts 33 | col_vals = data_frame[col].value_counts().sort_values(ascending=False) 34 | # Resets index to make index a column in data frame 35 | col_vals = col_vals.reset_index() 36 | 37 | # Create a function to output the percentage 38 | f = lambda x, y: 100 * (x / sum(y)) 39 | for i in range(0, len(col_vals['index'])): 40 | print('`{0}` accounts for {1:.2f}% of the {2} column'\ 41 | .format(col_vals['index'][i], 42 | f( 43 | col_vals[col].iloc[i], 44 | col_vals[col]), 45 | col)) 46 | # try-except block goes here if it can't find the column in data frame 47 | except KeyError as e: 48 | raise KeyError('{0}: Not found. Please choose the right column name!'.format(e)) 49 | 50 | def plot_box_plot(data_frame, data_set, xlim=None): 51 | """ 52 | Purpose 53 | ---------- 54 | Creates a seaborn boxplot including all dependent 55 | variables and includes x limit parameters 56 | 57 | Parameters 58 | ---------- 59 | * data_frame : Name of pandas.dataframe 60 | * data_set : Name of title for the boxplot 61 | * xlim : Set upper and lower x-limits 62 | 63 | Returns 64 | ---------- 65 | Box plot graph for all numeric data in data frame 66 | """ 67 | f, ax = plt.subplots(figsize=(11, 15)) 68 | 69 | ax.set_axis_bgcolor('#fafafa') 70 | if xlim is not None: 71 | plt.xlim(*xlim) 72 | plt.ylabel('Dependent Variables') 73 | plt.title("Box Plot of {0} Data Set"\ 74 | .format(data_set)) 75 | ax = sns.boxplot(data = data_frame.select_dtypes(include = ['number']), 76 | orient = 'h') 77 | 78 | plt.show() 79 | plt.close() 80 | 81 | def normalize_data_frame(data_frame): 82 | """ 83 | Purpose 84 | ---------- 85 | Function created to normalize data set. 86 | Intializes an empty data frame which will normalize all columns that 87 | have at > 10 unique values (chosen arbitrarily since target columns 88 | will have classes < 10) and append the non-float types. 89 | Application can vary significantly for different data set, use with caution 90 | or modify accordingly. 91 | 92 | Parameters 93 | ---------- 94 | * data_frame: Name of pandas.dataframe 95 | 96 | Returns 97 | ---------- 98 | * data_frame_norm: Normalized dataframe values ranging (0, 1) 99 | """ 100 | data_frame_norm = pd.DataFrame() 101 | for col in data_frame: 102 | if ((len(np.unique(data_frame[col])) > 10) & (data_frame[col].dtype != 'object')): 103 | data_frame_norm[col]=((data_frame[col] - data_frame[col].min()) / 104 | (data_frame[col].max() - data_frame[col].min())) 105 | else: 106 | data_frame_norm[col] = data_frame[col] 107 | return data_frame_norm 108 | 109 | 110 | 111 | def variable_importance(fit): 112 | """ 113 | Purpose 114 | ---------- 115 | Checks if model is fitted CART model then produces variable importance 116 | and respective indices in dictionary. 117 | 118 | Parameters 119 | ---------- 120 | * fit: Fitted model containing the attribute feature_importances_ 121 | 122 | Returns 123 | ---------- 124 | Dictionary containing arrays with importance score and index of columns 125 | ordered in descending order of importance. 
126 | """
127 | try:
128 | if not hasattr(fit, 'fit'):
129 | return print("'{0}' is not an instantiated model from scikit-learn".format(fit))
130 | 
131 | # Captures whether the model has been trained
132 | if not vars(fit)["estimators_"]:
133 | return print("Model does not appear to be trained.")
134 | except KeyError:
135 | raise KeyError("Model entered does not contain 'estimators_' attribute.")
136 | 
137 | importances = fit.feature_importances_
138 | indices = np.argsort(importances)[::-1]
139 | return {'importance': importances,
140 | 'index': indices}
141 | 
198 | def print_var_importance(importance, indices, name_index):
199 | """
200 | Purpose
201 | ----------
202 | Prints dependent variable names ordered from largest to smallest
203 | based on information gain for CART model.
204 | Parameters
205 | ----------
206 | * importance: Array returned from feature_importances_ for CART
207 | models organized by dataframe index
208 | * indices: Organized index of dataframe from largest to smallest
209 | based on feature_importances_
210 | * name_index: Name of columns included in model
211 | Returns
212 | ----------
213 | Prints feature importance in descending order
214 | """
215 | print("Feature ranking:")
216 | 
217 | for f in range(0, indices.shape[0]):
218 | i = f
219 | print("{0}. 
The feature '{1}' has a Mean Decrease in Impurity of {2:.5f}"
220 | .format(f + 1,
221 | name_index[indices[i]],
222 | importance[indices[f]]))
223 | 
224 | def variable_importance_plot(importance, indices, name_index):
225 | """
226 | Purpose
227 | ----------
228 | Prints bar chart detailing variable importance for CART model
229 | NOTE: feature_space list was created because the bar chart
230 | was transposed and index would be in incorrect order.
231 | Parameters
232 | ----------
233 | * importance: Array returned from feature_importances_ for CART
234 | models organized by dataframe index
235 | * indices: Organized index of dataframe from largest to smallest
236 | based on feature_importances_
237 | * name_index: Name of columns included in model
238 | 
239 | Returns:
240 | ----------
241 | Returns variable importance plot in descending order
242 | """
243 | index = np.arange(len(name_index))
244 | 
245 | importance_desc = sorted(importance)
246 | feature_space = []
247 | for i in range(indices.shape[0] - 1, -1, -1):
248 | 
249 | feature_space.append(name_index[indices[i]])
250 | 
251 | fig, ax = plt.subplots(figsize=(10, 10))
252 | 
253 | ax.set_facecolor('#fafafa')
254 | plt.title('Feature importances for Random Forest Model\
255 | \nBreast Cancer Data Set')
256 | plt.barh(index,
257 | importance_desc,
258 | align="center",
259 | color = '#875FDB')
260 | plt.yticks(index,
261 | feature_space)
262 | 
263 | plt.ylim(-1, indices.shape[0])
264 | plt.xlim(0, max(importance_desc) + 0.01)
265 | plt.xlabel('Mean Decrease in Impurity')
266 | plt.ylabel('Feature')
267 | 
268 | plt.show()
269 | plt.close()
270 | 
271 | 
272 | def plot_roc_curve(fpr, tpr, auc, estimator, xlim=None, ylim=None):
273 | """
274 | Purpose
275 | ----------
276 | Function creates ROC Curve for respective model given selected parameters.
277 | Optional x and y limits to zoom into graph
278 | 
279 | Parameters
280 | ----------
281 | * fpr: Array returned from sklearn.metrics.roc_curve for increasing
282 | false positive rates
283 | * tpr: Array returned from sklearn.metrics.roc_curve for increasing
284 | true positive rates
285 | * auc: Float returned from sklearn.metrics.auc (Area under Curve)
286 | * estimator: String representation of appropriate model, can only contain the
287 | following: ['knn', 'rf', 'nn']
288 | * xlim: Set upper and lower x-limits
289 | * ylim: Set upper and lower y-limits
290 | """
291 | my_estimators = {'knn': ['Kth Nearest Neighbor', 'deeppink'],
292 | 'rf': ['Random Forest', 'red'],
293 | 'nn': ['Neural Network', 'purple']}
294 | 
295 | try:
296 | plot_title = my_estimators[estimator][0]
297 | color_value = my_estimators[estimator][1]
298 | except KeyError as e:
299 | raise KeyError("'{0}' does not correspond with the appropriate key inside the estimators dictionary. 

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('#fafafa')

    plt.plot(fpr, tpr,
             color=color_value,
             linewidth=1)
    plt.title('ROC Curve For {0} (AUC = {1: 0.3f})'.format(plot_title, auc))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Add diagonal line
    plt.plot([0, 0], [1, 0], 'k--', lw=2)  # Left border
    plt.plot([1, 0], [1, 1], 'k--', lw=2)  # Top border
    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    plt.close()

def cross_val_metrics(fit, training_set, class_set, estimator, print_results=True):
    """
    Purpose
    ----------
    Helps automate the cross validation process while including the
    option to print the metrics or store them in variables.

    Parameters
    ----------
    fit: Fitted model
    training_set: DataFrame containing 80% of the original dataframe
    class_set: DataFrame containing the respective target values
        for the training_set
    estimator: String abbreviation of the model; one of 'knn', 'rf', 'nn'
    print_results: Boolean, if True prints the metrics, else returns the
        metrics as variables

    Returns
    ----------
    scores.mean(): Float representing cross validation score
    scores.std() / 2: Float representing the standard error (derived
        from cross validation score's standard deviation)
    """
    my_estimators = {
        'rf': 'estimators_',
        'nn': 'out_activation_',
        'knn': '_fit_method'
    }
    try:
        # Captures whether first parameter is a model
        if not hasattr(fit, 'fit'):
            return print("'{0}' is not an instantiated model from scikit-learn".format(fit))

        # Captures whether the model has been trained
        if not vars(fit)[my_estimators[estimator]]:
            return print("Model does not appear to be trained.")

    except KeyError:
        raise KeyError("'{0}' does not correspond with the appropriate key inside the \
estimators dictionary. Please refer to the `my_estimators` dictionary in this function.".format(estimator))

    n = KFold(n_splits=10)
    scores = cross_val_score(fit,
                             training_set,
                             class_set,
                             cv=n)
    if print_results:
        for i in range(len(scores)):
            print("Cross validation run {0}: {1: 0.3f}".format(i, scores[i]))
        print("Accuracy: {0: 0.3f} (+/- {1: 0.3f})".format(scores.mean(), scores.std() / 2))
    else:
        return scores.mean(), scores.std() / 2


def create_conf_mat(test_class_set, predictions):
    """Returns a confusion matrix comparing two 1-D arrays."""
    if len(test_class_set.shape) != 1 or len(predictions.shape) != 1:
        return print('Arrays entered are not 1-D.\nPlease enter the correctly sized sets.')
    elif test_class_set.shape != predictions.shape:
        return print('Arrays do not contain the same number of instances.\nPlease make sure the arrays have equal length.')
    else:
        # Set Metrics
        test_crosstb_comp = pd.crosstab(index=test_class_set,
                                        columns=predictions)
        test_crosstb = test_crosstb_comp.values
        return test_crosstb
--------------------------------------------------------------------------------
/src/r/breast_cancer.Rmd:
--------------------------------------------------------------------------------
---
title: "Breast Cancer"
author: "Raul Eulogio"
date: "January 26, 2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Table of Contents
+ [Introduction](#intro)
+ [Load Packages](#load_pack)
+ [Load Data](#load_data)
+ [Training and Test Sets](#train_test)
+ [Fitting Random Forest](#fit_model)
+ [Hyperparameters Optimization](#hype_opt)
+ [Out of Bag Error](#oob)
+ [Variable Importance](#var_imp)
+ [Test Set Metrics](#test_set_met)
+ [Conclusions](#concl)

**NOTE**: Original found [here](https://www.inertia7.com/projects/95) and repo [here](https://github.com/raviolli77/machineLearning_breastCancer_Python/tree/master/src/r)

# Introduction

Random forests, also known as random decision forests, are a popular ensemble method that can be used to build predictive models for both classification and regression problems. Ensemble methods use multiple learning models to gain better predictive results - in the case of a random forest, the model creates an entire forest of random uncorrelated decision trees to arrive at the best possible answer.

To demonstrate how this works in practice - specifically in a classification context - I'll be walking you through an example using a famous data set from the University of California, Irvine (UCI) Machine Learning Repository. The data set, called the Breast Cancer Wisconsin (Diagnostic) Data Set, deals with binary classification and includes features computed from digitized images of biopsies. The data set can be downloaded [here](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29).

To follow this tutorial, you will need some familiarity with classification and regression tree (CART) modeling. I will provide a brief overview of different CART methodologies that are relevant to random forest, beginning with decision trees.
If you'd like to brush up on your knowledge of CART modeling before beginning the tutorial, I highly recommend reading Chapter 8 of the book "An Introduction to Statistical Learning with Applications in R," which can be downloaded [here](http://www-bcf.usc.edu/~gareth/ISL/).

## Decision Trees

Decision trees are simple but intuitive models that utilize a top-down approach in which the root node creates binary splits until a certain criterion is met. This binary splitting of nodes provides a predicted value based on the interior nodes leading to the terminal (final) nodes. In a classification context, a decision tree will output a predicted target class for each terminal node produced.

Although intuitive, decision trees have limitations that reduce their usefulness in machine learning applications. You can learn more about implementing a decision tree [here](http://scikit-learn.org/stable/modules/tree.html).

### Limitations to Decision Trees

Decision trees tend to have high variance across different training and test sets drawn from the same data, since they tend to overfit the training data. This leads to poor performance on unseen data and limits the usage of single decision trees in predictive modeling. However, using ensemble methods, we can create models that utilize underlying decision trees as a foundation for producing powerful results.

## Bootstrap Aggregating Trees

Through a process known as bootstrap aggregating (or bagging), it's possible to create an ensemble (forest) of trees where multiple training sets are generated with replacement, meaning data instances - or in the case of this tutorial, patients - can be repeated. Once the training sets are created, a CART model can be trained on each subsample.

This approach helps reduce variance by averaging the ensemble's results, creating a majority-votes model. Another important feature of bagging trees is that the resulting model uses the entire feature space when considering node splits. Bagging also allows the trees to grow without pruning, so each individual tree can become deep; deep trees have low bias but high variance, and averaging across the ensemble is what brings the variance back down and helps improve predictive power.

However, a downside to this process is that utilizing the entire feature space creates a risk of correlation between trees, which limits how much variance the averaging can remove.

### Limitations to Bagging Trees

The main limitation of bagging trees is that every tree considers the entire feature space when creating splits. If a few variables within the feature space are strongly indicative of the target, you run the risk of having a forest of highly correlated trees, which undermines the variance reduction gained from averaging.

However, a simple tweak of the bagging trees methodology can prove advantageous to the model's predictive power.

## Random Forest

Random forest aims to reduce the previously mentioned correlation issue by choosing only a subsample of the feature space at each split. Essentially, it aims to de-correlate the trees, and to prune them by setting a stopping criterion for node splits, which I will cover in more detail later.
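To make the distinction concrete, here is a minimal sketch (not part of the original analysis) contrasting bagging with a random forest via the `mtry` argument of the `randomForest` package, which controls how many candidate features each split may consider. It references the `training_set` data frame created later in this tutorial, so it is shown with `eval=FALSE`.

```{r mtry_sketch, eval=FALSE}
p <- ncol(training_set) - 1  # number of predictor features

# Bagging: every split may consider all p features -> trees can be correlated
fit_bagged <- randomForest(as.factor(diagnosis) ~ .,
                           data = training_set,
                           mtry = p)

# Random forest: each split samples only ~sqrt(p) candidate features,
# de-correlating the trees
fit_forest <- randomForest(as.factor(diagnosis) ~ .,
                           data = training_set,
                           mtry = floor(sqrt(p)))
```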
# Load Packages

We load our packages into *RStudio*. In my case, I will be working in an *R Markdown* file.

```{r load_packages, message=FALSE}
suppressWarnings(library(tidyverse))
suppressWarnings(library(caret))
suppressWarnings(library(ggcorrplot))
suppressWarnings(library(GGally))
suppressWarnings(library(randomForest))
suppressWarnings(library(e1071))
suppressWarnings(library(ROCR))
suppressWarnings(library(pROC))
suppressWarnings(library(RCurl))
```

# Load Data

For this section, I'll pull the data straight from the UCI repository using the `RCurl` package, similar to the *Python* version. I do recommend keeping a static local copy of the data set as well.

Next, I create a vector with the appropriate names and set them as the column names once I load the data into a data frame.

```{r load_data}
UCI_data_URL <- getURL('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
names <- c('id_number', 'diagnosis', 'radius_mean',
           'texture_mean', 'perimeter_mean', 'area_mean',
           'smoothness_mean', 'compactness_mean',
           'concavity_mean','concave_points_mean',
           'symmetry_mean', 'fractal_dimension_mean',
           'radius_se', 'texture_se', 'perimeter_se',
           'area_se', 'smoothness_se', 'compactness_se',
           'concavity_se', 'concave_points_se',
           'symmetry_se', 'fractal_dimension_se',
           'radius_worst', 'texture_worst',
           'perimeter_worst', 'area_worst',
           'smoothness_worst', 'compactness_worst',
           'concavity_worst', 'concave_points_worst',
           'symmetry_worst', 'fractal_dimension_worst')
breast_cancer <- read.table(textConnection(UCI_data_URL), sep = ',', col.names = names)

breast_cancer$id_number <- NULL
```

Let's preview the data set utilizing the `head()` function, which returns the first 6 rows of our data frame.

```{r}
head(breast_cancer)
```

Next, we'll print the dimensions of the data set, where the first value is the number of patients and the second value is the number of features. We also print the data types of our columns; this is important because data types can be an indicator of miscoded or missing data, and they give us context for any further data cleaning.

```{r data_types}
breast_cancer %>%
  dim()
breast_cancer %>%
  str()
```

## Class Imbalance

The distribution of `diagnosis` is important because it brings up the discussion of *class imbalance* within machine learning and data mining applications. Class imbalance refers to when a target class within a data set is outnumbered by the other target class (or classes). This can lead to misleading accuracy metrics, known as the [accuracy paradox](https://en.wikipedia.org/wiki/Accuracy_paradox), so we have to make sure our target classes aren't imbalanced. We do so by outputting the distribution of the target classes.

**NOTE**: If your data set suffers from class imbalance, I suggest reading documentation on upsampling and downsampling; a minimal sketch follows the next chunk.

```{r class_imb}
breast_cancer %>%
  count(diagnosis) %>%
  mutate(perc_dx = round(n / sum(n) * 100, 2))
```

Fortunately, this data set does not suffer from *class imbalance*.
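Had the classes been imbalanced, one simple remedy is to resample the training data. Below is a purely illustrative sketch (not part of the original analysis) using `caret`'s built-in `upSample()` helper, which resamples the minority class with replacement until the classes have equal counts:

```{r upsample_sketch, eval=FALSE}
# Illustrative only -- our diagnosis classes are reasonably balanced
balanced <- upSample(x = breast_cancer %>% select(-diagnosis),
                     y = as.factor(breast_cancer$diagnosis),
                     yname = "diagnosis")

# Both classes now have the same number of rows
table(balanced$diagnosis)
```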
Next, we'll use the `summary()` function, which gives standard descriptive statistics for each feature, including the minimum, maximum, mean, and quartiles.

```{r describe}
summary(breast_cancer)
```

We can see from the minimum and maximum rows that the scale of our features varies considerably; this will be important when considering classification models. Standardization is a requirement for many classification models and should be considered when implementing pre-processing. Some models (like neural networks) can perform poorly if pre-processing isn't considered, so the `summary()` output can be a good indicator of whether standardization is needed. Fortunately, random forest does not require any pre-processing (for use of categorical data see [sklearn's Encoding Categorical Data section](http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)).

# Creating Training and Test Sets

We split the data set into our training and test sets, which will be (pseudo-)randomly selected with an 80-20% split. We will use the training set to train our model, along with some optimization, and use our test set as the unseen data that will give us a useful final metric of how well our model generalizes.

When using this method for machine learning, always be wary of utilizing your test set when creating models. The issue of data leakage is a serious one that is common in practice and can result in over-fitting. More on data leakage can be found in this [Kaggle article](https://www.kaggle.com/wiki/Leakage).

```{r create_train_test}
set.seed(42)
trainIndex <- createDataPartition(breast_cancer$diagnosis,
                                  p = .8,
                                  list = FALSE,
                                  times = 1)
training_set <- breast_cancer[ trainIndex, ]
test_set <- breast_cancer[ -trainIndex, ]
```

**NOTE**: What I mean when I say pseudo-random is that we want everyone who replicates this project to get the same results. So we set a random seed to a number of our choosing; this makes the results the same for anyone who uses the same seed, which is great for reproducibility.

# Fitting Random Forest

The *R* version differs here: with the `caret` package, *hyperparameter optimization* and *cross validation* are done in the same section as fitting the model. If you want a more in-depth look, check the *Python* version.

# Hyperparameters Optimization

Here we'll create a custom model to allow us to do a grid search, to see which parameter combination outputs the best model based on *accuracy*:

+ mtry: Number of features considered at each split
+ ntree: Number of trees in the forest
+ nodesize: Minimum size of terminal nodes

```{r}
# Custom grid search
# From https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/
customRF <- list(type = "Classification", library = "randomForest", loop = NULL)
customRF$parameters <- data.frame(parameter = c("mtry", "ntree", "nodesize"),
                                  class = rep("numeric", 3),
                                  label = c("mtry", "ntree", "nodesize"))
customRF$grid <- function(x, y, len = NULL, search = "grid") {}
customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
  randomForest(x, y, mtry = param$mtry, ntree = param$ntree, nodesize = param$nodesize, ...)
}
customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata)
customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata, type = "prob")
customRF$sort <- function(x) x[order(x[,1]),]
customRF$levels <- function(x) x$classes
```
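For context on why the custom model is needed, here is a minimal sketch (not part of the original analysis) of `caret`'s built-in random forest method, which exposes only `mtry` for tuning; the `customRF` model above is what lets us also search over `ntree` and `nodesize`.

```{r builtin_rf_sketch, eval=FALSE}
# caret's built-in 'rf' method tunes mtry only
fit_rf_builtin <- train(as.factor(diagnosis) ~ .,
                        data = training_set,
                        method = "rf",
                        metric = "Accuracy",
                        tuneLength = 3,
                        trControl = trainControl(method = "cv", number = 3))
```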
Now that we have the custom settings, we'll use the `train` method, which cross-validates and does a grid search, giving us the best parameters.

```{r fit_control}
fitControl <- trainControl(## 3-fold CV
                           method = "repeatedcv",
                           number = 3,
                           ## repeated ten times
                           repeats = 10)

grid <- expand.grid(.mtry = c(floor(sqrt(ncol(training_set))), (ncol(training_set) - 1), floor(log(ncol(training_set)))),
                    .ntree = c(100, 300, 500, 1000),
                    .nodesize = c(1:4))
set.seed(42)
fit_rf <- train(as.factor(diagnosis) ~ .,
                data = training_set,
                method = customRF,
                metric = "Accuracy",
                tuneGrid = grid,
                trControl = fitControl)
```

Let's print the best model chosen by the grid search, followed by the full tuning results and a plot of the search.

```{r}
fit_rf$finalModel
```

```{r fit_model, echo = FALSE}
fit_rf

suppressWarnings(ggplot(fit_rf) +
  theme_bw() +
  ggtitle('Line plot for Random Forest'))
```

# Variable Importance

Once we have trained the model, we are able to assess the concept of variable importance. A downside to creating ensemble methods with decision trees is that we lose the interpretability a single tree gives: a single tree can outline important node splits, along with the variables that were important at each split.

Fortunately, ensemble methods utilizing CART models use a metric to evaluate the homogeneity of splits, and when creating ensembles these metrics can be used to give insight into the variables that mattered most in training the model. Two metrics that are used are *Gini impurity* and *entropy*.

The two metrics vary, and from reading documentation online, many people favor *Gini impurity* due to the computational cost of *entropy*, since it requires calculating the logarithmic function. For more discussion I recommend reading this [article](https://github.com/rasbt/python-machine-learning-book/blob/master/faq/decision-tree-binary.md).

Here we define each metric:

$$Gini\ Impurity = 1 - \sum_i p_i^2$$

$$Entropy = -\sum_i p_i \log_2 p_i$$

where $p_i$ is defined as the proportion of samples that belong to target class $i$.

For the package `randomForest`, I believe the *Gini index* is used, without giving the choice of *information gain*.
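As a quick numerical check of the two formulas above, here is a minimal sketch (not part of the original analysis) computing both impurity measures for a vector of class proportions:

```{r impurity_sketch, eval=FALSE}
# p is a vector of class proportions that must sum to 1
gini_impurity <- function(p) 1 - sum(p^2)
entropy <- function(p) -sum(p * log2(p))

# A 50/50 split is maximally impure for a binary target
gini_impurity(c(0.5, 0.5))  # 0.5
entropy(c(0.5, 0.5))        # 1

# A 90/10 split is much purer, so both measures drop
gini_impurity(c(0.9, 0.1))  # 0.18
entropy(c(0.9, 0.1))        # ~0.469
```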

```{r}
varImportance <- varImp(fit_rf, scale = FALSE)

varImportanceScores <- data.frame(varImportance$importance)

varImportanceScores <- data.frame(names = row.names(varImportanceScores),
                                  var_imp_scores = varImportanceScores$B)

varImportanceScores
```

## Visual Representation

```{r}
ggplot(varImportanceScores,
       aes(reorder(names, var_imp_scores), var_imp_scores)) +
  geom_bar(stat='identity',
           fill = '#875FDB') +
  theme(panel.background = element_rect(fill = '#fafafa')) +
  coord_flip() +
  labs(x = 'Feature', y = 'Importance') +
  ggtitle('Feature Importance for Random Forest Model')
```

# Out of Bag Error Rate

Another useful feature of random forest is the concept of the Out of Bag (OOB) error rate. When creating the forest, each tree is typically trained on roughly 2/3 of the data (its bootstrap sample), which leaves about 1/3 of the data unseen by that tree and available for evaluation.

```{r}
oob_error <- data.frame(trees = 1:100,
                        oob = fit_rf$finalModel$err.rate[, 'OOB'])

paste0('Out of Bag Error Rate for model is: ', round(oob_error[100, 2], 4))

ggplot(oob_error, aes(trees, oob)) +
  geom_line(colour = 'red') +
  theme_minimal() +
  ggtitle('OOB Error Rate across 100 trees') +
  labs(x = 'Number of Trees', y = 'OOB Error Rate')
```

# Test Set Metrics

Now we will utilize the test set that was created earlier to get another metric for evaluating our model. Recall the importance of avoiding data leakage: we didn't touch the test set until now, after hyperparameter optimization was done.

```{r}
predict_values <- predict(fit_rf, newdata = test_set)
```

```{r}
ftable(predict_values, test_set$diagnosis)

paste0('Test error rate is: ', round(mean(predict_values != test_set$diagnosis), 4))
```

# Conclusions

For this tutorial we went through a number of metrics to assess the capabilities of our random forest, but this can be taken further when using background information about the data set. Feature engineering would be a powerful tool for extracting information and moving forward into research regarding the important features, as would defining key metrics to utilize when optimizing model parameters.

There have been advancements with image classification in the past decade that utilize the images instead of features extracted from the images, but this data set is a great resource for becoming familiar with machine learning processes, especially for those who are just beginning to learn machine learning concepts. If you have any suggestions, recommendations, or corrections, please reach out to me.
306 | -------------------------------------------------------------------------------- /src/pyspark/breast_cancer_zeppelin_notebook.json: -------------------------------------------------------------------------------- 1 | {"paragraphs":[{"text":"%pyspark\nfrom pyspark.sql.functions import col\nfrom pyspark.ml.classification import RandomForestClassifier\nfrom pyspark.ml.classification import DecisionTreeClassifier\nfrom pyspark.ml.classification import MultilayerPerceptronClassifier\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.ml.feature import MinMaxScaler\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.evaluation import MulticlassClassificationEvaluator\nfrom numpy import array","user":"anonymous","dateUpdated":"2017-04-23T13:19:16-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411274_576758133","id":"20170423-055816_313551939","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T13:19:16-0700","dateFinished":"2017-04-23T13:20:00-0700","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:234"},{"text":"%pyspark\n#data = sc.textFile('s3://dc-sparkzeppelin/BreastCancerData.txt').map(lambda lines: lines.split(\" \"))\n# Or you can wget from command line on my GitHub account from terminal line on Ubuntu\ndata = sc.textFile('/home/rxe/myProjects/dataScience/breastCancer/data.txt').map(lambda lines: lines.split(\" \"))\n","user":"anonymous","dateUpdated":"2017-04-23T13:23:11-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411280_585222609","id":"20170423-055905_540096736","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T13:23:11-0700","dateFinished":"2017-04-23T13:23:11-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:235"},{"text":"%pyspark\ndata","user":"anonymous","dateUpdated":"2017-04-23T13:23:16-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"PythonRDD[4] at RDD at PythonRDD.scala:48\n"}]},"apps":[],"jobName":"paragraph_1492935411282_585992107","id":"20170423-055959_1651811836","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T13:23:16-0700","dateFinished":"2017-04-23T13:23:16-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:236"},{"text":"%pyspark\ndata.collect()","user":"anonymous","dateUpdated":"2017-04-23T13:23:19-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"Traceback (most recent call last):\n File \"/tmp/zeppelin_pyspark-7364956677326941508.py\", line 349, in \n raise Exception(traceback.format_exc())\nException: Traceback (most recent call last):\n File \"/tmp/zeppelin_pyspark-7364956677326941508.py\", line 342, in \n exec(code)\n File \"\", line 1, in \n File \"/usr/lib/spark/python/pyspark/rdd.py\", 
line 809, in collect\n port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())\n File \"/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py\", line 1133, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File \"/usr/lib/spark/python/pyspark/sql/utils.py\", line 63, in deco\n return f(*a, **kw)\n File \"/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py\", line 319, in get_return_value\n format(target_id, \".\", name), value)\nPy4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.\n: java.io.IOException: No FileSystem for scheme: https\n\tat org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)\n\tat org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)\n\tat org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)\n\tat org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)\n\tat org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)\n\tat org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)\n\tat org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)\n\tat org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:258)\n\tat org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)\n\tat org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)\n\tat org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:53)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:362)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:934)\n\tat org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)\n\tat org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat 
py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\n\n\n"}]},"apps":[],"jobName":"paragraph_1492935411283_585607358","id":"20170423-060007_642256882","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T13:23:19-0700","dateFinished":"2017-04-23T13:23:19-0700","status":"ERROR","progressUpdateIntervalMs":500,"$$hashKey":"object:237"},{"text":"%pyspark\ndf = data.toDF()","user":"anonymous","dateUpdated":"2017-04-23T01:22:43-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411300_565215666","id":"20170423-060132_205489433","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T01:22:43-0700","dateFinished":"2017-04-23T01:22:44-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:238"},{"text":"%pyspark\ndf.printSchema()","user":"anonymous","dateUpdated":"2017-04-23T01:22:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"root\n |-- _1: string (nullable = true)\n |-- _2: string (nullable = true)\n |-- _3: string (nullable = true)\n |-- _4: string (nullable = true)\n |-- _5: string (nullable = true)\n |-- _6: string (nullable = true)\n |-- _7: string (nullable = true)\n |-- _8: string (nullable = true)\n |-- _9: string (nullable = true)\n |-- _10: string (nullable = true)\n |-- _11: string (nullable = true)\n |-- _12: string (nullable = true)\n |-- _13: string (nullable = true)\n |-- _14: string (nullable = true)\n |-- _15: string (nullable = true)\n |-- _16: string (nullable = true)\n |-- _17: string (nullable = true)\n |-- _18: string (nullable = true)\n |-- _19: string (nullable = true)\n |-- _20: string (nullable = true)\n |-- _21: string (nullable = true)\n |-- _22: string (nullable = true)\n |-- _23: string (nullable = true)\n |-- _24: string (nullable = true)\n |-- _25: string (nullable = true)\n |-- _26: string (nullable = true)\n |-- _27: string (nullable = true)\n |-- _28: string (nullable = true)\n |-- _29: string (nullable = true)\n |-- _30: string (nullable = true)\n |-- _31: string (nullable = true)\n\n"}]},"apps":[],"jobName":"paragraph_1492935411301_564830918","id":"20170423-060400_284199338","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T01:22:51-0700","dateFinished":"2017-04-23T01:22:52-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:239"},{"text":"%pyspark\ndata = df.selectExpr('_1 as label', '_2 as radius_mean', \n\t'_3 as texture_mean', '_4 as perimeter_mean', \n\t'_5 as area_mean', '_6 as smoothness_mean', \n\t'_7 as compactness_mean', '_8 as concavity_mean', \n\t'_9 as concave_points_mean', '_10 as symmetry_mean', \n\t'_11 as fractal_dimension_mean', '_12 as radius_se', \n\t'_13 as texture_se', '_14 as perimeter_se', \n\t'_15 as area_se', '_16 as smoothness_se', \n\t'_17 as compactness_se', '_18 as concavity_se', \n\t'_19 as concave_points_se', '_20 as symmetry_se', \n\t'_21 as fractal_dimension_se', '_22 as radius_worst', \n\t'_23 as texture_worst', '_24 as perimeter_worst', \n\t'_25 as area_worst', '_26 as smoothness_worst', \n\t'_27 as compactness_worst', '_28 as 
concavity_worst', \n\t'_29 as concave_points_worst', '_30 as symmetry_worst', \n\t'_31 as fractal_dimension_worst')\n\t\ndata.registerTempTable(\"data\")","user":"anonymous","dateUpdated":"2017-04-23T01:24:18-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411302_565985164","id":"20170423-060413_1633001850","dateCreated":"2017-04-23T01:16:51-0700","dateStarted":"2017-04-23T01:24:18-0700","dateFinished":"2017-04-23T01:24:19-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:240"},{"text":"%sql\nSELECT concave_points_worst, count(*) from data group by concave_points_worst ","user":"anonymous","dateUpdated":"2017-04-23T01:30:16-0700","config":{"colWidth":12,"enabled":true,"results":{"0":{"graph":{"mode":"scatterChart","height":300,"optionOpen":false,"setting":{"multiBarChart":{"stacked":false}},"commonSetting":{},"keys":[{"name":"compactness_se","index":0,"aggr":"sum"}],"groups":[],"values":[{"name":"fractal_dimension_mean","index":1,"aggr":"sum"}]},"helium":{}}},"editorSetting":{"language":"sql","editOnDblClick":false},"editorMode":"ace/mode/sql"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TABLE","data":"concave_points_worst\tcount(1)\n0.2688\t1\n0.1974\t1\n0.151\t1\n0.2088\t1\n0.1357\t2\n0.1112\t1\n0.06499\t1\n0.08178\t1\n0.2393\t1\n0.09993\t1\n0.04866\t1\n0.2508\t1\n0.1541\t1\n0.1226\t1\n0.06189\t1\n0.05356\t1\n0.1447\t1\n0.2903\t1\n0.1021\t1\n0.09514\t1\n0.04464\t1\n0.06876\t1\n0.2148\t1\n0.1999\t1\n0.1776\t1\n0.05052\t1\n0.08288\t1\n0.08333\t1\n0.09186\t1\n0.153\t1\n0.02564\t3\n0.2102\t1\n0.009259\t1\n0.06544\t1\n0.1205\t2\n0.2462\t1\n0.1674\t1\n0.0812\t2\n0.1492\t1\n0.2034\t1\n0.08312\t1\n0.107\t1\n0.1474\t1\n0.04773\t1\n0.1015\t2\n0.05366\t1\n0.2575\t1\n0.1565\t1\n0.2593\t1\n0.1362\t1\n0.0829\t1\n0.1048\t1\n0.06575\t1\n0.05601\t1\n0.09123\t1\n0.2208\t1\n0.06413\t1\n0.1418\t1\n0.1258\t1\n0.1218\t3\n0.1599\t1\n0.0497\t1\n0.1515\t1\n0.1561\t1\n0.1825\t1\n0.1225\t1\n0.1329\t1\n0.01042\t1\n0.192\t1\n0.05159\t1\n0.09861\t1\n0.2066\t1\n0.2432\t1\n0.06222\t1\n0.1708\t3\n0.08485\t1\n0.09314\t1\n0.1765\t1\n0.06961\t1\n0.1573\t1\n0\t13\n0.2356\t1\n0.07879\t1\n0.09678\t1\n0.03002\t1\n0.1526\t1\n0.05781\t1\n0.1775\t1\n0.02778\t1\n0.04815\t2\n0.0866\t1\n0.105\t1\n0.05093\t1\n0.06835\t1\n0.06913\t1\n0.1282\t1\n0.09181\t1\n0.08829\t1\n0.0753\t1\n0.112\t1\n0.1628\t1\n0.09173\t1\n0.1108\t1\n0.2013\t1\n0.09222\t1\n0.03125\t1\n0.02784\t1\n0.03571\t1\n0.1659\t1\n0.1181\t1\n0.175\t1\n0.1642\t1\n0.03846\t2\n0.2009\t1\n0.1105\t3\n0.1087\t1\n0.03922\t1\n0.197\t1\n0.265\t1\n0.06227\t1\n0.1312\t1\n0.05556\t3\n0.1932\t1\n0.0991\t1\n0.05602\t1\n0.09851\t1\n0.05104\t1\n0.1977\t1\n0.1535\t1\n0.1789\t1\n0.1841\t2\n0.07966\t1\n0.1155\t2\n0.1964\t1\n0.1607\t1\n0.1425\t1\n0.04074\t1\n0.1045\t1\n0.2095\t1\n0.2247\t1\n0.09608\t1\n0.06754\t1\n0.06127\t1\n0.086\t1\n0.1407\t2\n0.1147\t1\n0.1813\t1\n0.08611\t1\n0.1667\t1\n0.06608\t1\n0.228\t1\n0.1476\t1\n0.1035\t1\n0.1335\t1\n0.08542\t1\n0.08278\t1\n0.07971\t1\n0.2089\t1\n0.1466\t1\n0.04195\t1\n0.02579\t2\n0.05754\t1\n0.1216\t1\n0.08978\t1\n0.06019\t1\n0.08272\t1\n0.1609\t1\n0.1777\t1\n0.2507\t1\n0.1712\t1\n0.03715\t1\n0.1479\t1\n0.04052\t1\n0.1556\t1\n0.2035\t1\n0.1563\t1\n0.08088\t1\n0.04537\t1\n0.1864\t1\n0.06517\t1\n0.07632\t1\n0.07025\t1\n0.06402\t1\n0.1583\t1\n0.1414\t1\n0.08704\t1\n0.2258\t1\n0.16
54\t1\n0.05185\t1\n0.1092\t1\n0.08296\t1\n0.05575\t1\n0.2756\t1\n0.05802\t1\n0.06548\t2\n0.09391\t1\n0.05813\t1\n0.2542\t1\n0.1827\t3\n0.08219\t1\n0.05741\t1\n0.2475\t2\n0.1095\t1\n0.1555\t2\n0.03264\t1\n0.0399\t1\n0.08187\t1\n0.06042\t1\n0.06266\t1\n0.08263\t1\n0.1456\t1\n0.1847\t1\n0.1614\t1\n0.07625\t1\n0.02832\t2\n0.08308\t1\n0.05087\t1\n0.1251\t2\n0.2135\t1\n0.06987\t2\n0.2422\t1\n0.07485\t1\n0.05547\t1\n0.08512\t1\n0.108\t1\n0.03203\t1\n0.1252\t1\n0.1981\t1\n0.08235\t2\n0.1739\t1\n0.07174\t1\n0.025\t2\n0.09077\t1\n0.08476\t1\n0.03312\t1\n0.03413\t1\n0.01635\t1\n0.07453\t1\n0.06918\t1\n0.08958\t1\n0.06528\t2\n0.0268\t1\n0.1741\t1\n0.08405\t1\n0.06203\t1\n0.1047\t1\n0.05\t1\n0.2388\t1\n0.07911\t2\n0.01389\t1\n0.1514\t1\n0.1069\t1\n0.07887\t1\n0.05013\t1\n0.07958\t1\n0.2216\t1\n0.08586\t1\n0.1012\t1\n0.07763\t1\n0.05334\t1\n0.2252\t1\n0.09815\t1\n0.1872\t1\n0.02222\t1\n0.0815\t1\n0.2163\t1\n0.09127\t1\n0.06968\t1\n0.1716\t1\n0.152\t1\n0.291\t1\n0.09331\t1\n0.01667\t1\n0.1834\t1\n0.0716\t1\n0.04786\t1\n0.0221\t1\n0.09532\t1\n0.06384\t1\n0.2733\t1\n0.1099\t2\n0.04766\t1\n0.1613\t2\n0.1528\t1\n0.1136\t1\n0.1923\t1\n0.2685\t1\n0.0377\t1\n0.01111\t2\n0.1018\t1\n0.1119\t1\n0.1221\t1\n0.06343\t1\n0.1767\t1\n0.2073\t1\n0.2493\t1\n0.1325\t1\n0.0578\t1\n0.09744\t1\n0.2173\t1\n0.1284\t1\n0.101\t1\n0.0585\t1\n0.1054\t1\n0.09653\t1\n0.2524\t1\n0.08388\t1\n0.1505\t1\n0.03953\t1\n0.1339\t1\n0.1318\t1\n0.04044\t1\n0.1202\t1\n0.0656\t1\n0.1465\t1\n0.06696\t1\n0.0569\t1\n0.07909\t1\n0.1546\t1\n0.02022\t1\n0.2867\t1\n0.1865\t1\n0.1673\t1\n0.02083\t1\n0.1075\t2\n0.04762\t1\n0.04603\t1\n0.1857\t1\n0.1424\t1\n0.08436\t1\n0.0931\t1\n0.149\t1\n0.01852\t1\n0.1625\t2\n0.181\t1\n0.05509\t1\n0.03532\t1\n0.1521\t1\n0.2701\t1\n0.1096\t2\n0.09804\t1\n0.1056\t2\n0.255\t1\n0.1001\t1\n0.116\t1\n0.1838\t1\n0.08056\t1\n0.03983\t1\n0.2121\t1\n0.05563\t1\n0.2113\t1\n0.08216\t1\n0.2264\t1\n0.186\t1\n0.1452\t1\n0.1966\t1\n0.1129\t1\n0.1053\t1\n0.221\t1\n0.2014\t1\n0.1697\t1\n0.08411\t2\n0.08737\t1\n0.2248\t2\n0.06664\t1\n0.2543\t1\n0.06136\t2\n0.08224\t1\n0.2378\t1\n0.1397\t1\n0.1025\t1\n0.05882\t2\n0.05614\t1\n0.05588\t1\n0.206\t2\n0.08194\t1\n0.08442\t1\n0.1379\t2\n0.04793\t1\n0.1342\t1\n0.07222\t1\n0.07116\t1\n0.1383\t1\n0.07926\t2\n0.07247\t1\n0.1772\t2\n0.09975\t1\n0.1848\t1\n0.1374\t2\n0.243\t1\n0.07283\t1\n0.2105\t1\n0.08946\t1\n0.08698\t1\n0.1184\t1\n0.1564\t1\n0.1294\t1\n0.07955\t1\n0.1941\t1\n0.2229\t1\n0.03194\t1\n0.1899\t1\n0.08568\t1\n0.02381\t2\n0.0914\t2\n0.0589\t2\n0.02796\t1\n0.09594\t1\n0.1996\t1\n0.02232\t1\n0.06316\t1\n0.1459\t1\n0.09858\t1\n0.06493\t1\n0.07431\t3\n0.1986\t1\n0.07963\t1\n0.1956\t1\n0.0875\t2\n0.09783\t1\n0.008772\t1\n0.2115\t1\n0.07393\t1\n0.07407\t1\n0.2027\t1\n0.04419\t1\n0.2346\t1\n0.1721\t1\n0.1427\t1\n0.1785\t2\n0.04262\t1\n0.06005\t1\n0.06296\t3\n0.1445\t1\n0.2048\t1\n0.1939\t1\n0.118\t1\n0.03612\t1\n0.2024\t1\n0.1185\t1\n0.05506\t1\n0.1308\t2\n0.07828\t1\n0.03333\t1\n0.1423\t1\n0.1984\t1\n0.07864\t1\n0.1416\t1\n0.06106\t1\n0.1359\t1\n0.1453\t1\n0.2091\t1\n0.02899\t1\n0.1571\t2\n0.04589\t1\n0.09265\t1\n0.08341\t1\n0.06946\t1\n0.182\t2\n0.04306\t3\n0.05921\t1\n0.2625\t1\n0.2654\t1\n0.08211\t1\n0.1595\t1\n0.09749\t1\n0.06845\t1\n0.1126\t1\n0.07262\t1\n0.1727\t1\n0.04715\t1\n0.1288\t1\n0.198\t1\n0.2112\t1\n0.08045\t1\n0.1732\t2\n0.1145\t1\n0.09722\t1\n0.0909\t1\n0.2051\t1\n0.2134\t1\n0.1138\t1\n0.1663\t1\n0.06498\t1\n0.06736\t1\n0.2152\t1\n0.08449\t1\n0.1489\t2\n0.2255\t1\n0.0737\t1\n0.1017\t2\n0.06335\t1\n"}]},"apps":[],"jobName":"paragraph_1492935785861_1135117444","id":"20170423-01230
5_1726819658","dateCreated":"2017-04-23T01:23:05-0700","dateStarted":"2017-04-23T01:30:06-0700","dateFinished":"2017-04-23T01:30:09-0700","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:241"},{"text":"%pyspark\ndata.select('area_worst').show()","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+----------+\n|area_worst|\n+----------+\n| 2019|\n| 1956|\n| 1709|\n| 567.7|\n| 1575|\n| 741.6|\n| 1606|\n| 897|\n| 739.3|\n| 711.4|\n| 1150|\n| 1299|\n| 1332|\n| 876.5|\n| 697.7|\n| 943.2|\n| 1138|\n| 1315|\n| 2398|\n| 711.2|\n+----------+\nonly showing top 20 rows\n\n"}]},"apps":[],"jobName":"paragraph_1492935411303_565600415","id":"20170423-060628_2087116257","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:242"},{"text":"%pyspark\nnewData = data.select([col(c).cast('float') if c != 'label' else col(c).cast('int') for c in data.columns ])","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411304_563676671","id":"20170423-060642_2014350977","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:243"},{"text":"%pyspark\nnewData.printSchema()","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"root\n |-- label: integer (nullable = true)\n |-- radius_mean: float (nullable = true)\n |-- texture_mean: float (nullable = true)\n |-- perimeter_mean: float (nullable = true)\n |-- area_mean: float (nullable = true)\n |-- smoothness_mean: float (nullable = true)\n |-- compactness_mean: float (nullable = true)\n |-- concavity_mean: float (nullable = true)\n |-- concave_points_mean: float (nullable = true)\n |-- symmetry_mean: float (nullable = true)\n |-- fractal_dimension_mean: float (nullable = true)\n |-- radius_se: float (nullable = true)\n |-- texture_se: float (nullable = true)\n |-- perimeter_se: float (nullable = true)\n |-- area_se: float (nullable = true)\n |-- smoothness_se: float (nullable = true)\n |-- compactness_se: float (nullable = true)\n |-- concavity_se: float (nullable = true)\n |-- concave_points_se: float (nullable = true)\n |-- symmetry_se: float (nullable = true)\n |-- fractal_dimension_se: float (nullable = true)\n |-- radius_worst: float (nullable = true)\n |-- texture_worst: float (nullable = true)\n |-- perimeter_worst: float (nullable = true)\n |-- area_worst: float (nullable = true)\n |-- smoothness_worst: float (nullable = true)\n |-- compactness_worst: float (nullable = true)\n |-- concavity_worst: float (nullable = true)\n |-- concave_points_worst: float (nullable = true)\n |-- symmetry_worst: float (nullable = true)\n |-- fractal_dimension_worst: float (nullable = 
true)\n\n"}]},"apps":[],"jobName":"paragraph_1492935411305_563291922","id":"20170423-060806_859673761","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:244"},{"text":"%pyspark\nmylist = []\nmylist2 = []\nfor i in range(0, 31):\n if (i % 2 != 0):\n \tmylist.append(newData.columns[i])\n else:\n \tmylist2.append(newData.columns[i])","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411306_564446169","id":"20170423-060815_2125440954","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:245"},{"text":"%pyspark\nfor i in range(0, 15): \t\n\tnewData.describe(mylist[i], mylist2[i]).show()","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+-------+------------------+-------------------+\n|summary| radius_mean| label|\n+-------+------------------+-------------------+\n| count| 569| 569|\n| mean|14.127291743072348|0.37258347978910367|\n| stddev|3.5240488129671963|0.48391795640316865|\n| min| 6.981| 0|\n| max| 28.11| 1|\n+-------+------------------+-------------------+\n\n+-------+------------------+------------------+\n|summary| perimeter_mean| texture_mean|\n+-------+------------------+------------------+\n| count| 569| 569|\n| mean| 91.96903329993384|19.289648528677297|\n| stddev|24.298980946187065| 4.301035792275386|\n| min| 43.79| 9.71|\n| max| 188.5| 39.28|\n+-------+------------------+------------------+\n\n+-------+--------------------+------------------+\n|summary| smoothness_mean| area_mean|\n+-------+--------------------+------------------+\n| count| 569| 569|\n| mean| 0.09636028129312821| 654.889103814043|\n| stddev|0.014064128011679857|351.91412886139733|\n| min| 0.05263| 143.5|\n| max| 0.1634| 2501.0|\n+-------+--------------------+------------------+\n\n+-------+-------------------+-------------------+\n|summary| concavity_mean| compactness_mean|\n+-------+-------------------+-------------------+\n| count| 569| 569|\n| mean|0.08879931578830029|0.10434098429781481|\n| stddev|0.07971980885275735|0.05281275807458228|\n| min| 0.0| 0.01938|\n| max| 0.4268| 0.3454|\n+-------+-------------------+-------------------+\n\n+-------+-------------------+-------------------+\n|summary| symmetry_mean|concave_points_mean|\n+-------+-------------------+-------------------+\n| count| 569| 569|\n| mean|0.18116186307792295|0.04891914597230428|\n| stddev|0.02741428169736473|0.03880284499915188|\n| min| 0.106| 0.0|\n| max| 0.304| 0.2012|\n+-------+-------------------+-------------------+\n\n+-------+-------------------+----------------------+\n|summary| radius_se|fractal_dimension_mean|\n+-------+-------------------+----------------------+\n| count| 569| 569|\n| mean|0.40517205624515434| 0.06279760972974799|\n| stddev|0.27731273103393916| 0.00706036285946223|\n| min| 0.1115| 0.04996|\n| max| 2.873| 0.09744|\n+-------+-------------------+----------------------+\n\n+-------+------------------+------------------+\n|summary| perimeter_se| 
texture_se|\n+-------+------------------+------------------+\n| count| 569| 569|\n| mean|2.8660592201095474|1.2168534254566856|\n| stddev| 2.021854536795029|0.5516483938812107|\n| min| 0.757| 0.3602|\n| max| 21.98| 4.885|\n+-------+------------------+------------------+\n\n+-------+--------------------+-----------------+\n|summary| smoothness_se| area_se|\n+-------+--------------------+-----------------+\n| count| 569| 569|\n| mean|0.007040978908565007|40.33707911519887|\n| stddev|0.003002517919151...|45.49100533347044|\n| min| 0.001713| 6.802|\n| max| 0.03113| 542.2|\n+-------+--------------------+-----------------+\n\n+-------+-------------------+--------------------+\n|summary| concavity_se| compactness_se|\n+-------+-------------------+--------------------+\n| count| 569| 569|\n| mean|0.03189371635352535|0.025478138811378913|\n| stddev| 0.0301860601467103| 0.01790817919899339|\n| min| 0.0| 0.002252|\n| max| 0.396| 0.1354|\n+-------+-------------------+--------------------+\n\n+-------+--------------------+--------------------+\n|summary| symmetry_se| concave_points_se|\n+-------+--------------------+--------------------+\n| count| 569| 569|\n| mean|0.020542298759512197|0.011796137079660353|\n| stddev|0.008266371517617574|0.006170285165756808|\n| min| 0.007882| 0.0|\n| max| 0.07895| 0.05279|\n+-------+--------------------+--------------------+\n\n+-------+------------------+--------------------+\n|summary| radius_worst|fractal_dimension_se|\n+-------+------------------+--------------------+\n| count| 569| 569|\n| mean|16.269189776770887|0.003794903873493...|\n| stddev| 4.833241591272437|0.002646070973950...|\n| min| 7.93| 8.948E-4|\n| max| 36.04| 0.02984|\n+-------+------------------+--------------------+\n\n+-------+------------------+-----------------+\n|summary| perimeter_worst| texture_worst|\n+-------+------------------+-----------------+\n| count| 569| 569|\n| mean|107.26121279644421|25.67722316534113|\n| stddev| 33.60254226450891|6.146257611231103|\n| min| 50.41| 12.02|\n| max| 251.2| 49.54|\n+-------+------------------+-----------------+\n\n+-------+-------------------+-----------------+\n|summary| smoothness_worst| area_worst|\n+-------+-------------------+-----------------+\n| count| 569| 569|\n| mean|0.13236859435565862|880.5831290514901|\n| stddev|0.02283242955918711|569.3569923849645|\n| min| 0.07117| 185.2|\n| max| 0.2226| 4254.0|\n+-------+-------------------+-----------------+\n\n+-------+-------------------+-------------------+\n|summary| concavity_worst| compactness_worst|\n+-------+-------------------+-------------------+\n| count| 569| 569|\n| mean| 0.2721884833807977|0.25426504394016597|\n| stddev|0.20862428007810732|0.15733648854662943|\n| min| 0.0| 0.02729|\n| max| 1.252| 1.058|\n+-------+-------------------+-------------------+\n\n+-------+--------------------+--------------------+\n|summary| symmetry_worst|concave_points_worst|\n+-------+--------------------+--------------------+\n| count| 569| 569|\n| mean| 0.2900755708948799| 0.11460622294146325|\n| stddev|0.061867468184841665| 0.06573234105890068|\n| min| 0.1565| 0.0|\n| max| 0.6638| 0.291|\n+-------+--------------------+--------------------+\n\n"}]},"apps":[],"jobName":"paragraph_1492935411307_564061420","id":"20170423-060912_430726443","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:246"},{"text":"%pyspark\nfeatureIndexer = VectorAssembler(\n\tinputCols = [x for x in newData.columns if x != 'label'],\n\toutputCol = 
'features')","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411309_561752926","id":"20170423-060931_567115405","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:247"},{"text":"%pyspark\ndf = featureIndexer.transform(newData)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411310_562907173","id":"20170423-061113_215972431","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:248"},{"text":"%pyspark\ndf.select(df['features']).show(50)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+--------------------+\n| features|\n+--------------------+\n|[17.9899997711181...|\n|[20.5699996948242...|\n|[19.6900005340576...|\n|[11.4200000762939...|\n|[20.2900009155273...|\n|[12.4499998092651...|\n|[18.25,19.9799995...|\n|[13.7100000381469...|\n|[13.0,21.81999969...|\n|[12.4600000381469...|\n|[16.0200004577636...|\n|[15.7799997329711...|\n|[19.1700000762939...|\n|[15.8500003814697...|\n|[13.7299995422363...|\n|[14.5399999618530...|\n|[14.6800003051757...|\n|[16.1299991607666...|\n|[19.8099994659423...|\n|[13.5399999618530...|\n|[13.0799999237060...|\n|[9.50399971008300...|\n|[15.3400001525878...|\n|[21.1599998474121...|\n|[16.6499996185302...|\n|[17.1399993896484...|\n|[14.5799999237060...|\n|[18.6100006103515...|\n|[15.3000001907348...|\n|[17.5699996948242...|\n|[18.6299991607666...|\n|[11.8400001525878...|\n|[17.0200004577636...|\n|[19.2700004577636...|\n|[16.1299991607666...|\n|[16.7399997711181...|\n|[14.25,21.7199993...|\n|[13.0299997329711...|\n|[14.9899997711181...|\n|[13.4799995422363...|\n|[13.4399995803833...|\n|[10.9499998092651...|\n|[19.0699996948242...|\n|[13.2799997329711...|\n|[13.1700000762939...|\n|[18.6499996185302...|\n|[8.19600009918212...|\n|[13.1700000762939...|\n|[12.0500001907348...|\n|[13.4899997711181...|\n+--------------------+\nonly showing top 50 rows\n\n"}]},"apps":[],"jobName":"paragraph_1492935411311_562522424","id":"20170423-061125_1153980412","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:249"},{"text":"%pyspark\ndf.select(df['label']).show()","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+-----+\n|label|\n+-----+\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 1|\n| 0|\n+-----+\nonly showing top 20 
rows\n\n"}]},"apps":[],"jobName":"paragraph_1492935411311_562522424","id":"20170423-061137_482776558","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:250"},{"text":"%pyspark\n(trainingSet, testSet) = df.randomSplit([0.7, 0.3])","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411312_572910644","id":"20170423-061217_1543778442","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:251"},{"text":"%pyspark\ndt = DecisionTreeClassifier(labelCol=\"label\",\n\tfeaturesCol = \"features\",\n\tseed=42)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411313_572525896","id":"20170423-061232_579938062","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:252"},{"text":"%pyspark\nmodel_dt = dt.fit(trainingSet)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411314_573680142","id":"20170423-061345_1810525098","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:253"},{"text":"%pyspark\npredictions_dt = model_dt.transform(testSet)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411314_573680142","id":"20170423-061355_440006592","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:254"},{"text":"%pyspark\npredictions_dt.select(\"prediction\", \n\t\"label\", \n\t\"features\").show(50)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+----------+-----+--------------------+\n|prediction|label| features|\n+----------+-----+--------------------+\n| 0.0| 0|[8.21899986267089...|\n| 0.0| 0|[8.67099952697753...|\n| 0.0| 0|[8.72599983215332...|\n| 0.0| 0|[9.56700038909912...|\n| 0.0| 0|[9.87600040435791...|\n| 0.0| 0|[10.0799999237060...|\n| 0.0| 0|[11.2899999618530...|\n| 0.0| 0|[11.3400001525878...|\n| 0.0| 0|[11.4099998474121...|\n| 0.0| 0|[11.4099998474121...|\n| 0.0| 0|[11.4300003051757...|\n| 0.0| 0|[11.4300003051757...|\n| 0.0| 0|[11.5200004577636...|\n| 0.0| 0|[11.6400003433227...|\n| 0.0| 0|[11.7100000381469...|\n| 0.0| 0|[11.8400001525878...|\n| 0.0| 0|[11.8900003433227...|\n| 0.0| 0|[12.2299995422363...|\n| 0.0| 
0|[12.6300001144409...|\n| 0.0| 0|[12.8100004196167...|\n| 0.0| 0|[12.8699998855590...|\n| 0.0| 0|[13.2700004577636...|\n| 0.0| 0|[13.5399999618530...|\n| 0.0| 0|[13.5900001525878...|\n| 0.0| 0|[13.6499996185302...|\n| 0.0| 0|[13.7399997711181...|\n| 0.0| 0|[13.8500003814697...|\n| 0.0| 0|[14.2899999618530...|\n| 0.0| 0|[14.6400003433227...|\n| 1.0| 0|[16.8400001525878...|\n| 0.0| 1|[10.9499998092651...|\n| 1.0| 1|[12.4600000381469...|\n| 1.0| 1|[13.1700000762939...|\n| 0.0| 1|[13.4399995803833...|\n| 1.0| 1|[13.6099996566772...|\n| 1.0| 1|[13.6099996566772...|\n| 0.0| 1|[13.8000001907348...|\n| 0.0| 1|[13.9600000381469...|\n| 0.0| 1|[14.25,21.7199993...|\n| 1.0| 1|[14.25,22.1499996...|\n| 0.0| 1|[14.4799995422363...|\n| 1.0| 1|[14.6800003051757...|\n| 1.0| 1|[14.7100000381469...|\n| 1.0| 1|[15.0600004196167...|\n| 1.0| 1|[15.1000003814697...|\n| 1.0| 1|[15.3199996948242...|\n| 0.0| 1|[15.3400001525878...|\n| 0.0| 1|[15.4600000381469...|\n| 1.0| 1|[15.5299997329711...|\n| 1.0| 1|[15.75,20.25,102....|\n+----------+-----+--------------------+\nonly showing top 50 rows\n\n"}]},"apps":[],"jobName":"paragraph_1492935411315_573295393","id":"20170423-061413_479505104","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:255"},{"text":"%pyspark\nevaluator_dt = MulticlassClassificationEvaluator(\n labelCol=\"label\", \n predictionCol=\"prediction\", \n metricName=\"accuracy\")","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411316_571371649","id":"20170423-061428_1446177306","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:256"},{"text":"%pyspark\naccuracy_dt = evaluator_dt.evaluate(predictions_dt)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411317_570986900","id":"20170423-061500_494891792","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:257"},{"text":"%pyspark\nprint(\"Test Error = {0}\".format((1.0 - accuracy_dt)))","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Test Error = 0.0722891566265\n"}]},"apps":[],"jobName":"paragraph_1492935411318_572141147","id":"20170423-061510_632281672","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:258"},{"text":"%md\n# Random Forest","dateUpdated":"2017-04-23T01:16:51-0700","config":{"tableHide":false,"editorSetting":{"language":"markdown","editOnDblClick":true},"colWidth":12,"editorMode":"ace/mode/markdown","editorHide":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"HTML","data":"
<h1>Random Forest</h1>
\n"}]},"apps":[],"jobName":"paragraph_1492935411319_571756398","id":"20170423-061551_1620585578","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:259"},{"text":"%pyspark\nrf = RandomForestClassifier(labelCol='label',\n\tmaxDepth=4,\n\timpurity=\"gini\",\n\tnumTrees=500,\n\tseed=42)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411320_569832653","id":"20170423-061736_1391532472","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:260"},{"text":"%pyspark\nmodel_rf = rf.fit(trainingSet)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411320_569832653","id":"20170423-061903_1651495864","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:261"},{"text":"%pyspark\npredictions_rf = model_rf.transform(testSet)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411321_569447904","id":"20170423-061910_1330207312","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:262"},{"text":"%pyspark\nevaluator_rf = MulticlassClassificationEvaluator(labelCol=\"label\", \n\tpredictionCol=\"prediction\", \n\tmetricName=\"accuracy\")","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411322_570602151","id":"20170423-061928_1628864200","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:263"},{"text":"%pyspark\naccuracy_rf = evaluator_rf.evaluate(predictions_rf)\nprint(\"Test Error = {0}\".format((1.0 - accuracy_rf)))","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Test Error = 0.0722891566265\n"}]},"apps":[],"jobName":"paragraph_1492935411322_570602151","id":"20170423-061936_1630663902","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:264"},{"text":"%md\n# Neural 
Networks\n","dateUpdated":"2017-04-23T01:16:51-0700","config":{"tableHide":false,"editorSetting":{"language":"markdown","editOnDblClick":true},"colWidth":12,"editorMode":"ace/mode/markdown","editorHide":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"HTML","data":"
<h1>Neural Networks</h1>
\n"}]},"apps":[],"jobName":"paragraph_1492935411323_570217402","id":"20170423-062001_2006746793","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:265"},{"text":"%pyspark \nscaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')\n\nscalerModel = scaler.fit(df)\nscaledData = scalerModel.transform(df)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411324_568293658","id":"20170423-062134_1181406934","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:266"},{"text":"%pyspark\nprint(\"Features scaled to range: [%f, %f]\" % (scaler.getMin(), scaler.getMax()))","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Features scaled to range: [0.000000, 1.000000]\n"}]},"apps":[],"jobName":"paragraph_1492935411325_567908909","id":"20170423-062249_1463359208","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:267"},{"text":"%pyspark\nnew_df = scaledData.selectExpr(\"label\", \"radius_mean\", \"texture_mean\", \n\t\"perimeter_mean\", \"area_mean\", \"smoothness_mean\", \"compactness_mean\",\n\t \"concavity_mean\", \"concave_points_mean\", \"symmetry_mean\", \n\t \"fractal_dimension_mean\", \"radius_se\", \"texture_se\", \"perimeter_se\", \n\t \"area_se\", \"smoothness_se\", \"compactness_se\", \"concavity_se\", \n\t \"concave_points_se\", \"symmetry_se\", \"fractal_dimension_se\", \n\t \"radius_worst\", \"texture_worst\", \"perimeter_worst\", \n\t \"area_worst\", \"smoothness_worst\", \"compactness_worst\", \n\t \"concavity_worst\", \"concave_points_worst\", \"symmetry_worst\", \n\t \"fractal_dimension_worst\",\"features as oldFeature\", \n\t \"scaledFeatures as features\")","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411326_569063155","id":"20170423-062302_992392082","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:268"},{"text":"%pyspark\n# Creating training and test sets\n(trainingSet_scaled, testSet_scaled) = new_df\\\n.randomSplit([0.7, 0.3], seed = 42)\n\nlayers = [30, 5, 4, 2]","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411326_569063155","id":"20170423-062329_1005850037","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:269"},{"text":"%pyspark\ntrainer = MultilayerPerceptronClassifier(maxIter=100, \n\tlayers=layers, \n\tblockSize=128, 
\n\tseed=1234)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411327_568678407","id":"20170423-062414_1096546719","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:270"},{"text":"%pyspark\nmodel_nn = trainer.fit(trainingSet_scaled)","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411328_652938416","id":"20170423-062430_1394594903","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:271"},{"text":"%pyspark\nresult_nn = model_nn.transform(testSet_scaled)\npredictions_nn = result_nn.select(\"prediction\", \"label\")\nevaluator_nn = MulticlassClassificationEvaluator(metricName=\"accuracy\")","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1492935411329_652553667","id":"20170423-062438_950890432","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:272"},{"text":"%pyspark\naccuracy_nn = evaluator_nn.evaluate(predictions_nn) \n\nprint(\"Test Error = %g\" % (1.0 - accuracy_nn))","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Test Error = 
0.0446927\n"}]},"apps":[],"jobName":"paragraph_1492935411329_652553667","id":"20170423-062458_962956565","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:273"},{"text":"%pyspark\n","dateUpdated":"2017-04-23T01:16:51-0700","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1492935411330_653707913","id":"20170423-062508_1561863743","dateCreated":"2017-04-23T01:16:51-0700","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:274"}],"name":"breastCancerMachineLearning","id":"2CEY53YPA","angularObjects":{"2CECP2KSJ:shared_process":[],"2CEAA8R18:shared_process":[],"2CEAGTXJS:shared_process":[],"2CEM5S4C6:shared_process":[],"2CDXZCUHR:shared_process":[],"2CDXJWRG5:shared_process":[],"2CCVCCN7B:shared_process":[],"2CDER5QY5:shared_process":[],"2CFDXMD3K:shared_process":[],"2CEFYZ3J8:shared_process":[],"2CFQ4VA1V:shared_process":[],"2CCYBXVC5:shared_process":[],"2CFXY43GY:shared_process":[],"2CG2U7N1Z:shared_process":[],"2CE6A2D8P:shared_process":[],"2CFZA8Y8R:shared_process":[],"2CE3ZUS2K:shared_process":[],"2CCQP97XZ:shared_process":[],"2CDHY866U:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}} --------------------------------------------------------------------------------
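
The notebook JSON above is hard to follow in serialized form, so here is a minimal standalone sketch of the tree-based steps it records: the 70/30 split, the decision tree, and the 500-tree random forest. It assumes `df` is the (label, features) DataFrame assembled in the earlier cells of the notebook and that a SparkSession is already running; the seed on the split is an addition for reproducibility (the notebook's own split cell is unseeded).

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 70/30 train/test split (seed added here; the original cell has none)
trainingSet, testSet = df.randomSplit([0.7, 0.3], seed=42)

# Accuracy evaluator shared by both models
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

classifiers = [
    DecisionTreeClassifier(labelCol="label", featuresCol="features", seed=42),
    RandomForestClassifier(labelCol="label", featuresCol="features",
                           maxDepth=4, impurity="gini", numTrees=500, seed=42),
]

for clf in classifiers:
    model = clf.fit(trainingSet)            # train on the 70% split
    predictions = model.transform(testSet)  # score the held-out 30%
    accuracy = evaluator.evaluate(predictions)
    print("{0}: Test Error = {1}".format(type(clf).__name__, 1.0 - accuracy))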
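The neural-network cells follow the same pattern but first scale every feature to [0, 1]. A sketch under the same assumptions (`df` with label/features columns, SparkSession running); the long selectExpr column shuffle in the notebook is condensed here to a drop/rename, which feeds the model the same scaled features.

from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Scale features to [0, 1]; the MLP is sensitive to feature ranges
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scaled_df = (scaler.fit(df)
             .transform(df)
             .drop("features")
             .withColumnRenamed("scaledFeatures", "features"))

trainingSet_scaled, testSet_scaled = scaled_df.randomSplit([0.7, 0.3], seed=42)

# 30 inputs (one per feature), two hidden layers, 2 output classes
layers = [30, 5, 4, 2]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                         blockSize=128, seed=1234)
model_nn = trainer.fit(trainingSet_scaled)

evaluator_nn = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy_nn = evaluator_nn.evaluate(model_nn.transform(testSet_scaled))
print("Test Error = %g" % (1.0 - accuracy_nn))

One caveat worth noting: like the notebook, this fits the MinMaxScaler on the full dataset before splitting, so test-set minima and maxima leak into the scaling. A stricter version would split first, fit the scaler on the training split only, and apply the fitted scaler to the test split.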