.
675 |
--------------------------------------------------------------------------------
/Tutorial 8-Linear Regression With Pyspark.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["## Overview\n\nThis notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.\n\nThis notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"96816ed7-b08a-4ca3-abb9-f99880c3535d"}}},{"cell_type":"code","source":["# File location and type\nfile_location = \"/FileStore/tables/tips.csv\"\nfile_type = \"csv\"\n\n# Go through the generic reader so that file_type is honored: changing it above\n# is enough to load a different format.\n# The applied options are for CSV files. For other file types, these will be ignored.\ndf = (\n  spark.read.format(file_type)\n  .option(\"header\", True)       # first row holds the column names\n  .option(\"inferSchema\", True)  # infer column types rather than reading every column as a string\n  .load(file_location)\n)\n\n# Preview the first rows of the loaded DataFrame\ndf.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6482be4c-f067-47c9-b0ac-35c938b94601"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 
3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+\ntotal_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n 16.99|1.01|Female| No|Sun|Dinner| 2|\n 10.34|1.66| Male| No|Sun|Dinner| 3|\n 21.01| 3.5| Male| No|Sun|Dinner| 3|\n 23.68|3.31| Male| No|Sun|Dinner| 2|\n 24.59|3.61|Female| No|Sun|Dinner| 4|\n 25.29|4.71| Male| No|Sun|Dinner| 4|\n 8.77| 2.0| Male| No|Sun|Dinner| 2|\n 26.88|3.12| Male| No|Sun|Dinner| 4|\n 15.04|1.96| Male| No|Sun|Dinner| 2|\n 14.78|3.23| Male| No|Sun|Dinner| 2|\n 10.27|1.71| Male| No|Sun|Dinner| 2|\n 35.26| 5.0|Female| No|Sun|Dinner| 4|\n 15.42|1.57| Male| No|Sun|Dinner| 2|\n 18.43| 3.0| Male| No|Sun|Dinner| 4|\n 14.83|3.02|Female| No|Sun|Dinner| 2|\n 21.58|3.92| Male| No|Sun|Dinner| 2|\n 10.33|1.67|Female| No|Sun|Dinner| 3|\n 16.29|3.71| Male| No|Sun|Dinner| 3|\n 16.97| 3.5|Female| No|Sun|Dinner| 3|\n 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5e5b80f2-3426-44e1-b86e-171314f4827e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- total_bill: double (nullable = true)\n |-- tip: double (nullable = true)\n |-- sex: string (nullable = true)\n |-- smoker: string (nullable = true)\n |-- day: string (nullable = true)\n |-- time: string (nullable = true)\n |-- size: integer (nullable = true)\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nroot\n-- total_bill: double (nullable = true)\n-- tip: double (nullable = true)\n-- sex: string (nullable = true)\n-- smoker: string (nullable = true)\n-- day: string (nullable = true)\n-- time: string (nullable = true)\n-- size: integer (nullable = true)\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0432b71c-b266-417d-b0d5-1c17afa0f090"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[3]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[3]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
"]}}],"execution_count":0},{"cell_type":"code","source":["### Handling Categorical Features\nfrom pyspark.ml.feature import StringIndexer"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0ae62ac1-81a6-4b1d-92b9-f85ec9cc93ff"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"faa6f9b0-6f8b-4dbd-a5a2-dc074181f2e3"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+\ntotal_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n 16.99|1.01|Female| No|Sun|Dinner| 2|\n 10.34|1.66| Male| No|Sun|Dinner| 3|\n 21.01| 3.5| Male| No|Sun|Dinner| 3|\n 23.68|3.31| Male| No|Sun|Dinner| 2|\n 24.59|3.61|Female| No|Sun|Dinner| 4|\n 25.29|4.71| Male| No|Sun|Dinner| 4|\n 8.77| 2.0| Male| No|Sun|Dinner| 2|\n 26.88|3.12| Male| No|Sun|Dinner| 4|\n 15.04|1.96| Male| No|Sun|Dinner| 2|\n 14.78|3.23| Male| No|Sun|Dinner| 2|\n 10.27|1.71| Male| No|Sun|Dinner| 2|\n 35.26| 5.0|Female| No|Sun|Dinner| 4|\n 15.42|1.57| Male| No|Sun|Dinner| 2|\n 18.43| 3.0| Male| No|Sun|Dinner| 4|\n 14.83|3.02|Female| No|Sun|Dinner| 2|\n 21.58|3.92| Male| No|Sun|Dinner| 2|\n 10.33|1.67|Female| No|Sun|Dinner| 3|\n 16.29|3.71| Male| No|Sun|Dinner| 3|\n 16.97| 3.5|Female| No|Sun|Dinner| 3|\n 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["indexer=StringIndexer(inputCol=\"sex\",outputCol=\"sex_indexed\")\ndf_r=indexer.fit(df).transform(df)\ndf_r.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2ee7ab64-9804-4afb-852c-ee02eb5d3a20"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|\n+----------+----+------+------+---+------+----+-----------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0|\n+----------+----+------+------+---+------+----+-----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|\n+----------+----+------+------+---+------+----+-----------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0|\n+----------+----+------+------+---+------+----+-----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["indexer=StringIndexer(inputCols=[\"smoker\",\"day\",\"time\"],outputCols=[\"smoker_indexed\",\"day_indexed\",\n \"time_index\"])\ndf_r=indexer.fit(df_r).transform(df_r)\ndf_r.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6b95d734-4c80-4762-bd9b-92b6a107dced"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 
0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df_r.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a9909b0b-caee-4838-b477-47c3701dbfd4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[9]: ['total_bill',\n 'tip',\n 'sex',\n 'smoker',\n 'day',\n 'time',\n 'size',\n 'sex_indexed',\n 'smoker_indexed',\n 'day_indexed',\n 'time_index']
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[9]: ['total_bill',\n 'tip',\n 'sex',\n 'smoker',\n 'day',\n 'time',\n 'size',\n 'sex_indexed',\n 'smoker_indexed',\n 'day_indexed',\n 'time_index']
"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.feature import VectorAssembler\nfeatureassembler=VectorAssembler(inputCols=['tip','size','sex_indexed','smoker_indexed','day_indexed',\n 'time_index'],outputCol=\"Independent Features\")\noutput=featureassembler.transform(df_r)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"61d875e5-71fa-4dc4-ae90-54924b00a632"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["output.select('Independent Features').show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d33d1178-95a2-468f-a94a-e0eebc67be86"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+\n|Independent Features|\n+--------------------+\n|[1.01,2.0,1.0,0.0...|\n|[1.66,3.0,0.0,0.0...|\n|[3.5,3.0,0.0,0.0,...|\n|[3.31,2.0,0.0,0.0...|\n|[3.61,4.0,1.0,0.0...|\n|[4.71,4.0,0.0,0.0...|\n|[2.0,2.0,0.0,0.0,...|\n|[3.12,4.0,0.0,0.0...|\n|[1.96,2.0,0.0,0.0...|\n|[3.23,2.0,0.0,0.0...|\n|[1.71,2.0,0.0,0.0...|\n|[5.0,4.0,1.0,0.0,...|\n|[1.57,2.0,0.0,0.0...|\n|[3.0,4.0,0.0,0.0,...|\n|[3.02,2.0,1.0,0.0...|\n|[3.92,2.0,0.0,0.0...|\n|[1.67,3.0,1.0,0.0...|\n|[3.71,3.0,0.0,0.0...|\n|[3.5,3.0,1.0,0.0,...|\n|(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+\nIndependent Features|\n+--------------------+\n[1.01,2.0,1.0,0.0...|\n[1.66,3.0,0.0,0.0...|\n[3.5,3.0,0.0,0.0,...|\n[3.31,2.0,0.0,0.0...|\n[3.61,4.0,1.0,0.0...|\n[4.71,4.0,0.0,0.0...|\n[2.0,2.0,0.0,0.0,...|\n[3.12,4.0,0.0,0.0...|\n[1.96,2.0,0.0,0.0...|\n[3.23,2.0,0.0,0.0...|\n[1.71,2.0,0.0,0.0...|\n[5.0,4.0,1.0,0.0,...|\n[1.57,2.0,0.0,0.0...|\n[3.0,4.0,0.0,0.0,...|\n[3.02,2.0,1.0,0.0...|\n[3.92,2.0,0.0,0.0...|\n[1.67,3.0,1.0,0.0...|\n[3.71,3.0,0.0,0.0...|\n[3.5,3.0,1.0,0.0,...|\n(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["output.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f2646b66-7710-4297-a6e1-156a37e6582d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|Independent Features|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n| 10.33|1.67|Female| 
No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[3.5,3.0,1.0,0.0,...|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|Independent Features|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 
0.0|[3.5,3.0,1.0,0.0,...|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["finalized_data=output.select(\"Independent Features\",\"total_bill\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d1c1fa4c-c78a-441a-bed9-3bcfcc5af966"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["finalized_data.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3d14fe7b-bc59-4376-8139-142283af09b0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+----------+\n|Independent Features|total_bill|\n+--------------------+----------+\n|[1.01,2.0,1.0,0.0...| 16.99|\n|[1.66,3.0,0.0,0.0...| 10.34|\n|[3.5,3.0,0.0,0.0,...| 21.01|\n|[3.31,2.0,0.0,0.0...| 23.68|\n|[3.61,4.0,1.0,0.0...| 24.59|\n|[4.71,4.0,0.0,0.0...| 25.29|\n|[2.0,2.0,0.0,0.0,...| 8.77|\n|[3.12,4.0,0.0,0.0...| 26.88|\n|[1.96,2.0,0.0,0.0...| 15.04|\n|[3.23,2.0,0.0,0.0...| 14.78|\n|[1.71,2.0,0.0,0.0...| 10.27|\n|[5.0,4.0,1.0,0.0,...| 35.26|\n|[1.57,2.0,0.0,0.0...| 15.42|\n|[3.0,4.0,0.0,0.0,...| 18.43|\n|[3.02,2.0,1.0,0.0...| 14.83|\n|[3.92,2.0,0.0,0.0...| 21.58|\n|[1.67,3.0,1.0,0.0...| 10.33|\n|[3.71,3.0,0.0,0.0...| 16.29|\n|[3.5,3.0,1.0,0.0,...| 16.97|\n|(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+----------+\nIndependent Features|total_bill|\n+--------------------+----------+\n[1.01,2.0,1.0,0.0...| 16.99|\n[1.66,3.0,0.0,0.0...| 10.34|\n[3.5,3.0,0.0,0.0,...| 21.01|\n[3.31,2.0,0.0,0.0...| 23.68|\n[3.61,4.0,1.0,0.0...| 24.59|\n[4.71,4.0,0.0,0.0...| 25.29|\n[2.0,2.0,0.0,0.0,...| 8.77|\n[3.12,4.0,0.0,0.0...| 26.88|\n[1.96,2.0,0.0,0.0...| 15.04|\n[3.23,2.0,0.0,0.0...| 14.78|\n[1.71,2.0,0.0,0.0...| 10.27|\n[5.0,4.0,1.0,0.0,...| 35.26|\n[1.57,2.0,0.0,0.0...| 15.42|\n[3.0,4.0,0.0,0.0,...| 18.43|\n[3.02,2.0,1.0,0.0...| 14.83|\n[3.92,2.0,0.0,0.0...| 21.58|\n[1.67,3.0,1.0,0.0...| 10.33|\n[3.71,3.0,0.0,0.0...| 16.29|\n[3.5,3.0,1.0,0.0,...| 16.97|\n(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.regression import LinearRegression\n\n## train test split: 75% of the rows to fit on, 25% held out for evaluation.\n## randomSplit samples at random, so the seed is pinned to make re-runs on the\n## same data reproduce the split (and hence the coefficients and metrics).\ntrain_data,test_data=finalized_data.randomSplit([0.75,0.25],seed=42)\n\n## Linear regression of total_bill on the assembled feature vector.\nlinear_regression=LinearRegression(featuresCol='Independent Features', labelCol='total_bill')\n## `regressor` is the fitted model; the following cells read its coefficients\n## and intercept and evaluate it on test_data.\nregressor=linear_regression.fit(train_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dbe03a38-e728-40f9-8a53-0b7968b8dc87"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["regressor.coefficients"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0fdc835a-96fb-4ab3-89be-6cfbc57c7ac6"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[17]: DenseVector([3.3598, 3.3861, -0.6641, 2.5847, -0.1423, -1.3377])
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[17]: DenseVector([3.3598, 3.3861, -0.6641, 2.5847, -0.1423, -1.3377])
"]}}],"execution_count":0},{"cell_type":"code","source":["regressor.intercept"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"fd1642d4-bb73-4fc0-a410-ada61d0f3410"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[18]: 0.9231025978363154
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[18]: 0.9231025978363154
"]}}],"execution_count":0},{"cell_type":"code","source":["### Predictions\npred_results=regressor.evaluate(test_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2e45a3d8-af1c-408b-b64f-fe466c3401bd"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["## Final comparison\npred_results.predictions.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"01d128d2-1a71-44d0-a14c-b0377693547b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+----------+------------------+\n|Independent Features|total_bill| prediction|\n+--------------------+----------+------------------+\n| (6,[0,1],[2.0,2.0])| 12.69|14.414877568922382|\n|(6,[0,1],[3.35,3.0])| 20.65|22.336705086951124|\n|[1.0,1.0,1.0,0.0,...| 7.25| 7.004851678101628|\n|[1.17,2.0,0.0,1.0...| 32.83|14.210940490994291|\n|[1.36,3.0,1.0,0.0...| 18.64|13.364280305420156|\n|[1.5,2.0,0.0,1.0,...| 11.59|15.319683950195104|\n|[1.58,2.0,0.0,1.0...| 13.42| 13.82395853728497|\n|[1.66,3.0,0.0,0.0...| 10.34|16.516310272733463|\n|[1.73,2.0,0.0,0.0...| 9.78| 11.88549649517034|\n|[2.0,2.0,0.0,0.0,...| 13.81| 14.27259319727858|\n|[2.0,2.0,0.0,0.0,...| 13.03| 12.79265023451646|\n|[2.0,2.0,1.0,0.0,...| 14.15|12.128511829238738|\n|[2.0,2.0,1.0,0.0,...| 14.52|12.128511829238738|\n|[2.0,2.0,1.0,1.0,...| 10.63|16.335459877039824|\n|[2.0,2.0,1.0,1.0,...| 27.18|16.335459877039824|\n|[2.0,3.0,1.0,0.0,...| 16.21|16.994513613299002|\n|[2.23,2.0,1.0,1.0...| 12.76| 17.10822046981615|\n|[2.24,2.0,0.0,0.0...| 20.76|15.078952076697353|\n|[2.31,2.0,0.0,0.0...| 
11.69|13.834197120432375|\n|[2.5,2.0,0.0,0.0,...| 14.07|15.952507529401023|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+----------+------------------+\nIndependent Features|total_bill| prediction|\n+--------------------+----------+------------------+\n (6,[0,1],[2.0,2.0])| 12.69|14.414877568922382|\n(6,[0,1],[3.35,3.0])| 20.65|22.336705086951124|\n[1.0,1.0,1.0,0.0,...| 7.25| 7.004851678101628|\n[1.17,2.0,0.0,1.0...| 32.83|14.210940490994291|\n[1.36,3.0,1.0,0.0...| 18.64|13.364280305420156|\n[1.5,2.0,0.0,1.0,...| 11.59|15.319683950195104|\n[1.58,2.0,0.0,1.0...| 13.42| 13.82395853728497|\n[1.66,3.0,0.0,0.0...| 10.34|16.516310272733463|\n[1.73,2.0,0.0,0.0...| 9.78| 11.88549649517034|\n[2.0,2.0,0.0,0.0,...| 13.81| 14.27259319727858|\n[2.0,2.0,0.0,0.0,...| 13.03| 12.79265023451646|\n[2.0,2.0,1.0,0.0,...| 14.15|12.128511829238738|\n[2.0,2.0,1.0,0.0,...| 14.52|12.128511829238738|\n[2.0,2.0,1.0,1.0,...| 10.63|16.335459877039824|\n[2.0,2.0,1.0,1.0,...| 27.18|16.335459877039824|\n[2.0,3.0,1.0,0.0,...| 16.21|16.994513613299002|\n[2.23,2.0,1.0,1.0...| 12.76| 17.10822046981615|\n[2.24,2.0,0.0,0.0...| 20.76|15.078952076697353|\n[2.31,2.0,0.0,0.0...| 11.69|13.834197120432375|\n[2.5,2.0,0.0,0.0,...| 14.07|15.952507529401023|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["### Performance metrics from the evaluation summary, displayed as one tuple:\n### (R^2, mean absolute error, mean squared error)\n(\n  pred_results.r2,\n  pred_results.meanAbsoluteError,\n  pred_results.meanSquaredError,\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"75e3e5b1-0bb4-4dbe-a1ca-08e5a31ee173"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[25]: (0.40050077944613716, 4.809771114444798, 40.934088106916576)
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[25]: (0.40050077944613716, 4.809771114444798, 40.934088106916576)
"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ce0398a7-7ebd-4f2c-b12e-2ef701925124"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"Tutorial 8-Linear Regression With Pyspark","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":523045182520803}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------