├── Loading data from cloudant to Pandas.ipynb ├── Lendo e gravando o COS.ipynb ├── cvs2db2.ipynb ├── COVID-BR.ipynb ├── Predicting Telco Customer Churn using SparkML.ipynb └── Linear regression.ipynb /Loading data from cloudant to Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install cloudant" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from cloudant import Cloudant\n", 19 | "# PEGAR AS INFORMAÇÕES ABAIXO NA ABA CREDENTIALS DO SERVIÇO DE CLOUDANT NA IBM CLOUD\n", 20 | "u = ''\n", 21 | "p = ''\n", 22 | "a = '' # SIM, O ENDPOINT CRIADO PARA O SEU CLOUD É O MESMO QUE O USUÁRIO :-)\n", 23 | "client = Cloudant(u, p, account=a, connect=True, auto_renew=True)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "\n", 33 | "db = client[''] # NOME DO DB CRIADO NO CLOUDANT" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "response = db.all_docs(limit=500, include_docs= True)\n", 43 | "\n", 44 | "# put document bodies into an array\n", 45 | "docs = []\n", 46 | "for r in response['rows']:\n", 47 | " docs.append(r['doc']) # CASO TENHA USADO O NODE-RED E GRAVADO TODO O MSG USE: docs.append(r['doc']['payload'])\n", 48 | "type(docs)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "pixiedust": { 56 | "displayParams": { 57 | "brunelMapType": "Heat Map", 58 | "chartsize": "97", 59 | "coloropacity": "65", 60 | "handlerId": "mapView", 61 | "keyFields": "LAT,LONG", 62 | "mapboxtoken": 
"pk.eyJ1IjoibWFwYm94IiwiYSI6ImNpejY4M29iazA2Z2gycXA4N2pmbDZmangifQ.-g_vE53SD2WrJ6tFX7QHmA", 63 | "numbins": "16", 64 | "rendererId": "mapbox" 65 | } 66 | } 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# create a Pandas dataframe containing the data\n", 71 | "import pandas as pd\n", 72 | "df = pd.DataFrame(data=docs)\n", 73 | "\n", 74 | "#df[\"LAT\"] = df[\"LAT\"].astype(float)\n", 75 | "#df[\"LONG\"] = df[\"LONG\"].astype(float)\n", 76 | "\n", 77 | "df.head()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "pixiedust": { 85 | "displayParams": { 86 | "colorrampname": "Light to Dark Red", 87 | "handlerId": "mapView", 88 | "keyFields": "LONG,LAT", 89 | "kind": "simple-cluster", 90 | "mapboxtoken": "pk.eyJ1IjoibWFwYm94IiwiYSI6ImNpejY4M29iazA2Z2gycXA4N2pmbDZmangifQ.-g_vE53SD2WrJ6tFX7QHmA", 91 | "valueFields": "Estação" 92 | } 93 | } 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "#import pixiedust\n", 98 | "#display(df)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3.6", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.6.9" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 1 130 | } 131 | -------------------------------------------------------------------------------- /Lendo e gravando o COS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": "\nimport 
types\nimport pandas as pd\nfrom botocore.client import Config\nimport ibm_boto3\n\ndef __iter__(self): return 0\n\n# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.\n# You might want to remove those credentials before you share the notebook.\ncos = ibm_boto3.client(service_name='s3',\n ibm_api_key_id=\"\",\n ibm_auth_endpoint=\"https://iam.ng.bluemix.net/oidc/token\",\n config=Config(signature_version='oauth'),\n endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": "# The code was removed by Watson Studio for sharing." 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 8, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AVGHEARTBEATSPERMINPALPITATIONSPERDAYCHOLESTEROLBMIHEARTFAILUREAGESEXFAMILYHISTORYSMOKERLAST5YRSEXERCISEMINPERWEEK
0932216325N49FNN110
11082218124N32FNN192
286023920N60FNN121
3803616431Y45FYN141
4663618523N39FNN63
\n
", 25 | "text/plain": " AVGHEARTBEATSPERMIN PALPITATIONSPERDAY CHOLESTEROL BMI HEARTFAILURE \\\n0 93 22 163 25 N \n1 108 22 181 24 N \n2 86 0 239 20 N \n3 80 36 164 31 Y \n4 66 36 185 23 N \n\n AGE SEX FAMILYHISTORY SMOKERLAST5YRS EXERCISEMINPERWEEK \n0 49 F N N 110 \n1 32 F N N 192 \n2 60 F N N 121 \n3 45 F Y N 141 \n4 39 F N N 63 " 26 | }, 27 | "execution_count": 8, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": "body = cos.get_object(Bucket='covid19-donotdelete-pr-im28vw91gfqqeg',Key='data-health.csv')['Body']\n# add missing __iter__ method, so pandas accepts body as file-like object\nif not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n\ndf_data_1 = pd.read_csv(body)\ndf_data_1.head()\n" 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 9, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": "filename-local.csv uploaded to IBM COS.\n" 43 | } 44 | ], 45 | "source": "\n df_data_1.to_csv('novo_arquivo.csv', sep=',', encoding='utf-8')\n try:\n res=cos.upload_file(Filename='novo_arquivo.csv', Bucket='covid19-donotdelete-pr-im28vw91gfqqeg', Key='filename-at-cos.csv')\n except Exception as e:\n print(Exception, e)\n else:\n print(\"filename-local.csv uploaded to IBM COS.\")" 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 10, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0AVGHEARTBEATSPERMINPALPITATIONSPERDAYCHOLESTEROLBMIHEARTFAILUREAGESEXFAMILYHISTORYSMOKERLAST5YRSEXERCISEMINPERWEEK
00932216325N49FNN110
111082218124N32FNN192
2286023920N60FNN121
33803616431Y45FYN141
44663618523N39FNN63
\n
", 55 | "text/plain": " Unnamed: 0 AVGHEARTBEATSPERMIN PALPITATIONSPERDAY CHOLESTEROL BMI \\\n0 0 93 22 163 25 \n1 1 108 22 181 24 \n2 2 86 0 239 20 \n3 3 80 36 164 31 \n4 4 66 36 185 23 \n\n HEARTFAILURE AGE SEX FAMILYHISTORY SMOKERLAST5YRS EXERCISEMINPERWEEK \n0 N 49 F N N 110 \n1 N 32 F N N 192 \n2 N 60 F N N 121 \n3 Y 45 F Y N 141 \n4 N 39 F N N 63 " 56 | }, 57 | "execution_count": 10, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": "body = cos.get_object(Bucket='covid19-donotdelete-pr-im28vw91gfqqeg',Key='filename-at-cos.csv')['Body']\n# add missing __iter__ method, so pandas accepts body as file-like object\nif not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n\ndf = pd.read_csv(body)\ndf.head()" 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": "" 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3.6", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.6.9" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 1 93 | } -------------------------------------------------------------------------------- /cvs2db2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": "import pandas as pd\nfrom sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, Numeric, DateTime\nfrom sqlalchemy.orm import scoped_session, sessionmaker\nfrom sqlalchemy.ext.automap import automap_base" 9 | }, 10 | { 11 | "cell_type": "code", 
12 | "execution_count": 21, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": "# The code was removed by Watson Studio for sharing." 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 22, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": "#Connect to DB2\nconnection_string = \"db2+ibm_db://\" + user + \":\" + pwd + \"@dashdb-txn-flex-yp-dal13-43.services.dal.bluemix.net:50000/BLUDB\"\n" 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 23, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": "\nengine = create_engine(connection_string, echo=False)\nmeta = MetaData()\n\nnome_tabela = 'COVID'\n\ncovid = Table('covid', meta, \n Column(\"DATE\", Date),\n Column(\"TYPE\", String(30)),\n Column(\"CASES\", Integer),\n Column(\"DIFFERENCE\", Integer),\n Column(\"COUNTRY\", String(50)),\n Column(\"PROVINCE_STATE\", String(50)), \n Column(\"LAT\", Numeric(18,8)),\n Column(\"LONG\", Numeric(18,8)),\n Column(\"LATEST_DATE\", DateTime))\n\ndropTable = True\n\nif not engine.dialect.has_table(engine, nome_tabela, schema = 'BLUADMIN'):\n meta.create_all(engine)\nelse:\n if dropTable:\n covid.drop(engine) # Apaga tabela\n meta.create_all(engine) # Cria tabela\n conn = engine.connect()\n #stmt = covid.delete().where(students.c.lastname == 'Khanna')\n stmt = covid.delete()\n conn.execute(stmt)\n conn.close()" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 24, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "Base = automap_base()\n\n#engine = create_engine(connection_string, echo=True)\nsession = scoped_session(sessionmaker(autocommit=True,\n autoflush=False,\n bind=engine))\n\nBase.prepare(engine, reflect=True)\nbase_sa = Base.classes" 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 25, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATETYPECASESDIFFERENCECOUNTRYPROVINCE_STATELATLONGLATEST_DATE
02020-06-04Confirmed3262152Cote d'IvoireNaN7.540000-5.5471002020-06-04T23:15:39
12020-06-04Deaths00EritreaNaN15.17940039.7823002020-06-04T23:15:39
22020-06-04Deaths10FranceFrench Guiana4.000000-53.0000002020-06-04T23:15:39
32020-06-04Confirmed2980CanadaManitoba53.760900-98.8139002020-06-04T23:15:39
42020-06-04Confirmed200FranceNew Caledonia-20.904305165.6180422020-06-04T23:15:39
\n
", 46 | "text/plain": " DATE TYPE CASES DIFFERENCE COUNTRY PROVINCE_STATE \\\n0 2020-06-04 Confirmed 3262 152 Cote d'Ivoire NaN \n1 2020-06-04 Deaths 0 0 Eritrea NaN \n2 2020-06-04 Deaths 1 0 France French Guiana \n3 2020-06-04 Confirmed 298 0 Canada Manitoba \n4 2020-06-04 Confirmed 20 0 France New Caledonia \n\n LAT LONG LATEST_DATE \n0 7.540000 -5.547100 2020-06-04T23:15:39 \n1 15.179400 39.782300 2020-06-04T23:15:39 \n2 4.000000 -53.000000 2020-06-04T23:15:39 \n3 53.760900 -98.813900 2020-06-04T23:15:39 \n4 -20.904305 165.618042 2020-06-04T23:15:39 " 47 | }, 48 | "execution_count": 25, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": "df = pd.read_csv('https://download.data.world/s/3xpcwdh7es3o6uqyztkkhdnh4ht7i7')\ndf.head()" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 26, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": "nome_tabela = 'covid'" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 27, 65 | "metadata": { 66 | "scrolled": true 67 | }, 68 | "outputs": [], 69 | "source": "df.to_sql(name=nome_tabela, con=engine, if_exists='append', index=False)\n#df.to_sql(name=nome_tabela, con=engine, if_exists='append', index=False, chunksize=1000) # With number of records to be processed each time" 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 28, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATETYPEcasesdifferencecountryprovince_statelatLONGlatest_date
02020-06-04Confirmed3262152Cote d'IvoireNone7.540000-5.5471002020-06-04 23:15:39
12020-06-04Deaths00EritreaNone15.17940039.7823002020-06-04 23:15:39
22020-06-04Deaths10FranceFrench Guiana4.000000-53.0000002020-06-04 23:15:39
32020-06-04Confirmed2980CanadaManitoba53.760900-98.8139002020-06-04 23:15:39
42020-06-04Confirmed200FranceNew Caledonia-20.904305165.6180422020-06-04 23:15:39
\n
", 79 | "text/plain": " DATE TYPE cases difference country province_state \\\n0 2020-06-04 Confirmed 3262 152 Cote d'Ivoire None \n1 2020-06-04 Deaths 0 0 Eritrea None \n2 2020-06-04 Deaths 1 0 France French Guiana \n3 2020-06-04 Confirmed 298 0 Canada Manitoba \n4 2020-06-04 Confirmed 20 0 France New Caledonia \n\n lat LONG latest_date \n0 7.540000 -5.547100 2020-06-04 23:15:39 \n1 15.179400 39.782300 2020-06-04 23:15:39 \n2 4.000000 -53.000000 2020-06-04 23:15:39 \n3 53.760900 -98.813900 2020-06-04 23:15:39 \n4 -20.904305 165.618042 2020-06-04 23:15:39 " 80 | }, 81 | "execution_count": 28, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": "df2 = pd.read_sql_table(nome_tabela, engine)\ndf2.head()" 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 29, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": "session.close()" 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": "" 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "display_name": "Python 3.6", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.6.9" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 1 124 | } -------------------------------------------------------------------------------- /COVID-BR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": "import pandas as pd\nfrom sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, Numeric, DateTime\nfrom sqlalchemy.orm 
import scoped_session, sessionmaker\nfrom sqlalchemy.ext.automap import automap_base" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": "# The code was removed by Watson Studio for sharing." 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 9, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": "#DB connection string\nconnection_string = \"db2+ibm_db://\" + user + \":\" + pwd + \"@dashdb-txn-flex-yp-dal13-43.services.dal.bluemix.net:50000/BLUDB\"" 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 10, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": "\nengine = create_engine(connection_string, echo=False)\nmeta = MetaData()\n\nnome_tabela = 'COVID-BR'\n\ncovid = Table('COVID-BR', meta, \n Column(\"COUNTRY\", String(30)),\n Column(\"PROVINCE_STATE\", String(50)),\n Column(\"CITY\", String(50)),\n Column(\"IBGEID\", Integer),\n Column(\"COD_REGIAODESAUDE\",Numeric(18,8)),\n Column(\"NOME_REGIAODESAUDE\",String(50)),\n Column(\"DEATHS\", Integer),\n Column(\"CASES\", Integer),\n Column(\"DEATHS_PER_100K_INHABITANTS\", Numeric(18,8)),\n Column(\"TOTALCASES_PER_100K_INHABITANTS\", Numeric(18,8)),\n Column(\"DEATHS_BY_TOTALCASES\", Numeric(18,8)),\n Column(\"SOURCE\", String(10)),\n Column(\"DATE\", Date),\n Column(\"NEWCASES\", Integer),\n Column(\"NEWDEATHS\",Integer),\n Column(\"LAST_INFO_DATE\", Date))\n\n\n# date\tnewCases\tnewDeaths\n#'DEATHS_PER_100K_INHABITANTS','TOTALCASES_PER_100K_INHABITANTS','DEATHS_BY_TOTALCASES'\n\ndropTable = True\n\nif not engine.dialect.has_table(engine, nome_tabela, schema = 'BLUADMIN'):\n meta.create_all(engine)\nelse:\n if dropTable:\n covid.drop(engine) # Apaga tabela\n meta.create_all(engine) # Cria tabela\n conn = engine.connect()\n #stmt = covid.delete().where(students.c.lastname == 'Khanna')\n stmt = covid.delete()\n conn.execute(stmt)\n conn.close()" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 
34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "Base = automap_base()\n\n#engine = create_engine(connection_string, echo=True)\nsession = scoped_session(sessionmaker(autocommit=True,\n autoflush=False,\n bind=engine))\n\nBase.prepare(engine, reflect=True)\nbase_sa = Base.classes" 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 12, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
countrystatecityibgeIDcod_RegiaoDeSaudename_RegiaoDeSaudedeathstotalCasesdeaths_per_100k_inhabitantstotalCases_per_100k_inhabitantsdeaths_by_totalCases_sourcedatenewCasesnewDeathslast_info_date
0BrazilGOAbadia de Goi\u00e1s/GO520005052001.0Central113511.398611538.812260.00741SES2020-07-25002020-07-25
1BrazilMGAbadia dos Dourados/MG310010431074.0Patroc\u00ednio / Monte Carmelo0150.00000214.622980.00000SES2020-07-25002020-07-24
2BrazilGOAbadi\u00e2nia/GO520010052011.0Pirineus77734.92665384.193190.09091SES2020-07-25002020-07-25
3BrazilPAAbaetetuba/PA150010715011.0Tocantins109259869.119461647.452730.04196SES2020-07-25002020-07-25
4BrazilMGAbaet\u00e9/MG310020331024.0Sete Lagoas0260.00000111.890520.00000SES2020-07-25002020-07-24
\n
", 46 | "text/plain": " country state city ibgeID cod_RegiaoDeSaude \\\n0 Brazil GO Abadia de Goi\u00e1s/GO 5200050 52001.0 \n1 Brazil MG Abadia dos Dourados/MG 3100104 31074.0 \n2 Brazil GO Abadi\u00e2nia/GO 5200100 52011.0 \n3 Brazil PA Abaetetuba/PA 1500107 15011.0 \n4 Brazil MG Abaet\u00e9/MG 3100203 31024.0 \n\n name_RegiaoDeSaude deaths totalCases \\\n0 Central 1 135 \n1 Patroc\u00ednio / Monte Carmelo 0 15 \n2 Pirineus 7 77 \n3 Tocantins 109 2598 \n4 Sete Lagoas 0 26 \n\n deaths_per_100k_inhabitants totalCases_per_100k_inhabitants \\\n0 11.39861 1538.81226 \n1 0.00000 214.62298 \n2 34.92665 384.19319 \n3 69.11946 1647.45273 \n4 0.00000 111.89052 \n\n deaths_by_totalCases _source date newCases newDeaths \\\n0 0.00741 SES 2020-07-25 0 0 \n1 0.00000 SES 2020-07-25 0 0 \n2 0.09091 SES 2020-07-25 0 0 \n3 0.04196 SES 2020-07-25 0 0 \n4 0.00000 SES 2020-07-25 0 0 \n\n last_info_date \n0 2020-07-25 \n1 2020-07-24 \n2 2020-07-25 \n3 2020-07-25 \n4 2020-07-24 " 47 | }, 48 | "execution_count": 12, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": "df = pd.read_csv('https://raw.githubusercontent.com/wcota/covid19br/master/cases-brazil-cities.csv')\ndf.head()" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 13, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
COUNTRYPROVINCE_STATECITYIBGEIDCOD_REGIAODESAUDENOME_REGIAODESAUDEDEATHSCASESDEATHS_PER_100K_INHABITANTSTOTALCASES_PER_100K_INHABITANTSDEATHS_BY_TOTALCASESSOURCEDATENEWCASESNEWDEATHSLAST_INFO_DATE
0BrazilGOAbadia de Goi\u00e1s/GO520005052001.0Central113511.398611538.812260.00741SES2020-07-25002020-07-25
1BrazilMGAbadia dos Dourados/MG310010431074.0Patroc\u00ednio / Monte Carmelo0150.00000214.622980.00000SES2020-07-25002020-07-24
2BrazilGOAbadi\u00e2nia/GO520010052011.0Pirineus77734.92665384.193190.09091SES2020-07-25002020-07-25
3BrazilPAAbaetetuba/PA150010715011.0Tocantins109259869.119461647.452730.04196SES2020-07-25002020-07-25
4BrazilMGAbaet\u00e9/MG310020331024.0Sete Lagoas0260.00000111.890520.00000SES2020-07-25002020-07-24
\n
", 63 | "text/plain": " COUNTRY PROVINCE_STATE CITY IBGEID COD_REGIAODESAUDE \\\n0 Brazil GO Abadia de Goi\u00e1s/GO 5200050 52001.0 \n1 Brazil MG Abadia dos Dourados/MG 3100104 31074.0 \n2 Brazil GO Abadi\u00e2nia/GO 5200100 52011.0 \n3 Brazil PA Abaetetuba/PA 1500107 15011.0 \n4 Brazil MG Abaet\u00e9/MG 3100203 31024.0 \n\n NOME_REGIAODESAUDE DEATHS CASES DEATHS_PER_100K_INHABITANTS \\\n0 Central 1 135 11.39861 \n1 Patroc\u00ednio / Monte Carmelo 0 15 0.00000 \n2 Pirineus 7 77 34.92665 \n3 Tocantins 109 2598 69.11946 \n4 Sete Lagoas 0 26 0.00000 \n\n TOTALCASES_PER_100K_INHABITANTS DEATHS_BY_TOTALCASES SOURCE DATE \\\n0 1538.81226 0.00741 SES 2020-07-25 \n1 214.62298 0.00000 SES 2020-07-25 \n2 384.19319 0.09091 SES 2020-07-25 \n3 1647.45273 0.04196 SES 2020-07-25 \n4 111.89052 0.00000 SES 2020-07-25 \n\n NEWCASES NEWDEATHS LAST_INFO_DATE \n0 0 0 2020-07-25 \n1 0 0 2020-07-24 \n2 0 0 2020-07-25 \n3 0 0 2020-07-25 \n4 0 0 2020-07-24 " 64 | }, 65 | "execution_count": 13, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": "df.columns = ['COUNTRY','PROVINCE_STATE','CITY',\n 'IBGEID','COD_REGIAODESAUDE','NOME_REGIAODESAUDE','DEATHS','CASES','DEATHS_PER_100K_INHABITANTS',\n 'TOTALCASES_PER_100K_INHABITANTS','DEATHS_BY_TOTALCASES','SOURCE','DATE',\n 'NEWCASES','NEWDEATHS','LAST_INFO_DATE']\ndf.head()" 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 14, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": "nome_tabela = 'COVID-BR'" 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 15, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": "df.to_sql(name=nome_tabela, con=engine, if_exists='append', index=False)\n#df.to_sql(name=nome_tabela, con=engine, if_exists='append', index=False, chunksize=1000) # With number of records to be processed each time" 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 16, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | 
"data": { 95 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
countryprovince_statecityibgeidcod_regiaodesaudenome_regiaodesaudedeathscasesdeaths_per_100k_inhabitantstotalcases_per_100k_inhabitantsdeaths_by_totalcasesSOURCEDATEnewcasesnewdeathslast_info_date
0BrazilGOAbadia de Goi\u00e1s/GO520005052001.0Central113511.398611538.812260.00741SES2020-07-25002020-07-25
1BrazilMGAbadia dos Dourados/MG310010431074.0Patroc\u00ednio / Monte Carmelo0150.00000214.622980.00000SES2020-07-25002020-07-24
2BrazilGOAbadi\u00e2nia/GO520010052011.0Pirineus77734.92665384.193190.09091SES2020-07-25002020-07-25
3BrazilPAAbaetetuba/PA150010715011.0Tocantins109259869.119461647.452730.04196SES2020-07-25002020-07-25
4BrazilMGAbaet\u00e9/MG310020331024.0Sete Lagoas0260.00000111.890520.00000SES2020-07-25002020-07-24
\n
", 96 | "text/plain": " country province_state city ibgeid cod_regiaodesaude \\\n0 Brazil GO Abadia de Goi\u00e1s/GO 5200050 52001.0 \n1 Brazil MG Abadia dos Dourados/MG 3100104 31074.0 \n2 Brazil GO Abadi\u00e2nia/GO 5200100 52011.0 \n3 Brazil PA Abaetetuba/PA 1500107 15011.0 \n4 Brazil MG Abaet\u00e9/MG 3100203 31024.0 \n\n nome_regiaodesaude deaths cases deaths_per_100k_inhabitants \\\n0 Central 1 135 11.39861 \n1 Patroc\u00ednio / Monte Carmelo 0 15 0.00000 \n2 Pirineus 7 77 34.92665 \n3 Tocantins 109 2598 69.11946 \n4 Sete Lagoas 0 26 0.00000 \n\n totalcases_per_100k_inhabitants deaths_by_totalcases SOURCE DATE \\\n0 1538.81226 0.00741 SES 2020-07-25 \n1 214.62298 0.00000 SES 2020-07-25 \n2 384.19319 0.09091 SES 2020-07-25 \n3 1647.45273 0.04196 SES 2020-07-25 \n4 111.89052 0.00000 SES 2020-07-25 \n\n newcases newdeaths last_info_date \n0 0 0 2020-07-25 \n1 0 0 2020-07-24 \n2 0 0 2020-07-25 \n3 0 0 2020-07-25 \n4 0 0 2020-07-24 " 97 | }, 98 | "execution_count": 16, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": "df2 = pd.read_sql_table(nome_tabela, engine)\ndf2.head()" 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 17, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": "session.close()" 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": "" 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3.6", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.9" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 1 141 | } 
-------------------------------------------------------------------------------- /Predicting Telco Customer Churn using SparkML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Predicting Telco Customer Churn using SparkML on IBM Cloud Pak for Data (ICP4D)" 7 | }, 8 | { 9 | "cell_type": "markdown", 10 | "metadata": {}, 11 | "source": "We'll use this notebook to create a machine learning model to predict customer churn. In this notebook we will build the prediction model using the SparkML library.\n\nThis notebook walks you through these steps:\n\n- Load and Visualize data set. (https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv)\n- Build a predictive model with SparkML API\n- Save the model in the ML repository\n\n* This notebook has been updated for compatibility with the newer versions of the services, and it is part of the code pattern at: https://developer.ibm.com/patterns/data-analysis-model-building-and-deploying-with-wml/" 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": "\n# The project token is an authorization token that is used to access project resources like data sources and connections, and is used by platform APIs.\n# Generate the token in the project Settings, then insert the code here using the menu above (three dots) \"Insert Project Token\"\n" 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": "# The code was removed by Watson Studio for sharing." 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": "## 1.0 Install required packages\n\nThere are a couple of Python packages we will use in this notebook. 
First we make sure the Watson Machine Learning client v3 is removed (it's not installed by default) and then install/upgrade the v4 version of the client (this package is installed by default on CP4D).\n\nWML Client: https://wml-api-pyclient-dev-v4.mybluemix.net/#repository" 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": "!pip uninstall --yes watson-machine-learning-client-V4\n!pip install --user watson-machine-learning-client-V4\n!pip install --user pyspark==2.4 --upgrade|tail -n 1\n!pip install --user scikit-learn==0.20.3 --upgrade|tail -n 1" 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": "import pandas as pd\nimport numpy as np\nimport json\nimport os\nimport warnings\n\nwarnings.filterwarnings(\"ignore\")" 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": "## 2.0 Load and Clean data\n\nWe'll load our data as a pandas data frame.\n\n**<< FOLLOW THE INSTRUCTIONS BELOW TO LOAD THE DATASET >>**\n\n* Highlight the cell below by clicking it.\n* Click the `10/01` \"Find data\" icon in the upper right of the notebook.\n* If you are using Virtualized data, begin by choosing the `Files` tab. Then choose your virtualized data (i.e. MYSCHEMA.BILLINGPRODUCTCUSTOMERS), click `Insert to code` and choose `Insert Pandas DataFrame`.\n* If you are using this notebook without virtualized data, add the locally uploaded file `Telco-Customer-Churn.csv` by choosing the `Files` tab. Then choose the `Telco-Customer-Churn.csv`. 
Click `Insert to code` and choose `Insert Pandas DataFrame`.\n* The code to bring the data into the notebook environment and create a Pandas DataFrame will be added to the cell below.\n* Run the cell\n" 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": "# Place cursor below and insert the Pandas DataFrame for the Telco churn data\nimport os, types\nimport pandas as pd\nfrom botocore.client import Config\nimport ibm_boto3\n\ndef __iter__(self): return 0\n\n# Insert pandas code below\n" 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": "# The code was removed by Watson Studio for sharing." 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": "We'll use the Pandas naming convention df for our DataFrame. Make sure that the cell below uses the name for the dataframe used above. For the locally uploaded file it should look like df_data_1 or df_data_2 or df_data_x. For the virtualized data case it should look like data_df_1 or data_df_2 or data_df_x.\n\n**<< UPDATE THE VARIABLE ASSIGNMENT TO THE VARIABLE GENERATED ABOVE. 
>>**" 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": "# for virtualized data\n# df = data_df_1\n\n# for local upload\ndf = df_data_2" 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": "### 2.1 Drop CustomerID feature (column)" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": "df = df.drop('customerID', axis=1)" 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": "df.head()" 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": "### 2.2 Examine the data types of the features" 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": "df.info()" 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": "# Statistics for the columns (features). Set it to all, since default is to describe just the numeric features.\ndf.describe(include = 'all')" 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": "We see that Tenure ranges from 0 (new customer) to 6 years, Monthly charges range from $18 to $118, etc." 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": "### 2.3 Check for need to Convert TotalCharges column to numeric if it is detected as object\n\nIf the above `df.info` shows the \"TotalCharges\" column as an object, we'll need to convert it to numeric. If you have already done this during a previous exercise for \"Data Visualization with Data Refinery\", you can skip to step `2.4`." 
124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": "totalCharges = df.columns.get_loc(\"TotalCharges\")\nnew_col = pd.to_numeric(df.iloc[:, totalCharges], errors='coerce')\ndf.iloc[:, totalCharges] = pd.Series(new_col)" 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": "# Statistics for the columns (features). Set it to all, since default is to describe just the numeric features.\ndf.describe(include = 'all')" 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": "We now see statistics for the `TotalCharges` feature." 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": "\n\n### 2.4 Any NaN values should be removed to create a more accurate model." 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": "# Check if we have any NaN values and see which features have missing values that should be addressed\nprint(df.isnull().values.any())\ndf.isnull().sum()" 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": "We should see that the `TotalCharges` column has missing values. There are various ways we can address this issue:\n\n- Drop records with missing values \n- Fill in the missing value with one of the following strategies: Zero, Mean of the values for the column, Random value, etc)." 
160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": "# Handle missing values for nan_column (TotalCharges)\nfrom sklearn.impute import SimpleImputer\n\n# Find the column number for TotalCharges (starting at 0).\ntotal_charges_idx = df.columns.get_loc(\"TotalCharges\")\nimputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n\ndf.iloc[:, total_charges_idx] = imputer.fit_transform(df.iloc[:, total_charges_idx].values.reshape(-1, 1))\ndf.iloc[:, total_charges_idx] = pd.Series(df.iloc[:, total_charges_idx])" 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": "# Validate that we have addressed any NaN values\nprint(df.isnull().values.any())\ndf.isnull().sum()" 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": "\n### 2.5 Categorize Features\n\nWe will categorize some of the columns / features based on whether they are categorical values or continuous (i.e. numerical) values. We will use this in later sections to build visualizations."
179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": "columns_idx = np.s_[0:] # Slice of first row(header) with all columns.\nfirst_record_idx = np.s_[0] # Index of first record\n\nstring_fields = [type(fld) is str for fld in df.iloc[first_record_idx, columns_idx]] # All string fields\nall_features = [x for x in df.columns if x != 'Churn']\ncategorical_columns = list(np.array(df.columns)[columns_idx][string_fields])\ncategorical_features = [x for x in categorical_columns if x != 'Churn']\ncontinuous_features = [x for x in all_features if x not in categorical_features]\n\n#print('All Features: ', all_features)\n#print('\\nCategorical Features: ', categorical_features)\n#print('\\nContinuous Features: ', continuous_features)\n#print('\\nAll Categorical Columns: ', categorical_columns)" 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": "### 2.6 Visualize data\n\nData visualization can be used to find patterns, detect outliers, understand distribution and more. We can use graphs such as:\n\n- Histograms, boxplots, etc: To find distribution / spread of our continuous variables.\n- Bar charts: To show frequency in categorical values.\n" 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": "import seaborn as sns\nimport matplotlib.pyplot as plt\n\nfrom sklearn.preprocessing import LabelEncoder\n\n%matplotlib inline\nsns.set(style=\"darkgrid\")\nsns.set_palette(\"hls\", 3)" 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": "First, we get a high level view of the distribution of `Churn`. What percentage of customers in our dataset are churning vs not churning. 
" 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": "print(df.groupby(['Churn']).size())\nchurn_plot = sns.countplot(data=df, x='Churn', order=df.Churn.value_counts().index)\nplt.ylabel('Count')\nfor p in churn_plot.patches:\n height = p.get_height()\n churn_plot.text(p.get_x()+p.get_width()/2., height + 1,'{0:.0%}'.format(height/float(len(df))),ha=\"center\") \nplt.show()" 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": "We can use frequency count charts to get an understanding of the categorical features relative to `Churn` \n\n- We can see that for the `gender` feature, we have relatively equal rates of churn by `gender`\n- We can see that for the `InternetService` feature, we have higher churn for those that have \"Fiber optic\" service versus those with \"DSL\"\n" 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": "# Categorical feature count plots\nf, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9), (ax10, ax11, ax12), (ax13, ax14, ax15)) = plt.subplots(5, 3, figsize=(20, 20))\nax = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10, ax11, ax12, ax13, ax14, ax15 ]\n\nfor i in range(len(categorical_features)):\n sns.countplot(x = categorical_features[i], hue=\"Churn\", data=df, ax=ax[i])" 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": "We can use histogram charts to get an understanding of the distribution of our continuous / numerical features relative to Churn.\n\n- We can see that for the `MonthlyCharges` feature, customers that churn tend to pay higher monthly fees than those that stay.\n- We can see that for the `tenure` feature, customers that churn tend to be relatively new customers."
227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": "# Continuous feature histograms.\nfig, ax = plt.subplots(2, 2, figsize=(28, 8))\ndf[df.Churn == 'No'][continuous_features].hist(bins=20, color=\"blue\", alpha=0.5, ax=ax)\ndf[df.Churn == 'Yes'][continuous_features].hist(bins=20, color=\"orange\", alpha=0.5, ax=ax)\n\n# Or use displots\n#sns.set_palette(\"hls\", 3)\n#f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(25, 25))\n#ax = [ax1, ax2, ax3, ax4]\n#for i in range(len(continuous_features)):\n# sns.distplot(df[continuous_features[i]], bins=20, hist=True, ax=ax[i])" 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "scrolled": true 240 | }, 241 | "outputs": [], 242 | "source": "# Create Grid for pairwise relationships\ngr = sns.PairGrid(df, height=5, hue=\"Churn\")\ngr = gr.map_diag(plt.hist)\ngr = gr.map_offdiag(plt.scatter)\ngr = gr.add_legend()" 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": "# Plot boxplots of numerical columns. More variation in the boxplot implies higher significance. \nf, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(25, 25))\nax = [ax1, ax2, ax3, ax4]\n\nfor i in range(len(continuous_features)):\n sns.boxplot(x = 'Churn', y = continuous_features[i], data=df, ax=ax[i])" 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": "## 3.0 Create a model\n\nNow we can create our machine learning model. You could use the insights / intuition gained from the data visualization steps above to what kind of model to create or which features to use. We will create a simple classification model." 
255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": "from pyspark.sql import SparkSession\nimport pandas as pd\nimport json\n\nspark = SparkSession.builder.getOrCreate()\ndf_data = spark.createDataFrame(df)\ndf_data.head()" 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": "### 3.1 Split the data into training and test sets" 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": "spark_df = df_data\n(train_data, test_data) = spark_df.randomSplit([0.8, 0.2], 24)\n\nprint(\"Number of records for training: \" + str(train_data.count()))\nprint(\"Number of records for evaluation: \" + str(test_data.count()))" 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": "### 3.2 Examine the Spark DataFrame Schema\nLook at the data types to determine requirements for feature engineering" 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": "spark_df.printSchema()" 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": "### 3.3 Use StringIndexer to encode a string column of labels to a column of label indices\n\nWe are using the Pipeline package to build the development steps as a pipeline. \nWe are using StringIndexer to handle categorical / string features from the dataset. StringIndexer encodes a string column of labels to a column of label indices\n\nWe then use VectorAssembler to assemble these features into a vector. 
Pipelines API requires that input variables are passed in a vector" 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": "from pyspark.ml.classification import RandomForestClassifier\nfrom pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler\nfrom pyspark.ml.evaluation import BinaryClassificationEvaluator\nfrom pyspark.ml import Pipeline, Model\n\n\nsi_gender = StringIndexer(inputCol = 'gender', outputCol = 'gender_IX')\nsi_Partner = StringIndexer(inputCol = 'Partner', outputCol = 'Partner_IX')\nsi_Dependents = StringIndexer(inputCol = 'Dependents', outputCol = 'Dependents_IX')\nsi_PhoneService = StringIndexer(inputCol = 'PhoneService', outputCol = 'PhoneService_IX')\nsi_MultipleLines = StringIndexer(inputCol = 'MultipleLines', outputCol = 'MultipleLines_IX')\nsi_InternetService = StringIndexer(inputCol = 'InternetService', outputCol = 'InternetService_IX')\nsi_OnlineSecurity = StringIndexer(inputCol = 'OnlineSecurity', outputCol = 'OnlineSecurity_IX')\nsi_OnlineBackup = StringIndexer(inputCol = 'OnlineBackup', outputCol = 'OnlineBackup_IX')\nsi_DeviceProtection = StringIndexer(inputCol = 'DeviceProtection', outputCol = 'DeviceProtection_IX')\nsi_TechSupport = StringIndexer(inputCol = 'TechSupport', outputCol = 'TechSupport_IX')\nsi_StreamingTV = StringIndexer(inputCol = 'StreamingTV', outputCol = 'StreamingTV_IX')\nsi_StreamingMovies = StringIndexer(inputCol = 'StreamingMovies', outputCol = 'StreamingMovies_IX')\nsi_Contract = StringIndexer(inputCol = 'Contract', outputCol = 'Contract_IX')\nsi_PaperlessBilling = StringIndexer(inputCol = 'PaperlessBilling', outputCol = 'PaperlessBilling_IX')\nsi_PaymentMethod = StringIndexer(inputCol = 'PaymentMethod', outputCol = 'PaymentMethod_IX')\n" 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": "si_Label = StringIndexer(inputCol=\"Churn\", 
outputCol=\"label\").fit(spark_df)\nlabel_converter = IndexToString(inputCol=\"prediction\", outputCol=\"predictedLabel\", labels=si_Label.labels)" 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": "### 3.4 Create a single vector" 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": "va_features = VectorAssembler(inputCols=['gender_IX', 'SeniorCitizen', 'Partner_IX', 'Dependents_IX', 'PhoneService_IX', 'MultipleLines_IX', 'InternetService_IX', \\\n 'OnlineSecurity_IX', 'OnlineBackup_IX', 'DeviceProtection_IX', 'TechSupport_IX', 'StreamingTV_IX', 'StreamingMovies_IX', \\\n 'Contract_IX', 'PaperlessBilling_IX', 'PaymentMethod_IX', 'TotalCharges', 'MonthlyCharges'], outputCol=\"features\")" 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": "### 3.5 Create a pipeline, and fit a model using RandomForestClassifier \nAssemble all the stages into a pipeline. We don't expect a clean linear regression, so we'll use RandomForestClassifier to find the best decision tree for the data." 
322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": "classifier = RandomForestClassifier(featuresCol=\"features\")\n\npipeline = Pipeline(stages=[si_gender, si_Partner, si_Dependents, si_PhoneService, si_MultipleLines, si_InternetService, si_OnlineSecurity, si_OnlineBackup, si_DeviceProtection, \\\n si_TechSupport, si_StreamingTV, si_StreamingMovies, si_Contract, si_PaperlessBilling, si_PaymentMethod, si_Label, va_features, \\\n classifier, label_converter])\n\nmodel = pipeline.fit(train_data)" 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": "# Score the held-out test split with the fitted pipeline.\npredictions = model.transform(test_data)\n\n# Evaluate on the classifier's continuous `rawPrediction` scores. The hard 0/1 `prediction`\n# column would collapse the ROC and PR curves to a single threshold and misstate both areas.\nevaluatorDT = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\", labelCol=\"label\", metricName='areaUnderROC')\narea_under_curve = evaluatorDT.evaluate(predictions)\nevaluatorDT = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\", labelCol=\"label\", metricName='areaUnderPR')\narea_under_PR = evaluatorDT.evaluate(predictions)\nprint(\"areaUnderROC = %g\" % area_under_curve)\nprint(\"areaUnderPR = %g\" % area_under_PR)" 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": "## 4.0 Save the model and test data\n\nNow the model can be saved for future deployment. 
The model will be saved using the Watson Machine Learning client, to a deployment space.\n\n**<< UPDATE THE VARIABLE 'MODEL_NAME' TO A UNIQUE NAME>>**\n\n**<< UPDATE THE VARIABLE 'DEPLOYMENT_SPACE_NAME' TO THE NAME OF THE DEPLOYMENT SPACE CREATED PREVIOUSLY>>**" 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": "\nMODEL_NAME = \"GAMA-PREDICT-CHURN\"\nDEPLOYMENT_SPACE_NAME = 'Big-Data'\n" 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": "### 4.1 Save the model to ICP4D local Watson Machine Learning\n\n1. Generate an API Key: https://cloud.ibm.com/iam/apikeys\n2. Generate a TOKEN for your Watson Machine Learning:\n curl --insecure -X POST --header \"Content-Type: application/x-www-form-urlencoded\" --header \"Accept: application/json\" --data-urlencode \"grant_type=urn:ibm:params:oauth:grant-type:apikey\" --data-urlencode \"apikey=$API_key\" \"https://iam.ng.bluemix.net/identity/token\"\n3. Replace the `token` value of `*******` with the token you generated. The value for `url` should match the `url` for your Watson Machine Learning." 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": "from ibm_watson_machine_learning import APIClient\n\nwml_credentials = {\n \"url\": \"https://us-south.ml.cloud.ibm.com\",\n \"token\":\"*******\"\n}\n\nclient = APIClient(wml_credentials)" 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": "# The code was removed by Watson Studio for sharing."
367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": "client.spaces.list()" 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": "### Use the desired space as the `default_space`\n\nThe deployment space ID will be looked up based on the name specified above. If you do not receive a space GUID as an output to the next cell, do not proceed until you have created a deployment space." 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": "# Be sure to update the name of the space with the one you want to use.\n#client.spaces.list()\nall_spaces = client.spaces.get_details()['resources']\nspace_id = None\n#print(all_spaces)\nfor space in all_spaces:\n if space['entity']['name'] == DEPLOYMENT_SPACE_NAME:\n space_id = space[\"metadata\"][\"id\"]\n print(\"\\nDeployment Space GUID: \", space_id)\n\nif space_id is None:\n print(\"WARNING: Your space does not exist. Create a deployment space before proceeding to the next cell.\")\n #space_id = client.spaces.store(meta_props={client.spaces.ConfigurationMetaNames.NAME: space_name})[\"metadata\"][\"guid\"]" 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": "**<< REPLACE space_id BELOW with the id for your space. For e.g.
client.set.default_space(\"6b39c537-f707-4078-9dc7-ce70b70ab22f\") >>
**" 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": "# Now set the default space to the GUID for your deployment space. If this is successful, you will see a 'SUCCESS' message.\nclient.set.default_space(space_id)" 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": "#### Save the Model" 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": "# case you need check the services, uncomment th eline below and run it.\n#client.software_specifications.list()" 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": "software_spec_id = client.software_specifications.get_id_by_name('spark-mllib_2.4')\nprint(software_spec_id)" 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": "# Store our model\nmodel_props = {client.repository.ModelMetaNames.NAME: MODEL_NAME,\n client.repository.ModelMetaNames.SOFTWARE_SPEC_UID : software_spec_id,\n client.repository.ModelMetaNames.TYPE : \"mllib_2.4\"}\npublished_model = client.repository.store_model(model=model, pipeline=pipeline, meta_props=model_props, training_data=train_data)\n\nprint(json.dumps(published_model, indent=3))" 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": "# Use this cell to do any cleanup of previously created models and deployments\nclient.repository.list_models()\nclient.deployments.list()\n\n# client.repository.delete('GUID of stored model')\n# client.deployments.delete('GUID of deployed model')\n" 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": "## 5.0 Save Test Data\n\nWe will save the test data we used to evaluate the model to our 
project." 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": "write_score_CSV=test_data.toPandas().drop(['Churn'], axis=1)\n#write_score_CSV.to_csv('/project_data/data_asset/TelcoCustomerSparkMLBatchScore.csv', sep=',', index=False)\nproject.save_data('TelcoCustomerSparkMLBatchScore.csv', write_score_CSV.to_csv(), overwrite=True)\n\nwrite_eval_CSV=test_data.toPandas()\n#write_eval_CSV.to_csv('/project_data/data_asset/TelcoCustomerSparkMLEval.csv', sep=',', index=False)\nproject.save_data('TelcoCustomerSparkMLEval.csv', write_eval_CSV.to_csv(), overwrite=True)" 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": "## Congratulations, you have created a model based on customer churn data, and deployed it to Watson Machine Learning!" 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3.7", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.7.10" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 1 471 | } -------------------------------------------------------------------------------- /Linear regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 33, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": "#import pandas as pd \nimport numpy as np \nimport matplotlib.pyplot as plt \nimport seaborn as seabornInstance \nfrom sklearn.model_selection import train_test_split \nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n%matplotlib inline" 9 | }, 10 | { 11 | 
"cell_type": "code", 12 | "execution_count": 34, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PIBCO2
01.7552.6
11.2462.3
22.5475.4
32.8374.3
43.6748.5
\n
", 18 | "text/plain": " PIB CO2\n0 1.7 552.6\n1 1.2 462.3\n2 2.5 475.4\n3 2.8 374.3\n4 3.6 748.5" 19 | }, 20 | "execution_count": 34, 21 | "metadata": {}, 22 | "output_type": "execute_result" 23 | } 24 | ], 25 | "source": "# The code was removed by Watson Studio for sharing." 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 35, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PIBCO2
count10.00000010.000000
mean2.460000526.300000
std1.459224267.164415
min0.800000253.000000
25%1.550000380.950000
50%2.300000468.850000
75%2.725000538.650000
max5.9000001180.600000
\n
", 35 | "text/plain": " PIB CO2\ncount 10.000000 10.000000\nmean 2.460000 526.300000\nstd 1.459224 267.164415\nmin 0.800000 253.000000\n25% 1.550000 380.950000\n50% 2.300000 468.850000\n75% 2.725000 538.650000\nmax 5.900000 1180.600000" 36 | }, 37 | "execution_count": 35, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": "df_data_1.describe()" 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 36, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "image/png": "\n", 52 | "text/plain": "
" 53 | }, 54 | "metadata": { 55 | "needs_background": "light" 56 | }, 57 | "output_type": "display_data" 58 | } 59 | ], 60 | "source": "df_data_1.plot(x='PIB', y='CO2', style='o') \nplt.title('PIB vs CO2') \nplt.xlabel('PIB') \nplt.ylabel('CO2') \nplt.show()" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 37, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": "" 70 | }, 71 | "execution_count": 37, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | }, 75 | { 76 | "data": { 77 | "image/png": "\n", 78 | "text/plain": "
" 79 | }, 80 | "metadata": { 81 | "needs_background": "light" 82 | }, 83 | "output_type": "display_data" 84 | } 85 | ], 86 | "source": "plt.figure(figsize=(15,10))\nplt.tight_layout()\nseabornInstance.distplot(df_data_1['CO2'])" 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 38, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": "X = df_data_1['PIB'].values.reshape(-1,1)\ny = df_data_1['CO2'].values.reshape(-1,1)\n" 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 45, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 46, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)" 110 | }, 111 | "execution_count": 46, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": "regressor = LinearRegression() \nregressor.fit(X_train, y_train)" 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 47, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": "[121.79434072]\n[[167.03397889]]\n" 127 | } 128 | ], 129 | "source": "#To retrieve the intercept:\nprint(regressor.intercept_)\n#For retrieving the slope:\nprint(regressor.coef_)" 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 48, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": "y_pred = regressor.predict(X_test)" 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 49, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ActualPredicted
0475.4539.379288
\n
", 146 | "text/plain": " Actual Predicted\n0 475.4 539.379288" 147 | }, 148 | "execution_count": 49, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": "df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})\ndf" 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 50, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "image/png": "\n", 163 | "text/plain": "
" 164 | }, 165 | "metadata": { 166 | "needs_background": "light" 167 | }, 168 | "output_type": "display_data" 169 | } 170 | ], 171 | "source": "df1 = df.head(25)\ndf1.plot(kind='bar',figsize=(16,10))\nplt.grid(which='major', linestyle='-', linewidth='0.5', color='green')\nplt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')\nplt.show()" 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": "plt.scatter(X_test, y_test, color='gray')\nplt.plot(X_test, y_pred, color='red', linewidth=2)\nplt.show()" 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)) \nprint('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred)) \nprint('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))" 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": "" 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3.6", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.6.9" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 1 216 | } --------------------------------------------------------------------------------