├── Final Report.pdf
├── Final Presentation.pdf
├── README.md
├── SCN Toronto : Web Scraping.ipynb
└── SCN Toronto : Latitude and Longitude.ipynb
/Final Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p3jitnath/IBM-DataScience-Capstone/HEAD/Final Report.pdf
--------------------------------------------------------------------------------
/Final Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p3jitnath/IBM-DataScience-Capstone/HEAD/Final Presentation.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IBM-DataScience-Capstone
2 | This is the repository for the capstone project of the IBM Data Science Professional Certificate offered on Coursera.
3 |
4 | ## Overview
5 |
6 | 1. `Coursera Capstone Project` - Contains the Jupyter Notebook for the final project
7 | 2. `Final Presentation` - Contains the presentation for the final project
8 | 3. `Final Report` - Contains the extensive report for the final project
9 |
10 | 4. `SCN Toronto` - Segmentation and Clustering of Neighbourhoods of Toronto
11 | 4.1. `Foursquare API` - Contains the code for the implementation of the Foursquare API
12 | 4.2. `Latitude and Longitude` - Contains the code for the use of the Geocoding API
13 | 4.3. `Web Scraping` - Contains the code for web scraping the necessary data from a Wikipedia page
14 |
--------------------------------------------------------------------------------
/SCN Toronto : Web Scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat_minor": 1,
3 | "cells": [
4 | {
5 | "source": "### Importing required libraries",
6 | "cell_type": "markdown",
7 | "metadata": {}
8 | },
9 | {
10 | "execution_count": 1,
11 | "cell_type": "code",
12 | "metadata": {},
13 | "outputs": [],
14 | "source": "# Download beautifulsoup4 library for webscraping, if not installed\n# !conda install beautifulsoup4\n\nfrom bs4 import BeautifulSoup\nimport requests\nimport pandas as pd\nimport csv "
15 | },
16 | {
17 | "source": "### Setting `max_colwidth` to `800` for better readability",
18 | "cell_type": "markdown",
19 | "metadata": {}
20 | },
21 | {
22 | "execution_count": 2,
23 | "cell_type": "code",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": "pd.set_option('max_colwidth', 800) "
27 | },
28 | {
29 | "source": "### Fetching the source webpage into `source` and initializing the `BeautifulSoup` object as `soup`\n",
30 | "cell_type": "markdown",
31 | "metadata": {}
32 | },
33 | {
34 | "execution_count": 3,
35 | "cell_type": "code",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": "source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text \nsoup = BeautifulSoup(source, 'lxml') "
39 | },
40 | {
41 | "source": "### Initializing the `csv_writer` object and writing the column names as the first row",
42 | "cell_type": "markdown",
43 | "metadata": {}
44 | },
45 | {
46 | "execution_count": 4,
47 | "cell_type": "code",
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "execution_count": 4,
52 | "metadata": {},
53 | "data": {
54 | "text/plain": "32"
55 | },
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": "csv_file = open('toronto_postal_codes.csv', 'w')\ncsv_writer = csv.writer(csv_file)\ncsv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])"
60 | },
61 | {
62 | "source": "### Scraping the page to extract the data table",
63 | "cell_type": "markdown",
64 | "metadata": {}
65 | },
66 | {
67 | "execution_count": 5,
68 | "cell_type": "code",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": "table = soup.find('table', class_='wikitable sortable')  # The postal codes table on the page\nrows = table.find_all('tr')  # All table rows\n\npostcodes = []       # Raw postcodes\nboroughs = []        # Raw boroughs\nneighbourhoods = []  # Raw neighbourhoods\n\nfor row in rows:\n    columns = row.find_all('td')\n    try:\n        if columns[1].text != 'Not assigned':  # Skip rows whose borough is 'Not assigned'\n\n            postcode = columns[0].text\n            postcodes.append(postcode)\n\n            borough = columns[1].text\n            boroughs.append(borough)\n\n            neighbourhood = columns[2].text.split('\\n')[0]  # Drop the trailing newline\n\n            if neighbourhood == 'Not assigned':  # A 'Not assigned' neighbourhood takes its borough's name\n                neighbourhood = borough\n\n            neighbourhoods.append(neighbourhood)\n\n    except IndexError:  # The header row has <th> cells only, so columns is empty\n        pass\n\npostcode_explored = []  # Postcodes already written out\nfor index_i, postcode_i in enumerate(postcodes):\n    if postcode_i not in postcode_explored:\n        nbds = neighbourhoods[index_i]\n        for index_f, postcode_f in enumerate(postcodes):\n            if postcode_i == postcode_f and index_i != index_f:\n                nbds = nbds + ', ' + neighbourhoods[index_f]  # Concatenate neighbourhoods sharing a postcode\n        csv_writer.writerow([postcode_i, boroughs[index_i], nbds])  # One csv row per unique postcode\n        postcode_explored.append(postcode_i)"
72 | },
73 | {
74 | "source": "### Closing the csv file\n",
75 | "cell_type": "markdown",
76 | "metadata": {}
77 | },
78 | {
79 | "execution_count": 6,
80 | "cell_type": "code",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": "csv_file.close()"
84 | },
85 | {
86 | "source": "### Creating the pandas dataframe",
87 | "cell_type": "markdown",
88 | "metadata": {}
89 | },
90 | {
91 | "execution_count": 7,
92 | "cell_type": "code",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": "df = pd.read_csv('toronto_postal_codes.csv')"
96 | },
97 | {
98 | "source": "### Getting the `shape` of the dataframe",
99 | "cell_type": "markdown",
100 | "metadata": {}
101 | },
102 | {
103 | "execution_count": 8,
104 | "cell_type": "code",
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "execution_count": 8,
109 | "metadata": {},
110 | "data": {
111 | "text/plain": "(103, 3)"
112 | },
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": "df.shape"
117 | }
118 | ],
119 | "metadata": {
120 | "kernelspec": {
121 | "display_name": "Python 3.5",
122 | "name": "python3",
123 | "language": "python"
124 | },
125 | "language_info": {
126 | "mimetype": "text/x-python",
127 | "nbconvert_exporter": "python",
128 | "version": "3.5.5",
129 | "name": "python",
130 | "file_extension": ".py",
131 | "pygments_lexer": "ipython3",
132 | "codemirror_mode": {
133 | "version": 3,
134 | "name": "ipython"
135 | }
136 | }
137 | },
138 | "nbformat": 4
139 | }
--------------------------------------------------------------------------------
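The scraping cell above merges neighbourhoods that share a postcode with a nested O(n²) scan over the raw lists. The same grouping can be sketched in a single pass with a dictionary; the sample lists below are illustrative stand-ins for the `postcodes`, `boroughs`, and `neighbourhoods` lists the notebook builds:

```python
from collections import OrderedDict

# Illustrative stand-ins for the lists the notebook's scraping loop produces
postcodes = ['M3A', 'M5A', 'M5A', 'M4A']
boroughs = ['North York', 'Downtown Toronto', 'Downtown Toronto', 'North York']
neighbourhoods = ['Parkwoods', 'Harbourfront', 'Regent Park', 'Victoria Village']

# Single pass: map each postcode to its borough and its list of neighbourhoods
grouped = OrderedDict()
for pc, borough, nbd in zip(postcodes, boroughs, neighbourhoods):
    grouped.setdefault(pc, (borough, []))[1].append(nbd)

# Flatten to (Postcode, Borough, 'Nbd1, Nbd2, ...') rows, as written to the csv
rows = [(pc, borough, ', '.join(nbds)) for pc, (borough, nbds) in grouped.items()]
# rows[1] → ('M5A', 'Downtown Toronto', 'Harbourfront, Regent Park')
```

An `OrderedDict` keeps first-seen postcode order on the Python 3.5 kernel the notebook targets; on Python 3.7+ a plain `dict` preserves insertion order as well.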
/SCN Toronto : Latitude and Longitude.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat_minor": 1,
3 | "cells": [
4 | {
5 | "source": "### Importing required libraries",
6 | "cell_type": "markdown",
7 | "metadata": {}
8 | },
9 | {
10 | "execution_count": 1,
11 | "cell_type": "code",
12 | "metadata": {},
13 | "outputs": [],
14 | "source": "# Download beautifulsoup4 library for webscraping, if not installed\n# !conda install beautifulsoup4\n\nfrom bs4 import BeautifulSoup\nimport requests\nimport pandas as pd\nimport csv "
15 | },
16 | {
17 | "source": "### Setting `max_colwidth` to `800` for better readability",
18 | "cell_type": "markdown",
19 | "metadata": {}
20 | },
21 | {
22 | "execution_count": 2,
23 | "cell_type": "code",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": "pd.set_option('max_colwidth', 800) "
27 | },
28 | {
29 | "source": "### Fetching the source webpage into `source` and initializing the `BeautifulSoup` object as `soup`\n",
30 | "cell_type": "markdown",
31 | "metadata": {}
32 | },
33 | {
34 | "execution_count": 3,
35 | "cell_type": "code",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": "source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text \nsoup = BeautifulSoup(source, 'lxml') "
39 | },
40 | {
41 | "source": "### Initializing the `csv_writer` object and writing the column names as the first row",
42 | "cell_type": "markdown",
43 | "metadata": {}
44 | },
45 | {
46 | "execution_count": 4,
47 | "cell_type": "code",
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "execution_count": 4,
52 | "metadata": {},
53 | "data": {
54 | "text/plain": "32"
55 | },
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": "csv_file = open('toronto_postal_codes.csv', 'w')\ncsv_writer = csv.writer(csv_file)\ncsv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])"
60 | },
61 | {
62 | "source": "### Scraping the page to extract the data table",
63 | "cell_type": "markdown",
64 | "metadata": {}
65 | },
66 | {
67 | "execution_count": 5,
68 | "cell_type": "code",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": "table = soup.find('table', class_='wikitable sortable')  # The postal codes table on the page\nrows = table.find_all('tr')  # All table rows\n\npostcodes = []       # Raw postcodes\nboroughs = []        # Raw boroughs\nneighbourhoods = []  # Raw neighbourhoods\n\nfor row in rows:\n    columns = row.find_all('td')\n    try:\n        if columns[1].text != 'Not assigned':  # Skip rows whose borough is 'Not assigned'\n\n            postcode = columns[0].text\n            postcodes.append(postcode)\n\n            borough = columns[1].text\n            boroughs.append(borough)\n\n            neighbourhood = columns[2].text.split('\\n')[0]  # Drop the trailing newline\n\n            if neighbourhood == 'Not assigned':  # A 'Not assigned' neighbourhood takes its borough's name\n                neighbourhood = borough\n\n            neighbourhoods.append(neighbourhood)\n\n    except IndexError:  # The header row has <th> cells only, so columns is empty\n        pass\n\npostcode_explored = []  # Postcodes already written out\nfor index_i, postcode_i in enumerate(postcodes):\n    if postcode_i not in postcode_explored:\n        nbds = neighbourhoods[index_i]\n        for index_f, postcode_f in enumerate(postcodes):\n            if postcode_i == postcode_f and index_i != index_f:\n                nbds = nbds + ', ' + neighbourhoods[index_f]  # Concatenate neighbourhoods sharing a postcode\n        csv_writer.writerow([postcode_i, boroughs[index_i], nbds])  # One csv row per unique postcode\n        postcode_explored.append(postcode_i)"
72 | },
73 | {
74 | "source": "### Closing the csv file\n",
75 | "cell_type": "markdown",
76 | "metadata": {}
77 | },
78 | {
79 | "execution_count": 6,
80 | "cell_type": "code",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": "csv_file.close()"
84 | },
85 | {
86 | "source": "### Creating the pandas dataframe",
87 | "cell_type": "markdown",
88 | "metadata": {}
89 | },
90 | {
91 | "execution_count": 7,
92 | "cell_type": "code",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": "df = pd.read_csv('toronto_postal_codes.csv')"
96 | },
97 | {
98 | "source": "### Getting the `shape` of the dataframe",
99 | "cell_type": "markdown",
100 | "metadata": {}
101 | },
102 | {
103 | "execution_count": 8,
104 | "cell_type": "code",
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "execution_count": 8,
109 | "metadata": {},
110 | "data": {
111 | "text/plain": "(103, 3)"
112 | },
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": "df.shape"
117 | },
118 | {
119 | "execution_count": 9,
120 | "cell_type": "code",
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "execution_count": 9,
125 | "metadata": {},
126 | "data": {
127 | "text/html": "<div><table border=\"1\" class=\"dataframe\"><thead><tr style=\"text-align: right;\"><th></th><th>Postcode</th><th>Borough</th><th>Neighbourhood</th></tr></thead><tbody><tr><th>0</th><td>M3A</td><td>North York</td><td>Parkwoods</td></tr><tr><th>1</th><td>M4A</td><td>North York</td><td>Victoria Village</td></tr><tr><th>2</th><td>M5A</td><td>Downtown Toronto</td><td>Harbourfront, Regent Park</td></tr><tr><th>3</th><td>M6A</td><td>North York</td><td>Lawrence Heights, Lawrence Manor</td></tr><tr><th>4</th><td>M7A</td><td>Queen's Park</td><td>Queen's Park</td></tr></tbody></table></div>",
128 | "text/plain": " Postcode Borough Neighbourhood\n0 M3A North York Parkwoods\n1 M4A North York Victoria Village\n2 M5A Downtown Toronto Harbourfront, Regent Park\n3 M6A North York Lawrence Heights, Lawrence Manor\n4 M7A Queen's Park Queen's Park"
129 | },
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": "df.head()"
134 | },
135 | {
136 | "source": "### Getting the list of postal codes",
137 | "cell_type": "markdown",
138 | "metadata": {}
139 | },
140 | {
141 | "execution_count": 10,
142 | "cell_type": "code",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": "postal_codes = df['Postcode'].values"
146 | },
147 | {
148 | "source": "### OpenCage Geocoder API credentials",
149 | "cell_type": "markdown",
150 | "metadata": {}
151 | },
152 | {
153 | "execution_count": 11,
154 | "cell_type": "code",
155 | "metadata": {},
156 | "outputs": [],
157 | "source": "# The code was removed by Watson Studio for sharing."
158 | },
159 | {
160 | "source": "### Using the OpenCage Geocoder API",
161 | "cell_type": "markdown",
162 | "metadata": {}
163 | },
164 | {
165 | "execution_count": 12,
166 | "cell_type": "code",
167 | "metadata": {},
168 | "outputs": [],
169 | "source": "import json\n\nlatitudes = []   # Latitudes, one per postal code\nlongitudes = []  # Longitudes, one per postal code\n\nfor postal_code in postal_codes:\n    place_name = postal_code + \" Toronto\"  # Query string for the geocoder\n    url = 'https://api.opencagedata.com/geocode/v1/json?q={}&key={}'.format(place_name, API_KEY)  # Request URL for the API call\n    obj = json.loads(requests.get(url).text)  # Parse the JSON response into a dictionary\n\n    results = obj['results']  # Candidate matches, best first\n    lat = results[0]['geometry']['lat']  # Latitude of the best match\n    lng = results[0]['geometry']['lng']  # Longitude of the best match\n\n    latitudes.append(lat)  # Collect the latitude\n    longitudes.append(lng)  # Collect the longitude"
170 | },
171 | {
172 | "source": "### Adding the latitude and longitude values to the dataframe",
173 | "cell_type": "markdown",
174 | "metadata": {}
175 | },
176 | {
177 | "execution_count": 13,
178 | "cell_type": "code",
179 | "metadata": {},
180 | "outputs": [],
181 | "source": "df['Latitude'] = latitudes\ndf['Longitude'] = longitudes"
182 | },
183 | {
184 | "execution_count": 14,
185 | "cell_type": "code",
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "execution_count": 14,
190 | "metadata": {},
191 | "data": {
192 | "text/html": "<div><table border=\"1\" class=\"dataframe\"><thead><tr style=\"text-align: right;\"><th></th><th>Postcode</th><th>Borough</th><th>Neighbourhood</th><th>Latitude</th><th>Longitude</th></tr></thead><tbody><tr><th>0</th><td>M3A</td><td>North York</td><td>Parkwoods</td><td>43.754500</td><td>-79.330000</td></tr><tr><th>1</th><td>M4A</td><td>North York</td><td>Victoria Village</td><td>43.727600</td><td>-79.314800</td></tr><tr><th>2</th><td>M5A</td><td>Downtown Toronto</td><td>Harbourfront, Regent Park</td><td>43.655500</td><td>-79.362600</td></tr><tr><th>3</th><td>M6A</td><td>North York</td><td>Lawrence Heights, Lawrence Manor</td><td>43.722300</td><td>-79.450400</td></tr><tr><th>4</th><td>M7A</td><td>Queen's Park</td><td>Queen's Park</td><td>-33.013441</td><td>151.594204</td></tr></tbody></table></div>",
193 | "text/plain": " Postcode Borough Neighbourhood Latitude \\\n0 M3A North York Parkwoods 43.754500 \n1 M4A North York Victoria Village 43.727600 \n2 M5A Downtown Toronto Harbourfront, Regent Park 43.655500 \n3 M6A North York Lawrence Heights, Lawrence Manor 43.722300 \n4 M7A Queen's Park Queen's Park -33.013441 \n\n Longitude \n0 -79.330000 \n1 -79.314800 \n2 -79.362600 \n3 -79.450400 \n4 151.594204 "
194 | },
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": "df.head()"
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 3.5",
204 | "name": "python3",
205 | "language": "python"
206 | },
207 | "language_info": {
208 | "mimetype": "text/x-python",
209 | "nbconvert_exporter": "python",
210 | "version": "3.5.5",
211 | "name": "python",
212 | "file_extension": ".py",
213 | "pygments_lexer": "ipython3",
214 | "codemirror_mode": {
215 | "version": 3,
216 | "name": "ipython"
217 | }
218 | }
219 | },
220 | "nbformat": 4
221 | }
--------------------------------------------------------------------------------
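The geocoding cell in the notebook above reads only `results[0]['geometry']['lat']` and `['lng']` from each OpenCage response, so its parsing logic can be exercised offline against a stub shaped like those fields. The coordinates below are illustrative sample values, and `extract_coords` is a hypothetical helper, not part of the notebook:

```python
import json

# A response stub containing only the fields the notebook reads;
# the coordinates are illustrative sample values
sample_response = json.dumps({
    "results": [
        {"geometry": {"lat": 43.7545, "lng": -79.33}}
    ]
})

def extract_coords(response_text):
    """Mirror the notebook's parsing: first result's latitude and longitude."""
    obj = json.loads(response_text)
    results = obj["results"]
    return results[0]["geometry"]["lat"], results[0]["geometry"]["lng"]

lat, lng = extract_coords(sample_response)
# lat → 43.7545, lng → -79.33
```

Taking `results[0]` trusts the geocoder's first match unconditionally; the `M7A` row in the final dataframe (latitude -33, longitude 151) shows a mismatch slipping through, so a sanity check against Toronto's approximate bounding box would be a sensible guard.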