├── CSS selectors with Beautiful Soup_final.ipynb ├── Extracting_html_table_1_and_2.ipynb ├── Navigating_Websites_Part_1.ipynb ├── Navigating_websites_1-4_(all parts of the series).ipynb └── README.md /CSS selectors with Beautiful Soup_final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CSS selectors with Beautiful Soup" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 19, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "from bs4 import BeautifulSoup" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 20, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "page = requests.get('http://www.marketwatch.com', headers={'User-Agent': 'Mozilla/5.0'})\n", 31 | "soup = BeautifulSoup(page.content, 'html.parser')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# Finding Sections by tag" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "soup.select('p') # finds all p" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "soup.select('div') #finds all div" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "#finding a tag within a tag\n", 72 | "\n", 73 | "soup.select(\"p a\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "#multiple tags\n", 85 | "soup.select('p,a')" 86 
| ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "# Finding sections by class" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 26, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "[
\n", 106 | "

Latest NewsAll Times Eastern

\n", 107 | "
\n", 108 | " \n", 109 | " \n", 110 | "
\n", 111 | "
]" 112 | ] 113 | }, 114 | "execution_count": 26, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "\n", 121 | "soup.select('.element__options') #finds all classes with this name" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 27, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "1" 135 | ] 136 | }, 137 | "execution_count": 27, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "len(soup.select('.element__options'))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "class=\"latestNews j-scrollElement\" #multiple spaces within class name" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "soup.select(\".latestNews.j-scrollElement\")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "soup.select(\"ul.latestNews.j-scrollElement\") #tag with class" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "soup.find_all('ul', class_= \"latestNews j-scrollElement\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | 
"execution_count": null, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "
" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": true 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "soup.select(\"section.container.container--mostPopular.full-width.templateD1\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "
" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "soup.select(\"div.col.col--12.col--curated\") #tag with class" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "source": [ 267 | "# Using ID with tags" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 35, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "url = 'http://docs.python-requests.org/en/master/user/advanced/#advanced'\n", 279 | "\n", 280 | "#headers={'User-Agent': 'Mozilla/5.0'} #results in 403 code\n", 281 | "headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 36, 287 | "metadata": { 288 | "collapsed": true 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "response = requests.get(url, headers)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 37, 298 | "metadata": { 299 | "collapsed": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "soup = BeautifulSoup(response.content, 'html.parser')" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "
" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "collapsed": true 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "soup.select('div#session-objects')" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "
" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "soup.select('div#request-and-response-objects')" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "Python 3", 430 | "language": "python", 431 | "name": "python3" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.6.0" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 1 448 | } 449 | -------------------------------------------------------------------------------- /Extracting_html_table_1_and_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "from bs4 import BeautifulSoup\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "url ='http://www.espn.com/nba/statistics/player/_/stat/assists/sort/avgAssists/\n", 25 | "\n", 26 | 
"headers= {'User-Agent': 'Mozilla/5.0'}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "response = requests.get(url)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "response.status_code" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "response.content" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "soup = BeautifulSoup(response.content, 'html.parser')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "stat_table =soup.find_all('table', class_ = 'tablehead' )" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "len(stat_table)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "stat_table = stat_table[0]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "for row in stat_table.find_all('tr'):\n", 115 | " for cell in row.find_all('td'):\n", 116 | " print(cell.text)\n", 117 | " " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "with open ('basketball_stats.txt', 
'w') as r:\n", 129 | " for row in stat_table.find_all('tr'):\n", 130 | " for cell in row.find_all('td'):\n", 131 | " r.write(cell.text.ljust(22))\n", 132 | " r.write('\\n')\n", 133 | " " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#http://www.whoishostingthis.com/tools/user-agent/" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "#'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "# video 2" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 2, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "import requests\n", 174 | "from bs4 import BeautifulSoup\n", 175 | "import time" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 3, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "#http://www.espn.com/nba/statistics/player/_/stat/assists/sort/avgAssists/count/41" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 11, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "num = 1\n", 207 | "\n", 208 | "url ='http://www.espn.com/nba/statistics/player/_/stat/assists/sort/avgAssists/count/{}'.format(num)\n", 209 | "\n", 210 | "headers= {'User-Agent': 'Mozilla/5.0'}\n", 211 | "with open ('basketball_stats.txt', 'w') as r:\n", 212 | " 
r.write('BASKETBALL ASSISTS TABLE\\n')\n", 213 | "\n", 214 | "\n", 215 | "while num < 272:\n", 216 | " url ='http://www.espn.com/nba/statistics/player/_/stat/assists/sort/avgAssists/count/{}'.format(num)\n", 217 | " \n", 218 | " time.sleep(1)\n", 219 | " response = requests.get(url, headers)\n", 220 | " \n", 221 | " if response.status_code == 200:\n", 222 | " soup = BeautifulSoup(response.content, 'html.parser')\n", 223 | " stat_table = soup.find_all('table', class_ = 'tablehead')\n", 224 | " if len(stat_table) < 2:\n", 225 | " stat_table = stat_table[0]\n", 226 | " with open ('basketball_stats.txt', 'a') as r:\n", 227 | " for row in stat_table.find_all('tr'):\n", 228 | " for cell in row.find_all('td'):\n", 229 | " r.write(cell.text.ljust(22))\n", 230 | " r.write('\\n')\n", 231 | " else: print('Too many tables')\n", 232 | " \n", 233 | " else:\n", 234 | " print('No response')\n", 235 | " print(num)\n", 236 | " \n", 237 | " \n", 238 | " num += 40\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.6.0" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 1 285 | } 286 | -------------------------------------------------------------------------------- /Navigating_Websites_Part_1.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigating Gamefaqs" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "from bs4 import BeautifulSoup\n", 20 | "\n", 21 | "response = requests.get('https://www.gamefaqs.com', headers={'User-Agent': 'Mozilla/5.0'})" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "response.status_code" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "#Here is another head to try if you get an 401 status code\n", 44 | "\n", 45 | "response = requests.get('https://www.gamefaqs.com', headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "response.status_code" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "#if you get another 401 error, try this method\n", 77 | "\n", 78 | "import requests\n", 79 | "\n", 80 | "session = requests.Session()\n", 81 | "response = session.get('https://www.gamefaqs.com', headers={'User-Agent': 'Mozilla/5.0'})\n", 82 | "\n", 83 | "print(response.status_code)\n", 84 | 
"\n", 85 | "\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "soup = BeautifulSoup(response.content, 'html.parser')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "soup.title" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "soup.get_text(strip = True) #gets rid of new lines" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "#Let's get info about Vita" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "soup.find_all(string = 'Vita') #capitalized" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "soup.find_all(string = 'vita') #lower-case" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "type(soup.find_all(string = 'Vita')) #element-tag" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "len(soup.find_all(string = 'Vita')) #how many results" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | 
"soup.find_all(string = 'Vita')[0].find_parents()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "len(soup.find_all(string = 'Vita')[0].find_parents())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "soup.find_all('p',string = 'Vita') #no results" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "soup.find_all('a',string = 'Vita')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "soup.find_all('a',string = 'Vita')[0].get('href')" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": true 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "response = requests.get('https://www.gamefaqs.com/vita', headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "response.status_code" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "soup = BeautifulSoup(response.content, 'html.parser')" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "soup.get_text(strip 
= True)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "\"Top 10 Games\"" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "soup.find_all(string = 'Top 10 Games' )" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()\n", 316 | "\n", 317 | "#Lots of elements" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "len(soup.find_all(string = 'Top 10 Games')[0].find_parents())" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[0]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[1]" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[2]" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 
| "outputs": [], 371 | "source": [ 372 | "top_ten_vita_games =soup.find_all(string = 'Top 10 Games')[0].find_parents()[2]" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": false 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "type(top_ten_vita_games)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 395 | " print(i.get_text().strip())" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 407 | " print(i.find('a').get_text().strip())" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "top_ten_vita_games.find('ol').find_all('li')[1].get_text().strip()\n", 419 | " " 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 431 | " print(i.get_text().strip().split())" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "collapsed": false 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 443 | " print(' '.join((i.get_text()).strip().split()))\n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " " 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "collapsed": true 457 | }, 458 | "outputs": [], 459 | "source": 
[] 460 | } 461 | ], 462 | "metadata": { 463 | "kernelspec": { 464 | "display_name": "Python 3", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.6.0" 479 | } 480 | }, 481 | "nbformat": 4, 482 | "nbformat_minor": 1 483 | } 484 | -------------------------------------------------------------------------------- /Navigating_websites_1-4_(all parts of the series).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigating Gamefaqs" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "from bs4 import BeautifulSoup\n", 20 | "\n", 21 | "response = requests.get('https://www.gamefaqs.com', headers={'User-Agent': 'Mozilla/5.0'})" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "response.status_code" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "response = requests.get('https://www.gamefaqs.com', headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "response.status_code" 55 | ] 56 | }, 57 | { 58 | 
"cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "#if you get a 401 error, try this method\n", 75 | "\n", 76 | "import requests\n", 77 | "\n", 78 | "session = requests.Session()\n", 79 | "response = session.get('https://www.gamefaqs.com', headers={'User-Agent': 'Mozilla/5.0'})\n", 80 | "\n", 81 | "print(response.status_code)\n", 82 | "\n", 83 | "\n", 84 | "\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "soup = BeautifulSoup(response.content, 'html.parser')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "soup.title" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "soup.get_text(strip = True) #gets rid of new lines" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "#Let's get info about Vita" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "soup.find_all(string = 'Vita') #capitalized" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "soup.find_all(string = 'vita') #lower-case" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | 
"execution_count": null, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "type(soup.find_all(string = 'Vita')) #element-tag" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "len(soup.find_all(string = 'Vita')) #how many results" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "soup.find_all(string = 'Vita')[0].find_parents()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "len(soup.find_all(string = 'Vita')[0].find_parents())" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "soup.find_all('p',string = 'Vita') #no results" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "soup.find_all('a',string = 'Vita')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "soup.find_all('a',string = 'Vita')[0].get('href')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "response = requests.get('https://www.gamefaqs.com/vita', headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})" 239 | ] 240 | }, 241 | { 242 | "cell_type": 
"code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "response.status_code" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "soup = BeautifulSoup(response.content, 'html.parser')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "soup.get_text(strip = True)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "\"Top 10 Games\"" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "soup.find_all(string = 'Top 10 Games' )" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()\n", 314 | "\n", 315 | "#Lots of elements" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "len(soup.find_all(string = 'Top 10 Games')[0].find_parents())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[0]" 338 | ] 339 | }, 340 | { 
341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[1]" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "soup.find_all(string = 'Top 10 Games')[0].find_parents()[2]" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "top_ten_vita_games =soup.find_all(string = 'Top 10 Games')[0].find_parents()[2]" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "type(top_ten_vita_games)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 393 | " print(i.get_text().strip())" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": true 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 405 | " print(i.find('a').get_text().strip())" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": { 412 | "collapsed": false 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "top_ten_vita_games.find('ol').find_all('li')[1].get_text().strip()\n", 417 | " " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "for i in 
top_ten_vita_games.find('ol').find_all('li'):\n", 429 | " print(i.get_text().strip().split())" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": { 436 | "collapsed": false 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "for i in top_ten_vita_games.find('ol').find_all('li'):\n", 441 | " print(' '.join((i.get_text()).strip().split()))\n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " " 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "# 'Upcoming PlayStation Vita Game Releases'" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "Brief Overview of some of these methods:\n", 466 | " \n", 467 | "find() and find_all() \n", 468 | "find_parents() and find_parent()\n", 469 | "find_next_siblings() and find_next_sibling()\n", 470 | "find_previous_siblings() and find_previous_sibling()\n", 471 | "find_all_next() and find_next()\n", 472 | "find_all_previous() and find_previous()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "#Sample html\n", 484 | "\n", 485 | "\n", 486 | "html1 = '''\n", 487 | "\n", 488 | "\n", 489 | "HTML Sample Cell\n", 490 | " \n", 491 | "
\n", 492 | "

New York

\n", 493 | "

New York is the home of various cultures and ethnicities.

\n", 494 | "
\n", 495 | "\n", 496 | "
\n", 497 | "

London

\n", 498 | "

London is the home of various cultures and ethnicities.

\n", 499 | "
\n", 500 | "'''" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "soup = BeautifulSoup(html1, 'html.parser')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "collapsed": true 519 | }, 520 | "outputs": [], 521 | "source": [ 522 | "soup.find_all('div')" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": true 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "#find parents" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [], 543 | "source": [ 544 | "soup.find(string = 'New York').find_parent()" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": true 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "soup.find(string = 'New York').find_parents()" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": false 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "#find_siblings\n", 567 | "\n", 568 | "soup.find(string = 'New York').find_next_siblings()\n", 569 | "\n" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "collapsed": false 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "soup.find(string = 'New York').find_previous_siblings()" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "collapsed": false 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "soup.find(string = 'New York').find_parent().find_next_sibling()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | 
"collapsed": false 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "soup.find(string = 'New York').find_parent().find_previous_siblings()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "collapsed": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "soup.find(string = 'New York').find_parent().find_previous_sibling()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": false 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "soup.find(string = 'New York').find_parent().find_next_sibling().find_previous_siblings()" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": { 631 | "collapsed": true 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "#find next and previous\n", 636 | "find_all_next() and find_next()\n", 637 | "find_all_previous() and find_previous()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "collapsed": false 645 | }, 646 | "outputs": [], 647 | "source": [ 648 | "soup.find(string = 'New York').find_next()" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "soup.find(string = 'New York').find_next().find_next()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "collapsed": false 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "soup.find(string = 'New York').find_all_next()" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": { 677 | "collapsed": false 678 | }, 679 | "outputs": [], 680 | "source": [ 681 | "soup.find(string = 'London').find_all_next()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": 
{ 688 | "collapsed": false 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "soup.find(string = 'New York').find_previous() #same as parent" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": { 699 | "collapsed": false 700 | }, 701 | "outputs": [], 702 | "source": [ 703 | "soup.find(string = 'New York').find_all_previous() #same as all parents" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": { 710 | "collapsed": true 711 | }, 712 | "outputs": [], 713 | "source": [] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "# Let's find UPCOMING VITA RELEASES" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 39, 725 | "metadata": { 726 | "collapsed": true 727 | }, 728 | "outputs": [], 729 | "source": [ 730 | "import requests\n", 731 | "from bs4 import BeautifulSoup" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 40, 737 | "metadata": { 738 | "collapsed": true 739 | }, 740 | "outputs": [], 741 | "source": [ 742 | "response = requests.get('https://www.gamefaqs.com/vita', headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": 41, 748 | "metadata": { 749 | "collapsed": false 750 | }, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "200" 756 | ] 757 | }, 758 | "execution_count": 41, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "response.status_code" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 42, 770 | "metadata": { 771 | "collapsed": true 772 | }, 773 | "outputs": [], 774 | "source": [ 775 | "soup = BeautifulSoup(response.content, 'html.parser')" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 
780 | "execution_count": 43, 781 | "metadata": { 782 | "collapsed": false 783 | }, 784 | "outputs": [ 785 | { 786 | "data": { 787 | "text/plain": [ 788 | "'Upcoming PlayStation Vita Game Releases'" 789 | ] 790 | }, 791 | "execution_count": 43, 792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "soup.find(string = 'Upcoming PlayStation Vita Game Releases')" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": { 804 | "collapsed": true 805 | }, 806 | "outputs": [], 807 | "source": [ 808 | "soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents()" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 7, 814 | "metadata": { 815 | "collapsed": false 816 | }, 817 | "outputs": [ 818 | { 819 | "data": { 820 | "text/plain": [ 821 | "11" 822 | ] 823 | }, 824 | "execution_count": 7, 825 | "metadata": {}, 826 | "output_type": "execute_result" 827 | } 828 | ], 829 | "source": [ 830 | "len(soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents())" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": null, 836 | "metadata": { 837 | "collapsed": true 838 | }, 839 | "outputs": [], 840 | "source": [ 841 | "for i in range(11):\n", 842 | " \n", 843 | " print(soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents()[i])\n", 844 | " print('\\n***************************************************')" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": { 851 | "collapsed": true 852 | }, 853 | "outputs": [], 854 | "source": [] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": { 860 | "collapsed": true 861 | }, 862 | "outputs": [], 863 | "source": [ 864 | "soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents()" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | 
"execution_count": null, 870 | "metadata": { 871 | "collapsed": true 872 | }, 873 | "outputs": [], 874 | "source": [ 875 | "soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents()[2]" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 45, 881 | "metadata": { 882 | "collapsed": true 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "North_America_games = soup.find(string = 'Upcoming PlayStation Vita Game Releases').find_parents()[2]" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": { 893 | "collapsed": true 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "North_America_games" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 47, 903 | "metadata": { 904 | "collapsed": false 905 | }, 906 | "outputs": [ 907 | { 908 | "name": "stdout", 909 | "output_type": "stream", 910 | "text": [ 911 | "07/18\n", 912 | "99Vidas\n", 913 | "Fallen Legion: Flames of Rebellion\n", 914 | "07/28\n", 915 | "Collar x Malice\n", 916 | "08/11\n", 917 | "Drive Girls\n", 918 | "08/15\n", 919 | "Summon Night 6: Lost Borders\n", 920 | "08/29\n", 921 | "WindJammers\n", 922 | "08/31\n", 923 | "Mary Skelter: Nightmares\n", 924 | "09/12\n", 925 | "Ys VIII: Lacrimosa of DANA\n", 926 | "09/26\n", 927 | "Danganronpa V3: Killing Harmony\n", 928 | "09/29\n", 929 | "Bad Apple Wars\n", 930 | "10/10\n", 931 | "Touhou Kobuto V: Burst Battle\n", 932 | "10/24\n", 933 | "Yomawari: Midnight Shadows\n", 934 | "11/30\n", 935 | "Tokyo Tattoo Girls\n", 936 | "Utawarerumono: Mask of Truth\n", 937 | "03/20\n", 938 | "Penny-Punching Princess\n" 939 | ] 940 | } 941 | ], 942 | "source": [ 943 | "for i in North_America_games.find_all('dl')[:-1]:\n", 944 | " print (i.find('dt').get_text())\n", 945 | " for games in (i.find_all('dd')):\n", 946 | " print(games.get_text())" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "metadata": { 953 | "collapsed": 
true 954 | }, 955 | "outputs": [], 956 | "source": [] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "metadata": { 962 | "collapsed": true 963 | }, 964 | "outputs": [], 965 | "source": [ 966 | "soup.find(string = 'Japan').find_all_next()" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "metadata": { 973 | "collapsed": true 974 | }, 975 | "outputs": [], 976 | "source": [ 977 | "len(soup.find(string = 'Japan').find_all_next())" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 22, 983 | "metadata": { 984 | "collapsed": false 985 | }, 986 | "outputs": [ 987 | { 988 | "data": { 989 | "text/plain": [ 990 | "[

See Also...

,

North America

,

Japan

,

Europe

]" 991 | ] 992 | }, 993 | "execution_count": 22, 994 | "metadata": {}, 995 | "output_type": "execute_result" 996 | } 997 | ], 998 | "source": [ 999 | "soup.select('h3')" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 28, 1005 | "metadata": { 1006 | "collapsed": false 1007 | }, 1008 | "outputs": [ 1009 | { 1010 | "data": { 1011 | "text/plain": [ 1012 | "
Dragon Ball Z: Battle of Z (Welcome Price!!)
" 1013 | ] 1014 | }, 1015 | "execution_count": 28, 1016 | "metadata": {}, 1017 | "output_type": "execute_result" 1018 | } 1019 | ], 1020 | "source": [ 1021 | "soup.select('h3')[2].find_next().find_next().find_next().find_next()" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "code", 1026 | "execution_count": 34, 1027 | "metadata": { 1028 | "collapsed": true 1029 | }, 1030 | "outputs": [], 1031 | "source": [ 1032 | "append_list =[]\n", 1033 | "x =soup.select('h3')[2]\n", 1034 | "\n", 1035 | "while str(x) != '

Europe

':\n", 1036 | " x = x.find_next()\n", 1037 | " append_list.append(x.get_text())\n", 1038 | " \n", 1039 | " " 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": { 1046 | "collapsed": true 1047 | }, 1048 | "outputs": [], 1049 | "source": [] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 36, 1054 | "metadata": { 1055 | "collapsed": false 1056 | }, 1057 | "outputs": [ 1058 | { 1059 | "name": "stdout", 1060 | "output_type": "stream", 1061 | "text": [ 1062 | "07/20\n", 1063 | "Dragon Ball Z: Battle of Z (Welcome Price!!)\n", 1064 | "Farming Simulator 18: Pocket Nouen 4\n", 1065 | "HimeHibi: Princess Days\n", 1066 | "Tokyo Ghoul: Jail (Welcome Price!!)\n", 1067 | "Under Night In-Birth Exe:Late[st]\n", 1068 | "World Election\n", 1069 | "07/21\n", 1070 | "Onigiri\n", 1071 | "07/27\n", 1072 | "Grisaia no Kajitsu: Side Episode\n", 1073 | "Hiiro no Kakera: Omoi Iro no Kioku\n", 1074 | "Kenka Bancho Otome: Kanzenmuketsu no My Honey\n", 1075 | "Utsusemi no Mawari\n", 1076 | "Wagamama High Spec\n", 1077 | "08/24\n", 1078 | "Futagoza no Paradox\n", 1079 | "Starry * Sky: Autumn Stories\n", 1080 | "Taishou Mebiusline: Teito Bibouroku Hare\n", 1081 | "The Lost Child\n", 1082 | "Yomawari: Midnight Shadows\n", 1083 | "08/31\n", 1084 | "Crank In\n", 1085 | "Dungeon Travelers 2: Ouritsu Toshokan to Mamono no Fuuin (Aqua...\n", 1086 | "To Heart 2: Dungeon Travelers (Aquaprice 2800)\n" 1087 | ] 1088 | } 1089 | ], 1090 | "source": [ 1091 | "append_list\n", 1092 | "from collections import OrderedDict\n", 1093 | "list(OrderedDict.fromkeys(append_list))\n", 1094 | "\n", 1095 | "for i in list(OrderedDict.fromkeys(append_list))[:-2]:\n", 1096 | " print(i)\n" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": null, 1102 | "metadata": { 1103 | "collapsed": true 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "list(OrderedDict.fromkeys(append_list))" 1108 | ] 1109 | }, 1110 | { 
1111 | "cell_type": "code", 1112 | "execution_count": null, 1113 | "metadata": { 1114 | "collapsed": true 1115 | }, 1116 | "outputs": [], 1117 | "source": [ 1118 | "\n", 1119 | "\n", 1120 | "\n", 1121 | "###THE END####" 1122 | ] 1123 | } 1124 | ], 1125 | "metadata": { 1126 | "kernelspec": { 1127 | "display_name": "Python 3", 1128 | "language": "python", 1129 | "name": "python3" 1130 | }, 1131 | "language_info": { 1132 | "codemirror_mode": { 1133 | "name": "ipython", 1134 | "version": 3 1135 | }, 1136 | "file_extension": ".py", 1137 | "mimetype": "text/x-python", 1138 | "name": "python", 1139 | "nbconvert_exporter": "python", 1140 | "pygments_lexer": "ipython3", 1141 | "version": "3.6.0" 1142 | } 1143 | }, 1144 | "nbformat": 4, 1145 | "nbformat_minor": 1 1146 | } 1147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gentle-Intro-to-Web-Scraping 2 | This is the code for my Gentle Intro to Web Scraping series. Enjoy! 3 | --------------------------------------------------------------------------------