└── scrapey_scrape.ipynb /scrapey_scrape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e240dfab", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "openai_key=\"DEFINE_YOUR_KEY_HERE\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "54c2d44f", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from bs4 import BeautifulSoup\n", 21 | "from playwright.async_api import async_playwright\n", 22 | "from langchain.chains import create_extraction_chain" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "dc9e556d", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "async def run_playwright(site):\n", 33 | " data = \"\"\n", 34 | " async with async_playwright() as p:\n", 35 | " browser = await p.chromium.launch(headless=True)\n", 36 | "\n", 37 | " page = await browser.new_page()\n", 38 | " await page.goto(site)\n", 39 | "\n", 40 | " page_source = await page.content()\n", 41 | " soup = BeautifulSoup(page_source, \"html.parser\")\n", 42 | " \n", 43 | " for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n", 44 | " script.extract()\n", 45 | " # get text\n", 46 | " text = soup.get_text()\n", 47 | " # break into lines and remove leading and trailing space on each\n", 48 | " lines = (line.strip() for line in text.splitlines())\n", 49 | " # break multi-headlines into a line each\n", 50 | " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", 51 | " # drop blank lines\n", 52 | " data = '\\n'.join(chunk for chunk in chunks if chunk)\n", 53 | "\n", 54 | " await browser.close()\n", 55 | " return data\n", 56 | "\n", 57 | "output = await run_playwright(\"https://www.youtube.com/@tylerwhatsgood/videos\")\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 
null, 63 | "id": "cd7e39a7", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from langchain.chat_models import ChatOpenAI\n", 68 | "from langchain.chains import create_extraction_chain\n", 69 | "\n", 70 | "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n", 71 | "structured_schema = {\n", 72 | " \"properties\": {\n", 73 | " \"video_name\": {\"type\": \"string\"},\n", 74 | " \"views\": {\"type\": \"integer\"},\n", 75 | " },\n", 76 | " \"required\": [\"video_name\", \"views\"],\n", 77 | "}\n", 78 | "extraction_chain = create_extraction_chain(structured_schema, llm)\n", 79 | "extraction_chain.run(output)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "b9846eda", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "async def run_playwright(site):\n", 90 | " data = \"\"\n", 91 | " async with async_playwright() as p:\n", 92 | " browser = await p.chromium.launch(headless=True)\n", 93 | "\n", 94 | " page = await browser.new_page()\n", 95 | " await page.goto(site)\n", 96 | "\n", 97 | " page_source = await page.content()\n", 98 | " soup = BeautifulSoup(page_source, \"html.parser\")\n", 99 | "\n", 100 | " for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n", 101 | " script.extract()\n", 102 | " # get text\n", 103 | " text = soup.get_text()\n", 104 | " # break into lines and remove leading and trailing space on each\n", 105 | " lines = (line.strip() for line in text.splitlines())\n", 106 | " # break multi-headlines into a line each\n", 107 | " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", 108 | " # drop blank lines\n", 109 | " data = '\\n'.join(chunk for chunk in chunks if chunk)\n", 110 | "\n", 111 | " await browser.close()\n", 112 | " return data" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "5c16b1f5", 119 | "metadata": {}, 120 | "outputs": [], 
121 | "source": [ 122 | "output = await run_playwright(\"https://www.futuretools.io/\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "2b6b2999", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "output" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "e6260e44", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from langchain.chat_models import ChatOpenAI\n", 143 | "from langchain.chains import create_extraction_chain\n", 144 | "\n", 145 | "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n", 146 | "\n", 147 | "structured_schema = {\n", 148 | " \"properties\": {\n", 149 | " \"product\": {\"type\": \"string\"},\n", 150 | " \"description\": {\"type\": \"string\"},\n", 151 | " },\n", 152 | " \"required\": [\"product\", \"description\"],\n", 153 | "}\n", 154 | "extraction_chain = create_extraction_chain(structured_schema, llm)\n", 155 | "extraction_chain.run(output)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "d5b9cf69", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "async def run_playwright(site):\n", 166 | " data = \"\"\n", 167 | " async with async_playwright() as p:\n", 168 | " browser = await p.chromium.launch(headless=True)\n", 169 | "\n", 170 | " page = await browser.new_page()\n", 171 | " await page.goto(site)\n", 172 | "\n", 173 | " page_source = await page.content()\n", 174 | " soup = BeautifulSoup(page_source, \"html.parser\")\n", 175 | "\n", 176 | " for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n", 177 | " script.extract()\n", 178 | " # get text\n", 179 | " text = soup.get_text()\n", 180 | " # break into lines and remove leading and trailing space on each\n", 181 | " lines = (line.strip() for line in text.splitlines())\n", 182 | " # break multi-headlines into a line 
each\n", 183 | " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", 184 | " # drop blank lines\n", 185 | " data = '\\n'.join(chunk for chunk in chunks if chunk)\n", 186 | "\n", 187 | " await browser.close()\n", 188 | " return data\n", 189 | "\n", 190 | "output = await run_playwright(\"https://news.ycombinator.com/news\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "0d5f138d", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "output" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "b75c8340", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "from langchain.chat_models import ChatOpenAI\n", 211 | "from langchain.chains import create_extraction_chain\n", 212 | "\n", 213 | "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n", 214 | "structured_schema = {\n", 215 | " \"properties\": {\n", 216 | " \"name\": {\"type\": \"string\"},\n", 217 | " \"points\": {\"type\": \"integer\"},\n", 218 | " \"comments\": {\"type\": \"integer\"},\n", 219 | " \"url\": {\"type\":\"string\"}\n", 220 | " },\n", 221 | " \"required\": [\"name\", \"points\", \"comments\"],\n", 222 | "}\n", 223 | "extraction_chain = create_extraction_chain(structured_schema, llm)\n", 224 | "extraction_chain.run(output)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "454531ac", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "async def run_playwright(site):\n", 235 | " data = \"\"\n", 236 | " async with async_playwright() as p:\n", 237 | " browser = await p.chromium.launch(headless=True)\n", 238 | "\n", 239 | " page = await browser.new_page()\n", 240 | " await page.goto(site)\n", 241 | "\n", 242 | " page_source = await page.content()\n", 243 | " soup = BeautifulSoup(page_source, \"html.parser\")\n", 244 | " \n", 245 | " for script in 
soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n", 246 | " script.extract()\n", 247 | " # get text\n", 248 | " text = soup.get_text()\n", 249 | " # break into lines and remove leading and trailing space on each\n", 250 | " lines = (line.strip() for line in text.splitlines())\n", 251 | " # break multi-headlines into a line each\n", 252 | " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", 253 | " # drop blank lines\n", 254 | " data = '\\n'.join(chunk for chunk in chunks if chunk)\n", 255 | "\n", 256 | " await browser.close()\n", 257 | " return data\n", 258 | "\n", 259 | "output = await run_playwright(\"https://www.monster.com/jobs/l-los-angeles-ca?page=1\")" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "cf0def6c", 266 | "metadata": { 267 | "scrolled": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "output" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "487a5464", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "from langchain.chat_models import ChatOpenAI\n", 282 | "from langchain.chains import create_extraction_chain\n", 283 | "\n", 284 | "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n", 285 | "structured_schema = {\n", 286 | " \"properties\": {\n", 287 | " \"job_name\": {\"type\": \"string\"},\n", 288 | " \"posted_date\": {\"type\": \"string\"}\n", 289 | " },\n", 290 | " \"required\": [\"job_name\", \"posted_date\"],\n", 291 | "}\n", 292 | "extraction_chain = create_extraction_chain(structured_schema, llm)\n", 293 | "extraction_chain.run(output)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "f4b11e91", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "\n", 304 | "async def run_playwright(site):\n", 305 | " data = \"\"\n", 306 | " async with async_playwright() as p:\n", 
307 | " browser = await p.chromium.launch(headless=True)\n", 308 | "\n", 309 | " page = await browser.new_page()\n", 310 | " await page.goto(site)\n", 311 | "\n", 312 | " page_source = await page.content()\n", 313 | " soup = BeautifulSoup(page_source, \"html.parser\")\n", 314 | " \n", 315 | " for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n", 316 | " script.extract()\n", 317 | " # get text\n", 318 | " text = soup.get_text()\n", 319 | " # break into lines and remove leading and trailing space on each\n", 320 | " lines = (line.strip() for line in text.splitlines())\n", 321 | " # break multi-headlines into a line each\n", 322 | " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", 323 | " # drop blank lines\n", 324 | " data = '\\n'.join(chunk for chunk in chunks if chunk)\n", 325 | "\n", 326 | " await browser.close()\n", 327 | " return data\n", 328 | "\n", 329 | "output = await run_playwright(\"https://financialservices.house.gov/about/members.htm\")\n", 330 | "\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "04319b51", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from langchain.chat_models import ChatOpenAI\n", 341 | "from langchain.chains import create_extraction_chain\n", 342 | "\n", 343 | "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n", 344 | "structured_schema = {\n", 345 | " \"properties\": {\n", 346 | " \"member_name\": {\"type\": \"string\"},\n", 347 | " \"state\": {\"type\": \"string\"},\n", 348 | " },\n", 349 | " \"required\": [\"member_name\", \"state\"],\n", 350 | "}\n", 351 | "extraction_chain = create_extraction_chain(structured_schema, llm)\n", 352 | "extraction_chain.run(output)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "id": "db5c2bae", 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "data = 
# Snapshot of the extraction-chain output for the House Financial Services
# Committee members page (member name + home state), saved as a literal so the
# analysis below is reproducible without re-scraping.
data = [
    {'member_name': 'Patrick McHenry', 'state': 'North Carolina'},
    {'member_name': 'Frank D. Lucas', 'state': 'Oklahoma'},
    {'member_name': 'Pete Sessions', 'state': 'Texas'},
    {'member_name': 'Bill Posey', 'state': 'Florida'},
    {'member_name': 'Blaine Luetkemeyer', 'state': 'Missouri'},
    {'member_name': 'Bill Huizenga', 'state': 'Michigan'},
    {'member_name': 'Ann Wagner', 'state': 'Missouri'},
    {'member_name': 'Andy Barr', 'state': 'Kentucky'},
    {'member_name': 'Roger Williams', 'state': 'Texas'},
    {'member_name': 'French Hill', 'state': 'Arkansas'},
    {'member_name': 'Tom Emmer', 'state': 'Minnesota'},
    {'member_name': 'Barry Loudermilk', 'state': 'Georgia'},
    {'member_name': 'Alexander X. Mooney', 'state': 'West Virginia'},
    {'member_name': 'Warren Davidson', 'state': 'Ohio'},
    {'member_name': 'John Rose', 'state': 'Tennessee'},
    {'member_name': 'Bryan Steil', 'state': 'Wisconsin'},
    {'member_name': 'William Timmons', 'state': 'South Carolina'},
    {'member_name': 'Ralph Norman', 'state': 'South Carolina'},
    {'member_name': 'Dan Meuser', 'state': 'Pennsylvania'},
    {'member_name': 'Scott Fitzgerald', 'state': 'Wisconsin'},
    {'member_name': 'Andrew Garbarino', 'state': 'New York'},
    {'member_name': 'Young Kim', 'state': 'California'},
    {'member_name': 'Byron Donalds', 'state': 'Florida'},
    {'member_name': 'Mike Flood', 'state': 'Nebraska'},
    {'member_name': 'Mike Lawler', 'state': 'New York'},
    {'member_name': 'Zach Nunn', 'state': 'Iowa'},
    {'member_name': 'Monica De La Cruz', 'state': 'Texas'},
    {'member_name': 'Erin Houchin', 'state': 'Indiana'},
    {'member_name': 'Andy Ogles', 'state': 'Tennessee'},
    {'member_name': 'Maxine Waters', 'state': 'California'},
    {'member_name': 'Nydia M. Velázquez', 'state': 'New York'},
    {'member_name': 'Brad Sherman', 'state': 'California'},
    {'member_name': 'Gregory W. Meeks', 'state': 'New York'},
    {'member_name': 'David Scott', 'state': 'Georgia'},
    {'member_name': 'Stephen F. Lynch', 'state': 'Massachusetts'},
    {'member_name': 'Al Green', 'state': 'Texas'},
    {'member_name': 'Emanuel Cleaver, II', 'state': 'Missouri'},
    {'member_name': 'Jim A. Himes', 'state': 'Connecticut'},
    {'member_name': 'Bill Foster', 'state': 'Illinois'},
    {'member_name': 'Joyce Beatty', 'state': 'Ohio'},
    {'member_name': 'Juan Vargas', 'state': 'California'},
    {'member_name': 'Josh Gottheimer', 'state': 'New Jersey'},
    {'member_name': 'Vicente Gonzalez', 'state': 'Texas'},
    {'member_name': 'Sean Casten', 'state': 'Illinois'},
    {'member_name': 'Ayanna Pressley', 'state': 'Massachusetts'},
    {'member_name': 'Steven Horsford', 'state': 'Nevada'},
    {'member_name': 'Rashida Tlaib', 'state': 'Michigan'},
    {'member_name': 'Ritchie Torres', 'state': 'New York'},
    {'member_name': 'Sylvia Garcia', 'state': 'Texas'},
    {'member_name': 'Nikema Williams', 'state': 'Georgia'},
    {'member_name': 'Wiley Nickel', 'state': 'North Carolina'},
    {'member_name': 'Brittany Pettersen', 'state': 'Colorado'},
]

from collections import Counter


def find_most_potentially_corrupt_states(data):
    """Report the state with the most committee members.

    Args:
        data: list of dicts each carrying at least a 'state' key.

    Returns:
        A ``(state, count)`` tuple for the most common state, or ``None``
        when `data` is empty. (The original only printed and returned
        nothing, making it unusable programmatically, and raised
        ``IndexError`` on empty input.)
    """
    if not data:
        return None
    state_counts = Counter(item['state'] for item in data)
    most_common_state, count = state_counts.most_common(1)[0]
    # Runtime print strings kept byte-identical to the original notebook.
    print("Most common state:", most_common_state)
    print("Congress ppl with potential to become corrupted", count)
    return most_common_state, count


find_most_potentially_corrupt_states(data)
"execution_count": null, 528 | "id": "1111a2a5", 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": "askmoney", 537 | "language": "python", 538 | "name": "askmoney" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | "pygments_lexer": "ipython3", 550 | "version": "3.9.12" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 5 555 | } 556 | --------------------------------------------------------------------------------