└── scrapey_scrape.ipynb


/scrapey_scrape.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "e240dfab",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "openai_key=\"DEFINE_YOUR_KEY_HERE\""
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "54c2d44f",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from bs4 import BeautifulSoup\n",
 21 |     "from playwright.async_api import async_playwright\n",
 22 |     "from langchain.chains import create_extraction_chain"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "dc9e556d",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "async def run_playwright(site):\n",
 33 |     "    data = \"\"\n",
 34 |     "    async with async_playwright() as p:\n",
 35 |     "        browser = await p.chromium.launch(headless=True)\n",
 36 |     "\n",
 37 |     "        page = await browser.new_page()\n",
 38 |     "        await page.goto(site)\n",
 39 |     "\n",
 40 |     "        page_source = await page.content()\n",
 41 |     "        soup = BeautifulSoup(page_source, \"html.parser\")\n",
 42 |     "        \n",
 43 |     "        for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n",
 44 |     "            script.extract()\n",
 45 |     "        # get text\n",
 46 |     "        text = soup.get_text()\n",
 47 |     "        # break into lines and remove leading and trailing space on each\n",
 48 |     "        lines = (line.strip() for line in text.splitlines())\n",
 49 |     "        # break multi-headlines into a line each\n",
 50 |     "        chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
 51 |     "        # drop blank lines\n",
 52 |     "        data = '\\n'.join(chunk for chunk in chunks if chunk)\n",
 53 |     "\n",
 54 |     "        await browser.close()\n",
 55 |     "    return data\n",
 56 |     "\n",
 57 |     "output = await run_playwright(\"https://www.youtube.com/@tylerwhatsgood/videos\")\n"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "id": "cd7e39a7",
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "from langchain.chat_models import ChatOpenAI\n",
 68 |     "from langchain.chains import create_extraction_chain\n",
 69 |     "\n",
 70 |     "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n",
 71 |     "structured_schema = {\n",
 72 |     "    \"properties\": {\n",
 73 |     "        \"video_name\": {\"type\": \"string\"},\n",
 74 |     "        \"views\": {\"type\": \"integer\"},\n",
 75 |     "    },\n",
 76 |     "    \"required\": [\"video_name\", \"views\"],\n",
 77 |     "}\n",
 78 |     "extraction_chain = create_extraction_chain(structured_schema, llm)\n",
 79 |     "extraction_chain.run(output)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "b9846eda",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "async def run_playwright(site):\n",
 90 |     "    data = \"\"\n",
 91 |     "    async with async_playwright() as p:\n",
 92 |     "        browser = await p.chromium.launch(headless=True)\n",
 93 |     "\n",
 94 |     "        page = await browser.new_page()\n",
 95 |     "        await page.goto(site)\n",
 96 |     "\n",
 97 |     "        page_source = await page.content()\n",
 98 |     "        soup = BeautifulSoup(page_source, \"html.parser\")\n",
 99 |     "\n",
100 |     "        for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n",
101 |     "            script.extract()\n",
102 |     "        # get text\n",
103 |     "        text = soup.get_text()\n",
104 |     "        # break into lines and remove leading and trailing space on each\n",
105 |     "        lines = (line.strip() for line in text.splitlines())\n",
106 |     "        # break multi-headlines into a line each\n",
107 |     "        chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
108 |     "        # drop blank lines\n",
109 |     "        data = '\\n'.join(chunk for chunk in chunks if chunk)\n",
110 |     "\n",
111 |     "        await browser.close()\n",
112 |     "    return data"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "id": "5c16b1f5",
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "output = await run_playwright(\"https://www.futuretools.io/\")"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "id": "2b6b2999",
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "output"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "id": "e6260e44",
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "from langchain.chat_models import ChatOpenAI\n",
143 |     "from langchain.chains import create_extraction_chain\n",
144 |     "\n",
145 |     "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n",
146 |     "\n",
147 |     "structured_schema = {\n",
148 |     "    \"properties\": {\n",
149 |     "        \"product\": {\"type\": \"string\"},\n",
150 |     "        \"description\": {\"type\": \"string\"},\n",
151 |     "    },\n",
152 |     "    \"required\": [\"product\", \"description\"],\n",
153 |     "}\n",
154 |     "extraction_chain = create_extraction_chain(structured_schema, llm)\n",
155 |     "extraction_chain.run(output)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "id": "d5b9cf69",
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "async def run_playwright(site):\n",
166 |     "    data = \"\"\n",
167 |     "    async with async_playwright() as p:\n",
168 |     "        browser = await p.chromium.launch(headless=True)\n",
169 |     "\n",
170 |     "        page = await browser.new_page()\n",
171 |     "        await page.goto(site)\n",
172 |     "\n",
173 |     "        page_source = await page.content()\n",
174 |     "        soup = BeautifulSoup(page_source, \"html.parser\")\n",
175 |     "\n",
176 |     "        for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n",
177 |     "            script.extract()\n",
178 |     "        # get text\n",
179 |     "        text = soup.get_text()\n",
180 |     "        # break into lines and remove leading and trailing space on each\n",
181 |     "        lines = (line.strip() for line in text.splitlines())\n",
182 |     "        # break multi-headlines into a line each\n",
183 |     "        chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
184 |     "        # drop blank lines\n",
185 |     "        data = '\\n'.join(chunk for chunk in chunks if chunk)\n",
186 |     "\n",
187 |     "        await browser.close()\n",
188 |     "    return data\n",
189 |     "\n",
190 |     "output = await run_playwright(\"https://news.ycombinator.com/news\")"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "id": "0d5f138d",
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "output"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "id": "b75c8340",
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "from langchain.chat_models import ChatOpenAI\n",
211 |     "from langchain.chains import create_extraction_chain\n",
212 |     "\n",
213 |     "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n",
214 |     "structured_schema = {\n",
215 |     "    \"properties\": {\n",
216 |     "        \"name\": {\"type\": \"string\"},\n",
217 |     "        \"points\": {\"type\": \"integer\"},\n",
218 |     "        \"comments\": {\"type\": \"integer\"},\n",
219 |     "        \"url\": {\"type\":\"string\"}\n",
220 |     "    },\n",
221 |     "    \"required\": [\"name\", \"points\", \"comments\"],\n",
222 |     "}\n",
223 |     "extraction_chain = create_extraction_chain(structured_schema, llm)\n",
224 |     "extraction_chain.run(output)"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "id": "454531ac",
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "async def run_playwright(site):\n",
235 |     "    data = \"\"\n",
236 |     "    async with async_playwright() as p:\n",
237 |     "        browser = await p.chromium.launch(headless=True)\n",
238 |     "\n",
239 |     "        page = await browser.new_page()\n",
240 |     "        await page.goto(site)\n",
241 |     "\n",
242 |     "        page_source = await page.content()\n",
243 |     "        soup = BeautifulSoup(page_source, \"html.parser\")\n",
244 |     "        \n",
245 |     "        for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n",
246 |     "            script.extract()\n",
247 |     "        # get text\n",
248 |     "        text = soup.get_text()\n",
249 |     "        # break into lines and remove leading and trailing space on each\n",
250 |     "        lines = (line.strip() for line in text.splitlines())\n",
251 |     "        # break multi-headlines into a line each\n",
252 |     "        chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
253 |     "        # drop blank lines\n",
254 |     "        data = '\\n'.join(chunk for chunk in chunks if chunk)\n",
255 |     "\n",
256 |     "        await browser.close()\n",
257 |     "    return data\n",
258 |     "\n",
259 |     "output = await run_playwright(\"https://www.monster.com/jobs/l-los-angeles-ca?page=1\")"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "id": "cf0def6c",
266 |    "metadata": {
267 |     "scrolled": true
268 |    },
269 |    "outputs": [],
270 |    "source": [
271 |     "output"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "id": "487a5464",
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "from langchain.chat_models import ChatOpenAI\n",
282 |     "from langchain.chains import create_extraction_chain\n",
283 |     "\n",
284 |     "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n",
285 |     "structured_schema = {\n",
286 |     "    \"properties\": {\n",
287 |     "        \"job_name\": {\"type\": \"string\"},\n",
288 |     "        \"posted_date\": {\"type\": \"string\"}\n",
289 |     "    },\n",
290 |     "    \"required\": [\"job_name\", \"posted_date\"],\n",
291 |     "}\n",
292 |     "extraction_chain = create_extraction_chain(structured_schema, llm)\n",
293 |     "extraction_chain.run(output)"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "id": "f4b11e91",
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "\n",
304 |     "async def run_playwright(site):\n",
305 |     "    data = \"\"\n",
306 |     "    async with async_playwright() as p:\n",
307 |     "        browser = await p.chromium.launch(headless=True)\n",
308 |     "\n",
309 |     "        page = await browser.new_page()\n",
310 |     "        await page.goto(site)\n",
311 |     "\n",
312 |     "        page_source = await page.content()\n",
313 |     "        soup = BeautifulSoup(page_source, \"html.parser\")\n",
314 |     "        \n",
315 |     "        for script in soup([\"script\", \"style\"]): # remove all javascript and stylesheet code\n",
316 |     "            script.extract()\n",
317 |     "        # get text\n",
318 |     "        text = soup.get_text()\n",
319 |     "        # break into lines and remove leading and trailing space on each\n",
320 |     "        lines = (line.strip() for line in text.splitlines())\n",
321 |     "        # break multi-headlines into a line each\n",
322 |     "        chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
323 |     "        # drop blank lines\n",
324 |     "        data = '\\n'.join(chunk for chunk in chunks if chunk)\n",
325 |     "\n",
326 |     "        await browser.close()\n",
327 |     "    return data\n",
328 |     "\n",
329 |     "output = await run_playwright(\"https://financialservices.house.gov/about/members.htm\")\n",
330 |     "\n"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": null,
336 |    "id": "04319b51",
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "from langchain.chat_models import ChatOpenAI\n",
341 |     "from langchain.chains import create_extraction_chain\n",
342 |     "\n",
343 |     "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\", openai_api_key=openai_key)\n",
344 |     "structured_schema = {\n",
345 |     "    \"properties\": {\n",
346 |     "        \"member_name\": {\"type\": \"string\"},\n",
347 |     "        \"state\": {\"type\": \"string\"},\n",
348 |     "    },\n",
349 |     "    \"required\": [\"member_name\", \"state\"],\n",
350 |     "}\n",
351 |     "extraction_chain = create_extraction_chain(structured_schema, llm)\n",
352 |     "extraction_chain.run(output)"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "id": "db5c2bae",
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "data = [{'member_name': 'Patrick McHenry', 'state': 'North Carolina'},\n",
363 |     " {'member_name': 'Frank D. Lucas', 'state': 'Oklahoma'},\n",
364 |     " {'member_name': 'Pete Sessions', 'state': 'Texas'},\n",
365 |     " {'member_name': 'Bill Posey', 'state': 'Florida'},\n",
366 |     " {'member_name': 'Blaine Luetkemeyer', 'state': 'Missouri'},\n",
367 |     " {'member_name': 'Bill Huizenga', 'state': 'Michigan'},\n",
368 |     " {'member_name': 'Ann Wagner', 'state': 'Missouri'},\n",
369 |     " {'member_name': 'Andy Barr', 'state': 'Kentucky'},\n",
370 |     " {'member_name': 'Roger Williams', 'state': 'Texas'},\n",
371 |     " {'member_name': 'French Hill', 'state': 'Arkansas'},\n",
372 |     " {'member_name': 'Tom Emmer', 'state': 'Minnesota'},\n",
373 |     " {'member_name': 'Barry Loudermilk', 'state': 'Georgia'},\n",
374 |     " {'member_name': 'Alexander X. Mooney', 'state': 'West Virginia'},\n",
375 |     " {'member_name': 'Warren Davidson', 'state': 'Ohio'},\n",
376 |     " {'member_name': 'John Rose', 'state': 'Tennessee'},\n",
377 |     " {'member_name': 'Bryan Steil', 'state': 'Wisconsin'},\n",
378 |     " {'member_name': 'William Timmons', 'state': 'South Carolina'},\n",
379 |     " {'member_name': 'Ralph Norman', 'state': 'South Carolina'},\n",
380 |     " {'member_name': 'Dan Meuser', 'state': 'Pennsylvania'},\n",
381 |     " {'member_name': 'Scott Fitzgerald', 'state': 'Wisconsin'},\n",
382 |     " {'member_name': 'Andrew Garbarino', 'state': 'New York'},\n",
383 |     " {'member_name': 'Young Kim', 'state': 'California'},\n",
384 |     " {'member_name': 'Byron Donalds', 'state': 'Florida'},\n",
385 |     " {'member_name': 'Mike Flood', 'state': 'Nebraska'},\n",
386 |     " {'member_name': 'Mike Lawler', 'state': 'New York'},\n",
387 |     " {'member_name': 'Zach Nunn', 'state': 'Iowa'},\n",
388 |     " {'member_name': 'Monica De La Cruz', 'state': 'Texas'},\n",
389 |     " {'member_name': 'Erin Houchin', 'state': 'Indiana'},\n",
390 |     " {'member_name': 'Andy Ogles', 'state': 'Tennessee'},\n",
391 |     " {'member_name': 'Maxine Waters', 'state': 'California'},\n",
392 |     " {'member_name': 'Nydia M. Velázquez', 'state': 'New York'},\n",
393 |     " {'member_name': 'Brad Sherman', 'state': 'California'},\n",
394 |     " {'member_name': 'Gregory W. Meeks', 'state': 'New York'},\n",
395 |     " {'member_name': 'David Scott', 'state': 'Georgia'},\n",
396 |     " {'member_name': 'Stephen F. Lynch', 'state': 'Massachusetts'},\n",
397 |     " {'member_name': 'Al Green', 'state': 'Texas'},\n",
398 |     " {'member_name': 'Emanuel Cleaver, II', 'state': 'Missouri'},\n",
399 |     " {'member_name': 'Jim A. Himes', 'state': 'Connecticut'},\n",
400 |     " {'member_name': 'Bill Foster', 'state': 'Illinois'},\n",
401 |     " {'member_name': 'Joyce Beatty', 'state': 'Ohio'},\n",
402 |     " {'member_name': 'Juan Vargas', 'state': 'California'},\n",
403 |     " {'member_name': 'Josh Gottheimer', 'state': 'New Jersey'},\n",
404 |     " {'member_name': 'Vicente Gonzalez', 'state': 'Texas'},\n",
405 |     " {'member_name': 'Sean Casten', 'state': 'Illinois'},\n",
406 |     " {'member_name': 'Ayanna Pressley', 'state': 'Massachusetts'},\n",
407 |     " {'member_name': 'Steven Horsford', 'state': 'Nevada'},\n",
408 |     " {'member_name': 'Rashida Tlaib', 'state': 'Michigan'},\n",
409 |     " {'member_name': 'Ritchie Torres', 'state': 'New York'},\n",
410 |     " {'member_name': 'Sylvia Garcia', 'state': 'Texas'},\n",
411 |     " {'member_name': 'Nikema Williams', 'state': 'Georgia'},\n",
412 |     " {'member_name': 'Wiley Nickel', 'state': 'North Carolina'},\n",
413 |     " {'member_name': 'Brittany Pettersen', 'state': 'Colorado'}]"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": null,
419 |    "id": "1012e9eb",
420 |    "metadata": {},
421 |    "outputs": [],
422 |    "source": [
423 |     "from collections import Counter\n",
424 |     "\n",
425 |     "def find_most_potentially_corrupt_states(data):\n",
426 |     "    state_counts = Counter(item['state'] for item in data)\n",
427 |     "    most_common_state, count = state_counts.most_common(1)[0]\n",
428 |     "    print(\"Most common state:\", most_common_state)\n",
429 |     "    print(\"Congress ppl with potential to become corrupted\", count)"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": null,
435 |    "id": "b330b017",
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "find_most_potentially_corrupt_states(data)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "id": "bb4a5a82",
446 |    "metadata": {},
447 |    "outputs": [],
448 |    "source": []
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "id": "f176fd12",
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": []
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "id": "0044ef38",
462 |    "metadata": {},
463 |    "outputs": [],
464 |    "source": []
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "id": "70630e76",
470 |    "metadata": {},
471 |    "outputs": [],
472 |    "source": []
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": null,
477 |    "id": "610a2e58",
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": []
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "id": "f5d22b35",
486 |    "metadata": {},
487 |    "outputs": [],
488 |    "source": []
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "id": "0b7e1102",
494 |    "metadata": {},
495 |    "outputs": [],
496 |    "source": []
497 |   },
498 |   {
499 |    "cell_type": "code",
500 |    "execution_count": null,
501 |    "id": "54b7a081",
502 |    "metadata": {},
503 |    "outputs": [],
504 |    "source": []
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": null,
509 |    "id": "f5c97513",
510 |    "metadata": {},
511 |    "outputs": [],
512 |    "source": [
513 |     "\n",
514 |     "\n"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": null,
520 |    "id": "5df0772d",
521 |    "metadata": {},
522 |    "outputs": [],
523 |    "source": []
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "id": "1111a2a5",
529 |    "metadata": {},
530 |    "outputs": [],
531 |    "source": []
532 |   }
533 |  ],
534 |  "metadata": {
535 |   "kernelspec": {
536 |    "display_name": "askmoney",
537 |    "language": "python",
538 |    "name": "askmoney"
539 |   },
540 |   "language_info": {
541 |    "codemirror_mode": {
542 |     "name": "ipython",
543 |     "version": 3
544 |    },
545 |    "file_extension": ".py",
546 |    "mimetype": "text/x-python",
547 |    "name": "python",
548 |    "nbconvert_exporter": "python",
549 |    "pygments_lexer": "ipython3",
550 |    "version": "3.9.12"
551 |   }
552 |  },
553 |  "nbformat": 4,
554 |  "nbformat_minor": 5
555 | }
556 | 


--------------------------------------------------------------------------------