├── .gitignore ├── .idea ├── codeStyles │ ├── codeStyleConfig.xml │ └── Project.xml ├── vcs.xml ├── .gitignore ├── misc.xml ├── modules.xml ├── csv-editor.xml └── git_toolbox_prj.xml ├── README.md ├── yellow_page_parser.iml ├── pyproject.toml ├── src ├── config_dict.py ├── ywlp │ └── __init__.py └── navigator │ └── __init__.py └── states /.gitignore: -------------------------------------------------------------------------------- 1 | /runtime/ 2 | /.envrc 3 | -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Yellow page parser 2 | 3 | A simple yellow page parser using `playwright` + `lxml` and my personal helper package `navigator`. 4 | 5 | This only parses 5 pages of restaurants, and stores it in CSV file 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/csv-editor.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 | -------------------------------------------------------------------------------- /yellow_page_parser.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ywlp" 3 | version = "1.0.0" 4 | description = "Parses Yellow pages" 5 | requires-python = ">=3.11" 6 | dependencies = [ 7 | "click>=8.1", 8 | "termcolor", 9 | "structlog", 10 | "rich", 11 | "click-repl", 12 | "playwright", 13 | "platformdirs", 14 | "lxml" 15 | 16 | ] 17 | 18 | [project.scripts] 19 | ywlp_restaurants = "ywlp:find_restaurants_in_city" 20 | 21 | 22 | [build-system] 23 | requires = ["flit_core<4"] 24 | build-backend = "flit_core.buildapi" 25 | -------------------------------------------------------------------------------- /.idea/git_toolbox_prj.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 14 | 15 | -------------------------------------------------------------------------------- /src/config_dict.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from platformdirs import user_cache_path 4 | 5 | 6 | class ConfigDict: 7 | creds_file: Path 8 | session_file: Path 9 | cache_dir: Path 10 | base_url: str 11 | creds_file: Path 12 | 13 | def __init__(self): 14 | self.__creds_file = Path(".auths/creds.json") 15 | self.__session_file = Path(".auths/app.json") 16 | self.__runtime_dir = Path('runtime') 17 | self.__cache_dir = Path('runtime/cache') 18 | self.__base_url = "https://www.yellowpages.com/" 19 | if not self.__cache_dir.exists(): 20 | self.__cache_dir.mkdir(parents=True, exist_ok=True) 21 | 22 | ConfigDict.cache_dir = property(lambda self: self.__cache_dir) 23 | ConfigDict.runtime_dir = property(lambda self: self.__runtime_dir) 24 | ConfigDict.session_file = property(lambda self: self.__session_file) 25 | ConfigDict.base_url = property(lambda self: self.__base_url) 26 | ConfigDict.creds_file = property(lambda self: self.__creds_file) 27 | -------------------------------------------------------------------------------- /src/ywlp/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import csv 3 | import json 4 | from asyncio import sleep 5 | 6 | import click 7 | import sqlite3 8 | 9 | from playwright.async_api import async_playwright 10 | 11 | import navigator 12 | import playwright 13 | import platformdirs 14 | 15 | from config_dict import ConfigDict 16 | 17 | CATEGORIES = [ 18 | 'restaurants', 19 | ] 20 | 21 | 22 | def load_state_links(): 23 | state_dict = {} 24 | with open('states', 'r', encoding='utf-8') as fp: 25 | for line in fp: 26 | k, v = line.split(' ') 27 | state_dict[k] = v.strip() 28 | return state_dict 29 | 30 | 31 | async def finding_restaurants_in_city(): 32 | # state_dict = load_state_links() 33 | cfg = ConfigDict() 34 | url = 'https://www.yellowpages.com/new-york-ny/restaurants?s=average_rating' 35 | parsed_pages = 0 36 | data = [] 37 | async with async_playwright() as p: 38 | while url: 39 | doc = await navigator.get_doc(cfg, p, url, 'restaurants') 40 | headers = doc.xpath('//div[@class="search-results organic"]//div[@class="result"]//h2/a') 41 | for header in headers: 42 | rr = header.getparent().getparent().xpath('//div[@class="ratings"]') 43 | d = json.loads(rr[0].attrib['data-tripadvisor']) 44 | data.append({'name': header.text_content(), 'rating': d['rating'], 'count': d['count']}) 45 | pages = doc.xpath('//a[@class="next ajax-page"]') 46 | print([k for k in pages]) 47 | try: 48 | url = pages[0].attrib['href'] 49 | print(f"going to next url: {url}") 50 | await sleep(5) 51 | parsed_pages += 1 52 | if parsed_pages > 5: 53 | break 54 | except IndexError: 55 | url = None 56 | with open('output.csv', encoding='utf-8', mode='w') as f: 57 | writer = csv.DictWriter(f, ['name', 'rating', 'count']) 58 | for d in data: 59 | writer.writerow(d) 60 | 61 | 62 | @click.command() 63 | def find_restaurants_in_city(): 64 | asyncio.run(finding_restaurants_in_city()) 65 | -------------------------------------------------------------------------------- /states: -------------------------------------------------------------------------------- 1 | ak https://www.yellowpages.com/state-ak 2 | al https://www.yellowpages.com/state-al 3 | ar https://www.yellowpages.com/state-ar 4 | az https://www.yellowpages.com/state-az 5 | ca https://www.yellowpages.com/state-ca 6 | co https://www.yellowpages.com/state-co 7 | ct https://www.yellowpages.com/state-ct 8 | dc https://www.yellowpages.com/state-dc 9 | de https://www.yellowpages.com/state-de 10 | fl https://www.yellowpages.com/state-fl 11 | ga https://www.yellowpages.com/state-ga 12 | hi https://www.yellowpages.com/state-hi 13 | ia https://www.yellowpages.com/state-ia 14 | id https://www.yellowpages.com/state-id 15 | il https://www.yellowpages.com/state-il 16 | in https://www.yellowpages.com/state-in 17 | ks https://www.yellowpages.com/state-ks 18 | ky https://www.yellowpages.com/state-ky 19 | la https://www.yellowpages.com/state-la 20 | ma https://www.yellowpages.com/state-ma 21 | md https://www.yellowpages.com/state-md 22 | me https://www.yellowpages.com/state-me 23 | mi https://www.yellowpages.com/state-mi 24 | mn https://www.yellowpages.com/state-mn 25 | mo https://www.yellowpages.com/state-mo 26 | ms https://www.yellowpages.com/state-ms 27 | mt https://www.yellowpages.com/state-mt 28 | nc https://www.yellowpages.com/state-nc 29 | nd https://www.yellowpages.com/state-nd 30 | ne https://www.yellowpages.com/state-ne 31 | nh https://www.yellowpages.com/state-nh 32 | nj https://www.yellowpages.com/state-nj 33 | nm https://www.yellowpages.com/state-nm 34 | nv https://www.yellowpages.com/state-nv 35 | ny https://www.yellowpages.com/state-ny 36 | oh https://www.yellowpages.com/state-oh 37 | ok https://www.yellowpages.com/state-ok 38 | or https://www.yellowpages.com/state-or 39 | pa https://www.yellowpages.com/state-pa 40 | ri https://www.yellowpages.com/state-ri 41 | sc https://www.yellowpages.com/state-sc 42 | sd https://www.yellowpages.com/state-sd 43 | tn https://www.yellowpages.com/state-tn 44 | tx https://www.yellowpages.com/state-tx 45 | ut https://www.yellowpages.com/state-ut 46 | va https://www.yellowpages.com/state-va 47 | vt https://www.yellowpages.com/state-vt 48 | wa https://www.yellowpages.com/state-wa 49 | wi https://www.yellowpages.com/state-wi 50 | wv https://www.yellowpages.com/state-wv 51 | wy https://www.yellowpages.com/state-wy -------------------------------------------------------------------------------- /src/navigator/__init__.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import textwrap 3 | 4 | import playwright.async_api 5 | from playwright.async_api import Playwright, Browser, Page, BrowserContext, ElementHandle 6 | 7 | import logging 8 | import os 9 | import sys 10 | from pathlib import Path 11 | from typing import Literal 12 | from lxml.html import fromstring 13 | 14 | from config_dict import ConfigDict 15 | 16 | FORMATTER = logging.Formatter("%(asctime)s — %(name)s — %(levelname)s %(lineno)d — %(message)s") 17 | 18 | 19 | async def get_doc(cfg: ConfigDict, p: Playwright, url: str, page: str): 20 | path = await download_url(cfg, p, url, page) 21 | with open(path) as f: 22 | content = f.read() 23 | doc = fromstring(content) 24 | doc.make_links_absolute(cfg.base_url) 25 | return doc 26 | 27 | 28 | def get_path(cfg: ConfigDict, url: str, txt: str) -> Path: 29 | txt = txt.replace(" ", '_').replace("/", '--') 30 | upath = hash_url_and_split(url) 31 | path = prefix_data_cached(cfg, upath, txt) 32 | return path 33 | 34 | 35 | def get_console_handler(): 36 | console_handler = logging.StreamHandler(sys.stdout) 37 | console_handler.setFormatter(FORMATTER) 38 | return console_handler 39 | 40 | 41 | def get_logger(logger_name, default_level="DEBUG"): 42 | _logger = logging.getLogger(logger_name) 43 | if 'LOG_LEVEL' in os.environ: 44 | _logger.setLevel(os.environ['LOG_LEVEL']) # better to have too much log than not enough 45 | else: 46 | _logger.setLevel(default_level) # better to have too much log than not enough 47 | 48 | if not _logger.hasHandlers(): 49 | _logger.addHandler(get_console_handler()) 50 | # with this pattern, it's rarely necessary to propagate the error up to parent 51 | _logger.propagate = False 52 | return _logger 53 | 54 | 55 | class Navigator: 56 | 57 | def __init__(self, p: Playwright, headless=False): 58 | """ Must start inside playwright async context""" 59 | self.__lg = get_logger('navigator') 60 | self.__p = p 61 | self.__headless = headless 62 | self.__browser: Browser | None = None 63 | self.__current_page: Page | None = None 64 | self.__current_context: BrowserContext | None = None 65 | 66 | async def start(self, saved_session: Path | None = None, reset=False): 67 | """ Starts a browser""" 68 | if reset and self.__browser: 69 | self.__lg.debug("resetting browser") 70 | await self.__browser.close() 71 | self.__browser = None 72 | if not self.__browser: 73 | self.__lg.debug("launching browser") 74 | self.__browser = await self.__p.firefox.launch(headless=self.__headless) 75 | else: 76 | self.__lg.warning("A browser is already running, reusing, will start new context") 77 | if saved_session: 78 | self.__lg.debug(f"trying to load session file: {saved_session}") 79 | self.__current_context = await self.__browser.new_context(storage_state=saved_session) 80 | else: 81 | self.__lg.debug("using empty context") 82 | self.__current_context = await self.__browser.new_context() 83 | self.__current_page = await self.__current_context.new_page() 84 | 85 | def page(self): 86 | return self.__current_page 87 | 88 | async def wait_for(self, selector: str, state=Literal["attached", "detached", "hidden", "visible"] | None, 89 | timeout=None) -> ElementHandle | None: 90 | """ Wait for an element to be available, timeout is ms""" 91 | try: 92 | return await self.__current_page.wait_for_selector( 93 | selector=selector, 94 | state=state, 95 | timeout=timeout 96 | ) 97 | except playwright.async_api.Error: 98 | return None 99 | 100 | async def click(self, selector: str, timeout=None) -> bool: 101 | try: 102 | elm = await self.wait_for(selector, timeout=timeout) 103 | await elm.click() 104 | self.__lg.debug(f"Clicked: {selector}") 105 | return True 106 | except (playwright.async_api.Error, AttributeError): 107 | self.__lg.warning(f"Was not able to click: {selector}") 108 | return False 109 | 110 | async def goto(self, url: str): 111 | """ Load a webpage, If no page, start a new context and load page""" 112 | if not self.__current_page: 113 | self.__lg.debug("no context loaded, starting new") 114 | await self.start() 115 | await self.__current_page.goto(url) 116 | 117 | async def wait_for_state(self, timeout: float | None = None, state="load"): 118 | self.__lg.debug(f"Waiting for {state} for {timeout}") 119 | await self.__current_page.wait_for_load_state(state, timeout=timeout) 120 | self.__lg.debug(f"state reached") 121 | 122 | async def fill_input(self, selector: str, val: str) -> bool | None: 123 | try: 124 | elm: ElementHandle = await self.__current_page.wait_for_selector(selector) 125 | await elm.fill(val) 126 | except playwright.async_api.Error: 127 | return False 128 | 129 | async def store_session(self, path): 130 | await self.__current_context.storage_state(path=path) 131 | 132 | async def exit(self): 133 | if self.__browser: 134 | await self.__browser.close() 135 | 136 | 137 | def hash_url_and_split(url, how_many=2) -> str: 138 | encoded = int(hashlib.sha256(url.encode('utf-8')).hexdigest(), 16) % 10 ** 8 139 | broken = textwrap.wrap(str(encoded), how_many) 140 | return "/".join(broken) 141 | 142 | 143 | def prefix_data_cached(cfg: ConfigDict, path: str, prefix: str) -> Path: 144 | p = cfg.cache_dir / path 145 | p.mkdir(exist_ok=True, parents=True) 146 | file = p / f"{prefix}.html" 147 | return file 148 | 149 | 150 | async def download_url(cfg, p, url, page) -> Path: 151 | path = get_path(cfg, url, page) 152 | if path.exists(): 153 | return path 154 | nav = Navigator(p, headless=True) 155 | await nav.start() 156 | await nav.goto(url) 157 | page = nav.page() 158 | content = await page.inner_html('//html') 159 | with open(path, 'w', encoding="utf-8") as f: 160 | f.write(content) 161 | await nav.exit() 162 | return path 163 | --------------------------------------------------------------------------------