├── .gitignore
├── .idea
│   ├── codeStyles
│   │   ├── codeStyleConfig.xml
│   │   └── Project.xml
│   ├── vcs.xml
│   ├── .gitignore
│   ├── misc.xml
│   ├── modules.xml
│   ├── csv-editor.xml
│   └── git_toolbox_prj.xml
├── README.md
├── yellow_page_parser.iml
├── pyproject.toml
├── src
│   ├── config_dict.py
│   ├── ywlp
│   │   └── __init__.py
│   └── navigator
│       └── __init__.py
└── states
/.gitignore:
--------------------------------------------------------------------------------
1 | /runtime/
2 | /.envrc
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Yellow page parser
2 |
3 | A simple Yellow Pages parser using `playwright` + `lxml` and my personal helper package `navigator`.
4 | 
5 | It parses only the first five result pages of restaurants and stores the results in a CSV file.
6 |
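7 | ## Usage
8 | 
9 | A minimal sketch of a local run (assuming the project is installed, e.g. `pip install -e .`, that Playwright's Firefox build is available via `playwright install firefox`, and that the modules under `src/` are importable):
10 | 
11 | ```python
12 | import asyncio
13 | 
14 | from ywlp import finding_restaurants_in_city
15 | 
16 | # Fetches the first few result pages for New York restaurants (cached under runtime/cache/)
17 | # and writes output.csv with the columns name, rating and count.
18 | asyncio.run(finding_restaurants_in_city())
19 | ```
20 | 
21 | The same flow is exposed as the `ywlp_restaurants` console script declared in `pyproject.toml`.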
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "ywlp"
3 | version = "1.0.0"
4 | description = "Parses Yellow Pages listings"
5 | requires-python = ">=3.11"
6 | dependencies = [
7 | "click>=8.1",
8 | "termcolor",
9 | "structlog",
10 | "rich",
11 | "click-repl",
12 | "playwright",
13 | "platformdirs",
14 | "lxml"
15 |
16 | ]
17 |
18 | [project.scripts]
19 | ywlp_restaurants = "ywlp:find_restaurants_in_city"
20 |
21 |
22 | [build-system]
23 | requires = ["flit_core<4"]
24 | build-backend = "flit_core.buildapi"
25 |
--------------------------------------------------------------------------------
/src/config_dict.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | 
4 | class ConfigDict:
5 |     """Shared configuration: credential/session paths, cache locations and the base URL."""
6 | 
7 |     def __init__(self):
8 |         self.__creds_file = Path(".auths/creds.json")
9 |         self.__session_file = Path(".auths/app.json")
10 |         self.__runtime_dir = Path('runtime')
11 |         self.__cache_dir = Path('runtime/cache')
12 |         self.__base_url = "https://www.yellowpages.com/"
13 |         # ensure the cache directory exists
14 |         self.__cache_dir.mkdir(parents=True, exist_ok=True)
15 | 
16 |     @property
17 |     def creds_file(self) -> Path:
18 |         return self.__creds_file
19 | 
20 |     @property
21 |     def session_file(self) -> Path:
22 |         return self.__session_file
23 | 
24 |     @property
25 |     def runtime_dir(self) -> Path:
26 |         return self.__runtime_dir
27 | 
28 |     @property
29 |     def cache_dir(self) -> Path:
30 |         return self.__cache_dir
31 | 
32 |     @property
33 |     def base_url(self) -> str:
34 |         return self.__base_url
35 | 
--------------------------------------------------------------------------------
/src/ywlp/__init__.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import csv
3 | import json
4 | from asyncio import sleep
5 | 
6 | import click
7 | 
8 | from playwright.async_api import async_playwright
9 | 
10 | import navigator
11 | 
12 | from config_dict import ConfigDict
13 | 
14 | CATEGORIES = [
15 |     'restaurants',
16 | ]
17 | 
18 | 
19 | def load_state_links():
20 |     """Read the `states` file into a dict of state code -> listing URL."""
21 |     state_dict = {}
22 |     with open('states', 'r', encoding='utf-8') as fp:
23 |         for line in fp:
24 |             k, v = line.split(' ')
25 |             state_dict[k] = v.strip()
26 |     return state_dict
27 | 
28 | 
29 | async def finding_restaurants_in_city():
30 |     # state_dict = load_state_links()
31 |     cfg = ConfigDict()
32 |     url = 'https://www.yellowpages.com/new-york-ny/restaurants?s=average_rating'
33 |     parsed_pages = 0
34 |     data = []
35 |     async with async_playwright() as p:
36 |         while url:
37 |             doc = await navigator.get_doc(cfg, p, url, 'restaurants')
38 |             headers = doc.xpath('//div[@class="search-results organic"]//div[@class="result"]//h2/a')
39 |             for header in headers:
40 |                 # search within this result card, not the whole document
41 |                 rr = header.getparent().getparent().xpath('.//div[@class="ratings"]')
42 |                 if not rr or 'data-tripadvisor' not in rr[0].attrib:
43 |                     continue
44 |                 d = json.loads(rr[0].attrib['data-tripadvisor'])
45 |                 data.append({'name': header.text_content(), 'rating': d['rating'], 'count': d['count']})
46 |             pages = doc.xpath('//a[@class="next ajax-page"]')
47 |             print(f"found {len(pages)} next-page link(s)")
48 |             try:
49 |                 url = pages[0].attrib['href']
50 |                 print(f"going to next url: {url}")
51 |                 await sleep(5)
52 |                 parsed_pages += 1
53 |                 if parsed_pages >= 5:
54 |                     break
55 |             except IndexError:
56 |                 url = None
57 |     with open('output.csv', encoding='utf-8', mode='w', newline='') as f:
58 |         writer = csv.DictWriter(f, ['name', 'rating', 'count'])
59 |         writer.writeheader()
60 |         for d in data:
61 |             writer.writerow(d)
62 | 
63 | 
64 | @click.command()
65 | def find_restaurants_in_city():
66 |     asyncio.run(finding_restaurants_in_city())
67 | 
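68 | 
69 | if __name__ == '__main__':
70 |     # Optional direct invocation for quick local runs; the supported entry point
71 |     # is the ywlp_restaurants console script declared in pyproject.toml.
72 |     find_restaurants_in_city()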
--------------------------------------------------------------------------------
/states:
--------------------------------------------------------------------------------
1 | ak https://www.yellowpages.com/state-ak
2 | al https://www.yellowpages.com/state-al
3 | ar https://www.yellowpages.com/state-ar
4 | az https://www.yellowpages.com/state-az
5 | ca https://www.yellowpages.com/state-ca
6 | co https://www.yellowpages.com/state-co
7 | ct https://www.yellowpages.com/state-ct
8 | dc https://www.yellowpages.com/state-dc
9 | de https://www.yellowpages.com/state-de
10 | fl https://www.yellowpages.com/state-fl
11 | ga https://www.yellowpages.com/state-ga
12 | hi https://www.yellowpages.com/state-hi
13 | ia https://www.yellowpages.com/state-ia
14 | id https://www.yellowpages.com/state-id
15 | il https://www.yellowpages.com/state-il
16 | in https://www.yellowpages.com/state-in
17 | ks https://www.yellowpages.com/state-ks
18 | ky https://www.yellowpages.com/state-ky
19 | la https://www.yellowpages.com/state-la
20 | ma https://www.yellowpages.com/state-ma
21 | md https://www.yellowpages.com/state-md
22 | me https://www.yellowpages.com/state-me
23 | mi https://www.yellowpages.com/state-mi
24 | mn https://www.yellowpages.com/state-mn
25 | mo https://www.yellowpages.com/state-mo
26 | ms https://www.yellowpages.com/state-ms
27 | mt https://www.yellowpages.com/state-mt
28 | nc https://www.yellowpages.com/state-nc
29 | nd https://www.yellowpages.com/state-nd
30 | ne https://www.yellowpages.com/state-ne
31 | nh https://www.yellowpages.com/state-nh
32 | nj https://www.yellowpages.com/state-nj
33 | nm https://www.yellowpages.com/state-nm
34 | nv https://www.yellowpages.com/state-nv
35 | ny https://www.yellowpages.com/state-ny
36 | oh https://www.yellowpages.com/state-oh
37 | ok https://www.yellowpages.com/state-ok
38 | or https://www.yellowpages.com/state-or
39 | pa https://www.yellowpages.com/state-pa
40 | ri https://www.yellowpages.com/state-ri
41 | sc https://www.yellowpages.com/state-sc
42 | sd https://www.yellowpages.com/state-sd
43 | tn https://www.yellowpages.com/state-tn
44 | tx https://www.yellowpages.com/state-tx
45 | ut https://www.yellowpages.com/state-ut
46 | va https://www.yellowpages.com/state-va
47 | vt https://www.yellowpages.com/state-vt
48 | wa https://www.yellowpages.com/state-wa
49 | wi https://www.yellowpages.com/state-wi
50 | wv https://www.yellowpages.com/state-wv
51 | wy https://www.yellowpages.com/state-wy
--------------------------------------------------------------------------------
/src/navigator/__init__.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import textwrap
3 |
4 | import playwright.async_api
5 | from playwright.async_api import Playwright, Browser, Page, BrowserContext, ElementHandle
6 |
7 | import logging
8 | import os
9 | import sys
10 | from pathlib import Path
11 | from typing import Literal
12 | from lxml.html import fromstring
13 |
14 | from config_dict import ConfigDict
15 |
16 | FORMATTER = logging.Formatter("%(asctime)s — %(name)s — %(levelname)s %(lineno)d — %(message)s")
17 |
18 |
19 | async def get_doc(cfg: ConfigDict, p: Playwright, url: str, page: str):
20 | path = await download_url(cfg, p, url, page)
21 | with open(path) as f:
22 | content = f.read()
23 | doc = fromstring(content)
24 | doc.make_links_absolute(cfg.base_url)
25 | return doc
26 |
27 |
28 | def get_path(cfg: ConfigDict, url: str, txt: str) -> Path:
29 | txt = txt.replace(" ", '_').replace("/", '--')
30 | upath = hash_url_and_split(url)
31 | path = prefix_data_cached(cfg, upath, txt)
32 | return path
33 |
34 |
35 | def get_console_handler():
36 | console_handler = logging.StreamHandler(sys.stdout)
37 | console_handler.setFormatter(FORMATTER)
38 | return console_handler
39 |
40 |
41 | def get_logger(logger_name, default_level="DEBUG"):
42 | _logger = logging.getLogger(logger_name)
43 | if 'LOG_LEVEL' in os.environ:
44 | _logger.setLevel(os.environ['LOG_LEVEL']) # better to have too much log than not enough
45 | else:
46 | _logger.setLevel(default_level) # better to have too much log than not enough
47 |
48 | if not _logger.hasHandlers():
49 | _logger.addHandler(get_console_handler())
50 | # with this pattern, it's rarely necessary to propagate the error up to parent
51 | _logger.propagate = False
52 | return _logger
53 |
54 |
55 | class Navigator:
56 |
57 | def __init__(self, p: Playwright, headless=False):
58 | """ Must start inside playwright async context"""
59 | self.__lg = get_logger('navigator')
60 | self.__p = p
61 | self.__headless = headless
62 | self.__browser: Browser | None = None
63 | self.__current_page: Page | None = None
64 | self.__current_context: BrowserContext | None = None
65 |
66 | async def start(self, saved_session: Path | None = None, reset=False):
67 | """ Starts a browser"""
68 | if reset and self.__browser:
69 | self.__lg.debug("resetting browser")
70 | await self.__browser.close()
71 | self.__browser = None
72 | if not self.__browser:
73 | self.__lg.debug("launching browser")
74 | self.__browser = await self.__p.firefox.launch(headless=self.__headless)
75 | else:
76 | self.__lg.warning("A browser is already running, reusing, will start new context")
77 | if saved_session:
78 | self.__lg.debug(f"trying to load session file: {saved_session}")
79 | self.__current_context = await self.__browser.new_context(storage_state=saved_session)
80 | else:
81 | self.__lg.debug("using empty context")
82 | self.__current_context = await self.__browser.new_context()
83 | self.__current_page = await self.__current_context.new_page()
84 |
85 | def page(self):
86 | return self.__current_page
87 |
88 |     async def wait_for(self, selector: str, state: Literal["attached", "detached", "hidden", "visible"] | None = None,
89 |                        timeout=None) -> ElementHandle | None:
90 | """ Wait for an element to be available, timeout is ms"""
91 | try:
92 | return await self.__current_page.wait_for_selector(
93 | selector=selector,
94 | state=state,
95 | timeout=timeout
96 | )
97 | except playwright.async_api.Error:
98 | return None
99 |
100 | async def click(self, selector: str, timeout=None) -> bool:
101 | try:
102 | elm = await self.wait_for(selector, timeout=timeout)
103 | await elm.click()
104 | self.__lg.debug(f"Clicked: {selector}")
105 | return True
106 | except (playwright.async_api.Error, AttributeError):
107 | self.__lg.warning(f"Was not able to click: {selector}")
108 | return False
109 |
110 | async def goto(self, url: str):
111 | """ Load a webpage, If no page, start a new context and load page"""
112 | if not self.__current_page:
113 | self.__lg.debug("no context loaded, starting new")
114 | await self.start()
115 | await self.__current_page.goto(url)
116 |
117 | async def wait_for_state(self, timeout: float | None = None, state="load"):
118 | self.__lg.debug(f"Waiting for {state} for {timeout}")
119 | await self.__current_page.wait_for_load_state(state, timeout=timeout)
120 | self.__lg.debug(f"state reached")
121 |
122 | async def fill_input(self, selector: str, val: str) -> bool | None:
123 | try:
124 | elm: ElementHandle = await self.__current_page.wait_for_selector(selector)
125 | await elm.fill(val)
126 | except playwright.async_api.Error:
127 | return False
128 |
129 | async def store_session(self, path):
130 | await self.__current_context.storage_state(path=path)
131 |
132 | async def exit(self):
133 | if self.__browser:
134 | await self.__browser.close()
135 |
136 |
137 | def hash_url_and_split(url, how_many=2) -> str:
138 | encoded = int(hashlib.sha256(url.encode('utf-8')).hexdigest(), 16) % 10 ** 8
139 | broken = textwrap.wrap(str(encoded), how_many)
140 | return "/".join(broken)
141 |
142 |
143 | def prefix_data_cached(cfg: ConfigDict, path: str, prefix: str) -> Path:
144 | p = cfg.cache_dir / path
145 | p.mkdir(exist_ok=True, parents=True)
146 | file = p / f"{prefix}.html"
147 | return file
148 |
149 |
150 | async def download_url(cfg: ConfigDict, p: Playwright, url: str, page: str) -> Path:
151 |     path = get_path(cfg, url, page)
152 |     if path.exists():
153 |         return path
154 |     nav = Navigator(p, headless=True)
155 |     await nav.start()
156 |     await nav.goto(url)
157 |     browser_page = nav.page()
158 |     content = await browser_page.inner_html('//html')
159 |     with open(path, 'w', encoding="utf-8") as f:
160 |         f.write(content)
161 |     await nav.exit()
162 |     return path
163 |
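164 | # Cache layout: download_url() hashes each URL to an (up to) 8-digit key, splits it into
165 | # two-digit segments and saves the page under cache_dir/<segments>/<page>.html, so a URL
166 | # that was already fetched is read back from disk instead of being downloaded again.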
--------------------------------------------------------------------------------