├── .gitignore
├── LICENSE
├── README.md
├── cache
│   └── df_listings.csv
├── collect_list.py
├── fetch_info.py
├── images
│   └── example-1.png
├── requirements.txt
└── utils
    ├── __init__.py
    └── post_processing.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 CeShine Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Automated Scraping Scripts for the 591 Rental Website

Open source under the MIT license; I hope it helps those who need it.

## Changelog

### 2021-12-17

- Updated the scripts for the redesigned 591 website. Thanks to [haruzheng's report](https://github.com/ceshine/591scraper/issues/1).
- Adapted `collect_list.py` to the new interface.
- Because the new interface relies on JS rendering, listing detail pages can no longer be fetched directly with requests; `fetch_info.py` now uses Selenium instead, which makes fetching somewhat slower.

## Usage

### Prerequisites

Install the required packages (Python 3.7+):

```bash
pip install -r requirements.txt
```

These scripts use [Selenium + Chrome](https://chromedriver.chromium.org/getting-started) to fetch pages; please follow the linked instructions to install the WebDriver for Chrome.

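To confirm that Selenium can actually drive your Chrome installation before running the scrapers, a minimal smoke test such as the one below should open a Chrome window, print the page title, and exit. This is only a sanity check, not part of the scripts:

```python
from selenium import webdriver

browser = webdriver.Chrome()  # fails here if the WebDriver is missing or mismatched
browser.get("https://rent.591.com.tw")
print(browser.title)
browser.quit()
```
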
### Step 1: Collect the list of matching listings

First, store the URL of a 591 search page in the `X591URL` environment variable, for example (Bash):

```bash
export X591URL="https://rent.591.com.tw/?kind=1&order=money&orderType=asc&region=17&rentprice=10000,18000&other=lift"
```

The following example collects at most 12 pages of search results:

```bash
python collect_list.py --max-pages 12
```

By default the results are stored in `cache/listings.jbl`.

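`listings.jbl` is simply a joblib dump of the collected listing IDs (a list of strings), so you can peek at it if you want to check what was collected:

```python
import joblib

listing_ids = joblib.load("cache/listings.jbl")
print(len(listing_ids), listing_ids[:5])
```
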
### Step 2: Fetch the details of each listing

Run `fetch_info.py` to fetch the details of the listings collected in the previous step. By default the results are saved to `cache/df_listings.csv`:

```bash
python fetch_info.py
```

If you have recently scraped the same search criteria, you can supply the previous results; the script will skip the listings that were already fetched and place the new listings before the old ones in the output CSV:

```bash
python fetch_info.py --data-path cache/df_listings.csv
```

You can track several sets of search criteria at the same time; simply replace the default `df_listings.csv` file name with a custom name for each set of criteria (see the example below).

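For example, a possible workflow for one such search condition (the file name is made up; point `X591URL` at the matching search page before each `collect_list.py` run):

```bash
# First run for this condition: write the details to its own CSV.
python collect_list.py
python fetch_info.py --output-path cache/east_district.csv

# Later updates for the same condition: pass the previous CSV so
# already-fetched listings are skipped.
python collect_list.py
python fetch_info.py --data-path cache/east_district.csv
```
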
### Tips

I personally recommend opening the output CSV with LibreOffice Calc, hiding the `desc` column, and using the `mark` column to flag the listings you are interested in. **Remember to save your changes back to the CSV file, so that they are carried over into the next update.**

![Example screenshot](images/example-1.png)

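If you would rather slice the results in Python than in a spreadsheet, a small sketch like the following (assuming the default output path) filters out the rows auto-marked with `x` and lists the cheapest listings by adjusted price:

```python
import pandas as pd

df = pd.read_csv("cache/df_listings.csv")
df = df[df["mark"] != "x"]  # drop the rows auto-marked with "x"
print(df.sort_values("price_adjusted")[["title", "price_adjusted", "link"]].head(10))
```
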
## Acknowledgements

These scripts drew on the following open-source projects, with thanks:

1. [開放台灣民間租屋資料 (g0v/tw-rental-house-data)](https://github.com/g0v/tw-rental-house-data)
2. [591 租屋網 - 租屋資訊爬蟲 (AlanSyue/rent591data)](https://github.com/AlanSyue/rent591data)

--------------------------------------------------------------------------------
/cache/df_listings.csv:
--------------------------------------------------------------------------------
mark,title,price,price_adjusted,link,addr,社區,車位費,管理費,poster,養寵物,提供設備,格局,坪數,樓層,型態,id,fetched,desc
,採光一級棒物超所值,4500,4500,https://rent.591.com.tw/rent-detail-11698273.html,北區北安路一段12號,,,無,屋主: 吳太太,No,"冰箱, 洗衣機, 電視, 冷氣, 熱水器, 床, 衣櫃, 第四台, 網路, 桌椅, 1陽台",1房1衛,10坪,4F/5F,公寓,11698273,2021-11-28,"入主單純優質房客
附近有便利商店、傳統市場、公園綠地、醫療機構、學校、夜市。"
,東區富農街獨立套房,5000,5000,https://rent.591.com.tw/rent-detail-11636314.html,東區富農街二段43巷,,,,仲介: 莊先生 (收服務費),No,"洗衣機, 冷氣, 熱水器, 衣櫃, 1陽台",1房1衛,6坪,3F/5F,公寓,11636314,2021-11-28,"台水---台電--獨立.附機車位
附近有便利商店、傳統市場、百貨公司、公園綠地、學校、醫療機構、夜市。"
,延平市場超大雅房~全新床組、冷氣全新~,6000,6000,https://rent.591.com.tw/rent-detail-11764053.html,北區公園路,,,無,仲介: 陳小姐 (收服務費),No,"冰箱, 洗衣機, 冷氣, 熱水器, 床, 衣櫃, 網路, 桌椅, 1陽台",1房1廳1衛,10坪,3F/4F,透天厝,11764053,2021-11-28,1.超大雅房、通風採光良好,出入方便。2.二樓大廚房可使用。3.全新床組、衣櫥、桌椅,網速100m4.地點絕佳、居住安全無慮。5.歡迎合法居住、單純入住。 ~~歡迎小貓兩隻內,需自備洗衣機喔~~
,雙城計畫旁全新裝潢套房,6500,7100,https://rent.591.com.tw/rent-detail-11716266.html,中西區保安路,,,600元/月,代理人: 陳小姐,No,"冷氣, 熱水器, 衣櫃, 電梯",1房1衛,5坪,5F/7F,電梯大樓,11716266,2021-11-28,"暫未添加說明
附近有便利商店、傳統市場、百貨公司、公園綠地、學校、醫療機構、夜市。"
,"台南市安平區近市政府,2房1衛1浴",6500,6500,https://rent.591.com.tw/rent-detail-11690211.html,安平區永華三街82號,,,無,屋主: 柯女士,No,"冰箱, 洗衣機, 冷氣, 熱水器, 床, 第四台, 網路, 1陽台",2房1廳1衛,25坪,4F/6F,透天厝,11690211,2021-11-28,"1.位於安平區近台南市政府2.2房1衛1浴1陽台,有冷氣3.禁祭祀4.無車庫5.限女性
附近有便利商店、公園綠地、學校。"

--------------------------------------------------------------------------------
/collect_list.py:
--------------------------------------------------------------------------------
import os
import time
import random
from typing import List
from urllib.parse import urlparse, parse_qs

import typer
import joblib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

URL = os.environ["X591URL"]


def main(
    output_path: str = "cache/listings.jbl", max_pages: int = 10, quiet: bool = False
):
    try:
        region = parse_qs(urlparse(URL).query)["region"][0]
    except KeyError as e:
        print("The URL must have a 'region' query argument!")
        raise e
    options = webdriver.ChromeOptions()
    if quiet:
        options.add_argument("headless")
    browser = webdriver.Chrome(options=options)
    browser.get(URL)
    # Select the region given in the URL in the site's region picker, if it appears.
    try:
        browser.find_element(By.CSS_SELECTOR, f'dd[data-id="{region}"]').click()
    except NoSuchElementException:
        pass
    time.sleep(2)
    listings: List[str] = []
    for i in range(max_pages):
        print(f"Page {i+1}")
        soup = BeautifulSoup(browser.page_source, "lxml")
        # Each search result links to "rent-detail-<id>.html"; keep only the listing id.
        for item in soup.find_all("section", attrs={"class": "vue-list-rent-item"}):
            link = item.find("a")
            listings.append(link.attrs["href"].split("-")[-1].split(".")[0])
        browser.find_element(By.CLASS_NAME, "pageNext").click()
        time.sleep(random.random() * 5)
        try:
            # Break out early when an `a.last` element shows up in the pager.
            browser.find_element(By.CSS_SELECTOR, "a.last")
            break
        except NoSuchElementException:
            pass
    print(f"Unique entries: {len(set(listings))}")
    joblib.dump(listings, output_path)
    print(f"Done! Collected {len(listings)} entries.")


if __name__ == "__main__":
    typer.run(main)

--------------------------------------------------------------------------------
/fetch_info.py:
--------------------------------------------------------------------------------
import re
import time
import shutil
import random
import logging
from datetime import date
from typing import Optional

import typer
import joblib
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from tenacity import (
    retry,
    stop_after_attempt,
    wait_fixed,
    before_sleep_log,
    RetryError,
    retry_if_exception_type,
)

from utils.post_processing import adjust_price_, auto_marking_, parse_price

LOGGER = logging.getLogger(__name__)


class NotExistException(Exception):
    pass


def get_attributes(soup):
    result = {}
    # Whether pets are allowed.
    try:
        result["養寵物"] = (
            "No" if "不可養寵物" in soup.select_one("div.service-rule").text else "Yes"
        )
    except AttributeError:
        result["養寵物"] = None
    # Fee-related fields listed in the main info block.
    contents = soup.select_one("div.main-info-left div.content").children
    for item in contents:
        try:
            name = item.select_one("div div.name").text
            if name in ("租金含", "車位費", "管理費"):
                result[name] = item.select_one("div div.text").text.strip()
        except AttributeError as e:
            print(e)
            continue
    # Equipment/services provided; items marked "del" are struck through on the page.
    service_list = soup.select_one("div.service-list-box").select(
        "div.service-list-item"
    )
    services = []
    for item in service_list:
        if "del" in item["class"]:
            continue
        services.append(item.text.strip())
    result["提供設備"] = ", ".join(services)
    # Layout, size, floor, and building type.
    attributes = soup.select_one("div.house-pattern").find_all("span")
    for i, key in enumerate(("格局", "坪數", "樓層", "型態")):
        result[key] = attributes[i * 2].text.strip()
    return result


@retry(
    reraise=False,
    retry=retry_if_exception_type(TimeoutException),
    stop=stop_after_attempt(2),
    wait=wait_fixed(1),
    before_sleep=before_sleep_log(LOGGER, logging.INFO),
)
def get_page(browser: webdriver.Chrome, listing_id):
    browser.get(f"https://rent.591.com.tw/home/{listing_id}".strip())
    wait = WebDriverWait(browser, 5)
    try:
        wait.until(
            ec.visibility_of_element_located((By.CSS_SELECTOR, "div.main-info-left"))
        )
    except TimeoutException as e:
        soup = BeautifulSoup(browser.page_source, "lxml")
        tmp = soup.select_one("div.title")
        # The page explicitly says the listing no longer exists.
        if tmp and "不存在" in tmp.text:
            raise NotExistException()
        else:
            raise e
    return True


def get_listing_info(browser: webdriver.Chrome, listing_id):
    try:
        get_page(browser, listing_id)
    except RetryError:
        pass
    soup = BeautifulSoup(browser.page_source, "lxml")
    result = {"id": listing_id}
    result["title"] = soup.select_one(".house-title h1").text
    result["addr"] = soup.select_one("span.load-map").text.strip()
    community = soup.select_one("div.address span").text.strip()
    if community != result["addr"]:
        result["社區"] = community
    result["price"] = parse_price(soup.select_one("span.price").text)
    result["desc"] = soup.select_one("div.article").text.strip()
    result["poster"] = re.sub(r"\s+", " ", soup.select_one("p.name").text.strip())
    result.update(get_attributes(soup))
    return result


def main(
    source_path: str = "cache/listings.jbl",
    data_path: Optional[str] = None,
    output_path: Optional[str] = None,
    limit: int = -1,
    headless: bool = False,
):
    listing_ids = joblib.load(source_path)
    df_original: Optional[pd.DataFrame] = None
    if data_path:
        if data_path.endswith(".pd"):
            df_original = pd.read_pickle(data_path)
        else:
            df_original = pd.read_csv(data_path)
        # Skip the listings that were already fetched in a previous run.
        listing_ids = list(set(listing_ids) - set(df_original.id.values.astype("str")))
        print(len(listing_ids))

    if limit > 0:
        listing_ids = listing_ids[:limit]

    print(f"Collecting {len(listing_ids)} entries...")

    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("headless")
    # Skip loading images to speed up page loads.
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(options=options)

    data = []
    for id_ in tqdm(listing_ids, ncols=100):
        try:
            data.append(get_listing_info(browser, id_))
        except NotExistException:
            LOGGER.warning(f"Does not exist: {id_}")
        time.sleep(random.random() * 5)

    df_new = pd.DataFrame(data)
    optional_fields = ("租金含", "車位費", "管理費")
    for field in optional_fields:
        if field not in df_new:
            df_new[field] = None
    df_new = auto_marking_(df_new)
    df_new = adjust_price_(df_new)
    df_new["fetched"] = date.today().isoformat()
    if df_original is not None:
        # Put the newly fetched listings before the old ones.
        df_new = pd.concat([df_new, df_original], axis=0).reset_index(drop=True)

    if output_path is None and data_path is None:
        # default output path
        output_path = "cache/df_listings.csv"
    elif output_path is None and data_path:
        # Overwrite the input file, but keep a backup copy first.
        output_path = data_path
        shutil.copy(data_path, data_path + ".bak")

    df_new["link"] = (
        "https://rent.591.com.tw/rent-detail-" + df_new["id"].astype("str") + ".html"
    )
    column_ordering = [
        "mark",
        "title",
        "price",
        "price_adjusted",
        "link",
        "addr",
        "社區",
        "車位費",
        "管理費",
        "poster",
        "養寵物",
        "提供設備",
        "格局",
        "坪數",
        "樓層",
        "型態",
        "id",
        "fetched",
        "desc",
    ]
    # Print a small sample (without desc) for a quick sanity check.
    print(df_new.drop("desc", axis=1).sample(min(df_new.shape[0], 10)))
    df_new[column_ordering].to_csv(output_path, index=False)
    print("Finished!")


if __name__ == "__main__":
    typer.run(main)

--------------------------------------------------------------------------------
/images/example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/591scraper/a548b2065deea88a68d4652ef060f7cb03d042b8/images/example-1.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
typer
tqdm
joblib
pandas
requests
tenacity
selenium
beautifulsoup4
lxml

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/591scraper/a548b2065deea88a68d4652ef060f7cb03d042b8/utils/__init__.py
--------------------------------------------------------------------------------
/utils/post_processing.py:
--------------------------------------------------------------------------------
import re

import pandas as pd


def parse_price(price_str: str) -> int:
    """Extract the leading integer from a price string such as "600元/月".

    Empty strings and strings containing "--" or "無" are treated as zero.
    """
    if price_str == "" or "--" in price_str or "無" in price_str:
        return 0
    return int(re.match(r"^([\d,]+)\w+", price_str).group(1).replace(",", ""))


def auto_marking_(df: pd.DataFrame) -> pd.DataFrame:
    """Pre-fill the `mark` column with an "x" for listings to screen out."""
    df["mark"] = ""
    # Social housing listings.
    social = (
        df.title.str.contains("社宅")
        | df.title.str.contains("社會住宅")
        | df.desc.str.contains("社會住宅")
    )
    df.loc[social, "mark"] = "x"
    # Listings whose equipment list mentions mechanical parking.
    df.loc[df["提供設備"].str.contains("機械車位"), "mark"] = "x"
    return df


def adjust_price_(df: pd.DataFrame) -> pd.DataFrame:
    """Derive `price_adjusted` from the monthly rent plus estimated extra costs."""
    # Brokers are tagged with "(收服務費)" in the poster field; spread the
    # service fee over the stay by adding price / 24 per month.
    df["price_adjusted"] = (
        df["price"] * (df["poster"].str.contains("服務費").astype("float") * 1 / 24 + 1)
    ).astype("int")
    # Add the monthly management fee, if any.
    df["price_adjusted"] = df["price_adjusted"] + df["管理費"].fillna("").apply(
        parse_price
    )
    # Assume an extra 2,500/month when the parking space is charged separately.
    df["price_adjusted"] = (
        df["price_adjusted"]
        + df["車位費"].fillna("").str.contains("費用另計").astype("int") * 2500
    )
    return df

--------------------------------------------------------------------------------
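A quick usage sketch of the post-processing helpers above, applied to a single made-up listing (the values are hypothetical; the Chinese column names mirror the ones produced by `fetch_info.py`):

```python
import pandas as pd

from utils.post_processing import adjust_price_, auto_marking_, parse_price

df = pd.DataFrame([{
    "title": "近市區電梯套房",
    "desc": "暫未添加說明",
    "poster": "屋主: 陳小姐",
    "price": 12000,
    "提供設備": "冷氣, 電梯, 機械車位",
    "管理費": "600元/月",
    "車位費": None,
}])

assert parse_price("600元/月") == 600
df = auto_marking_(df)  # mechanical parking => mark == "x"
df = adjust_price_(df)  # 12000 + 600 (management fee) == 12600
print(df[["mark", "price", "price_adjusted"]])
```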