├── .gitignore ├── 10 selenium 套件.ipynb ├── 11 selenium 瀏覽器自動化測試.ipynb ├── 12 selenium 等待.ipynb ├── 13 ActionChains.ipynb ├── 2 正規表達式.ipynb ├── 5 requests 套件.ipynb ├── 7 beautifulsoup 套件.ipynb ├── 8 cookie 用於 requests.ipynb ├── 9 PTT_NBA_看板主頁與內頁.ipynb ├── GPT2-Chinese.md ├── README.md ├── cases ├── .gitignore ├── PyAutoGUI.ipynb ├── archived │ ├── JCC_reject.ipynb │ ├── crowdfunding.ipynb │ ├── fb_video.ipynb │ ├── ig.ipynb │ ├── journals │ │ ├── EBSCO.ipynb │ │ ├── WebOfScience_custom_search.ipynb │ │ └── WebOfScience_journal_title.ipynb │ ├── lm_studio │ │ ├── README.md │ │ ├── example01.jpg │ │ ├── templates │ │ │ └── index.html │ │ ├── web_api.py │ │ └── web_api_embedding.py │ ├── network.ipynb │ ├── open_access │ │ ├── deprecated │ │ │ ├── (deprecated)臺灣大學學術期刊資料庫.ipynb │ │ │ ├── 中國學術年刊.ipynb │ │ │ ├── 中正漢學研究.ipynb │ │ │ ├── 政大中文學報.ipynb │ │ │ └── 東吳大學政治學報.ipynb │ │ ├── 中國文哲研究集刊.ipynb │ │ ├── 中研院法學期刊.ipynb │ │ ├── 台大日本語文研究.ipynb │ │ ├── 同心圓:文學與文化研究.ipynb │ │ ├── 國立中山大學_中國文學系_文與哲.ipynb │ │ ├── 國立臺灣大學美術史研究集刊.ipynb │ │ ├── 成大歷史學報.ipynb │ │ ├── 戲劇學刊.ipynb │ │ ├── 東吳中文學報.ipynb │ │ ├── 清華中文學報.ipynb │ │ └── 臺大中文學報.ipynb │ ├── tika.ipynb │ ├── twitter │ │ ├── README.md │ │ ├── basic.py │ │ └── twint_run.py │ ├── vector_index │ │ ├── README.md │ │ ├── make_index.py │ │ └── query.py │ ├── wikiart.ipynb │ └── 綜合.ipynb ├── digital_archives │ ├── taco-ith.ipynb │ ├── wikisource.csv │ └── wikisource.ipynb ├── download_captcha_images.ipynb ├── excel.ipynb ├── free-proxy.ipynb ├── hetubook_jinyong_requests.ipynb ├── image_gen │ ├── ComfyUI.md │ ├── civitai.md │ ├── colab_diffusers_pipeline.ipynb │ ├── microsoft_bing_chat.md │ └── run_diffusers_flux_pipeline.py ├── ixdzs_jinyong_post_requests.ipynb ├── jinyong_requests.ipynb ├── jinyong_selenium.ipynb ├── leaflet │ ├── README.md │ ├── templates │ │ └── index_cafe.html │ └── web_api.py ├── line-stickers.ipynb ├── mouse_XY_colors.py ├── nidss.ipynb ├── pymysql.ipynb ├── read.xlsx ├── sound │ ├── ebook │ │ ├── css │ │ │ └── 
jquery.highlight-within-textarea.css │ │ ├── js │ │ │ └── jquery.highlight-within-textarea.js │ │ ├── templates │ │ │ └── fetch.html │ │ ├── tmp │ │ │ └── .gitignore │ │ └── web_api.py │ └── google_lady.py ├── sqlite.ipynb ├── tabs.ipynb ├── twse.ipynb ├── weather.ipynb └── youtube.ipynb ├── html ├── HTML_CSS.zip ├── README.md ├── images │ ├── .DS_Store │ ├── Logo.svg │ ├── Shop.png │ ├── banner.png │ ├── p1.png │ ├── p2.png │ └── p3.png ├── index.html └── style.css ├── python_web_scraping.docx ├── python_web_scraping.pdf ├── turingcerts.jpg └── yt-dlp_and_ffmpeg.docx /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | dev/ 3 | chromedriver* 4 | test* -------------------------------------------------------------------------------- /10 selenium 套件.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "'''\n", 10 | "參考網頁:\n", 11 | "[1] 下載 Chrome Web Driver\n", 12 | "https://chromedriver.chromium.org/downloads\n", 13 | "'''\n", 14 | "\n", 15 | "# 操作 browser 的 API\n", 16 | "# from selenium.webdriver.chrome.service import Service\n", 17 | "from selenium import webdriver\n", 18 | "\n", 19 | "# 匯入套件\n", 20 | "from bs4 import BeautifulSoup as bs\n", 21 | "\n", 22 | "# 強制等待 (執行期間休息一下)\n", 23 | "from time import sleep\n", 24 | "\n", 25 | "# 使用 Chrome 的 WebDriver\n", 26 | "'''\n", 27 | "my_service = Service(executable_path=\"./chromedriver.exe\")\n", 28 | "driver = webdriver.Chrome(service=my_service)\n", 29 | "'''\n", 30 | "\n", 31 | "# 補充: 若沒有特別設定,只要電腦有安裝 Chrome,就可以直接使用\n", 32 | "driver = webdriver.Chrome()\n", 33 | "\n", 34 | "# 開啟 104人力行銀 首頁\n", 35 | "driver.get(\"https://www.104.com.tw/jobs/main/\")\n", 36 | "\n", 37 | "# 取得檢視原始碼的內容 (page_source 取得的 html,是動態的、使用者操作過後的結果)\n", 38 | "html = driver.page_source\n", 39 | "\n", 40 | "# 印出 
html (也可以跟 Beautifulsoup 整合)\n", 41 | "# print(html)\n", 42 | "\n", 43 | "# 指定 lxml 作為解析器\n", 44 | "soup = bs(html, \"lxml\")\n", 45 | "\n", 46 | "# 取得元素\n", 47 | "div = soup.select('div.header__container')[0]\n", 48 | "\n", 49 | "# 顯示內文\n", 50 | "print(div.get_text())\n", 51 | "\n", 52 | "# 關閉瀏覽器\n", 53 | "driver.quit()" 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3 (ipykernel)", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.10.14" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 4 78 | } 79 | -------------------------------------------------------------------------------- /11 selenium 瀏覽器自動化測試.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 匯入自動測試工具相關套件" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "'''\n", 17 | "匯入套件\n", 18 | "'''\n", 19 | "# 操作 browser 的 API\n", 20 | "from selenium.webdriver.chrome.service import Service\n", 21 | "from selenium import webdriver\n", 22 | "\n", 23 | "# 處理逾時例外的工具\n", 24 | "from selenium.common.exceptions import TimeoutException\n", 25 | "\n", 26 | "# 面對動態網頁,等待某個元素出現的工具,通常與 expected_conditions 搭配\n", 27 | "from selenium.webdriver.support.ui import WebDriverWait\n", 28 | "\n", 29 | "# 搭配 WebDriverWait 使用,對元素狀態的一種期待條件,若條件發生,則等待結束,往下一行執行\n", 30 | "from selenium.webdriver.support import expected_conditions as EC\n", 31 | "\n", 32 | "# 期待元素出現要透過什麼方式指定,通常與 EC、WebDriverWait 一起使用\n", 33 | "from selenium.webdriver.common.by import By\n", 34 | "\n", 35 | "# 強制等待 
(執行期間休息一下)\n", 36 | "from time import sleep\n", 37 | "\n", 38 | "'''\n", 39 | "selenium 啓動 Chrome 的進階配置參數\n", 40 | "參考網址:https://stackoverflow.max-everyday.com/2019/12/selenium-chrome-options/\n", 41 | "'''\n", 42 | "# 啟動瀏覽器工具的選項\n", 43 | "my_options = webdriver.ChromeOptions()\n", 44 | "# my_options.add_argument(\"--headless\") #不開啟實體瀏覽器背景執行\n", 45 | "my_options.add_argument(\"--start-maximized\") #最大化視窗\n", 46 | "my_options.add_argument(\"--incognito\") #開啟無痕模式\n", 47 | "my_options.add_argument(\"--disable-popup-blocking\") #禁用彈出攔截\n", 48 | "my_options.add_argument(\"--disable-notifications\") #取消 chrome 推播通知\n", 49 | "my_options.add_argument(\"--lang=zh-TW\") #設定為正體中文\n", 50 | "\n", 51 | "\n", 52 | "# 使用 Chrome 的 WebDriver\n", 53 | "# my_service = Service(executable_path=\"./chromedriver.exe\")\n", 54 | "driver = webdriver.Chrome(\n", 55 | " options = my_options,\n", 56 | "# service = my_service\n", 57 | ")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "# 在瀏覽器中執行自訂 JavaScript 程式" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 開啟網頁\n", 74 | "driver.get(\"https://crptransfer.moe.gov.tw/\")\n", 75 | "\n", 76 | "# 跳出 alert 視窗 (在 chrome 裡面執行 javascript 語法)\n", 77 | "driver.execute_script(\"window.alert('這是我們自訂的彈跳視窗');\")\n", 78 | "\n", 79 | "# 等個幾秒\n", 80 | "sleep(3)\n", 81 | "\n", 82 | "# 點選彈出裡面的確定按鈕\n", 83 | "driver.switch_to.alert.accept()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "# 輸入文字,送出表單" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# 開啟網頁\n", 100 | "driver.get(\"https://crptransfer.moe.gov.tw/\")\n", 101 | "\n", 102 | "# 尋找網頁中的搜尋框\n", 103 | "inputElement = driver.find_element(\n", 104 | " By.CSS_SELECTOR, 'input#SN'\n", 105 | ")\n", 106 | "\n", 107 | "# 
在搜尋框中輸入文字\n", 108 | "inputElement.send_keys(\"人帥真好\")\n", 109 | "\n", 110 | "# 睡個幾秒\n", 111 | "sleep(2)\n", 112 | "\n", 113 | "# 送出搜尋\n", 114 | "inputElement.submit()\n", 115 | "\n", 116 | "# 搜尋結果的 CSS Selector\n", 117 | "cssSelector = \"body > table > tbody > tr:nth-child(1) > td > main > article > div > table > tbody > tr:nth-child(2) > td\"\n", 118 | "\n", 119 | "try:\n", 120 | " # 等待網頁搜尋結果\n", 121 | " WebDriverWait(driver, 10).until(\n", 122 | " EC.presence_of_element_located(\n", 123 | " (By.CSS_SELECTOR, cssSelector)\n", 124 | " )\n", 125 | " )\n", 126 | " \n", 127 | " # 取得第一頁搜尋結果\n", 128 | " element = driver.find_element(\n", 129 | " By.CSS_SELECTOR, cssSelector\n", 130 | " )\n", 131 | " \n", 132 | " # 輸出想要爬取的文字\n", 133 | " print(element.text) \n", 134 | " print(element.get_attribute('innerText')) # 另一種寫法\n", 135 | "except TimeoutException:\n", 136 | " print('等待逾時!')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# 輸入文字,按下送出鈕" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# 開啟網頁\n", 153 | "driver.get(\"https://www.104.com.tw/jobs/main/\")\n", 154 | "\n", 155 | "# 尋找網頁中的搜尋框\n", 156 | "inputElement = driver.find_element(\n", 157 | " By.CSS_SELECTOR, 'input[data-gtm-index^=\"搜尋欄位\"]'\n", 158 | ")\n", 159 | "\n", 160 | "# 在搜尋框中輸入文字\n", 161 | "inputElement.send_keys(\"python\")\n", 162 | "\n", 163 | "# 睡個幾秒\n", 164 | "sleep(3)\n", 165 | "\n", 166 | "# 按鈕選擇器\n", 167 | "cssSelectorBtn = 'button.btn[type=\"submit\"][data-gtm-index^=\"搜尋欄位\"]'\n", 168 | "\n", 169 | "try:\n", 170 | " # 等待元素\n", 171 | " WebDriverWait(driver, 10).until(\n", 172 | " EC.presence_of_element_located(\n", 173 | " (By.CSS_SELECTOR, cssSelectorBtn)\n", 174 | " )\n", 175 | " )\n", 176 | " \n", 177 | " # 取得按鈕元素\n", 178 | " btn = driver.find_element(\n", 179 | " By.CSS_SELECTOR, cssSelectorBtn\n", 180 | " )\n", 181 | " \n", 182 | " # 
按下按鈕\n", 183 | " btn.click()\n", 184 | "except TimeoutException:\n", 185 | " print('等待逾時!')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# 刷新頁面 (類似 F5 或 Ctrl + R)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# 開啟網頁\n", 202 | "driver.get(\"https://reurl.cc/jR725D\")\n", 203 | "\n", 204 | "# 睡個幾秒\n", 205 | "sleep(3)\n", 206 | "\n", 207 | "# 刷新頁面\n", 208 | "driver.refresh()\n", 209 | "\n", 210 | "# 睡個幾秒\n", 211 | "sleep(3)\n", 212 | "\n", 213 | "# 刷新頁面\n", 214 | "driver.refresh()\n", 215 | "\n", 216 | "# 睡個幾秒\n", 217 | "sleep(3)\n", 218 | "\n", 219 | "# 關閉瀏覽器\n", 220 | "driver.quit()" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3 (ipykernel)", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.10.14" 241 | }, 242 | "vscode": { 243 | "interpreter": { 244 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 245 | } 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 4 250 | } 251 | -------------------------------------------------------------------------------- /12 selenium 等待.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 匯入自動測試工具相關套件" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "'''\n", 17 | "參考網址:\n", 18 | "[1] Webdriver Manager for Python\n", 19 | 
"https://pypi.org/project/webdriver-manager/\n", 20 | "'''\n", 21 | "\n", 22 | "# 匯入套件\n", 23 | "from selenium import webdriver\n", 24 | "from selenium.webdriver.chrome.service import Service\n", 25 | "from webdriver_manager.chrome import ChromeDriverManager\n", 26 | "from selenium.common.exceptions import TimeoutException, NoSuchElementException\n", 27 | "from selenium.webdriver.support.ui import WebDriverWait\n", 28 | "from selenium.webdriver.support import expected_conditions as EC\n", 29 | "from selenium.webdriver.common.by import By\n", 30 | "from time import sleep" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# (Optional) 開啟用於自動控制的瀏覽器 (自動取得 Chrome 的 WebDriver)\n", 40 | "driver = webdriver.Chrome(\n", 41 | " service = Service(ChromeDriverManager().install())\n", 42 | ")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# 強制等待" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "'''\n", 59 | "強制等待\n", 60 | "'''\n", 61 | "# 開啟用於自動控制的瀏覽器\n", 62 | "driver = webdriver.Chrome()\n", 63 | "\n", 64 | "try:\n", 65 | " # 走訪網址\n", 66 | " driver.get('https://tw.yahoo.com/')\n", 67 | " \n", 68 | " # 強制等待 3 秒\n", 69 | " sleep(3)\n", 70 | " \n", 71 | " # 印出網址\n", 72 | " print(driver.current_url)\n", 73 | "except:\n", 74 | " print(\"程式出錯!\")\n", 75 | "finally:\n", 76 | " # 關閉瀏覽器\n", 77 | " driver.quit()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 隱性等待" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "'''\n", 94 | "隱性等待\n", 95 | "'''\n", 96 | "# 開啟用於自動控制的瀏覽器\n", 97 | "driver = webdriver.Chrome()\n", 98 | "\n", 99 | "try:\n", 100 | " # 最多等 15 秒\n", 101 | " driver.implicitly_wait(15)\n", 102 | " \n", 103 
| " # 走訪網址\n", 104 | " driver.get('https://tw.yahoo.com/')\n", 105 | " \n", 106 | " # 取得元素\n", 107 | " element = driver.find_element(\n", 108 | " By.CSS_SELECTOR, \n", 109 | " 'a#header-logo'\n", 110 | " )\n", 111 | "\n", 112 | " # 印出超連結 ( 透過 .get_attribute('屬性') 來取得屬性的值 )\n", 113 | " print(element.get_attribute('href'))\n", 114 | "except NoSuchElementException:\n", 115 | " print(\"找不到元素!\")\n", 116 | "finally:\n", 117 | " # 關閉瀏覽器\n", 118 | " driver.quit()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# 顯性等待" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "'''\n", 135 | "顯性等待\n", 136 | "'''\n", 137 | "# 開啟用於自動控制的瀏覽器\n", 138 | "driver = webdriver.Chrome()\n", 139 | "\n", 140 | "try:\n", 141 | " # 走訪網址\n", 142 | " driver.get('https://www.youtube.com/?gl=TW')\n", 143 | "\n", 144 | " # 滿足條件(10秒內找到元素),則往下一步\n", 145 | " WebDriverWait(driver, 10).until(\n", 146 | " EC.presence_of_element_located( \n", 147 | " (By.LINK_TEXT, '首頁') \n", 148 | " )\n", 149 | " )\n", 150 | " \n", 151 | " # 印出首頁連結\n", 152 | " link = driver.find_element(\n", 153 | " By.LINK_TEXT, '首頁'\n", 154 | " ).get_attribute('href')\n", 155 | " print(link)\n", 156 | " \n", 157 | "except TimeoutException:\n", 158 | " print('等待逾時!')\n", 159 | "finally:\n", 160 | " # 關閉瀏覽器\n", 161 | " driver.quit()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# 參考連結\n", 169 | "- [Ubuntu安装运行无头Selenium Chrome](https://cloud.tencent.com/developer/article/1966470)" 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 3 (ipykernel)", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | 
"name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.10.14" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /2 正規表達式.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 正規表達式 Regular Expression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# 匯入 regex 套件\n", 17 | "import re" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# match\n", 27 | "'''\n", 28 | "說明\n", 29 | "re.match 會從字串的「開頭」開始比對,\n", 30 | "比對不到,則回傳 None\n", 31 | "'''\n", 32 | "regex01 = r'2[0-9]{3}\\/[0-1]?[0-9]{1}\\/([0-3]?[0-9])'\n", 33 | "string01 = \"2024/09/18\"\n", 34 | "match01 = re.match(regex01, string01)\n", 35 | "print(match01)\n", 36 | "print(match01[0])\n", 37 | "print(match01[1])\n", 38 | "\n", 39 | "'''\n", 40 | "補充:\n", 41 | "match.group() 或 match.group(0) 是regex所代表的整個完整比對的字串,\n", 42 | "match.group(1)是第一組()中的內容,\n", 43 | "match.group(2)是第二組()中的內容...\n", 44 | "'''\n", 45 | "print(match01.group(0))\n", 46 | "print(match01.group(1))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# findall\n", 56 | "'''\n", 57 | "說明\n", 58 | "re.findall 會將所有配對到的字串\n", 59 | "回傳成一個 list\n", 60 | "'''\n", 61 | "regex02 = r'[0-9]+'\n", 62 | "string02 = \"0911111111, 0922222222, 0933333333\"\n", 63 | "listMatch02 = re.findall(regex02, string02)\n", 64 | "print(listMatch02)\n", 65 | "print(listMatch02[0])\n", 66 | "print(listMatch02[2])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 
| "outputs": [], 74 | "source": [ 75 | "# finditer\n", 76 | "'''\n", 77 | "說明\n", 78 | "re.finditer 會將所有配對到的字串\n", 79 | "以迭代的方式呈現,若沒有配對到,則回傳空的迭代器 (不會回傳 None)\n", 80 | "'''\n", 81 | "regex03 = r'[0-9]+'\n", 82 | "string03 = \"0911111111, 0922222222, 0933333333\"\n", 83 | "iterableMatch03 = re.finditer(regex03, string03)\n", 84 | "if iterableMatch03 != None:\n", 85 | " for match in iterableMatch03:\n", 86 | " print(match[0])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# search\n", 96 | "'''\n", 97 | "說明\n", 98 | "re.search 會將整個字串進行搜尋,\n", 99 | "但只會回傳第一個配對到的結果,\n", 100 | "比對不到,則回傳 None\n", 101 | "'''\n", 102 | "regex04 = r'[a-zA-Z]([12])\\d{8}'\n", 103 | "string04 = \"A123456789, S299888777\"\n", 104 | "match04 = re.search(regex04, string04)\n", 105 | "print(match04)\n", 106 | "print(match04[0])\n", 107 | "print(match04[1])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# split\n", 117 | "'''\n", 118 | "說明\n", 119 | "re.split 類似 string.split('separator'),\n", 120 | "只是用正規表達式來作為 separator,\n", 121 | "並回傳 list\n", 122 | "'''\n", 123 | "regex05 = r'\\d'\n", 124 | "string05 = \"One1Two2Three3Four4\"\n", 125 | "listMatch05 = re.split(regex05, string05)\n", 126 | "print(listMatch05)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# sub\n", 136 | "'''\n", 137 | "說明\n", 138 | "re.sub(regex, replace_string, test_string)\n", 139 | "將 regex 所代表的文字,改成 replace_string,文字來源是 test_string\n", 140 | "'''\n", 141 | "regex06 = r\"\\D\"\n", 142 | "string06 = \"5-20 #1314\"\n", 143 | "strResult = re.sub(regex06, \"\", string06)\n", 144 | "print(strResult)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# 環視\n", 152 | "| 名稱 | 語法 | 說明 |\n", 153 | 
"|:---:|:---:|:---:|\n", 154 | "| 正向環視 | (?=) | 這位置右邊要出現什麼 |\n", 155 | "| 正向環視否定 | (?!) | 這位置右邊不能出現什麼 |\n", 156 | "| 反向環視 | (?<=) | 這位置左邊要出現什麼 |\n", 157 | "| 反向環視否定 | (?[12])\\d{8}'\n", 207 | "string09 = \"A100000001\"\n", 208 | "match09 = re.match(regex09, string09)\n", 209 | "\n", 210 | "# 完整配對的文字\n", 211 | "print(match09[0])\n", 212 | "print(match09.group(0))\n", 213 | "print(match09.group())\n", 214 | "\n", 215 | "# 具名(類似key)所代表的值,也可以用索引代號來取得\n", 216 | "print(match09.group('gender'))\n", 217 | "print(match09['gender'])\n", 218 | "print(match09[1])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# 參考資料\n", 226 | "1. [Python3 正则表达式](https://www.runoob.com/python3/python3-reg-expressions.html \"Python3 正则表达式\")\n", 227 | "2. [正則表達式-全型英數中文字、常用符號unicode對照表](https://blog.typeart.cc/%E6%AD%A3%E5%89%87%E8%A1%A8%E9%81%94%E5%BC%8F-%E5%85%A8%E5%9E%8B%E8%8B%B1%E6%95%B8%E4%B8%AD%E6%96%87%E5%AD%97%E3%80%81%E5%B8%B8%E7%94%A8%E7%AC%A6%E8%99%9Funicode%E5%B0%8D%E7%85%A7%E8%A1%A8/ \"正則表達式-全型英數中文字、常用符號unicode對照表\")\n", 228 | "3. [匹配中文字符的正則表達式: [/u4e00-/u9fa5]](https://www.itread01.com/content/1513168876.html \"匹配中文字符的正則表達式: [/u4e00-/u9fa5]\")\n", 229 | "4. 
[【Regular Expression】正向環視、反向環視](https://toyo0103.blogspot.com/2017/01/regular-expression.html \"【Regular Expression】正向環視、反向環視\")" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3 (ipykernel)", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.10.11" 250 | }, 251 | "vscode": { 252 | "interpreter": { 253 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 254 | } 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 4 259 | } 260 | -------------------------------------------------------------------------------- /7 beautifulsoup 套件.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5a441807", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''\n", 11 | "參考網頁\n", 12 | "[1] Python 使用 Beautiful Soup 抓取與解析網頁資料,開發網路爬蟲教學\n", 13 | "https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/2/\n", 14 | "'''\n", 15 | "\n", 16 | "import requests as req\n", 17 | "from bs4 import BeautifulSoup as bs\n", 18 | "\n", 19 | " # PTT NBA 板\n", 20 | "url = \"https://www.ptt.cc/bbs/NBA/index.html\"\n", 21 | "\n", 22 | "# 用 requests 的 get 方法把網頁抓下來\n", 23 | "res = req.get(url) \n", 24 | "\n", 25 | "# 指定 lxml 作為解析器\n", 26 | "soup = bs(res.text, \"lxml\") " 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "33639bed", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# 第一個 \n", 37 | "print(soup.find(\"a\")) \n", 38 | "\n", 39 | "# 全部 ,此時回傳 list\n", 40 | "print(soup.find_all(\"a\")) \n", 41 | 
"\n", 42 | "# 指定 list 某個元素的 html\n", 43 | "print(soup.find_all(\"a\")[2]) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "532d85f1", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# 取得 id 為 logo 的元素\n", 54 | "a = soup.find(id = \"logo\")\n", 55 | "print(a)\n", 56 | "\n", 57 | "# 取得所有 div,類別名稱為 r-ent,回傳為 list\n", 58 | "divs = soup.find_all(\"div\", class_ = \"r-ent\")\n", 59 | "print(divs)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "9c1ddf9a", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "'''\n", 70 | "以下透過 CSS selector 取得元素,\n", 71 | "回傳格式為 list\n", 72 | "'''\n", 73 | "# 輸出 title\n", 74 | "print(soup.select_one('title'))\n", 75 | "\n", 76 | "# 輸出 a\n", 77 | "print(soup.select('a'))\n", 78 | "\n", 79 | "# 透過 class 名稱取得元素\n", 80 | "print(soup.select(\"a.board\"))\n", 81 | "\n", 82 | "# 透過 id 名稱取得元素\n", 83 | "print(soup.select_one(\"#logo\"))\n", 84 | "\n", 85 | "# 透過 attribute 取得元素\n", 86 | "print(soup.select('a[class=\"board\"]'))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "77efcd15", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# 取得單一節點的文字內容 (select_one 會回傳單一 bs element 物件,select 會回傳 list)\n", 97 | "print(soup.select_one('title').get_text())\n", 98 | "print(soup.select('a')[0].get_text())" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "59d702d3", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# 透過迭代取得所有 a 的文字內容\n", 109 | "for a in soup.select('a'):\n", 110 | " print(a.get_text())" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "8b5ccf58", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# 透過迭代取得所有 a 的屬性 href\n", 121 | "for a in soup.select('a'):\n", 122 | " if a.has_attr('href'):\n", 123 | " print(a['href']) # a.get(\"href\")\n", 124 | " 
else:\n", 125 | " print(\"=\" * 50)\n", 126 | " print(f\"連結[{a.get_text()}] 沒有 href 屬性\")\n", 127 | " print(\"=\" * 50)" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.10.11" 148 | }, 149 | "vscode": { 150 | "interpreter": { 151 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 152 | } 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /8 cookie 用於 requests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "af7c99e7", 6 | "metadata": {}, 7 | "source": [ 8 | "# 以 PTT Gossiping (八卦版) 為例" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cd2ac08a", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests as req\n", 19 | "from bs4 import BeautifulSoup as bs\n", 20 | "\n", 21 | " # PTT Gossiping (八卦版) \n", 22 | "url = \"https://www.ptt.cc/bbs/Gossiping/index.html\"\n", 23 | "\n", 24 | "# 首頁網址\n", 25 | "prefix = 'https://www.ptt.cc'\n", 26 | "\n", 27 | "# 設定 cookie\n", 28 | "my_cookies = {\n", 29 | " \"over18\": \"1\"\n", 30 | "}\n", 31 | "\n", 32 | "# 用 requests 的 get 方法把網頁抓下來\n", 33 | "res = req.get(url, cookies = my_cookies) \n", 34 | "\n", 35 | "# 指定 lxml 作為解析器\n", 36 | "soup = bs(res.text, \"lxml\")\n", 37 | "\n", 38 | "# 顯示連結列表\n", 39 | "for a in soup.select('div.r-ent > div.title > a'):\n", 40 | " print(a.get_text())\n", 41 | " print(prefix + a['href'])" 42 | 
] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3 (ipykernel)", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.10.13" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 5 66 | } 67 | -------------------------------------------------------------------------------- /9 PTT_NBA_看板主頁與內頁.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# 匯入套件\n", 10 | "from bs4 import BeautifulSoup as bs\n", 11 | "import requests as req\n", 12 | "from pprint import pprint" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# 一般用法\n", 20 | "---" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# 取得新聞列表\n", 30 | "url = \"https://www.ptt.cc/bbs/NBA/index.html\" \n", 31 | "\n", 32 | "# 用 requests 的 get 方法把網頁抓下來\n", 33 | "res = req.get(url) \n", 34 | "\n", 35 | "# 指定 lxml 作為解析器\n", 36 | "soup = bs(res.text, \"lxml\") \n", 37 | "\n", 38 | "# 建立 list 來放置列表資訊\n", 39 | "list_posts = []" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# 清空放置列表資訊的變數\n", 49 | "list_posts.clear()\n", 50 | "\n", 51 | "# 取得 列表 的文字與超連結\n", 52 | "for a in soup.select('div.r-ent div.title a[href]'):\n", 53 | " print(a.get_text())\n", 54 | " print(a['href']) # 或是 a.get('href') \n", 55 | " \n", 56 | " # 加入列表資訊\n", 57 | " list_posts.append({\n", 58 | " 'title': a.get_text(),\n", 59 | " 'link': 
'https://www.ptt.cc' + a['href']\n", 60 | " })" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# 走訪每一個 a link,整合網頁內文\n", 70 | "for index, obj in enumerate(list_posts):\n", 71 | " res_ = req.get(obj['link'])\n", 72 | " soup_ = bs(res_.text, \"lxml\")\n", 73 | " \n", 74 | " # 去掉 div.article-metaline (作者、標題、時間…等)\n", 75 | " for div in soup_.select('div[class^=\"article-metaline\"]'):\n", 76 | " div.decompose()\n", 77 | " \n", 78 | " # 去掉 div.push (推文: 推、→、噓) (判斷元素是否存在)\n", 79 | " if len( soup_.select('div.push') ) > 0:\n", 80 | " for div in soup_.select('div.push'):\n", 81 | " div.decompose()\n", 82 | " \n", 83 | " # 取得實際需要的內容 (類似 JavaScript 的 innerHTML)\n", 84 | " html = soup_.select_one('div#main-content').decode_contents()\n", 85 | "\n", 86 | " # 類似 JavaScript outerHTML\n", 87 | " # html = str(soup_.select_one('div#main-content')) \n", 88 | " \n", 89 | " # 整合到列表資訊的變數當中\n", 90 | " list_posts[index]['html'] = html\n", 91 | "\n", 92 | "\n", 93 | "# 預覽所有結果\n", 94 | "pprint(list_posts)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "# 思考\n", 102 | "- 如何取得**多個分頁**的內容?\n", 103 | " - 觀察分頁數字在網址的呈現方式\n", 104 | " - 將觀察到的分頁數字嵌入對應的網址當中" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# 清空放置列表資訊的變數\n", 114 | "list_posts.clear()\n", 115 | "\n", 116 | "# 起始頁數\n", 117 | "init_page = 6503\n", 118 | "\n", 119 | "# 最新頁數\n", 120 | "latest_page = 6504\n", 121 | "\n", 122 | "# 在已經知道分頁數的情況下\n", 123 | "for page in range(init_page, latest_page + 1):\n", 124 | " \n", 125 | " # 取得新聞列表\n", 126 | " url = f\"https://www.ptt.cc/bbs/NBA/index{page}.html\" \n", 127 | "\n", 128 | " # 用 requests 的 get 方法把網頁抓下來\n", 129 | " res = req.get(url) \n", 130 | "\n", 131 | " # 指定 lxml 作為解析器\n", 132 | " soup = bs(res.text, \"lxml\") \n", 133 | " \n", 134 | " # 取得 列表 
的文字與超連結\n", 135 | " for a in soup.select('div.r-ent div.title a[href]'):\n", 136 | " # 加入列表資訊\n", 137 | " list_posts.append({\n", 138 | " 'title': a.get_text(),\n", 139 | " 'link': 'https://www.ptt.cc' + a['href']\n", 140 | " })\n", 141 | " \n", 142 | "# 走訪每一個 a link,整合網頁內文\n", 143 | "for index, obj in enumerate(list_posts):\n", 144 | " res_ = req.get(obj['link'])\n", 145 | " soup_ = bs(res_.text, \"lxml\")\n", 146 | "\n", 147 | " # 去掉 div.article-metaline (作者、標題、時間…等)\n", 148 | " for div in soup_.select('div[class^=\"article-metaline\"]'):\n", 149 | " div.decompose()\n", 150 | " \n", 151 | " # 去掉 div.push (推文: 推、→、噓) (判斷去掉元素是否存在)\n", 152 | " if len( soup_.select('div.push') ) > 0:\n", 153 | " for div in soup_.select('div.push'):\n", 154 | " div.decompose()\n", 155 | "\n", 156 | " # 取得實際需要的內容 (類似 JavaScript 的 innerHTML)\n", 157 | " html = soup_.select_one('div#main-content').decode_contents()\n", 158 | "\n", 159 | " # 整合到列表資訊的變數當中\n", 160 | " list_posts[index]['html'] = html\n", 161 | "\n", 162 | " \n", 163 | "# 預覽所有結果\n", 164 | "pprint(list_posts)" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3 (ipykernel)", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.10.13" 185 | }, 186 | "vscode": { 187 | "interpreter": { 188 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 189 | } 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /GPT2-Chinese.md: -------------------------------------------------------------------------------- 1 | # GPT2-Chinese 2 | 3 | ## 一、簡介 4 | 自然語言生成(natural language 
generation,NLG)是自然語言處理(natural language processing,NLP)重要的一部分。它可以降低人類與機器之間資訊交流的鴻溝,也試著將大量的非結構性資料,轉換成人類能夠理解的呈現方式。自然語言生成的系統,常用於協助人類作家撰寫日常的文件,包括商業書信、天氣報告,也被用作互動式解說工具,以易於理解的方式,與非專業使用者進行溝通交流;在進行決策時,儘管最常用的方法是以圖形方式顯示數據,但已證明文本摘要可以改善決策制定。 5 | *** 6 | 由於近年人工智慧(artificial intelligence,AI)議題的興起,透過 AI 即時處理並應用大量數據的特性,延伸出許多 NLG 的應用,例如聊天機器人(chatbot)、線上客服(online customer service)等對話系統,其對話的內容與訊息,是透過 NLG 來產生。 7 | *** 8 | 在過去,對話生成通常使用循環神經網路(recurrent neural network)、長短期記憶(long-short term memory,LSTM)、閘道循環單位(gated recurrent unit,GRU)等來建立語言模型(language model),以考慮前一個文字,來對下一個可能生成的文字進行預測,然而 RNN 在長時間的記憶表現不好,LSTM 與 GRU 雖然改善了 RNN 架構,卻又有耗費系統資源的問題。在 2019 年,OpenAI 推出了 GPT-2(generative pre-trained transformer 2),在對話生成上,有著優異的表現。GPT-2 是一個大型、以 transformer 為基礎的語言模型,其語言建模對大約 40 GB 的超大語料庫進行了預訓練。GPT-2 的模型訓練效果不僅比 RNN、LSTM、GRU 好,且對上下文的文句預測精準度更佳,徹底改變了自然語言處理的生態;現今主流自然語言生成領域的機器學習模型基礎,大多建立在 transformer 之上。 9 | *** 10 | 近年 GPT-2 應用的範圍很多,例如透過生成專利範圍內容,用來協助發明人解決對於專利範圍無法掌握撰寫要領的問題;在收集多樣化的問題過後,透過人、事、時、地、物加以分類,再透過生成完整的問題句子,來解決將相同分類中相似的問題進行替換後,問句不完整的情形;給定風格、關鍵句子與隨機參數,進行歌詞生成,根據風格條件生成具有結構性、押韻性、原創性的歌詞等等。 11 | 12 | ## 二、本例練習目的 13 | 1. 透過 GPT2-Chinese 訓練自行整理的語料。 14 | 2. 
套用訓練完成的語言模型,透過自訂的前導文字,來進行後續的文字生成。 15 | 16 | ## 三、說明 17 | - 作業環境: 18 | - Windows 10 or Linux Ubuntu 18.04+ 19 | - Anaconda (Python 3.7+) 20 | - 專案連結: 21 | - [GPT2-Chinese 專案連結](https://github.com/Morizeyao/GPT2-Chinese/tree/old_gpt_2_chinese_before_2021_4_22 "GPT2-Chinese 專案連結") 22 | - 下載方式: 23 | - Git 下載指令:```git clone https://github.com/Morizeyao/GPT2-Chinese.git``` 24 | - 手動下載:專案連結頁面 -> Code -> Download ZIP 25 | - 開始前的準備與流程 26 | - 說明: 27 | - 以 電腦 / 筆電 有 GPU 的 Windows 環境為例 28 | - 請注意 nvidia driver 與 CUDA、CUDA 與 CuDNN 之間的相依問題 29 | - 流程一: 先確認 nVIDIA driver 是否安裝,如果不是進階使用者,建議用最新版 [NVIDIA 驅動程式下載](https://www.nvidia.com.tw/Download/index.aspx?lang=tw "NVIDIA 驅動程式下載") 30 | - 下載方式分為「SD」與「GRD」 31 | - 如果你需要對最新遊戲、DLC 提供最即時支援的玩家,請選擇 Game Ready 驅動程式。 32 | - 如果你需要對影片編輯、動畫、攝影、繪圖設計和直播等創作流程提供穩定的品質,請選擇 Studio 驅動程式。 33 | - 流程二: 以 GPT-2 所使用的 Transformer 為例 34 | - 它使用 PyTorch 框架,所以要先了解 PyTorch 支援的 CUDA 版本: [INSTALL PYTORCH](https://pytorch.org/ "INSTALL PYTORCH") 35 | - 下載 CUDA 前,請先至 PyTorch 網站,了解目前支援 CUDA 的版本,下載 cuDNN 亦同。 36 | - ![INSTALL PYTORCH](https://i.imgur.com/xBctpZ0.png "INSTALL PYTORCH") 37 | - 流程三: 下載: [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive "CUDA Toolkit Archive") 38 | - 流程四: 下載: [NVIDIA cuDNN](https://developer.nvidia.com/cudnn "NVIDIA cuDNN") 39 | - 需要先申請帳號密碼,才能進入下載頁面 40 | - 中間可能會請你填寫問卷,依實際情況填寫即可 41 | - 目前個人電腦環境所配對的 CUDA / cuDNN 下載連結 42 | - CUDA: [CUDA Toolkit 11.1 Update 1 Downloads](https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_456.81_win10.exe "cuda_11.1.1_456.81_win10.exe") 43 | - cuDNN: [cuDNN Library for Windows (x86)](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.0.5/11.1_20201106/cudnn-11.1-windows-x64-v8.0.5.39.zip "cudnn-11.1-windows-x64-v8.0.5.39.zip") 44 | - 安裝套件: 45 | - 進入專案資料夾後輸入 ```pip install -r requirements.txt``` 46 | - 檢測目前電腦有沒有支援 CUDA 的程式碼 47 | ``` 48 | import torch 49 | print(torch.cuda.is_available()) 50 | print(torch.cuda.current_device()) 51 | 
print(torch.cuda.device(0)) 52 | ``` 53 | ``` 54 | 若有支援,應該輸出類似下方的字樣: 55 | True 56 | 0 57 | 58 | ``` 59 | - 安裝參考連結: 60 | - [Table 3. CUDA Toolkit and Corresponding Driver Versions](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html "Table 3. CUDA Toolkit and Corresponding Driver Versions") 61 | - [CUDA 與 CuDNN 安裝及環境變數設定](https://qqmanlin.medium.com/cuda-%E8%88%87-cudnn-%E5%AE%89%E8%A3%9D-e982d92162af "CUDA 與 CuDNN 安裝及環境變數設定") 62 | - [Win10環境下配置CUDA與cuDNN](https://dotblogs.com.tw/CYLcode/2018/09/20/163005 "Win10環境下配置CUDA與cuDNN") 63 | - 確認 CUDA 是否正確安裝的指令: 64 | - 安裝完 nVIDIA driver 後可用的指令(GPU 當前狀態): ```nvidia-smi``` 65 | - 安裝完 CUDA 後可用的指令(觀看目前安裝的 CUDA 版本): ```nvcc -V``` 66 | - 訓練資料結構: 67 | - [連結](https://github.com/Morizeyao/GPT2-Chinese/blob/old_gpt_2_chinese_before_2021_4_22/train.json "連結") 68 | - ![訓練格式](https://i.imgur.com/Dab1QPY.png "訓練格式") 69 | - 參考網頁: 70 | - [直觀理解 GPT-2 語言模型並生成金庸武俠小說](https://leemeng.tw/gpt2-language-model-generate-chinese-jing-yong-novels.html "直觀理解 GPT-2 語言模型並生成金庸武俠小說") 71 | - [使用GPT2-Chinese生成中文小說](https://www.cc.ntu.edu.tw/chinese/epaper/0058/20210920_5808.html "作者:楊德倫 / 資策會數位教育研究所數位人才培育中心講師") 72 | 73 | - 金庸小說訓練資料的爬取教學 74 | - [selenium 取得金庸小說的內容,並存成txt與json檔](https://www.youtube.com/watch?v=jJzZcMjZsKM "[selenium]取得金庸小說的內容,並存成txt與json檔") 75 | - [requests 取得金庸小說的內容,並存成txt與json檔](https://www.youtube.com/watch?v=JsmLtMC43Lc "[requests]取得金庸小說的內容,並存成txt與json檔") 76 | 77 | ## 四、在 Windows 的基本使用方式 78 | 79 | ### (一)win10 安裝pytorch gpu 及 解決報錯“OSError: [WinError 126] 找不到指定的模組 80 | [WinError 126 找不到指定的模組](https://www.mdeditor.tw/pl/pndo/zh-tw "WinError 126 找不到指定的模組") 81 | 82 | ### (二)[WinError 126] VC-redist 安裝檔 83 | [WinError 126 VC-redist 安裝檔](https://download.visualstudio.microsoft.com/download/pr/89a3b9df-4a09-492e-8474-8f92c115c51d/B1A32C71A6B7D5978904FB223763263EA5A7EB23B2C44A0D60E90D234AD99178/VC_redist.x64.exe "WinError 126 VC-redist 安裝檔") 84 | 85 | ### (三)訓練文章的指令 86 | ``` 87 | python train.py --device=0 --epochs=1 --batch_size=1 
--min_length=10 --raw_data_path=data/jinyong.json --output_dir=model/ --raw 88 | ``` 89 | | 參數 | 說明 | 90 | | ------ | ------ | 91 | | train.py | 訓練用主程式 | 92 | | device | 指定用哪一個 GPU (沒 GPU,預設 CPU) | 93 | | epochs | 訓練幾回 | 94 | | batch_size | 每次拿幾個樣本進行訓練。常見的是 2 的 n 次方 | 95 | | min_length | 每個樣本至少需要多少長度才拿來訓練 | 96 | | raw_data_path | 訓練資料 JSON 檔案路徑 | 97 | | output_dir | 訓練完的語言模型存放資料夾 | 98 | | raw | 設定此參數,會將樣本進行 tokenize | 99 | 100 | ### (四)生成文章的指令 101 | ``` 102 | python generate.py --length=250 --nsamples=3 --prefix="張無忌見三名老僧在片刻間連斃崑崙派四位高手," --temperature=0.7 --model_path=model/model_epoch100_jinyong/ --save_samples --save_samples_path=output/ 103 | ``` 104 | | 參數 | 說明 | 105 | | ------ | ------ | 106 | | generate.py | 生成文字用主程式 | 107 | | length | 生成文字的長度 | 108 | | nsamples | 生成幾個文章範本 | 109 | | prefix | 生成文章的前導文字,會影響生成的發展 | 110 | | temperature | 生成溫度。溫度越高,模型產生出來的結果越隨機、越不可預測;換言之,使得原先容易被選到的字,抽出的機會變小,平常較少出現的字,被選到的機會稍微增加 | 111 | | model_path | 生成文字所使用的語言模型資料夾路徑 | 112 | | save_samples | 有設定的話,會儲存生成文章的範本 | 113 | | save_samples_path | 生成文章範本的儲存路徑 | 114 | 115 | 有關溫度(temperature)的參考資料 116 | [Deep learning with Python 學習筆記(10)生成式深度學習](https://www.cnblogs.com/zhhfan/p/10335907.html "Deep learning with Python 學習筆記(10)生成式深度學習") 117 | ![更低的溫度= 更確定,更高的溫度= 更隨機](https://img2018.cnblogs.com/blog/1503464/201901/1503464-20190129211804100-1598676964.png "更高的溫度得到的是熵更大的採樣分佈,會生成更加出人意料、更加無結構的生成數據,而更低的溫度對應更小的隨機性,以及更加可預測的生成數據。對同一個概率分佈進行不同的重新加權。") 118 | 119 | ## 五、在 Linux、MacOS 的基本使用方式 120 | 121 | ### (一)訓練文章的指令 122 | ``` 123 | time python3.7 train.py \ 124 | --device=0 \ 125 | --epochs=1 \ 126 | --num_pieces=100 \ 127 | --batch_size=32 \ 128 | --min_length=10 \ 129 | --raw_data_path=data/jinyong.json \ 130 | --output_dir=model/ \ 131 | --raw 132 | ``` 133 | 134 | ### (二)生成文章的指令 135 | ``` 136 | time python3 generate.py \ 137 | --length=250 \ 138 | --nsamples=3 \ 139 | --prefix="張無忌見三名老僧在片刻間連斃崑崙派四位高手," \ 140 | --temperature=0.7 \ 141 | --model_path=model/model_epoch100_jinyong/ \ 142 | --save_samples 
\ 143 | --save_samples_path=output/ 144 | ``` 145 | 146 | ## 六、擷圖 147 | !["訓練過程中,每個 step 輸出 log 的畫面"](https://i.imgur.com/nIh2b7b.png "訓練過程中,每個 step 輸出 log 的畫面") 148 | 149 | 圖 1 訓練過程中,每個 step 輸出 log 的畫面 150 | 151 | ![每一回合訓練的 model,會各別輸出儲存](https://i.imgur.com/dUOe9ZI.png "每一回合訓練的 model,會各別輸出儲存") 152 | 153 | 圖 2 每一回合訓練的 model,會各別輸出儲存 154 | 155 | ![每訓練完 1 個 epoch,會計算與顯示當前回合的訓練時間長](https://i.imgur.com/apTVmn6.png "每訓練完 1 個 epoch,會計算與顯示當前回合的訓練時間長") 156 | 157 | 圖 3 每訓練完 1 個 epoch,會計算與顯示當前回合的訓練時間長(圖片以 epoch 1 為例) 158 | 159 | ![以訓練第 100 回合的 model 來進行生成測試](https://i.imgur.com/0Rq0oK5.png "以訓練第 100 回合的 model 來進行生成測試") 160 | 161 | 圖 4 以訓練第 100 回合的 model 來進行生成測試 162 | 163 | ![訓練 100 epochs 後的畫面,訓練程式結束的畫面](https://i.imgur.com/NEx0J0H.png "訓練 100 epochs 後的畫面,訓練程式結束的畫面") 164 | 165 | 圖 5 訓練 100 epochs 後,訓練程式結束的畫面 166 | 167 | ## 七、預訓練模型與相關檔案連結 168 | [Google 雲端硬碟](https://drive.google.com/drive/folders/1EmqZsb3Lp_M7ftSiKVgHC6xIiWQVmDBe?usp=sharing "Google 雲端硬碟") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python_web_scraping 2 | Python 網路爬蟲講義與範例程式碼 3 | 4 | 5 | ## 提問 6 | - 通則 7 | - 「結業前」可提問、討論,要把多餘時間和資源,留給當前上課的學員。 8 | - 寫信 9 | - E-mail: `darren@darreninfo.cc` 10 | - 信件標題寫上你的**班別和姓名**,或是在哪裡參與我的課程,例如 `[資展 BDSEXX / 臺大計中 / 聯成]` 你的主旨 ○○○。 11 | - 提問的內容要與本專案有關,**其它課程的部分,去請益原本授課的老師**。 12 | - **不要把程式碼寄給我**,可能沒時間看,討論儘量以解決問題的方向為主。 13 | - 不符合以上幾點,將**直接刪除**,敬請見諒。 14 | 15 | 16 | ## 作業 17 | - 僅限授課學員。 18 | - 同學之間可以互相討論,但千萬不要抄襲。 19 | - 使用 `requests` 和 `BeautifulSoup`,或是 `selenium` 來爬取網站資料。 20 | - [Project Gutenberg](https://www.gutenberg.org/) 21 | - 爬取 [中文](https://www.gutenberg.org/browse/languages/zh) 書籍資料 (注意: **只要取得中文字,不要英文字**。) 22 | - 可使用選擇器 `li.pgdbetext > a[href]` 來檢視相關連結數量。 23 | - 取得中文字的正規表示式: [正則表達式-全型英數中文字、常用符號unicode對照表](https://blog.typeart.cc/正則表達式-全型英數中文字、常用符號unicode對照表/) 24 | - `80` 分條件 25 | - 新增 `project_gutenberg` 資料夾,並將每一本書的中文內容存入 txt 檔,txt 
的檔名是超連結名稱,例如 `豆棚閒話.txt`。 26 | - 注意:每一個 txt 都會被存在 `project_gutenberg` 資料夾內。 27 | - `錄製`執行過程,並提供`影片連結`,可以放在 `YouTube` 或是 `Google Drive`,影片當中要`隨機打開 3 個 .txt`,驗證內容是否是純中文字 (不要英文字)。 28 | - 至少要有 `200` 本,少 1 本扣 1 分,要在影片中`顯示 .txt 的數量`,例如在檔案總管的某一個角落,有寫著檔案總數。 29 | - 不用給我看程式碼,也不用邊寫邊執行,錄製的時候直接執行程式、直接爬取資料到 `project_gutenberg` 即可。 30 | - 參考影片: [古騰堡計劃(Project Gutenberg)中文電子書爬取](https://www.youtube.com/watch?v=gKDBiVvzMfk) 31 | - `100` 分條件 (基於 `80` 分條件) 32 | - 使用 `GitHub` 平台來提交作業,並且將 `github repo 連結` 以及 `影片連結` 寄給我。 33 | - Git 與 GitHub 使用教學: [程式與網頁開發者必備技能!Git 和 GitHub 零基礎快速上手,輕鬆掌握版本控制的要訣!](https://www.youtube.com/watch?v=FKXRiAiQFiY) 34 | - Markdown 語法: [如何使用 Markdown 語言撰寫技術文件](https://experienceleague.adobe.com/zh-hant/docs/contributor/contributor-guide/writing-essentials/markdown) 35 | - `repository` 裡面至少要有 `project_gutenberg` 資料夾,還有你的 `.py` 或 `.ipynb` 檔案,以及 `README.md`。 36 | ``` 37 | project_gutenberg/ 38 | project_gutenberg.ipynb (或 .py) 39 | README.md 40 | ``` 41 | - `README.md` 要有說明 (用 `.py` 執行要額外說明執行指令或方法),例如: 42 | ```markdown 43 | # Project Gutenberg 44 | 爬取中文書籍,共 xxx 本。 45 | 46 | ## 安裝套件 47 | - requests (版本號) 48 | - beautifulsoup4 (版本號) 49 | - selenium (版本號) 50 | ... 51 | (版本號可用 pip list,或是 conda list 來檢視) 52 | ... 53 | 54 | ## 成果 55 | ![](執行過程的擷圖或說明圖片) 56 | ... 57 | [影片名稱或其它標題](你的影片連結) 58 | ... 59 | 60 | ## 其它你想要補充標題和內容 61 | ... 62 | ... 
63 | ``` 64 | - 可以參考以前學長的 README 撰寫方式: [FaceBook FanPage Scraper with selenium](https://github.com/nana89823/facebook_scraper) 65 | - 沒交:`0` 分。 66 | - 繳交時間 67 | - 原則上最後一堂課結束後 2 週內,準確時間上課說明。 68 | 69 | 70 | ## 教學參考影片 71 | - [資展國際-OJTP01-18小時-Python網路爬蟲](https://www.youtube.com/playlist?list=PLV4FeK54eNbzgcKtC5s3u7Tv2dZ0BnVsW "資展國際-OJTP01-18小時-Python網路爬蟲") 72 | - [資展國際-BDSE33-Python網路爬蟲 - 12/2, 12/3, 12/9, 12/10](https://www.youtube.com/playlist?list=PLV4FeK54eNbxprT9Sn6FWlcb63u8t0HKt "資展國際-BDSE33-Python網路爬蟲") 73 | - [臺大計算機中心 - Python 網路爬蟲 - 2022/06/20 開班](https://www.youtube.com/playlist?list=PLV4FeK54eNbyZ_rvAAkCICYufOtuQZtTI) 74 | - [資展國際 - 養成班 BDSE22 - Python網路爬蟲](https://www.youtube.com/playlist?list=PLV4FeK54eNbwOKHOH4aWR95fo0cU4wH3O "Python網路爬蟲") 75 | - [臺大計算機中心 - Python 網路爬蟲 - 2021/10/01 開班](https://www.youtube.com/playlist?list=PLV4FeK54eNby0rK-Xpex6baRXE3DG-leg "Python網路爬蟲") 76 | - [資展國際 - 在職班 20210925 至 20210926 - Python 網路爬蟲](https://www.youtube.com/playlist?list=PLV4FeK54eNbwqSdrLfXitmfb4HhB51yOM "Python網路爬蟲") 77 | 78 | 79 | ## 延伸應用 80 | - [Leaflet.js - Web 互動式地圖](https://www.youtube.com/playlist?list=PLV4FeK54eNbwNaCoJomI1jhvgm-A-vOsz) 81 | - [GPT2-Chinese old branch 中文語言模型訓練與生成](https://youtu.be/c3fHRQonqlM) 82 | - [臺大計中電子報 - 第0062期‧2022-09-20 發行 - 使用PyAutoGUI開發桌面自動化程式](https://www.cc.ntu.edu.tw/chinese/epaper/home/20220920_006203.html "臺大計中電子報 - 第0062期‧2022-09-20 發行 - 使用PyAutoGUI開發桌面自動化程式") 83 | - [臺大計中電子報 - 第0059期‧2021.12.20 發行 - 使用GPT2-Chinese生成具有情感的中文對話文字](https://www.cc.ntu.edu.tw/chinese/epaper/0059/20211220_5908.html "臺大計中電子報 - 第0059期‧2021.12.20 發行 - 使用GPT2-Chinese生成具有情感的中文對話文字") 84 | - [臺大計中電子報 - 第0058期‧2021.09.20 發行 - 使用GPT2-Chinese生成中文小說](https://www.cc.ntu.edu.tw/chinese/epaper/0058/20210920_5808.html "臺大計中電子報 - 第0058期‧2021.09.20 發行 - 使用GPT2-Chinese生成中文小說") -------------------------------------------------------------------------------- /cases/.gitignore: -------------------------------------------------------------------------------- 1 | 
lawbank* 2 | -------------------------------------------------------------------------------- /cases/archived/JCC_reject.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "'''\n", 10 | "司法院大法官 -> 解釋及不受理決議 -> 不受理決議\n", 11 | "\n", 12 | "範例連結: https://cons.judicial.gov.tw/jcc/zh-tw/rrp04?page=1\n", 13 | "'''\n", 14 | "import requests, json\n", 15 | "from bs4 import BeautifulSoup\n", 16 | "\n", 17 | "# 整理所有資料的變數\n", 18 | "listData = []\n", 19 | "\n", 20 | "# 走訪總頁數\n", 21 | "pages = 1\n", 22 | "\n", 23 | "# 取得 table 中的列表連結\n", 24 | "def getListItems():\n", 25 | " for page in range(1, pages + 1):\n", 26 | " response = requests.get(f\"https://cons.judicial.gov.tw/jcc/zh-tw/rrp04?page={page}\")\n", 27 | " soup = BeautifulSoup(response.text, 'lxml')\n", 28 | " a_elms = soup.select(\"table.blocky_body.form_table.form_table_second.sm-responsive tbody tr td[data-head='案號'] a\")\n", 29 | " for a in a_elms:\n", 30 | " listData.append({\n", 31 | " \"title\": a.get_text(),\n", 32 | " \"link\": \"https://cons.judicial.gov.tw\" + a[\"href\"]\n", 33 | " })\n", 34 | "\n", 35 | "# 根據先前儲存的列表連結,爬出需要的資訊\n", 36 | "def getItemDetail():\n", 37 | " for index, _dict in enumerate(listData):\n", 38 | " response = requests.get(_dict['link'])\n", 39 | " soup = BeautifulSoup(response.text, 'lxml')\n", 40 | " pre_elms = soup.select('div.item.title-w-8 pre.content.pure_text')\n", 41 | " listData[index]['公布院令'] = pre_elms[0].get_text()\n", 42 | " listData[index]['會次'] = pre_elms[1].get_text()\n", 43 | " listData[index]['日期'] = pre_elms[2].get_text()\n", 44 | " listData[index]['案號'] = pre_elms[3].get_text()\n", 45 | " listData[index]['聲請人'] = pre_elms[4].get_text()\n", 46 | " listData[index]['案由'] = pre_elms[5].get_text()\n", 47 | " listData[index]['決議'] = pre_elms[6].get_text()\n", 48 | "\n", 49 | "# 將所有資訊轉成 JSON 檔\n", 50 | "def 
saveJson():\n", 51 | " with open(\"JCC_reject.json\", \"w\", encoding=\"utf-8\") as file:\n", 52 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )\n", 53 | "\n", 54 | "# 主程式區段\n", 55 | "if __name__ == \"__main__\":\n", 56 | " getListItems()\n", 57 | " getItemDetail()\n", 58 | " saveJson()\n", 59 | " " 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3 (ipykernel)", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.8.8" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 4 84 | } 85 | -------------------------------------------------------------------------------- /cases/archived/crowdfunding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d2dc61bd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# 匯入套件\n", 11 | "import requests as req\n", 12 | "from bs4 import BeautifulSoup as bs\n", 13 | "import re, json\n", 14 | "from time import sleep\n", 15 | "from random import randint\n", 16 | "\n", 17 | "# 隨機取得 User-Agent\n", 18 | "'''\n", 19 | "# 從外部資料來取得清單,清單預設儲存路徑: /tmp\n", 20 | "ua = UserAgent(use_external_data=True)\n", 21 | "# 從外部資料來取得清單,儲存在指定路徑\n", 22 | "ua = UserAgent(use_external_data=True, cache_path=/home/fake_useragent.json)\n", 23 | "\n", 24 | "更詳細的說明,請見以下網頁:\n", 25 | "https://pypi.org/project/fake-useragent/\n", 26 | "'''\n", 27 | "from fake_useragent import UserAgent\n", 28 | "ua = UserAgent(use_external_data=True)\n", 29 | "\n", 30 | "# 設定請求標頭\n", 31 | "my_headers = {\n", 32 | " 'user-agent': ua.random\n", 33 | "}\n", 34 | "\n", 35 | "# 設定 cookie\n", 36 | "my_cookies = 
{\n", 37 | " 'age_checked_for': \"12190\"\n", 38 | "}\n", 39 | "\n", 40 | "# 整理所有取得資料的變數\n", 41 | "listData = []\n", 42 | "\n", 43 | "# 欲抓取資料的網址\n", 44 | "domainName = 'https://www.zeczec.com/'\n", 45 | "\n", 46 | "# 指定最後頁數\n", 47 | "pages = 1\n", 48 | "\n", 49 | "# 取得首頁列表資訊\n", 50 | "def getMainData():\n", 51 | " for page in range(1, pages + 1):\n", 52 | " # 取得回應\n", 53 | " res = req.get(url = f'{domainName}categories?page={page}', headers = my_headers)\n", 54 | "\n", 55 | " # 初始化 soup 物件\n", 56 | " soup = bs(res.text, 'lxml')\n", 57 | "\n", 58 | " # 取得所有超連結\n", 59 | " for a in soup.select('div.flex.gutter3-l a.db'):\n", 60 | " # 取得圖片連結\n", 61 | " strStyle = a.select_one('div.aspect-ratio-project-cover')['data-bg']\n", 62 | " regexImg = r\"https:\\/\\/assets\\.zeczec\\.com\\/asset_\\d+_image_big\\.(jpe?g|png)\"\n", 63 | " matchImg = re.match(regexImg, strStyle)\n", 64 | " strImg = matchImg[0]\n", 65 | "\n", 66 | " # 取得超連結\n", 67 | " strLink = domainName + a['href']\n", 68 | "\n", 69 | " # 取得標題文字\n", 70 | " strTitle = a.select_one('h3.b').get_text()\n", 71 | "\n", 72 | " # 整理首頁資料\n", 73 | " listData.append({\n", 74 | " \"cover\": strImg,\n", 75 | " \"link\": strLink,\n", 76 | " \"title\": strTitle\n", 77 | " })\n", 78 | "\n", 79 | "# 取得詳細頁面資訊\n", 80 | "def getDetailData():\n", 81 | " # 走訪每一個頁面\n", 82 | " for index, _dict in enumerate(listData): \n", 83 | " # 輸出網址,以便於 debug\n", 84 | " print(_dict['link'])\n", 85 | " \n", 86 | " # 取得回應\n", 87 | " res = req.get(url = _dict['link'], headers = my_headers, cookies = my_cookies)\n", 88 | "\n", 89 | " # 初始化 soup 物件\n", 90 | " soup = bs(res.text, 'lxml')\n", 91 | " \n", 92 | " # 取得價格\n", 93 | " strPrice = soup.select_one('div.js-sum-raised').get_text()\n", 94 | " regexPrice = r\"\\D\"\n", 95 | " strPrice = re.sub(regexPrice, \"\", strPrice)\n", 96 | " \n", 97 | " # 取得贊助人數\n", 98 | " strBacker = soup.select_one('span.js-backers-count').get_text()\n", 99 | " \n", 100 | " # 取得剩餘時間\n", 101 | " strTime = \"longterm\"\n", 102 | " 
if soup.select_one('span.js-time-left') != None:\n", 103 | " strTime = soup.select_one('span.js-time-left').get_text()\n", 104 | " regexTime = r\"\\d+\"\n", 105 | " strTime = re.search(regexTime, strTime)[0]\n", 106 | " \n", 107 | " # 取得持續時間\n", 108 | " dictDuration = {\"begin\": \"\", \"end\": \"\"}\n", 109 | " strDuration = soup.select_one('div.mb2.f7').get_text()\n", 110 | " regexDuration = r\"(\\d{4}\\/\\d{2}\\/\\d{2}\\s\\d{2}:\\d{2})(\\s–\\s(\\d{4}\\/\\d{2}\\/\\d{2}\\s\\d{2}:\\d{2}))?\"\n", 111 | " matchDuration = re.search(regexDuration, strDuration)\n", 112 | " dictDuration['begin'] = matchDuration[1]\n", 113 | " if matchDuration[3] != None:\n", 114 | " dictDuration['end'] = matchDuration[3]\n", 115 | " \n", 116 | " # 整理詳細頁面資料\n", 117 | " listData[index]['price'] = strPrice\n", 118 | " listData[index]['backer'] = strBacker\n", 119 | " listData[index]['time'] = strTime\n", 120 | " listData[index]['duration'] = dictDuration\n", 121 | " \n", 122 | " # 隨機等待\n", 123 | " sleep(randint(5,10))\n", 124 | "\n", 125 | "# 儲存成 json 檔案\n", 126 | "def saveJson():\n", 127 | " with open(\"crowdfunding.json\", \"w\", encoding=\"utf-8\") as file:\n", 128 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "c7ef6fb5", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# 主程式\n", 139 | "if __name__ == \"__main__\":\n", 140 | " getMainData()\n", 141 | " getDetailData()\n", 142 | " saveJson()" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3.9.13 ('base')", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": 
"3.9.13" 163 | }, 164 | "vscode": { 165 | "interpreter": { 166 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 167 | } 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /cases/archived/lm_studio/example01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/cases/archived/lm_studio/example01.jpg -------------------------------------------------------------------------------- /cases/archived/lm_studio/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 測試 LLM 串流傳輸效果 7 | 14 | 15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 |
25 | 26 | 27 | 28 | 29 | 30 | 101 | 102 | -------------------------------------------------------------------------------- /cases/archived/lm_studio/web_api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | from openai import OpenAI 3 | 4 | ''' 5 | Flask 初始化 6 | ''' 7 | app = Flask(__name__) 8 | app.json.ensure_ascii = False # 防止中文變成 unicode 編碼 9 | 10 | ''' 11 | OpenAI 設定初始化 12 | ''' 13 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 14 | 15 | ''' 16 | 變數初始化 17 | ''' 18 | # 使用字典來保存每個會話的對話歷史 19 | sessions_history = {} 20 | 21 | ''' 22 | 自訂 router 23 | ''' 24 | # 首頁 (透過 render_template 函式,將 templates/index.html 檔案回傳給前端) 25 | @app.route('/', methods=['GET']) 26 | def index(): 27 | return render_template('index.html') 28 | 29 | # 取得 ai assistant 的回應 30 | @app.route("/chat", methods=["POST"]) 31 | def chat(): 32 | # 取得前端傳來的 JSON 格式資料 33 | data = request.json 34 | 35 | # 取得 session id 和 使用者的訊息 36 | session_id = data.get("session_id") 37 | user_message = data.get("message") 38 | 39 | # 如果 session id 不存在於對話記錄中,針對這個 session id 建立一個新的會話記錄 40 | if session_id not in sessions_history: 41 | sessions_history[session_id] = [ 42 | {"role": "system", "content": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions."} 43 | ] 44 | 45 | # 將使用者的訊息,加入到會話記錄中 46 | sessions_history[session_id].append({"role": "user", "content": user_message}) 47 | 48 | # 生成回應 49 | def generate_responses(): 50 | # 透過 Chat Completions API 來取得 ai assistant 的回應 51 | completion = client.chat.completions.create( 52 | model="audreyt/Taiwan-LLM-7B-v2.0.1-chat-GGUF/Taiwan-LLM-7B-v2.0.1-chat-Q5_K_M.gguf", 53 | messages=sessions_history[session_id], 54 | temperature=0.7, 55 | stream=True, 56 | ) 57 | 58 | # 透過 stream 方式,將 ai assistant 生成的文字一個一個輸出 59 | content = '' 60 | for chunk in completion: 61 | if chunk.choices[0].delta.content: 62 | content += chunk.choices[0].delta.content 63 | yield chunk.choices[0].delta.content 64 | 65 | # 將生成的回應加入到會話歷史中 66 | sessions_history[session_id].append({"role": "assistant", "content": content}) 67 | 68 | # 回傳 ai assistant 生成的回應 69 | return generate_responses(), {"Content-Type": "text/plain"} 70 | 71 | if __name__ == "__main__": 72 | app.run( 73 | # 除錯模式為 True,服務執行期間有錯誤,會將 Traceback 顯示在網頁上, 74 | # 反之則顯示一般的 Internal Server Error 75 | debug=True, 76 | 77 | # 127.0.0.1 或 localhost 限定本機使用服務, 78 | # 0.0.0.0 代表所有知道主機實際 IP 的人都能存取 79 | host='127.0.0.1', 80 | 81 | # 網址或 IP 後面附加的 Port 號,代表服務由該 Port 號提供 82 | port=5000 83 | ) 84 | -------------------------------------------------------------------------------- /cases/archived/lm_studio/web_api_embedding.py: -------------------------------------------------------------------------------- 1 | # Make sure to `pip install openai` first 2 | from openai import OpenAI 3 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 4 | 5 | def get_embedding(text, model="nomic-ai/nomic-embed-text-v1.5-GGUF"): 6 | text = text.replace("\n", " ") 7 | return client.embeddings.create(input = [text], model=model).data[0].embedding 8 | 9 | vector_example = get_embedding("Once upon a time, there was a cat.") 10 | print(vector_example) # 句向量 11 | print("=" * 50) 
12 | print("向量維度:", len(vector_example)) # 維度 13 | print("=" * 50) 14 | 15 | # 計算兩個句子的相似度 16 | import numpy as np 17 | text_a = "Once upon a time, there was a cat." 18 | text_b = "Long times ago, there was a cat." 19 | vector_text_a = get_embedding(text_a) 20 | vector_text_b = get_embedding(text_b) 21 | 22 | # 透過 cosine similarity 計算相似度 23 | cosine_similarity = np.dot(vector_text_a, vector_text_b) / (np.linalg.norm(vector_text_a) * np.linalg.norm(vector_text_b)) 24 | print(cosine_similarity) 25 | print(f"{cosine_similarity * 100:.4f} % 相似度") 26 | -------------------------------------------------------------------------------- /cases/archived/network.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "1b68387f", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'log': {'comment': '',\n", 14 | " 'creator': {'comment': '',\n", 15 | " 'name': 'BrowserMob Proxy',\n", 16 | " 'version': '2.1.4'},\n", 17 | " 'entries': [],\n", 18 | " 'pages': [{'comment': '',\n", 19 | " 'id': 'wine_searcher',\n", 20 | " 'pageTimings': {'comment': ''},\n", 21 | " 'startedDateTime': '2022-03-27T01:32:46.629+08:00',\n", 22 | " 'title': 'wine_searcher'}],\n", 23 | " 'version': '1.2'}}\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "'''\n", 29 | "匯入套件\n", 30 | "'''\n", 31 | "# 操作 browser 的 API\n", 32 | "from selenium import webdriver\n", 33 | "\n", 34 | "# 處理逾時例外的工具\n", 35 | "from selenium.common.exceptions import TimeoutException\n", 36 | "\n", 37 | "# 面對動態網頁,等待某個元素出現的工具,通常與 exptected_conditions 搭配\n", 38 | "from selenium.webdriver.support.ui import WebDriverWait\n", 39 | "\n", 40 | "# 搭配 WebDriverWait 使用,對元素狀態的一種期待條件,若條件發生,則等待結束,往下一行執行\n", 41 | "from selenium.webdriver.support import expected_conditions as EC\n", 42 | "\n", 43 | "# 期待元素出現要透過什麼方式指定,通常與 EC、WebDriverWait 一起使用\n", 44 | "from 
selenium.webdriver.common.by import By\n", 45 | "\n", 46 | "# 強制等待 (執行期間休息一下)\n", 47 | "from time import sleep\n", 48 | "\n", 49 | "# 整理 json 使用的工具\n", 50 | "import json\n", 51 | "\n", 52 | "# 執行 command 的時候用的\n", 53 | "import os\n", 54 | "\n", 55 | "# 引入 regular expression 工具\n", 56 | "import re\n", 57 | "\n", 58 | "# 輸出排版美化的工具\n", 59 | "from pprint import pprint\n", 60 | "\n", 61 | "# 瀏覽器代理工具\n", 62 | "from browsermobproxy import Server\n", 63 | "\n", 64 | "# 剖析網址資訊\n", 65 | "from urllib import parse\n", 66 | "\n", 67 | "# 正規表達式\n", 68 | "import re\n", 69 | "\n", 70 | "# 子處理程序,用來取代 os.system 的功能\n", 71 | "import subprocess\n", 72 | "\n", 73 | "# 美化輸出\n", 74 | "from pprint import pprint\n", 75 | "\n", 76 | "'''\n", 77 | "啟動瀏覽器工具的選項\n", 78 | "'''\n", 79 | "# 選項初始設定\n", 80 | "options = webdriver.ChromeOptions()\n", 81 | "# options.add_argument(\"--headless\") #不開啟實體瀏覽器背景執行\n", 82 | "options.add_argument(\"--start-maximized\") #最大化視窗\n", 83 | "options.add_argument(\"--incognito\") #開啟無痕模式\n", 84 | "options.add_argument(\"--disable-popup-blocking \") #禁用彈出攔截\n", 85 | "\n", 86 | "# 啟動 proxy server 與 proxy client\n", 87 | "dictSetting = {'port': 8090}\n", 88 | "server = Server(\n", 89 | " path = r'.\\browsermob-proxy-2.1.4\\bin\\browsermob-proxy.bat',\n", 90 | " options = dictSetting\n", 91 | ")\n", 92 | "server.start()\n", 93 | "proxy = server.create_proxy()\n", 94 | "\n", 95 | "#\n", 96 | "user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'\n", 97 | "\n", 98 | "# 忽略認證錯誤訊息,以及加入自訂的 proxy\n", 99 | "options.add_argument(\"--ignore-certificate-errors\")\n", 100 | "options.add_argument(f\"--proxy-server={proxy.proxy}\")\n", 101 | "options.add_argument(f'--user-agent={user_agent}')\n", 102 | "\n", 103 | "# chrome 執行檔路徑 (在 unix-like 環境要用 / 這個斜線)\n", 104 | "executable_path = os.getcwd() + \"\\\\\" + \"chromedriver.exe\" \n", 105 | "\n", 106 | "# 使用 Chrome 的 WebDriver (含 options, executable_path)\n", 
107 | "driver = webdriver.Chrome(\n", 108 | " options = options, \n", 109 | " executable_path = executable_path\n", 110 | ")\n", 111 | "\n", 112 | "#\n", 113 | "url = 'https://www.wine-searcher.com/find/mouton+rothschild+pauillac+medoc+bordeaux+france/2012#t4'\n", 114 | "\n", 115 | "# 前往指定連結\n", 116 | "driver.get(url);\n", 117 | "\n", 118 | "# 強制等待\n", 119 | "sleep(10)\n", 120 | "\n", 121 | "# 代理機制設定\n", 122 | "proxy.new_har('wine_searcher', options = {\n", 123 | " 'captureHeaders': True,\n", 124 | " 'captureContent': True\n", 125 | "})\n", 126 | "\n", 127 | "# 這裡的強制等待比較特別,等越久,取得的 Network 面板資訊愈多\n", 128 | "sleep(10)\n", 129 | "\n", 130 | "# 取得所有請求與回應的資訊\n", 131 | "result = proxy.har\n", 132 | "\n", 133 | "#\n", 134 | "pprint(result)" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.8.8" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 5 159 | } 160 | -------------------------------------------------------------------------------- /cases/archived/open_access/deprecated/中國學術年刊.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "10daaa05", 7 | "metadata": { 8 | "scrolled": true 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "'''\n", 13 | "中國學術年刊\n", 14 | "http://140.122.64.125/SIS/app/paper.php\n", 15 | "\n", 16 | "\n", 17 | "技術參考連結:\n", 18 | "[1] Python requests 中文亂碼解決方法 \n", 19 | "https://sjkou.net/2017/01/06/python-requests-encoding/\n", 20 | "'''\n", 21 | "\n", 22 | "'''\n", 23 | "匯入套件\n", 24 | "'''\n", 25 | "# 請求套件\n", 26 | 
"import requests as req\n", 27 | "\n", 28 | "# 格式化輸出工具\n", 29 | "from pprint import pprint as pp\n", 30 | "\n", 31 | "# HTML Parser\n", 32 | "from bs4 import BeautifulSoup as bs\n", 33 | "\n", 34 | "# 強制等待 (執行期間休息一下)\n", 35 | "from time import sleep\n", 36 | "\n", 37 | "# 整理 json 使用的工具\n", 38 | "import json\n", 39 | "\n", 40 | "# regular expression 工具\n", 41 | "import re\n", 42 | "\n", 43 | "# 子處理程序,用來取代 os.system 的功能\n", 44 | "import subprocess\n", 45 | "\n", 46 | "# 建立隨機數\n", 47 | "from random import randint\n", 48 | "\n", 49 | "# 資料庫 (sqlite3)\n", 50 | "import sqlite3\n", 51 | "\n", 52 | "# Excel 工具\n", 53 | "from openpyxl import load_workbook\n", 54 | "from openpyxl import Workbook\n", 55 | "\n", 56 | "# 時間工具\n", 57 | "from datetime import datetime\n", 58 | "\n", 59 | "# 其它\n", 60 | "import json, os, sys\n", 61 | "\n", 62 | "'''設定'''\n", 63 | "# 主要首頁\n", 64 | "prefix = 'http://140.122.64.125/'\n", 65 | "infix = 'SIS/app'\n", 66 | "url = prefix + infix + '/paper.php'\n", 67 | "\n", 68 | "# JSON 存檔路徑\n", 69 | "json_path = \"./中國學術年刊.json\"\n", 70 | "\n", 71 | "# 自訂標頭\n", 72 | "my_headers = {\n", 73 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'\n", 74 | "}\n", 75 | "\n", 76 | "# 整理資料用的變數\n", 77 | "listData = []\n", 78 | "\n", 79 | "# 建立儲存檔案用的資料夾,不存在就新增\n", 80 | "folderPath = '中國學術年刊'\n", 81 | "if not os.path.exists(folderPath):\n", 82 | " os.makedirs(folderPath)\n", 83 | "\n", 84 | " \n", 85 | "'''程式區域'''\n", 86 | "# 取得分頁資訊\n", 87 | "def getPaginationLinks():\n", 88 | " # 建立 parser\n", 89 | " res = req.get(url, headers = my_headers)\n", 90 | " res.encoding = 'utf-8'\n", 91 | " soup = bs(res.text, \"lxml\")\n", 92 | " \n", 93 | " # 取得分頁連結\n", 94 | " regex = r\"\\s(\\d+)\\s\" \n", 95 | " a_elms = soup.select('ol.PageList li a')\n", 96 | " for a in a_elms:\n", 97 | " match = re.search(regex, a.get_text())\n", 98 | " if match != None:\n", 99 | " listData.append({\n", 100 | " 
\"page\": match[1],\n", 101 | " \"link\": url + '' + a['href']\n", 102 | " })\n", 103 | " \n", 104 | "# 取得 主要 連結\n", 105 | "def getMainLinks():\n", 106 | " for index, _dict in enumerate(listData):\n", 107 | " # 沒有 sub 屬性,則建立,為了放置細節頁的內容\n", 108 | " if \"sub\" not in listData[index]:\n", 109 | " listData[index]['sub'] = []\n", 110 | " \n", 111 | " # 建立 parser\n", 112 | " res = req.get(_dict['link'], headers = my_headers)\n", 113 | " res.encoding = 'utf-8'\n", 114 | " soup = bs(res.text, \"lxml\")\n", 115 | " \n", 116 | " # 取得主要連結\n", 117 | " a_elms = soup.select('table#RSS_Table_page_paper_1 tbody tr td a')\n", 118 | " for a in a_elms:\n", 119 | " listData[index]['sub'].append({\n", 120 | " \"title\": a.get_text(),\n", 121 | " \"link\": prefix + '/' + infix + '/' + a['href']\n", 122 | " })\n", 123 | " \n", 124 | "# 取得 PDF 連結\n", 125 | "def getPdfLinks():\n", 126 | " for index, _dict in enumerate(listData):\n", 127 | " for idx, _d in enumerate(listData[index]['sub']):\n", 128 | " # 沒有 sub 屬性,則建立,為了放置細節頁的內容\n", 129 | " if \"downloads\" not in listData[index]['sub'][idx]:\n", 130 | " listData[index]['sub'][idx]['downloads'] = []\n", 131 | " \n", 132 | " # 過濾出 pdf 連結\n", 133 | " regex = r\"https?:\\/\\/.+\\.pdf\"\n", 134 | " \n", 135 | " # 建立 parser\n", 136 | " res = req.get(_d['link'], headers = my_headers)\n", 137 | " res.encoding = 'utf-8'\n", 138 | " soup = bs(res.text, \"lxml\")\n", 139 | "\n", 140 | " # 取得該頁的每一個 tr\n", 141 | " tr_elms = soup.select('table#RSS_Table_page_paper_1 tbody tr')\n", 142 | " if len(tr_elms) > 0:\n", 143 | " for tr in tr_elms:\n", 144 | " # 取得該 tr 下的所有 td\n", 145 | " td_elms = tr.select('td')\n", 146 | " \n", 147 | " # 期數\n", 148 | " title = td_elms[0].get_text()\n", 149 | " \n", 150 | " # 作者\n", 151 | " author = td_elms[1].get_text()\n", 152 | " \n", 153 | " # 頁碼\n", 154 | " page = td_elms[2].get_text()\n", 155 | " \n", 156 | " # 篇名\n", 157 | " journal_name = td_elms[3].get_text()\n", 158 | " \n", 159 | " # 篇名超連結\n", 160 | " link = prefix 
+ '/' + infix + '/' + td_elms[3].select_one('a')['href']\n", 161 | " \n", 162 | " # pdf 連結\n", 163 | " if len(td_elms[4].select('a')) > 0:\n", 164 | " pdf_link = prefix + td_elms[4].select('a')[1]['href']\n", 165 | " else:\n", 166 | " pdf_link = None\n", 167 | "\n", 168 | " # 整理資料\n", 169 | " listData[index]['sub'][idx]['downloads'].append({\n", 170 | " \"title\": title,\n", 171 | " \"author\": author,\n", 172 | " \"page\": page,\n", 173 | " \"journal_name\": journal_name,\n", 174 | " \"link\": link,\n", 175 | " \"pdf_link\": pdf_link\n", 176 | " })\n", 177 | " \n", 178 | " # (選項) 下載 PDF\n", 179 | " os.system(f\"curl {pdf_link} -o {folderPath}/{journal_name}.pdf\")\n", 180 | " \n", 181 | "# 儲存成 JSON\n", 182 | "def saveToJson():\n", 183 | " with open(json_path, \"w\", encoding=\"utf-8\") as file:\n", 184 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )\n", 185 | " \n", 186 | "'''執行區域'''\n", 187 | "if __name__ == \"__main__\":\n", 188 | " getPaginationLinks()\n", 189 | " getMainLinks()\n", 190 | " getPdfLinks()\n", 191 | " saveToJson()" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.8.8" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /cases/archived/open_access/deprecated/政大中文學報.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "25080100", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''\n", 11 | 
"政大中文學報\n", 12 | "http://ctma.nccu.edu.tw/chibulletin/app/paper.php\n", 13 | "'''\n", 14 | "\n", 15 | "'''\n", 16 | "匯入套件\n", 17 | "'''\n", 18 | "# 請求套件\n", 19 | "import requests as req\n", 20 | "\n", 21 | "# 格式化輸出工具\n", 22 | "from pprint import pprint as pp\n", 23 | "\n", 24 | "# HTML Parser\n", 25 | "from bs4 import BeautifulSoup as bs\n", 26 | "\n", 27 | "# 強制等待 (執行期間休息一下)\n", 28 | "from time import sleep\n", 29 | "\n", 30 | "# 整理 json 使用的工具\n", 31 | "import json\n", 32 | "\n", 33 | "# regular expression 工具\n", 34 | "import re\n", 35 | "\n", 36 | "# 子處理程序,用來取代 os.system 的功能\n", 37 | "import subprocess\n", 38 | "\n", 39 | "# 建立隨機數\n", 40 | "from random import randint\n", 41 | "\n", 42 | "# 資料庫 (sqlite3)\n", 43 | "import sqlite3\n", 44 | "\n", 45 | "# Excel 工具\n", 46 | "from openpyxl import load_workbook\n", 47 | "from openpyxl import Workbook\n", 48 | "\n", 49 | "# 時間工具\n", 50 | "from datetime import datetime\n", 51 | "\n", 52 | "# 其它\n", 53 | "import json, os, sys\n", 54 | "\n", 55 | "'''\n", 56 | "設定\n", 57 | "'''\n", 58 | "# 主要首頁\n", 59 | "prefix = \"http://ctma.nccu.edu.tw/\"\n", 60 | "infix = 'chibulletin/app'\n", 61 | "url = prefix + infix + '/paper.php'\n", 62 | "\n", 63 | "# JSON 存檔路徑\n", 64 | "json_path = \"./政大中文學報.json\"\n", 65 | "\n", 66 | "# 自訂標頭\n", 67 | "my_headers = {\n", 68 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'\n", 69 | "}\n", 70 | "\n", 71 | "# 整理資料用的變數\n", 72 | "listData = []\n", 73 | "\n", 74 | "# 建立儲存檔案用的資料夾,不存在就新增\n", 75 | "folderPath = '政大中文學報'\n", 76 | "if not os.path.exists(folderPath):\n", 77 | " os.makedirs(folderPath)\n", 78 | "\n", 79 | "'''\n", 80 | "程式區域\n", 81 | "'''\n", 82 | "# 取得分頁資訊\n", 83 | "def getPaginationLinks():\n", 84 | " # 建立 parser\n", 85 | " res = req.get(url, headers = my_headers)\n", 86 | " res.encoding = 'utf-8'\n", 87 | " soup = bs(res.text, \"lxml\")\n", 88 | " \n", 89 | " # 取得分頁資訊\n", 90 | " regex = r\"\\s(\\d+)\\s\" 
\n", 91 | " a_elms = soup.select('ol.PageList li a')\n", 92 | " for a in a_elms:\n", 93 | " match = re.search(regex, a.get_text())\n", 94 | " if match != None:\n", 95 | " listData.append({\n", 96 | " \"page\": match[1],\n", 97 | " \"link\": url + '' + a['href']\n", 98 | " }) \n", 99 | "\n", 100 | "# 取得主要頁面所有連結資訊\n", 101 | "def getMainLinks():\n", 102 | " for index, _dict in enumerate(listData):\n", 103 | " # 沒有 sub 屬性,則建立,為了放置細節頁的內容\n", 104 | " if \"sub\" not in listData[index]:\n", 105 | " listData[index]['sub'] = []\n", 106 | " \n", 107 | " # 建立 parser\n", 108 | " res = req.get(_dict['link'], headers = my_headers)\n", 109 | " res.encoding = 'utf-8'\n", 110 | " soup = bs(res.text, \"lxml\")\n", 111 | " \n", 112 | " # 取得主要連結\n", 113 | " a_elms = soup.select('table#RSS_Table_page_paper_1 tbody tr td a')\n", 114 | " for a in a_elms:\n", 115 | " listData[index]['sub'].append({\n", 116 | " \"title\": a.get_text(),\n", 117 | " \"link\": prefix + '/' + infix + '/' + a['href']\n", 118 | " })\n", 119 | " \n", 120 | "# 取得 PDF 連結\n", 121 | "def getPdfLinks():\n", 122 | " for index, _dict in enumerate(listData):\n", 123 | " for idx, _d in enumerate(listData[index]['sub']):\n", 124 | " # 沒有 sub 屬性,則建立,為了放置細節頁的內容\n", 125 | " if \"downloads\" not in listData[index]['sub'][idx]:\n", 126 | " listData[index]['sub'][idx]['downloads'] = []\n", 127 | " \n", 128 | " # 過濾出 pdf 連結\n", 129 | " regex = r\"https?:\\/\\/.+\\.pdf\"\n", 130 | " \n", 131 | " # 建立 parser\n", 132 | " res = req.get(_d['link'], headers = my_headers)\n", 133 | " res.encoding = 'utf-8'\n", 134 | " soup = bs(res.text, \"lxml\")\n", 135 | "\n", 136 | " # 取得該頁的每一個 tr\n", 137 | " tr_elms = soup.select('table#RSS_Table_page_paper_1 tbody tr')\n", 138 | " if len(tr_elms) > 0:\n", 139 | " for tr in tr_elms:\n", 140 | " # 取得該 tr 下的所有 td\n", 141 | " td_elms = tr.select('td')\n", 142 | " \n", 143 | " # 期數\n", 144 | " title = td_elms[0].get_text()\n", 145 | " \n", 146 | " # 作者\n", 147 | " author = td_elms[1].get_text()\n", 148 
| " \n", 149 | " # 頁碼\n", 150 | " page = td_elms[2].get_text()\n", 151 | " \n", 152 | " # 篇名\n", 153 | " journal_name = td_elms[3].get_text()\n", 154 | " \n", 155 | " # 篇名超連結\n", 156 | " link = prefix + '/' + infix + '/' + td_elms[3].select_one('a')['href']\n", 157 | " \n", 158 | " # pdf 連結\n", 159 | " if len(td_elms[4].select('a')) > 0:\n", 160 | " pdf_link = prefix + td_elms[4].select('a')[1]['href']\n", 161 | " else:\n", 162 | " pdf_link = None\n", 163 | "\n", 164 | " # 整理資料\n", 165 | " listData[index]['sub'][idx]['downloads'].append({\n", 166 | " \"title\": title,\n", 167 | " \"author\": author,\n", 168 | " \"page\": page,\n", 169 | " \"journal_name\": journal_name,\n", 170 | " \"link\": link,\n", 171 | " \"pdf_link\": pdf_link\n", 172 | " })\n", 173 | " \n", 174 | " # (選項) 下載 PDF\n", 175 | " os.system(f\"curl {pdf_link} -o {folderPath}/{journal_name}.pdf\")\n", 176 | "\n", 177 | "# 儲存成 JSON\n", 178 | "def saveToJson():\n", 179 | " with open(f\"{folderPath}/{json_path}\", \"w\", encoding=\"utf-8\") as file:\n", 180 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )\n", 181 | " \n", 182 | "'''\n", 183 | "執行區域\n", 184 | "'''\n", 185 | "if __name__ == \"__main__\":\n", 186 | " getPaginationLinks()\n", 187 | " getMainLinks()\n", 188 | " getPdfLinks()\n", 189 | " saveToJson()" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 3", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.8.8" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 5 214 | } 215 | -------------------------------------------------------------------------------- /cases/archived/open_access/deprecated/東吳大學政治學報.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e1ed71e2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''\n", 11 | "安裝 tika\n", 12 | "\n", 13 | "參考網頁\n", 14 | "https://pypi.org/project/tika/\n", 15 | "'''\n", 16 | "!pip install tika" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "dde939c4", 23 | "metadata": { 24 | "scrolled": false 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "'''\n", 29 | "東吳大學 政治學報\n", 30 | "https://web-ch.scu.edu.tw/politics/file/11106\n", 31 | "\n", 32 | "參考連結:\n", 33 | "[1] python selenium 對瀏覽器標籤頁進行關閉和切換\n", 34 | "https://www.itread01.com/content/1543567328.html\n", 35 | "'''\n", 36 | "\n", 37 | "\n", 38 | "'''\n", 39 | "匯入套件\n", 40 | "'''\n", 41 | "# 操作 browser 的 API\n", 42 | "from selenium import webdriver\n", 43 | "\n", 44 | "# 處理逾時例外的工具\n", 45 | "from selenium.common.exceptions import TimeoutException\n", 46 | "\n", 47 | "# 面對動態網頁,等待某個元素出現的工具,通常與 exptected_conditions 搭配\n", 48 | "from selenium.webdriver.support.ui import WebDriverWait\n", 49 | "\n", 50 | "# 搭配 WebDriverWait 使用,對元素狀態的一種期待條件,若條件發生,則等待結束,往下一行執行\n", 51 | "from selenium.webdriver.support import expected_conditions as EC\n", 52 | "\n", 53 | "# 期待元素出現要透過什麼方式指定,通常與 EC、WebDriverWait 一起使用\n", 54 | "from selenium.webdriver.common.by import By\n", 55 | "\n", 56 | "# 強制等待 (執行期間休息一下)\n", 57 | "from time import sleep\n", 58 | "\n", 59 | "# 整理 json 使用的工具\n", 60 | "import json\n", 61 | "\n", 62 | "# 執行 command 的時候用的\n", 63 | "import os\n", 64 | "\n", 65 | "# 正規表達式\n", 66 | "import re\n", 67 | "\n", 68 | "# 使用 tika 的 parser\n", 69 | "from tika import parser\n", 70 | "\n", 71 | "'''\n", 72 | "Selenium with Python 中文翻譯文檔\n", 73 | "參考網頁:https://selenium-python-zh.readthedocs.io/en/latest/index.html\n", 74 | "selenium 啓動 Chrome 的進階配置參數\n", 75 | 
"參考網址:https://stackoverflow.max-everyday.com/2019/12/selenium-chrome-options/\n", 76 | "Mouse Hover Action in Selenium\n", 77 | "參考網址:https://www.toolsqa.com/selenium-webdriver/mouse-hover-action/\n", 78 | "yt-dlp 下載影音的好工具\n", 79 | "參考網址:https://github.com/yt-dlp/yt-dlp\n", 80 | "'''\n", 81 | "\n", 82 | "# 啟動瀏覽器工具的選項\n", 83 | "my_options = webdriver.ChromeOptions()\n", 84 | "# my_options.add_argument(\"--headless\") #不開啟實體瀏覽器背景執行\n", 85 | "my_options.add_argument(\"--start-maximized\") #最大化視窗\n", 86 | "my_options.add_argument(\"--incognito\") #開啟無痕模式\n", 87 | "my_options.add_argument(\"--disable-popup-blocking\") #禁用彈出攔截\n", 88 | "my_options.add_argument(\"--disable-notifications\") #取消通知\n", 89 | "my_options.add_argument(\"--lang=zh-TW\") #設定為正體中文\n", 90 | "\n", 91 | "# 指定 chromedriver 檔案的路徑\n", 92 | "driver_exec_path = './chromedriver.exe'\n", 93 | "\n", 94 | "# 使用 Chrome 的 WebDriver\n", 95 | "driver = webdriver.Chrome( \n", 96 | " options = my_options, \n", 97 | " executable_path = driver_exec_path\n", 98 | ")\n", 99 | "\n", 100 | "# 放置爬取的資料\n", 101 | "listData = []\n", 102 | "\n", 103 | "# 建立資料夾\n", 104 | "folderPath = 'parsed_files'\n", 105 | "if not os.path.exists(folderPath):\n", 106 | " os.makedirs(folderPath)\n", 107 | "\n", 108 | "# 網址\n", 109 | "url = 'https://web-ch.scu.edu.tw/politics/file/11106'\n", 110 | "\n", 111 | "'''\n", 112 | "函式\n", 113 | "'''\n", 114 | "# 走訪頁面\n", 115 | "def visit():\n", 116 | " driver.get(url);\n", 117 | " \n", 118 | "# 取得主要連結\n", 119 | "def getMainData():\n", 120 | " try:\n", 121 | " # 等待主要連結出現\n", 122 | " WebDriverWait(driver, 10).until(\n", 123 | " EC.presence_of_element_located(\n", 124 | " (\n", 125 | " By.CSS_SELECTOR, \n", 126 | " \"div#rndbox_body table.table.table-striped.table-bordered tbody td a\"\n", 127 | " )\n", 128 | " )\n", 129 | " )\n", 130 | " \n", 131 | " # 得到所有連結的數量\n", 132 | " count = len(driver.find_elements(By.CSS_SELECTOR, \"div#rndbox_body table.table.table-striped.table-bordered tbody td a\"))\n", 
133 | " \n", 134 | " # 切換到新分頁(初始分頁代號為 0,新開的為 1,所以切換到 1,代表移到分頁去操作)\n", 135 | " for i in range(count):\n", 136 | " # 開啟新分頁\n", 137 | " driver.execute_script(f'window.open(\"{url}\", \"_blank\");')\n", 138 | " \n", 139 | " # 等待一下\n", 140 | " sleep(7)\n", 141 | " \n", 142 | " # 切換到分頁\n", 143 | " driver.switch_to.window(driver.window_handles[1])\n", 144 | " \n", 145 | " # 等元素出現\n", 146 | " WebDriverWait(driver, 10).until(\n", 147 | " EC.presence_of_element_located(\n", 148 | " (\n", 149 | " By.CSS_SELECTOR, \n", 150 | " \"div#rndbox_body table.table.table-striped.table-bordered tbody td a\")\n", 151 | " )\n", 152 | " )\n", 153 | " \n", 154 | " # 取得 a 元素集合\n", 155 | " a_elms = driver.find_elements(By.CSS_SELECTOR, \"div#rndbox_body table.table.table-striped.table-bordered tbody td a\")\n", 156 | " \n", 157 | " # 按下超連結\n", 158 | " a_elms[i].click()\n", 159 | " \n", 160 | " # 等 pdf 資訊出現\n", 161 | " WebDriverWait(driver, 10).until(\n", 162 | " EC.presence_of_element_located(\n", 163 | " (\n", 164 | " By.CSS_SELECTOR,\n", 165 | " \"div#article ol li h2 a\"\n", 166 | " )\n", 167 | " )\n", 168 | " )\n", 169 | " \n", 170 | " # 輸出所有 pdf 資訊\n", 171 | " a_article_elms = driver.find_elements(By.CSS_SELECTOR, \"div#article ol li h2 a\")\n", 172 | " for a_article in a_article_elms:\n", 173 | " print(f\"文章名稱: {a_article.get_attribute('innerText')}\")\n", 174 | " print(f\"PDF連結: {a_article.get_attribute('href')}\")\n", 175 | " \n", 176 | " # 簡單資料清理\n", 177 | " title = re.sub(r\"\\?|’|:| |\\s\", \"_\", a_article.get_attribute('innerText'))\n", 178 | " \n", 179 | " # 讀取兩篇 pdf 的內文,並寫入檔案\n", 180 | " with open(f\"{folderPath}/{title}.txt\", \"w\", encoding=\"utf-8\") as file:\n", 181 | " parsed_pdf = parser.from_file( a_article.get_attribute('href') )\n", 182 | " file.write( parsed_pdf['content'] )\n", 183 | " \n", 184 | " # 關閉當前分頁\n", 185 | " driver.close()\n", 186 | " \n", 187 | " # 切換到初始分頁\n", 188 | " driver.switch_to.window(driver.window_handles[0])\n", 189 | " \n", 190 | " except 
TimeoutException:\n", 191 | " print(\"等不到指定元素出現…\")\n", 192 | "\n", 193 | " \n", 194 | "# 關閉瀏覽器\n", 195 | "def close():\n", 196 | " driver.quit()\n", 197 | " \n", 198 | "'''\n", 199 | "主程式\n", 200 | "'''\n", 201 | "if __name__ == '__main__':\n", 202 | " visit()\n", 203 | " getMainData()\n", 204 | " close()" 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 3", 211 | "language": "python", 212 | "name": "python3" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 3 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython3", 224 | "version": "3.8.8" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 5 229 | } 230 | -------------------------------------------------------------------------------- /cases/archived/tika.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "6f2ab3a6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# 安裝 tika\n", 11 | "!pip install -U tika" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "db754896", 18 | "metadata": { 19 | "scrolled": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# 匯入套件\n", 24 | "from tika import parser\n", 25 | "from pprint import pprint\n", 26 | "\n", 27 | "# 開啟 pdf 檔案\n", 28 | "parsed_pdf = parser.from_file(\"./你的檔案.pdf\")\n", 29 | "\n", 30 | "# 輸出 pdf 內容\n", 31 | "print( parsed_pdf['content'] )" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "28d23c9e", 37 | "metadata": {}, 38 | "source": [ 39 | "# 參考資料\n", 40 | "1. [Apache Tika - a content analysis toolkit](https://tika.apache.org/ \"Apache Tika - a content analysis toolkit\")\n", 41 | "2. 
[Parsing PDFs in Python with Tika](https://www.geeksforgeeks.org/parsing-pdfs-in-python-with-tika/ \"Parsing PDFs in Python with Tika\")\n", 42 | "3. [tika 1.24 - Apache Tika Python library](https://pypi.org/project/tika/ \"tika 1.24 - Apache Tika Python library\")" 43 | ] 44 | } 45 | ], 46 | "metadata": { 47 | "kernelspec": { 48 | "display_name": "Python 3 (ipykernel)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.9.13" 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 5 67 | } 68 | -------------------------------------------------------------------------------- /cases/archived/twitter/README.md: -------------------------------------------------------------------------------- 1 | # twint - 不用登入帳號密碼和 API key,就能抓取 twitter 資料的利器 2 | 3 | ## 基本使用教學 4 | - Scraping Twitter with Twint 5 | - [https://nealcaren.org/lessons/twint/](https://nealcaren.org/lessons/twint/) 6 | 7 | 8 | ## twint 版本 9 | - [官方版本](https://github.com/twintproject/twint) 10 | - [woluxwolu 的版本](https://github.com/woluxwolu/twint) 11 | - [minamotorin 的版本](https://github.com/minamotorin/twint) 12 | 13 | 14 | ## 補充: 有關 woluxwolu 的 twint 版本 15 | woluxwolu 的 twint 版本 README 有寫「Modified by minamotorin.」,我看了一下 minamotorin 的 github,minamotorin 有提到「About problem of Twint from this fork, please open issues on minamotorin/twint.」 16 | 17 | minamotorin 的 twint 版本: [連結](https://github.com/minamotorin/twint) 18 | 19 | 感覺 minamotorin 的 twint 會比 woluxwolu 的版本新,目前先用 woluxwolu 的版本先跑一陣子看看,同時關注 minamotorin 的 twint 修正訊息。 20 | 21 | --- 22 | 23 | # twint 安裝方式: 24 | 以下 twint 版本皆可選擇,目前推薦 minamotorin 的版本 25 | 26 | ## 1. 
minamotorin 將官方 twint 專案 fork 後,自行修正的版本 (推薦使用) 27 | ```bash 28 | $ git clone https://github.com/minamotorin/twint.git 29 | $ cd twint 30 | $ pip install git+https://github.com/minamotorin/twint.git 31 | ``` 32 | - 將 `basic.py` 複製/移動到 twint 資料夾中 33 | - (Optional) 修改 url.py 34 | - 如果發現 `config.lang = 'en'` 的設定有問題(例如大量非指定語系的資料),可以嘗試以下作法: 35 | - 開啟 `/twint/twint/url.py` 36 | - 大約在 111 ~ 113 之間,會看到「`if config.Search:`」,在它上面加一個給 lang 用的設定: 37 | ```python 38 | if config.Lang: 39 | q += f" lang:{config.Lang}" 40 | ``` 41 | - 儲存 url.py 42 | - 安裝 nest_asyncio 套件 43 | - `$ pip install nest_asyncio` 44 | - 安裝 aiohttp 套件 45 | - `$ pip install aiohttp==3.7.0` 46 | - 回到 twint 資料夾,執行程式 47 | - `$ python basic.py` 48 | 49 | ## 2. woluxwolu 直接下載官方 twint,再拿 minamotorin 原始碼來套用的版本 50 | [Issue] Search just stops scraping 51 | https://github.com/twintproject/twint/issues/1363 52 | ```bash 53 | $ git clone https://github.com/woluxwolu/twint.git 54 | $ cd twint 55 | $ pip install git+https://github.com/woluxwolu/twint.git 56 | ``` 57 | - 將 `basic.py` 複製/移動到 twint 資料夾中 58 | - (Optional) 修改 url.py 59 | - 如果發現 `config.lang = 'en'` 的設定有問題(例如大量非指定語系的資料),可以嘗試以下作法: 60 | - 開啟 `/twint/twint/url.py` 61 | - 大約在 111 ~ 113 之間,會看到「`if config.Search:`」,在它上面加一個給 lang 用的設定: 62 | ```python 63 | if config.Lang: 64 | q += f" lang:{config.Lang}" 65 | ``` 66 | - 儲存 url.py 67 | - 安裝 nest_asyncio 套件 68 | - `$ pip install nest_asyncio` 69 | - 安裝 aiohttp 套件 70 | - `$ pip install aiohttp==3.7.0` 71 | - 回到 twint 資料夾,執行程式 72 | - `$ python basic.py` 73 | 74 | ## 3. 
官方版本安裝方式 75 | - 第一步:下載 twint 76 | - `$ git clone https://github.com/twintproject/twint.git` 77 | - 或是選擇 Download Zip,而後解壓縮,出現 twint 資料夾 78 | - 進入 twint 資料夾 79 | - `$ cd twint` 80 | - 將 `basic.py` 複製/移動一份到 twint 資料夾中 81 | - 第二步:安裝套件 82 | - 開啟 requirements.txt 83 | - 將「aiohttp」改成「aiohttp==3.7.0」後儲存 84 | - 修正議題參考連結 https://ppt.cc/fJkVDx 85 | - `pip install -r requirements.txt` 86 | - `pip install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint` 87 | - (Install from source code) 此時 twint 資料夾裡面,會自動產生 src 資料夾 88 | - `pip install aiohttp==3.7.0` 89 | - Optional:修改 url.py 90 | - 如果發現 `config.lang = 'en'` 的設定有問題(例如大量非指定語系的資料),可以嘗試以下作法: 91 | - 開啟 `src/twint/url.py` 92 | - 大約在 111 ~ 113 之間,會看到「`if config.Search:`」,在它上面加一個給 lang 用的設定: 93 | ```python 94 | if config.Lang: 95 | q += f" lang:{config.Lang}" 96 | ``` 97 | - 儲存 url.py 98 | - 第三步:執行 basic.py 99 | - 回到剛開始進入的 twint 資料夾 100 | - `$ python basic.py` -------------------------------------------------------------------------------- /cases/archived/twitter/basic.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 匯入套件 3 | ''' 4 | import twint 5 | import nest_asyncio 6 | nest_asyncio.apply() 7 | 8 | # twint 設定,可參考 https://github.com/twintproject/twint/blob/master/twint/config.py 9 | config = twint.Config() 10 | 11 | # twint 設定初始化 12 | config.Lang = 'zh-tw' # 語系: https://developer.twitter.com/en/docs/twitter-for-websites/supported-languages 13 | config.Search = '"機器學習" OR "自然語言處理"' # 搜尋關鍵字 14 | config.Limit = 1000000000000 # 限定抓多少筆資料 15 | config.Since = '2020-01-01 00:00:00' # 設定 since 與 until 16 | config.Until = '2022-06-30 23:59:59' # since 與 until 記得至少相差 1 天 17 | # config.Geo = "48.880048,2.385939,5km" 18 | config.Location = True # 加入發文地點 19 | config.Hide_output = False # 是否隱藏輸出 (debug 時可以開啟,正式使用時,建議關閉,可以減少 I/O) 20 | config.Store_csv = True # 儲存在 csv 檔 21 | config.Output = "./twitter.csv" # 輸出儲存資料的路徑 22 | config.Resume = './resume.txt' # 
爬取過程會儲存先前取得的 tweet ID,當程式因故中斷時,它會從該 ID 開始 23 | 24 | # 進行資料抓取 (注意: 正常來說,執行幾個小時就會被程式自動中斷) 25 | twint.run.Search(config) -------------------------------------------------------------------------------- /cases/archived/twitter/twint_run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 匯入套件 3 | ''' 4 | import twint 5 | import os, sys 6 | from random import randint 7 | from time import time, sleep 8 | import nest_asyncio 9 | nest_asyncio.apply() 10 | 11 | 12 | ''' 13 | 自訂函式 14 | ''' 15 | # 取得 tweet 資料 16 | def get_tweets(lang, keywords, since, until, save_path): 17 | # twint 設定 18 | config = twint.Config() 19 | 20 | # twint 設定初始化 21 | config.Lang = lang 22 | config.Search = keywords 23 | config.Limit = 1000000000000 24 | config.Since = since 25 | config.Until = until 26 | # config.Geo = "48.880048,2.385939,5km" 27 | config.Location = True 28 | config.Hide_output = False 29 | config.Store_csv = True 30 | config.Output = save_path 31 | config.Resume = './resume.txt' 32 | 33 | # 進行資料抓取 34 | twint.run.Search(config) 35 | 36 | # 主要執行程式 37 | def main(): 38 | # 設定初始值 39 | lang = 'zh-tw' 40 | keywords = '"機器學習" OR "自然語言處理"' 41 | since = '2022-01-01 00:00:00' 42 | until = '2023-04-15 23:59:59' 43 | save_path = './twitter.csv' 44 | 45 | try: 46 | while True: 47 | # 取得 tweets 48 | print("執行中") 49 | get_tweets(lang, keywords, since, until, save_path) 50 | 51 | print("休息一下") 52 | sleep(randint(1800, 2400)) 53 | except Exception as e: 54 | exc_type, exc_obj, exc_tb = sys.exc_info() 55 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 56 | print(exc_type, fname, exc_tb.tb_lineno) 57 | print(str(e)) 58 | 59 | # 等待一段時間,再繼續執行 (可能會佔用一定比例記憶體,記得隨時觀察,太高就先停掉) 60 | print("主程式拋出例外,停止執行; 等待一段時間,程式自動重啟") 61 | sleep(randint(1800, 2400)) 62 | main() 63 | 64 | ''' 65 | 主程式區域 66 | ''' 67 | if __name__ == "__main__": 68 | # 執行開始時間 69 | t_sec = time() 70 | 71 | # 主要執行程式 72 | main() 73 | 74 | # 完整執行花費時間 75 | print(f"完整執行花費時間: {time() - t_sec} 
秒。") -------------------------------------------------------------------------------- /cases/archived/vector_index/README.md: -------------------------------------------------------------------------------- 1 | # 向量索引 (Vector Index) 2 | 3 | ## Windows 環境 - CPU Only 版本 4 | - 安裝建立 C 和 C++ 應用程式所需的元件 (請下載 Community 版本) 5 | - [連結](https://visualstudio.microsoft.com/zh-hant/vs/features/cplusplus/) 6 | - 使用 FAISS 建立向量索引 7 | - [安裝說明](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md) 8 | - 安裝指令: 9 | - `conda install -c pytorch faiss-cpu=1.8.0` 或 `conda install -c conda-forge faiss-cpu` 10 | - 安裝 Sentence Transformers 11 | - [安裝說明](https://sbert.net/docs/installation.html) 12 | - 安裝指令: 13 | - `pip install -U sentence-transformers` 或 `conda install -c conda-forge sentence-transformers` 14 | - 範例模型 15 | - [Pretrained Models](https://sbert.net/docs/sentence_transformer/pretrained_models.html#semantic-search-models) 16 | - 執行程式 17 | - `python make_index.py` 18 | - 如果出現 `ImportError: DLL load failed while importing _multiarray_umath: 找不到指定的模組。` 19 | - 步驟一: 先安裝 numpy 2.0 `pip install numpy==2.0` 20 | - 步驟二: 再安裝 numpy 小於 2.0 的版本 `pip install numpy==1.26.4` 21 | - 步驟三: 重新執行程式 `python make_index.py` 22 | - 如果還有問題,可以試試安裝 `pip install pybind11==2.12` -------------------------------------------------------------------------------- /cases/archived/vector_index/make_index.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 匯入套件 3 | ''' 4 | import os 5 | from sentence_transformers import SentenceTransformer 6 | import faiss 7 | 8 | # 索引預設變數 9 | index = None 10 | 11 | ''' 12 | [faiss.IndexFlatL2] 13 | 使用歐氏距離 14 | 15 | [faiss.IndexFlatIP] 16 | IP = Inner Product, 17 | 測試使用 Inner Product 來比較 features 資料 18 | 同時進行 feature normalization 19 | 等同於 cosine similarity 20 | ''' 21 | 22 | # 模型名稱 23 | model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1' 24 | 25 | # 索引存放路徑 26 | index_path = './vector.index' 27 | 28 | # 讀取 
model 29 | bi_encoder = SentenceTransformer(model_name) 30 | 31 | # 「句子」與對應的「句子 ID」(需要 int) 32 | listSentences = [ 33 | '我每天都被自己帥醒,壓力好大', 34 | '別瞎掰好嗎', 35 | '願你有個美好的一天', 36 | ] 37 | listIds = [1, 2, 3] 38 | 39 | # 將所有句子轉換成向量,同時計算轉向量時間 40 | embeddings = bi_encoder.encode( 41 | listSentences, 42 | batch_size=4, 43 | show_progress_bar=True, 44 | normalize_embeddings=False # 建議先查詢預訓練模型是否支援 45 | ) 46 | 47 | # 讀取索引,不存在就初始化 48 | if not os.path.exists(index_path): 49 | dims = embeddings.shape[1] 50 | index = faiss.IndexFlatIP(dims) # 初始化索引的維度 51 | index = faiss.IndexIDMap(index) # 讓 index 有記錄對應 doc id 的能力 52 | else: 53 | # 索引存在,直接讀取 54 | index = faiss.read_index(index_path) 55 | 56 | # 加入 doc id 到 對應的 vector 57 | index.add_with_ids(embeddings, listIds) # 加入 向量 與 文件ID 58 | # index.add(embeddings) # 僅加入向量 59 | 60 | # 儲存索引 61 | faiss.write_index(index, index_path) 62 | 63 | # 釋放記憶體 64 | del index, embeddings -------------------------------------------------------------------------------- /cases/archived/vector_index/query.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 匯入套件 3 | ''' 4 | from sentence_transformers import SentenceTransformer 5 | import faiss 6 | 7 | # 基本設定 8 | model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1' 9 | bi_encoder = SentenceTransformer(model_name) 10 | 11 | # 讀取索引 12 | index_path = './vector.index' 13 | index = faiss.read_index(index_path) 14 | 15 | # 查詢句子 16 | list_query = ['不要亂說話', '希望你一整天都開心'] 17 | 18 | # 將查詢句子轉換成向量 19 | embeddings = bi_encoder.encode( 20 | list_query, 21 | batch_size=4, 22 | show_progress_bar=False, 23 | normalize_embeddings=False 24 | ) 25 | 26 | # 查詢 27 | D, I = index.search(embeddings, k=3) 28 | 29 | # 顯示結果 30 | list_scores = D.tolist() 31 | list_ids = I.tolist() 32 | print(f"相似度: {list_scores}") 33 | print(f"檢索的 Document IDs 為: {list_ids}") 34 | 35 | # 釋放記憶體 36 | del index, embeddings 
# Source notebook: cases/archived/wikiart.ipynb
"""Collect image URLs from wikiart.org with Selenium, then download a sample.

The notebook opens the WikiArt home page in Chrome, scrolls down step by
step, records every matching image URL in ``wikiart/data.json`` and finally
downloads the first few images with ``curl``.
"""

# %% Imports and set-up
import json
import os
import random
import subprocess
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
# Conditions used together with WebDriverWait: the wait ends as soon as the
# condition becomes true.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser start-up options.
my_options = webdriver.ChromeOptions()
my_options.add_argument("--start-maximized")  # open the window maximized

# Chrome WebDriver instance shared by every function below.
driver = webdriver.Chrome(options=my_options)

# Folder that receives data.json and the downloaded images.
folderPath = 'wikiart'
os.makedirs(folderPath, exist_ok=True)

# Image URLs collected so far (a set, so duplicates collapse automatically).
set_data = set()


# %% Scraping helpers
def visit():
    """Open the WikiArt home page in the shared browser."""
    driver.get("https://www.wikiart.org/")


def scroll():
    """Scroll down in 800-pixel steps, harvesting image URLs after each step.

    After every step the collected URLs are parsed and saved, and on every
    fifth pass (move = 0, 5, 10, ...) the "load more" button is tried.
    The loop stops once the scroll offset has been at or beyond the page
    height more than ``limit`` times.
    """
    innerHeight = 0
    offset = 0
    count = 0
    limit = 2
    move = 0

    while count <= limit:
        # Distance scrolled so far; grows by a fixed step each pass.
        offset += 800

        # Scroll the window down to the new offset.
        driver.execute_script(f'''
            window.scrollTo({{
                top: {offset}, 
                behavior: 'smooth' 
            }});
        ''')

        # Only try the "load more" button on every fifth pass.
        if move % 5 == 0:
            load_more()

        # Harvest and persist after every pass, so an interrupted run still
        # leaves a usable data.json behind.
        parse()
        save()

        # (Important) forced wait: if new elements are generated meanwhile,
        # the document height grows.
        sleep(random.randint(2, 3))

        # Current total document height, read through JavaScript.
        innerHeight = driver.execute_script(
            'return document.documentElement.scrollHeight;'
        )

        move += 1

        print(f"count: {count}, offset: {offset}, innerHeight: {innerHeight}")

        # Offset at or beyond the document height means the bottom was reached.
        if offset >= innerHeight:
            count += 1


def load_more():
    """Click the "load more" button when it is present and clickable.

    Prints a message instead of raising when the button does not become
    clickable within one second.
    """
    css_selector_load_more = 'a.masonry-load-more-button[ng-show="canLoadMore()"]'

    # Nothing to do when the button is not in the DOM at all.
    if not driver.find_elements(By.CSS_SELECTOR, css_selector_load_more):
        return

    try:
        # Click the element the wait hands back instead of looking it up a
        # second time.
        button = WebDriverWait(driver, 1).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_load_more))
        )
        button.click()
    except TimeoutException:
        print('Load More 按鈕失效')


def parse():
    """Add the ``src`` of every image served from an ``https://upload…`` URL to ``set_data``."""
    # Quoted attribute value: same selector as before, without the invalid
    # Python escape sequences "\:" and "\/".
    for element in driver.find_elements(By.CSS_SELECTOR, 'img[src^="https://upload"]'):
        try:
            src = element.get_attribute('src')
        except StaleElementReferenceException:
            # The element left the DOM between lookup and read; skip it but
            # keep processing the remaining images.
            continue
        if src:
            set_data.add(src)


def save():
    """Write the collected URLs to ``<folderPath>/data.json`` in sorted order."""
    with open(f'{folderPath}/data.json', 'w', encoding='utf-8') as f:
        # Sorted so the file content is stable between saves.
        json.dump(sorted(set_data), f, indent=4, ensure_ascii=False)


# %% Run the scraper
if __name__ == '__main__':
    visit()
    scroll()

# %% Close the browser
driver.quit()

# %% Download images
# Read the URL list first so the file is not held open during the downloads.
with open(f'{folderPath}/data.json', 'r', encoding='utf-8') as f:
    list_data = json.load(f)

# While practising, only download the first few images.
for i, src in enumerate(list_data[:10]):
    # Strip the "!PinterestSmall.jpg" suffix; per the original author's note
    # this yields the highest-quality image link.
    src = src.replace('!PinterestSmall.jpg', '')

    # File name = last path segment of the URL.
    file_name = src.split('/')[-1]

    print(f'第 {i+1} 張:{src}')

    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # error page and reporting success.
    std = subprocess.run(['curl', '--fail', src, '-o', f'{folderPath}/{file_name}'])
    if std.returncode == 0:
        print(f'{file_name} 下載成功!')
    else:
        print(f'{file_name} 下載失敗 (curl exit code: {std.returncode})')

    # Forced pause after every download.
    sleep(1)
# Source notebook: cases/archived/綜合.ipynb
# Exercise: remove duplicate dicts from a list, then sort them by a chosen dict key.
import pprint


def dict_to_key(d):
    """Return a hashable fingerprint of *d* that ignores key insertion order.

    ``tuple(d.items())`` alone treats ``{"a": 1, "b": 2}`` and
    ``{"b": 2, "a": 1}`` as different because dicts remember insertion
    order; sorting the items first makes equal dicts map to the same tuple.
    Keys must be mutually comparable and values must be hashable.
    """
    return tuple(sorted(d.items()))


# Step 1
# Assume three dicts, each describing an official LINE sticker
# (static image, no animation, no sound). dict01 and dict02 are duplicates.
dict01 = {
    "link": "https://stickershop.line-scdn.net/stickershop/v1/sticker/380512238/android/sticker.png",
    "id": "380512238",
}

dict02 = {
    "link": "https://stickershop.line-scdn.net/stickershop/v1/sticker/380512238/android/sticker.png",
    "id": "380512238",
}

dict03 = {
    "link": "https://stickershop.line-scdn.net/stickershop/v1/sticker/380512239/android/sticker.png",
    "id": "380512239",
}

# Put the three dicts into one list.
listLineStickers = [dict01, dict02, dict03]

# Inspect the current content.
pprint.pprint(listLineStickers)


# Step 2
# A set that will receive one tuple per dict; adding an equal tuple twice
# keeps only one copy, which is what removes the duplicates.
#
# 1. dict.items() pairs every key with its value, e.g.
#        dict_items([('link', 'https://.../sticker.png'), ('id', '318800558')])
# 2. tuple(sorted(dict.items())) turns those pairs into a tuple, e.g.
#        (('id', '318800558'), ('link', 'https://.../sticker.png'))
#    - A dict or a dict_items view cannot be added to a set; a tuple of
#      hashable pairs can.
#    - Mutable containers such as list (append/remove) or dict (add, change
#      or delete by key) are not hashable.
_set = set()

for dictLineSticker in listLineStickers:
    _set.add(dict_to_key(dictLineSticker))

pprint.pprint(_set)


# Step 3
# dict(t) converts each tuple of (key, value) pairs back into a dict, e.g.
#     {'id': '318800558', 'link': 'https://.../sticker.png'}
# The set no longer contains duplicates, so neither does the resulting list.
listResult = []

for _tuple in _set:
    listResult.append(dict(_tuple))

pprint.pprint(listResult)
"stream", 169 | "text": [ 170 | "[{'id': '380512238',\n", 171 | " 'link': 'https://stickershop.line-scdn.net/stickershop/v1/sticker/380512238/android/sticker.png'},\n", 172 | " {'id': '380512239',\n", 173 | " 'link': 'https://stickershop.line-scdn.net/stickershop/v1/sticker/380512239/android/sticker.png'}]\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "'''\n", 179 | "流程 4\n", 180 | "'''\n", 181 | "# 使用 sorted,並指定每個 dict 當中的 id 索引進行排序\n", 182 | "listResult = sorted(listResult, key=lambda myDict: myDict['id'], reverse=False)\n", 183 | "\n", 184 | "pprint.pprint(listResult)" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.8.8" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 4 209 | } 210 | -------------------------------------------------------------------------------- /cases/digital_archives/wikisource.csv: -------------------------------------------------------------------------------- 1 | 編號,書名,網址 2 | 1,臺灣割據志,https://zh.wikisource.org/wiki/臺灣割據志 3 | 2,東瀛識略,https://zh.wikisource.org/wiki/東瀛識略 4 | 3,小琉球漫誌,https://zh.wikisource.org/wiki/小琉球漫誌 5 | 4,臺海使槎錄,https://zh.wikisource.org/wiki/臺海使槎錄 6 | 5,臺灣鄭氏紀事,https://zh.wikisource.org/wiki/臺灣鄭氏紀事 7 | 6,臺游日記,https://zh.wikisource.org/wiki/臺游日記 8 | 7,東槎紀略,https://zh.wikisource.org/wiki/東槎紀略 9 | 8,東瀛紀事,https://zh.wikisource.org/wiki/東瀛紀事 10 | 9,蠡測彙鈔,https://zh.wikisource.org/wiki/蠡測彙鈔 11 | 11,閩海紀要,https://zh.wikisource.org/wiki/閩海紀要 12 | 12,東征集,https://zh.wikisource.org/wiki/東征集 13 | 13,靖海紀事,https://zh.wikisource.org/wiki/靖海紀事 14 | 14,平臺紀略,https://zh.wikisource.org/wiki/平臺紀略 15 | 
15,臺灣鄭氏始末,https://zh.wikisource.org/wiki/臺灣鄭氏始末 16 | 19,海東札記,https://zh.wikisource.org/wiki/海東札記 17 | 20,臺陽筆記,https://zh.wikisource.org/wiki/臺陽筆記 18 | 22,海紀輯要,https://zh.wikisource.org/wiki/海紀輯要 19 | 24,海上見聞錄,https://zh.wikisource.org/wiki/海上見聞錄 20 | 25,賜姓始末,https://zh.wikisource.org/wiki/賜姓始末 21 | 26,海國聞見錄,https://zh.wikisource.org/wiki/海國聞見錄 22 | 27,劉壯肅公奏議,https://zh.wikisource.org/wiki/劉壯肅公奏議 23 | 30,臺陽見聞錄,https://zh.wikisource.org/wiki/臺陽見聞錄 24 | 32,從征實錄,https://zh.wikisource.org/wiki/從征實錄 25 | 35,靖海志,https://zh.wikisource.org/wiki/靖海志 26 | 41,北郭園詩鈔,https://zh.wikisource.org/wiki/北郭園詩鈔 27 | 44,裨海紀遊,https://zh.wikisource.org/wiki/裨海紀遊 28 | 49,東溟奏稿,https://zh.wikisource.org/wiki/東溟奏稿 29 | 55,臺灣采訪冊,https://zh.wikisource.org/wiki/臺灣采訪冊 30 | 63,樹杞林志,https://zh.wikisource.org/wiki/樹杞林志 31 | 64,臺灣詩乘,https://zh.wikisource.org/wiki/臺灣詩乘 32 | 65,臺灣府志,https://zh.wikisource.org/wiki/臺灣府志 33 | 76,南天痕,https://zh.wikisource.org/wiki/南天痕 34 | 80,金門志,https://zh.wikisource.org/wiki/金門志 35 | 85,南明野史,https://zh.wikisource.org/wiki/南明野史 36 | 86,所知錄,https://zh.wikisource.org/wiki/所知錄 37 | 90,番社采風圖考,https://zh.wikisource.org/wiki/番社采風圖考 38 | 95,廈門志,https://zh.wikisource.org/wiki/廈門志 39 | 96,東南紀事,https://zh.wikisource.org/wiki/東南紀事 40 | 98,平閩紀,https://zh.wikisource.org/wiki/平閩紀 41 | 99,海東逸史,https://zh.wikisource.org/wiki/海東逸史 42 | 100,哀臺灣箋釋,https://zh.wikisource.org/wiki/哀臺灣箋釋 43 | 101,新竹縣制度考,https://zh.wikisource.org/wiki/新竹縣制度考 44 | 103,臺灣縣志,https://zh.wikisource.org/wiki/臺灣縣志 45 | 106,明季三朝野史,https://zh.wikisource.org/wiki/明季三朝野史 46 | 107,臺風雜記,https://zh.wikisource.org/wiki/臺風雜記 47 | 108,彰化節孝冊,https://zh.wikisource.org/wiki/彰化節孝冊 48 | 109,澎湖紀略,https://zh.wikisource.org/wiki/澎湖紀略 49 | 110,臺灣海防檔,https://zh.wikisource.org/wiki/臺灣海防檔 50 | 111,思文大紀,https://zh.wikisource.org/wiki/思文大紀 51 | 112,明季遺聞,https://zh.wikisource.org/wiki/明季遺聞 52 | 113,重修臺灣縣志,https://zh.wikisource.org/wiki/重修臺灣縣志 53 | 115,澎湖續編,https://zh.wikisource.org/wiki/澎湖續編 54 | 119,諸蕃志,https://zh.wikisource.org/wiki/諸蕃志 55 | 
127,鹿樵紀聞,https://zh.wikisource.org/wiki/鹿樵紀聞 56 | 128,臺灣通史,https://zh.wikisource.org/wiki/臺灣通史 57 | 129,臺海見聞錄,https://zh.wikisource.org/wiki/臺海見聞錄 58 | 130,臺灣通志,https://zh.wikisource.org/wiki/臺灣通志 59 | 131,李文忠公選集,https://zh.wikisource.org/wiki/李文忠公選集 60 | 132,南疆繹史,https://zh.wikisource.org/wiki/南疆繹史 61 | 133,續明紀事本末,https://zh.wikisource.org/wiki/續明紀事本末 62 | 134,小腆紀年,https://zh.wikisource.org/wiki/小腆紀年 63 | 135,海外慟哭記,https://zh.wikisource.org/wiki/海外慟哭記 64 | 138,小腆紀傳,https://zh.wikisource.org/wiki/小腆紀傳 65 | 140,續修臺灣縣志,https://zh.wikisource.org/wiki/續修臺灣縣志 66 | 141,諸羅縣志,https://zh.wikisource.org/wiki/諸羅縣志 67 | 146,重修鳳山縣志,https://zh.wikisource.org/wiki/重修鳳山縣志 68 | 147,窺園留草,https://zh.wikisource.org/wiki/窺園留草 69 | 148,明季南略,https://zh.wikisource.org/wiki/明季南略 70 | 149,三藩紀事本末,https://zh.wikisource.org/wiki/三藩紀事本末 71 | 159,苗栗縣志,https://zh.wikisource.org/wiki/苗栗縣志 72 | 160,噶瑪蘭廳志,https://zh.wikisource.org/wiki/噶瑪蘭廳志 73 | 161,臺灣語典,https://zh.wikisource.org/wiki/臺灣語典 74 | 162,臺灣三字經,https://zh.wikisource.org/wiki/臺灣三字經 75 | 163,東山國語,https://zh.wikisource.org/wiki/東山國語 76 | 172,淡水廳志,https://zh.wikisource.org/wiki/淡水廳志 77 | 177,爝火錄,https://zh.wikisource.org/wiki/爝火錄 78 | 183,聖安本紀,https://zh.wikisource.org/wiki/聖安本紀 79 | 198,清季外交史料選輯,https://zh.wikisource.org/wiki/清季外交史料選輯 80 | 206,戴案紀略,https://zh.wikisource.org/wiki/戴案紀略 81 | 209,野史無文,https://zh.wikisource.org/wiki/野史無文 82 | 212,魂南記,https://zh.wikisource.org/wiki/魂南記 83 | 213,海濱大事記,https://zh.wikisource.org/wiki/海濱大事記 84 | 217,鮚埼亭集選輯,https://zh.wikisource.org/wiki/鮚埼亭集選輯 85 | 234,行在陽秋,https://zh.wikisource.org/wiki/行在陽秋 86 | 235,幸存錄,https://zh.wikisource.org/wiki/幸存錄 87 | 240,青燐屑,https://zh.wikisource.org/wiki/青燐屑 88 | 241,吳耿尚孔四王全傳,https://zh.wikisource.org/wiki/吳耿尚孔四王全傳 89 | 242,江南聞見錄,https://zh.wikisource.org/wiki/江南聞見錄 90 | 244,明亡述略,https://zh.wikisource.org/wiki/明亡述略 91 | 246,江陰城守紀,https://zh.wikisource.org/wiki/江陰城守紀 92 | 250,崇禎朝野紀,https://zh.wikisource.org/wiki/崇禎朝野紀 93 | 254,研堂見聞雜記,https://zh.wikisource.org/wiki/研堂見聞雜記 94 
| 257,玉堂薈記,https://zh.wikisource.org/wiki/玉堂薈記 95 | 258,江上孤忠錄,https://zh.wikisource.org/wiki/江上孤忠錄 96 | 260,閩中紀略,https://zh.wikisource.org/wiki/閩中紀略 97 | 263,烈皇小識,https://zh.wikisource.org/wiki/烈皇小識 98 | 266,弘光實錄鈔,https://zh.wikisource.org/wiki/弘光實錄鈔 99 | 268,浙東紀略,https://zh.wikisource.org/wiki/浙東紀略 100 | 269,蜀碧,https://zh.wikisource.org/wiki/蜀碧 101 | 270,崇禎長編,https://zh.wikisource.org/wiki/崇禎長編 102 | 271,客滇述,https://zh.wikisource.org/wiki/客滇述 103 | 272,崇禎記聞錄,https://zh.wikisource.org/wiki/崇禎記聞錄 104 | 275,明季北略,https://zh.wikisource.org/wiki/明季北略 105 | 282,石匱書後集,https://zh.wikisource.org/wiki/石匱書後集 106 | 284,平定三逆方略,https://zh.wikisource.org/wiki/平定三逆方略 107 | 286,雪交亭正氣錄,https://zh.wikisource.org/wiki/雪交亭正氣錄 108 | 287,使琉球錄三種,https://zh.wikisource.org/wiki/使琉球錄三種 109 | 291,欽定勝朝殉節諸臣錄,https://zh.wikisource.org/wiki/欽定勝朝殉節諸臣錄 110 | 293,琉球國志略,https://zh.wikisource.org/wiki/琉球國志略 111 | 294,崇禎實錄,https://zh.wikisource.org/wiki/崇禎實錄 112 | 301,偏安排日事蹟,https://zh.wikisource.org/wiki/偏安排日事蹟 113 | 303,陳第年譜,https://zh.wikisource.org/wiki/陳第年譜 114 | 306,中山傳信錄,https://zh.wikisource.org/wiki/中山傳信錄 -------------------------------------------------------------------------------- /cases/digital_archives/wikisource.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install beautifulsoup4 requests pandas OpenCC" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 12, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import requests as req\n", 20 | "from bs4 import BeautifulSoup as bs\n", 21 | "import opencc\n", 22 | "from urllib.parse import quote, unquote\n", 23 | "import re, os\n", 24 | "\n", 25 | "# OpenCC\n", 26 | "converter = opencc.OpenCC('s2tw.json')\n", 27 | "\n", 28 | "# 建立存放資料的資料夾\n", 29 | "path_folder = 'wikisource'\n", 30 | "if not 
os.path.exists(path_folder):\n", 31 | " os.makedirs(path_folder)\n", 32 | "\n", 33 | "# 網頁來源的前綴\n", 34 | "prefix = 'https://zh.wikisource.org'" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv('./wikisource.csv')\n", 44 | "df" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# 存放內文的 list\n", 54 | "li_content = []\n", 55 | "\n", 56 | "# 逐一取得內文\n", 57 | "for idx, row in df.iterrows():\n", 58 | " # 請求網頁內容\n", 59 | " url = row['網址']\n", 60 | " res = req.get(url, timeout=10)\n", 61 | "\n", 62 | " # 解析網頁內容\n", 63 | " soup = bs(res.text, 'lxml')\n", 64 | "\n", 65 | " # 取得元素\n", 66 | " content_element = soup.select_one('div#mw-content-text')\n", 67 | "\n", 68 | " # 檢視內文\n", 69 | " # print( converter.convert(content.get_text()) )\n", 70 | " \n", 71 | " # 取得內文,並轉為繁體中文\n", 72 | " content = converter.convert(content_element.get_text())\n", 73 | " \n", 74 | " # 加入 list,之後整合到 DataFrame\n", 75 | " li_content.append(content)\n", 76 | "\n", 77 | " # 取得分類連結\n", 78 | " # alinks = soup.select('div#catlinks ul li a')\n", 79 | " # if len(alinks) > 0:\n", 80 | " # for a in alinks:\n", 81 | " # print(f'{a.get_text()}: {prefix}{unquote(a[\"href\"])}')\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "# 將內文加入 DataFrame\n", 86 | "df['內文'] = li_content" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 17, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# 儲存成 Excel 檔\n", 96 | "df.to_excel(f'{path_folder}/wikisource.xlsx', index=False)" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "test", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | 
"name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.10.14" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /cases/excel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install openpyxl" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from openpyxl import load_workbook\n", 19 | "from openpyxl import Workbook\n", 20 | "\n", 21 | "# 動態新增檔案\n", 22 | "# workbook = Workbook()\n", 23 | "# worksheet = workbook.create_sheet(\"students\", 0)\n", 24 | "\n", 25 | "# 讀取已存在的 excel 檔案\n", 26 | "importFile = \"read.xlsx\"\n", 27 | "workbook = load_workbook(filename = importFile)\n", 28 | "\n", 29 | "# 顯示所有工作表\n", 30 | "print(workbook.sheetnames)\n", 31 | "\n", 32 | "# 取得主要的 sheet\n", 33 | "worksheet = workbook['students']\n", 34 | "\n", 35 | "# 新增標題\n", 36 | "worksheet['C1'] = 'phone_number'\n", 37 | "\n", 38 | "# 新增資料\n", 39 | "worksheet['C2'] = \"0911111111\"\n", 40 | "worksheet['C3'] = \"0922222222\"\n", 41 | "worksheet['C4'] = \"0933333333\"\n", 42 | "worksheet['C5'] = \"0944444444\"\n", 43 | "\n", 44 | "# 自訂學生清單 dictionaries in list\n", 45 | "listStudents = [\n", 46 | " {\"name\": \"Eric\", \"age\": 48, \"phone_number\": \"0955555555\"},\n", 47 | " {\"name\": \"Fox\", \"age\": 27, \"phone_number\": \"0966666666\"},\n", 48 | "]\n", 49 | "\n", 50 | "# 將學生清單寫入 excel (各別寫入名單的尾端)\n", 51 | "position = 6\n", 52 | "for student in listStudents:\n", 53 | " worksheet['A' + str(position)] = student[\"name\"]\n", 54 | " worksheet['B' + str(position)] = student[\"age\"]\n", 55 | " worksheet['C' + str(position)] = student[\"phone_number\"]\n", 56 | 
" position += 1\n", 57 | "\n", 58 | "# 儲存 workbook\n", 59 | "exportFile = \"read_export.xlsx\"\n", 60 | "workbook.save(exportFile)\n", 61 | "\n", 62 | "# 關閉 workbook\n", 63 | "workbook.close()" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.8.8" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 4 88 | } 89 | -------------------------------------------------------------------------------- /cases/hetubook_jinyong_requests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "6814170b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''匯入套件'''\n", 11 | "import requests as req\n", 12 | "from bs4 import BeautifulSoup as bs\n", 13 | "import json, os, time, random, pprint, re\n", 14 | "\n", 15 | "# 隨機取得 User-Agent\n", 16 | "'''\n", 17 | "# 從外部資料來取得清單,清單預設儲存路徑: /tmp\n", 18 | "ua = UserAgent(use_external_data=True)\n", 19 | "# 從外部資料來取得清單,儲存在指定路徑\n", 20 | "ua = UserAgent(use_external_data=True, cache_path=/home/fake_useragent.json)\n", 21 | "\n", 22 | "更詳細的說明,請見以下網頁:\n", 23 | "https://pypi.org/project/fake-useragent/\n", 24 | "'''\n", 25 | "from fake_useragent import UserAgent\n", 26 | "ua = UserAgent(use_external_data=True)\n", 27 | "\n", 28 | "'''放置 金庸小說 metadata 的資訊'''\n", 29 | "listData = []\n", 30 | "\n", 31 | "'''金庸小說的網址'''\n", 32 | "prefix = 'https://hetubook.com'\n", 33 | "list_urls = [\n", 34 | " prefix + '/tag/金庸-1.html',\n", 35 | " prefix + '/tag/金庸-2.html'\n", 36 | "]\n", 37 | "\n", 38 | "'''設定標頭'''\n", 39 | "my_headers = {\n", 40 | " 
# Source notebook: cases/hetubook_jinyong_requests.ipynb
# (end of the set-up cell, helper functions and main cell)

# Request headers sent with every call. `ua` is the fake_useragent.UserAgent
# instance created earlier in the set-up cell.
my_headers = {
    'user-agent': ua.random,
    'referer': 'https://hetubook.com/book2/56/index.html',
}

# Create the folder for the txt files when it does not exist yet.
folderPath = 'jinyong'
os.makedirs(folderPath, exist_ok=True)


def getMainLinks():
    """Fill ``listData`` with one entry (title, link, empty ``sub`` list) per novel.

    Every page in ``list_urls`` is fetched and each matching anchor becomes
    an entry. ``listData`` is cleared first, so the function can be re-run.
    """
    listData.clear()

    # `my_cookies` is not defined anywhere in this notebook, so referencing
    # it directly raised NameError. Use it when another cell has defined it,
    # otherwise send no cookies.
    cookies = globals().get('my_cookies')

    for link in list_urls:
        # requests has no default timeout; without one a stalled server
        # blocks the notebook indefinitely.
        res = req.get(url=link, headers=my_headers, cookies=cookies, timeout=10)
        soup = bs(res.text, "lxml")

        for a in soup.select('dl#body.list dd h4 a[href]'):
            listData.append({
                'title': a.get_text(),
                'link': prefix + a['href'],
                'sub': [],  # filled with the chapter list by getSubLinks()
            })

    # Preview the result
    # pprint.pprint(listData)


def getSubLinks():
    """Fetch each novel's index page and store its chapter list in ``sub``.

    Each chapter entry holds the chapter title, its absolute link and an
    empty ``content`` placeholder. The list is rebuilt on every call, so
    re-running the cell does not duplicate chapters.
    """
    for book in listData:
        res = req.get(url=book['link'], headers=my_headers, timeout=10)
        soup = bs(res.text, "lxml")

        book['sub'] = [
            {
                'title': a['title'],  # a.get_text() would work as well
                'link': prefix + a['href'],
                'content': '',  # reserved for the chapter text
            }
            for a in soup.select('dl#dir > dd > a[href][title]')
        ]

        # Random pause between requests
        # time.sleep(random.randint(1, 3))

    # Preview the result
    # pprint.pprint(listData)


def writeTxt():
    """Download chapter text, store it in ``listData`` and write it to a txt file.

    NOTE(review): the two ``break`` statements stop after the first chapter
    of the first novel, which looks like a deliberate practice limit. The
    indentation of the source was lost, so confirm their placement before
    relying on it.
    """
    for book in listData:
        for chapter in book['sub']:
            # A fresh session per chapter, closed as soon as the response
            # has been read.
            with req.Session() as sess:
                res = sess.get(url=chapter['link'], headers=my_headers, timeout=10)
                for key, value in res.cookies.items():
                    print(key + '=' + value)

            soup = bs(res.text, "lxml")

            # Only store the chapter when the article container exists.
            elm = soup.select_one('div#content')
            if elm is not None:
                # Drop the heading; it may be missing, in which case there is
                # nothing to remove.
                heading = elm.select_one('h2.h2')
                if heading is not None:
                    heading.decompose()

                # Chapter text = text of every <div> inside the container.
                content = ''.join(div.get_text() for div in elm.select('div'))

                pprint.pprint(elm)

                # Update the content node that ends up in the json file.
                chapter['content'] = content

                # Also save the chapter as its own txt file. Characters that
                # are not allowed in file names are replaced with "_".
                file_name = re.sub(
                    r'[\\/:*?"<>|]', '_', f"{book['title']}_{chapter['title']}"
                )
                with open(f"{folderPath}/{file_name}.txt", "w", encoding="utf-8") as file:
                    file.write(content)

            break

            # Random pause between requests
            # time.sleep(random.randint(1, 3))

        break


def saveJson():
    """Write ``listData`` to ``<folderPath>/hetubook_jinyong_requests.json``."""
    with open(f"{folderPath}/hetubook_jinyong_requests.json", "w", encoding="utf-8") as file:
        json.dump(listData, file, ensure_ascii=False, indent=4)


# Main program
if __name__ == "__main__":
    time_begin = time.time()
    getMainLinks()
    getSubLinks()
    writeTxt()
    # saveJson()  # disabled in the original notebook
    time_end = time.time()
    print(f"總共執行 {time_end - time_begin} 秒")
| ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3 (ipykernel)", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.9.13" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 5 195 | } 196 | -------------------------------------------------------------------------------- /cases/image_gen/ComfyUI.md: -------------------------------------------------------------------------------- 1 | # ComfyUI 2 | [連結](https://github.com/comfyanonymous/ComfyUI) 3 | 4 | ## 安裝套件 5 | - PyTorch - CPU only 6 | `pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu` 7 | 8 | ## 下載連結 for Windows 9 | - 需要安裝 7zip 壓縮工具: [連結](https://www.7-zip.org/) 10 | - [版本一覽](https://github.com/comfyanonymous/ComfyUI/releases) 11 | - [直接下載](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) 12 | 13 | ## 參考網頁 14 | - [教你如何在 Windows 中安裝 Stable Diffusion,目前最簡單的運行方式](https://today.line.me/tw/v2/article/5yR9OBE) 15 | - [Stable Diffusion 台灣社群v2](https://www.facebook.com/groups/sdaitw) 16 | 17 | ## 啟用服務 18 | - 對 `ComfyUI_windows_portable\run_cpu.bat` 點兩下 19 | 20 | ## 基本設定與操作 21 | - 下載模型 22 | - [Stable Diffusion XL Base 1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main) 23 | - 下載 `sd_xl_base_1.0.safetensors` 24 | - [Stable Diffusion XL Refiner 1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/tree/main) 25 | - 下載 `sd_xl_refiner_1.0.safetensors` 26 | - [SDXL-VAE](https://huggingface.co/stabilityai/sdxl-vae/tree/main) 27 | - 下載 `sdxl_vae.safetensors` 28 | - 預設路徑 29 | - 下載 base 和 refiner 模型: 
`ComfyUI_windows_portable\ComfyUI\models\checkpoints` 30 | - 裡面會有 `sd_xl_base_1.0.safetensors` 和 `sd_xl_refiner_1.0.safetensors` 31 | - 下載 VAE 模型權重: `ComfyUI_windows_portable\ComfyUI\models\vae` 32 | - 裡面會有 `sdxl_vae.safetensors` 33 | - 如果有其它種類的模型,請依種類來放置模型: `ComfyUI_windows_portable\ComfyUI\models\*` 34 | - 生成圖片 35 | 36 | ## 從 civitai 尋找合適的模型來下載 37 | - 按下選單上的 `Model` 連結 38 | - 選擇 `BASE MODEL` (其它的也可以) 39 | - 在顯示的列表中,選擇圖卡左上方有寫 `Checkpoint` 字眼,例如 [Speciosa Realistica](https://civitai.com/models/488361/speciosa-realistica) 40 | - 在頁面右側,有個 `Download (6.46 GB)`,按下即可下載 `safetensors` 檔案 41 | - 將 `safetensors` 檔案,放到 `ComfyUI_windows_portable\ComfyUI\models\checkpoints` 當中 42 | - 刷新 ComfyUI 畫面,就可以在 `Load Checkpoint` 底下的 `ckpt_name` 選單中看到新的模型了 43 | - 點一下範例圖片後,可以看到生成圖片的 prompt 或 negative prompt,還有一些參數可以調整 -------------------------------------------------------------------------------- /cases/image_gen/civitai.md: -------------------------------------------------------------------------------- 1 | # Civitai 2 | - [連結](https://civitai.com/) 3 | - 選擇 Featured Models,最右有 Explore all models 的連結。 4 | - 在預覽圖上,右上角有類似一支筆 (Create 按鈕),就可以點進去看。 5 | - 進入內頁後,可以看到預覽器的右上角有一個 Create 按鈕,點下去就會在左側出現設定的地方。 6 | - 生成圖片前,參考以下範例的 Prompt 與 Negative Prompt,將它們貼到左側的設定區域。 7 | - 也可以在網頁最上方的搜尋欄,選擇 Models,輸入 CN,可以找到東方臉孔的模型輸出結果。 8 | 9 | ## Stable Diffusion 提示詞 (prompt) 參考資料 10 | - [PromptHero](https://prompthero.com/) 11 | - 選上方連結 [Stable Diffusion](https://prompthero.com/stable-diffusion-prompts),尋找偏好的圖片,點進去參考它們的設定 12 | - [Prompt 與 Negative prompt](https://prompthero.com/prompt/7fc0b9928fb-stable-diffusion-xl-base-0-9-playtime-s-over) 13 | 14 | ## 範例 15 | Prompt: 16 | ``` 17 | (masterpiece, best quality:1.2<),highly detailed,extremely detailed,real photo, 18 | fullbody,1girl,solo,asian,looking at viewer,(body facing viewer:1.2)(relax sitting),knees separation, 19 | red lips,brown long hair, 20 | collared shirt and dress shirt,long sleeves,(knees length dress:1.1), 21 | (wrap hip very thick 
pantyhose:1.1),color high heels, 22 | nice figure,good anatomy,good proportions,nice pose,(2shoes,2legs:1.2)(perfect legs:1.1),nice hand, 23 | outdoors,buildings,photorealistic,realistic,, 24 | ,, 25 | ``` 26 | ``` 27 | a chinese young lady lying of a pink wall with a bow tie on it's chest and a pink and white top, white shorts, sunny, a screenshot, aestheticism, movie style, studio lighting, movie cinematic, horizon, 28 | 1girl, solo, ,masterpiece, high quality, highres, absurdres, high details,8k,HDR,raw photo,realistic, bokeh, shallow depth of field, beautiful eyes, high detail eyes, beautiful face, high detail face, high detail skin, beautiful hands, beautiful fingers, beautiful eyelashes, fingernails,(above the thigh:1.6), pixie curly cut hair, lying down, hand reaching towards viewer, big pink towel, white simple background 29 | ``` 30 | 31 | Negative Prompt: 32 | ``` 33 | bad anatomy, lowres, normal quality, grayscale, worstquality, watermark, bad proportions, out of focus, long neck, deformed, mutated, mutation, disfigured, poorly drawn face, skin blemishes, skin spots, acnes, missing limb, malformed limbs, floating limbs, disconnected limbs, extra limb, extra arms, mutated hands, poorly drawn hands, malformed hands, mutated hands and fingers, bad hands, missing fingers, fused fingers, too many fingers, extra legs, bad feet, cross-eyed, (distorted, :1.3) , (:1.4) , low quality, camera, BadDream, UnrealisticDream, bad-hands-5, BadNegAnatomyV1-neg, EasyNegative, FastNegativeV2, bad-picture-chill-75v 34 | ``` 35 | 36 | 其它設定: 37 | ``` 38 | Steps: 30 39 | Sampler: DPM++ 2M SDE Karras, 40 | CFG scale: 5 41 | Seed: 2451060841 42 | Size: 512x768 43 | Model hash: e4a30e4607 44 | Model: 麦橘写实_MajicMIX_Realistic_v6 45 | Denoising strength: 0.35 46 | Clip skip: 2 47 | ADetailer model: face_yolov8n.pt 48 | ADetailer prompt: asian girl, make up, beautiful face 49 | ADetailer confidence: 0.3 50 | ADetailer dilate/erode: 4 51 | ADetailer mask blur: 4 52 | ADetailer 
denoising strength: 0.4 53 | ADetailer inpaint only masked: True 54 | ADetailer inpaint padding: 0 55 | ADetailer ControlNet model: control_v11p_sd15_inpaint [ebff9138] 56 | ADetailer ControlNet module: inpaint_global_harmonious 57 | ADetailer version: 23.7.6, 58 | Hires upscale: 2 59 | Hires steps: 5 60 | Hires upscaler: 4x-UltraSharp 61 | Lora hashes:more_details: 3b8aa1d351ef, yuzuv10: b1464588227a, sit_cross_leg_v2: cb80e9bce437, control_skin_exposure: 58bbb7a04626 62 | TI hashes: ng_deepnegative_v1_75t: 54e7e4826d53, negative_hand: 73b524a2da12, badhandv4: 5e40d722fc3d, negative_feet_v2: df90b1ff666d, EasyNegative: 66a7279a88dd 63 | ControlNet 0: preprocessor: inpaint_global_harmonious, model: control_v11p_sd15_inpaint [ebff9138], weight: 1.0, starting/ending: (0.0, 1.0), resize mode: ResizeMode.INNER_FIT, pixel perfect: True, control mode: ControlMode.BALANCED, preprocessor params: (-1, -1, -1) 64 | Version: v1.5.1 65 | ``` -------------------------------------------------------------------------------- /cases/image_gen/microsoft_bing_chat.md: -------------------------------------------------------------------------------- 1 | # 微軟 Bing Chat 2 | - [連結](https://www.bing.com/chat) 3 | 4 | ## 範例 5 | 可以參考: [AI咒語公社](https://www.facebook.com/groups/3208727879422567) 6 | ``` 7 | 東方美少女真人cosplay日本漫畫《Dr.Slump》中的動畫角色ARALE。穿着藍色的背帶褲,粉紅色上衣,紫色頭髮,粉紅色棒球帽繡著黑色黃底""ARALE""名字,棒球帽兩側有可愛的白色小翅膀,大大的圓形黑框眼鏡。她的表情充滿活力,展開雙臂和雙腿快速奔跑中。她的姿態顯得非常歡快和俏皮。photorealistic,真實照片,捕捉表情,8K,細節豐富,人像攝影 8 | ``` 9 | ``` 10 | 四季風情畫 11 | 春雨綿綿草木青 12 | 夏陽熾熾稻浪黃 13 | 秋風蕭蕭楓葉紅 14 | 冬雪皚皚梅花白 15 | 超高清寫實圖像 16 | ``` -------------------------------------------------------------------------------- /cases/image_gen/run_diffusers_flux_pipeline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # 安裝 PyTorch - CPU only 3 | pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu 4 | 5 | # 安裝其它 6 | pip install transformers 
accelerate numpy sentencepiece protobuf 7 | 8 | # Flux GitHub 9 | https://github.com/black-forest-labs/flux 10 | 11 | # 安裝 diffusers 12 | pip install git+https://github.com/huggingface/diffusers.git 13 | ''' 14 | 15 | import torch 16 | from diffusers import FluxPipeline 17 | from time import time 18 | 19 | t1 = time() 20 | 21 | model_id = "black-forest-labs/FLUX.1-schnell" # you can also use `black-forest-labs/FLUX.1-dev` 22 | 23 | pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) # torch.bfloat16 24 | pipe.enable_model_cpu_offload() # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power 25 | 26 | prompt = "A cat holding a sign that says hello world" 27 | seed = 42 28 | image = pipe( 29 | prompt, 30 | output_type="pil", 31 | num_inference_steps=4, # use a larger number if you are using [dev] 32 | generator=torch.Generator("cpu").manual_seed(seed) 33 | ).images[0] 34 | image.save("./flux-schnell.png") 35 | 36 | t2 = time() 37 | 38 | print(f"Time taken: {t2-t1:.4f} seconds") -------------------------------------------------------------------------------- /cases/ixdzs_jinyong_post_requests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "488c47d0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''匯入套件'''\n", 11 | "import requests as req\n", 12 | "from bs4 import BeautifulSoup as bs\n", 13 | "import json, os, pprint, re\n", 14 | "\n", 15 | "# 隨機取得 User-Agent\n", 16 | "'''\n", 17 | "# 從外部資料來取得清單,清單預設儲存路徑: /tmp\n", 18 | "ua = UserAgent(use_external_data=True)\n", 19 | "# 從外部資料來取得清單,儲存在指定路徑\n", 20 | "ua = UserAgent(use_external_data=True, cache_path=/home/fake_useragent.json)\n", 21 | "\n", 22 | "更詳細的說明,請見以下網頁:\n", 23 | "https://pypi.org/project/fake-useragent/\n", 24 | "'''\n", 25 | "from fake_useragent import UserAgent\n", 26 | "ua = 
UserAgent(use_external_data=True)\n", 27 | "\n", 28 | "'''放置 金庸小說 metadata 的資訊'''\n", 29 | "listData = []\n", 30 | "\n", 31 | "'''金庸小說的網址'''\n", 32 | "prefix = 'https://tw.ixdzs.com'\n", 33 | "url = prefix + '/author/金庸'\n", 34 | "\n", 35 | "'''設定標頭'''\n", 36 | "my_headers = {\n", 37 | " 'user-agent': ua.random\n", 38 | "}\n", 39 | "\n", 40 | "# 沒有放置 txt 檔的資料夾,就建立起來\n", 41 | "folderPath = 'jinyong'\n", 42 | "if not os.path.exists(folderPath):\n", 43 | " os.makedirs(folderPath)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "c90bad95", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "'''\n", 54 | "自訂函式\n", 55 | "'''\n", 56 | "# 取得小說的主要連結\n", 57 | "def getMainLinks(): \n", 58 | " # 清除 list 內容\n", 59 | " listData.clear()\n", 60 | " \n", 61 | " # 走訪首頁\n", 62 | " res = req.get(url = url, headers = my_headers)\n", 63 | " soup = bs(res.text, \"lxml\")\n", 64 | " \n", 65 | " # 定義欲取得的小說名稱\n", 66 | " list_novel_names = [\n", 67 | " '倚天屠龍記', '連城訣', '書劍恩仇錄', '碧血劍', '鹿鼎記', \n", 68 | " '俠客行', '射鵰英雄傳', '雪山飛狐', '飛狐外傳', '天龍八部', \n", 69 | " '白馬嘯西風', '神鵰俠侶', '越女劍', '鴛鴦刀', '笑傲江湖'\n", 70 | " ]\n", 71 | " \n", 72 | " # 取得每一本指定的小說連結\n", 73 | " for novel_name in list_novel_names:\n", 74 | " # 取得超連結\n", 75 | " a = soup.select_one(f'a[href][title={novel_name}]')\n", 76 | " \n", 77 | " # 取得主要連結相關資訊\n", 78 | " listData.append({\n", 79 | " 'title': a['title'], # 或是 a.get_text() 取得 innerText\n", 80 | " 'link': prefix + a['href'],\n", 81 | " 'sub': [] # 之後會放置每一本小說的章回資訊\n", 82 | " })\n", 83 | " \n", 84 | " # 預覽結果\n", 85 | " pprint.pprint(listData)\n", 86 | " \n", 87 | "\n", 88 | "# 取得所有小說的獨立連結\n", 89 | "def getSubLinks():\n", 90 | " # 取得章回列表\n", 91 | " for index in range( len(listData) ):\n", 92 | " # 取得 bid\n", 93 | " bid = re.search(r'\\/read\\/(\\d+)\\/', listData[index]['link'])[1]\n", 94 | " \n", 95 | " # post 請求,取得章回列表\n", 96 | " my_data = {'bid': bid}\n", 97 | " res = req.post(url = 'https://tw.ixdzs.com/novel/clist/', data = 
my_data)\n", 98 | " obj_json = res.json()\n", 99 | " \n", 100 | " # 如果回傳訊息為 200 (這個網站自訂的)\n", 101 | " if obj_json['rs'] == 200:\n", 102 | " # 取得章節連結相關資訊\n", 103 | " for obj_data in obj_json['data']:\n", 104 | " if obj_data['ctype'] != '1':\n", 105 | " listData[index]['sub'].append({\n", 106 | " 'title': obj_data['title'],\n", 107 | " 'link': prefix + f'/read/{bid}/p{obj_data[\"ordernum\"]}.html',\n", 108 | " 'content': '' # 預留給小說內文\n", 109 | " })\n", 110 | " \n", 111 | " # 預覽結果\n", 112 | " pprint.pprint(listData)\n", 113 | "\n", 114 | "\n", 115 | "# 建立金庸小說的 json 檔\n", 116 | "def saveJson():\n", 117 | " with open(f\"{folderPath}/ixdzs_jinyong_post_requests.json\", \"w\", encoding=\"utf-8\") as file:\n", 118 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "f32ecc2b", 125 | "metadata": { 126 | "scrolled": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# 主程式\n", 131 | "if __name__ == \"__main__\":\n", 132 | " getMainLinks()\n", 133 | " getSubLinks()\n", 134 | " saveJson()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "8ae3b392", 140 | "metadata": {}, 141 | "source": [ 142 | "# 議題思考\n", 143 | "- **這個範例沒有直接抓小說內文,如果是你,該怎麼進去每一個內頁去取得文章?**\n", 144 | "\n", 145 | "```\n", 146 | "參考:\n", 147 | "for index in range( len(listData) ):\n", 148 | " for idx in range( len(listData[index]['sub']) ):\n", 149 | " res = req.get(url = listData[index]['sub'][idx]['link'])\n", 150 | " ...\n", 151 | "```\n", 152 | "\n", 153 | "- **有些連結可能沒有用,例如正文、卷○、後記等,當你用 requests 進去瀏覽時,如何略過?**\n", 154 | "\n", 155 | "```\n", 156 | "參考:\n", 157 | "res = req.get(url = listData[index]['sub'][idx]['link'])\n", 158 | "soup = bs(res.text, \"lxml\")\n", 159 | "\n", 160 | "# 判斷小說內文的區域是否存在\n", 161 | "if len(soup.select('article.page-content')) > 0:\n", 162 | " title = soup.select_one('article.page-content h3').get_text()\n", 163 | " content = 
soup.select_one('article.page-content section').get_text()\n", 164 | "```\n", 165 | "\n", 166 | "- **如果你能取得小說內文,有辦法儲存在對應的 content 當中嗎?有辦法另外將內文獨立儲存在各別的 txt 檔當中嗎?**\n", 167 | "\n", 168 | "```\n", 169 | "參考:\n", 170 | "content = soup.select_one('article.page-content section').get_text()\n", 171 | "\n", 172 | "# 儲存在對應的 key (content),作為 value\n", 173 | "listData[index]['sub'][idx]['content'] = content\n", 174 | "\n", 175 | "# 寫入檔案\n", 176 | "with open(f\"{folderPath}/{title}.txt\", \"w\", encoding=\"utf-8\") as file:\n", 177 | " file.write(content)\n", 178 | "```" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.9.13" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /cases/jinyong_requests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0bfbdca0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''匯入套件'''\n", 11 | "import json, os, pprint, time, re\n", 12 | "from urllib import parse\n", 13 | "import requests as req\n", 14 | "from bs4 import BeautifulSoup as bs\n", 15 | "\n", 16 | "# 隨機取得 User-Agent\n", 17 | "'''\n", 18 | "參考連結:\n", 19 | "https://pypi.org/project/fake-useragent/\n", 20 | "'''\n", 21 | "from fake_useragent import UserAgent\n", 22 | "ua = UserAgent()\n", 23 | "\n", 24 | "'''放置 金庸小說 metadata 的資訊'''\n", 25 | "listData = []\n", 26 | "\n", 27 | "'''小庸小說的網址'''\n", 28 | "url = 
'https://www.bookwormzz.com'\n", 29 | "suffix = '/ky'\n", 30 | "\n", 31 | "'''設定標頭'''\n", 32 | "headers = {\n", 33 | " 'user-agent': ua.random\n", 34 | "}\n", 35 | "\n", 36 | "# 沒有放置 txt 檔的資料夾,就建立起來\n", 37 | "folderPath = 'jinyong'\n", 38 | "if not os.path.exists(folderPath):\n", 39 | " os.makedirs(folderPath)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "9d0f8199", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# 取得小說的主要連結\n", 50 | "def getMainLinks():\n", 51 | " # 走訪首頁\n", 52 | " res = req.get(url + suffix, headers = headers)\n", 53 | " soup = bs(res.text, \"lxml\")\n", 54 | " \n", 55 | " # 取得主要連結\n", 56 | " a_elms = soup.select('a[data-ajax=\"false\"]')\n", 57 | " \n", 58 | " # 整理主要連結資訊\n", 59 | " for a in a_elms:\n", 60 | " listData.append({\n", 61 | " \"title\": a.get_text(),\n", 62 | " \"link\": url + parse.unquote( a['href'].replace('..', '') ) + '#book_toc',\n", 63 | " \"sub\": [] # 為了放置各個章回小說的內頁資料,下一個步驟會用到\n", 64 | " })\n", 65 | "\n", 66 | "# 取得所有小說的獨立連結\n", 67 | "def getSubLinks():\n", 68 | " for i in range( len(listData) ):\n", 69 | " # 走訪章回小說內頁\n", 70 | " res = req.get(listData[i]['link'], headers = headers, allow_redirects = False)\n", 71 | " soup = bs(res.text, \"lxml\")\n", 72 | " a_elms = soup.select('div[data-theme=\"b\"][data-content-theme=\"c\"] a[rel=\"external\"]')\n", 73 | " \n", 74 | " # 若是走訪網頁時,選擇不到特定的元素,視為沒有資料,continue 到 for 的下一個 index 去\n", 75 | " if len(a_elms) > 0:\n", 76 | " for a in a_elms:\n", 77 | " listData[i]['sub'].append({\n", 78 | " \"sub_title\": a.get_text(),\n", 79 | " \"sub_link\": url + parse.unquote( a['href'] )\n", 80 | " })\n", 81 | " else:\n", 82 | " continue\n", 83 | "\n", 84 | "# 建立金庸小說的 json 檔\n", 85 | "def saveJson():\n", 86 | " with open(f\"{folderPath}/jinyong.json\", \"w\", encoding=\"utf-8\") as file:\n", 87 | " file.write( json.dumps(listData, ensure_ascii=False, indent=4) )\n", 88 | "\n", 89 | "# 將金庸小說所有章回的內容,各自寫到 txt 與 json 中\n", 90 | "def 
writeTxt():\n", 91 | " # 稍候建立 train.json 前的程式變數\n", 92 | " listContent = []\n", 93 | "\n", 94 | " # 開啟 金庸小說 metadata 的 json 檔\n", 95 | " with open(f\"{folderPath}/jinyong.json\", \"r\", encoding=\"utf-8\") as file:\n", 96 | " strJson = file.read()\n", 97 | "\n", 98 | " # 走訪所有章回的小說文字內容\n", 99 | " listResult = json.loads(strJson)\n", 100 | " for i in range(len(listResult)):\n", 101 | " for j in range(len(listResult[i]['sub'])):\n", 102 | " # 取得回應\n", 103 | " res = req.get(listResult[i]['sub'][j]['sub_link'], headers = headers, allow_redirects=False)\n", 104 | " \n", 105 | " # 先將
改成 換行符號\n", 106 | " strContent = re.sub(r\"\", '\\n', str(res.text))\n", 107 | " \n", 108 | " # 建立 soup 物件\n", 109 | " soup = bs(strContent, \"lxml\")\n", 110 | "\n", 111 | " # 去掉內文標題後,取得內文\n", 112 | " div = soup.select_one('div#html > div')\n", 113 | " strContent = div.get_text()\n", 114 | " \n", 115 | " # 去除不必要的文字\n", 116 | " # strContent = re.sub(r\" |\\r|\\n| |\\s\", '', strContent)\n", 117 | "\n", 118 | " # 決定 txt 的檔案名稱\n", 119 | " fileName = f\"{listResult[i]['title']}_{listResult[i]['sub'][j]['sub_title']}.txt\"\n", 120 | " \n", 121 | " # 將小說內容存到 txt 中\n", 122 | " with open(f\"{folderPath}/{fileName}\", \"w\", encoding=\"utf-8\") as file:\n", 123 | " file.write(strContent)\n", 124 | " \n", 125 | " # 去除不必要的文字\n", 126 | " strContent = re.sub(r\" |\\r|\\n| |\\s\", '', strContent)\n", 127 | "\n", 128 | " # 額外將小說內容放到 list 當中,建立 train.json\n", 129 | " listContent.append(strContent)\n", 130 | "\n", 131 | " # 延伸之後的教學,在此建立訓練資料\n", 132 | " with open(f\"{folderPath}/train.json\", \"w\", encoding=\"utf-8\") as file:\n", 133 | " file.write( json.dumps(listContent, ensure_ascii=False, indent=4) )" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "917b963a", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# 主程式\n", 144 | "if __name__ == \"__main__\":\n", 145 | " time1 = time.time()\n", 146 | " getMainLinks()\n", 147 | " getSubLinks()\n", 148 | " saveJson()\n", 149 | " writeTxt()\n", 150 | " print(f\"執行總花費時間: {time.time() - time1}\")" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3 (ipykernel)", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.10.13" 171 | }, 172 
| "vscode": { 173 | "interpreter": { 174 | "hash": "585a938ec471c889bf0cce0aed741a99eaf47ca09c0fa8393793bc5bfe77ba11" 175 | } 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 5 180 | } 181 | -------------------------------------------------------------------------------- /cases/jinyong_selenium.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0a06d755", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''匯入套件'''\n", 11 | "from selenium import webdriver\n", 12 | "from selenium.webdriver.chrome.service import Service\n", 13 | "from webdriver_manager.chrome import ChromeDriverManager\n", 14 | "from selenium.common.exceptions import TimeoutException\n", 15 | "from selenium.webdriver.support.ui import WebDriverWait\n", 16 | "from selenium.webdriver.support import expected_conditions as EC\n", 17 | "from selenium.webdriver.common.by import By\n", 18 | "import json, os, time, re\n", 19 | "from pprint import pprint\n", 20 | "from urllib import parse\n", 21 | "\n", 22 | "# 隨機取得 User-Agent\n", 23 | "from fake_useragent import UserAgent\n", 24 | "ua = UserAgent()\n", 25 | "\n", 26 | "'''設定 Chrome 瀏覽器開啟時的狀態'''\n", 27 | "my_options = webdriver.ChromeOptions()\n", 28 | "my_options.add_argument('--start-maximized')\n", 29 | "my_options.add_argument('--incognito')\n", 30 | "my_options.add_argument('--disable-popup-blocking')\n", 31 | "my_options.add_argument(f'--user-agent={ua.random}')\n", 32 | "\n", 33 | "'''建立操控 Chrome 瀏覽器的變數'''\n", 34 | "# 使用 Chrome 的 WebDriver\n", 35 | "driver = webdriver.Chrome(\n", 36 | " options = my_options,\n", 37 | " service = Service(ChromeDriverManager().install())\n", 38 | ")\n", 39 | "'''\n", 40 | "補充: 沒有特別設定,只要電腦有安裝 Chrome,就可以直接使用\n", 41 | "driver = webdriver.Chrome()\n", 42 | "'''\n", 43 | "\n", 44 | "'''放置 金庸小說 metadata 的資訊'''\n", 45 | "listData = []\n", 46 | "\n", 47 | "'''小庸小說的網址'''\n", 48 | 
"url = 'https://www.bookwormzz.com/ky'\n", 49 | "\n", 50 | "# 沒有放置 txt 檔的資料夾,就建立起來\n", 51 | "folderPath = 'jinyong'\n", 52 | "if not os.path.exists(folderPath):\n", 53 | " os.makedirs(folderPath)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "9c5a6200", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# 取得小說的主要連結\n", 64 | "def getMainLinks():\n", 65 | " # 走訪首頁\n", 66 | " driver.get(url)\n", 67 | " \n", 68 | " # 取得主要連結\n", 69 | " a_elms = driver.find_elements(By.CSS_SELECTOR, 'a[data-ajax=\"false\"]')\n", 70 | " \n", 71 | " # 整理主要連結資訊\n", 72 | " for a in a_elms:\n", 73 | " listData.append({\n", 74 | " \"title\": a.get_attribute('innerText'),\n", 75 | " \"link\": parse.unquote( a.get_attribute('href') ) + \"#book_toc\",\n", 76 | " \"sub\": [] # 為了放置各個章回小說的內頁資料,下一個步驟會用到\n", 77 | " })\n", 78 | "\n", 79 | "# 取得所有章回小說的連結\n", 80 | "def getSubLinks():\n", 81 | " for i in range( len(listData) ):\n", 82 | " # 走訪章回小說內頁\n", 83 | " driver.get(listData[i][\"link\"])\n", 84 | " \n", 85 | " # 若是走訪網頁時,等待不到特定的元素,視為沒有資料,continue 到 for 的下一個 index 去\n", 86 | " try:\n", 87 | " # 等待元素\n", 88 | " WebDriverWait(driver, 5).until(\n", 89 | " EC.presence_of_element_located(\n", 90 | " (By.CSS_SELECTOR, 'div[data-theme=\"b\"][data-content-theme=\"c\"] a[rel=\"external\"]')\n", 91 | " )\n", 92 | " )\n", 93 | " \n", 94 | " # 整理章回小說\n", 95 | " a_elms = driver.find_elements(By.CSS_SELECTOR, 'div[data-theme=\"b\"][data-content-theme=\"c\"] a[rel=\"external\"]')\n", 96 | " for a in a_elms:\n", 97 | " listData[i][\"sub\"].append({\n", 98 | " \"sub_title\": a.get_attribute(\"innerText\"),\n", 99 | " \"sub_link\": parse.unquote( a.get_attribute(\"href\") )\n", 100 | " })\n", 101 | " except TimeoutException as e:\n", 102 | " continue\n", 103 | "\n", 104 | "# 建立金庸小說的 json 檔\n", 105 | "def saveJson():\n", 106 | " with open(f\"{folderPath}/jinyong.json\", \"w\", encoding=\"utf-8\") as file:\n", 107 | " file.write( json.dumps(listData, 
ensure_ascii=False) )\n", 108 | "\n", 109 | "# 將金庸小說所有章回的內容,各自寫到 txt 與 json 中\n", 110 | "def writeTxt(): \n", 111 | " # 稍候建立 train.json 前的程式變數\n", 112 | " listContent = []\n", 113 | "\n", 114 | " # 開啟 金庸小說 metadata 的 json 檔\n", 115 | " with open(f\"{folderPath}/jinyong.json\", \"r\", encoding=\"utf-8\") as file:\n", 116 | " strJson = file.read()\n", 117 | "\n", 118 | " # 走訪所有章回的小說文字內容\n", 119 | " listResult = json.loads(strJson)\n", 120 | " for i in range( len(listResult) ):\n", 121 | " for j in range( len(listResult[i][\"sub\"]) ):\n", 122 | " # 走訪內頁\n", 123 | " driver.get( listResult[i]['sub'][j]['sub_link'] )\n", 124 | " div = driver.find_element(By.CSS_SELECTOR, 'div#html > div')\n", 125 | " \n", 126 | " # 取得內文\n", 127 | " strContent = div.get_attribute('innerText')\n", 128 | " \n", 129 | " # 資料預處理\n", 130 | " strContent = re.sub(r\" |\\r|\\n| |\\s\", '', strContent)\n", 131 | "\n", 132 | " # 決定 txt 的檔案名稱\n", 133 | " fileName = f\"{listResult[i]['title']}_{listResult[i]['sub'][j]['sub_title']}.txt\"\n", 134 | "\n", 135 | " # 將小說內容存到 txt 中\n", 136 | " with open(f\"{folderPath}/{fileName}\", \"w\", encoding=\"utf-8\") as file:\n", 137 | " file.write( strContent )\n", 138 | "\n", 139 | " # 額外將小說內容放到 list 當中,建立 train.json\n", 140 | " listContent.append(strContent)\n", 141 | "\n", 142 | " # 延伸之後的教學,在此建立訓練資料\n", 143 | " with open(f\"{folderPath}/train.json\", \"w\", encoding=\"utf-8\") as file:\n", 144 | " file.write( json.dumps(listContent, ensure_ascii=False) )\n", 145 | "\n", 146 | "# 關閉瀏覽器\n", 147 | "def close():\n", 148 | " driver.quit()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "d36a7209", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# 主程式\n", 159 | "if __name__ == \"__main__\":\n", 160 | " time1 = time.time()\n", 161 | " getMainLinks()\n", 162 | " getSubLinks()\n", 163 | " saveJson()\n", 164 | " writeTxt()\n", 165 | " close()\n", 166 | " print(f\"執行總花費時間: {time.time() - time1}\")" 167 
| ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.10.13" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /cases/leaflet/README.md: -------------------------------------------------------------------------------- 1 | # Leaflet.js - 對手機平台友善的互動式地圖開放源始碼函式庫 (JavaScript Library) 2 | [官方網站](https://leafletjs.com/) 3 | 4 | ## 1. 基礎用法 5 | 6 | ### 1.1 建立一專案資料夾,並在資料夾中建立 html 網頁 (以 VS code 為例) 7 | - 建立資料夾,例如 **gis**,並將 **gis** 資料夾拖入 vs code,或是以 vs code 開啟資料夾視窗。 8 | - 建立 html 檔,例如 **index.html**。 9 | - 開啟 index.html,輸入 **!**,提示文字出現選,選擇「!」,自動建立 html 樣版。 10 | ```html 11 | 12 | 13 | 14 | 15 | 16 | 17 | Document 18 | 19 | 20 | 21 | 22 | 23 | ``` 24 | 25 | ### 1.2 按下 Tutorials 連結,選擇 Leaflet Quick Start Guide 26 | - 放置 Leaflet CSS (CDN) 檔案在 index.html 的 `` 之前: 27 | ```html 28 | 29 | ``` 30 | - 放置 Leaflet JavaScript (CDN) 檔案在 `` 之前: 31 | ```html 32 | 33 | ``` 34 | - 放置 `
` 到 `` 之後: 35 | ```html 36 |
37 | ``` 38 | - 放置 `` 到 `` 之前: 39 | ```css 40 | 43 | ``` 44 | 45 | ### 1.3 按下 Overview 超連結,透過 JavaScript 程式碼,產生地圖 46 | - 在 Leaflet JavaScript (CDN) 的 `` 之後,加入以下程式碼: 47 | ```javascript 48 | 59 | ``` 60 | 61 | ## 2. 需要安裝 flask、requests 62 | ```bash 63 | $ pip install -U Flask requests 64 | ``` 65 | 66 | ## 3. 範例資料來源 67 | - [Cafe Nomad:咖啡廳遊牧民族 ](https://cafenomad.tw/) 68 | - [開發人員 API 文件](https://cafenomad.tw/developers) 69 | - [API v1.2](https://cafenomad.tw/developers/docs/v1.2):以 taipei 的資料為例 - [預覽 JSON](https://cafenomad.tw/api/v1.2/cafes/taipei) 70 | 71 | 72 | ## 4. 範例程式 73 | ```html 74 | 75 | 76 | 77 | 78 | 79 | 80 | Document 81 | 82 | 83 | 84 | 85 | 86 | 93 | 94 | 95 | 96 | 97 |
98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 |
idcitynameaddressurlsocketlatitudelongitude
114 | 115 | 116 | 117 | 118 | 119 | 204 | 205 | 206 | ``` 207 | 208 | ## 5. 相關影片 209 | [Leaflet.js - Web 互動式地圖](https://www.youtube.com/playlist?list=PLV4FeK54eNbwNaCoJomI1jhvgm-A-vOsz) -------------------------------------------------------------------------------- /cases/leaflet/templates/index_cafe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
idcitynameaddressurl
38 | 39 | 40 | 41 | 42 | 43 | 125 | 126 | -------------------------------------------------------------------------------- /cases/leaflet/web_api.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 需要安裝 flask、requests 3 | $ pip install -U Flask requests 4 | ''' 5 | from flask import Flask, request, jsonify, render_template 6 | import requests as req 7 | import math 8 | 9 | ''' 10 | Flask 初始化 11 | ''' 12 | app = Flask(__name__) 13 | app.json.ensure_ascii = False # 防止中文變成 unicode 編碼 14 | 15 | 16 | ''' 17 | 首頁 (透過 render_template 函式,將 templates/index.html 檔案回傳給前端) 18 | ''' 19 | @app.route('/', methods=['GET']) 20 | def index(): 21 | return render_template('index_cafe.html') 22 | 23 | 24 | ''' 25 | Web API (資料來源: https://cafenomad.tw/) 26 | ''' 27 | # 臺北 cafe 28 | @app.route('/cafe_taipei', methods=['GET']) 29 | def get_cafe_info_in_taipei(): 30 | # 先對 咖啡廳 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 31 | # 其中回應帶有咖啡廳列表的資訊 32 | url = 'https://cafenomad.tw/api/v1.2/cafes/taipei' 33 | res = req.get(url) 34 | 35 | # 回傳回應 36 | return jsonify(res.json()) 37 | 38 | # 新竹 cafe 39 | @app.route('/cafe_hsinchu', methods=['GET']) 40 | def get_cafe_info_in_hsinchu(): 41 | # 先對 咖啡廳 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 42 | # 其中回應帶有咖啡廳列表的資訊 43 | url = 'https://cafenomad.tw/api/v1.2/cafes/hsinchu' 44 | res = req.get(url) 45 | 46 | # 回傳回應 47 | return jsonify(res.json()) 48 | 49 | # 高雄 cafe 50 | @app.route('/cafe_kaohsiung', methods=['GET']) 51 | def get_cafe_info_in_kaohsiung(): 52 | # 先對 咖啡廳 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 53 | # 其中回應帶有咖啡廳列表的資訊 54 | url = 'https://cafenomad.tw/api/v1.2/cafes/kaohsiung' 55 | res = req.get(url) 56 | 57 | # 回傳回應 58 | return jsonify(res.json()) 59 | 60 | 61 | 62 | ''' 63 | Web API (資料來源: https://data.taipei/dataset/detail?id=6bb3304b-4f46-4bb0-8cd1-60c66dcd1cae) 64 | ''' 65 | # 臺北市垃圾車點位路線資訊 (取得總筆數) 66 | @app.route('/count_garbage_trucks_in_taipei', methods=['GET']) 67 | def 
get_count_garbage_trucks_in_taipei(): 68 | # 先對 垃圾車點位路線資訊 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 69 | # 其中回應帶有垃圾車點位路線資訊列表的資訊 70 | 71 | '''取得資料總筆數''' 72 | url = f'https://data.taipei/api/v1/dataset/a6e90031-7ec4-4089-afb5-361a4efe7202?scope=resourceAquire&offset=0&limit=1' 73 | res = req.get(url) 74 | count = res.json()['result']['count'] 75 | 76 | # 回傳回應 77 | return jsonify({'count': count}) 78 | 79 | # 臺北市垃圾車點位路線資訊 (分頁) 80 | @app.route('/some_garbage_trucks_in_taipei', methods=['GET']) 81 | def get_some_garbage_trucks_in_taipei(): 82 | # 先對 垃圾車點位路線資訊 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 83 | # 其中回應帶有垃圾車點位路線資訊列表的資訊 84 | 85 | '''取得資料總筆數''' 86 | url = f'https://data.taipei/api/v1/dataset/a6e90031-7ec4-4089-afb5-361a4efe7202?scope=resourceAquire&offset=0&limit=1' 87 | res = req.get(url) 88 | count = res.json()['result']['count'] 89 | 90 | '''取得總頁數''' 91 | limit = 1000 # 希望一頁有幾筆 92 | pages = math.ceil(count / limit) # 取得總頁數 93 | 94 | '''取得 GET 請求的 page 值''' 95 | page = 1 96 | if 'page' in request.args: 97 | page = int(request.args.get('page')) 98 | if page < 1: page = 1 # 指定頁碼小於 1,則設定成 1 99 | elif page > pages: page = pages # 指定頁碼大於總頁數,則設定成總頁數 (代表最後一頁) 100 | 101 | '''取得指定分頁資料''' 102 | offset = (page - 1) * limit 103 | url = f'https://data.taipei/api/v1/dataset/a6e90031-7ec4-4089-afb5-361a4efe7202?scope=resourceAquire&offset={offset}&limit={limit}' 104 | res = req.get(url) 105 | 106 | # 回傳回應 107 | return jsonify(res.json()['result']['results']) 108 | 109 | # 臺北市垃圾車點位路線資訊 (全部) 110 | @app.route('/all_garbage_trucks_in_taipei', methods=['GET']) 111 | def get_all_garbage_trucks_in_taipei(): 112 | # 先對 垃圾車點位路線資訊 Web API 發送 GET 請求 (request),取得對方伺服器的回應 (response), 113 | # 其中回應帶有垃圾車點位路線資訊列表的資訊 114 | 115 | '''取得資料總筆數''' 116 | url = f'https://data.taipei/api/v1/dataset/a6e90031-7ec4-4089-afb5-361a4efe7202?scope=resourceAquire&offset=0&limit=1' 117 | res = req.get(url) 118 | count = res.json()['result']['count'] 119 | 120 | '''取得總頁數''' 121 | limit = 1000 
# 希望一頁有幾筆 122 | pages = math.ceil(count / limit) # 取得總頁數 123 | 124 | '''取得全部資料''' 125 | list_results = [] 126 | for page in range(1, pages + 1): 127 | offset = (page - 1) * limit 128 | url = f'https://data.taipei/api/v1/dataset/a6e90031-7ec4-4089-afb5-361a4efe7202?scope=resourceAquire&offset={offset}&limit={limit}' 129 | res = req.get(url) 130 | list_results.extend(res.json()['result']['results']) 131 | 132 | # 回傳回應 133 | return jsonify(list_results) 134 | 135 | 136 | 137 | ''' 138 | 主程式 139 | ''' 140 | if __name__ == '__main__': 141 | app.run( 142 | # 除錯模式為 True,服務執行期間有錯誤,會將 Traceback 顯示在網頁上, 143 | # 反之則顯示一般的 Internal Server Error 144 | debug=True, 145 | 146 | # 127.0.0.1 或 localhost 限定本機使用服務, 147 | # 0.0.0.0 代表所有知道主機實際 IP 的人都能存取 148 | host='127.0.0.1', 149 | 150 | # 網址或 IP 後面附加的 Port 號,代表服務由該 Port 號提供 151 | port=5000 152 | ) -------------------------------------------------------------------------------- /cases/line-stickers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests as req\n", 10 | "import os, json\n", 11 | "from bs4 import BeautifulSoup as bs\n", 12 | "\n", 13 | "# 建立儲存圖片的資料夾,不存在就新增\n", 14 | "folderPath = 'line_stickers'\n", 15 | "if not os.path.exists(folderPath):\n", 16 | " os.makedirs(folderPath)\n", 17 | " \n", 18 | "#放貼圖資訊用\n", 19 | "list_line_stickers = []\n", 20 | "\n", 21 | "# 官方 LINE 貼圖的網址\n", 22 | "url = 'https://store.line.me/stickershop/product/24329/zh-Hant'\n", 23 | "\n", 24 | "# 將自訂標頭加入 GET 請求中\n", 25 | "res = req.get(url)\n", 26 | "\n", 27 | "# 建立 soup 物件\n", 28 | "soup = bs(res.text, 'lxml')\n", 29 | "\n", 30 | "'''\n", 31 | "備註:\n", 32 | "1. soup.select():回傳的結果是元素集合(list 型態,BeautifulSoup ResultSet)\n", 33 | "2. 
soup.select_one():回傳的結果是單一元素(BeautifulSoup Result)\n", 34 | "'''\n", 35 | "# 取得放置貼圖的 li 元素 (list 型態)\n", 36 | "li_elements = soup.select(\"ul.mdCMN09Ul.FnStickerList > li.mdCMN09Li.FnStickerPreviewItem\")\n", 37 | "\n", 38 | "\n", 39 | "# 逐一取得 li 元素中的 data-preview 資訊\n", 40 | "for li in li_elements:\n", 41 | " # 取得 data-preview 屬性的值(字串)\n", 42 | " strJson = li['data-preview'] # 另一種寫法:li.get(\"data-preview\")\n", 43 | " \n", 44 | " #把屬性的值(字串)轉成物件 \n", 45 | " obj = json.loads(strJson)\n", 46 | " \n", 47 | " # 將重要資訊放置在 list 當中,幫助我們稍候進行資料下載與儲存\n", 48 | " list_line_stickers.append(obj)\n", 49 | "\n", 50 | "# 下載圖片\n", 51 | "'''\n", 52 | "範例指令:\n", 53 | "$ curl \"https://stickershop.line-scdn.net/stickershop/v1/sticker/658881986/android/sticker.png?v=1\" -o ./test.png\n", 54 | "'''\n", 55 | "for obj in list_line_stickers: \n", 56 | " os.system(f\"curl {obj['staticUrl']} -o {folderPath}/{obj['id']}.png\")\n", 57 | " print(f\"貼圖ID: {obj['id']}, 下載連結: {obj['staticUrl']}\")" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3 (ipykernel)", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.10.14" 78 | }, 79 | "vscode": { 80 | "interpreter": { 81 | "hash": "93b032fe74b295ac9f9c7e1fb2471d07c5e0ee3078e59d1cdc5fc80e32fb2057" 82 | } 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 4 87 | } 88 | -------------------------------------------------------------------------------- /cases/mouse_XY_colors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 匯入工具 3 | ''' 4 | import pyautogui 5 | 6 | ''' 7 | 請在 Terminal 當中執行 8 | 取得滑鼠座標和游標上的色碼,協助我們定位 9 | 補充: 必須在 Terminal 當中執行 10 | ''' 11 | 
pyautogui.displayMousePosition() -------------------------------------------------------------------------------- /cases/pymysql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "6f9c10d9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "!pip install pymysql" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "ddca9715", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "'''\n", 21 | "在資料庫的交易中,為確保交易(Transaction)是正確可靠的,所以必須具備四個特性:\n", 22 | "Atomicity 原子性 - 在資料庫的每一筆交易中只有兩種可能發生,第一種是全部完全(commit),第二種是全部不完成(rollback),\n", 23 | " 不會因為某個環節出錯,而終止在那個環節,在出錯之後會恢復至交易之前的狀態,如同還沒執行此筆交易。\n", 24 | "Consistency 一致性 - 在交易中會產生資料或者驗證狀態,然而當錯誤發生,所有已更改的資料或狀態將會恢復至交易之前。\n", 25 | "Isolation 隔離性 - 資料庫允許多筆交易同時進行,交易進行時未完成的交易資料並不會被其他交易使用,直到此筆交易完成。\n", 26 | "Durability 永續性 - 交易完成後對資料的修改是永久性的,資料不會因為系統重啟或錯誤而改變。\n", 27 | "'''" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "id": "e18723b0", 33 | "metadata": {}, 34 | "source": [ 35 | "![交易過程](https://i.imgur.com/r29XFgO.png \"交易過程\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "82561ad3", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "'''\n", 46 | "參考頁面:\n", 47 | "[1] PyMySQL Examples\n", 48 | "https://pymysql.readthedocs.io/en/latest/user/examples.html\n", 49 | "[2] Python+MySQL資料庫操作(PyMySQL)\n", 50 | "https://www.tw511.com/3/39/1388.html\n", 51 | "[3] Python資料庫學習筆記(四):使用PyMySQL模組\n", 52 | "https://reurl.cc/Q78eD2\n", 53 | "'''\n", 54 | "\n", 55 | "import pymysql\n", 56 | "\n", 57 | "# 資料庫連線\n", 58 | "connection = pymysql.connect(\n", 59 | " host = 'localhost',\n", 60 | " user = 'root',\n", 61 | " password = '',\n", 62 | " database = 'my_db',\n", 63 | " charset = 'utf8mb4',\n", 64 | " cursorclass=pymysql.cursors.DictCursor\n", 65 | ")\n", 66 | "\n", 67 | "# 取得 cursor 物件,進行 CRUD\n", 68 | 
"cursor = connection.cursor()\n", 69 | "\n", 70 | "try:\n", 71 | " # 寫入資料\n", 72 | " # sql = \"INSERT INTO `users` (`email`, `password`) VALUES (%s, %s)\"\n", 73 | " # cursor.execute(sql, ('webmaster@python.org', 'very-secret'))\n", 74 | "\n", 75 | " # 查詢資料\n", 76 | " sql = \"SELECT * FROM `users`\"\n", 77 | " cursor.execute(sql)\n", 78 | "\n", 79 | " # 查詢結果列數大於0 ,代表有資料\n", 80 | " if cursor.rowcount > 0:\n", 81 | " # 將查詢結果轉成 list 型態 (裡頭元素都是 dict)\n", 82 | " results = cursor.fetchall() # 如果 sql 語法明顯只取得一筆,則使用 fetchone()\n", 83 | " # 迭代取得資料 (dict 型態)\n", 84 | " for result in results:\n", 85 | " print(result)\n", 86 | " else:\n", 87 | " print(\"rowcount: 0\")\n", 88 | "\n", 89 | " # 提交 SQL 執行結果\n", 90 | " connection.commit()\n", 91 | "except Exception as e:\n", 92 | " # 回滾\n", 93 | " connection.rollback()\n", 94 | " print(\"SQL 執行失敗\")\n", 95 | " print(e)\n", 96 | "\n", 97 | "# 釋放 cursor\n", 98 | "cursor.close()\n", 99 | "\n", 100 | "# 關閉資料庫連線\n", 101 | "connection.close()" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.8.8" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 5 126 | } 127 | -------------------------------------------------------------------------------- /cases/read.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/cases/read.xlsx -------------------------------------------------------------------------------- /cases/sound/ebook/css/jquery.highlight-within-textarea.css: 
-------------------------------------------------------------------------------- 1 | .hwt-container { 2 | display: inline-block; 3 | position: relative; 4 | overflow: hidden !important; 5 | -webkit-text-size-adjust: none !important; 6 | } 7 | 8 | .hwt-backdrop { 9 | position: absolute !important; 10 | top: 0 !important; 11 | right: -99px !important; 12 | bottom: 0 !important; 13 | left: 0 !important; 14 | padding-right: 99px !important; 15 | overflow-x: hidden !important; 16 | overflow-y: auto !important; 17 | } 18 | 19 | .hwt-highlights { 20 | width: auto !important; 21 | height: auto !important; 22 | border-color: transparent !important; 23 | white-space: pre-wrap !important; 24 | word-wrap: break-word !important; 25 | color: transparent !important; 26 | overflow: hidden !important; 27 | } 28 | 29 | .hwt-input { 30 | display: block !important; 31 | position: relative !important; 32 | margin: 0; 33 | padding: 0; 34 | border-radius: 0; 35 | font: inherit; 36 | overflow-x: hidden !important; 37 | overflow-y: auto !important; 38 | } 39 | 40 | .hwt-content { 41 | border: 1px solid; 42 | background: none transparent !important; 43 | } 44 | 45 | .hwt-content mark { 46 | padding: 0 !important; 47 | color: inherit; 48 | } 49 | -------------------------------------------------------------------------------- /cases/sound/ebook/templates/fetch.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Document 7 | 8 | 9 | 10 | 11 | 25 | 26 | 27 | 28 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /cases/sound/ebook/tmp/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/cases/sound/ebook/tmp/.gitignore 
-------------------------------------------------------------------------------- /cases/sound/ebook/web_api.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 需要安裝 flask、flask-cors 3 | $ pip install -U Flask Flask-Cors 4 | 5 | 參考連結: 6 | [1] Flask 實現 CORS 跨域請求的方法 7 | https://medium.com/@charming_rust_oyster_221/flask-實現-cors-跨域請求的方法-c51b6e49a8b5 8 | ''' 9 | from flask import Flask, request, send_from_directory, render_template, jsonify 10 | from flask_cors import CORS 11 | import subprocess 12 | from urllib.parse import quote 13 | 14 | 15 | 16 | ''' 17 | Flask 初始化 18 | ''' 19 | app = Flask(__name__) 20 | app.json.ensure_ascii = False 21 | CORS(app) # 設定全域 CORS 22 | 23 | ''' 24 | Templates 25 | ''' 26 | # 套用網頁樣版(v1.0) 27 | @app.route('/', methods=['GET']) 28 | def index(): 29 | return render_template('fetch.html') 30 | 31 | ''' 32 | Web API (資料來源: Google Translation) 33 | ''' 34 | # 取得 google 小姐的聲音 35 | @app.route('/sound', methods=['POST']) 36 | def get_sound(): 37 | # 取得 POST data 38 | q = request.json['q'] 39 | 40 | # 放置回傳資訊的變數 41 | myDict = { 42 | 'success': False, 43 | 'link': None 44 | } 45 | 46 | # 轉成符合 url 格式的文字 47 | encoded_sentence = quote(q) 48 | 49 | # 翻譯的語言 (發音的口音) 50 | tl = 'zh-TW' 51 | 52 | # 取得聲音檔 53 | cmd = [ 54 | 'curl', 55 | '-X', 56 | 'GET', 57 | f'https://translate.google.com/translate_tts?ie=UTF-8&client=tw-ob&tl={tl}&q={encoded_sentence}', 58 | '-o', 59 | f'./tmp/{q}.mp3' 60 | ] 61 | 62 | # 執行指令,回傳 Process 物件,其中的屬性 returncode == 0 代表成功 63 | std_output = subprocess.run(cmd) 64 | if std_output.returncode == 0: 65 | myDict['success'] = True 66 | myDict['link'] = f'/tmp/{q}.mp3' 67 | 68 | # 回傳回應 69 | return jsonify(myDict) 70 | 71 | 72 | 73 | ''' 檔案路徑 ''' 74 | # JS 資料夾 75 | @app.route('/js/') 76 | def get_js_path(path): 77 | return send_from_directory('js', path) 78 | 79 | # CSS 資料夾 80 | @app.route('/css/') 81 | def get_css_path(path): 82 | return send_from_directory('css', path) 83 | 84 | # 暫存檔案路徑 85 | 
@app.route('/tmp/') 86 | def get_tmp_path(path): 87 | return send_from_directory('tmp', path) 88 | 89 | 90 | 91 | ''' 92 | 主程式 93 | ''' 94 | if __name__ == '__main__': 95 | app.run( 96 | # 除錯模式為 True,服務執行期間有錯誤,會將 Traceback 顯示在網頁上, 97 | # 反之則顯示一般的 Internal Server Error 98 | debug=True, 99 | 100 | # 127.0.0.1 或 localhost 限定本機使用服務, 101 | # 0.0.0.0 代表所有知道主機實際 IP 的人都能存取 102 | host='0.0.0.0', 103 | 104 | # 網址或 IP 後面附加的 Port 號,代表服務由該 Port 號提供 105 | port=5000 106 | ) -------------------------------------------------------------------------------- /cases/sound/google_lady.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 下載 google 小姐的聲音 3 | 4 | Web API 網址: 5 | https://translate.google.com/translate_tts?ie=UTF-8&client=tw-ob&tl=zh-TW&q=你的自訂文字 6 | 7 | 將文字進行 url encode 網頁: 8 | https://www.onlinewebtoolkit.com/url-encode-decode 9 | 10 | 在 Terminal 使用 curl 指令下載 mp3: 11 | curl -X GET "https://translate.google.com/translate_tts?ie=UTF-8&client=tw-ob&tl=zh-TW&q=%E4%BD%A0%E7%9A%84%E8%87%AA%E8%A8%82%E6%96%87%E5%AD%97" -o ./test.mp3 12 | 13 | tl=zh-TW 的其它設定: 14 | [1] Understanding the Significance of Query Parameters in Google Translate 15 | https://copyprogramming.com/howto/what-is-the-meaning-of-google-translate-query-params 16 | [2] Google translate language codes 17 | https://gist.github.com/JT5D/a2fdfefa80124a06f5a9 18 | ''' 19 | import subprocess 20 | from urllib.parse import quote 21 | import os 22 | 23 | 24 | ''' 25 | 測試單句下載 mp3 26 | ''' 27 | # 設定給 google 小姐發音的文字 28 | q = '我的優點就是帥,缺點就是帥得不明顯' 29 | 30 | # 轉成符合 url 格式的文字 31 | encoded_sentence = quote(q) 32 | 33 | # 定義指令 34 | cmd = [ 35 | 'curl', 36 | '-X', 37 | 'GET', 38 | f'https://translate.google.com/translate_tts?ie=UTF-8&client=tw-ob&tl=zh-TW&q={encoded_sentence}', 39 | '-o', 40 | f'./{q}.mp3' 41 | ] 42 | 43 | # 執行指令,回傳 Process 物件,其中的屬性 returncode == 0 代表成功 44 | std_output = subprocess.run(cmd) 45 | if std_output.returncode == 0: 46 | print(f'[{q}] 下載成功') 47 | else: 48 | 
print(f'[{q}] 下載失敗') 49 | 50 | 51 | 52 | ''' 53 | 將 google 小姐的聲音加速 - 使用 ffmpeg 54 | 55 | 參考網頁: 56 | https://ffmpeg.org/download.html 57 | 58 | 下載工具: 59 | - Windows 10: https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip 60 | - MacOS: https://evermeet.cx/ffmpeg/ffmpeg-112875-g47e214245b.zip 61 | 62 | Windows 說明: 63 | 1. 下載 zip 檔,解壓縮到專案目錄 (python_basics) 底下。 64 | 2. 如果解壓縮的目錄叫作「ffmpeg-6.1-essentials_build」,請改成「ffmpeg」。 65 | 3. ffmpeg 資料夾裡面有個 bin 資料夾,裡面的 ffmpeg.exe 是主要的轉檔程式。 66 | 67 | MacOS 說明: 68 | 1. 下載 zip 檔,解壓縮後,會直接看到 ffmpeg 這個檔案。 69 | 2. 給它可以執行的權限,例如在 Terminal 裡面對它輸入「chmod +x ffmpeg」。 70 | 71 | 參考指令: 72 | - Windows 10: .\ffmpeg\bin\ffmpeg.exe -i test.mp3 -filter:a "atempo=1.5" test_atempo.mp3 73 | - MacOS: ./ffmpeg -i test.mp3 -filter:a "atempo=1.5" test_atempo.mp3 74 | ''' 75 | cmd = [ 76 | './ffmpeg/bin/ffmpeg.exe', # 左邊是 Windows 指令。MacOS: ./ffmpeg 77 | '-i', 78 | f'./{q}.mp3', 79 | '-filter:a', 80 | 'atempo=1.5', # 'asetrate=44100*0.4,atempo=1.5' 81 | f'./{q}_atempo.mp3' 82 | ] 83 | std_output = subprocess.run(cmd) 84 | if std_output.returncode == 0: 85 | print(f'[{q}_atempo] 轉換成功') 86 | else: 87 | print(f'[{q}_atempo] 轉換失敗') 88 | 89 | 90 | 91 | 92 | 93 | ''' 94 | 多句下載 95 | ''' 96 | # 如果沒有自訂的資料夾,則自動新增 97 | if not os.path.exists('mp3'): 98 | os.makedirs('mp3') 99 | 100 | # 設定多個給 google 小姐發音的句子 101 | list_words = [ 102 | '人生短短幾個秋啊', 103 | '不醉不罷休', 104 | '東邊我的美人哪', 105 | '西邊黃河流' 106 | ] 107 | 108 | # 把每一句都下載成 mp3 109 | for index, q in enumerate(list_words): 110 | # 轉成符合 url 格式的文字 111 | encoded_sentence = quote(q) 112 | 113 | # 定義指令 114 | cmd = [ 115 | 'curl', 116 | '-X', 117 | 'GET', 118 | f'https://translate.google.com/translate_tts?ie=UTF-8&client=tw-ob&tl=zh-TW&q={encoded_sentence}', 119 | '-o', 120 | f'./mp3/{index}.mp3' 121 | ] 122 | 123 | # 執行指令,回傳 Process 物件,其中的屬性 returncode == 0 代表成功 124 | std_output = subprocess.run(cmd) 125 | if std_output.returncode == 0: 126 | print(f'[{index}] 下載成功') 127 | else: 128 | print(f'[{index}] 下載失敗') 129 | 130 | 
cmd = [ 131 | './ffmpeg/bin/ffmpeg.exe', 132 | '-i', 133 | f'./mp3/{index}.mp3', 134 | '-filter:a', 135 | 'atempo=1.5', # 'asetrate=44100*0.4,atempo=1.5' 136 | f'./mp3/{index}_atempo.mp3' 137 | ] 138 | std_output = subprocess.run(cmd) 139 | if std_output.returncode == 0: 140 | print(f'[{index}] 轉換成功') 141 | else: 142 | print(f'[{index}] 轉換失敗') -------------------------------------------------------------------------------- /cases/tabs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "f4a081b8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "'''\n", 11 | "匯入套件\n", 12 | "'''\n", 13 | "# 操作 browser 的 API\n", 14 | "from selenium import webdriver\n", 15 | "from selenium.webdriver.chrome.service import Service\n", 16 | "\n", 17 | "# ChromeDriver 的下載管理工具\n", 18 | "from webdriver_manager.chrome import ChromeDriverManager\n", 19 | "\n", 20 | "# 處理逾時例外的工具\n", 21 | "from selenium.common.exceptions import TimeoutException\n", 22 | "\n", 23 | "# 面對動態網頁,等待某個元素出現的工具,通常與 expected_conditions 搭配\n", 24 | "from selenium.webdriver.support.ui import WebDriverWait\n", 25 | "\n", 26 | "# 搭配 WebDriverWait 使用,對元素狀態的一種期待條件,若條件發生,則等待結束,往下一行執行\n", 27 | "from selenium.webdriver.support import expected_conditions as EC\n", 28 | "\n", 29 | "# 期待元素出現要透過什麼方式指定,通常與 EC、WebDriverWait 一起使用\n", 30 | "from selenium.webdriver.common.by import By\n", 31 | "\n", 32 | "\n", 33 | "# 啟動瀏覽器工具的選項\n", 34 | "my_options = webdriver.ChromeOptions()\n", 35 | "# my_options.add_argument(\"--headless\") #不開啟實體瀏覽器背景執行\n", 36 | "my_options.add_argument(\"--start-maximized\") #最大化視窗\n", 37 | "my_options.add_argument(\"--incognito\") #開啟無痕模式\n", 38 | "my_options.add_argument(\"--disable-popup-blocking\") #禁用彈出攔截\n", 39 | "my_options.add_argument(\"--disable-notifications\") #取消 chrome 推播通知\n", 40 | "my_options.add_argument(\"--lang=zh-TW\") #設定為正體中文\n", 41 | "\n", 42 | "# 使用 
Chrome 的 WebDriver\n", 43 | "driver = webdriver.Chrome(\n", 44 | " options = my_options\n", 45 | ")\n", 46 | "\n", 47 | "# 儲存資料的變數\n", 48 | "list_data = []" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "a74f4a97", 54 | "metadata": {}, 55 | "source": [ 56 | "---\n", 57 | "# 建立瀏覽器分頁: window.open\n", 58 | "- [window.open()](https://www.w3schools.com/jsref/met_win_open.asp)\n", 59 | "- [東吳大學中文學報 - 各期全文下載總覽列表](https://web-ch.scu.edu.tw/chinese/file/3423)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "b110643d", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# 連續開 2 個分頁\n", 70 | "for i in range(2):\n", 71 | " driver.execute_script(f'window.open(\"\");')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "bf9a5c51", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# 切換到初始分頁 (索引為 0)\n", 82 | "driver.switch_to.window( driver.window_handles[0] )" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "4f44a1cc", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# 將所有 tabs 轉址,以便取得對應列表\n", 93 | "for index in range(1, len(driver.window_handles)):\n", 94 | " # 切換分頁\n", 95 | " driver.switch_to.window(\n", 96 | " driver.window_handles[index]\n", 97 | " )\n", 98 | " \n", 99 | " # 使分頁自動連結到指定網址 (此時的 driver 變數指向切換後的分頁)\n", 100 | " driver.get(f\"https://web-ch.scu.edu.tw/chinese/file/3423?page={index}\")\n", 101 | "\n", 102 | " # 取得列表連結與內文\n", 103 | " for a in driver.find_elements(By.CSS_SELECTOR, 'table.table.table-striped.table-border tbody tr a'):\n", 104 | " list_data.append({\n", 105 | " 'title': a.get_attribute('innerText'),\n", 106 | " 'link': a.get_attribute('href')\n", 107 | " })" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "2415b152", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 從最後一頁開始,把所有分頁關掉 (初始頁要保留)\n", 118 | "while 
len(driver.window_handles) > 1:\n", 119 | " # 切換分頁\n", 120 | " driver.switch_to.window( \n", 121 | " driver.window_handles[ len(driver.window_handles) - 1 ] \n", 122 | " )\n", 123 | " \n", 124 | " # 關閉分頁 (與 driver.quit() 不同)\n", 125 | " driver.close()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "ddb395da", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# 關閉瀏覽器\n", 136 | "driver.quit()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "51efacaf", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# 預覽結果\n", 147 | "list_data" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python3@da", 154 | "language": "python", 155 | "name": "da" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.10.10" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /cases/twse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bbef53ae", 6 | "metadata": {}, 7 | "source": [ 8 | "# 下拉式選單\n", 9 | "- 例如 sel_element = Select(driver.find_element(By.CSS_SELECTOR, '選擇器字串'))\n", 10 | " - 透過 option 的內文來選擇\n", 11 | " - sel_element.select_by_visible_text('民國 100 年')\n", 12 | " - 透過 option 的 value 屬性所設定值來選擇\n", 13 | " - sel_element.select_by_value('2')\n", 14 | " - 透過 option 的順序索引 (從 0 開始,類似陣列的索引概念) 來選擇\n", 15 | " - sel_element.select_by_index(8)\n", 16 | "\n", 17 | "\n", 18 | "# 擷圖\n", 19 | "- driver.save_screenshot('/path/圖片存放路徑.png')" 20 | ] 21 | }, 22 | { 23 | "cell_type": 
"code", 24 | "execution_count": null, 25 | "id": "47b612bd", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "'''\n", 30 | "臺灣證券交易所\n", 31 | "外資及陸資買賣超彙總表\n", 32 | "https://www.twse.com.tw/zh/page/trading/fund/TWT38U.html\n", 33 | "\n", 34 | "目標:\n", 35 | "整合下拉式選單與元素的定位與操控,來下載交易資料,並擷圖\n", 36 | "'''\n", 37 | "\n", 38 | "'''\n", 39 | "匯入套件\n", 40 | "'''\n", 41 | "# 操作 browser 的 驅動程式\n", 42 | "from selenium import webdriver\n", 43 | "\n", 44 | "# 期待元素出現要透過什麼方式指定,經常與 EC、WebDriverWait 一起使用\n", 45 | "from selenium.webdriver.common.by import By\n", 46 | "\n", 47 | "# 強制停止/強制等待 (程式執行期間休息一下)\n", 48 | "from time import sleep\n", 49 | "\n", 50 | "# 建立資料夾與執行檔案相關操作\n", 51 | "import os\n", 52 | "\n", 53 | "# 處理下拉式選單的工具\n", 54 | "from selenium.webdriver.support.ui import Select\n", 55 | "\n", 56 | "'''\n", 57 | "[1] Selenium with Python 中文翻譯文檔\n", 58 | "參考網頁:https://selenium-python-zh.readthedocs.io/en/latest/index.html\n", 59 | "[2] Selenium with Python\n", 60 | "https://selenium-python.readthedocs.io/\n", 61 | "[3] selenium 啓動 Chrome 的進階配置參數\n", 62 | "參考網址:https://stackoverflow.max-everyday.com/2019/12/selenium-chrome-options/\n", 63 | "[4] How to select a drop-down menu value with Selenium using Python?\n", 64 | "參考網址:https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python\n", 65 | "'''\n", 66 | "\n", 67 | "# 啟動瀏覽器工具的選項\n", 68 | "my_options = webdriver.ChromeOptions()\n", 69 | "# my_options.add_argument(\"--headless\") # 不開啟實體瀏覽器背景執行\n", 70 | "my_options.add_argument(\"--start-maximized\") # 最大化視窗\n", 71 | "my_options.add_argument(\"--incognito\") # 開啟無痕模式\n", 72 | "my_options.add_argument(\"--disable-popup-blocking\") # 禁用彈出攔截\n", 73 | "my_options.add_argument(\"--disable-notifications\") # 取消通知\n", 74 | "\n", 75 | "# 建立下載路徑/資料夾,不存在就新增 (os.getcwd() 會取得當前的程式工作目錄)\n", 76 | "folderPath = os.path.join(os.getcwd(), 'files')\n", 77 | "if not os.path.exists(folderPath):\n", 78 | " os.makedirs(folderPath)\n", 79 | " 
\n", 80 | "# 自訂下載路徑 (不會詢問下載位置)\n", 81 | "my_options.add_experimental_option(\"prefs\", {\n", 82 | " \"download.default_directory\": folderPath,\n", 83 | " \"profile.default_content_settings.popups\": 0,\n", 84 | " \"download.prompt_for_download\": False,\n", 85 | " # \"download.directory_upgrade\": True,\n", 86 | " # \"safebrowsing_for_trusted_sources_enabled\": False,\n", 87 | " # \"safebrowsing.enabled\": False,\n", 88 | " # \"plugins.always_open_pdf_externally\": True\n", 89 | "})\n", 90 | "\n", 91 | "# 使用 Chrome 的 WebDriver\n", 92 | "driver = webdriver.Chrome(\n", 93 | " options = my_options\n", 94 | ")\n", 95 | "\n", 96 | "# 走訪網址\n", 97 | "url = 'https://www.twse.com.tw/zh/page/trading/fund/TWT38U.html'" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 2, 103 | "id": "4c81f7bd", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# 走訪頁面\n", 108 | "def visit():\n", 109 | " driver.get(url)\n", 110 | "\n", 111 | "# 選取下拉式選單的項目\n", 112 | "def setDropDownMenu(year, value, index):\n", 113 | " # 強制等待\n", 114 | " sleep(1)\n", 115 | "\n", 116 | " # 選擇 select[name=\"yy\"] 元素,並依 option 的 innerText 來進行選取\n", 117 | " yy = Select(driver.find_element(By.CSS_SELECTOR, 'select[name=yy]'))\n", 118 | " yy.select_by_visible_text(f'民國 {year} 年')\n", 119 | "\n", 120 | " # 選擇 select[name=\"mm\"] 元素,並依 option 的 value 來進行選取\n", 121 | " mm = Select(driver.find_element(By.CSS_SELECTOR, 'select[name=mm]'))\n", 122 | " mm.select_by_value(str(value))\n", 123 | "\n", 124 | " # 選擇 select[name=\"dd\"] 元素,並依 option 的 index 來進行選取\n", 125 | " dd = Select(driver.find_element(By.CSS_SELECTOR, 'select[name=dd]'))\n", 126 | " dd.select_by_index(index)\n", 127 | "\n", 128 | " # 按下查詢\n", 129 | " driver.find_element(\n", 130 | " By.CSS_SELECTOR, \n", 131 | " 'div.submit'\n", 132 | " ).click()\n", 133 | " \n", 134 | "# 下載檔案\n", 135 | "def download(year, value, index):\n", 136 | " # 下載 csv\n", 137 | " year = 1911 + year\n", 138 | " value = '0' + str(value) if 
value < 10 else str(value)\n", 139 | " index = '0' + str(index + 1) if (index + 1) < 10 else str(index + 1)\n", 140 | " date = f'{year}{value}{index}'\n", 141 | " os.system(f'curl \"https://www.twse.com.tw/rwd/zh/fund/TWT38U?date={date}&response=csv\" -o {folderPath}/{date}.csv')\n", 142 | " \n", 143 | " # 擷圖\n", 144 | " driver.save_screenshot(f\"{folderPath}/{date}.png\")\n", 145 | "\n", 146 | "# 關閉瀏覽器\n", 147 | "def close():\n", 148 | " driver.quit()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "c1e0acec", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# 走訪頁面\n", 159 | "visit()\n", 160 | "\n", 161 | "# 指定年、月、日,檢視查詢結果,並下載 csv\n", 162 | "year = 100\n", 163 | "value = 2\n", 164 | "index = 8\n", 165 | "setDropDownMenu(year, value, index)\n", 166 | "download(year, value, index)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 37, 172 | "id": "3679d4a5", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# 關閉瀏覽器\n", 177 | "close()" 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "test", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.10.14" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 5 202 | } 203 | -------------------------------------------------------------------------------- /html/HTML_CSS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/HTML_CSS.zip -------------------------------------------------------------------------------- /html/README.md: 
-------------------------------------------------------------------------------- 1 | # HTML & CSS 檔案資料來源 2 | PAPAYA 電腦教室 3 | - [成為網頁設計師的第一步!快速上手 HTML & CSS 展開你的網頁設計之旅!](https://www.youtube.com/watch?v=6HHN0G2cwBM) 4 | - 請給該創作者一個讚! -------------------------------------------------------------------------------- /html/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/.DS_Store -------------------------------------------------------------------------------- /html/images/Shop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/Shop.png -------------------------------------------------------------------------------- /html/images/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/banner.png -------------------------------------------------------------------------------- /html/images/p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/p1.png -------------------------------------------------------------------------------- /html/images/p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/p2.png -------------------------------------------------------------------------------- /html/images/p3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/html/images/p3.png -------------------------------------------------------------------------------- /html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 烘培工坊 8 | 9 | 10 | 11 | 12 |
13 | 14 |

烘培工坊

15 | 23 |
24 | 25 |
26 |
27 |

健康穀物麵包新鮮出爐!

28 |

亞麻的細語、葵花的微風,每一口都是自然的敘事

29 |

瞭解更多

30 |
31 | 32 |
33 |
" 嚴選天然食材,精湛匠心烘培,
讓幸福在每口麵包中的香氣中蔓延 "
34 |
35 | 36 |
37 | 烘培工坊大安店 38 |
39 |

8月1日大安旗艦店隆重開張!

40 |

享受花園庭院的悠閒時光,內用座位區美食相伴,
41 | 即日起來店消費加 LINE 好友可獲精緻小點心,等您一同品味!

42 |
43 |
44 | 45 |
46 |
47 | 巴黎風法國土司 48 |

巴黎風法國土司

49 |

外皮金黃酥脆,內質鬆軟
讓您體驗法式浪漫的味覺享受

50 |
51 |
52 | 皇家奶油可頌麵包 53 |

皇家奶油可頌麵包

54 |

酥脆口感搭配奶油的濃郁香氣
品嘗簡單而純粹的美味

55 |
56 |
57 | 健康營養全麥土司 58 |

健康營養全麥土司

59 |

全麥製成,口感柔韌紮實
天然養分健康首選

60 |
61 |
62 | 63 | 70 |
71 | 72 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /html/style.css: -------------------------------------------------------------------------------- 1 | * { 2 | padding:0; 3 | margin:0; 4 | } 5 | body { 6 | background-color: #000000; 7 | } 8 | header { 9 | background-color:rgba(35,28,26,0.8); 10 | height:80px; 11 | width:100%; 12 | position:fixed; 13 | } 14 | h1 { 15 | color:white; 16 | position:absolute; 17 | left:120px; 18 | top:0; 19 | line-height: 80px; 20 | background-image:url(images/logo.svg) ; 21 | background-repeat: no-repeat; 22 | background-position: center; 23 | width:210px; 24 | text-indent: -9999px; 25 | } 26 | header ul { 27 | position:absolute; 28 | right:5vw; 29 | top:0; 30 | line-height:80px; 31 | } 32 | header li { 33 | display:inline; 34 | margin-right:4vw; 35 | } 36 | header a { 37 | color:white; 38 | text-decoration: none; 39 | } 40 | header a:hover { 41 | text-decoration: underline; 42 | } 43 | .news { 44 | background-color: antiquewhite; 45 | color:white; 46 | height:95vh; 47 | background-image: url("images/banner.png"); 48 | background-repeat: no-repeat; 49 | background-size:cover; 50 | background-position: center; 51 | display:flex; 52 | flex-direction:column; 53 | align-items: center; 54 | justify-content: center; 55 | } 56 | .news h2 { 57 | font-size:60px; 58 | margin-top:60px; 59 | } 60 | .news p { 61 | color:#FEF7E6; 62 | font-size:18px; 63 | margin:25px 0; 64 | } 65 | .news a { 66 | color:white; 67 | text-decoration: none; 68 | border: 1px solid #ACACAC; 69 | padding:10px 20px; 70 | border-radius: 5px; 71 | } 72 | .slogan { 73 | background-color:#485652; 74 | color:white; 75 | height:250px; 76 | display:flex; 77 | justify-content: center; 78 | align-items: center; 79 | font-size: 20px; 80 | line-height: 1.8em; 81 | } 82 | .shop { 83 | display:flex; 84 | } 85 | .shop img { 86 | width:50%; 87 | } 88 | .info { 89 | width:50%; 90 | background-color: white; 91 | display:flex; 92 | 
flex-direction:column; 93 | align-items: center; 94 | justify-content: center; 95 | } 96 | .info h2 { 97 | font-size:40px; 98 | margin-bottom: 30px; 99 | } 100 | .info p { 101 | text-align: center; 102 | line-height: 2em; 103 | } 104 | .product { 105 | display:flex; 106 | background-color: #ede9db; 107 | justify-content: space-between; 108 | align-items: center; 109 | padding: 100px 180px 80px; 110 | } 111 | .product div { 112 | width:300px; 113 | text-align:center; 114 | } 115 | .product img { 116 | width:100%; 117 | border-radius: 10px;; 118 | } 119 | .product h3 { 120 | font-size: 20px; 121 | margin: 20px 0; 122 | } 123 | .product p { 124 | font-size:14px; 125 | line-height: 1.6em; 126 | } 127 | .newsletter { 128 | background-color:#485652; 129 | color:white; 130 | padding:60px 0; 131 | display:flex; 132 | flex-direction: column; 133 | align-items: center; 134 | } 135 | .newsletter p { 136 | margin-bottom: 20px; 137 | } 138 | .newsletter input, .newsletter button { 139 | background-color: transparent; 140 | color:white; 141 | border: 1px solid #949d9a; 142 | padding: 10px; 143 | border-radius: 5px;; 144 | } 145 | .newsletter input { 146 | width:200px; 147 | margin-right: 15px;; 148 | } 149 | .newsletter button { 150 | width:80px; 151 | cursor:pointer; 152 | } 153 | input::placeholder { 154 | color:#DDDDDD; 155 | } 156 | input:focus { 157 | outline: none; /* 移除焦點效果 */ 158 | border-color:#E0E9A3; /* 焦點時的邊框顏色 */ 159 | } 160 | footer { 161 | background-color:#000000; 162 | color:#B7B7B7; 163 | height:60px; 164 | display:flex; 165 | justify-content: center; 166 | align-items: center; 167 | font-size: 14px; 168 | } 169 | .menu { 170 | display:none; 171 | } 172 | /* 響應式設計語法 */ 173 | @media screen and (max-width: 768px) { 174 | header ul { 175 | display:none; 176 | } 177 | header h1 { 178 | left:50%; 179 | transform: translateX(-50%); 180 | } 181 | .news h2 { 182 | font-size:40px; 183 | } 184 | .menu { 185 | display:block; 186 | background-color: transparent; 187 | 
color:white; 188 | font-size: 35px; 189 | position:absolute; 190 | top:15px; 191 | left:20px; 192 | border:none; 193 | cursor: pointer; 194 | } 195 | .shop { 196 | flex-direction: column; 197 | } 198 | .shop img { 199 | width:100%; 200 | } 201 | .info { 202 | width:100%; 203 | padding:40px 0; 204 | } 205 | .info h2 { 206 | font-size:30px; 207 | } 208 | .product { 209 | flex-direction: column; 210 | padding: 60px 0; 211 | } 212 | .product div { 213 | margin-bottom: 30px; 214 | } 215 | } -------------------------------------------------------------------------------- /python_web_scraping.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/python_web_scraping.docx -------------------------------------------------------------------------------- /python_web_scraping.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/python_web_scraping.pdf -------------------------------------------------------------------------------- /turingcerts.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/turingcerts.jpg -------------------------------------------------------------------------------- /yt-dlp_and_ffmpeg.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/telunyang/python_web_scraping/22e2408b9c7d745e9fd5a6ccf8a2f76c2aa68df1/yt-dlp_and_ffmpeg.docx --------------------------------------------------------------------------------