├── 104_job_search.py
├── README.md
├── accupass_search.py
├── concords.py
├── google_real_time_news.py
├── google_search.py
├── lottery.py
├── materialInfo.py
├── stockPrice.py
└── taifex_tx.py


/104_job_search.py:
--------------------------------------------------------------------------------
  1 | # 104人力銀行爬蟲程式碼
  2 | # 2021/02/08 蘇彥庭
  3 | import time
  4 | import datetime
  5 | import requests
  6 | import re
  7 | import json
  8 | import pandas as pd
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | # 確認是否有正常連線
 12 | def CheckConnect(url, headers):
 13 |     try:
 14 |         response = requests.get(url, headers=headers)
 15 |         checkSuccess = True
 16 |         return response, checkSuccess
 17 |     except Exception as e:
 18 |         print('下載失敗!')
 19 |         response = None
 20 |         checkSuccess = False
 21 |         return response, checkSuccess
 22 | 
 23 | 
 24 | # 爬蟲參數設定
 25 | # 搜尋關鍵詞
 26 | keyword = '數據分析'
 27 | # 搜尋最大頁數
 28 | maxPage = 100
 29 | 
 30 | # 迴圈搜尋結果頁數
 31 | outputDf = pd.DataFrame()
 32 | for page in range(1, maxPage+1):
 33 | 
 34 |     # 設定header
 35 |     headers = {
 36 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
 37 |                       '(KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36'
 38 |     }
 39 | 
 40 |     # 目標網址
 41 |     url = 'https://www.104.com.tw/jobs/search/?ro=0&isnew=30&keyword=' + keyword + \
 42 |           '&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&order=14&asc=0&s9=1&page=' + \
 43 |           str(page) + '&mode=s&jobsource=2018indexpoc'
 44 | 
 45 |     # 網址重要參數
 46 |     # keyword: 搜尋關鍵字
 47 |     # isnew: 更新日期 / 例如: 本日最新=0 二週內=14 一個月內=30
 48 |     # s9:上班時段 / 例如: 日班=1 晚班=2
 49 |     # page: 搜尋結果第N頁
 50 |     # 此處搜尋條件設定為: 最近一個月內 日班
 51 | 
 52 |     # 取得網頁資料
 53 |     # 防呆機制
 54 |     checkSuccess = False
 55 |     tryNums = 0
 56 |     while not checkSuccess:
 57 |         response, checkSuccess = CheckConnect(url, headers)
 58 |         if not checkSuccess:   # 若爬取失敗 則暫停120秒
 59 |             if tryNums == 5:   # 若已重新爬取累計5次 則放棄此次程式執行
 60 |                 break
 61 |             tryNums += 1
 62 |             print('本次下載失敗 程式暫停120秒')
 63 |             time.sleep(120)
 64 | 
 65 |     # 防呆機制: 若累積爬取資料失敗 則終止此次程式
 66 |     if tryNums == 5:
 67 |         print('下載失敗次數累積5次 結束程式')
 68 |         break
 69 | 
 70 |     # 確認是否已查詢到底
 71 |     if '搜尋條件無符合工作機會' in response.text:
 72 |         print('搜尋結果已到底 無工作職缺資訊可下載 爬蟲終止!')
 73 |         break
 74 | 
 75 |     # 轉為soup格式
 76 |     soup = BeautifulSoup(response.text, 'html.parser')
 77 | 
 78 |     # 取得搜尋返回結果
 79 |     jobList = soup.select('article.b-block--top-bord')
 80 |     # 取得職缺公布時間
 81 |     jobAnnounceDate = [elem.select('span.b-tit__date')[0].text.replace('\n', '').strip() for elem in jobList]
 82 |     # 取得職缺名稱
 83 |     jobTitles = [elem.select('a.js-job-link')[0].text for elem in jobList]
 84 |     # 取得職缺公司名稱
 85 |     jobCompanyName = [elem.select('a')[1].text.replace('\n', '').strip() for elem in jobList]
 86 |     # 取得職缺公司頁面資訊連結
 87 |     jobCompanyUrl = ['https:' + elem.select('a')[1]['href'] for elem in jobList]
 88 |     # 取得職缺公司所屬產業類別
 89 |     jobCompanyIndustry = [elem.select('li')[2].text for elem in jobList]
 90 |     # 取得待遇資訊
 91 |     jobSalary = [elem.select('div.job-list-tag.b-content')[0].select('span')[0].text for elem in jobList]
 92 | 
 93 |     # 整理其他工作資訊(工作地點, 年資要求, 學歷要求)
 94 |     jobOtherInfo = [elem.select('ul.b-list-inline.b-clearfix.job-list-intro.b-content')[0] for elem in jobList]
 95 |     # 取得工作地點
 96 |     jobLocation = [elem.select('li')[0].text for elem in jobOtherInfo]
 97 |     # 取得年資要求
 98 |     jobRqYear = [elem.select('li')[1].text for elem in jobOtherInfo]
 99 |     # 取得學歷要求
100 |     jobRqEducation = [elem.select('li')[2].text for elem in jobOtherInfo]
101 | 
102 |     # 取得職缺網址資訊
103 |     jobDetailUrl = ['https:' + elem.select('a')[0]['href'] for elem in jobList]
104 | 
105 |     # 迴圈職缺網址資訊取得更詳細資訊
106 |     jobContent = list()
107 |     jobCategory = list()
108 |     jobRqDepartment = list()
109 |     jobSpecialty = list()
110 |     jobOthers = list()
111 |     for i, iJobDetailUrl in enumerate(jobDetailUrl):
112 | 
113 |         print('目前正在爬取第' + str(page) + '頁連結，當前頁面連結下載進度: ' + str(i+1) + ' / ' + str(len(jobDetailUrl)))
114 | 
115 |         # 詳細資訊需透過額外的ajax爬取
116 |         iUrl = 'https://www.104.com.tw/job/ajax/content/' + re.search('job/(.*)\?', iJobDetailUrl).group(1)
117 | 
118 |         # 設定header
119 |         headers = {
120 |             'Referer': iJobDetailUrl,
121 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
122 |                           '(KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36'
123 |         }
124 | 
125 |         # 取得網頁資料
126 |         # 防呆機制
127 |         checkSuccess = False
128 |         tryNums = 0
129 |         while not checkSuccess:
130 |             response, checkSuccess = CheckConnect(iUrl, headers)
131 |             if not checkSuccess:  # 若爬取失敗 則暫停120秒
132 |                 if tryNums == 5:  # 若已重新爬取累計5次 則放棄此次程式執行
133 |                     break
134 |                 tryNums += 1
135 |                 print('本次下載失敗 程式暫停120秒')
136 |                 time.sleep(120)
137 | 
138 |         # 防呆機制: 若累積爬取資料失敗 則終止此次程式
139 |         if tryNums == 5:
140 |             print('下載失敗次數累積5次 結束程式')
141 |             break
142 | 
143 |         # 取得網頁資料
144 |         response = response.json()
145 | 
146 |         # 判斷是否有error: 職務不存在
147 |         if response.get('error'):
148 | 
149 |             jobContent.append('')
150 |             jobCategory.append('')
151 |             jobRqDepartment.append('')
152 |             jobSpecialty.append('')
153 |             jobOthers.append('')
154 | 
155 |         else:
156 | 
157 |             # 取得工作內容
158 |             jobContent.append(response['data']['jobDetail']['jobDescription'])
159 |             # 取得職務類別
160 |             jobCategory.append(','.join([elem['description'] for elem in response['data']['jobDetail']['jobCategory']]))
161 |             # 取得科系要求
162 |             jobRqDepartment.append(','.join(response['data']['condition']['major']))
163 |             # 取得擅長工具
164 |             jobSpecialty.append(','.join([elem['description'] for elem in response['data']['condition']['specialty']]))
165 |             # 取得其他條件
166 |             jobOthers.append(response['data']['condition']['other'])
167 | 
168 |         # 暫停秒數避免爬太快
169 |         time.sleep(3)
170 | 
171 |     # 組合資訊成資料表並儲存
172 |     iOutputDf = pd.DataFrame({'jobAnnounceDate': jobAnnounceDate,
173 |                               'jobTitles': jobTitles,
174 |                               'jobCompanyName': jobCompanyName,
175 |                               'jobCompanyUrl': jobCompanyUrl,
176 |                               'jobCompanyIndustry': jobCompanyIndustry,
177 |                               'jobContent': jobContent,
178 |                               'jobCategory': jobCategory,
179 |                               'jobSalary': jobSalary,
180 |                               'jobLocation': jobLocation,
181 |                               'jobRqYear': jobRqYear,
182 |                               'jobRqEducation': jobRqEducation,
183 |                               'jobRqDepartment': jobRqDepartment,
184 |                               'jobSpecialty': jobSpecialty,
185 |                               'jobOthers': jobOthers,
186 |                               'jobDetailUrl': jobDetailUrl})
187 |     outputDf = pd.concat([outputDf, iOutputDf])
188 | 
# Attach the parameters of this search run as leading columns
outputDf.insert(0, 'maxPage', maxPage, True)
outputDf.insert(0, 'keyword', keyword, True)
now = datetime.datetime.now()
outputDf.insert(0, 'searchTime', now.strftime('%Y-%m-%d %H:%M:%S'), True)

# Drop rows whose jobAnnounceDate is empty (they are 104 advertisement
# listings and are less relevant to the search).
# The column is missing when no page was downloaded, so guard the filter.
if 'jobAnnounceDate' in outputDf.columns:
    outputDf = outputDf[outputDf.jobAnnounceDate != '']

# Write the csv file under the timestamped, keyword-specific name so that
# successive runs do not overwrite each other
fileName = now.strftime('%Y%m%d%H%M%S') + '104人力銀行_' + keyword + '_爬蟲搜尋結果.csv'
outputDf.to_csv(fileName, encoding='utf-8-sig')
201 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Python爬蟲程式碼
 2 | 
 3 | 此專案主要放置我寫過的Python爬蟲程式碼
 4 | 
 5 | ## 104人力銀行爬蟲程式(104_job_search.py)
 6 | 此程式是在104人力銀行輸入職缺關鍵字後，將返回的職缺資訊蒐集回來。
 7 | 程式執行完後會整理出以下資料欄位：
 8 | * 爬蟲執行的時間
 9 | * 搜尋關鍵字
10 | * 搜尋最大頁數
11 | * 職缺日期
12 | * 職缺名稱
13 | * 職缺公司名稱
14 | * 職缺公司網址簡介(104人力銀行頁面)
15 | * 職缺公司所屬產業
16 | * 職缺內容
17 | * 職缺職業類別
18 | * 職缺薪水
19 | * 職缺地點
20 | * 職缺要求年資
21 | * 職缺要求教育程度
22 | * 職缺要求系所
23 | * 職缺要求技能
24 | * 職缺其他條件
25 | * 職缺詳細資訊(104人力銀行頁面)
26 | 
27 | ## Google股票新聞爬蟲程式(google_real_time_news.py)
28 | 
29 | 此程式主要是透過Google RSS的搜尋功能，來下載股票的新聞資訊。
30 | 由於有不同家媒體的新聞來源，所以裡面只針對比較偏向財金新聞的網站進行擷取，共有下列9個網站：
31 | * 聯合新聞網
32 | * 自由財經
33 | * Yahoo奇摩股市
34 | * 經濟日報
35 | * 中時新聞網
36 | * 工商時報
37 | * 鉅亨網
38 | * ETtoday
39 | * EBC東森財經新聞
40 | 
41 | 程式執行完後會整理出以下資料欄位：
42 | * 執行時間
43 | * 搜尋關鍵字
44 | * 新聞標題
45 | * 新聞網址
46 | * 新聞日期
47 | * 新聞摘要
48 | * 新聞來源及新聞內容
49 | 
50 | ## 公開資訊觀測站-重大訊息爬蟲程式(materialInfo.py)
51 | 
52 | 此程式為至公開資訊觀測站下載歷史重大訊息資料(包含上市、上櫃及DR公司)，並且有做防呆處理。程式執行完後會整理出以下資料欄位：
53 | * 股票名稱
54 | * 股票代碼
55 | * 公告日期
56 | * 公告時間
57 | * 主旨
58 | * 公告序號
59 | * 事實發生日
60 | * 重訊內容
61 | 
62 | ## 上市上櫃股價爬蟲程式(stockPrice.py)
63 | 
64 | 此程式至證交所及櫃買中心下載當日各檔股價資訊，程式執行完後會整理出以下資料欄位：
65 | * 日期
66 | * 市場別
67 | * 股票代號
68 | * 股票名稱
69 | * 開盤價
70 | * 最高價
71 | * 最低價
72 | * 收盤價
73 | * 成交股數
74 | * 成交金額(元)
75 | 
76 | ## Accupass搜尋活動結果爬蟲程式(accupass_search.py)
77 | 
78 | 此程式為簡單範例，至Accupass活動通網站爬取關鍵字搜索後的活動資訊，並輸出csv檔案。
79 | 
80 | ## 期交所台指期爬蟲程式(taifex_tx.py)
81 | 
82 | 此程式為簡單範例，爬取期交所台指期指定日期的價格資訊。
83 | 
84 | ## Google搜尋(google_search.py)
85 | 
86 | 此程式為簡單的範例，用Google關鍵字搜尋並找出返回結果的網站標題。
87 | 
88 | ## 樂透彩爬蟲(lottery.py)
89 | 
90 | 此程式為簡單範例，爬取台彩網站威力彩及大樂透最近一期的開獎結果。
91 | 
92 | ## 證券商盤後資料專區-外資上市/上櫃買超排行(concords.py)
93 | 
 94 | 此程式為簡單範例，爬取康和證券盤後資料專區-外資上市/上櫃買超排行資訊。但建議若有三大法人資料需求，可至證交所/櫃買中心取得資料，因為是第一手資料資訊會最正確，且資料取得方式更加便民。
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/accupass_search.py:
--------------------------------------------------------------------------------
 1 | # Accupass網站活動查詢
 2 | # 2021/02/21
 3 | # 程式撰寫: 蘇彥庭
 4 | # 程式說明:
 5 | # * 此程式主要爬取在Accupass網站以關鍵字查詢活動後 返回的相關活動資訊
 6 | # * 此程式未測試Accupass網站的反爬蟲機制 若大量爬取有可能會出錯
 7 | import time
 8 | import pandas as pd
 9 | import json
10 | import requests
11 | from bs4 import BeautifulSoup
12 | 
# Keyword to search for
searchKeyWord = '草莓季'

# Number of result pages to request
maxSearchPage = 3

# Collected event records (one dict per event)
allEventRows = []

# Download every result page in turn
for iPage in range(maxSearchPage):

    # Search URL for this page index
    url = ('https://old.accupass.com/search/changeconditions/r/0/0/0/0/4/' + str(iPage)
           + '/00010101/99991231/?q=' + searchKeyWord)

    # Fetch and parse the search page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each result card keeps its data as JSON in the 'event-row' attribute
    # of its first <div>
    for card in soup.select('div.col-xs-12.col-sm-6.col-md-4'):
        rawEvent = card.select('div')[0]['event-row']
        allEventRows.append(json.loads(rawEvent))

    # Slow down between requests
    time.sleep(1)

# Convert the records to a pandas table
eventInfoTable = pd.DataFrame(allEventRows)

# Write the csv report
fileName = 'accupass_' + searchKeyWord + '_搜尋活動結果報表.csv'
eventInfoTable.to_csv(fileName, index=False, encoding='utf-8-sig')
50 | 


--------------------------------------------------------------------------------
/concords.py:
--------------------------------------------------------------------------------
 1 | # 康和證券盤後資料專區-外資上市/上櫃買超排行
 2 | # 頁面來源: https://concords.moneydj.com/z/zg/zg_D_0_-1.djhtm
 3 | # 程式碼撰寫: 蘇彥庭
 4 | # 日期: 2023/05/28
 5 | import requests
 6 | from bs4 import BeautifulSoup
 7 | import pandas as pd
 8 | 
 9 | ########## 參數設定 ##########
10 | # url: https://concords.moneydj.com/z/zg/zg_D_{mktType}_{days}.djhtm
11 | 
12 | # mktType: 市場類別
13 | # 0: 上市 1:上櫃
14 | mktType = 0
15 | 
16 | # days: 計算日數
17 | # -1: 1周以來 
18 | # 1: 1日
19 | # 2: 2日
20 | # 3: 3日
21 | # 4: 4日
22 | # 5: 5日
23 | # 10: 10日
24 | # 20: 20日
25 | # 30: 30日
26 | days = 1
27 | 
28 | ########## 主程式 ##########
29 | # 目標網址
30 | url = f"https://concords.moneydj.com/z/zg/zg_D_{mktType}_{days}.djhtm"
31 | headers = {
32 |     'Content-Type': 'text/html;Charset=big5'
33 | }
34 | 
35 | # 執行爬蟲取得資訊
36 | # 此處採坑: 若用html.parser解析會發生錯誤 不知道為什麼解析出來會自動添加不必要的頁籤
37 | # 讓後續整理資料時發生錯誤 改用lxml解析即不會出錯
38 | response = requests.get(url, headers=headers)
39 | soup = BeautifulSoup(response.text, 'lxml')
40 | 
41 | # 取得table頁籤資料
42 | table = soup.select('table#oMainTable')
43 | 
44 | assert len(table) == 1, "無法在頁面找到目標表格, 請確認爬蟲程式是否有問題!"
45 | 
46 | # 選擇tr標籤
47 | tableRows = table[0].select('tr')
48 | 
49 | # 排除第0個tr標籤 因為是非目標資料
50 | # 迴圈整理每個tr標籤內的td標籤資料
51 | output = list()
52 | for i in range(1, len(tableRows)):
53 |     output.append([elem.text for elem in tableRows[i].select('td')])
54 | 
55 | # 轉為Panda資料表格式 此即為最後所需的排行榜資料
56 | output = pd.DataFrame(output[1:], columns=output[0])
57 | 
58 | # 輸出資料
59 | output.to_csv('output.csv', index=False, encoding='utf-8-sig')


--------------------------------------------------------------------------------
/google_real_time_news.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Google新聞即時爬蟲
  3 | 程式碼撰寫: 蘇彥庭
  4 | 日期: 20210111
  5 | 
  6 | 2023/04/08程式修改
  7 | 1. 處理Google RSS連結: 原本為新聞連結 現在被改為Google頁面連結 連結該Google頁面後才會被轉向實際新聞連結
  8 | 2. 修改經濟日報新聞內容抓取方式
  9 | 
 10 | 2024/08/14程式修改
 11 | 1. 處理日期轉換問題
 12 | 2. 處理Google RSS連結問題 
 13 | 採用此Github專案: https://github.com/SSujitX/google-news-url-decoder/tree/main 
 14 | 提供的Decoder取得正確新聞網址(但實測部分網址可能還是會解析錯誤)
 15 | 3. 修改鉅亨網新聞內容爬蟲程式碼
 16 | """
 17 | 
 18 | # 載入套件
 19 | import requests
 20 | import pandas as pd
 21 | import time
 22 | import re
 23 | from bs4 import BeautifulSoup
 24 | import datetime
 25 | import base64
 26 | 
 27 | # 參數設定
 28 | # 欲下載新聞的股票關鍵字清單
 29 | searchList = ['2330台積電', '2317鴻海', '2412中華電']
 30 | # 新聞下載起始日
 31 | nearStartDate = (datetime.date.today() + datetime.timedelta(days=-10)).strftime('%Y-%m-%d')
 32 | 
 33 | 
 34 | # google-news-url-decoder
 35 | def fetch_decoded_batch_execute(id):
 36 |     s = (
 37 |         '[[["Fbv4je","[\\"garturlreq\\",[[\\"en-US\\",\\"US\\",[\\"FINANCE_TOP_INDICES\\",\\"WEB_TEST_1_0_0\\"],'
 38 |         'null,null,1,1,\\"US:en\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],'
 39 |         '\\"en-US\\",\\"US\\",1,[2,3,4,8],1,0,\\"655000234\\",0,0,null,0],\\"'
 40 |         + id
 41 |         + '\\"]",null,"generic"]]]'
 42 |     )
 43 | 
 44 |     headers = {
 45 |         "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
 46 |         "Referer": "https://news.google.com/",
 47 |     }
 48 | 
 49 |     response = requests.post(
 50 |         "https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je",
 51 |         headers=headers,
 52 |         data={"f.req": s},
 53 |     )
 54 | 
 55 |     if response.status_code != 200:
 56 |         raise Exception("Failed to fetch data from Google.")
 57 | 
 58 |     text = response.text
 59 |     header = '[\\"garturlres\\",\\"'
 60 |     footer = '\\",'
 61 |     if header not in text:
 62 |         raise Exception(f"Header not found in response: {text}")
 63 |     start = text.split(header, 1)[1]
 64 |     if footer not in start:
 65 |         raise Exception("Footer not found in response.")
 66 |     url = start.split(footer, 1)[0]
 67 |     return url
 68 | 
 69 | 
 70 | # google-news-url-decoder
 71 | def decode_google_news_url(source_url):
 72 |     url = requests.utils.urlparse(source_url)
 73 |     path = url.path.split("/")
 74 |     if url.hostname == "news.google.com" and len(path) > 1 and path[-2] == "articles":
 75 |         base64_str = path[-1]
 76 |         decoded_bytes = base64.urlsafe_b64decode(base64_str + "==")
 77 |         decoded_str = decoded_bytes.decode("latin1")
 78 | 
 79 |         prefix = b"\x08\x13\x22".decode("latin1")
 80 |         if decoded_str.startswith(prefix):
 81 |             decoded_str = decoded_str[len(prefix) :]
 82 | 
 83 |         suffix = b"\xd2\x01\x00".decode("latin1")
 84 |         if decoded_str.endswith(suffix):
 85 |             decoded_str = decoded_str[: -len(suffix)]
 86 | 
 87 |         bytes_array = bytearray(decoded_str, "latin1")
 88 |         length = bytes_array[0]
 89 |         if length >= 0x80:
 90 |             decoded_str = decoded_str[2 : length + 1]
 91 |         else:
 92 |             decoded_str = decoded_str[1 : length + 1]
 93 | 
 94 |         if decoded_str.startswith("AU_yqL"):
 95 |             return fetch_decoded_batch_execute(base64_str)
 96 | 
 97 |         return decoded_str
 98 |     else:
 99 |         return source_url
100 |     
101 | 
# Helper for arranging Google News data
def arrangeGoogleNews(elem):
    """Flatten one RSS ``<item>`` element into a list of five strings.

    Args:
        elem: Parsed ``<item>`` element of the Google News RSS feed.

    Returns:
        ``[title, link, pubDate, description, source]`` where the
        description is the text of the first ``<a>`` tag found in the
        item's HTML description.
    """
    title = elem.find('title').getText()
    link = elem.find('link').getText()
    pub_date = elem.find('pubDate').getText()
    # The description field is itself HTML; keep only its first link text
    description_html = elem.find('description').getText()
    description = BeautifulSoup(description_html, 'html.parser').find('a').getText()
    source = elem.find('source').getText()
    return [title, link, pub_date, description, source]
109 | 
110 | 
# Extract the article text from the supported news sites
def beautifulSoupNews(url):
    """Download one news article and extract its body text.

    Args:
        url: Article link taken from the Google News RSS feed.

    Returns:
        A ``(newsUrl, content)`` tuple: the decoded article URL and the
        extracted text. ``content`` is ``'unknow domain'`` when the URL has
        no ``https://`` host or the host is not one of the handled sites.
    """
    # Request headers
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/87.0.4280.141 Safari/537.36'}

    # Resolve the Google redirect link to the real article link
    newsUrl = decode_google_news_url(url)

    # Work out the host before downloading, so a link that failed to decode
    # is reported instead of being passed to requests
    domainMatch = re.search(r'https://([^/]*)', newsUrl)
    if domainMatch is None:
        print(f'網址解析錯誤: {newsUrl}')
        content = 'unknow domain'
        return newsUrl, content
    domain = domainMatch.group(1)

    # Download the article page; the timeout keeps a stalled server from
    # blocking the whole run
    response = requests.get(newsUrl, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pick the extraction rule that matches the host
    if domain == 'udn.com':

        # udn.com (聯合新聞網)
        item = soup.find_all('section', class_='article-content__editor')[0].find_all('p')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ')

    elif domain == 'ec.ltn.com.tw':

        # Liberty Times finance (自由財經)
        item = soup.find_all('div', class_='text')[0].find_all('p', class_='')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ').replace(u'\xa0', ' '). \
            replace('一手掌握經濟脈動', '').replace('點我訂閱自由財經Youtube頻道', '')

    elif domain in ['tw.stock.yahoo.com', 'tw.news.yahoo.com']:

        # Yahoo Taiwan stock / news
        body = soup.find_all('div', class_='caas-body')[0]
        content = [elem.getText() for elem in body.find_all('p')]
        # Paragraphs whose whole text equals a link text are dropped
        del_text = [elem.getText() for elem in body.find_all('a')]
        content = [elem for elem in content if elem not in del_text]
        content = ''.join(content)
        content = content.replace('\r', ' ').replace('\n', ' ').replace(u'\xa0', ' ')

    elif domain == 'money.udn.com':

        # Economic Daily News (經濟日報)
        item = soup.find_all('section', id='article_body')[0].find_all('p')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ')

    elif domain == 'www.chinatimes.com':

        # China Times (中時新聞網)
        item = soup.find_all('div', class_='article-body')[0].find_all('p')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ')

    elif domain == 'ctee.com.tw':

        # Commercial Times (工商時報)
        item = soup.find_all('div', class_='entry-content clearfix single-post-content')[0].find_all('p')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ')

    elif domain == 'news.cnyes.com':

        # Anue (鉅亨網): the first <p> of every matching <section>
        sections = soup.find_all('section', style='margin-top:30px')
        content = list()
        for section in sections:
            p_tag = section.find('p')
            if p_tag:
                content.append(p_tag.getText())
        content = ''.join(content)
        content = content.replace('\r', ' ').replace('\n', ' ').replace(u'\xa0', ' ')

    elif domain == 'finance.ettoday.net':

        # ETtoday
        item = soup.find_all('div', itemprop='articleBody')[0].find_all('p')
        content = ''.join([elem.getText() for elem in item])
        content = content.replace('\r', ' ').replace('\n', ' ').replace(u'\xa0', ' ')

    elif domain == 'fnc.ebc.net.tw':

        # EBC finance news: the text is read from the second-to-last
        # <script> tag rather than from the rendered markup
        content = str(soup.find_all('script')[-2]).split('ReactDOM.render(React.createElement(')[1]
        content = content.split(',')[1].replace('{"content":"', '').replace('"})', '')
        content = re.sub(u'\\\\u003[a-z]+', '', content)
        content = content.replace('/p', ' ').replace('\\n', '')

    else:

        # Host without an extraction rule
        content = 'unknow domain'

    return newsUrl, content
223 | 
224 | 
# Download the Google News feed of every stock keyword
stockNews = pd.DataFrame()
for iSearch, searchKey in enumerate(searchList):

    print(f'目前正在搜尋股票: {searchKey} 在Google的新聞清單  進度: {iSearch + 1} / {len(searchList)}')

    # Build the RSS search URL and parse the feed
    url = 'https://news.google.com/news/rss/search/section/q/' + \
          searchKey + '/?hl=zh-tw&gl=TW&ned=zh-tw_tw'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'xml')
    rows = [arrangeGoogleNews(elem) for elem in soup.find_all('item')]

    # Assemble the table
    df = pd.DataFrame(data=rows, columns=['title', 'link', 'pub_date', 'description', 'source'])
    # Timestamp of this search
    df.insert(0, 'search_time', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), True)
    # Keyword used for this search
    df.insert(1, 'search_key', searchKey, True)
    # Keep only recent news
    df['pub_date'] = pd.to_datetime(df['pub_date'])
    df = df[df['pub_date'] >= nearStartDate]
    # Oldest first; fresh 0..n-1 index
    df = df.sort_values(['pub_date']).reset_index(drop=True)

    # Fetch the real link and the article text of every news item
    newsUrls = []
    contents = []
    linkCount = len(df['link'])
    for iLink, link in enumerate(df['link']):

        print(f'目前正在下載: {searchKey} 各家新聞  進度: {iLink + 1} / {linkCount}')

        newsUrl, content = beautifulSoupNews(url=link)
        newsUrls.append(newsUrl)
        contents.append(content)
        time.sleep(3)

    # Add the link and content columns
    df['newsUrl'] = newsUrls
    df['content'] = contents

    # Append to the overall result
    stockNews = pd.concat([stockNews, df])

# Write the result for inspection
stockNews.to_csv('checkData.csv', index=False, encoding='utf-8-sig')
274 | 


--------------------------------------------------------------------------------
/google_search.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Google搜尋結果爬蟲程式
 3 | 程式碼撰寫: 蘇彥庭
 4 | 日期: 20210116
 5 | """
 6 | 
 7 | # 載入套件
 8 | import sys
 9 | import requests
10 | from bs4 import BeautifulSoup
11 | 
# Search term
query = 'tibame'

# Request headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/87.0.4280.141 Safari/537.36'}

# Download the result page; the term is passed through `params` so that
# requests escapes characters such as '&' or '#'
url = 'https://www.google.com/search'
response = requests.get(url, params={'q': query}, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
content = soup.find_all('div', class_='g')

# Collect the <h3> text of every result block; blocks without an <h3>
# are skipped instead of failing on None
headings = [elem.find('h3') for elem in content]
title = [heading.getText() for heading in headings if heading is not None]

# Print the titles
print('Google搜尋結果頁面共有以下標題:')
for elem in title:
    print(elem)
29 | 
30 | 


--------------------------------------------------------------------------------
/lottery.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 威力彩及大樂透開獎結果爬蟲程式碼
 3 | 程式碼撰寫: 蘇彥庭
 4 | 日期: 20210113
 5 | """
 6 | 
 7 | # 載入套件
 8 | import requests
 9 | from bs4 import BeautifulSoup
10 | 
# Download the Taiwan Lottery home page
url = 'https://www.taiwanlottery.com.tw/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Print the last seven <div> texts of the selected 'contents_box02' blocks:
# block 0 is printed as 威力彩 and block 2 as 大樂透
for label, boxIndex in (('威力彩開獎號碼: ', 0), ('大樂透開獎號碼: ', 2)):
    content = soup.find_all('div', class_='contents_box02')[boxIndex].find_all('div')
    result = [elem.getText() for elem in content][-7:]
    print(label + ', '.join(result))
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/materialInfo.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 公開資訊觀測站-重大訊息爬蟲程式碼
  3 | 程式碼撰寫: 蘇彥庭
  4 | 日期: 20210108
  5 | """
  6 | 
  7 | # 載入套件
  8 | import datetime
  9 | import requests
 10 | import pandas as pd
 11 | import time
 12 | import os
 13 | from bs4 import BeautifulSoup
 14 | import re
 15 | 
 16 | 
 17 | # 確認是否有正常連線
 18 | def CheckConnect(url):
 19 |     try:
 20 |         response = requests.get(url)
 21 |         soup = BeautifulSoup(response.text, 'html.parser')
 22 |         if '查詢過於頻繁' in soup:  # 查詢過於頻繁視為下載失敗
 23 |             print('查詢過於頻繁!')
 24 |             soup = None
 25 |             checkSuccess = False
 26 |         else:
 27 |             checkSuccess = True
 28 |         return soup, checkSuccess
 29 |     except Exception as e:
 30 |         print('下載失敗!')
 31 |         soup = None
 32 |         checkSuccess = False
 33 |         return soup, checkSuccess
 34 | 
 35 | 
 36 | # 將ROC日期轉換為西元日期
 37 | def ConvertDate(idate):
 38 |     if '年' in idate:
 39 |         return str((int(re.findall('民國(\\d+)年', idate)[0]) + 1911) * 10000 + \
 40 |                    int(re.findall('年(\\d+)月', idate)[0]) * 100 + \
 41 |                    int(re.findall('月(\\d+)日', idate)[0]))
 42 |     elif '/' in idate:
 43 |         idate = idate.split('/')
 44 |         return str((int(idate[0]) + 1911) * 10000 + int(idate[1]) * 100 + int(idate[2]))
 45 |     # else:
 46 |     #     return str((int(idate[0:3]) + 1911) * 10000 + int(idate[3:5]) * 100 + int(idate[5:7]))
 47 | 
 48 | 
 49 | # 整理url參數名稱與值之函數
 50 | def CombineParam(elem):
 51 |     if (elem.get('name') is not None) and (elem.get('value') is not None):
 52 |         return str(elem.get('name')) + '=' + str(elem.get('value'))
 53 | 
 54 | 
 55 | # 產生爬蟲目標url
 56 | def MakeURL(i_param, url_param):
 57 |     if i_param:
 58 |         url_param = url_param + '&' + i_param.replace('document.t59sb01_form.', '').replace('.value', ''). \
 59 |             replace(";openWindow(this.form ,'');", '').replace('\'', '').replace(';', '&')
 60 |         target_url = 'https://mops.twse.com.tw/mops/web/ajax_t59sb01?' + url_param
 61 |         return target_url
 62 | 
 63 | 
 64 | # 設定程式執行路徑
 65 | runProgramPath = 'C:\\Users\\User\\PycharmProjects\\spider\\'
 66 | os.chdir(runProgramPath)
 67 | 
 68 | # 確認當前目錄是否有資料儲存資料夾 若沒有則建立
 69 | if 'material_info' not in os.listdir():
 70 |     os.mkdir((runProgramPath + 'material_info'))
 71 | 
 72 | # # 產生近7個實際日期
 73 | # todayDate = datetime.datetime.now()
 74 | # dateList = []
 75 | # for i in range(1):
 76 | #     iDate = todayDate - datetime.timedelta(days=i)
 77 | #     dateList.append(iDate.strftime('%Y%m%d'))
 78 | 
 79 | # 設定爬蟲日期區間
 80 | # 起始日
 81 | file = os.listdir((runProgramPath + 'material_info'))
 82 | if len(file) > 0:
 83 |     downloadStartDate = max(file).replace('.csv', '')
 84 |     downloadStartDate = downloadStartDate[0:4] + '-' + downloadStartDate[4:6] + '-' + downloadStartDate[6:8]
 85 | else:
 86 |     downloadStartDate = '2015-01-01'
 87 | # 結束日
 88 | downloadEndDate = datetime.datetime.now()
 89 | # 產生日期序列
 90 | dateList = pd.date_range(start=downloadStartDate, end=downloadEndDate).strftime('%Y%m%d')
 91 | 
 92 | # 每次只爬200個交易日
 93 | dateList = dateList[0:200]
 94 | # 計步器: 爬50個交易日後休息2小時
 95 | downloadDayNums = 0
 96 | 
 97 | # 迴圈日期下載重大訊息資訊資料
 98 | for iDate in dateList:
 99 | 
100 |     print('目前程式正在下載日期: ' + iDate + ' 上市櫃重大訊息資料')
101 | 
102 |     # 建立儲存表
103 |     materialInfoData = pd.DataFrame()
104 | 
105 |     # 年月日
106 |     iYear = str(int(iDate[0:4]) - 1911)
107 |     iMonth = iDate[4:6]
108 |     iDay = iDate[6:8]
109 | 
110 |     # 下載公司當日重大訊息資料
111 |     url = 'https://mops.twse.com.tw/mops/web/ajax_t05st02?' \
112 |           'encodeURIComponent=1&step=1&step00=0&firstin=1&off=1&' \
113 |           'TYPEK=all&year=' + iYear + '&month=' + iMonth + '&day=' + iDay
114 | 
115 |     # 防呆機制
116 |     checkSuccess = False
117 |     tryNums = 0
118 |     while not checkSuccess:
119 |         soup, checkSuccess = CheckConnect(url)
120 |         if not checkSuccess:   # 若爬取失敗 則暫停120秒
121 |             if tryNums == 5:   # 若已重新爬取累計5次 則放棄此次程式執行
122 |                 break
123 |             tryNums += 1
124 |             print('本次下載失敗 程式暫停120秒')
125 |             time.sleep(120)
126 | 
127 |     # 防呆機制: 若累積爬取資料失敗 則終止此次程式
128 |     if tryNums == 5:
129 |         print('下載失敗次數累積5次 結束程式')
130 |         break
131 | 
132 |     # 防呆機制: 若頁面出現"查無[日期]之重大訊息資料" 則進行下一個迴圈
133 |     if '查無' in str(soup):
134 |         print('該日期無資料 進行下一個日期資料下載')
135 |         continue
136 | 
137 |     # 整理資料
138 |     rowDatas = soup.find_all('table')[2].find_all('tr')
139 |     rows = list()
140 |     for row in rowDatas:
141 |         rows.append([elem.get('value') for elem in row.find_all('input')])
142 |     rows = [elem[:-1] for elem in rows if elem]
143 |     columnNames = ['name', 'code', 'announce_date', 'time', 'subject',
144 |                    'number', 'rule', 'actual_date', 'content']
145 |     df = pd.DataFrame(data=rows, columns=columnNames)
146 | 
147 |     # 儲存重大訊息資訊資料
148 |     materialInfoData = pd.concat([materialInfoData, df])
149 |     time.sleep(5)
150 | 
151 |     # 下載DR公司當日重大訊息
152 |     print('目前程式正在下載日期: ' + iDate + ' DR公司當日重大訊息資料')
153 | 
154 |     # 由於DR公司和一般公司的重大訊息架構不一樣 需要額外處理
155 |     # 整理基本資訊
156 |     rowDatas = soup.find_all('table')[3].find_all('tr')
157 |     simpleInfoRows = list()
158 |     for row in rowDatas:
159 |         simpleInfoRows.append([elem.getText().replace('\xa0', '') for elem in row.find_all('td')])
160 |     simpleInfoRows = [elem for elem in simpleInfoRows if elem]
161 | 
162 |     # 整理詳細資料資訊
163 |     # 整理詳細資料url網址的共用參數
164 |     urlParamRaw = soup.find_all('form')[1]
165 |     urlParam = list()
166 |     for i in urlParamRaw:
167 |         urlParam.append([CombineParam(elem) for elem in urlParamRaw.find_all('input')])
168 |     urlParam = [elem for elem in urlParam[0] if elem]
169 |     urlParam = '&'.join(urlParam)
170 | 
171 |     # 整理各家DR公司重訊詳細資料url
172 |     rawUrl = soup.find_all('table')[3].find_all('tr')
173 |     urlList = list()
174 |     for i in rawUrl:
175 |         urlList.append([MakeURL(elem.get('onclick'), urlParam) for elem in i.find_all('input')])
176 |     urlList = [elem for elem in urlList if elem]
177 | 
178 |     # 執行迴圈爬蟲
179 |     for idx, iUrl in enumerate(urlList):
180 | 
181 |         # 取得DR公司重訊資訊
182 |         url = iUrl[0]
183 | 
184 |         # 防呆機制
185 |         checkSuccess = False
186 |         tryNums = 0
187 |         while not checkSuccess:
188 |             soup2, checkSuccess = CheckConnect(url)
189 |             if not checkSuccess:  # 若爬取失敗 則暫停120秒
190 |                 if tryNums == 5:  # 若已重新爬取累計5次 則放棄此次程式執行
191 |                     break
192 |                 tryNums += 1
193 |                 print('本次下載失敗 程式暫停120秒')
194 |                 time.sleep(120)
195 | 
196 |         # 防呆機制: 若累積爬取資料失敗 則終止此次程式
197 |         if tryNums == 5:
198 |             print('下載失敗次數累積5次 結束程式')
199 |             break
200 | 
201 |         # 整理資料
202 |         rowDatas = soup2.find_all('table')[1].find_all('tr')
203 |         detailInfoRows = list()
204 |         for row in rowDatas:
205 |             detailInfoRows.append([elem.getText() for elem in row.find_all('td')])
206 | 
207 |         iRow = [[simpleInfoRows[idx][3],                   # 股票名稱(name)
208 |                  simpleInfoRows[idx][2],                   # 股票代碼(code)
209 |                  ConvertDate(simpleInfoRows[idx][0]),      # 公告日期(announce_date)
210 |                  simpleInfoRows[idx][1].replace(':', ''),  # 公告時間(time)
211 |                  simpleInfoRows[idx][4],                   # 主旨(subject)
212 |                  detailInfoRows[1][0],                     # 公告序號(number)
213 |                  '',                                       # 條款(rule): 內容有提供 但因每家寫的格式不一樣很難處理 故直接以缺值取代
214 |                  ConvertDate(detailInfoRows[2][0]),        # 事實發生日(actual_date)
215 |                  detailInfoRows[5][0]]]                    # 內容(content)
216 | 
217 |         # 儲存資料
218 |         df = pd.DataFrame(data=iRow, columns=columnNames)
219 |         materialInfoData = pd.concat([materialInfoData, df])
220 |         time.sleep(5)
221 | 
222 |     # 將本日重大訊息資料以csv檔案儲存
223 |     saveFilePath = runProgramPath + 'material_info\\' + iDate + '.csv'
224 |     materialInfoData.to_csv(saveFilePath, index=False)
225 | 
226 |     # 計步器: 爬50個交易日後休息2小時
227 |     downloadDayNums += 1
228 |     if downloadDayNums % 50 == 0:
229 |         print('目前已爬50個交易日 程式自動休息2小時!')
230 |         time.sleep(60*60*2)
231 | 


--------------------------------------------------------------------------------
/stockPrice.py:
--------------------------------------------------------------------------------
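 1 | """
 2 | Crawler for the Taiwan Stock Exchange (TWSE) and the Taipei Exchange (TPEx)
 3 | Author: 蘇彥庭
 4 | Date: 20210105
 5 | Description: downloads the daily stock price tables for the last 7 calendar days (days without data are skipped)
 6 | """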
 7 | 
 8 | # 載入套件
 9 | import datetime
10 | import requests
11 | import pandas as pd
12 | import time
13 | from bs4 import BeautifulSoup
14 | 
15 | # 產生近7個實際日期
16 | todayDate = datetime.datetime.now()
17 | dateList = []
18 | for i in range(7):
19 |     iDate = todayDate - datetime.timedelta(days=i)
20 |     dateList.append(iDate.strftime('%Y%m%d'))
21 | 
22 | # 建立儲存表
23 | stockPriceData = pd.DataFrame()
24 | 
25 | # 迴圈日期下載資料
26 | for iDate in dateList:
27 | 
28 |     # 下載證交所資料
29 |     # 取得目標日期資料
30 |     url = 'https://www.twse.com.tw/exchangeReport/MI_INDEX?response=html&date=' + iDate + '&type=ALLBUT0999'
31 |     response = requests.get(url)
32 |     soup = BeautifulSoup(response.text, 'html.parser')
33 | 
34 |     # 判斷是否有空資料存在 若存在則跳離此次迴圈
35 |     if ('很抱歉，沒有符合條件的資料!' in soup.text):
36 |         continue
37 | 
38 |     # 整理證交所每日收盤行情表
39 |     table = soup.find_all('table')[8]
40 |     columnNames = table.find('thead').find_all('tr')[2].find_all('td')
41 |     columnNames = [elem.getText() for elem in columnNames]
42 |     rowDatas = table.find('tbody').find_all('tr')
43 |     rows = list()
44 |     for row in rowDatas:
45 |         rows.append([elem.getText().replace(',', '').replace('--', '') for elem in row.find_all('td')])
46 |     df = pd.DataFrame(data=rows, columns=columnNames)
47 |     df = df[['證券代號', '證券名稱', '開盤價', '最高價', '最低價', '收盤價', '成交股數', '成交金額']]
48 |     df = df.rename({'證券代號': 'code', '證券名稱': 'name', '開盤價': 'open', '最高價': 'high',
49 |                     '最低價': 'low', '收盤價': 'close', '成交股數': 'volume', '成交金額': 'value'}, axis=1)
50 |     df.insert(0, 'date', iDate, True)
51 |     df.insert(1, 'mkt', 'tse', True)
52 | 
53 |     # 儲存證交所資料
54 |     stockPriceData = pd.concat([stockPriceData, df])
55 |     time.sleep(1)
56 | 
57 |     # 下載櫃買中心資料
58 |     # 取得目標日期資料
59 |     url = ('https://www.tpex.org.tw/web/stock/aftertrading/otc_quotes_no1430/stk_wn1430_result.php?'
60 |            'l=zh-tw&o=htm&d=' + str(int(iDate[0:4])-1911) + '/' + iDate[4:6] + '/' + iDate[6:8] + '&se=EW&s=0,asc,0')
61 |     response = requests.get(url)
62 |     soup = BeautifulSoup(response.text, 'html.parser')
63 | 
64 |     # 整理櫃買中心每日收盤行情表
65 |     table = soup.find('table')
66 |     columnNames = table.find('thead').find_all('tr')[1].find_all('th')
67 |     columnNames = [elem.getText() for elem in columnNames]
68 |     rowDatas = table.find('tbody').find_all('tr')
69 |     rows = list()
70 |     for row in rowDatas:
71 |         rows.append([elem.getText().replace(',', '').replace('----', '') for elem in row.find_all('td')])
72 |     df = pd.DataFrame(data=rows, columns=columnNames)
73 |     df = df[['代號', '名稱', '開盤', '最高', '最低', '收盤', '成交股數', '成交金額(元)']]
74 |     df = df.rename({'代號': 'code', '名稱': 'name', '開盤': 'open', '最高': 'high',
75 |                     '最低': 'low', '收盤': 'close', '成交股數': 'volume', '成交金額(元)': 'value'}, axis=1)
76 |     df.insert(0, 'date', iDate, True)
77 |     df.insert(1, 'mkt', 'otc', True)
78 | 
79 |     # 儲存櫃買中心資料
80 |     stockPriceData = pd.concat([stockPriceData, df])
81 |     time.sleep(1)
82 | 
83 | # 檢查資料完整度
84 | # print(stockPriceData.groupby('date').count())
85 | 
86 | # 呈現結果
87 | print(stockPriceData)


--------------------------------------------------------------------------------
/taifex_tx.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 期交所爬蟲程式
 3 | 程式碼撰寫: 蘇彥庭
 4 | 日期: 20210203
 5 | 程式說明: 此程式主要下載期交所台指期資料
 6 | """
 7 | 
 8 | # 載入套件
 9 | import requests
10 | from bs4 import BeautifulSoup
11 | import re
12 | import pandas as pd
13 | 
14 | # 參數設定
15 | # 設定下載日期
16 | downloadDate = '2021/02/01'
17 | 
18 | # post參數
19 | post = {
20 |     'queryType': '2',
21 |     'marketCode': '0',
22 |     'dateaddcnt': '',
23 |     'commodity_id': 'TX',
24 |     'commodity_id2': '',
25 |     'queryDate': downloadDate,
26 |     'MarketCode': '0',
27 |     'commodity_idt': 'TX',
28 |     'commodity_id2t': ''
29 | }
30 | 
31 | # 目標網址
32 | url = 'https://www.taifex.com.tw/cht/3/futDailyMarketReport'
33 | 
34 | # 下載網頁
35 | response = requests.post(url, data=post)
36 | soup = BeautifulSoup(response.text, 'html.parser')
37 | 
38 | # 資料清洗
39 | datas = soup.select('table.table_f')[0].select('tr')  # 行情表
40 | rows = list()
41 | for i in range(len(datas)):
42 |     if i == 0:
43 |         # 處理標題
44 |         columns = [elem.text for elem in datas[i].select('th')]
45 |     else:
46 |         # 處理數據
47 |         rows.append([re.sub(r'[\t\n\r]', ' ', elem.text).strip() for elem in datas[i].select('td')])
48 | df = pd.DataFrame(data=rows, columns=columns)
49 | print(df)
50 | 
51 | # # 確認資料
52 | # fileName = downloadDate.replace('/', '-') + '_TX_future_data.csv'
53 | # df.to_csv(fileName, encoding='big5')
54 | 
55 | 


--------------------------------------------------------------------------------