├── heroku
│   ├── Procfile
│   ├── oauth2_creds.json
│   ├── requirements.txt
│   ├── test.py
│   ├── app.py
│   └── email_find.py
├── media
│   ├── 2020-05-14-16-00-10.png
│   ├── 2020-05-14-16-21-46.png
│   ├── 2020-05-14-16-22-41.png
│   ├── 2020-05-14-16-24-04.png
│   ├── 2020-05-14-16-25-25.png
│   ├── 2020-05-14-16-26-33.png
│   ├── 2020-05-14-16-26-51.png
│   ├── 2020-05-14-16-30-01.png
│   ├── 2020-05-14-16-32-14.png
│   ├── 2020-05-14-16-38-40.png
│   ├── 2020-05-14-16-39-37.png
│   ├── 2020-05-14-20-25-08.png
│   ├── 2020-05-14-20-25-17.png
│   ├── 2020-05-14-20-25-59.png
│   ├── 2020-05-14-20-26-22.png
│   └── 2020-05-14-20-28-09.png
├── jupyter_notebook
│   └── jobs_csv
│       └── 2020-06-05金融.xlsx
└── README.md

/heroku/Procfile:
--------------------------------------------------------------------------------
web: gunicorn app:app --preload
--------------------------------------------------------------------------------
/media/2020-05-14-16-00-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-00-10.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-21-46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-21-46.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-22-41.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-22-41.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-24-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-24-04.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-25-25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-25-25.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-26-33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-26-33.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-26-51.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-26-51.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-30-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-30-01.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-32-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-32-14.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-38-40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-38-40.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-39-37.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-39-37.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-08.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-17.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-59.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-59.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-26-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-26-22.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-28-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-28-09.png
--------------------------------------------------------------------------------
/heroku/oauth2_creds.json:
--------------------------------------------------------------------------------
{
  "google_refresh_token": "token",
  "google_client_secret": "secret",
  "google_client_id": "id"
}
--------------------------------------------------------------------------------
/jupyter_notebook/jobs_csv/2020-06-05金融.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/jupyter_notebook/jobs_csv/2020-06-05金融.xlsx
--------------------------------------------------------------------------------
/heroku/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.0
pandas==1.0.3
requests==2.23.0
selenium==3.141.0
tqdm==4.46.0
yagmail==0.11.224
openpyxl==3.0.3
Flask==1.1.2
gunicorn==19.5.0
--------------------------------------------------------------------------------
/heroku/test.py:
--------------------------------------------------------------------------------
import os
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

driver.get("https://www.google.com")
print(driver.page_source)
--------------------------------------------------------------------------------
/heroku/app.py:
--------------------------------------------------------------------------------
# Use Flask to trigger the scraper
from flask import Flask
import email_find
import threading

app = Flask(__name__)

# @app.route('/')
# def hello_world():
#     thread = threading.Thread(target=email_find.email, args=('金融', ))  # define the worker thread
#     thread.start()  # start the thread
#     return 'Hello, World!,爬人力銀行'

@app.route('/user/<find_key>/<select_salary>/<select_area>/<related_key>/<related_content>')
def show_user_profile(find_key, select_salary, select_area, related_key, related_content):
    print(select_area)  # the raw input looks like 11,22
    # print(type(select_area))
    print(select_area.split(','))
    select_area = select_area.split(',')
    print(type(select_area))
    thread = threading.Thread(target=email_find.email, args=(find_key, select_salary, select_area, related_key, related_content))  # define the worker thread
    thread.start()  # start the thread so the request returns immediately
    return 'Scraping %s %s %s %s %s ' % (find_key, select_salary, select_area, related_key, related_content)

"""
search keyword            find_key
minimum salary            select_salary
cities                    select_area
job-title keyword         related_key
job-description keyword   related_content

Usage:
    example: your-heroku-url/user/金融/30000/台北市,新北市,桃園市/分析/管理

    /user/<search keyword>/<minimum salary>/<comma-separated cities>/<job-title keyword>/<job-description keyword>

Only three cities/counties can be selected here; you can add more in email_find.py.

Adding another city/county, for example:

    mask1 = concat_104_1111.薪資 >= int(select_salary)
    mask2 = concat_104_1111.地區.str.contains(select_area[0])
    mask3 = concat_104_1111.地區.str.contains(select_area[1])
    mask4 = concat_104_1111.地區.str.contains(select_area[2])
    mask7 = concat_104_1111.地區.str.contains(select_area[3])  # the added city/county (use a fresh name; mask6 is already taken below)
    mask5 = concat_104_1111.工作名稱.str.contains(related_key)
    mask6 = concat_104_1111.工作內容.str.contains(related_content)  # the job description may also mention relevant work

    # e.g. filter for salary >= 30,000, job title containing "證券", area 台北市 or 新北市
    # & = and, | = or

    # save as an Excel file, named after the search keyword
    file_name = find_key
    save_excel = concat_104_1111.loc[((mask2 | mask3 | mask4 | mask7) & mask1 & mask5) | ((mask2 | mask3 | mask4 | mask7) & mask1 & mask6)]

"""
if __name__ == "__main__":
    app.run()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraping job listings from the 104 and 1111 job banks

2020/6/7 update: fixed scraping after a minor redesign of the 1111 site
2020/7/8 update: added keyword search within job descriptions

* New: use Heroku to scrape the job banks on a schedule, save the results to an Excel file, and email it to your own inbox
  - If you are looking for a new job, the Heroku crawler can do the searching for you automatically

## Two ways to run it manually
1. Local version
   * To run on your own machine, install Jupyter Notebook with `pip install notebook`; running Jupyter inside an Anaconda environment tends to cause fewer problems.
2. (Recommended) Google Colab version [https://colab.research.google.com/](https://colab.research.google.com/)
   * Upload google_colab_104_1111人力銀行爬蟲.ipynb
   ![](./media/2020-05-14-16-00-10.png)
   * Or open my shared Colab notebook directly: https://colab.research.google.com/notebook#fileId=1KuaXUGgQIqJ7_ZN5V--zApt9ykOFuBk6&offline=true&sandboxMode=true

### Local version requirements

#### Packages to install
1. requests
2. beautifulsoup4
3. selenium
4. pandas
5. tqdm

`pip install requests`

`pip install beautifulsoup4`

`pip install selenium`

`pip install pandas`

`pip install tqdm`

#### ChromeDriver: download the driver that matches your Chrome version
https://chromedriver.chromium.org/
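
A quick local smoke test, mirroring the commented-out local branch in email_find.py (a sketch; the `./chromedriver` path is an assumption, point it at wherever you unpacked the driver):

```
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')     # run without a visible browser window
chrome_options.add_argument('--disable-gpu')  # works around a Chrome headless quirk on Windows

# executable_path is an assumption: use the path where you unpacked ChromeDriver
driver = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options)
driver.get('https://www.104.com.tw/')
print(driver.title)
driver.quit()
```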
# Overview

Scrapes keyword-matched listings from the 104 and 1111 job banks, then uses pandas to concatenate, clean, and filter the data.

**More details are in the comments inside the .ipynb file**



## Heroku
* Scrapes 104 and 1111, filters the listings you want, saves them to an Excel file, and emails it to your own inbox.
* For scheduled runs, use [cron-job.org](https://cron-job.org/) to call the Heroku app on a timer; a Heroku dyno goes to sleep automatically after 30 minutes of inactivity.
* Sign up for a Heroku account at [signup.heroku.com](https://signup.heroku.com/)

### Flask usage
* Parameters are passed in the URL; the `user` prefix itself never changes, so you can adjust the query at any time without redeploying
* your-heroku-url/user/金融/30000/台北市,新北市,桃園市/分析/管理
* Breakdown
  * /user/`search keyword`/`minimum salary`/`台北市,新北市,桃園市`/`job-title keyword`/`job-description keyword`
  * More notes are in app.py

For example, if your URL is: `http://127.0.0.1/user/金融/30000/台北市,新北市,桃園市/分析/管理`
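
You can also trigger a run from code instead of the browser. A sketch using the `requests` package (the app hostname below is a placeholder, substitute your own):

```
import requests
from urllib.parse import quote

# Hypothetical base URL: replace with your own Heroku app's hostname
base = 'https://your-app-name.herokuapp.com'

# Build the path segments; quote() percent-encodes the Chinese characters
path = '/user/{}/{}/{}/{}/{}'.format(
    quote('金融'),                    # search keyword
    30000,                            # minimum salary
    quote('台北市,新北市,桃園市'),     # comma-separated cities
    quote('分析'),                    # job-title keyword
    quote('管理'),                    # job-description keyword
)
print(requests.get(base + path).text)  # the app replies immediately; scraping continues in a background thread
```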
### Getting OAuth 2 credentials from the Gmail API
Email is sent through the Gmail API. Signing in with just a Google account password trips Google's verification checks, the login fails, and the email is never sent.

Google API console: https://console.cloud.google.com/apis/

![](./media/2020-05-14-20-25-08.png)
![](./media/2020-05-14-20-25-17.png)

Configure the OAuth consent screen

![](./media/2020-05-14-20-25-59.png)


Create the OAuth credentials

![](./media/2020-05-14-20-26-22.png)

Run the following script:
```
"""
Adapted from:
https://github.com/google/gmail-oauth2-tools/blob/master/python/oauth2.py
https://developers.google.com/identity/protocols/OAuth2

1. Generate and authorize an OAuth2 token (generate_oauth2_token)
2. Generate a new access token using a refresh token (refresh_token)
3. Generate an OAuth2 string to use for login (access_token)
"""

import base64
import imaplib
import json
import smtplib
import urllib.parse
import urllib.request
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import lxml.html

GOOGLE_ACCOUNTS_BASE_URL = 'https://accounts.google.com'
REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'

GOOGLE_CLIENT_ID = ''      # enter yours here
GOOGLE_CLIENT_SECRET = ''  # enter yours here
GOOGLE_REFRESH_TOKEN = None


def command_to_url(command):
    return '%s/%s' % (GOOGLE_ACCOUNTS_BASE_URL, command)


def url_escape(text):
    return urllib.parse.quote(text, safe='~-._')


def url_unescape(text):
    return urllib.parse.unquote(text)


def url_format_params(params):
    param_fragments = []
    for param in sorted(params.items(), key=lambda x: x[0]):
        param_fragments.append('%s=%s' % (param[0], url_escape(param[1])))
    return '&'.join(param_fragments)


def generate_permission_url(client_id, scope='https://mail.google.com/'):
    params = {}
    params['client_id'] = client_id
    params['redirect_uri'] = REDIRECT_URI
    params['scope'] = scope
    params['response_type'] = 'code'
    return '%s?%s' % (command_to_url('o/oauth2/auth'), url_format_params(params))


def call_authorize_tokens(client_id, client_secret, authorization_code):
    params = {}
    params['client_id'] = client_id
    params['client_secret'] = client_secret
    params['code'] = authorization_code
    params['redirect_uri'] = REDIRECT_URI
    params['grant_type'] = 'authorization_code'
    request_url = command_to_url('o/oauth2/token')
    response = urllib.request.urlopen(request_url, urllib.parse.urlencode(params).encode('UTF-8')).read().decode('UTF-8')
    return json.loads(response)


def call_refresh_token(client_id, client_secret, refresh_token):
    params = {}
    params['client_id'] = client_id
    params['client_secret'] = client_secret
    params['refresh_token'] = refresh_token
    params['grant_type'] = 'refresh_token'
    request_url = command_to_url('o/oauth2/token')
    response = urllib.request.urlopen(request_url, urllib.parse.urlencode(params).encode('UTF-8')).read().decode('UTF-8')
    return json.loads(response)


def generate_oauth2_string(username, access_token, as_base64=False):
    auth_string = 'user=%s\1auth=Bearer %s\1\1' % (username, access_token)
    if as_base64:
        auth_string = base64.b64encode(auth_string.encode('ascii')).decode('ascii')
    return auth_string


def test_imap(user, auth_string):
    imap_conn = imaplib.IMAP4_SSL('imap.gmail.com')
    imap_conn.debug = 4
    imap_conn.authenticate('XOAUTH2', lambda x: auth_string)
    imap_conn.select('INBOX')


def test_smtp(user, base64_auth_string):
    smtp_conn = smtplib.SMTP('smtp.gmail.com', 587)
    smtp_conn.set_debuglevel(True)
    smtp_conn.ehlo('test')
    smtp_conn.starttls()
    smtp_conn.docmd('AUTH', 'XOAUTH2 ' + base64_auth_string)


def get_authorization(google_client_id, google_client_secret):
    scope = "https://mail.google.com/"
    print('Navigate to the following URL to auth:', generate_permission_url(google_client_id, scope))
    authorization_code = input('Enter verification code: ')
    response = call_authorize_tokens(google_client_id, google_client_secret, authorization_code)
    return response['refresh_token'], response['access_token'], response['expires_in']


def refresh_authorization(google_client_id, google_client_secret, refresh_token):
    response = call_refresh_token(google_client_id, google_client_secret, refresh_token)
    return response['access_token'], response['expires_in']


def send_mail(fromaddr, toaddr, subject, message):
    access_token, expires_in = refresh_authorization(GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET, GOOGLE_REFRESH_TOKEN)
    auth_string = generate_oauth2_string(fromaddr, access_token, as_base64=True)

    msg = MIMEMultipart('related')
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = toaddr
    msg.preamble = 'This is a multi-part message in MIME format.'
    msg_alternative = MIMEMultipart('alternative')
    msg.attach(msg_alternative)
    part_text = MIMEText(lxml.html.fromstring(message).text_content().encode('utf-8'), 'plain', _charset='utf-8')
    part_html = MIMEText(message.encode('utf-8'), 'html', _charset='utf-8')
    msg_alternative.attach(part_text)
    msg_alternative.attach(part_html)
    server = smtplib.SMTP('smtp.gmail.com:587')
    server.ehlo(GOOGLE_CLIENT_ID)
    server.starttls()
    server.docmd('AUTH', 'XOAUTH2 ' + auth_string)
    server.sendmail(fromaddr, toaddr, msg.as_string())
    server.quit()



if __name__ == '__main__':
    if GOOGLE_REFRESH_TOKEN is None:
        print('No refresh token found, obtaining one')
        refresh_token, access_token, expires_in = get_authorization(GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET)
        print('Set the following as your GOOGLE_REFRESH_TOKEN:', refresh_token)
        exit()

    send_mail('--------@gmail.com', '--------@gmail.com',
              'A mail from you from Python',
              'A mail from you from Python' +
              'So happy to hear from you!')
```

How to use the token:
paste your client ID and secret into the script above and run it, open the URL it prints, and follow the steps shown in the screenshot.

![](./media/2020-05-14-20-28-09.png)

Set up the JSON file: open oauth2_creds.json and fill in the id, secret, and refresh token obtained above.
```
{
  "google_refresh_token": "token",
  "google_client_secret": "secret",
  "google_client_id": "id"
}
```
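
To sanity-check the saved credentials before wiring them into the scraper, you can replay the refresh flow by hand. A minimal sketch that reads oauth2_creds.json and calls the same token endpoint as `call_refresh_token` above:

```
import json
import urllib.parse
import urllib.request

# Load the credentials saved in oauth2_creds.json
with open('oauth2_creds.json') as f:
    creds = json.load(f)

params = urllib.parse.urlencode({
    'client_id': creds['google_client_id'],
    'client_secret': creds['google_client_secret'],
    'refresh_token': creds['google_refresh_token'],
    'grant_type': 'refresh_token',
}).encode('utf-8')

# Same endpoint the script above uses
response = urllib.request.urlopen('https://accounts.google.com/o/oauth2/token', params)
token = json.loads(response.read().decode('utf-8'))
print('access token OK, expires in', token['expires_in'], 'seconds')
```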

### Filtering: keywords and area settings
Edit email_find.py
and scroll to the bottom.

Set the salary floor, the cities, and the job keywords you care about; you can also add more conditions:
```
mask1 = concat_104_1111.薪資 >= 30000
mask2 = concat_104_1111.地區.str.contains('台北市')
mask3 = concat_104_1111.地區.str.contains('新北市')
mask4 = concat_104_1111.地區.str.contains('桃園市')
mask5 = concat_104_1111.工作名稱.str.contains('分析師')
```

Set the recipient to your own email address:
```
# Mail the job listings to your own Gmail
file = glob.glob('jobs_csv/*.xlsx')  # attachments are passed as a list, so several files can be sent
yag = yagmail.SMTP("your_email", oauth2_file="oauth2_creds.json")
yag.send(
    to="your_email", subject="爬蟲:熱騰騰的工作資訊",  # for several recipients, pass a list: ['aaaa@123.com', 'bbb@123.com']
    contents="熱騰騰的工作資料",
    attachments=file
)
```
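
To add a fourth city, for example, define one more mask and OR it into the area group. A sketch of the pattern (`mask7` avoids clashing with the job-description mask `mask6` that email_find.py already defines, and `台中市` is just an example city):

```
# one boolean Series per condition; & = and, | = or
mask7 = concat_104_1111.地區.str.contains('台中市')  # the extra city
area_ok = mask2 | mask3 | mask4 | mask7              # any of the four cities
# equivalent to the combined expression used at the bottom of email_find.py
save_excel = concat_104_1111.loc[area_ok & mask1 & (mask5 | mask6)]
```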
### Heroku setup
#### Create an app
![](./media/2020-05-14-16-21-46.png)
![](./media/2020-05-14-16-22-41.png)

#### Set the buildpacks and environment variables for Chrome and ChromeDriver
![](./media/2020-05-14-16-24-04.png)

* `heroku/python`
* `https://github.com/heroku/heroku-buildpack-google-chrome`
* `https://github.com/heroku/heroku-buildpack-chromedriver`
![](./media/2020-05-14-16-25-25.png)

![](./media/2020-05-14-16-26-33.png)

`CHROMEDRIVER_PATH` `/app/.chromedriver/bin/chromedriver`

`GOOGLE_CHROME_BIN` `/app/.apt/usr/bin/google-chrome`
![](./media/2020-05-14-16-26-51.png)

#### Deploy
Upload everything inside the heroku folder. You first need the Heroku CLI: https://devcenter.heroku.com/articles/heroku-cli

Open the Deploy tab

![](./media/2020-05-14-16-30-01.png)

Open a command prompt (or the VS Code terminal, or any other shell) and run:
* `heroku login`
* `heroku git:clone -a line-bot-gogo` (replace `line-bot-gogo` with your own app name)
* `cd heroku` to switch into the folder being deployed
* `git add .`
* `git commit -am "make it better"`
* `git push heroku master`

![](./media/2020-05-14-16-32-14.png)

Once the push finishes you will see the app's URL; you can also find it under Settings in the Heroku app.

#### cron-job setup
For the URL format, see [Flask usage](#flask-usage) above.

For example, if your URL is: `http://127.0.0.1/user/金融/30000/台北市,新北市,桃園市/分析/管理`

Note that the URL must be percent-encoded, e.g.: `http://127.0.0.1/user/%E9%87%91%E8%9E%8D/30000/%E5%8F%B0%E5%8C%97%E5%B8%82,%E6%96%B0%E5%8C%97%E5%B8%82,%E6%A1%83%E5%9C%92%E5%B8%82/%E5%88%86%E6%9E%90/%E7%AE%A1%E7%90%86`

**How to encode:** paste the URL into Chrome's address bar, then cut it and paste it into a text editor; Chrome encodes it for you automatically.
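
Or build the encoded URL in Python, with the same `urllib.parse.quote` the scraper itself uses for its keywords:

```
from urllib.parse import quote

# quote() leaves ASCII letters, digits, and '/' alone and percent-encodes the rest
# (note it also encodes the commas between cities as %2C, which Flask decodes back)
url = 'http://127.0.0.1/user/' + quote('金融/30000/台北市,新北市,桃園市/分析/管理')
print(url)
```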

[Sign up at cron-job.org/en/signup/](https://cron-job.org/en/signup/)

![](./media/2020-05-14-16-38-40.png)

![](./media/2020-05-14-16-39-37.png)

* days of month: which days of the month to fire on
* days of week: which weekdays to fire on
* months: which months
* hours / minutes: the time of day

The screenshot shows a job set to fire every Monday at 5:00 AM.


--------------------------------------------------------------------------------
/heroku/email_find.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import re
import random
import time
from urllib.parse import quote
from tqdm import tqdm, trange
import csv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import pandas as pd
from datetime import date
# from IPython.display import clear_output
import os, os.path
import glob
import yagmail

def email(find_key='金融', select_salary=40000, select_area=['台北市', '新北市', '桃園市'], related_key='分析', related_content='管理'):
    def get_todate():
        return date.today()

    def selenium_get_Code_104(url):
        # Selenium setup for Heroku
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

        # For ordinary local Windows/Linux use:
        # chrome_options = Options()  # headless mode
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')  # works around a Chrome bug
        # driver = webdriver.Chrome(chrome_options=chrome_options)

        driver.get(url)
        save = driver.page_source
        driver.quit()  # close the browser
        soup = BeautifulSoup(save, "html.parser")
        # the last <option> of the pager holds the total page count
        page = soup.select('.page-select.js-paging-select.gtm-paging-top')[0].find_all('option')[-1].get('value')
        return page

    def read_url(url):
        # rotate through a pool of user agents to look less like a bot
        USER_AGENT_LIST = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        ]
        USER_AGENT = random.choice(USER_AGENT_LIST)
        headers = {'user-agent': USER_AGENT}
        s = requests.Session()
        req = s.get(url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")
        return soup

    def csv_column_104(path_csv):  # write the header row
        with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
            employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            employee_writer.writerow(['日期', '工作名稱', '公司名稱', '公司地址', '薪資', '工作內容', '地區', '經歷', '學歷', '公司人數', '文章編號', '工作網址'])

    def find_title_104(key_txt):
        # build the output path
        today = get_todate()
        path_csv = 'jobs_csv/' + str(today) + key_txt + '_104人力銀行'
        if not os.path.isdir('jobs_csv'):  # create jobs_csv/ if it does not exist yet
            os.mkdir('jobs_csv')
            print('jobs_csv folder created')
        csv_column_104(path_csv)  # write the header row
        key = quote(key_txt)
        # 104 API: searchTempExclude=2 excludes temp-agency listings
        find_page_url = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword={0}&order=15&asc=0&page=1&mode=s&jobsource=2018indexpoc&searchTempExclude=2'.format(key)
        get_sum_page = int(selenium_get_Code_104(find_page_url))
        print('Total pages: ' + str(get_sum_page))
        for i in tqdm(range(1, get_sum_page + 1)):  # walk every result page; tqdm draws the progress bar
            url = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword={0}&order=15&asc=0&page={1}&mode=s&jobsource=2018indexpoc&searchTempExclude=2'.format(key, i)
            # time.sleep(random.randint(2, 10))  # optional random wait
            soup = read_url(url)  # fetch the page
            print('Currently scraping: ' + url)
            for title_1 in soup.select('.b-block__left'):
                # three of the blocks carry no data; skip those
                if title_1.select('.b-list-inline.b-clearfix.job-list-item__company') != soup.select('.b-block__left')[0].select('.b-list-inline.b-clearfix.job-list-item__company'):
                    # date
                    try:
                        # success means the listing carries the promoted icon (a paid ad); failure means a normal listing with a date
                        date_match__ = title_1.select('.b-icon--gray.b-icon--w18')[0].select('use')[0]
                        date = '廣告'
                    except:
                        date = title_1.select('.b-tit__date')[0].get_text().replace('\n', '').replace(' ', '')

                    # area
                    area = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find('li').get_text()
                    # experience (years)
                    experience = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find_all('li')[1].get_text()
                    try:  # some employers leave education blank; handle the error
                        # education
                        education = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find_all('li')[2].get_text()
                    except:
                        education = ""
                    # job URL
                    title_url = title_1.select('.js-job-link')[0].get('href')[2:]
                    # listing id
                    title_str = title_url.split('?')[0].split('/')[-1]
                    # job title
                    title = title_1.select('.js-job-link')[0].get_text()
                    # print(title + title_url + area)
                    # company name
                    company_name = title_1.select('li')[1].find('a').get('title').split('\n')[0][4:]
                    try:
                        # company address
                        company_address = title_1.select('li')[1].find('a').get('title').split('\n')[1][5:]
                    except:
                        company_address = ""
                    try:
                        # description
                        introduction = title_1.select('.job-list-item__info.b-clearfix.b-content')[0].get_text()
                        # strip \r, \n5 and \n from the string
                        introduction = introduction.replace('\r', '').replace('\n5', '').replace('\n', '')
                    except:
                        introduction = ""
                    # salary
                    try:
                        salary = title_1.select('.b-tag--default')[0].get_text()
                    except:
                        salary = 0  # no salary listed or negotiable: use 0

                    if salary == '待遇面議':
                        salary = "待遇面議"
                    else:
                        # numeric handling: for "25000~35000" keep the lower bound; 3 digits = daily wage, 4 digits = per-piece pay
                        try:
                            salary = re.search(r'\d+.\d+', salary).group()
                        except:
                            salary = 0
                    # company headcount
                    try:
                        people = title_1.select('.b-tag--default')[1].get_text()
                    except:
                        people = ""
                    # clear_output()  # clears the progress output in a notebook; leave commented to keep failing URLs visible

                    with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
                        employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        employee_writer.writerow([date, title, company_name, company_address, salary, introduction, area, experience, education, people, title_str, title_url])
                else:
                    continue
        print('104 scrape finished: open the csv file')


    # input_go = input('enter a keyword')
    # save_title_data = find_title_104(input_go)




    ############ 1111 job bank ############


    def selenium_get_Code_1111(url):
        # Selenium setup for Heroku
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

        # For ordinary local Windows/Linux use:
        # chrome_options = Options()  # headless mode
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        # driver = webdriver.Chrome(chrome_options=chrome_options)

        driver.get(url)
        save = driver.page_source
        driver.quit()  # close the browser
        soup = BeautifulSoup(save, "html.parser")

        # the first <option> text contains the total page count after a slash
        page = soup.select('.custom-select')[0].select('option')[0].text
        page = page.split('/')
        page = page[1].strip(' ')
        return page


    def csv_column_1111(path_csv):  # write the header row
        with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
            employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            employee_writer.writerow(['日期', '工作名稱', '公司名稱', '公司地址', '薪資', '工作內容', '地區', '經歷', '學歷', '工作網址'])

    def find_data_1111(soup):
        # salary
        mnone = soup.select('.needs')
        # city / district
        location = soup.select('.needs')
        # date
        get_date = soup.select('.date')
        # description
        jbInfoTxt = soup.select('.jbInfoTxt')
        # job URL
        jobs_url = soup.select('.position0')
        # company name, category, address
        company_data = soup.select('.d-md-flex')
        # job title
        title = soup.select('.position0')
        # work experience
        jobs_exp = soup.select('.needs')
        # education
        education = soup.select('.needs')
        return mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education


    def find_title_1111(key_txt):
        # build the output path
        today = get_todate()
        path_csv = 'jobs_csv/' + str(today) + key_txt + '_1111人力銀行'
        if not os.path.isdir('jobs_csv'):  # create jobs_csv/ if it does not exist yet
            os.mkdir('jobs_csv')
            print('jobs_csv folder created')
        csv_column_1111(path_csv)  # write the header row
        key = quote(key_txt)
        find_page_url = 'https://www.1111.com.tw/job-bank/job-index.asp?si=1&ss=s&ks={0}&page=1'.format(key)
        # get the total page count
        get_sum_page = int(selenium_get_Code_1111(find_page_url))
        print('Total pages: ' + str(get_sum_page))

        for i in tqdm(range(1, get_sum_page + 1)):
            url = 'https://www.1111.com.tw/job-bank/job-index.asp?si=1&ss=s&ks={0}&page={1}'.format(key, i)
            soup = read_url(url)  # fetch the page
            # pull the raw element lists out of the page
            mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education = find_data_1111(soup)
            print('Currently scraping: ' + url)
            for mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education in zip(mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education):
                # salary: keep the lower bound
                try:
                    mnone = mnone.find_all("span")[1].get_text()
                    get_mone = re.search(r'\d+.\d+', mnone).group()
                except:
                    get_mone = '面議(經常性薪資4萬/月含以上)'  # could arguably be mapped straight to 40,000
                # date
                get_date = get_date.get_text()[5:]

                # city / district
                location = location.find_all("span")[0].get_text()

                # description; drop the \xa0 characters
                jbInfoTxt = jbInfoTxt.get_text().replace("\xa0", "")

                # job URL
                jobs_url = 'https://www.1111.com.tw{0}'.format(jobs_url.find('a').get('href'))

                # company name
                company = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[0][6:]

                # company category, currently unused
                category = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[1][6:]

                # company address
                address = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[2][6:]

                # job title
                title = title.find('a').get('title')

                # work experience
                jobs_exp = jobs_exp.find_all("span")[2].get_text()

                # education
                education = education.find_all("span")[3].get_text()

                # clear_output()  # clears the progress output in a notebook

                # save the row
                with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
                    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    employee_writer.writerow([get_date, title, company, address, get_mone, jbInfoTxt, location, jobs_exp, education, jobs_url])
        print('1111 scrape finished: open the csv file')


    # find_key = '金融'
    find_title_104(find_key)
    find_title_1111(find_key)

    # read back the scraped data; sorted() makes the glob order deterministic ('_104...' sorts before '_1111...')
    get_file_name = sorted(glob.glob('jobs_csv/*.csv'))
    data_104 = pd.read_csv(get_file_name[0])
    data_1111 = pd.read_csv(get_file_name[1])

    # reorder the columns: date, title, salary, area, description, address, experience, education, URL
    new_data_1111 = data_1111[['日期', '工作名稱', '薪資', '地區', '工作內容', '公司地址', '經歷', '學歷', '工作網址']]
    new_data_104 = data_104[['日期', '工作名稱', '薪資', '地區', '工作內容', '公司地址', '經歷', '學歷', '工作網址']]

    # concatenate the two sources
    concat_104_1111 = pd.concat([new_data_1111, new_data_104], axis=0)

    # rebuild the index; drop=True discards the old index, inplace updates the existing DataFrame
    concat_104_1111.reset_index(drop=True, inplace=True)

    # Clean the data:
    """
    1. turn 待遇面議 (salary negotiable) into 0
    2. turn 面議(經常性薪資4萬/月含以上) (negotiable, regular salary of 40k+/month) into 40000
    3. for ranges such as hourly 150~200, keep only the lower bound, 150
    4. strip the thousands separator, e.g. "30,000" becomes "30000"
    """

    # map the 40k+/month negotiable string to 40000; regex=False because the string contains parentheses
    concat_104_1111['薪資'] = concat_104_1111.薪資.str.replace('面議(經常性薪資4萬/月含以上)', '40000', regex=False)

    # map "salary negotiable" to 0
    concat_104_1111['薪資'] = concat_104_1111.薪資.str.replace('待遇面議', '0', regex=False)

    # ranges: keep the lower bound
    concat_104_1111.loc[concat_104_1111.薪資.str.contains('~'), '薪資'] = concat_104_1111.loc[concat_104_1111.薪資.str.contains('~'), '薪資'].str.split('~').str[0]

    # strip the comma from numbers like "30,000"
    concat_104_1111.loc[:, '薪資'] = concat_104_1111['薪資'].apply(lambda x: x.replace(',', ''))

    # convert salary to a numeric type
    concat_104_1111.loc[:, '薪資'] = pd.to_numeric(concat_104_1111['薪資'])

    # Filter
    # e.g. 30k and up:
    # mask1 = concat_104_1111.薪資 >= 30000
    # mask2 = concat_104_1111.地區.str.contains('台北市')
    # mask3 = concat_104_1111.地區.str.contains('新北市')
    # mask4 = concat_104_1111.地區.str.contains('桃園市')
    # mask5 = concat_104_1111.工作名稱.str.contains('分析師')

    mask1 = concat_104_1111.薪資 >= int(select_salary)
    mask2 = concat_104_1111.地區.str.contains(select_area[0])
    mask3 = concat_104_1111.地區.str.contains(select_area[1])
    mask4 = concat_104_1111.地區.str.contains(select_area[2])
    mask5 = concat_104_1111.工作名稱.str.contains(related_key)
    mask6 = concat_104_1111.工作內容.str.contains(related_content)  # the job description may also mention relevant work

    # e.g. salary 30k+, job title containing "證券", area 台北市 or 新北市
    # & = and, | = or

    # save as an Excel file, named after the search keyword
    file_name = find_key
    save_excel = concat_104_1111.loc[((mask2 | mask3 | mask4) & mask1 & mask5) | ((mask2 | mask3 | mask4) & mask1 & mask6)]
    save_excel.to_excel('jobs_csv/{}.xlsx'.format(file_name), sheet_name='passengers', index=False)
    print('job done')

    # mail the job listings to your own Gmail

    file = glob.glob('jobs_csv/*.xlsx')  # attachments are passed as a list, so several files can be sent
    yag = yagmail.SMTP("your_email", oauth2_file="oauth2_creds.json")
    yag.send(
        to="your_email", subject="爬蟲:熱騰騰的工作資訊",  # for several recipients, pass a list: ['aaaa@123.com', 'bbb@123.com']
        contents="熱騰騰的工作資料",
        attachments=file
    )

if __name__ == "__main__":
    # email('分析師')
    email(find_key='金融', select_salary=40000, select_area=['台北市', '新北市', '桃園市'], related_key='分析', related_content='管理')
--------------------------------------------------------------------------------