├── heroku
│   ├── Procfile
│   ├── oauth2_creds.json
│   ├── requirements.txt
│   ├── test.py
│   ├── app.py
│   └── email_find.py
├── media
│   ├── 2020-05-14-16-00-10.png
│   ├── 2020-05-14-16-21-46.png
│   ├── 2020-05-14-16-22-41.png
│   ├── 2020-05-14-16-24-04.png
│   ├── 2020-05-14-16-25-25.png
│   ├── 2020-05-14-16-26-33.png
│   ├── 2020-05-14-16-26-51.png
│   ├── 2020-05-14-16-30-01.png
│   ├── 2020-05-14-16-32-14.png
│   ├── 2020-05-14-16-38-40.png
│   ├── 2020-05-14-16-39-37.png
│   ├── 2020-05-14-20-25-08.png
│   ├── 2020-05-14-20-25-17.png
│   ├── 2020-05-14-20-25-59.png
│   ├── 2020-05-14-20-26-22.png
│   └── 2020-05-14-20-28-09.png
├── jupyter_notebook
│   └── jobs_csv
│       └── 2020-06-05金融.xlsx
└── README.md

/heroku/Procfile:
--------------------------------------------------------------------------------
web: gunicorn app:app --preload
--------------------------------------------------------------------------------
/media/2020-05-14-16-00-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-00-10.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-21-46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-21-46.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-22-41.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-22-41.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-24-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-24-04.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-25-25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-25-25.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-26-33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-26-33.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-26-51.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-26-51.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-30-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-30-01.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-32-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-32-14.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-38-40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-38-40.png
--------------------------------------------------------------------------------
/media/2020-05-14-16-39-37.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-16-39-37.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-08.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-17.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-25-59.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-25-59.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-26-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-26-22.png
--------------------------------------------------------------------------------
/media/2020-05-14-20-28-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/media/2020-05-14-20-28-09.png
--------------------------------------------------------------------------------
/heroku/oauth2_creds.json:
--------------------------------------------------------------------------------
{
  "google_refresh_token": "token",
  "google_client_secret": "secret",
  "google_client_id": "id"
}
--------------------------------------------------------------------------------
/jupyter_notebook/jobs_csv/2020-06-05金融.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kobojp/scraper_104_1111/HEAD/jupyter_notebook/jobs_csv/2020-06-05金融.xlsx
--------------------------------------------------------------------------------
/heroku/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.0
pandas==1.0.3
requests==2.23.0
selenium==3.141.0
tqdm==4.46.0
yagmail==0.11.224
openpyxl==3.0.3
Flask==1.1.2
gunicorn==19.5.0
--------------------------------------------------------------------------------
/heroku/test.py:
--------------------------------------------------------------------------------
import os
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

driver.get("https://www.google.com")
print(driver.page_source)
--------------------------------------------------------------------------------
/heroku/app.py:
--------------------------------------------------------------------------------
# Use Flask to trigger the scraper
from flask import Flask
import email_find
import threading

app = Flask(__name__)

# @app.route('/')
# def hello_world():
#     thread = threading.Thread(target=email_find.email, args=('金融', ))  # define the worker thread
#     thread.start()  # start the thread
#     return 'Hello, World!,爬人力銀行'

@app.route('/user/<find_key>/<select_salary>/<select_area>/<related_key>/<related_content>')
def show_user_profile(find_key, select_salary, select_area, related_key, related_content):
    print(select_area)  # the raw input looks like 11,22
    # print(type(select_area))
    print(select_area.split(','))
    select_area = select_area.split(',')
    print(type(select_area))
    thread = threading.Thread(target=email_find.email, args=(find_key, select_salary, select_area, related_key, related_content))  # define the worker thread
    thread.start()  # start the thread so the request returns immediately
    return 'Scraping %s %s %s %s %s ' % (find_key, select_salary, select_area, related_key, related_content)

"""
search keyword            find_key
minimum salary            select_salary
cities                    select_area
job-title keyword         related_key
job-description keyword   related_content

Usage:
    example: your-heroku-url/user/金融/30000/台北市,新北市,桃園市/分析/管理

    /user/<search keyword>/<minimum salary>/<comma-separated cities>/<job-title keyword>/<job-description keyword>

Only three cities/counties can be selected here; you can add more in email_find.py.

Adding another city/county, for example:

    mask1 = concat_104_1111.薪資 >= int(select_salary)
    mask2 = concat_104_1111.地區.str.contains(select_area[0])
    mask3 = concat_104_1111.地區.str.contains(select_area[1])
    mask4 = concat_104_1111.地區.str.contains(select_area[2])
    mask7 = concat_104_1111.地區.str.contains(select_area[3])  # the added city/county (use a fresh name; mask6 is already taken below)
    mask5 = concat_104_1111.工作名稱.str.contains(related_key)
    mask6 = concat_104_1111.工作內容.str.contains(related_content)  # the job description may also mention relevant work

    # e.g. filter for salary >= 30,000, job title containing "證券", area 台北市 or 新北市
    # & = and, | = or

    # save as an Excel file, named after the search keyword
    file_name = find_key
    save_excel = concat_104_1111.loc[((mask2 | mask3 | mask4 | mask7) & mask1 & mask5) | ((mask2 | mask3 | mask4 | mask7) & mask1 & mask6)]

"""
if __name__ == "__main__":
    app.run()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraping job listings from the 104 and 1111 job banks

2020/6/7 update: fixed scraping after a minor redesign of the 1111 site
2020/7/8 update: added keyword search within job descriptions

* New: use Heroku to scrape the job banks on a schedule, save the results to an Excel file, and email it to your own inbox
  - If you are looking for a new job, the Heroku crawler can do the searching for you automatically

## Two ways to run it manually
1. Local version
   * To run on your own machine, install Jupyter Notebook with `pip install notebook`; running Jupyter inside an Anaconda environment tends to cause fewer problems.
2. (Recommended) Google Colab version [https://colab.research.google.com/](https://colab.research.google.com/)
   * Upload google_colab_104_1111人力銀行爬蟲.ipynb
   ![](./media/2020-05-14-16-00-10.png)
   * Or open my shared Colab notebook directly: https://colab.research.google.com/notebook#fileId=1KuaXUGgQIqJ7_ZN5V--zApt9ykOFuBk6&offline=true&sandboxMode=true

### Local version requirements

#### Packages to install
1. requests
2. beautifulsoup4
3. selenium
4. pandas
5. tqdm

`pip install requests`

`pip install beautifulsoup4`

`pip install selenium`

`pip install pandas`

`pip install tqdm`

#### ChromeDriver: download the driver that matches your Chrome version
https://chromedriver.chromium.org/
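
A quick local smoke test, mirroring the commented-out local branch in email_find.py (a sketch; the `./chromedriver` path is an assumption, point it at wherever you unpacked the driver):

```
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')     # run without a visible browser window
chrome_options.add_argument('--disable-gpu')  # works around a Chrome headless quirk on Windows

# executable_path is an assumption: use the path where you unpacked ChromeDriver
driver = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options)
driver.get('https://www.104.com.tw/')
print(driver.title)
driver.quit()
```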
# Overview

Scrapes keyword-matched listings from the 104 and 1111 job banks, then uses pandas to concatenate, clean, and filter the data.

**More details are in the comments inside the .ipynb file**



## Heroku
* Scrapes 104 and 1111, filters the listings you want, saves them to an Excel file, and emails it to your own inbox.
* For scheduled runs, use [cron-job.org](https://cron-job.org/) to call the Heroku app on a timer; a Heroku dyno goes to sleep automatically after 30 minutes of inactivity.
* Sign up for a Heroku account at [signup.heroku.com](https://signup.heroku.com/)

### Flask usage
* Parameters are passed in the URL; the `user` prefix itself never changes, so you can adjust the query at any time without redeploying
* your-heroku-url/user/金融/30000/台北市,新北市,桃園市/分析/管理
* Breakdown
  * /user/`search keyword`/`minimum salary`/`台北市,新北市,桃園市`/`job-title keyword`/`job-description keyword`
  * More notes are in app.py

For example, if your URL is: `http://127.0.0.1/user/金融/30000/台北市,新北市,桃園市/分析/管理`
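
You can also trigger a run from code instead of the browser. A sketch using the `requests` package (the app hostname below is a placeholder, substitute your own):

```
import requests
from urllib.parse import quote

# Hypothetical base URL: replace with your own Heroku app's hostname
base = 'https://your-app-name.herokuapp.com'

# Build the path segments; quote() percent-encodes the Chinese characters
path = '/user/{}/{}/{}/{}/{}'.format(
    quote('金融'),                    # search keyword
    30000,                            # minimum salary
    quote('台北市,新北市,桃園市'),     # comma-separated cities
    quote('分析'),                    # job-title keyword
    quote('管理'),                    # job-description keyword
)
print(requests.get(base + path).text)  # the app replies immediately; scraping continues in a background thread
```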
### Getting OAuth 2 credentials from the Gmail API
Email is sent through the Gmail API. Signing in with just a Google account password trips Google's verification checks, the login fails, and the email is never sent.

Google API console: https://console.cloud.google.com/apis/

![](./media/2020-05-14-20-25-08.png)
![](./media/2020-05-14-20-25-17.png)

Configure the OAuth consent screen

![](./media/2020-05-14-20-25-59.png)


Create the OAuth credentials

![](./media/2020-05-14-20-26-22.png)

Run the following script:
```
"""
Adapted from:
https://github.com/google/gmail-oauth2-tools/blob/master/python/oauth2.py
https://developers.google.com/identity/protocols/OAuth2

1. Generate and authorize an OAuth2 token (generate_oauth2_token)
2. Generate a new access token using a refresh token (refresh_token)
3. Generate an OAuth2 string to use for login (access_token)
"""

import base64
import imaplib
import json
import smtplib
import urllib.parse
import urllib.request
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import lxml.html

GOOGLE_ACCOUNTS_BASE_URL = 'https://accounts.google.com'
REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'

GOOGLE_CLIENT_ID = ''      # enter yours here
GOOGLE_CLIENT_SECRET = ''  # enter yours here
GOOGLE_REFRESH_TOKEN = None


def command_to_url(command):
    return '%s/%s' % (GOOGLE_ACCOUNTS_BASE_URL, command)


def url_escape(text):
    return urllib.parse.quote(text, safe='~-._')


def url_unescape(text):
    return urllib.parse.unquote(text)


def url_format_params(params):
    param_fragments = []
    for param in sorted(params.items(), key=lambda x: x[0]):
        param_fragments.append('%s=%s' % (param[0], url_escape(param[1])))
    return '&'.join(param_fragments)


def generate_permission_url(client_id, scope='https://mail.google.com/'):
    params = {}
    params['client_id'] = client_id
    params['redirect_uri'] = REDIRECT_URI
    params['scope'] = scope
    params['response_type'] = 'code'
    return '%s?%s' % (command_to_url('o/oauth2/auth'), url_format_params(params))


def call_authorize_tokens(client_id, client_secret, authorization_code):
    params = {}
    params['client_id'] = client_id
    params['client_secret'] = client_secret
    params['code'] = authorization_code
    params['redirect_uri'] = REDIRECT_URI
    params['grant_type'] = 'authorization_code'
    request_url = command_to_url('o/oauth2/token')
    response = urllib.request.urlopen(request_url, urllib.parse.urlencode(params).encode('UTF-8')).read().decode('UTF-8')
    return json.loads(response)


def call_refresh_token(client_id, client_secret, refresh_token):
    params = {}
    params['client_id'] = client_id
    params['client_secret'] = client_secret
    params['refresh_token'] = refresh_token
    params['grant_type'] = 'refresh_token'
    request_url = command_to_url('o/oauth2/token')
    response = urllib.request.urlopen(request_url, urllib.parse.urlencode(params).encode('UTF-8')).read().decode('UTF-8')
    return json.loads(response)


def generate_oauth2_string(username, access_token, as_base64=False):
    auth_string = 'user=%s\1auth=Bearer %s\1\1' % (username, access_token)
    if as_base64:
        auth_string = base64.b64encode(auth_string.encode('ascii')).decode('ascii')
    return auth_string


def test_imap(user, auth_string):
    imap_conn = imaplib.IMAP4_SSL('imap.gmail.com')
    imap_conn.debug = 4
    imap_conn.authenticate('XOAUTH2', lambda x: auth_string)
    imap_conn.select('INBOX')


def test_smtp(user, base64_auth_string):
    smtp_conn = smtplib.SMTP('smtp.gmail.com', 587)
    smtp_conn.set_debuglevel(True)
    smtp_conn.ehlo('test')
    smtp_conn.starttls()
    smtp_conn.docmd('AUTH', 'XOAUTH2 ' + base64_auth_string)


def get_authorization(google_client_id, google_client_secret):
    scope = "https://mail.google.com/"
    print('Navigate to the following URL to auth:', generate_permission_url(google_client_id, scope))
    authorization_code = input('Enter verification code: ')
    response = call_authorize_tokens(google_client_id, google_client_secret, authorization_code)
    return response['refresh_token'], response['access_token'], response['expires_in']


def refresh_authorization(google_client_id, google_client_secret, refresh_token):
    response = call_refresh_token(google_client_id, google_client_secret, refresh_token)
    return response['access_token'], response['expires_in']


def send_mail(fromaddr, toaddr, subject, message):
    access_token, expires_in = refresh_authorization(GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET, GOOGLE_REFRESH_TOKEN)
    auth_string = generate_oauth2_string(fromaddr, access_token, as_base64=True)

    msg = MIMEMultipart('related')
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = toaddr
    msg.preamble = 'This is a multi-part message in MIME format.'
    msg_alternative = MIMEMultipart('alternative')
    msg.attach(msg_alternative)
    part_text = MIMEText(lxml.html.fromstring(message).text_content().encode('utf-8'), 'plain', _charset='utf-8')
    part_html = MIMEText(message.encode('utf-8'), 'html', _charset='utf-8')
    msg_alternative.attach(part_text)
    msg_alternative.attach(part_html)
    server = smtplib.SMTP('smtp.gmail.com:587')
    server.ehlo(GOOGLE_CLIENT_ID)
    server.starttls()
    server.docmd('AUTH', 'XOAUTH2 ' + auth_string)
    server.sendmail(fromaddr, toaddr, msg.as_string())
    server.quit()



if __name__ == '__main__':
    if GOOGLE_REFRESH_TOKEN is None:
        print('No refresh token found, obtaining one')
        refresh_token, access_token, expires_in = get_authorization(GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET)
        print('Set the following as your GOOGLE_REFRESH_TOKEN:', refresh_token)
        exit()

    send_mail('--------@gmail.com', '--------@gmail.com',
              'A mail from you from Python',
              'A mail from you from Python' +
              'So happy to hear from you!')
```

How to use the token:
paste your client ID and secret into the script above and run it, open the URL it prints, and follow the steps shown in the screenshot.

![](./media/2020-05-14-20-28-09.png)

Set up the JSON file: open oauth2_creds.json and fill in the id, secret, and refresh token obtained above.
```
{
  "google_refresh_token": "token",
  "google_client_secret": "secret",
  "google_client_id": "id"
}
```
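
To sanity-check the saved credentials before wiring them into the scraper, you can replay the refresh flow by hand. A minimal sketch that reads oauth2_creds.json and calls the same token endpoint as `call_refresh_token` above:

```
import json
import urllib.parse
import urllib.request

# Load the credentials saved in oauth2_creds.json
with open('oauth2_creds.json') as f:
    creds = json.load(f)

params = urllib.parse.urlencode({
    'client_id': creds['google_client_id'],
    'client_secret': creds['google_client_secret'],
    'refresh_token': creds['google_refresh_token'],
    'grant_type': 'refresh_token',
}).encode('utf-8')

# Same endpoint the script above uses
response = urllib.request.urlopen('https://accounts.google.com/o/oauth2/token', params)
token = json.loads(response.read().decode('utf-8'))
print('access token OK, expires in', token['expires_in'], 'seconds')
```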

### Filtering: keywords and area settings
Edit email_find.py
and scroll to the bottom.

Set the salary floor, the cities, and the job keywords you care about; you can also add more conditions:
```
mask1 = concat_104_1111.薪資 >= 30000
mask2 = concat_104_1111.地區.str.contains('台北市')
mask3 = concat_104_1111.地區.str.contains('新北市')
mask4 = concat_104_1111.地區.str.contains('桃園市')
mask5 = concat_104_1111.工作名稱.str.contains('分析師')
```

Set the recipient to your own email address:
```
# Mail the job listings to your own Gmail
file = glob.glob('jobs_csv/*.xlsx')  # attachments are passed as a list, so several files can be sent
yag = yagmail.SMTP("your_email", oauth2_file="oauth2_creds.json")
yag.send(
    to="your_email", subject="爬蟲:熱騰騰的工作資訊",  # for several recipients, pass a list: ['aaaa@123.com', 'bbb@123.com']
    contents="熱騰騰的工作資料",
    attachments=file
)
```
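
To add a fourth city, for example, define one more mask and OR it into the area group. A sketch of the pattern (`mask7` avoids clashing with the job-description mask `mask6` that email_find.py already defines, and `台中市` is just an example city):

```
# one boolean Series per condition; & = and, | = or
mask7 = concat_104_1111.地區.str.contains('台中市')  # the extra city
area_ok = mask2 | mask3 | mask4 | mask7              # any of the four cities
# equivalent to the combined expression used at the bottom of email_find.py
save_excel = concat_104_1111.loc[area_ok & mask1 & (mask5 | mask6)]
```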
### Heroku setup
#### Create an app
![](./media/2020-05-14-16-21-46.png)
![](./media/2020-05-14-16-22-41.png)

#### Set the buildpacks and environment variables for Chrome and ChromeDriver
![](./media/2020-05-14-16-24-04.png)

* `heroku/python`
* `https://github.com/heroku/heroku-buildpack-google-chrome`
* `https://github.com/heroku/heroku-buildpack-chromedriver`
![](./media/2020-05-14-16-25-25.png)

![](./media/2020-05-14-16-26-33.png)

`CHROMEDRIVER_PATH` `/app/.chromedriver/bin/chromedriver`

`GOOGLE_CHROME_BIN` `/app/.apt/usr/bin/google-chrome`
![](./media/2020-05-14-16-26-51.png)

#### Deploy
Upload everything inside the heroku folder. You first need the Heroku CLI: https://devcenter.heroku.com/articles/heroku-cli

Open the Deploy tab

![](./media/2020-05-14-16-30-01.png)

Open a command prompt (or the VS Code terminal, or any other shell) and run:
* `heroku login`
* `heroku git:clone -a line-bot-gogo` (replace `line-bot-gogo` with your own app name)
* `cd heroku` to switch into the folder being deployed
* `git add .`
* `git commit -am "make it better"`
* `git push heroku master`

![](./media/2020-05-14-16-32-14.png)

Once the push finishes you will see the app's URL; you can also find it under Settings in the Heroku app.

#### cron-job setup
For the URL format, see [Flask usage](#flask-usage) above.

For example, if your URL is: `http://127.0.0.1/user/金融/30000/台北市,新北市,桃園市/分析/管理`

Note that the URL must be percent-encoded, e.g.: `http://127.0.0.1/user/%E9%87%91%E8%9E%8D/30000/%E5%8F%B0%E5%8C%97%E5%B8%82,%E6%96%B0%E5%8C%97%E5%B8%82,%E6%A1%83%E5%9C%92%E5%B8%82/%E5%88%86%E6%9E%90/%E7%AE%A1%E7%90%86`

**How to encode:** paste the URL into Chrome's address bar, then cut it and paste it into a text editor; Chrome encodes it for you automatically.
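
Or build the encoded URL in Python, with the same `urllib.parse.quote` the scraper itself uses for its keywords:

```
from urllib.parse import quote

# quote() leaves ASCII letters, digits, and '/' alone and percent-encodes the rest
# (note it also encodes the commas between cities as %2C, which Flask decodes back)
url = 'http://127.0.0.1/user/' + quote('金融/30000/台北市,新北市,桃園市/分析/管理')
print(url)
```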

[Sign up at cron-job.org/en/signup/](https://cron-job.org/en/signup/)

![](./media/2020-05-14-16-38-40.png)

![](./media/2020-05-14-16-39-37.png)

* days of month: which days of the month to fire on
* days of week: which weekdays to fire on
* months: which months
* hours / minutes: the time of day

The screenshot shows a job set to fire every Monday at 5:00 AM.


--------------------------------------------------------------------------------
/heroku/email_find.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import re
import random
import time
from urllib.parse import quote
from tqdm import tqdm, trange
import csv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import pandas as pd
from datetime import date
# from IPython.display import clear_output
import os, os.path
import glob
import yagmail

def email(find_key='金融', select_salary=40000, select_area=['台北市', '新北市', '桃園市'], related_key='分析', related_content='管理'):
    def get_todate():
        return date.today()

    def selenium_get_Code_104(url):
        # Selenium setup for Heroku
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

        # For ordinary local Windows/Linux use:
        # chrome_options = Options()  # headless mode
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')  # works around a Chrome bug
        # driver = webdriver.Chrome(chrome_options=chrome_options)

        driver.get(url)
        save = driver.page_source
        driver.quit()  # close the browser
        soup = BeautifulSoup(save, "html.parser")
        # the last <option> of the pager holds the total page count
        page = soup.select('.page-select.js-paging-select.gtm-paging-top')[0].find_all('option')[-1].get('value')
        return page

    def read_url(url):
        # rotate through a pool of user agents to look less like a bot
        USER_AGENT_LIST = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        ]
        USER_AGENT = random.choice(USER_AGENT_LIST)
        headers = {'user-agent': USER_AGENT}
        s = requests.Session()
        req = s.get(url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")
        return soup

    def csv_column_104(path_csv):  # write the header row
        with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
            employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            employee_writer.writerow(['日期', '工作名稱', '公司名稱', '公司地址', '薪資', '工作內容', '地區', '經歷', '學歷', '公司人數', '文章編號', '工作網址'])

    def find_title_104(key_txt):
        # build the output path
        today = get_todate()
        path_csv = 'jobs_csv/' + str(today) + key_txt + '_104人力銀行'
        if not os.path.isdir('jobs_csv'):  # create jobs_csv/ if it does not exist yet
            os.mkdir('jobs_csv')
            print('jobs_csv folder created')
        csv_column_104(path_csv)  # write the header row
        key = quote(key_txt)
        # 104 API: searchTempExclude=2 excludes temp-agency listings
        find_page_url = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword={0}&order=15&asc=0&page=1&mode=s&jobsource=2018indexpoc&searchTempExclude=2'.format(key)
        get_sum_page = int(selenium_get_Code_104(find_page_url))
        print('Total pages: ' + str(get_sum_page))
        for i in tqdm(range(1, get_sum_page + 1)):  # walk every result page; tqdm draws the progress bar
            url = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword={0}&order=15&asc=0&page={1}&mode=s&jobsource=2018indexpoc&searchTempExclude=2'.format(key, i)
            # time.sleep(random.randint(2, 10))  # optional random wait
            soup = read_url(url)  # fetch the page
            print('Currently scraping: ' + url)
            for title_1 in soup.select('.b-block__left'):
                # three of the blocks carry no data; skip those
                if title_1.select('.b-list-inline.b-clearfix.job-list-item__company') != soup.select('.b-block__left')[0].select('.b-list-inline.b-clearfix.job-list-item__company'):
                    # date
                    try:
                        # success means the listing carries the promoted icon (a paid ad); failure means a normal listing with a date
                        date_match__ = title_1.select('.b-icon--gray.b-icon--w18')[0].select('use')[0]
                        date = '廣告'
                    except:
                        date = title_1.select('.b-tit__date')[0].get_text().replace('\n', '').replace(' ', '')

                    # area
                    area = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find('li').get_text()
                    # experience (years)
                    experience = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find_all('li')[1].get_text()
                    try:  # some employers leave education blank; handle the error
                        # education
                        education = title_1.select('.b-list-inline.b-clearfix.job-list-intro.b-content')[0].find_all('li')[2].get_text()
                    except:
                        education = ""
                    # job URL
                    title_url = title_1.select('.js-job-link')[0].get('href')[2:]
                    # listing id
                    title_str = title_url.split('?')[0].split('/')[-1]
                    # job title
                    title = title_1.select('.js-job-link')[0].get_text()
                    # print(title + title_url + area)
                    # company name
                    company_name = title_1.select('li')[1].find('a').get('title').split('\n')[0][4:]
                    try:
                        # company address
                        company_address = title_1.select('li')[1].find('a').get('title').split('\n')[1][5:]
                    except:
                        company_address = ""
                    try:
                        # description
                        introduction = title_1.select('.job-list-item__info.b-clearfix.b-content')[0].get_text()
                        # strip \r, \n5 and \n from the string
                        introduction = introduction.replace('\r', '').replace('\n5', '').replace('\n', '')
                    except:
                        introduction = ""
                    # salary
                    try:
                        salary = title_1.select('.b-tag--default')[0].get_text()
                    except:
                        salary = 0  # no salary listed or negotiable: use 0

                    if salary == '待遇面議':
                        salary = "待遇面議"
                    else:
                        # numeric handling: for "25000~35000" keep the lower bound; 3 digits = daily wage, 4 digits = per-piece pay
                        try:
                            salary = re.search(r'\d+.\d+', salary).group()
                        except:
                            salary = 0
                    # company headcount
                    try:
                        people = title_1.select('.b-tag--default')[1].get_text()
                    except:
                        people = ""
                    # clear_output()  # clears the progress output in a notebook; leave commented to keep failing URLs visible

                    with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
                        employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        employee_writer.writerow([date, title, company_name, company_address, salary, introduction, area, experience, education, people, title_str, title_url])
                else:
                    continue
        print('104 scrape finished: open the csv file')


    # input_go = input('enter a keyword')
    # save_title_data = find_title_104(input_go)




    ############ 1111 job bank ############


    def selenium_get_Code_1111(url):
        # Selenium setup for Heroku
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), chrome_options=chrome_options)

        # For ordinary local Windows/Linux use:
        # chrome_options = Options()  # headless mode
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        # driver = webdriver.Chrome(chrome_options=chrome_options)

        driver.get(url)
        save = driver.page_source
        driver.quit()  # close the browser
        soup = BeautifulSoup(save, "html.parser")

        # the first <option> text contains the total page count after a slash
        page = soup.select('.custom-select')[0].select('option')[0].text
        page = page.split('/')
        page = page[1].strip(' ')
        return page


    def csv_column_1111(path_csv):  # write the header row
        with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
            employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            employee_writer.writerow(['日期', '工作名稱', '公司名稱', '公司地址', '薪資', '工作內容', '地區', '經歷', '學歷', '工作網址'])

    def find_data_1111(soup):
        # salary
        mnone = soup.select('.needs')
        # city / district
        location = soup.select('.needs')
        # date
        get_date = soup.select('.date')
        # description
        jbInfoTxt = soup.select('.jbInfoTxt')
        # job URL
        jobs_url = soup.select('.position0')
        # company name, category, address
        company_data = soup.select('.d-md-flex')
        # job title
        title = soup.select('.position0')
        # work experience
        jobs_exp = soup.select('.needs')
        # education
        education = soup.select('.needs')
        return mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education


    def find_title_1111(key_txt):
        # build the output path
        today = get_todate()
        path_csv = 'jobs_csv/' + str(today) + key_txt + '_1111人力銀行'
        if not os.path.isdir('jobs_csv'):  # create jobs_csv/ if it does not exist yet
            os.mkdir('jobs_csv')
            print('jobs_csv folder created')
        csv_column_1111(path_csv)  # write the header row
        key = quote(key_txt)
        find_page_url = 'https://www.1111.com.tw/job-bank/job-index.asp?si=1&ss=s&ks={0}&page=1'.format(key)
        # get the total page count
        get_sum_page = int(selenium_get_Code_1111(find_page_url))
        print('Total pages: ' + str(get_sum_page))

        for i in tqdm(range(1, get_sum_page + 1)):
            url = 'https://www.1111.com.tw/job-bank/job-index.asp?si=1&ss=s&ks={0}&page={1}'.format(key, i)
            soup = read_url(url)  # fetch the page
            # pull the raw element lists out of the page
            mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education = find_data_1111(soup)
            print('Currently scraping: ' + url)
            for mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education in zip(mnone, location, jbInfoTxt, jobs_url, company_data, title, jobs_exp, get_date, education):
                # salary: keep the lower bound
                try:
                    mnone = mnone.find_all("span")[1].get_text()
                    get_mone = re.search(r'\d+.\d+', mnone).group()
                except:
                    get_mone = '面議(經常性薪資4萬/月含以上)'  # could arguably be mapped straight to 40,000
                # date
                get_date = get_date.get_text()[5:]

                # city / district
                location = location.find_all("span")[0].get_text()

                # description; drop the \xa0 characters
                jbInfoTxt = jbInfoTxt.get_text().replace("\xa0", "")

                # job URL
                jobs_url = 'https://www.1111.com.tw{0}'.format(jobs_url.find('a').get('href'))

                # company name
                company = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[0][6:]

                # company category, currently unused
                category = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[1][6:]

                # company address
                address = company_data.find_all('a')[0].get('title').replace('\r', '').split('\n')[2][6:]

                # job title
                title = title.find('a').get('title')

                # work experience
                jobs_exp = jobs_exp.find_all("span")[2].get_text()

                # education
                education = education.find_all("span")[3].get_text()

                # clear_output()  # clears the progress output in a notebook

                # save the row
                with open(path_csv + '.csv', mode='a+', newline='', encoding='utf-8') as employee_file:
                    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    employee_writer.writerow([get_date, title, company, address, get_mone, jbInfoTxt, location, jobs_exp, education, jobs_url])
        print('1111 scrape finished: open the csv file')


    # find_key = '金融'
    find_title_104(find_key)
    find_title_1111(find_key)

    # read back the scraped data; sorted() makes the glob order deterministic ('_104...' sorts before '_1111...')
    get_file_name = sorted(glob.glob('jobs_csv/*.csv'))
    data_104 = pd.read_csv(get_file_name[0])
    data_1111 = pd.read_csv(get_file_name[1])

    # reorder the columns: date, title, salary, area, description, address, experience, education, URL
    new_data_1111 = data_1111[['日期', '工作名稱', '薪資', '地區', '工作內容', '公司地址', '經歷', '學歷', '工作網址']]
    new_data_104 = data_104[['日期', '工作名稱', '薪資', '地區', '工作內容', '公司地址', '經歷', '學歷', '工作網址']]

    # concatenate the two sources
    concat_104_1111 = pd.concat([new_data_1111, new_data_104], axis=0)

    # rebuild the index; drop=True discards the old index, inplace updates the existing DataFrame
    concat_104_1111.reset_index(drop=True, inplace=True)

    # Clean the data:
    """
    1. turn 待遇面議 (salary negotiable) into 0
    2. turn 面議(經常性薪資4萬/月含以上) (negotiable, regular salary of 40k+/month) into 40000
    3. for ranges such as hourly 150~200, keep only the lower bound, 150
    4. strip the thousands separator, e.g. "30,000" becomes "30000"
    """

    # map the 40k+/month negotiable string to 40000; regex=False because the string contains parentheses
    concat_104_1111['薪資'] = concat_104_1111.薪資.str.replace('面議(經常性薪資4萬/月含以上)', '40000', regex=False)

    # map "salary negotiable" to 0
    concat_104_1111['薪資'] = concat_104_1111.薪資.str.replace('待遇面議', '0', regex=False)

    # ranges: keep the lower bound
    concat_104_1111.loc[concat_104_1111.薪資.str.contains('~'), '薪資'] = concat_104_1111.loc[concat_104_1111.薪資.str.contains('~'), '薪資'].str.split('~').str[0]

    # strip the comma from numbers like "30,000"
    concat_104_1111.loc[:, '薪資'] = concat_104_1111['薪資'].apply(lambda x: x.replace(',', ''))

    # convert salary to a numeric type
    concat_104_1111.loc[:, '薪資'] = pd.to_numeric(concat_104_1111['薪資'])

    # Filter
    # e.g. 30k and up:
    # mask1 = concat_104_1111.薪資 >= 30000
    # mask2 = concat_104_1111.地區.str.contains('台北市')
    # mask3 = concat_104_1111.地區.str.contains('新北市')
    # mask4 = concat_104_1111.地區.str.contains('桃園市')
    # mask5 = concat_104_1111.工作名稱.str.contains('分析師')

    mask1 = concat_104_1111.薪資 >= int(select_salary)
    mask2 = concat_104_1111.地區.str.contains(select_area[0])
    mask3 = concat_104_1111.地區.str.contains(select_area[1])
    mask4 = concat_104_1111.地區.str.contains(select_area[2])
    mask5 = concat_104_1111.工作名稱.str.contains(related_key)
    mask6 = concat_104_1111.工作內容.str.contains(related_content)  # the job description may also mention relevant work

    # e.g. salary 30k+, job title containing "證券", area 台北市 or 新北市
    # & = and, | = or

    # save as an Excel file, named after the search keyword
    file_name = find_key
    save_excel = concat_104_1111.loc[((mask2 | mask3 | mask4) & mask1 & mask5) | ((mask2 | mask3 | mask4) & mask1 & mask6)]
    save_excel.to_excel('jobs_csv/{}.xlsx'.format(file_name), sheet_name='passengers', index=False)
    print('job done')

    # mail the job listings to your own Gmail

    file = glob.glob('jobs_csv/*.xlsx')  # attachments are passed as a list, so several files can be sent
    yag = yagmail.SMTP("your_email", oauth2_file="oauth2_creds.json")
    yag.send(
        to="your_email", subject="爬蟲:熱騰騰的工作資訊",  # for several recipients, pass a list: ['aaaa@123.com', 'bbb@123.com']
        contents="熱騰騰的工作資料",
        attachments=file
    )

if __name__ == "__main__":
    # email('分析師')
    email(find_key='金融', select_salary=40000, select_area=['台北市', '新北市', '桃園市'], related_key='分析', related_content='管理')
--------------------------------------------------------------------------------