├── .github └── workflows │ ├── blank.yml │ └── python-app.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── Dockerfile_dev ├── README.md ├── api ├── .dockerignore ├── Dockerfile ├── app.py ├── requirements.txt └── worker.py ├── celery_queue ├── .dockerignore ├── Dockerfile ├── IndeedScrapper │ ├── README.md │ ├── __init__.py │ ├── indeed_extract.py │ └── indeed_scrapper.py ├── __init__.py ├── log.txt ├── requirements.txt └── tasks.py ├── cron_indeed_scrapping_test.py ├── cron_test.py ├── dev ├── 104 │ ├── code1.py │ ├── code2.py │ ├── code3.py │ └── index.txt └── test-1.ipynb ├── doc └── pic │ ├── architecture.jpg │ ├── architecture.svg │ ├── celery.jpg │ └── celery.svg ├── docker-compose.yml ├── legacy_project ├── archived │ ├── bank_swiftcode │ │ ├── UK_bank_swift_code_list.csv │ │ ├── grab_bank_list.py │ │ └── grab_bank_list_muitiprocess.py │ ├── booking │ │ ├── bookingcom_scrap.py │ │ └── next_page_sample.py │ ├── efish_scraping_demo.ipynb │ ├── glassdoor │ │ └── glassdoor_scrap.py │ └── spotify │ │ └── spotify_album copy.sh ├── blu_move │ ├── analysis.sql │ ├── blu_.json │ ├── blu_scrape_V1.py │ ├── blu_scrape_V1.sh │ ├── blu_scrape_V2.py │ ├── blu_scrape_V2.sh │ ├── run.sh │ ├── utility_data_IO.py │ └── utility_data_preprocess.py ├── carandclassic │ ├── README.md │ ├── analysis │ │ ├── .ipynb_checkpoints │ │ │ └── Rental_Location_EDA-checkpoint.ipynb │ │ ├── DemoLondonRentals.csv │ │ ├── README.md │ │ └── Rental_Location_EDA.ipynb │ ├── carandclassic_scrape_sample.csv │ └── cclassic_scrape_V1.py ├── carousell │ └── web_crawler copy.py ├── delivery_ │ ├── .gitignore │ ├── README.md │ ├── analysis.py │ ├── analysis.sql │ ├── data2db.py │ ├── query_test.sh │ ├── scrap.py │ ├── sqlite2csv.sh │ ├── weather.csv │ └── weather.db ├── env.md ├── es_scrapper_docker_demo │ ├── Dockerfile │ ├── README.md │ ├── app.py │ ├── docker-compose.yml │ └── requirements.txt ├── eztable │ ├── eztable_scarp.py │ ├── eztable_scrap_dev.py │ ├── eztable_scrap_dev2.py │ ├── eztable_scrap_inputword.py │ └── geckodriver.log ├── facebook_fan_page │ ├── google_scrap_fb_page_final.ipynb │ └── scrap_fb_page_test.ipynb ├── geojson.py ├── google_geodata │ ├── geopy_address_lon_lat.py │ └── gmap_address_lon_lat.py ├── ipeen │ ├── README.md │ ├── ipeen_grab.py │ ├── ipeen_pivot.py │ ├── ipeen_restaurant_grab_V2.ipynb │ ├── ipeen_restaurant_pivot_table.ipynb │ └── ipeen_scraping-final.ipynb ├── script │ ├── __init__.py │ ├── utility_data_IO.py │ └── utility_operation.py ├── setup.sh └── weather_scrapper │ ├── LDN_weather_scrapper_V1.py │ ├── README.md │ └── br_weather_scrapper_V1.py ├── logs └── log.txt ├── output └── 2019-08-14_jobs_1.txt ├── requirements.txt ├── script └── send_mail.py ├── slack_push.sh ├── tests ├── unit_test.py ├── unit_test_celery.py └── unittest_data.txt └── travis_push_github.sh /.github/workflows/blank.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Run a one-line script 13 | run: echo this is the dummy test 14 | - name: Run a multi-line script 15 | run: | 16 | echo *** build start *** 17 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more 
information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Test with pytest 32 | run: | 33 | pytest 34 | - name: Run indeed scrapping test 35 | run: | 36 | python cron_indeed_scrapping_test.py 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *__pycache__ 3 | *.DS_Store 4 | .DS_Store 5 | *.ipynb_checkpoints 6 | .ipynb_checkpoints 7 | Indeed_scrapper_nb_V1.ipynb 8 | logs/log.txt 9 | celery_queue/celerybeat-schedule.db 10 | celery_queue/celerybeat.pid -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | services: 4 | - docker 5 | branches: 6 | only: 7 | - master 8 | notifications: 9 | email: 10 | on_failure: always 11 | recipients: 12 | - f339339@gmail.com 13 | script: 14 | - echo ' ----------------- STEP 0) UNIT TEST ----------------- ' 15 | - pytest -v tests 16 | - python tests/unit_test_celery.py -v 17 | - echo ' ----------------- STEP 1) INDEED SCRAPING ----------------- ' 18 | - python cron_indeed_scrapping_test.py >> indeed_task.log 19 | - ls output && ls logs && cat logs/log.txt 20 | #- docker build -t web_scraping_env . && docker run -it web_scraping_env /bin/bash -c "python cron_test.py && bash travis_push_github.sh" 21 | after_success: 22 | - echo 'push scraped file to slack...' && current_date=$(date +'%Y-%m-%d') && for file_name in $(ls output/*.csv) ; do echo $file_name $slack_channel && bash slack_push.sh -f $file_name -c $slack_channel -s $slack_token -n "TRAVIS SLACK PUSH" -x " >>>> INDEED SCRAPING REPORT $file_name" ; done 23 | - echo 'push LOG to slack...' 
&& bash slack_push.sh -f indeed_task.log -c web_scraping_log -s $slack_token 24 | 25 | 26 | # deploy: 27 | # provider: script 28 | # script: bash travis_push_github.sh 29 | # skip-cleanup: true 30 | # target-branch: master 31 | # github-token: $GH_TOKEN 32 | # keep-history: true 33 | # verbose: true 34 | # on: 35 | # branch: master 36 | # env: 37 | # global: 38 | # secure: Yfr36/XdwtZyjUBJwYTboFAfH5qqSYRd7d1vx/vHO1fCP4XtQWqT1Lvo5pfbHXghOjiJZZcfhO72inUKJ7er9QXznsGufj6nnQUJs/dOoBbfGnLSdvSYT6lpXTe7GYMbOgUsmYtjeD8S6pyL2L8xcX1fPZzsVD7v/edG9kZo1H9+fKCbVipBNf0IXO4DaE1H4vw77UVb6ysA3npxyIprM4jXUkZW3KFb7fA7/LENpS1NPniQxYe1LuUjzOpdJAG28WIeQnC/Cb+jz16cRtIV7HgukG0WnpHdszI+Xj4Kx+46URZnXW95cpZ2cq4Oywx98XZbC5uEXn3GeB/9JgvnuNsfsYOzhdCg29Ca/JGiUyri7F/x3mFxMfl2OoJeO50R4JTnwPrAHot8m914rP/VXtGZFPJQfXjoyKQJPnHFO0Yt+IJ9ziK3r3tLcdrbYngPuoBHFEYr4f87jOjdiyn/+1x9liLYh+Z0/6UdbQJRQnsAh+ghSvs1M7FIKY4eMHPW9qKPUbfsQIRckTzC6U7lX16eiPQk+wehJ7o//FB6MFOEvbownBcDUooITJXgC0Cvtpd831ktlkxPqyJh13X9URbEyD25zG58zI9Bq7RfeCjWN8LZaa7bLyjhDR2KzAvWDfKowbUShpznlSSSo2czn81kT1GXaAa4Iz215kNCDfs= -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | LABEL maintainer "yennj12" 4 | 5 | #ENV PYTHONPATH /app 6 | ENV CELERY_BROKER_URL redis://redis:6379/0 7 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0 8 | ENV C_FORCE_ROOT true 9 | ADD requirements.txt /app/requirements.txt 10 | ADD ./test_celery/ /app/ 11 | COPY . /app 12 | WORKDIR /app/ 13 | RUN pip install -r requirements.txt 14 | ENTRYPOINT celery -A task worker --loglevel=info -------------------------------------------------------------------------------- /Dockerfile_dev: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | LABEL maintainer "yennj12" 4 | 5 | ENV HOME / 6 | WORKDIR $HOME 7 | COPY . $HOME 8 | 9 | RUN pip install --upgrade pip && \ 10 | pip install -r requirements.txt && \ 11 | pwd && ls && ls home 12 | 13 | RUN /bin/bash -c "python cron_test.py" 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web_scraping 2 | 3 | Collection of scraper pipelines built for different purposes 4 | 5 | [](https://travis-ci.org/yennanliu/web_scraping) 6 | [](https://github.com/yennanliu/web_scraping/pulls) 7 | 8 | 9 | ### Architecture 10 |
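The pipeline is driven by the Celery tasks defined in `celery_queue/tasks.py` (the same ones exercised in `tests/unit_test_celery.py`). As a rough, illustrative sketch only — assuming the Redis broker/result backend from the Dockerfile env vars is reachable (e.g. after `docker-compose up`) and that the repo root is on `PYTHONPATH` — the tasks could be enqueued from the host like this:

```python
# Illustrative only (not a file in this repo): enqueue the Celery tasks from
# celery_queue/tasks.py against the Redis broker configured in the Dockerfile.
from celery_queue import tasks

# smoke-test task, the same one used in tests/unit_test_celery.py
result = tasks.add.delay(3, 5)     # push onto the queue
print(result.get(timeout=30))      # -> 8 once a worker has picked it up

# queue an Indeed scrape for a single city; the task returns None and the
# scraped rows are delivered via the output/ folder and the Slack push step
tasks.indeed_scrape_api.delay("Tokyo")
```

When the stack runs via `docker-compose`, the Celery beat schedule (note the `celerybeat-schedule.db` entry in `.gitignore`) triggers these tasks periodically, so manual enqueueing like the above is only needed for ad-hoc runs.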
\n", 174 | " | name | \n", 175 | "address | \n", 176 | "url | \n", 177 | "style | \n", 178 | "
---|---|---|---|---|
0 | \n", 183 | "吉宏米粉湯(西門町店) | \n", 184 | "台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 185 | "http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店 | \n", 186 | "小吃 | \n", 187 | "
1 | \n", 190 | "瓦法奇朵Waffogato(台北車站店) | \n", 191 | "台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t... | \n", 192 | "http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog... | \n", 193 | "異國料理 | \n", 194 | "
2 | \n", 197 | "添好運台灣 Timhowan Taiwan | \n", 198 | "台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 199 | "http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... | \n", 200 | "中式料理 | \n", 201 | "
3 | \n", 204 | "威靈頓街1號 粥麵茶餐廳 | \n", 205 | "台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 206 | "http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 | \n", 207 | "中式料理 | \n", 208 | "
4 | \n", 211 | "劉山東小吃店 | \n", 212 | "台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... | \n", 213 | "http://www.ipeen.com.tw/shop/44415-劉山東小吃店 | \n", 214 | "中式料理 | \n", 215 | "
\n", 282 | " | name | \n", 283 | "address | \n", 284 | "url | \n", 285 | "style | \n", 286 | "area | \n", 287 | "
---|---|---|---|---|---|
0 | \n", 292 | "吉宏米粉湯(西門町店) | \n", 293 | "台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 294 | "http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店 | \n", 295 | "小吃 | \n", 296 | "萬華區 | \n", 297 | "
1 | \n", 300 | "瓦法奇朵Waffogato(台北車站店) | \n", 301 | "台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t... | \n", 302 | "http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog... | \n", 303 | "異國料理 | \n", 304 | "中正區 | \n", 305 | "
2 | \n", 308 | "添好運台灣 Timhowan Taiwan | \n", 309 | "台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 310 | "http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... | \n", 311 | "中式料理 | \n", 312 | "中正區 | \n", 313 | "
3 | \n", 316 | "威靈頓街1號 粥麵茶餐廳 | \n", 317 | "台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 318 | "http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 | \n", 319 | "中式料理 | \n", 320 | "中正區 | \n", 321 | "
4 | \n", 324 | "劉山東小吃店 | \n", 325 | "台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... | \n", 326 | "http://www.ipeen.com.tw/shop/44415-劉山東小吃店 | \n", 327 | "中式料理 | \n", 328 | "中正區 | \n", 329 | "
\n", 83 | " | name | \n", 84 | "address | \n", 85 | "url | \n", 86 | "style | \n", 87 | "area | \n", 88 | "
---|---|---|---|---|---|
0 | \n", 93 | "鹿兒島燒肉專賣店(中和中山店) | \n", 94 | "新北市中和區中山路二段28號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n... | \n", 95 | "http://www.ipeen.com.tw/shop/1128665-鹿兒島燒肉專賣店-... | \n", 96 | "燒烤類 | \n", 97 | "中和區 | \n", 98 | "
1 | \n", 101 | "青禾幸福鍋物涮涮屋(永安店) | \n", 102 | "新北市中和區中和路380號2樓(永安市場捷運站旁)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 103 | "http://www.ipeen.com.tw/shop/138215-青禾幸福鍋物涮涮屋-永安店 | \n", 104 | "鍋類 | \n", 105 | "中和區 | \n", 106 | "
\n", 143 | " | area | \n", 144 | "style | \n", 145 | "count | \n", 146 | "
---|---|---|---|
0 | \n", 151 | "\\t\\t\\t | \n", 152 | "冷凍/冷藏包裝食品 | \n", 153 | "3 | \n", 154 | "
1 | \n", 157 | "\\t\\t\\t | \n", 158 | "常溫包裝食品 | \n", 159 | "6 | \n", 160 | "
2 | \n", 163 | "\\t\\t\\t | \n", 164 | "網購包裝食品 | \n", 165 | "12 | \n", 166 | "
area | \n", 221 | "style | \n", 222 | "中和區 | \n", 223 | "中山區 | \n", 224 | "中正區 | \n", 225 | "信義區 | \n", 226 | "大同區 | \n", 227 | "大安區 | \n", 228 | "松山區 | \n", 229 | "板橋區 | \n", 230 | "永和區 | \n", 231 | "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", 236 | "buffet自助餐 | \n", 237 | "1.0 | \n", 238 | "18.0 | \n", 239 | "6.0 | \n", 240 | "15.0 | \n", 241 | "6.0 | \n", 242 | "9.0 | \n", 243 | "8.0 | \n", 244 | "6.0 | \n", 245 | "2.0 | \n", 246 | "
1 | \n", 249 | "style | \n", 250 | "0.0 | \n", 251 | "0.0 | \n", 252 | "0.0 | \n", 253 | "0.0 | \n", 254 | "0.0 | \n", 255 | "0.0 | \n", 256 | "0.0 | \n", 257 | "0.0 | \n", 258 | "0.0 | \n", 259 | "
2 | \n", 262 | "中式料理 | \n", 263 | "376.0 | \n", 264 | "851.0 | \n", 265 | "667.0 | \n", 266 | "524.0 | \n", 267 | "285.0 | \n", 268 | "907.0 | \n", 269 | "618.0 | \n", 270 | "462.0 | \n", 271 | "286.0 | \n", 272 | "
3 | \n", 275 | "主題特色餐廳 | \n", 276 | "20.0 | \n", 277 | "160.0 | \n", 278 | "63.0 | \n", 279 | "101.0 | \n", 280 | "46.0 | \n", 281 | "218.0 | \n", 282 | "73.0 | \n", 283 | "54.0 | \n", 284 | "36.0 | \n", 285 | "
4 | \n", 288 | "亞洲料理 | \n", 289 | "76.0 | \n", 290 | "130.0 | \n", 291 | "117.0 | \n", 292 | "153.0 | \n", 293 | "31.0 | \n", 294 | "243.0 | \n", 295 | "112.0 | \n", 296 | "78.0 | \n", 297 | "50.0 | \n", 298 | "
5 | \n", 301 | "其他美食 | \n", 302 | "46.0 | \n", 303 | "44.0 | \n", 304 | "68.0 | \n", 305 | "32.0 | \n", 306 | "14.0 | \n", 307 | "54.0 | \n", 308 | "40.0 | \n", 309 | "46.0 | \n", 310 | "28.0 | \n", 311 | "
6 | \n", 314 | "冰品、飲料、甜湯 | \n", 315 | "99.0 | \n", 316 | "193.0 | \n", 317 | "218.0 | \n", 318 | "195.0 | \n", 319 | "131.0 | \n", 320 | "315.0 | \n", 321 | "173.0 | \n", 322 | "187.0 | \n", 323 | "103.0 | \n", 324 | "
7 | \n", 327 | "冷凍/冷藏包裝食品 | \n", 328 | "0.0 | \n", 329 | "0.0 | \n", 330 | "0.0 | \n", 331 | "0.0 | \n", 332 | "0.0 | \n", 333 | "0.0 | \n", 334 | "0.0 | \n", 335 | "0.0 | \n", 336 | "0.0 | \n", 337 | "
8 | \n", 340 | "咖啡、簡餐、茶 | \n", 341 | "167.0 | \n", 342 | "667.0 | \n", 343 | "495.0 | \n", 344 | "405.0 | \n", 345 | "196.0 | \n", 346 | "798.0 | \n", 347 | "435.0 | \n", 348 | "262.0 | \n", 349 | "140.0 | \n", 350 | "
9 | \n", 353 | "小吃 | \n", 354 | "307.0 | \n", 355 | "352.0 | \n", 356 | "420.0 | \n", 357 | "274.0 | \n", 358 | "335.0 | \n", 359 | "525.0 | \n", 360 | "384.0 | \n", 361 | "487.0 | \n", 362 | "236.0 | \n", 363 | "
10 | \n", 366 | "常溫包裝食品 | \n", 367 | "0.0 | \n", 368 | "0.0 | \n", 369 | "0.0 | \n", 370 | "0.0 | \n", 371 | "0.0 | \n", 372 | "0.0 | \n", 373 | "0.0 | \n", 374 | "0.0 | \n", 375 | "0.0 | \n", 376 | "
11 | \n", 379 | "日式料理 | \n", 380 | "117.0 | \n", 381 | "578.0 | \n", 382 | "289.0 | \n", 383 | "265.0 | \n", 384 | "111.0 | \n", 385 | "547.0 | \n", 386 | "270.0 | \n", 387 | "231.0 | \n", 388 | "99.0 | \n", 389 | "
12 | \n", 392 | "早餐 | \n", 393 | "73.0 | \n", 394 | "69.0 | \n", 395 | "63.0 | \n", 396 | "66.0 | \n", 397 | "21.0 | \n", 398 | "75.0 | \n", 399 | "59.0 | \n", 400 | "142.0 | \n", 401 | "48.0 | \n", 402 | "
13 | \n", 405 | "烘焙、甜點、零食 | \n", 406 | "95.0 | \n", 407 | "223.0 | \n", 408 | "238.0 | \n", 409 | "221.0 | \n", 410 | "100.0 | \n", 411 | "364.0 | \n", 412 | "161.0 | \n", 413 | "166.0 | \n", 414 | "75.0 | \n", 415 | "
14 | \n", 418 | "燒烤類 | \n", 419 | "74.0 | \n", 420 | "187.0 | \n", 421 | "75.0 | \n", 422 | "82.0 | \n", 423 | "23.0 | \n", 424 | "182.0 | \n", 425 | "96.0 | \n", 426 | "107.0 | \n", 427 | "55.0 | \n", 428 | "
15 | \n", 431 | "異國料理 | \n", 432 | "109.0 | \n", 433 | "328.0 | \n", 434 | "245.0 | \n", 435 | "341.0 | \n", 436 | "80.0 | \n", 437 | "688.0 | \n", 438 | "299.0 | \n", 439 | "259.0 | \n", 440 | "94.0 | \n", 441 | "
16 | \n", 444 | "素食 | \n", 445 | "23.0 | \n", 446 | "71.0 | \n", 447 | "50.0 | \n", 448 | "52.0 | \n", 449 | "14.0 | \n", 450 | "62.0 | \n", 451 | "45.0 | \n", 452 | "39.0 | \n", 453 | "23.0 | \n", 454 | "
17 | \n", 457 | "網購包裝食品 | \n", 458 | "0.0 | \n", 459 | "0.0 | \n", 460 | "0.0 | \n", 461 | "0.0 | \n", 462 | "0.0 | \n", 463 | "0.0 | \n", 464 | "0.0 | \n", 465 | "0.0 | \n", 466 | "0.0 | \n", 467 | "
18 | \n", 470 | "速食料理 | \n", 471 | "29.0 | \n", 472 | "52.0 | \n", 473 | "63.0 | \n", 474 | "47.0 | \n", 475 | "24.0 | \n", 476 | "73.0 | \n", 477 | "57.0 | \n", 478 | "47.0 | \n", 479 | "25.0 | \n", 480 | "
19 | \n", 483 | "鍋類 | \n", 484 | "113.0 | \n", 485 | "205.0 | \n", 486 | "108.0 | \n", 487 | "117.0 | \n", 488 | "47.0 | \n", 489 | "227.0 | \n", 490 | "143.0 | \n", 491 | "158.0 | \n", 492 | "108.0 | \n", 493 | "
\n", 311 | " | name | \n", 312 | "address | \n", 313 | "url | \n", 314 | "
---|---|---|---|
0 | \n", 319 | "黑潮市集花甲蟹鍋 | \n", 320 | "台北市大安區光復南路692巷6號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 321 | "http://www.ipeen.com.tw/shop/805942-黑潮市集花甲蟹鍋 | \n", 322 | "
1 | \n", 325 | "添好運台灣 Timhowan Taiwan | \n", 326 | "台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 327 | "http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... | \n", 328 | "
2 | \n", 331 | "威靈頓街1號 粥麵茶餐廳 | \n", 332 | "台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 333 | "http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 | \n", 334 | "
3 | \n", 337 | "劉山東小吃店 | \n", 338 | "台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 339 | "http://www.ipeen.com.tw/shop/44415-劉山東小吃店 | \n", 340 | "
4 | \n", 343 | "阜杭豆漿店 | \n", 344 | "台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... | \n", 345 | "http://www.ipeen.com.tw/shop/27702-阜杭豆漿店 | \n", 346 | "
\n", 473 | " | name | \n", 474 | "address | \n", 475 | "url | \n", 476 | "
---|---|---|---|
0 | \n", 481 | "酒食坊 Pān-toh Bistro | \n", 482 | "台北市松山區光復北路7號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 483 | "http://www.ipeen.com.tw/shop/1046690-酒食坊-Pān-t... | \n", 484 | "
1 | \n", 487 | "添好運台灣 Timhowan Taiwan | \n", 488 | "台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 489 | "http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... | \n", 490 | "
2 | \n", 493 | "威靈頓街1號 粥麵茶餐廳 | \n", 494 | "台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... | \n", 495 | "http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 | \n", 496 | "
3 | \n", 499 | "劉山東小吃店 | \n", 500 | "台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n | \n", 501 | "http://www.ipeen.com.tw/shop/44415-劉山東小吃店 | \n", 502 | "
4 | \n", 505 | "阜杭豆漿店 | \n", 506 | "台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... | \n", 507 | "http://www.ipeen.com.tw/shop/27702-阜杭豆漿店 | \n", 508 | "
Here's a paragraph of text!
""" 23 | result = get_soup_(text) 24 | assert result.text.strip() == "Here's a paragraph of text!" 25 | 26 | def test_extract_company(): 27 | expected = '\\n\\nU3 INFOTECH PTE. LTD.' 28 | soup = BeautifulSoup(html) 29 | result = extract_company_(soup) 30 | assert result == expected 31 | 32 | def test_extract_salary(): 33 | expected = 'NOT_FOUND' 34 | soup = BeautifulSoup(html) 35 | result = extract_salary_(soup) 36 | assert result == expected 37 | 38 | def test_extract_location(): 39 | expected= 'Shenton Way' 40 | soup = BeautifulSoup(html) 41 | result = extract_location_(soup) 42 | assert result == expected 43 | 44 | def test_extract_job_title(): 45 | expected= 'NOT_FOUND' 46 | soup = BeautifulSoup(html) 47 | result = extract_job_title_(soup) 48 | assert result == expected 49 | 50 | def test_extract_summary(): 51 | expected= 'NOT_FOUND' 52 | soup = BeautifulSoup(html) 53 | result = extract_summary_(soup) 54 | assert result == expected 55 | 56 | def test_extract_link(): 57 | expected= 'NOT_FOUND' 58 | soup = BeautifulSoup(html) 59 | result = extract_link_(soup) 60 | assert result == expected 61 | 62 | def test_extract_date(): 63 | expected= '1 day ago' 64 | soup = BeautifulSoup(html) 65 | result = extract_date_(soup) 66 | assert result == expected 67 | 68 | def test_extract_fulltext(): 69 | expected= 'NOT_FOUND' 70 | soup = BeautifulSoup(html) 71 | result = extract_fulltext_(soup) 72 | assert result == expected 73 | 74 | def test_get_full_job_link_(): 75 | expected1 = 'https://www.indeed.com.sg/123' 76 | expected2 = 'https://jp.indeed.com/123' 77 | result1 = get_full_job_link_("123", city='Singapore') 78 | result2 = get_full_job_link_("123", city='Tokyo') 79 | assert result1 == expected1 80 | assert result2 == expected2 81 | 82 | 83 | if __name__ == '__main__': 84 | pytest.main([__file__]) 85 | -------------------------------------------------------------------------------- /tests/unit_test_celery.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | sys.path.append(".") 4 | sys.path.append("./celery_queue") 5 | import pytest 6 | from unittest.mock import patch 7 | from unittest import TestCase 8 | from celery import chain 9 | from celery_queue import tasks 10 | 11 | # Ref of celery mock unit test 12 | # - https://www.distributedpython.com/2018/05/15/testing-celery-chains/ 13 | # - https://www.distributedpython.com/2018/05/01/unit-testing-celery-tasks/ 14 | # - http://docs.celeryproject.org/en/latest/userguide/testing.html 15 | 16 | class TestAddTask(unittest.TestCase): 17 | 18 | def test_task_state_and_addition(self): 19 | 20 | task = tasks.add.apply(args=[3, 5]) 21 | self.assertEqual(task.status, "SUCCESS") 22 | self.assertEqual(task.result, 8) 23 | 24 | class TestMultiplyTask(unittest.TestCase): 25 | 26 | def test_task_state_and_multiply(self): 27 | 28 | task = tasks.multiply.apply(args=[3, 5]) 29 | self.assertEqual(task.status, "SUCCESS") 30 | self.assertEqual(task.result, 15) 31 | 32 | class TestScrapeTask(unittest.TestCase): 33 | 34 | def test_task_state_and_scrape(self): 35 | 36 | task = tasks.scrape.apply() 37 | self.assertEqual(task.status, "SUCCESS") 38 | self.assertEqual(type(task.result), str) 39 | 40 | class TestIndeedScrapTask(unittest.TestCase): 41 | 42 | def test_task_indeed_scrape(self): 43 | 44 | task = tasks.indeed_scrape.apply() 45 | self.assertEqual(task.status, "SUCCESS") 46 | self.assertEqual(type(task.result), type(None)) 47 | 48 | class TestIndeedScrapAPITask(unittest.TestCase): 49 | 50 | def 
test_task_indeed_scrape_api(self): 51 | 52 | task = tasks.indeed_scrape_api.apply(args=["Tokyo"]) 53 | self.assertEqual(task.status, "SUCCESS") 54 | self.assertEqual(type(task.result), type(None)) 55 | 56 | 57 | # class TestAddTask(unittest.TestCase): 58 | # 59 | # def setUp(self): 60 | # self.task = add.apply_async(args=[3, 5]) 61 | # self.results = self.task.get() 62 | # 63 | # def test_task_state(self): 64 | # self.assertEqual(self.task.state, "SUCCESS") 65 | # 66 | # def test_addition(self): 67 | # self.assertEqual(self.results, 8) 68 | # 69 | # class TestMultiplyTask(unittest.TestCase): 70 | # 71 | # def setUp(self): 72 | # self.task = multiply.apply_async(args=[3, 5]) 73 | # self.results = self.task.get() 74 | # 75 | # def test_task_state(self): 76 | # self.assertEqual(self.task.state, "SUCCESS") 77 | # 78 | # def test_multiplication(self): 79 | # self.assertEqual(self.results, 15) 80 | # 81 | # class TestScrapeTask(unittest.TestCase): 82 | # 83 | # def setUp(self): 84 | # self.task = scrape.apply_async() 85 | # self.results = self.task.get() 86 | # 87 | # def test_task_state(self): 88 | # self.assertEqual(self.task.state, "SUCCESS") 89 | # 90 | # def test_scraping(self): 91 | # self.assertEqual(type(self.results), str) 92 | 93 | if __name__ == '__main__': 94 | unittest.main() -------------------------------------------------------------------------------- /travis_push_github.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #################################################################### 3 | # MODIFIED FROM https://gist.github.com/willprice/e07efd73fb7f13f917ea 4 | #################################################################### 5 | 6 | setup_git() { 7 | git init 8 | git config --global user.email "travis@travis-ci.org" 9 | git config --global user.name "Travis CI" 10 | } 11 | 12 | commit_website_files() { 13 | git checkout -b gh-pages 14 | git add . *.html 15 | git commit --message "Travis build: $TRAVIS_BUILD_NUMBER" 16 | } 17 | 18 | commit_output_file() { 19 | git status 20 | git add output/* 21 | git commit -m "Travis build : $TRAVIS_BUILD_NUMBER" 22 | } 23 | 24 | commit_new_output_file() { 25 | d=`date +%Y-%m-%d` && echo $d 26 | git status 27 | for file in "output"/* 28 | do 29 | # only commit output files whose name contains today's date 30 | if [[ "$file" != *"$d"* ]]; then 31 | echo "not today's new file, nothing to commit" 32 | else 33 | echo "commit new file..." 34 | git add output/* 35 | git commit -m "Travis build : $TRAVIS_BUILD_NUMBER" 36 | fi 37 | done 38 | } 39 | 40 | upload_files() { 41 | echo 'Travis push to github' 42 | git push https://yennanliu:${GH_TOKEN}@${GH_REF} HEAD:master --quiet 43 | 44 | } 45 | 46 | GH_REF=github.com/yennanliu/web_scraping.git 47 | setup_git 48 | commit_new_output_file 49 | upload_files