├── .github
    └── workflows
    │   ├── blank.yml
    │   └── python-app.yml
├── .gitignore
├── .travis.yml
├── Dockerfile
├── Dockerfile_dev
├── README.md
├── api
    ├── .dockerignore
    ├── Dockerfile
    ├── app.py
    ├── requirements.txt
    └── worker.py
├── celery_queue
    ├── .dockerignore
    ├── Dockerfile
    ├── IndeedScrapper
    │   ├── README.md
    │   ├── __init__.py
    │   ├── indeed_extract.py
    │   └── indeed_scrapper.py
    ├── __init__.py
    ├── log.txt
    ├── requirements.txt
    └── tasks.py
├── cron_indeed_scrapping_test.py
├── cron_test.py
├── dev
    ├── 104
    │   ├── code1.py
    │   ├── code2.py
    │   ├── code3.py
    │   └── index.txt
    └── test-1.ipynb
├── doc
    └── pic
    │   ├── architecture.jpg
    │   ├── architecture.svg
    │   ├── celery.jpg
    │   └── celery.svg
├── docker-compose.yml
├── legacy_project
    ├── archived
    │   ├── bank_swiftcode
    │   │   ├── UK_bank_swift_code_list.csv
    │   │   ├── grab_bank_list.py
    │   │   └── grab_bank_list_muitiprocess.py
    │   ├── booking
    │   │   ├── bookingcom_scrap.py
    │   │   └── next_page_sample.py
    │   ├── efish_scraping_demo.ipynb
    │   ├── glassdoor
    │   │   └── glassdoor_scrap.py
    │   └── spotify
    │   │   └── spotify_album copy.sh
    ├── blu_move
    │   ├── analysis.sql
    │   ├── blu_.json
    │   ├── blu_scrape_V1.py
    │   ├── blu_scrape_V1.sh
    │   ├── blu_scrape_V2.py
    │   ├── blu_scrape_V2.sh
    │   ├── run.sh
    │   ├── utility_data_IO.py
    │   └── utility_data_preprocess.py
    ├── carandclassic
    │   ├── README.md
    │   ├── analysis
    │   │   ├── .ipynb_checkpoints
    │   │   │   └── Rental_Location_EDA-checkpoint.ipynb
    │   │   ├── DemoLondonRentals.csv
    │   │   ├── README.md
    │   │   └── Rental_Location_EDA.ipynb
    │   ├── carandclassic_scrape_sample.csv
    │   └── cclassic_scrape_V1.py
    ├── carousell
    │   └── web_crawler copy.py
    ├── delivery_
    │   ├── .gitignore
    │   ├── README.md
    │   ├── analysis.py
    │   ├── analysis.sql
    │   ├── data2db.py
    │   ├── query_test.sh
    │   ├── scrap.py
    │   ├── sqlite2csv.sh
    │   ├── weather.csv
    │   └── weather.db
    ├── env.md
    ├── es_scrapper_docker_demo
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── app.py
    │   ├── docker-compose.yml
    │   └── requirements.txt
    ├── eztable
    │   ├── eztable_scarp.py
    │   ├── eztable_scrap_dev.py
    │   ├── eztable_scrap_dev2.py
    │   ├── eztable_scrap_inputword.py
    │   └── geckodriver.log
    ├── facebook_fan_page
    │   ├── google_scrap_fb_page_final.ipynb
    │   └── scrap_fb_page_test.ipynb
    ├── geojson.py
    ├── google_geodata
    │   ├── geopy_address_lon_lat.py
    │   └── gmap_address_lon_lat.py
    ├── ipeen
    │   ├── README.md
    │   ├── ipeen_grab.py
    │   ├── ipeen_pivot.py
    │   ├── ipeen_restaurant_grab_V2.ipynb
    │   ├── ipeen_restaurant_pivot_table.ipynb
    │   └── ipeen_scraping-final.ipynb
    ├── script
    │   ├── __init__.py
    │   ├── utility_data_IO.py
    │   └── utility_operation.py
    ├── setup.sh
    └── weather_scrapper
    │   ├── LDN_weather_scrapper_V1.py
    │   ├── README.md
    │   └── br_weather_scrapper_V1.py
├── logs
    └── log.txt
├── output
    └── 2019-08-14_jobs_1.txt
├── requirements.txt
├── script
    └── send_mail.py
├── slack_push.sh
├── tests
    ├── unit_test.py
    ├── unit_test_celery.py
    └── unittest_data.txt
└── travis_push_github.sh


/.github/workflows/blank.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 | 
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Run a one-line script
13 |       run: echo this is the dummy test
14 |     - name: Run a multi-line script
15 |       run: |
16 |         echo  *** build start ***
17 | 


--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "master" ]
 9 |   pull_request:
10 |     branches: [ "master" ]
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v3
22 |     - name: Set up Python 3.10
23 |       uses: actions/setup-python@v3
24 |       with:
25 |         python-version: "3.10"
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         pip install flake8 pytest
30 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 |     - name: Test with pytest
32 |       run: |
33 |         pytest
34 |     - name: Run indeed scrapping test
35 |       run: |
36 |         python cron_indeed_scrapping_test.py
37 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *__pycache__
 3 | *.DS_Store
 4 | .DS_Store
 5 | *.ipynb_checkpoints
 6 | .ipynb_checkpoints
 7 | Indeed_scrapper_nb_V1.ipynb
 8 | logs/log.txt
 9 | celery_queue/celerybeat-schedule.db
10 | celery_queue/celerybeat.pid


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | language: python
 3 | services:
 4 |   - docker
 5 | branches:
 6 |   only:
 7 |   - master
 8 | notifications:
 9 |   email:
10 |     on_failure: always
11 |     recipients:
12 |       - f339339@gmail.com
13 | script:
14 |   - echo ' ----------------- STEP 0) UNIT TEST  ----------------- '
15 |   - pytest -v tests 
16 |   - python tests/unit_test_celery.py  -v
17 |   - echo ' ----------------- STEP 1) INDEED SCRAPING ----------------- '
18 |   - python cron_indeed_scrapping_test.py >> indeed_task.log
19 |   - ls output && ls logs  && cat logs/log.txt 
20 |   #- docker build -t web_scraping_env . && docker run -it web_scraping_env /bin/bash -c "python cron_test.py && bash travis_push_github.sh"
21 | after_success:
22 |   - echo 'push scraped file to slack...' && current_date=$(date +'%Y-%m-%d') && for file_name in $(ls output/*.csv) ;  do  echo $file_name $slack_channel &&  bash slack_push.sh -f $file_name -c $slack_channel -s $slack_token  -n "TRAVIS SLACK PUSH" -x " >>>> INDEED SCRAPING REPORT  $file_name" ; done
23 |   - echo 'push LOG to slack...' && bash slack_push.sh -f indeed_task.log -c web_scraping_log -s $slack_token 
24 | 
25 | 
26 | # deploy:
27 | #   provider: script
28 | #   script: bash travis_push_github.sh
29 | #   skip-cleanup: true
30 | #   target-branch: master
31 | #   github-token: $GH_TOKEN
32 | #   keep-history: true
33 | #   verbose: true
34 | #   on:
35 | #     branch: master
36 | # env:
37 | #   global:
38 | #     secure: Yfr36/XdwtZyjUBJwYTboFAfH5qqSYRd7d1vx/vHO1fCP4XtQWqT1Lvo5pfbHXghOjiJZZcfhO72inUKJ7er9QXznsGufj6nnQUJs/dOoBbfGnLSdvSYT6lpXTe7GYMbOgUsmYtjeD8S6pyL2L8xcX1fPZzsVD7v/edG9kZo1H9+fKCbVipBNf0IXO4DaE1H4vw77UVb6ysA3npxyIprM4jXUkZW3KFb7fA7/LENpS1NPniQxYe1LuUjzOpdJAG28WIeQnC/Cb+jz16cRtIV7HgukG0WnpHdszI+Xj4Kx+46URZnXW95cpZ2cq4Oywx98XZbC5uEXn3GeB/9JgvnuNsfsYOzhdCg29Ca/JGiUyri7F/x3mFxMfl2OoJeO50R4JTnwPrAHot8m914rP/VXtGZFPJQfXjoyKQJPnHFO0Yt+IJ9ziK3r3tLcdrbYngPuoBHFEYr4f87jOjdiyn/+1x9liLYh+Z0/6UdbQJRQnsAh+ghSvs1M7FIKY4eMHPW9qKPUbfsQIRckTzC6U7lX16eiPQk+wehJ7o//FB6MFOEvbownBcDUooITJXgC0Cvtpd831ktlkxPqyJh13X9URbEyD25zG58zI9Bq7RfeCjWN8LZaa7bLyjhDR2KzAvWDfKowbUShpznlSSSo2czn81kT1GXaAa4Iz215kNCDfs=


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.6-alpine
 2 | 
 3 | LABEL maintainer "yennj12"
 4 | 
 5 | #ENV PYTHONPATH /app
 6 | ENV CELERY_BROKER_URL redis://redis:6379/0
 7 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0
 8 | ENV C_FORCE_ROOT true
 9 | ADD requirements.txt /app/requirements.txt
10 | ADD ./test_celery/ /app/
11 | COPY . /app
12 | WORKDIR /app/
13 | RUN pip install -r requirements.txt
14 | ENTRYPOINT celery -A task  worker --loglevel=info  


--------------------------------------------------------------------------------
/Dockerfile_dev:
--------------------------------------------------------------------------------
 1 | FROM continuumio/miniconda3
 2 | 
 3 | LABEL maintainer "yennj12"
 4 | 
 5 | ENV HOME /
 6 | WORKDIR $HOME
 7 | COPY . $HOME
 8 | 
 9 | RUN pip install --upgrade pip && \
10 | pip install -r requirements.txt && \ 
11 | pwd && ls && ls home   
12 | 
13 | RUN /bin/bash -c "python cron_test.py"
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # web_scraping
  2 | 
  3 | Collection of scrapper pipelines build for different purposes 
  4 | 
  5 | [![Build Status](https://travis-ci.org/yennanliu/web_scraping.svg?branch=master)](https://travis-ci.org/yennanliu/web_scraping)
  6 | [![PRs](https://img.shields.io/badge/PRs-welcome-6574cd.svg)](https://github.com/yennanliu/web_scraping/pulls)
  7 | 
  8 | 
  9 | ### Architecture
 10 | <p align="center"><img src ="https://github.com/yennanliu/web_scraping/blob/master/doc/pic/architecture.svg" width="800" height="400"></p>
 11 | <p align="center"><img src ="https://github.com/yennanliu/web_scraping/blob/master/doc/pic/celery.svg" width="800" height="400"></p>
 12 | 
 13 | - Architecture idea
 14 | - Asynchronous tasks 
 15 | 	- Celery client : `flask` <---> `Celery client` <---> `Celery worker`. Be connected to flask to the celery task, issue the commands for the tasks
 16 | 	- Celery worker : A process that runs tasks in background, can be a `schedulued`task (periodic task), and a `asynchronous` (when API call) one.
 17 | 	- Massage broker : `Celery client` <--Massage broker-> `Celery worker`. The Celery client will need to via Message worker to communicate with Celery worker. Here I use `Redis` as the Message broker.
 18 | 
 19 | ### Quick Start
 20 | <details>
 21 | <summary>Quick start via docker</summary>
 22 | 
 23 | ```bash
 24 | # Run via docker 
 25 | $ cd ~ && git clone https://github.com/yennanliu/web_scraping
 26 | $ cd ~ && cd web_scraping &&  docker-compose -f  docker-compose.yml up 
 27 | ```
 28 | - visit the services via 
 29 | 	- flower UI : http://localhost:5555/
 30 | 	- Run "add" task : http://localhost:5001/add/1/2
 31 | 	- Run "web scrape" task : http://localhost:5001/scrap_task
 32 | 	- Run "indeed scrape" task : http://localhost:5001/indeed_scrap_task
 33 | 
 34 | </details>
 35 | 
 36 | <details>
 37 | <summary>Quick start manually</summary>
 38 | 
 39 | ```bash
 40 | # Run manually 
 41 | 
 42 | # STEP 1) open one terminal and run celery server locally 
 43 | $ cd ~ && cd web_scraping/celery_queue
 44 | # run task from API call  
 45 | $ celery -A tasks worker --loglevel=info
 46 | # run cron (periodic) task 
 47 | $ celery -A tasks beat
 48 | 
 49 | # STEP 2) Run radis server locally (with the other terminal)
 50 | # make sure you have already installed radis
 51 | $ redis-server
 52 | 
 53 | # STEP 3) Run flower  (with the other terminal)
 54 | $ cd ~ && cd web_scraping/celery_queue
 55 | $ celery flower -A tasks --address=127.0.0.1 --port=5555
 56 | 
 57 | # STEP 4) Add a sample task 
 58 | # "add" task
 59 | $ curl -X POST -d '{"args":[1,2]}' http://localhost:5555/api/task/async-apply/tasks.add
 60 | 
 61 | # "multiply" task
 62 | $ curl -X POST -d '{"args":[3,5]}' http://localhost:5555/api/task/async-apply/tasks.multiply
 63 | 
 64 | # "scrape_task" task
 65 | $ curl -X POST   http://localhost:5555/api/task/async-apply/tasks.scrape_task
 66 | 
 67 | # "scrape_task_api" task
 68 | $ curl -X POST -d '{"args":["mlflow","mlflow"]}' http://localhost:5555/api/task/async-apply/tasks.scrape_task_api
 69 | 
 70 | # "indeed_scrap_task" task
 71 | $ curl -X POST  http://localhost:5555/api/task/async-apply/tasks.indeed_scrap_task
 72 | 
 73 | # "indeed_scrap_api_V1" task
 74 | $ curl -X POST -d '{"args":["New+York"]}' http://localhost:5555/api/task/async-apply/tasks.indeed_scrap_api_V1
 75 | 
 76 | ```
 77 | </details>
 78 | 
 79 | 
 80 | ### File structure 
 81 | 
 82 | ``` 
 83 | ├── Dockerfile
 84 | ├── README.md
 85 | ├── api.                  : Celery api (broker, job accepter(flask))
 86 | │   ├── Dockerfile        : Dockerfile build celery api 
 87 | │   ├── app.py            : Flask server accept job request(api)
 88 | │   ├── requirements.txt
 89 | │   └── worker.py         : Celery broker, celery backend(redis)
 90 | ├── celery-queue          : Run main web scrapping jobs (via celery)
 91 | │   ├── Dockerfile        : Dockerfile build celery-queue
 92 | │   ├── IndeedScrapper    : Scrapper scrape Indeed.com 
 93 | │   ├── requirements.txt
 94 | │   └── tasks.py          : Celery run scrapping tasks 
 95 | ├── cron_indeed_scrapping_test.py
 96 | ├── cron_test.py
 97 | ├── docker-compose.yml    : docker-compose build whole system : api, celery-queue, redis, and flower(celery job monitor)
 98 | ├── legacy_project        
 99 | ├── logs                  : Save running logs 
100 | ├── output                : Save scraped data 
101 | ├── requirements.txt
102 | └── travis_push_github.sh : Script auto push output to github via Travis 
103 | 
104 | ```
105 | 
106 | ### Development
107 | 
108 | <details>
109 | <summary>Development</summary>
110 | 
111 | ```bash
112 | # Run Unit test # 1 
113 | $ pytest -v tests/
114 | # ================================== test session starts ==================================
115 | # platform darwin -- Python 3.6.4, pytest-5.0.1, py-1.5.2, pluggy-0.12.0 -- /Users/jerryliu/anaconda3/envs/yen_dev/bin/python
116 | # cachedir: .pytest_cache
117 | # rootdir: /Users/jerryliu/web_scraping
118 | # plugins: cov-2.7.1, celery-4.3.0
119 | # collected 10 items                                                                      
120 | # tests/unit_test.py::test_get_soup PASSED                                          [ 10%]
121 | # tests/unit_test.py::test_extract_company PASSED                                   [ 20%]
122 | # tests/unit_test.py::test_extract_salary PASSED                                    [ 30%]
123 | # tests/unit_test.py::test_extract_location PASSED                                  [ 40%]
124 | # tests/unit_test.py::test_extract_job_title PASSED                                 [ 50%]
125 | # tests/unit_test.py::test_extract_summary PASSED                                   [ 60%]
126 | # tests/unit_test.py::test_extract_link PASSED                                      [ 70%]
127 | # tests/unit_test.py::test_extract_date PASSED                                      [ 80%]
128 | # tests/unit_test.py::test_extract_fulltext PASSED                                  [ 90%]
129 | # tests/unit_test.py::test_get_full_job_link_ PASSED                                [100%]
130 | 
131 | # Run Unit test # 2 
132 | python tests/unit_test_celery.py  -v
133 | # test_addition (__main__.TestAddTask) ... ok
134 | # test_task_state (__main__.TestAddTask) ... ok
135 | # test_multiplication (__main__.TestMultiplyTask) ... ok
136 | # test_task_state (__main__.TestMultiplyTask) ... ok
137 | # ----------------------------------------------------------------------
138 | # Ran 4 tests in 0.131s
139 | # OK
140 | 
141 | ```
142 | </details>
143 | 
144 | ### Tech
145 | * [Celery](http://docs.celeryproject.org/en/latest/getting-started/first-steps-with-celery.html) : parallel/single thread python tasks management tool (celery broker/worker)
146 | * [Redis](https://redis.io/)  : key-value DB save task data 
147 | * [Flower](https://flower.readthedocs.io/en/latest/) : UI monitor celery tasks 
148 | * [Flask](http://flask.palletsprojects.com/en/1.1.x/) : python light web framework, as project backend server here  
149 | * [Docker](https://www.docker.com/get-started) : build the app environment 
150 | 
151 | 
152 | ### Todo 
153 | <details>
154 | <summary>TODO</summary>
155 | 
156 | ```
157 | ### Project level
158 | 
159 | 0. Deploy to Heroku cloud and make the scrapper as an API service 
160 | 1. Dockerize the project 
161 | 2. Run the scrapping (cron/paralel)jobs via Celery 
162 | 4. Add test (unit/integration test) 
163 | 5. Design DB model that save scrapping data systematically 
164 | 
165 | ### Programming level 
166 | 
167 | 1. Add utility scripts that can get XPATH of all objects in html
168 | 2. Workflow that automate whole processes
169 | 3. Job management 
170 | 	- Multiprocessing
171 | 	- Asynchronous
172 | 	- Queue 
173 | 4. Scrapping tutorial 
174 | 5. Scrapy, Phantomjs 
175 | 
176 | ### Others 
177 | 
178 | 1. Web scrapping 101 tutorial 
179 | 
180 | ```
181 | </details>
182 | 
183 | ### Ref 
184 | <details>
185 | <summary>Ref</summary>
186 | 
187 | - Scraping via Celery
188 | 	- https://www.pythoncircle.com/post/518/scraping-10000-tweets-in-60-seconds-using-celery-rabbitmq-and-docker-cluster-with-rotating-proxy/
189 | 	- http://allynh.com/blog/flask-asynchronous-background-tasks-with-celery-and-redis/
190 | 
191 | - Travis push to github 
192 | 	- https://stackoverflow.com/questions/51925941/travis-ci-how-to-push-to-master-branch
193 | 	- https://medium.com/@preslavrachev/using-travis-for-secure-building-and-deployment-to-github-5a97afcac113
194 | 	- https://gist.github.com/willprice/e07efd73fb7f13f917ea
195 | 	- https://www.vinaygopinath.me/blog/tech/commit-to-master-branch-on-github-using-travis-ci/
196 | 	- https://www.hidennis.tech/2015/07/07/deploy-blog-using-travis/
197 | 
198 | - Indeed scrapping 
199 | 	- https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b
200 | 	- https://github.com/tarunsinghal92/indeedscrapperlatest
201 | 
202 | - Distributed scrapping
203 | 	- https://github.com/tikazyq/crawlab
204 | 
205 | - Unit test Celery
206 | 	- https://docs.celeryproject.org/en/latest/userguide/testing.html
207 | </details>


--------------------------------------------------------------------------------
/api/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | .dockerignore
3 | 


--------------------------------------------------------------------------------
/api/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.6-alpine
 2 | 
 3 | ENV PYTHONPATH /app 
 4 | ENV CELERY_BROKER_URL redis://redis:6379/0
 5 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0
 6 | ENV C_FORCE_ROOT true
 7 | 
 8 | ENV HOST 0.0.0.0
 9 | ENV PORT 5001
10 | ENV DEBUG true
11 | 
12 | COPY . /api
13 | WORKDIR /api
14 | 
15 | # install requirements
16 | RUN pip install -r requirements.txt
17 | 
18 | # expose the app port
19 | EXPOSE 5001
20 | 
21 | RUN pip install gunicorn
22 | 
23 | # run the app server
24 | CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--workers", "3", "app:app"]


--------------------------------------------------------------------------------
/api/app.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, url_for
 2 | import celery.states as states
 3 | import sys 
 4 | sys.path.append("..")
 5 | # udf 
 6 | from worker import celery
 7 | 
 8 | app = Flask(__name__)
 9 | 
10 | @app.route('/scrap_task')
11 | def run_github_scrape():
12 |     task = celery.send_task('tasks.scrap_task',kwargs={})
13 |     response = f"<a href='{url_for('check_task', task_id=task.id, external=True)}'>check status of {task.id} </a>"
14 |     return response
15 | 
16 | @app.route('/scrap_task_api/<string:account>/<string:repo_name>')
17 | def run_github_scrape_api():
18 |     task = celery.send_task('tasks.scrape_github_api',kwargs={})
19 |     response = f"<a href='{url_for('check_task', task_id=task.id, external=True)}'>check status of {task.id} </a>"
20 |     return response
21 | 
22 | @app.route('/indeed_scrap_task')
23 | def run_indeed_scrape():
24 |     task = celery.send_task('tasks.indeed_scrap_task',kwargs={})
25 |     response = f"<a href='{url_for('check_task', task_id=task.id, external=True)}'>check status of {task.id} </a>"
26 |     return response
27 | 
28 | @app.route('/indeed_scrap_api_V1/<string:city_set>')
29 | def run_indeed_scrape_api(city_set: str):
30 |     print ('city_set :', city_set)
31 |     task = celery.send_task('tasks.indeed_scrap_api_V1',city_set,kwargs={})
32 |     response = f"<a href='{url_for('check_task', task_id=task.id, external=True)}'>check status of {task.id} </a>"
33 |     return response
34 | 
35 | @app.route('/check/<string:task_id>')
36 | def check_task(task_id: str) -> str:
37 |     res = celery.AsyncResult(task_id)
38 |     if res.state == states.PENDING:
39 |         return res.state
40 |     else:
41 |         return str(res.result)


--------------------------------------------------------------------------------
/api/requirements.txt:
--------------------------------------------------------------------------------
 1 | amqp==2.2.2
 2 | Babel==2.9.1
 3 | billiard==3.5.0.3
 4 | celery==5.2.2
 5 | click==6.7
 6 | Flask==2.3.2
 7 | itsdangerous==0.24
 8 | Jinja2>=2.10.1
 9 | kombu==4.2.0
10 | MarkupSafe==1.0
11 | pytz==2018.3
12 | redis==4.4.4
13 | tornado==5.0.2
14 | vine==1.1.4
15 | Werkzeug>=0.15.3
16 | beautifulsoup4


--------------------------------------------------------------------------------
/api/worker.py:
--------------------------------------------------------------------------------
1 | import os
2 | from celery import Celery
3 | import sys 
4 | sys.path.append("..")
5 | 
6 | CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379'),
7 | CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379')
8 | 
9 | celery = Celery('tasks', broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND)


--------------------------------------------------------------------------------
/celery_queue/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | .dockerignore
3 | 


--------------------------------------------------------------------------------
/celery_queue/Dockerfile:
--------------------------------------------------------------------------------
 1 | #FROM python:3.6-alpine
 2 | FROM python:3.6-slim
 3 | #FROM continuumio/miniconda3
 4 | 
 5 | ENV PYTHONPATH /queue 
 6 | ENV CELERY_BROKER_URL redis://redis:6379/0
 7 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0
 8 | ENV C_FORCE_ROOT true
 9 | COPY . /queue
10 | WORKDIR /queue
11 | RUN mkdir -p output logs && pip install -r requirements.txt
12 | ENTRYPOINT celery -A tasks worker --loglevel=info


--------------------------------------------------------------------------------
/celery_queue/IndeedScrapper/README.md:
--------------------------------------------------------------------------------
1 | - Modify from https://github.com/tarunsinghal92/indeedscrapperlatest


--------------------------------------------------------------------------------
/celery_queue/IndeedScrapper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/celery_queue/IndeedScrapper/__init__.py


--------------------------------------------------------------------------------
/celery_queue/IndeedScrapper/indeed_extract.py:
--------------------------------------------------------------------------------
  1 | import bs4
  2 | from bs4 import BeautifulSoup
  3 | 
  4 | # get soup object
  5 | def get_soup(text):
  6 | 	return BeautifulSoup(text, "lxml", from_encoding="utf-8")
  7 | 
  8 | 
  9 | # extract company
 10 | def extract_company(div): 
 11 |     company = div.find_all(name="span", attrs={"class":"company"})
 12 |     if len(company) > 0:
 13 |         for b in company:
 14 |             return (b.text.strip())
 15 |     else:
 16 |         sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
 17 |         for span in sec_try:
 18 |             return (span.text.strip())
 19 |     return 'NOT_FOUND'
 20 | 
 21 | 
 22 | # extract job salary
 23 | def extract_salary(div): 
 24 |     try:
 25 |         return (div.find('nobr').text)
 26 |     except:
 27 |         try:
 28 |             div_two = div.find(name='div', attrs={'class':'sjcl'})
 29 |             div_three = div_two.find('div')
 30 |             salaries.append(div_three.text.strip())
 31 |         except:
 32 |             return ('NOT_FOUND')
 33 |     return 'NOT_FOUND'
 34 | 
 35 | 
 36 | # extract job location
 37 | def extract_location(div):
 38 |     for span in div.findAll('span', attrs={'class': 'location'}):
 39 |         return (span.text)
 40 |     return 'NOT_FOUND'
 41 | 
 42 | 
 43 | # extract job title
 44 | def extract_job_title(div):
 45 |     for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
 46 |         return (a['title'])
 47 |     return('NOT_FOUND')
 48 | 
 49 | 
 50 | # extract jd summary
 51 | def extract_summary(div): 
 52 |     spans = div.findAll('span', attrs={'class': 'summary'})
 53 |     for span in spans:
 54 |         return (span.text.strip())
 55 |     return 'NOT_FOUND'
 56 |  
 57 | 
 58 | # extract link of job description 
 59 | def extract_link(div, city=None): 
 60 |     for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
 61 |         #return (a['href'])
 62 |         return get_full_job_link(a['href'], city)
 63 |     return('NOT_FOUND')
 64 | 
 65 | 
 66 | # extract date of job when it was posted
 67 | def extract_date(div):
 68 |     try:
 69 |         spans = div.findAll('span', attrs={'class': 'date'})
 70 |         for span in spans:
 71 |             return (span.text.strip())
 72 |     except:
 73 |         return 'NOT_FOUND'
 74 |     return 'NOT_FOUND'
 75 | 
 76 | 
 77 | # extract full job description from link
 78 | def extract_fulltext(url):
 79 |     try:
 80 |         page = requests.get('http://www.indeed.com' + url)
 81 |         soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
 82 |         spans = soup.findAll('span', attrs={'class': 'summary'})
 83 |         for span in spans:
 84 |             return (span.text.strip())
 85 |     except:
 86 |         return 'NOT_FOUND'
 87 |     return 'NOT_FOUND'
 88 | 
 89 | 
 90 | # write logs to file
 91 | def write_logs(text):
 92 |     # print(text + '\n')
 93 |     try:
 94 |         f = open('logs/log.txt','a')
 95 |     except Exception as e:
 96 |         print (str(e), "logs directory not exists, save at current url instead")
 97 |         f = open('log.txt', 'a')
 98 |     f.write(text + '\n')  
 99 |     f.close()
100 | 
101 | 
102 | # get full job link with country code 
103 | def get_full_job_link(link, city):
104 |     
105 |     if city=="Singapore":
106 |         return "https://www.indeed.com.sg/" + link
107 | 
108 |     elif city =="Tokyo":
109 |         return "https://jp.indeed.com/" + link
110 |         
111 |     else:
112 |         return "https://www.indeed.com" + link 


--------------------------------------------------------------------------------
/celery_queue/IndeedScrapper/indeed_scrapper.py:
--------------------------------------------------------------------------------
  1 | # import packages
  2 | import requests
  3 | import pandas as pd
  4 | import time 
  5 | import datetime
  6 | from IndeedScrapper.indeed_extract import *
  7 | 
  8 | def Scrape_Runner(city_set=['New+York'], job_set=['data+scientist'], max_results_per_city=50, file=1, SKIPPER=0):
  9 | 
 10 |     # current date 
 11 |     current_time, current_date  = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), datetime.datetime.now().strftime('%Y-%m-%d')
 12 | 
 13 |     # loop on all cities
 14 |     for city in city_set:
 15 |         
 16 |         # for each job role
 17 |         for job_qry in job_set:
 18 |             
 19 |             # count
 20 |             cnt = 0
 21 |             startTime = time.time()
 22 | 
 23 |             # skipper
 24 |             if(file > SKIPPER):
 25 |             
 26 |                 # dataframe
 27 |                 df = pd.DataFrame(columns = ['unique_id', 'city', 'job_qry','job_title', 'company_name', 'location', 'summary', 'salary', 'link', 'date', 'full_text'])
 28 |             
 29 |                 # for results
 30 |                 for start in range(0, max_results_per_city, 10):
 31 | 
 32 |                     # get dom 
 33 |                     page = requests.get('http://www.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start))
 34 | 
 35 |                     #ensuring at least 1 second between page grabs                    
 36 |                     time.sleep(1)  
 37 | 
 38 |                     #fetch data
 39 |                     soup = get_soup(page.text)
 40 |                     divs = soup.find_all(name="div", attrs={"class":"row"})
 41 |                     
 42 |                     # if results exist
 43 |                     if(len(divs) == 0):
 44 |                         break
 45 | 
 46 |                     # for all jobs on a page
 47 |                     for div in divs: 
 48 | 
 49 |                         #specifying row num for index of job posting in dataframe
 50 |                         num = (len(df) + 1) 
 51 |                         cnt = cnt + 1
 52 | 
 53 |                         #job data after parsing
 54 |                         job_post = [] 
 55 | 
 56 |                         #append unique id
 57 |                         job_post.append(div['id'])
 58 | 
 59 |                         #append city name
 60 |                         job_post.append(city)
 61 | 
 62 |                         #append job qry
 63 |                         job_post.append(job_qry)
 64 | 
 65 |                         #grabbing job title
 66 |                         job_post.append(extract_job_title(div))
 67 | 
 68 |                         #grabbing company
 69 |                         job_post.append(extract_company(div))
 70 | 
 71 |                         #grabbing location name
 72 |                         job_post.append(extract_location(div))
 73 | 
 74 |                         #grabbing summary text
 75 |                         job_post.append(extract_summary(div))
 76 | 
 77 |                         #grabbing salary
 78 |                         job_post.append(extract_salary(div))
 79 | 
 80 |                         #grabbing link
 81 |                         link = extract_link(div)
 82 |                         job_post.append(link)
 83 | 
 84 |                         #grabbing date
 85 |                         job_post.append(extract_date(div))
 86 | 
 87 |                         #grabbing full_text
 88 |                         job_post.append(extract_fulltext(link))
 89 | 
 90 |                         #appending list of job post info to dataframe at index num
 91 |                         df.loc[num] = job_post
 92 |                         
 93 |                     #debug add
 94 |                     write_logs(('Completed =>') + '\t' + city  + '\t' + job_qry + '\t' + str(cnt) + '\t' + str(start) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + '  ' + str(current_time))
 95 | 
 96 |                 #saving df as a local csv file 
 97 |                 try:
 98 |                     df.to_csv('output/{}_jobs_'.format(current_date) + str(file) + '.csv', encoding='utf-8')
 99 |                 except Exception as e:
100 |                     print (str(e), "outout not exists, save at current url instead")
101 |                     df.to_csv('{}_jobs_'.format(current_date) + str(file) + '.csv', encoding='utf-8')
102 |                 print (df.head(3))
103 |                 print ("len(df)", len(df))
104 |             
105 |             else:
106 | 
107 |                 #debug add
108 |                 write_logs(('Skipped =>') + '\t' + city  + '\t' + job_qry + '\t' + str(-1) + '\t' + str(-1) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + '  ' + str(current_time))
109 |             
110 |             # increment file
111 |             file = file + 1
112 | 
113 | 


--------------------------------------------------------------------------------
/celery_queue/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/celery_queue/__init__.py


--------------------------------------------------------------------------------
/celery_queue/log.txt:
--------------------------------------------------------------------------------
 1 | Completed =>	New+York	data+scientist	10	0	2.892832040786743	file_1  2020-02-02-18:29:30
 2 | Completed =>	New+York	data+scientist	20	10	5.695845127105713	file_1  2020-02-02-18:29:30
 3 | Completed =>	New+York	data+scientist	30	20	8.427727222442627	file_1  2020-02-02-18:29:30
 4 | Completed =>	New+York	data+scientist	40	30	11.270271062850952	file_1  2020-02-02-18:29:30
 5 | Completed =>	New+York	data+scientist	50	40	15.064821004867554	file_1  2020-02-02-18:29:30
 6 | Completed =>	+	data+scientist	10	0	3.3448691368103027	file_4  2020-02-02-18:29:35
 7 | Completed =>	+	data+scientist	20	10	6.482846021652222	file_4  2020-02-02-18:29:35
 8 | Completed =>	+	data+scientist	30	20	9.563256978988647	file_4  2020-02-02-18:29:35
 9 | Completed =>	+	data+scientist	40	30	12.740447044372559	file_4  2020-02-02-18:29:35
10 | Completed =>	+	data+scientist	50	40	15.74568486213684	file_4  2020-02-02-18:29:35
11 | 


--------------------------------------------------------------------------------
/celery_queue/requirements.txt:
--------------------------------------------------------------------------------
 1 | amqp==2.2.2
 2 | Babel==2.9.1
 3 | billiard==3.5.0.3
 4 | celery==5.2.2
 5 | flower==1.2.0
 6 | kombu==4.2.0
 7 | pytz==2018.3
 8 | redis==4.4.4
 9 | tornado==5.0.2
10 | vine==1.1.4
11 | beautifulsoup4
12 | requests
13 | pandas
14 | lxml
15 | 


--------------------------------------------------------------------------------
/celery_queue/tasks.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import sys
 4 | from datetime import timedelta
 5 | import urllib.request as request
 6 | from bs4 import BeautifulSoup
 7 | from celery import Celery
 8 | from celery.schedules import crontab
 9 | from celery.task.base import periodic_task
10 | 
11 | CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379'),
12 | CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379')
13 | celery = Celery('tasks', broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND)
14 | 
15 | @celery.task(name="tasks.add")
16 | def add(x, y):
17 |     return x+y
18 | 
19 | @celery.task(name="tasks.multiply")
20 | def multiply(x, y):
21 |     return x*y
22 | 
23 | @periodic_task(run_every=(crontab(minute='*')),name="run_every_minute",ignore_result=True)
24 | def push_heart_beat():
25 |     print ("this is heart beat")
26 |     return "this is heart beat"
27 | 
28 | @celery.task(name='tasks.scrape_task')
29 | def scrape():
30 |     url = 'https://github.com/apache/spark'
31 |     opener=request.build_opener()
32 |     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
33 |     page = opener.open(url)
34 |     soup = BeautifulSoup(page)
35 |     print (soup.text)
36 |     return soup.text
37 | 
38 | @celery.task(name='tasks.scrape_task_api')
39 | def scrape_github_api(account, repo_name):
40 |     url = 'https://github.com/{}/{}'.format(account, repo_name)
41 |     print ("*** url", url)
42 |     opener=request.build_opener()
43 |     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
44 |     page = opener.open(url)
45 |     soup = BeautifulSoup(page)
46 |     print (soup.text)
47 |     return soup.text
48 | 
49 | @celery.task(name='tasks.indeed_scrap_task')
50 | def indeed_scrape():
51 |     sys.path.append(".")
52 |     from IndeedScrapper.indeed_scrapper import Scrape_Runner
53 |     Scrape_Runner()
54 | 
55 | @celery.task(name='tasks.indeed_scrap_api_V1')
56 | def indeed_scrape_api(city_set):
57 |     sys.path.append(".")
58 |     from IndeedScrapper.indeed_scrapper import Scrape_Runner
59 |     Scrape_Runner(city_set)
60 |     


--------------------------------------------------------------------------------
/cron_indeed_scrapping_test.py:
--------------------------------------------------------------------------------
  1 | # import packages
  2 | import requests
  3 | import pandas as pd
  4 | import time 
  5 | import datetime
  6 | import os 
  7 | from celery_queue.IndeedScrapper.indeed_extract import *
  8 | 
  9 | 
 10 | # current date 
 11 | current_time, current_date  = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), datetime.datetime.now().strftime('%Y-%m-%d')
 12 | 
 13 | # limit per sity
 14 | max_results_per_city = 100
 15 | 
 16 | # db of city 
 17 | city_set = ['New+York', 'San+Francisco','Singapore','Tokyo']
 18 | 
 19 | # job roles
 20 | job_set = ['data+engineer', 'machine+learning+engineer', 'data+scientist']
 21 | 
 22 | # output dir 
 23 | output_dir='./output'
 24 | 
 25 | # file num
 26 | file = 1
 27 | 
 28 | # from where to skip
 29 | SKIPPER = 0
 30 | 
 31 | # loop on all cities
 32 | for city in city_set:
 33 |     
 34 |     # for each job role
 35 |     for job_qry in job_set:
 36 |         
 37 |         # count
 38 |         cnt = 0
 39 |         startTime = time.time()
 40 | 
 41 |         # skipper
 42 |         if(file > SKIPPER):
 43 |         
 44 |             # dataframe
 45 |             df = pd.DataFrame(columns = ['unique_id', 'city', 'job_qry','job_title', 'company_name', 'location', 'summary', 'salary', 'link', 'date', 'full_text'])
 46 |         
 47 |             # for results
 48 |             for start in range(0, max_results_per_city, 10):
 49 | 
 50 |                 # get dom 
 51 | 
 52 |                 # hot fix here for Asia city scrapping (will optimize it then)
 53 |                 if city=='Singapore':
 54 |                     page = requests.get('http://www.indeed.com.sg/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start))
 55 | 
 56 |                 elif city=='Tokyo':
 57 |                     page = requests.get('https://jp.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start))
 58 | 
 59 |                 else:
 60 |                     page = requests.get('http://www.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start))
 61 | 
 62 |                 #ensuring at least 1 second between page grabs                    
 63 |                 time.sleep(1)  
 64 | 
 65 |                 #fetch data
 66 |                 soup = get_soup(page.text)
 67 |                 divs = soup.find_all(name="div", attrs={"class":"row"})
 68 |                 
 69 |                 # if results exist
 70 |                 if(len(divs) == 0):
 71 |                     break
 72 | 
 73 |                 # for all jobs on a page
 74 |                 for div in divs: 
 75 | 
 76 |                     #specifying row num for index of job posting in dataframe
 77 |                     num = (len(df) + 1) 
 78 |                     cnt = cnt + 1
 79 | 
 80 |                     #job data after parsing
 81 |                     job_post = [] 
 82 | 
 83 |                     #append unique id
 84 |                     job_post.append(div['id'])
 85 | 
 86 |                     #append city name
 87 |                     job_post.append(city)
 88 | 
 89 |                     #append job qry
 90 |                     job_post.append(job_qry)
 91 | 
 92 |                     #grabbing job title
 93 |                     job_post.append(extract_job_title(div))
 94 | 
 95 |                     #grabbing company
 96 |                     job_post.append(extract_company(div))
 97 | 
 98 |                     #grabbing location name
 99 |                     job_post.append(extract_location(div))
100 | 
101 |                     #grabbing summary text
102 |                     job_post.append(extract_summary(div))
103 | 
104 |                     #grabbing salary
105 |                     job_post.append(extract_salary(div))
106 | 
107 |                     #grabbing link
108 |                     link = extract_link(div, city)
109 |                     job_post.append(link)
110 | 
111 |                     #grabbing date
112 |                     job_post.append(extract_date(div))
113 | 
114 |                     #grabbing full_text
115 |                     job_post.append(extract_fulltext(link))
116 | 
117 |                     #appending list of job post info to dataframe at index num
118 |                     df.loc[num] = job_post
119 |                     
120 |                 #debug add
121 |                 write_logs(('Completed =>') + '\t' + city  + '\t' + job_qry + '\t' + str(cnt) + '\t' + str(start) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + '  ' + str(current_time))
122 | 
123 |             #saving df as a local csv file  
124 |             if not os.path.exists(output_dir):
125 |                 os.mkdir(output_dir)
126 |             df = df.sort_values('date') # sort the df by job post date
127 |             # for QA only
128 |             print (df.head(10))
129 |             df.to_csv('output/{}_jobs_{}_{}'.format(current_date, str(city).replace('+','_'), str(job_qry).replace('+','_'))  + '.csv', encoding='utf-8')
130 |         
131 |         else:
132 | 
133 |             #debug add
134 |             write_logs(('Skipped =>') + '\t' + city  + '\t' + job_qry + '\t' + str(-1) + '\t' + str(-1) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + '  ' + str(current_time))
135 |         
136 |         # increment file
137 |         file = file + 1
138 | 


--------------------------------------------------------------------------------
/cron_test.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | def main():
 4 |     current_time, current_date  = datetime.datetime.now(), datetime.datetime.now().strftime('%Y-%m-%d')
 5 |     print('current time : ', current_time)
 6 |     with open('output/{}.txt'.format('output-'+str(current_date)), "w") as file:
 7 |         file.write('* this is cron test program \n')
 8 |         file.write(str(current_time) + '\n')
 9 |         file.write('hello world')
10 |         file.close()
11 |         print ('write to file OK')
12 | 
13 | if __name__ == '__main__':
14 |     main()


--------------------------------------------------------------------------------
/dev/104/code1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | import requests
 4 | from bs4 import BeautifulSoup
 5 | from fake_useragent import UserAgent
 6 | 
 7 | # 初始化 fake_useragent
 8 | ua = UserAgent(platforms='pc')
 9 | 
10 | # 設定 base_url 和查詢參數
11 | base_url = "https://www.104.com.tw/jobs/search/"
12 | params = {
13 |     'keyword': 'python',
14 |     'page': 1
15 | }
16 | 
17 | # 用來儲存所有工作的 URL
18 | job_urls = []
19 | 
20 | # 爬取前 150 頁
21 | for page in range(1, 151):
22 |     print(f"正在抓取第 {page} 頁...")
23 |     params['page'] = page
24 |     
25 |     # 建立隨機的 User-Agent
26 |     headers = {
27 |         'User-Agent': ua.random
28 |     }
29 |     
30 |     # 發送 GET 請求
31 |     response = requests.get(base_url, headers=headers, params=params)
32 |     soup = BeautifulSoup(response.text, 'lxml')
33 |     
34 |     # 找到所有的工作列表項目
35 |     job_items = soup.find_all('article', class_='js-job-item')
36 |     
37 |     # For Loop 每個工作項目，提取工作 URL
38 |     for job in job_items:
39 |         job_link = job.find('a', class_='js-job-link')
40 |         if job_link:
41 |             job_url = job_link['href']
42 |             # 104 的 URL 需要補全
43 |             full_job_url = "https:" + job_url
44 |             job_urls.append(full_job_url)
45 |     
46 |     # 隨機等待 5 到 10 秒
47 |     sleep_time = random.uniform(5, 10)
48 |     print(f"等待 {sleep_time:.2f} 秒...")
49 |     time.sleep(sleep_time)
50 | 


--------------------------------------------------------------------------------
/dev/104/code2.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pandas as pd
 3 | import random
 4 | import time
 5 | from fake_useragent import UserAgent
 6 | 
 7 | # 將 JSON 資料轉換為結構化字典
 8 | def convert_job_data(original_dict):
 9 |     data = original_dict['data']
10 | 
11 |     # 將 jobType 轉換為描述文字
12 |     job_type_mapping = {
13 |         0: '全部',
14 |         1: '全職',
15 |         2: '兼職',
16 |         3: '高薪',
17 |         4: '派遣'
18 |     }
19 |     
20 |     # 將 remoteWork 轉換為描述文字
21 |     remote_work_mapping = {
22 |         1: '完全遠端',
23 |         2: '部分遠端'
24 |     }
25 |     
26 |     # 建立包含工作資訊的字典
27 |     job_info = {
28 |         '職缺名稱': data['header']['jobName'],
29 |         '公司名稱': data['header']['custName'],
30 |         '公司網址': data['header']['custUrl'],
31 |         '發佈日期': data['header']['appearDate'],
32 |         '職缺分析網址': 'https:' + data['header']['analysisUrl'],
33 |         '上班地區': data['jobDetail']['addressRegion'],
34 |         '上班地點': data['jobDetail']['addressDetail'],
35 |         '工作待遇': data['jobDetail']['salary'],
36 |         '最低薪資': data['jobDetail']['salaryMin'],
37 |         '最高薪資': data['jobDetail']['salaryMax'],
38 |         '工作性質': job_type_mapping.get(data['jobDetail']['jobType'], '未知'),
39 |         '上班時段': data['jobDetail']['workPeriod'],
40 |         '假期政策': data['jobDetail']['vacationPolicy'],
41 |         '工作經歷': data['condition']['workExp'],
42 |         '學歷要求': data['condition']['edu'],
43 |         '擅長工具': [specialty['description'] for specialty in data['condition']['specialty']],
44 |         '工作技能': [skill['description'] for skill in data['condition']['skill']],
45 |         '產業類別': data['industry'],
46 |         '職務類別': [category['description'] for category in data.get('jobDetail', {}).get('jobCategory', [])],
47 |         '出差外派': data['jobDetail']['businessTrip'],
48 |         '遠端工作': remote_work_mapping.get((data['jobDetail'].get('remoteWork') or {}).get('type', 0), '無'),
49 |         '公司人數': '' if data.get('employees') == '暫不提供' else data.get('employees', '').replace('人', ''),
50 |         '管理責任': data['jobDetail']['manageResp']
51 |     }
52 |     return job_info
53 | 
54 | # 單獨抓取某一職缺的詳細資料
55 | def fetch_job_detail(job_id):
56 |     
57 |     try:
58 |         ua = UserAgent(platforms='pc')
59 |         
60 |         url = f'https://www.104.com.tw/job/ajax/content/{job_id}'
61 |         headers = {
62 |             'User-Agent': ua.random,
63 |             'Referer': f'https://www.104.com.tw/job/{job_id}'
64 |         }
65 | 
66 |         response = requests.get(url, headers=headers)
67 |         response.raise_for_status()  # 檢查 HTTP 回應狀態
68 | 
69 |         data = response.json()
70 |         job_info = convert_job_data(data)
71 |         job_info['連結'] = f'https://www.104.com.tw/job/{job_id}'
72 |         
73 |         return job_info
74 | 
75 |     except Exception as e:
76 |         print(f"處理職缺 {job_id} 時出錯: {e}")
77 |         return None
78 | 


--------------------------------------------------------------------------------
/dev/104/code3.py:
--------------------------------------------------------------------------------
 1 | # 取得所有職缺詳細信息並存入 DataFrame
 2 | def fetch_all_job_details(job_urls):
 3 |     
 4 |     job_details = []
 5 | 
 6 |     for index, original_url in enumerate(job_urls):
 7 |         job_id = original_url.split('/job/')[1].split('?')[0]
 8 |         job_info = fetch_job_detail(job_id)
 9 | 
10 |         if job_info:
11 |             job_details.append(job_info)
12 |             print(f"已完成 {index + 1} / {len(job_urls)} : {job_info['職缺名稱']}")
13 | 
14 |         sleep_time = random.uniform(3, 8)
15 |         time.sleep(sleep_time)
16 |     
17 |     df = pd.DataFrame(job_details)
18 |     return df
19 | 
20 | # 取得職缺詳細信息並存入 DataFrame
21 | df = fetch_all_job_details(job_urls)
22 | df.to_excel('104_jobs.xlsx')
23 | 


--------------------------------------------------------------------------------
/dev/104/index.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/doc/pic/architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/doc/pic/architecture.jpg


--------------------------------------------------------------------------------
/doc/pic/celery.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/doc/pic/celery.jpg


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | services:
 3 |   web:
 4 |     build:
 5 |       context: ./api
 6 |       dockerfile: Dockerfile
 7 |     restart: always
 8 |     ports:
 9 |      - "5001:5001"
10 |     depends_on:
11 |       - redis
12 |   worker:
13 |     build:
14 |       context: ./celery_queue
15 |       dockerfile: Dockerfile
16 |     depends_on:
17 |       - redis
18 |   monitor:
19 |     build:
20 |       context: ./celery_queue
21 |       dockerfile: Dockerfile
22 |     ports:
23 |      - "5555:5555"
24 |     entrypoint: flower
25 |     command:  -A tasks --port=5555 --broker=redis://redis:6379/0
26 |     depends_on:
27 |       - redis
28 |   redis:
29 |     image: redis
30 |     ports:
31 |      - "6379:6379"
32 |   mongodb:
33 |     image: mongo:latest
34 |     ports:
35 |      - "27017:27017"
36 |     container_name: "mongodb"
37 |     environment:
38 |      - MONGO_DATA_DIR=/data/db
39 |      - MONGO_LOG_DIR=/dev/null
40 |      - MONGODB_USER="mongo"
41 |      - MONGODB_PASS="password" 
42 |     volumes:
43 |     - ./data/db:/data/db
44 |     #command: mongod --smallfiles --logpath=/dev/null # --quiet


--------------------------------------------------------------------------------
/legacy_project/archived/bank_swiftcode/grab_bank_list.py:
--------------------------------------------------------------------------------
 1 | # python 3 
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd 
 4 | import urllib 
 5 | # help function 
 6 | def parse_swift_code(swift_url):
 7 |     try:
 8 |         opener=urllib.request.build_opener()
 9 |         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
10 |         page = opener.open(swift_url)
11 |         soup = BeautifulSoup(page,"html.parser")
12 |         for k,j in enumerate(soup.find_all('a',{'href': True})):
13 |             if k == 7:
14 |                 print (k,j.text)
15 |                 return j.text
16 |             else:
17 |                 pass
18 |         #return j.text
19 |     except:
20 |         return None
21 | 
22 | def clean_df(df):
23 | 	# drop any col, row with null value 
24 | 	#df_ = df.dropna()
25 | 	df_ = df_[(df_.bank_name != 'SWIFT Code Databse') | 
26 | 	          (df_.bank_name != 'Countries List') |
27 | 	          (df_.bank_name != 'Home') |
28 | 	          (df_.bank_name != 'Next') |
29 | 	          (df_.bank_name != 'Last') |
30 | 	          (df_.bank_name != 'Privacy policy') |
31 | 	          (df_.bank_name != 'DMCA Policy') |
32 | 	          (df_.bank_name != 'Contact Us') ]
33 | 	return df_ 
34 | 
35 | def main_():
36 | 
37 | 	#url="http://www.swiftcodelist.com/banks/united-kingdom-1.html"
38 | 
39 | 	output = [[] for k in range(3)]
40 | 
41 | 	for x in range(1,43):
42 | 		url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x)
43 | 		print (url)
44 | 
45 | 		opener=urllib.request.build_opener()
46 | 		opener.addheaders = [('User-agent', 'Mozilla/5.0')]
47 | 		page = opener.open(url)
48 | 		soup = BeautifulSoup(page,"html.parser")
49 | 		anchors = soup.find_all('a', {'href': True})
50 | 
51 | 		for k in anchors:
52 | 
53 | 			if len(k.text) < 3:
54 | 				print (k.text)
55 | 				output[0].append(None)
56 | 				print (k['href'])
57 | 				output[1].append(k['href'])
58 | 				output[2].append(None)	        
59 | 			else:
60 | 				output[0].append(k.text)
61 | 				print (k.text)
62 | 				output[1].append(k['href'])
63 | 				print (k['href'])
64 | 				output[2].append(parse_swift_code(k['href']))
65 | 
66 | 	df_ = pd.DataFrame(output).T
67 | 	cols=['bank_name','url','swift_code']
68 | 	df_.columns = [cols]
69 | 	print (df_)
70 | 	#df_ =clean_df(df_)
71 | 	df_.to_csv('UK_bank_swift_code_list.csv')
72 | 
73 | if __name__ == '__main__':
74 | 	main_()


--------------------------------------------------------------------------------
/legacy_project/archived/bank_swiftcode/grab_bank_list_muitiprocess.py:
--------------------------------------------------------------------------------
  1 | # python 3 
  2 | # credit 
  3 | # https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/2-add/
  4 | # https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/3-queue/
  5 | 
  6 | from bs4 import BeautifulSoup
  7 | import pandas as pd 
  8 | import urllib 
  9 | # multiprocessing
 10 | import multiprocessing as mp
 11 | import sys
 12 | 
 13 | sys.setrecursionlimit(10000) # 10000 is an example, try with different values
 14 | 
 15 | # help function 
 16 | def parse_swift_code(swift_url):
 17 |     try:
 18 |         opener=urllib.request.build_opener()
 19 |         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 20 |         page = opener.open(swift_url)
 21 |         soup = BeautifulSoup(page,"html.parser")
 22 |         # need to fix here ( find -> find_all)
 23 |         for k,j in enumerate(soup.find('a',{'href': True})):
 24 |             if k == 7:
 25 |                 print (k,j.text)
 26 |                 return j.text
 27 |             else:
 28 |                 pass
 29 |         #return j.text
 30 |     except:
 31 |         return None
 32 | 
 33 | # main scrape function 
 34 | # url list 
 35 | url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html"
 36 | url_ = [url.format(x) for x in range(1,2)]
 37 | 
 38 | def crawl(url):
 39 | 	print ('-------------')
 40 | 	print (url)
 41 | 	print ('-------------')
 42 | 	#url="http://www.swiftcodelist.com/banks/united-kingdom-1.html"
 43 | 	#output = [[] for k in range(3)]
 44 | 	#for x in range(1,43):
 45 | 	#	url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x)
 46 | 	#	print (url)
 47 | 	opener=urllib.request.build_opener()
 48 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 49 | 	page = opener.open(url)
 50 | 	soup = BeautifulSoup(page,"html.parser")
 51 | 	# need to fix here ( find -> find_all)
 52 | 	anchors = soup.find('a', {'href': True})
 53 | 	return anchors
 54 | 
 55 | 
 56 | def parse(anchors):
 57 | 	for k in anchors:
 58 | 		if len(k.text) < 3:
 59 | 			print (k.text)
 60 | 			#output[0].append(None)
 61 | 			print (k['href'])
 62 | 			#output[1].append(k['href'])
 63 | 			#output[2].append(None)	        
 64 | 		else:
 65 | 			#output[0].append(k.text)
 66 | 			print (k.text)
 67 | 			#output[1].append(k['href'])
 68 | 			print (k['href'])
 69 | 			#output[2].append(parse_swift_code(k['href']))
 70 | 
 71 | 
 72 | 
 73 | def main_(url):
 74 | 	print ('-------------')
 75 | 	print (url)
 76 | 	print ('-------------')
 77 | 	#url="http://www.swiftcodelist.com/banks/united-kingdom-1.html"
 78 | 	output = [[] for k in range(3)]
 79 | 	#for x in range(1,43):
 80 | 	#	url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x)
 81 | 	#	print (url)
 82 | 	opener=urllib.request.build_opener()
 83 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 84 | 	page = opener.open(url)
 85 | 	soup = BeautifulSoup(page,"html.parser")
 86 | 	# need to fix here ( find -> find_all)
 87 | 	anchors = soup.find('a', {'href': True})
 88 | 
 89 | 	for k in anchors:
 90 | 
 91 | 		if len(k.text) < 3:
 92 | 			print (k.text)
 93 | 			output[0].append(None)
 94 | 			print (k['href'])
 95 | 			output[1].append(k['href'])
 96 | 			output[2].append(None)	        
 97 | 		else:
 98 | 			output[0].append(k.text)
 99 | 			print (k.text)
100 | 			output[1].append(k['href'])
101 | 			print (k['href'])
102 | 			output[2].append(parse_swift_code(k['href']))
103 | 
104 | 	df_ = pd.DataFrame(output).T
105 | 	cols=['bank_name','url','swift_code']
106 | 	df_.columns = [cols]
107 | 	print (df_)
108 | 	return df_ 
109 | 
110 | # parse job
111 | def multi_scrap():
112 | 	#count =0
113 | 	pool = mp.Pool(2)
114 | 	while True:
115 | 		# htmls = [crawl(url) for url in unseen]
116 | 		# --->
117 | 		crawl_jobs = [pool.apply_async(main_, args=(url,)) for url in url_]
118 | 		output = [j.get() for j in crawl_jobs]
119 | 		print (output)
120 | 		# results = [parse(html) for html in htmls]
121 | 		# --->
122 | 		#parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
123 | 		#results = [j.get() for j in parse_jobs]
124 | 
125 | 
126 | def multi_scrap_():
127 | 	#count =0
128 | 	pool = mp.Pool(2)
129 | 	while True:
130 | 		# htmls = [crawl(url) for url in unseen]
131 | 		# --->
132 | 		crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in url_]
133 | 		data = [j.get() for j in crawl_jobs]
134 | 		print (data)
135 | 		#results = [parse(html) for html in htmls]
136 | 		# --->
137 | 		parse_jobs = [pool.apply_async(parse, args=(data,)) for a in data]
138 | 		results = [j.get() for j in parse_jobs]
139 | 
140 | if __name__ == '__main__':
141 | 	multi_scrap_()


--------------------------------------------------------------------------------
/legacy_project/archived/booking/bookingcom_scrap.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | from selenium.webdriver.common.by import By
 3 | from selenium.webdriver.common.keys import Keys
 4 | from selenium.webdriver.support.ui import Select
 5 | from selenium.common.exceptions import NoSuchElementException
 6 | from selenium.common.exceptions import NoAlertPresentException
 7 | # https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path
 8 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 9 | import unittest, time, re
10 | from bs4 import BeautifulSoup
11 | import argparse
12 | 
13 | # parse parameter from command line to python 
14 | # https://docs.python.org/3/howto/argparse.html
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("echo")
17 | args = parser.parse_args()
18 | print(args.echo)
19 | print ('===========')
20 | # open firefox as browser
21 | browser = webdriver.Firefox()
22 | # set up site url 
23 | #base_url="https://www.booking.com/searchresults.zh-tw.html?aid=304142&label=gen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw&sid=fc93df7eb22345d0203784b4d254c349&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.zh-tw.html%3Faid%3D304142%3Blabel%3Dgen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw%3Bsid%3Dfc93df7eb22345d0203784b4d254c349%3Bcheckin_month%3D6%3Bcheckin_monthday%3D16%3Bcheckin_year%3D2017%3Bcheckout_month%3D6%3Bcheckout_monthday%3D17%3Bcheckout_year%3D2017%3Bclass_interval%3D1%3Bdest_id%3D17%3Bdest_type%3Dairport%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Blabel_click%3Dundef%3Bmap%3D1%3Bmih%3D0%3Bno_rooms%3D1%3Boffset%3D33%3Braw_dest_type%3Dairport%3Broom1%3DA%252CA%3Brows%3D33%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bss%3D%25E9%25A6%2599%25E6%25B8%25AF%25E8%25B5%25A4%25E9%25B1%25B2%25E8%25A7%2592%25E5%259C%258B%25E9%259A%259B%25E6%25A9%259F%25E5%25A0%25B4%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%3Bss_raw%3Dhk%3Bssb%3Dempty%26%3B&ss=NYC&ssne=%E8%B5%A4%E9%B1%B2%E8%A7%92&ssne_untouched=%E8%B5%A4%E9%B1%B2%E8%A7%92&checkin_year=2017&checkin_month=6&checkin_monthday=16&checkout_year=2017&checkout_month=6&checkout_monthday=17&room1=A%2CA&group_adults=2&group_children=0&no_rooms=1&highlighted_hotels=&dest_id=&dest_type=&search_pageview_id=024242b9a80c0437&search_selected=false"
24 | base_url="https://www.booking.com/searchresults.zh-tw.html?aid=304142&label=gen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw&sid=fc93df7eb22345d0203784b4d254c349&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.zh-tw.html%3Faid%3D304142%3Blabel%3Dgen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw%3Bsid%3Dfc93df7eb22345d0203784b4d254c349%3Bcheckin_month%3D6%3Bcheckin_monthday%3D16%3Bcheckin_year%3D2017%3Bcheckout_month%3D6%3Bcheckout_monthday%3D17%3Bcheckout_year%3D2017%3Bclass_interval%3D1%3Bdest_id%3D17%3Bdest_type%3Dairport%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Blabel_click%3Dundef%3Bmap%3D1%3Bmih%3D0%3Bno_rooms%3D1%3Boffset%3D33%3Braw_dest_type%3Dairport%3Broom1%3DA%252CA%3Brows%3D33%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bss%3D%25E9%25A6%2599%25E6%25B8%25AF%25E8%25B5%25A4%25E9%25B1%25B2%25E8%25A7%2592%25E5%259C%258B%25E9%259A%259B%25E6%25A9%259F%25E5%25A0%25B4%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%3Bss_raw%3Dhk%3Bssb%3Dempty%26%3B&ss={}&ssne=%E8%B5%A4%E9%B1%B2%E8%A7%92&ssne_untouched=%E8%B5%A4%E9%B1%B2%E8%A7%92&checkin_year=2017&checkin_month=6&checkin_monthday=16&checkout_year=2017&checkout_month=6&checkout_monthday=17&room1=A%2CA&group_adults=2&group_children=0&no_rooms=1&highlighted_hotels=&dest_id=&dest_type=&search_pageview_id=024242b9a80c0437&search_selected=false"
25 | print (base_url)
26 | base_url = base_url.format(args.echo)
27 | browser.get(base_url)
28 | page = 0    
29 | 
30 | #while len(soup.select('.paging-start')) > 0:
31 | while page < 2:
32 | 	page += 1 
33 | 	try: 
34 | 		#'======== start parse ========'
35 | 		soup = BeautifulSoup(browser.page_source,"html.parser")
36 | 		#for ele in soup.find_all('h3'):
37 | 		for ele in soup.findAll("span", { "class" : "sr-hotel__name" }):
38 | 			print ele.text 
39 | 		# next page 
40 | 		browser.find_element_by_link_text(u"下一頁").click()
41 | 		print 'page =' , page
42 | 		time.sleep(1)
43 | 	except Exception as e:
44 | 		print e, 'something failed'


--------------------------------------------------------------------------------
/legacy_project/archived/booking/next_page_sample.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | from selenium.webdriver.common.by import By
 3 | from selenium.webdriver.common.keys import Keys
 4 | from selenium.webdriver.support.ui import Select
 5 | from selenium.common.exceptions import NoSuchElementException
 6 | from selenium.common.exceptions import NoAlertPresentException
 7 | import unittest, time, re
 8 | 
 9 | class BookingcomNextPage(unittest.TestCase):
10 |     def setUp(self):
11 |         self.driver = webdriver.Firefox()
12 |         self.driver.implicitly_wait(30)
13 |         self.base_url = "https://www.booking.com/"
14 |         self.verificationErrors = []
15 |         self.accept_next_alert = True
16 |     
17 |     def test_bookingcom_next_page(self):
18 |         driver = self.driver
19 |         driver.get(self.base_url + "/index.zh-tw.html?label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AED6AEBkgIBeagCAw;sid=869c62621e3d43712c1fbc29cfed3288;sb_price_type=total&")
20 |         driver.find_element_by_id("ss").click()
21 |         driver.find_element_by_id("ss").clear()
22 |         driver.find_element_by_id("ss").send_keys("hk")
23 |         driver.find_element_by_xpath("//form[@id='frm']/div[2]/div/div/ul/li").click()
24 |         driver.find_element_by_css_selector("button.sb-searchbox__button.").click()
25 |         driver.find_element_by_id("close_map_lightbox").click()
26 |         driver.find_element_by_link_text(u"下一頁").click()
27 |         driver.find_element_by_link_text(u"下一頁").click()
28 |         driver.find_element_by_link_text(u"下一頁").click()
29 |     
30 |     def is_element_present(self, how, what):
31 |         try: self.driver.find_element(by=how, value=what)
32 |         except NoSuchElementException as e: return False
33 |         return True
34 |     
35 |     def is_alert_present(self):
36 |         try: self.driver.switch_to_alert()
37 |         except NoAlertPresentException as e: return False
38 |         return True
39 |     
40 |     def close_alert_and_get_its_text(self):
41 |         try:
42 |             alert = self.driver.switch_to_alert()
43 |             alert_text = alert.text
44 |             if self.accept_next_alert:
45 |                 alert.accept()
46 |             else:
47 |                 alert.dismiss()
48 |             return alert_text
49 |         finally: self.accept_next_alert = True
50 |     
51 |     def tearDown(self):
52 |         self.driver.quit()
53 |         self.assertEqual([], self.verificationErrors)
54 | 
55 | if __name__ == "__main__":
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------
/legacy_project/archived/glassdoor/glassdoor_scrap.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | from selenium.webdriver.common.by import By
 3 | from selenium.webdriver.common.keys import Keys
 4 | from selenium.webdriver.support.ui import Select
 5 | from selenium.common.exceptions import NoSuchElementException
 6 | from selenium.common.exceptions import NoAlertPresentException
 7 | # https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path
 8 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 9 | import unittest, time, re
10 | from bs4 import BeautifulSoup
11 | import argparse
12 | 
13 | # parse parameter from command line to python 
14 | # https://docs.python.org/3/howto/argparse.html
15 | # open firefox as browser
16 | browser = webdriver.Firefox()
17 | # set up site url
18 | #base_url = "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+&sc.keyword=data+&locT=C&locId=2671300&jobType=" 
19 | base_url = "https://www.glassdoor.com/Job/london-data-jobs-SRCH_IL.0,6_IC2671300_KE7,11.htm"
20 | print (base_url)
21 | browser.get(base_url)


--------------------------------------------------------------------------------
/legacy_project/archived/spotify/spotify_album copy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # http://stackoverflow.com/questions/41566971/how-to-get-spotify-artist-id-for-the-spotify-endpoint-url
 3 | # input artist name 
 4 | echo 'plz enter artist name '
 5 | read  varname
 6 | echo  'varname = ' $varname
 7 | 
 8 | # modify usl
 9 | url="https://api.spotify.com/v1/search?q=${varname}&type=artist"
10 | echo $url
11 | 
12 | # query 
13 | API_ARTIST_URL=$(curl -s $url | jq -r '.artists.items[0].href')
14 | 
15 | echo 'API_ARTIST_URL : '   $API_ARTIST_URL
16 | echo 'ALBUM'
17 | echo '=================='
18 | 
19 | # print album
20 | curl -s "$API_ARTIST_URL/top-tracks?country=US" | jq -r '.tracks[].name'


--------------------------------------------------------------------------------
/legacy_project/blu_move/analysis.sql:
--------------------------------------------------------------------------------
  1 | -- 1. Duration period 
  2 | -- get duration period (minute) per booking per car 
  3 | 
  4 | SELECT id,
  5 |        end_reservation,
  6 |        start_reservation,
  7 |        EXTRACT (epoch
  8 |                 FROM (end_reservation - start_reservation))::integer/60 AS duration_min
  9 | FROM
 10 |   (SELECT DISTINCT id,
 11 |                    end_reservation,
 12 |                    start_reservation
 13 |    FROM <table_name>
 14 |    WHERE end_reservation IS NOT NULL
 15 |      AND start_reservation IS NOT NULL ) sub
 16 | ORDER BY id,
 17 |          end_reservation
 18 | 
 19 | 
 20 | -- 2. utilization   (day)
 21 | -- get utilization per day 
 22 | 
 23 | 
 24 | 
 25 | WITH booking AS
 26 |   (SELECT date(start_reservation) AS date,
 27 |           count(DISTINCT id) AS booked_car
 28 |    FROM <table_name>
 29 |    GROUP BY 1),
 30 |      all_ AS
 31 |   (SELECT date(date_of_insert) AS date,
 32 |           count(DISTINCT id) AS all_car
 33 |    FROM <table_name>
 34 |    GROUP BY 1)
 35 | SELECT booking.*,
 36 |        all_.all_car,
 37 |        booking.booked_car::NUMERIC/all_.all_car::NUMERIC AS utilization
 38 | FROM booking
 39 | INNER JOIN all_ ON booking.date = all_.date
 40 | ORDER BY booking.date
 41 | 
 42 | 
 43 | 
 44 | -- 3. utilization   (hour)
 45 | -- get utilization per hour 
 46 | 
 47 | 
 48 | WITH booking AS
 49 |   (SELECT TO_TIMESTAMP(cast(start_reservation AS TEXT),'yyyy-mm-dd HH24') AS date,
 50 |           count(DISTINCT id) AS booked_car
 51 |    FROM <table_name>
 52 |    GROUP BY 1),
 53 |      all_ AS
 54 |   (SELECT TO_TIMESTAMP(cast(date_of_insert AS TEXT),'yyyy-mm-dd HH24') AS date,
 55 |           count(DISTINCT id) AS all_car
 56 |    FROM <table_name>
 57 |    GROUP BY 1)
 58 | SELECT booking.*,
 59 |        all_.all_car,
 60 |        booking.booked_car::NUMERIC/all_.all_car::NUMERIC AS utilization
 61 | FROM booking
 62 | INNER JOIN all_ ON booking.date = all_.date
 63 | ORDER BY booking.date
 64 | 
 65 | 
 66 | 
 67 | -- 4.  utilization  (using hour / 24 hour)
 68 | ### fix the "duration across day problem"
 69 | ### e.g. start : 2017-01-01 23:00, end : 2017-01-02 07:00 
 70 | # hour of using / 24 hour for each car  (V1)
 71 | 
 72 | 
 73 | WITH dates AS
 74 |   ( SELECT DISTINCT generate_series(min(b.start_reservation::date) OVER (PARTITION BY b.id)::TIMESTAMP, now()::date - '1 day'::interval, '1 day'::interval)::date AS date,
 75 |                     b.id,
 76 |                     24 AS capacity_hours,
 77 |                     1 AS capacity_days
 78 |    FROM rw.blue_move b
 79 |    WHERE date(b.start_reservation) >= '2018-01-12' ),
 80 |      get_last_log AS
 81 |   (SELECT b.*,
 82 |           ROW_NUMBER() OVER (PARTITION BY id,
 83 |                                           date(date_of_insert)
 84 |                              ORDER BY date(date_of_insert),
 85 |                                       date_of_insert DESC) AS row_id,
 86 |                             ROW_NUMBER() OVER (PARTITION BY id,
 87 |                                                             start_reservation
 88 |                                                ORDER BY date_of_insert DESC) AS row_id_
 89 |    FROM rw.blue_move b
 90 |    WHERE start_reservation IS NOT NULL
 91 |      AND end_reservation IS NOT NULL )
 92 | SELECT d_1.date,
 93 |        last_log.id,
 94 |        last_log.start_reservation,
 95 |        last_log.end_reservation,
 96 |        CASE
 97 |            WHEN last_log.start_reservation < d_1.date
 98 |                 AND last_log.end_reservation::date > d_1.date THEN 24::double precision
 99 |            WHEN last_log.start_reservation < d_1.date THEN date_part('hour'::text, last_log.end_reservation) + date_part('minute'::text, last_log.end_reservation) / 60::double precision
100 |            WHEN last_log.start_reservation::date = d_1.date
101 |                 AND last_log.end_reservation::date > d_1.date THEN date_part('epoch'::text, d_1.date + '1 day'::interval - last_log.start_reservation) / 3600::double precision
102 |            WHEN last_log.start_reservation::date = d_1.date
103 |                 AND last_log.end_reservation::date = d_1.date THEN date_part('epoch'::text, last_log.end_reservation - last_log.start_reservation) / 3600::double precision
104 |            ELSE 0::double precision
105 |        END AS service_hours
106 | FROM get_last_log last_log
107 | RIGHT JOIN dates d_1 ON (last_log.start_reservation::date <= d_1.date
108 |                          AND last_log.end_reservation::date >= d_1.date
109 |                          OR last_log.start_reservation::date = d_1.date)
110 | AND d_1.id::text = last_log.id::text
111 | WHERE row_id = 1
112 |   AND row_id_ = 1
113 | ORDER BY id,
114 |          start_reservation, date
115 | 


--------------------------------------------------------------------------------
/legacy_project/blu_move/blu_scrape_V1.py:
--------------------------------------------------------------------------------
 1 | # credit : https://ianlondon.github.io/blog/web-scraping-discovering-hidden-apis/
 2 | # scrape in public home page 
 3 | #import library 
 4 | from bs4 import BeautifulSoup
 5 | import urllib, json
 6 | import pandas as pd
 7 | import sys ,re,time
 8 | 
 9 | url="https://app.bluemove.es/api/public/locations/list?cityId=100&accountId=1"
10 | 
11 | def extract_data():
12 | 	pass 
13 | 
14 | 
15 | def main():
16 | 	print (url)
17 | 	opener=urllib.request.build_opener()
18 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
19 | 	page = opener.open(url)
20 | 	soup = BeautifulSoup(page)
21 | 	geo_data =  dict(json.loads(soup.text))
22 | 	geo_data_ = geo_data['data']['locations']
23 | 	print (geo_data_)
24 | 	print ('length of data :',len(geo_data_) )
25 | 
26 | 
27 | 
28 | def main_():
29 | 	print (url)
30 | 	opener=urllib.request.build_opener()
31 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
32 | 	page = opener.open(url)
33 | 	soup = BeautifulSoup(page)
34 | 	geo_data =  dict(json.loads(soup.text))
35 | 	geo_data_ = geo_data['data']['locations']
36 | 	print (geo_data_)
37 | 	print ('length of data :',len(geo_data_) )
38 | 
39 | 	# transfer to  dataframe 
40 | 	output = [[] for k in range(5)]
41 | 	for count in range(len(geo_data['data']['locations'])):
42 | 		scraped_data = geo_data['data']['locations'][count]
43 | 		for k,j in enumerate(scraped_data['Location']['vehicles']):
44 | 		    #print (k)
45 | 			output[0].append(scraped_data['Location']['vehicles'][k]['id'])
46 | 			output[1].append(scraped_data['Location']['vehicles'][k]['gpslat'])
47 | 			output[2].append(scraped_data['Location']['vehicles'][k]['gpslong'])
48 | 			output[3].append(scraped_data['Location']['vehicles'][k]['gps_timestamp'])
49 | 			output[4].append(scraped_data['Location']['vehicles'][k]['status'])
50 | 	df_ = pd.DataFrame(output).T
51 | 	df_.columns = [['id','gpslat','gpslong','gps_timestamp','status']]
52 | 	print (df_) 
53 | 
54 | if __name__ == '__main__':
55 | 	main_()
56 | 


--------------------------------------------------------------------------------
/legacy_project/blu_move/blu_scrape_V1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # scraper V1 on user booking page 
3 | curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&start=2017-12-22+22%3A00%3A00&end=2017-12-22+23%3A00%3A00&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed | jq


--------------------------------------------------------------------------------
/legacy_project/blu_move/blu_scrape_V2.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pandas as pd
 3 | import sys ,re,time
 4 | import os 
 5 | # user defined function 
 6 | from utility_data_IO import * 
 7 | 
 8 | db_url = os.environ['db_url']
 9 | print ('db_url : ' , db_url)
10 | 
11 | def get_json():
12 | 	# read json from shell scraper (blu_scrape_V2.sh)
13 | 	with open("blu_.json") as json_file:
14 | 	    blu_data = json.load(json_file)
15 | 	return blu_data
16 | 
17 | def main_(write_to_db=False):
18 | 	blu_data = get_json()
19 | 	# prepare data, parese needed columns 
20 | 	# for loop
21 | 	output = [[] for k in range(11)]
22 | 
23 | 	for loc_index in range(len(blu_data['data']['locations'])):
24 | 		for k in range(len(blu_data['data']['locations'][loc_index]['Location']['vehicles'])):
25 | 			data_ =blu_data['data']['locations'][loc_index]['Location']['vehicles'][k]['data']
26 | 			print (data_['id'])
27 | 			# car data 
28 | 			# gat car ID, lat, lon, status, gps_timestamp
29 | 			output[0].append(data_['id'])
30 | 			output[1].append(data_['gpslat'])
31 | 			output[2].append(data_['gpslong'])
32 | 			output[3].append(data_['gps_timestamp'])
33 | 			output[4].append(data_['status'])
34 | 			# reservation data 
35 | 			# get reservation : end, end_block, end_reservation, start, start_block, start_reservation 
36 | 			data_reserve = blu_data['data']['locations'][loc_index]['Location']['vehicles'][k]['occupation']['allReservations']
37 | 			if len(data_reserve) == 0:
38 | 				output[5].append(None)
39 | 				output[6].append(None)
40 | 				output[7].append(None)
41 | 				output[8].append(None)
42 | 				output[9].append(None)
43 | 				output[10].append(None)
44 | 
45 | 			else:
46 | 				#pd.to_datetime(data_reserve[0]['end'], format='%d/%m/%y %H:%M:%S')
47 | 				output[5].append(pd.to_datetime(data_reserve[0]['end'], format='%d/%m/%Y %H:%M:%S'))   
48 | 				output[6].append(pd.to_datetime(data_reserve[0]['end_block'], format='%d/%m/%Y %H:%M:%S'))
49 | 				output[7].append(pd.to_datetime(data_reserve[0]['end_reservation'], format='%d/%m/%y %H:%M:%S'))
50 | 				output[8].append(pd.to_datetime(data_reserve[0]['start'], format='%d/%m/%Y %H:%M:%S'))
51 | 				output[9].append(pd.to_datetime(data_reserve[0]['start_block'], format='%d/%m/%Y %H:%M:%S'))
52 | 				output[10].append(pd.to_datetime(data_reserve[0]['start_reservation'], format='%d/%m/%y %H:%M:%S'))
53 | 				#print (data_reserve)
54 | 		        #print ('=====')
55 | 		    	    
56 | 	df_ = pd.DataFrame(output).T
57 | 	cols=['id', 'gpslat', 'gpslong', 'gps_timestamp', 'status', 'end',
58 | 	'end_block', 'end_reservation', 'start', 'start_block',
59 | 	'start_reservation']
60 | 
61 | 	df_.columns = [cols]
62 | 	# hot fix here 
63 | 	#df_ = df_.drop('Unnamed: 0', 1)
64 | 	df_.to_csv('blu_.csv',index=False)
65 | 	#print (df_)
66 | 
67 | 	if write_to_db == True:
68 | 		print("insert to DB....")
69 | 		print ('############')
70 | 		print (df_)
71 | 		print ('############')
72 | 		# hot fix here 
73 | 		df_2 = pd.read_csv('blu_.csv')
74 | 		write_data_to_db(df_2,'blue_move',db_url)
75 | 
76 | 	return df_
77 | 
78 | if __name__ == '__main__':
79 | 	main_(write_to_db = True)
80 | 


--------------------------------------------------------------------------------
/legacy_project/blu_move/blu_scrape_V2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # scraper V2 on user booking page  : remove start & end time parameter, since the output looks the same
3 | # whatever the value of start & end  
4 | #curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed | jq
5 | curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed > blu_.json


--------------------------------------------------------------------------------
/legacy_project/blu_move/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | echo "scraping via shell ...."
3 | bash blu_scrape_V2.sh 
4 | echo "prepare data python ...."
5 | # 2 python script run way in case of different local dev envs
6 | source activate zip_dev && python blu_scrape_V2.py ||  /Users/yennanliu/anaconda3/envs/ds_dash/bin/python blu_scrape_V2.py || python /home/ubuntu/yen_dev/blu_move/blu_scrape_V2.py
7 | #echo "clean file..."


--------------------------------------------------------------------------------
/legacy_project/blu_move/utility_data_IO.py:
--------------------------------------------------------------------------------
 1 | import psycopg2
 2 | from sqlalchemy import create_engine
 3 | from pytz import timezone
 4 | import datetime
 5 | import os
 6 | 
 7 | european = timezone('Europe/Madrid')
 8 | now_tz = datetime.datetime.now(tz = european)
 9 | now = now_tz.replace(tzinfo = None)
10 | now = now.replace(microsecond = 0)
11 | db_url = os.environ['db_url']
12 | print ('db_url : ' , db_url)
13 | 
14 | def write_data_to_db(df, table_name,db_url):
15 |     try:
16 |         # add insert time 
17 |         df["date_of_insert"] = now
18 |         print ('=============')
19 |         print (df.head())
20 |         print (table_name)
21 |         print ('=============')
22 |         engine = create_engine(db_url)
23 |         conn = engine.connect()
24 |         df.to_sql(name= table_name, con= engine, schema= 'rw', if_exists = "append", index = False)
25 |         # close the connection after imput data 
26 |         conn.close()
27 |         print("insert to DB ok")
28 |     except Exception as e:
29 |         print (e)
30 |         print ('fail to write to db')
31 |         


--------------------------------------------------------------------------------
/legacy_project/blu_move/utility_data_preprocess.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd 
 2 | 
 3 | def data_prepare():
 4 | 	df = pd.read_csv('blu.csv')
 5 | 	#df = df 
 6 | 	#print (df.head())
 7 | 	cols = ['start', 'start_block','start_reservation','end','end_block','end_reservation']
 8 | 	for col in cols:
 9 | 		df[col] = pd.to_datetime(df[col])
10 | 		# maybe need to modify time form 
11 | 		# 04/01/2018  17:45:00 -> 2018-01-04 17:45:00 for example 
12 | 		# df.col = df.col.timestrip("%Y-%M-%D hr:mm:ss")
13 | 		# end_date_relative = now.date().strftime("%d/%m/%Y")
14 | 
15 | 	df['reservation_time'] = df['end_reservation'] - df['start_reservation']
16 | 	print (df.head(3))
17 | 	return df 
18 | 	
19 | if __name__ == '__main__':
20 | 	data_prepare()
21 | 


--------------------------------------------------------------------------------
/legacy_project/carandclassic/README.md:
--------------------------------------------------------------------------------
 1 | # carandclassic
 2 | Collect car data at https://www.carandclassic.co.uk/
 3 | 
 4 | ## Tech
 5 | python3, urllib, BeautifulSoup
 6 | 
 7 | ## Demo
 8 | ```bash 
 9 | # demo of cclassic_scrape_V1.py 
10 | $ git clone https://github.com/yennanliu/web_scraping
11 | $ cd web_scraping/carandclassic
12 | $ python cclassic_scrape_V1.py
13 | 
14 | # output 
15 | ['/car/C1018332', '/car/C983211', '/car/C1018314', '/car/C1018313', '/car/C1018311', '/car/C901161', '/car/C305537', '/car/C1018308', '/car/C994875', '/car/C990970', '/car/C1018297', '/car/C1018296', '/car/C1018294', '/car/C998769', '/car/C1009081', '/car/C1018284', '/car/C1018283', '/car/C1018281', '/car/C887797', '/car/C1018280', '/car/C1018279', '/car/C1005573', '/car/C1018274', '/car/C1018272', '/car/C1018269', '/car/C1018268', '/car/C1018266', '/car/C387007', '/car/C1018263', '/car/C1018262', '/car/C1018257', '/car/C1018251', '/car/C1018249', '/car/C1018247', '/car/C1018236', '/car/C1018220']
16 | url_ :  https://www.carandclassic.co.uk/car/C1018332
17 | k_next £3999 As stated
18 | k_next Classic Cars
19 | k_next Austin Healey
20 | k_next Sprite
21 | k_next 1968
22 | k_next UK
23 | k_next 07043 229662
24 | k_next 23-Jul-2018
25 | k_next C1018332
26 | ...
27 |       Price      Category           Make                   Model  Year  \
28 | 0     £3999  Classic Cars  Austin Healey                  Sprite  1968   
29 | 1    £27950  Classic Cars           Audi                 Quattro  1984   
30 | 2     £3495  Classic Cars         Morris                   Minor  1968   
31 | 3     £6500  Classic Cars     Volkswagen                  Beetle  1971   
32 | 4    £12500  Classic Cars             MG            MGB Roadster  1973   
33 | 5    £40000  Classic Cars          Buick   redfern saloon tourer  1937   
34 |  ...
35 |    Country     Telephone         Date       Ref  
36 | 0       UK  07043 229662  23-Jul-2018  C1018332  
37 | 1       UK  07043 216048  23-Jul-2018   C983211  
38 | 2       UK  07043 225499  22-Jul-2018  C1018314  
39 | 3       UK  07043 217556  22-Jul-2018  C1018313  
40 | 4       UK  07043 215436  22-Jul-2018  C1018311  
41 | 5       UK  07043 228310  22-Jul-2018   C901161  
42 | ....
43 | 
44 | 
45 | 
46 | 
47 | ```
48 | 
49 | 


--------------------------------------------------------------------------------
/legacy_project/carandclassic/analysis/README.md:
--------------------------------------------------------------------------------
1 | # analysis 
2 | ## demo 
3 | * [Rental_Location_EDA](https://nbviewer.jupyter.org/github/yennanliu/web_scraping/blob/master/carandclassic/analysis/Rental_Location_EDA.ipynb) - notebook demo explore rental geo data 


--------------------------------------------------------------------------------
/legacy_project/carandclassic/carandclassic_scrape_sample.csv:
--------------------------------------------------------------------------------
 1 | ,Price,Category,Make,Model,Year,Country,Telephone,Date,Ref
 2 | 0,£8995,Classic Cars,Fiat,Coupe,1999,UK,07043 217757,23-Jul-2018,C921382
 3 | 1,£49000,Classic Cars,Lancia,Belna,1934,Italy, 393420953091,23-Jul-2018,C980353
 4 | 2,£11200,Classic Cars,Volkswagen,Beetle,1960,Netherlands,0031615231265,23-Jul-2018,C834642
 5 | 3,£20000,Classic Cars,Peugeot,RCZ R,2014,UK,07043 235688,23-Jul-2018,C979402
 6 | 4,£7995,Classic Cars,Volkswagen,Corrado,1996,UK,07957 430966,23-Jul-2018,C999085
 7 | 5,£3999,Classic Cars,Austin Healey,Sprite,1968,UK,07043 229662,23-Jul-2018,C1018332
 8 | 6,£3900,Classic Cars,Volvo,PV,1948,Sweden,0046705384888,23-Jul-2018,C1018331
 9 | 7,£29500,Classic Cars,Ford,Mustang,1966,USA,210 913 8353,23-Jul-2018,C929401
10 | 8,£60000,Classic Cars,Land Rover,Range Rover,1982,United Arab Emirates,00971504593964,23-Jul-2018,C969349
11 | 9,£44000,Classic Cars,AC,Ace,1961,New Zealand,0279234902,23-Jul-2018,C963690
12 | 10,£3250,Classic Cars,Austin Healey,Frogeye,1959,USA,001 (619) 561-3182,23-Jul-2018,C1018327
13 | 11,£17490,Classic Cars,Mercedes,500,2000,UK,07043 228934,23-Jul-2018,C1018325
14 | 12,£3250,Classic Cars,Jaguar,X300 V12 6 LITRES SALOON,1995,UK,07043 227855,23-Jul-2018,C1018324
15 | 13,£6250,Classic Cars,Jaguar,XJ8,2003,UK,07043 225889,23-Jul-2018,C1018323
16 | 14,£27950,Classic Cars,Audi,Quattro,1984,UK,07043 216048,23-Jul-2018,C983211
17 | 15,£19700,Classic Cars,Volkswagen,Karmann Ghia,1962,Spain,670032847,23-Jul-2018,C998835
18 | 16,£2375,Classic Cars,Volvo,460 gle,1990,UK,07043 225863,22-Jul-2018,C1018321
19 | 17,£3750,Classic Cars,BMW,5 Series,2000,UK,07043 225834,22-Jul-2018,C1018320
20 | 18,£1995,Classic Cars,BMW,3 Series,2001,UK,01455 271345,22-Jul-2018,C1018316
21 | 


--------------------------------------------------------------------------------
/legacy_project/carandclassic/cclassic_scrape_V1.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import datetime
 3 | import urllib, json
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | def get_html_data(url):
 7 | 	opener=urllib.request.build_opener()
 8 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 9 | 	page = opener.open(url)
10 | 	soup = BeautifulSoup(page)
11 | 	return soup
12 | 
13 | def fix_price(x):
14 |     return x.split(' ')[0]
15 | 
16 | def main_():
17 | 	# -----------  collect classic car ID  -----------
18 | 	url='https://www.carandclassic.co.uk/cat/3/'
19 | 	soup = get_html_data(url)
20 | 	content=soup.find_all('div',attrs={'class': 'item'})
21 | 	car_list = []
22 | 	for i in range(len(soup.find_all('div',attrs={'class': 'item'}))):
23 | 		# -----------  only get car ID with not null price -----------
24 | 		if len(content[i].find('li',attrs={'class':'price'}).text.replace('£','')) > 0:
25 | 			car_id = content[i].find('a').attrs['href']
26 | 			car_list.append(car_id)
27 | 		else:
28 | 			pass 
29 | 	print (car_list)
30 | 	#car_list = ['/car/C1017959', '/car/C1017957','/car/C1017957']
31 | 	# ----------- go through every car page, grab the car profile information  -----------
32 | 	output=[[] for i in range(len(car_list))]
33 | 	for i,car in enumerate(car_list):	
34 | 		url_ = 'https://www.carandclassic.co.uk' + str(car) 
35 | 		print ('url_ : ', url_)
36 | 		soup = get_html_data(url_)
37 | 		# ----------- collect needed columns -----------
38 | 		# Make, Model, Date, Ref, Telephone
39 | 		k_list = ['Price','Category','Make','Model','Year','Country','Telephone','Date','Ref']
40 | 		content=soup.find_all('td',attrs={'class':'caption'})
41 | 		for k in content:
42 | 			if k.text in k_list:
43 | 				print ('k_next' , k.find_next_siblings("td")[0].text)
44 | 				output[i].append(k.find_next_siblings("td")[0].text)
45 | 			else:
46 | 				pass
47 | 	print (output)
48 | 	# ----------- output scrape data as dataframe and fix column value -----------
49 | 	data = pd.DataFrame(output,columns =k_list )
50 | 	data['Price'] = data['Price'].apply(lambda x :  fix_price(x))
51 | 	print (data)
52 | 	return data
53 |     
54 | if __name__ == '__main__':
55 | 	main_()
56 | 


--------------------------------------------------------------------------------
/legacy_project/carousell/web_crawler copy.py:
--------------------------------------------------------------------------------
  1 | from bs4 import BeautifulSoup
  2 | import requests, re
  3 | import pandas as pd 
  4 | import urllib 
  5 | import random
  6 | import os
  7 | import sys
  8 | import subprocess
  9 | import json
 10 | 
 11 | def regular_chat():
 12 | 	sample_response = ['HI THERE', 'WAZZA UP','R U KIDDING ME', '..?']
 13 | 	response = sample_response[random.randint(0,3)]
 14 | 	print (response)
 15 | 	return response
 16 | 
 17 | def general_intro():
 18 | 	sample_response = """
 19 | ######## 
 20 | 
 21 | for Carousell product survey, please type "!caro prosuctname" \n
 22 | for asking, please type "ask" \n
 23 | for main application, please type anything \n
 24 | have fun :) \n
 25 | 
 26 | ########
 27 | 
 28 | 	"""
 29 | 	print (sample_response)
 30 | 	return sample_response
 31 | 	
 32 | # Spotify 
 33 | def spotify_album(artist):
 34 | 	# make sure artist name feat spotify API query form 
 35 | 	artist = artist.replace (" ", "+")
 36 | 	print (artist)
 37 | 	url="https://api.spotify.com/v1/search?q=${}&type=artist".format(artist)
 38 | 
 39 | 	command = """ 
 40 | 
 41 | 	API_ARTIST_URL=$(curl -s "{}" | jq -r '.artists.items[0].href') 
 42 | 	curl -s "$API_ARTIST_URL/top-tracks?country=US" > spotify_data.json
 43 | 
 44 | 	""".format(url)
 45 | 	print (command)
 46 | 	os.system(command)
 47 | 	album = ''
 48 | 	try:
 49 | 		data_spotify = json.loads(open('spotify_data.json').read())
 50 | 		for k in range(0,len(data_spotify['tracks'])): 
 51 | 		    print (data_spotify['tracks'][k]['name'])
 52 | 		    album += data_spotify['tracks'][k]['name'] + "\n\n"
 53 | 	except:
 54 | 		album = 'no feat artist, return null data'
 55 | 		print (album)
 56 | 	# remove intermediate json 	
 57 | 	os.system('rm spotify_data.json')
 58 | 	return album
 59 | 	
 60 | # Carousell
 61 | def Caro_grab_(query):
 62 | 	url = 'https://tw.carousell.com/search/products/?query={}'
 63 | 	url=url.format(query)
 64 | 	opener=urllib.request.build_opener()
 65 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 66 | 	page = opener.open(url)
 67 | 	soup = BeautifulSoup(page,"html.parser")
 68 | 	anchors = soup.find_all('a', {'class': 'pdt-card-thumbnail', 'href': True})
 69 | 	content='' 
 70 | 	url_refix = 'https://tw.carousell.com/p/'
 71 | 	for anchor in anchors:
 72 | 		for k in re.findall('\d+', anchor['href']):
 73 | 			if len(k) > 3:
 74 | 				url = url_refix + k 
 75 | 				content += anchor.find('img')['alt'] + "\n" + str(url) + "\n\n"
 76 | 
 77 | 	print (content)
 78 | 	return content[:600]
 79 | 
 80 | def Caro_grab():
 81 | 	url = 'https://tw.carousell.com/?hl=en'
 82 | 	opener=urllib.request.build_opener()
 83 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 84 | 	page = opener.open(url)
 85 | 	soup = BeautifulSoup(page,"html.parser")
 86 | 	anchors = soup.find_all('a', {'class': 'pdt-card-thumbnail', 'href': True})
 87 | 	content='' 
 88 | 	url_refix = 'https://tw.carousell.com/p/'
 89 | 	for anchor in anchors:
 90 | 		for k in re.findall('\d+', anchor['href']):
 91 | 			if len(k) > 3:
 92 | 				url = url_refix + k 
 93 | 				content += anchor.find('img')['alt'] + "\n" + str(url) + "\n\n"
 94 | 
 95 | 	print (content)
 96 | 	return content[:600]
 97 | 
 98 | ### ipeen
 99 | def ipeen_grab():
100 | 	output = [[] for k in range(2)]
101 | 	for page in range(1,5):
102 | 	    url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page)
103 | 	    print (url)
104 | 	    opener=urllib.request.build_opener()
105 | 	    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
106 | 	    page = opener.open(url)
107 | 	    soup = BeautifulSoup(page)
108 | 	    for k in soup.find_all('a', attrs={'data-label': '店名'}):
109 | 	        output[0].append(k.text)
110 | 
111 | 	    for k in soup.findAll('span',{"style":"padding-left:3em;"}):
112 | 	        output[1].append(k.get_text())
113 | 	data = ''
114 | 	for k, m in zip(output[0],output[1]):
115 | 	    data += str(k) + str(m)
116 | 	# limit number of query response here, since there may be limit in msg length 
117 | 	return data[:600]
118 | 
119 | ### ptt beauty 
120 | def ptt_beauty():
121 | 	url = 'https://www.ptt.cc/bbs/Beauty/index.html'
122 | 	rs = requests.session()
123 | 	res = rs.get('https://www.ptt.cc/bbs/Beauty/index.html', verify=False)
124 | 	soup = BeautifulSoup(res.text, 'html.parser')
125 | 	#ALLpageURL = soup.select('.btn.wide')[1]['href']
126 | 	content=''
127 | 	# limit number of query response here, since there may be limit in msg length 
128 | 	for k in soup.find_all('a',href=True)[:15]:
129 | 
130 | 	    try:
131 | 	    	if len(k['href']) < 30:
132 | 	    		pass
133 | 	    	else:
134 | 	            print ("https://www.ptt.cc/"+ k['href'], k.text)
135 | 	            content +=  k.text + "\n" + 'https://www.ptt.cc%s'%(k['href']) + "\n\n"
136 | 	    except:
137 | 	        pass
138 | 
139 | 	print ('==================')
140 | 	print (content)
141 | 	return content
142 | 
143 | if __name__ == "__main__":
144 | 	spotify_album('pete rock')
145 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 | *.config
4 | *.log
5 | *.pyc
6 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | ## Tech 
 4 | 
 5 | Python 3.4.5, Pandas 0.20.3, numpy , BeautifulSoup, urllib, sqlite3
 6 | 
 7 | 
 8 | ## File Structure 
 9 | 
10 | ```
11 | ├── README.md
12 | ├── analysis.py : calcluate average temperature, best to swim day  
13 | ├── data2db.py  : dump needed data to sqlite 
14 | ├── scrap.py    : scrap weather data from wunderground.com
15 | └── weather.db  : sqlite db save whole daily weather data  in 2014 
16 | ```
17 | 
18 | 
19 | ## QUICK START
20 | 
21 | 
22 | ```Bash
23 | cd web_scraping
24 | # scrap and dump data to db 
25 | python data2db.py
26 | # get needed analysis output 
27 | python analysis.py
28 | 
29 | ```
30 | 
31 | ```Bash
32 | # output 
33 | 
34 | BeautifulSoup([your markup], "html5lib")
35 | 
36 |   markup_type=markup_type))
37 |         type temp_max temp_min        CET
38 | 0    Actual:       4°      -1° 2014-01-01
39 | 1   Average:       2°      -2° 2014-01-01
40 | 2    Actual:       7°       1° 2014-01-02
41 | 3   Average:       3°      -2° 2014-01-02
42 | .....
43 | 
44 | -------------------------------
45 | 
46 | SELECT avg(temp_max) AS avg_max_temp,
47 |        avg(temp_min) AS avg_min_temp,
48 | 
49 |   (SELECT ((avg(temp_max)+avg(temp_min)))/2
50 |    FROM weather_data) AS avg_all_temp
51 | FROM weather_data
52 | WHERE TYPE = 'Actual:'
53 | 
54 | 	
55 |    avg_max_temp  avg_min_temp  avg_all_temp
56 | 0     15.260274      7.312329     10.304795
57 | -------------------------------
58 | 
59 | SELECT date(CET) AS best_swim_date,
60 |        (temp_min+ temp_min)/2 AS avg_day_temp
61 | FROM weather_data
62 | WHERE avg_day_temp =
63 |     (SELECT max((temp_min+ temp_min)/2) AS max_mean_temp
64 |      FROM weather_data
65 |      WHERE TYPE = 'Actual:')
66 |   AND TYPE = 'Actual:'
67 | 
68 | 	
69 |   best_swim_date  avg_day_temp
70 | 0     2014-07-05            21
71 | 1     2014-07-21            21
72 | ```
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/analysis.py:
--------------------------------------------------------------------------------
 1 | from scrap import *
 2 | from data2db import *
 3 | 
 4 | def get_values():
 5 | 
 6 | 	sql0="""
 7 | SELECT *
 8 | 	FROM weather_data
 9 | 	LIMIT 10 ;
10 | 	"""
11 | 	sql ="""
12 | SELECT avg(temp_max) AS avg_max_temp,
13 |        avg(temp_min) AS avg_min_temp,
14 | 
15 |   (SELECT ((avg(temp_max)+avg(temp_min)))/2
16 |    FROM weather_data) AS avg_all_temp
17 | FROM weather_data
18 | WHERE TYPE = 'Actual:'
19 | 
20 | 	"""
21 | 
22 | 	sql2 ="""
23 | SELECT date(CET) AS best_swim_date,
24 |        (temp_min+ temp_min)/2 AS avg_day_temp
25 | FROM weather_data
26 | WHERE avg_day_temp =
27 |     (SELECT max((temp_min+ temp_min)/2) AS max_mean_temp
28 |      FROM weather_data
29 |      WHERE TYPE = 'Actual:')
30 |   AND TYPE = 'Actual:'
31 | 
32 | 	"""
33 | 	print (sql0)
34 | 	outcome0 = pd.read_sql(sql0, con ='sqlite:///weather.db' )
35 | 	print (outcome0)
36 | 	print ('-------------------------------')
37 | 	print (sql)
38 | 	outcome = pd.read_sql(sql, con ='sqlite:///weather.db' )
39 | 	print (outcome)
40 | 	print ('-------------------------------')
41 | 	print (sql2)
42 | 	outcome2 = pd.read_sql(sql2, con ='sqlite:///weather.db' )
43 | 	print (outcome2)
44 | 
45 | if __name__ == '__main__':
46 | 	get_values()
47 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/analysis.sql:
--------------------------------------------------------------------------------
 1 | # http://www.codedata.com.tw/database/mysql-tutorial-13-stored-routines/
 2 | 
 3 | # https://www.a2hosting.co.uk/kb/developer-corner/mysql/mysql-stored-functions-and-procedures
 4 | 
 5 | #  stored function
 6 | DELIMITER $$
 7 | CREATE FUNCTION plus(temp_max FLOAT, temp_min FLOAT) RETURNS DECIMAL(9,2)
 8 | BEGIN
 9 |   DECLARE tem__ DECIMAL(9,2);
10 |   SET tem__ = temp_max + temp_min;
11 |   RETURN tem__;
12 | END$$
13 | DELIMITER ;
14 | 
15 | 
16 | # sql 
17 | SELECT *, plus(temp_max,temp_min) AS tem_test FROM weather_data;
18 | 
19 | 
20 | #  stored function
21 | DELIMITER $$
22 | CREATE PROCEDURE procedureTest()
23 | BEGIN
24 |   SELECT CET FROM weather_data;
25 | END$$
26 | DELIMITER ;
27 | 
28 | # execute 
29 | CALL procedureTest() \G
30 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/data2db.py:
--------------------------------------------------------------------------------
 1 | from scrap import *
 2 | 
 3 | def get_data():
 4 | 	output = pd.DataFrame()
 5 | 	for month in range(1,13):
 6 | 		df = get_weather_data('2014',month)
 7 | 		print (df)
 8 | 		output= output.append(df)
 9 | 	return output 
10 | 
11 | 
12 | def dump_db():
13 | 	try:
14 | 		df = get_data()
15 | 		df.to_sql('weather_data', if_exists='fail',con='sqlite:///weather.db')
16 | 		print ('dump to DB ok')
17 | 	except:
18 | 		print ('dump DB failed')
19 | 
20 | 
21 | def update_db():
22 | 	try:
23 | 		df = get_data()
24 | 		df.to_sql('weather_data',if_exists='append',con='sqlite:///weather.db')
25 | 		print ('update to DB ok')
26 | 	except:
27 | 		print ('update DB failed')
28 | 
29 | 
30 | #==============================
31 | 
32 | 
33 | class db_manipulation:
34 | 	def __init__(self, *args, **kwargs):
35 | 		self.df = get_data()
36 | 		self.con  = 'sqlite:///weather.db'
37 | 	def test(self):
38 | 		print (self.con)
39 | 
40 | 	def dumb2db(self):
41 | 		try:
42 | 			self.df.to_sql('weather_data',if_exists='fail',con=self.con)
43 | 			print ('dump to DB ok')
44 | 		except:
45 | 			print ('dump DB failed')
46 | 
47 | 	def update2db(self):
48 | 		try:
49 | 			df = self.df 
50 | 			df.to_sql('weather_data',if_exists='append',con=self.con)
51 | 			print ('update to DB ok')
52 | 		except:
53 | 			print ('dump DB failed')
54 | 
55 | if __name__ == '__main__':
56 | 	db_job = db_manipulation()
57 | 	db_job.dumb2db()
58 | 	#dump_db()


--------------------------------------------------------------------------------
/legacy_project/delivery_/query_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | sqlite3  weather.db "select * from weather_data limit 3;" 
4 | echo '---------------------'
5 | echo ''
6 | sqlite3  weather.db "select CET from weather_data limit 3;" 


--------------------------------------------------------------------------------
/legacy_project/delivery_/scrap.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np 
 3 | import datetime
 4 | import urllib, json
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | def get_weather_data(year,month):
 8 | 	#url_ = "https://www.wunderground.com/history/airport/EDDT/2014/12/01/MonthlyCalendar.html?req_city=Werftpfuhl&req_statename=Germany&reqdb.zip=00000&reqdb.magic=46&reqdb.wmo=10389#calendar"
 9 | 	url_ = "https://www.wunderground.com/history/airport/EDDT/{}/{}/01/MonthlyCalendar.html?req_city=Werftpfuhl&req_statename=Germany&reqdb.zip=00000&reqdb.magic=46&reqdb.wmo=10389#calendar".format(year,month)
10 | 	print (url_)
11 | 	# query the page 
12 | 	opener=urllib.request.build_opener()
13 | 	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
14 | 	page = opener.open(url_)
15 | 	soup = BeautifulSoup(page)
16 | 	# set up output and filter data by attrs 
17 | 	output = [[] for k in range(3)]
18 | 	for k in soup.find_all('td', attrs={'class': 'value-header'}):
19 | 	    output[0].append(k.text)
20 | 	    
21 | 	for k in soup.find_all('span', attrs={'class': 'high'}):
22 | 	    output[1].append(k.text)
23 | 	    
24 | 	for k in soup.find_all('span', attrs={'class': 'low'}):
25 | 	    output[2].append(k.text)
26 | 
27 | 	output_ =pd.DataFrame(output).T
28 | 	output_.columns = ['type','temp_max','temp_min']
29 | 	# get day list in 2014 
30 | 	sample_dates = pd.date_range(start='2014-01-01',end='2014-12-31', freq='d')
31 | 	datetimelist = []
32 | 	# get day list in specific month 
33 | 	for x in sample_dates:
34 | 		# '1'.zfill(2) = 01 , '11'.zfill(2) = 11 
35 | 		month_ = str(month).zfill(2)
36 | 		if str(x)[:7] == '{}-{}'.format(year,month_):
37 | 	        #print (str(x))
38 | 			datetimelist.append(pd.to_datetime(x))
39 | 		else:
40 | 			pass
41 | 	datetime_ = pd.DataFrame(datetimelist)
42 | 	datetime_.columns=['CET']
43 | 	# duplicate datetime data, since there are Actual, and Average  weather data 
44 | 	datetime_ = pd.concat([datetime_]*2).sort_values('CET').reset_index()
45 | 	#datetime_.head()
46 | 	output_['CET'] = np.array(datetime_.CET)
47 | 	print (output_)
48 | 	return output_


--------------------------------------------------------------------------------
/legacy_project/delivery_/sqlite2csv.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #sqlite3 -header -csv  weather.db "select * from weather_data;" > weather.csv
 3 | 
 4 | # enter db name 
 5 | echo 'db list : '
 6 | echo '---------------'
 7 | ls *.db 
 8 | echo '---------------'
 9 | echo 'plz enter db file name ?'
10 | read dbname 
11 | 
12 | # enter csv name 
13 | echo 'plz enter csv file name ?'
14 | read csvname 
15 | 
16 | # enter table name 
17 | echo 'table list : '
18 | echo '---------------'
19 | sqlite3 $dbname.db ".table"
20 | echo '---------------'
21 | 
22 | echo 'plz enter table name ?'
23 | read tablename 
24 | 
25 | echo 'extract' $dbname  with  $tablename 'to' $csvname
26 | 
27 | sqlite3 -header -csv  $dbname.db "select * from $tablename ;" > $csvname.csv
28 | 


--------------------------------------------------------------------------------
/legacy_project/delivery_/weather.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/delivery_/weather.db


--------------------------------------------------------------------------------
/legacy_project/env.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | *** EC2 env 
 5 | 
 6 | 
 7 | - Chrome driver  :
 8 | ChromeDriver 2.25.426924 (649f9b868f6783ec9de71c123212b908bf3b232e) on port 9515
 9 | 
10 | - Chrome browser :
11 | 
12 | Google Chrome 58.0.3029.110
13 |  
14 | 
15 | *** local mac env 
16 | 
17 | 
18 | - Chrome driver  :
19 | Starting ChromeDriver 2.34.522932
20 | 
21 | - Chrome browser :
22 | Google Chrome 63.0.3239.108
23 | 
24 | 
25 | 
26 | *** Chrome driver download 
27 | 
28 | https://sites.google.com/a/chromium.org/chromedriver/downloads
29 | https://chromedriver.storage.googleapis.com/index.html
30 | 
31 | 
32 | 
33 | ** Chrome browser download 
34 | https://www.slimjet.com/chrome/google-chrome-old-version.php
35 | http://www.geocities.jp/ecvcn/exam/chrome_installer.html
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 |  
44 | 
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/legacy_project/es_scrapper_docker_demo/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6-alpine
2 | 
3 | ENV ESLATICSEARCH_URL http://localhost:9200/
4 | 
5 | ADD requirements.txt /app/requirements.txt
6 | ADD . /app/
7 | WORKDIR /app/
8 | RUN pip install -r requirements.txt
9 | ENTRYPOINT python app.py 


--------------------------------------------------------------------------------
/legacy_project/es_scrapper_docker_demo/README.md:
--------------------------------------------------------------------------------
 1 | ### Quick start
 2 | ```bash
 3 | #docker build -t es_scrapper_docker_instance . && docker run -it -t es_scrapper_docker_instance
 4 |  
 5 | docker-compose -f docker-compose.yml up 
 6 | 
 7 | ```
 8 | 
 9 | ### Modify from 
10 | - https://sysadmins.co.za/scraping-websites-with-python-and-beautiful-soup-and-ingesting-into-elasticsearch/
11 | 
12 | -  https://sysadmins.co.za/building-a-search-engine-for-our-scraped-data-on-elasticsearch-part-2/


--------------------------------------------------------------------------------
/legacy_project/es_scrapper_docker_demo/app.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | import requests
 4 | from bs4 import BeautifulSoup
 5 | from elasticsearch import Elasticsearch
 6 | 
 7 | # change to local host default ip 
 8 | #es_client = Elasticsearch(['http://localhost:9200'])
 9 | es_client = Elasticsearch(['http://127.0.0.1:9200'])
10 | 
11 | drop_index = es_client.indices.create(index='blog-sysadmins', ignore=400)
12 | create_index = es_client.indices.delete(index='blog-sysadmins', ignore=[400, 404])
13 | 
14 | def urlparser(title, url):
15 |     # scrape title
16 |     p = {}
17 |     post = title
18 |     page = requests.get(post).content
19 |     soup = BeautifulSoup(page, 'lxml')
20 |     title_name = soup.title.string
21 | 
22 |     # scrape tags
23 |     tag_names = []
24 |     desc = soup.findAll(attrs={"property":"article:tag"})
25 |     for x in range(len(desc)):
26 |         tag_names.append(desc[x-1]['content'].encode('utf-8'))
27 | 
28 |     # payload for elasticsearch
29 |     doc = {
30 |         'date': time.strftime("%Y-%m-%d"),
31 |         'title': title_name,
32 |         'tags': tag_names,
33 |         'url': url
34 |     }
35 |     # ingest payload into elasticsearch
36 |     res = es_client.index(index="blog-sysadmins", doc_type="docs", body=doc)
37 |     time.sleep(0.5)
38 | 
39 | sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'
40 | page = requests.get(sitemap_feed)
41 | sitemap_index = BeautifulSoup(page.content, 'html.parser')
42 | urls = [element.text for element in sitemap_index.findAll('loc')]
43 | 
44 | for i in range(3):
45 |     for x in urls:
46 |         print ('x :', x )
47 |         urlparser(x, x)
48 |     time.sleep(5)
49 | 


--------------------------------------------------------------------------------
/legacy_project/es_scrapper_docker_demo/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | services:
 3 |   app:
 4 |     build: 
 5 |       context: .
 6 |       dockerfile: Dockerfile
 7 |     networks:
 8 |            - docker-elk
 9 |     depends_on:
10 |       - elasticsearch
11 |   elasticsearch:
12 |     image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2
13 |     container_name: elasticsearch
14 |     environment:
15 |       - node.name=es01
16 |       - cluster.name=docker-cluster
17 |       - bootstrap.memory_lock=true
18 |       - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
19 |     ulimits:
20 |       memlock:
21 |         soft: -1
22 |         hard: -1
23 |     networks:
24 |            - docker-elk
25 |     privileged: true
26 |     ports:
27 |       - "9200:9200"
28 |       - "9300:9300"
29 |   kibana:
30 |     image: docker.elastic.co/kibana/kibana:6.3.2
31 |     container_name: kibana
32 |     environment:
33 |       SERVER_NAME: localhost
34 |       ELASTICSEARCH_URL: http://elasticsearch:9200"
35 |     networks:
36 |           - docker-elk
37 |     ports:
38 |       - "5601:5601"
39 |     depends_on:
40 |       - elasticsearch
41 | networks:
42 |   docker-elk:
43 |     driver: bridge


--------------------------------------------------------------------------------
/legacy_project/es_scrapper_docker_demo/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | elasticsearch
3 | requests


--------------------------------------------------------------------------------
/legacy_project/eztable/eztable_scarp.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | import time, re
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | # grab "steak restaurant "
 6 | url = 'https://tw.eztable.com/search?q=%E7%89%9B%E6%8E%92'
 7 | # set up selenium driver (via firefox)
 8 | driver = webdriver.Firefox()
 9 | driver.implicitly_wait(3)
10 | driver.get(url)
11 | # set grab 50 pages 
12 | for i in range(1,50):
13 | 	driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
14 | 	### wait 1 sec tull JS/ajax load the all contents 
15 | 	time.sleep(1)
16 | 
17 | # analysis web page generate by JS via BeautifulSoup
18 | soup = BeautifulSoup(driver.page_source, "html5lib")
19 | for block in soup.find_all('h5'):
20 | 	# print restaurant name 
21 | 	print (block.text)
22 | 


--------------------------------------------------------------------------------
/legacy_project/eztable/eztable_scrap_dev.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | import time, re
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | def enter_text(xpath, text, driver):
 6 |     textbox = driver.find_element_by_xpath(xpath)
 7 |     textbox.send_keys(text)
 8 | 
 9 | # grab "steak restaurant "
10 | url = 'https://tw.eztable.com/'
11 | # set up selenium driver (via firefox)
12 | driver = webdriver.Firefox()
13 | driver.implicitly_wait(3)
14 | driver.get(url)
15 | 
16 | xpath_ = ".//input[@class='search-input']"
17 | enter_text(xpath_, "japan" , driver)


--------------------------------------------------------------------------------
/legacy_project/eztable/eztable_scrap_dev2.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | import time, re
 3 | from bs4 import BeautifulSoup
 4 | from urllib.request import urlopen
 5 | 
 6 | # grab "steak restaurant "
 7 | url = 'http://www.google.com'
 8 | # set up selenium driver (via firefox)
 9 | driver = webdriver.Firefox()
10 | #driver.get(url)
11 | #element = driver.find_element_by_xpath("//input[@id='lst-ib']")
12 | #element.send_keys('abcde')
13 | 
14 | keywords = ['sss','jp','usa']
15 | keywords = ['日本']
16 | 
17 | for keyword in keywords:
18 | 	# access to website
19 | 	driver.get(url)
20 | 	element = driver.find_element_by_xpath("//input[@id='lst-ib']")
21 | 	element.send_keys(keyword)
22 | 	button = driver.find_element_by_xpath("//div[@class='jsb']/center/input[1]")
23 | 	button.click()
24 | 	# analyze website elements 
25 | 	current_url = driver.current_url
26 | 	page = urlopen(current_url)
27 | 	#html = driver.page_source
28 | 	#soup = BeautifulSoup(html)
29 | 	soup = BeautifulSoup(page, 'html.parser')
30 | 	for item in soup.find_all('b'):
31 | 		print (item.text)
32 | 	driver.implicitly_wait(10)
33 | 	#driver.get(url)


--------------------------------------------------------------------------------
/legacy_project/eztable/eztable_scrap_inputword.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | import time, re
 3 | from bs4 import BeautifulSoup
 4 | from urllib.request import urlopen
 5 | import time
 6 | 
 7 | # grab "steak restaurant "
 8 | url = 'https://tw.eztable.com/search?q='
 9 | # set up selenium driver (via firefox)
10 | driver = webdriver.Firefox()
11 | 
12 | keywords = ['港式','牛排','中餐']
13 | 
14 | for keyword in keywords:
15 | 	# access to website
16 | 	driver.get(url)
17 | 	# xpath for input word 
18 | 	element = driver.find_element_by_xpath("//input[@class='search-input']")
19 | 	element.send_keys(keyword)
20 | 	# xpath for search button 
21 | 	button = driver.find_element_by_class_name("search-btn")
22 | 	button.click()
23 | 	# get current url 
24 | 	print ('current_url : ', driver.current_url)
25 | 	driver.get(driver.current_url)
26 | 	# analyze html 
27 | 	soup = BeautifulSoup(driver.page_source, "html5lib")
28 | 	for block in soup.find_all('h5'):
29 | 		print (block.text)
30 | 	time.sleep(3)
31 | 


--------------------------------------------------------------------------------
/legacy_project/eztable/geckodriver.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/eztable/geckodriver.log


--------------------------------------------------------------------------------
/legacy_project/geojson.py:
--------------------------------------------------------------------------------
 1 | toy_json = {
 2 |     "type": "MultiPolygon",
 3 |     "coordinates": [
 4 |         
 5 |          [
 6 |              [ [100.0, 0.0], [101.0, 0.0], [101.0, 1.0],
 7 |                [100.0, 1.0], [100.0, 0.0] ]
 8 |         ],
 9 |          [
10 |              [ [200.0, 0.0], [201.0, 0.0], [201.0, 1.0],
11 |                [200.0, 1.0], [200.0, 0.0] ]
12 |         ]
13 |     ]
14 | }
15 | 
16 | 
17 | def make_multiple_polygon(coordinates_set1,coordinates_set2):
18 | 	pass 


--------------------------------------------------------------------------------
/legacy_project/google_geodata/geopy_address_lon_lat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # ref 
 3 | # https://pypi.python.org/pypi/geopy
 4 | # todo : deal with geopy request limit 
 5 | # https://stackoverflow.com/questions/30108786/how-to-deal-with-geopys-query-limit
 6 | # install Nominatim on server 
 7 | # https://wiki.openstreetmap.org/wiki/Nominatim/Installation
 8 | # work around 
 9 | # https://www.shanelynn.ie/batch-geocoding-in-python-with-google-geocoding-api/
10 | import numpy as np 
11 | import time
12 | import requests
13 | import os 
14 | from geopy.geocoders import Nominatim
15 | 
16 | def address_2_lonlat(x):
17 |     print (x)
18 |     time.sleep(1)  # let's see if sleep 1 per epoch is OK for limitation 
19 |     try:
20 |         geolocator = Nominatim()
21 |         location = geolocator.geocode(x)
22 |         print(location.latitude, location.longitude)
23 |         return [location.latitude, location.longitude]
24 |     except Exception as e:
25 |         print (e)
26 |         print ('fail to convert address to lon & lat ') 
27 |         return [None,None]
28 | 
29 | def address_2_lonlat_hack(x):
30 |     """
31 |     in case frequent request would make script get block
32 |     here is a mini hack : make script sleep until the API is able to response 
33 |     then do the run again 
34 |     """
35 |     print (x)
36 |     time.sleep(1)  # let's see if sleep 1 per epoch is OK for limitation 
37 |     try:
38 |         geolocator = Nominatim()
39 |         location = geolocator.geocode(x)
40 |         print(location.latitude, location.longitude)
41 |         return [location.latitude, location.longitude]
42 |     except Exception as e:
43 |         print (e)
44 |         if str(e) == '[Errno 61] Connection refused':
45 |             print ('meet API request limit, try again...')
46 |             print ('sleep 1 min  ...')
47 |             time.sleep(60)
48 |             address_2_lonlat_hack(x)
49 |         else:
50 |             print ('fail to convert address to lon & lat ') 
51 |         return [None,None]
52 | 
53 | def split_lat(x):
54 |     try:
55 |         return x[0]
56 |     except:
57 |         return None 
58 |     
59 | def split_lon(x):
60 |     try:
61 |         return x[1]
62 |     except:
63 |         return None 
64 | 
65 | def run_hack(df):
66 |     """
67 |     df :
68 |     id, address zipcode , lat lon 
69 |     """
70 |     pass         


--------------------------------------------------------------------------------
/legacy_project/google_geodata/gmap_address_lon_lat.py:
--------------------------------------------------------------------------------
 1 | import urllib.request, json 
 2 | import pandas as pd 
 3 | import numpy as np 
 4 | import requests
 5 | import urllib, json
 6 | import os 
 7 | # ref 
 8 | # https://developers.google.com/maps/documentation/geocoding/start
 9 | 
10 | gmap_api = os.environ['gmap_api']
11 | print ('gmap_api : ' , gmap_api)
12 | 
13 | 
14 | def gmap_url(address_):
15 | 	address_fix = address_.replace(' ','+')
16 | 	print (address_fix)
17 | 	g_map_url='https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(address_fix,gmap_api)
18 | 	print (g_map_url)
19 | 	return g_map_url 
20 | 
21 | def address_2_lonlat(g_map_url):
22 | 	with urllib.request.urlopen(g_map_url) as url:
23 | 		try:
24 | 			data = json.loads(url.read().decode())
25 | 			print(data)
26 | 			#data['results'][0]['geometry']['location']
27 | 			return data
28 | 		except Exception as e:
29 | 			print (e)
30 | 			print ('fail to convert address to lon & lat ') 
31 | 		return None 
32 | 
33 | def get_lon_lat(address_):
34 | 	g_map_url = gmap_url(address_)
35 | 	try:
36 | 		# to do :  fix utf-8 python 3 encodin problem 
37 | 		data = address_2_lonlat(g_map_url)
38 | 		result = data['results'][0]['geometry']['location']
39 | 	except:
40 | 		result = None,None
41 | 	return result 
42 | 
43 | def split_lat(x):
44 |     try:
45 |         return str(x['lat'])
46 |     except:
47 |         return None 
48 |     
49 | def split_lon(x):
50 |     try:
51 |         return str(x['lng'])
52 |     except:
53 |         return None 


--------------------------------------------------------------------------------
/legacy_project/ipeen/README.md:
--------------------------------------------------------------------------------
 1 | # web_scraping
 2 | 
 3 | 
 4 | ### Tech 
 5 | 
 6 | - Python 3  
 7 | 
 8 | 
 9 | ### Quick start
10 | 
11 | install git 
12 | 
13 | ```
14 | https://git-scm.com/book/en/v2/Getting-Started-Installing-Git
15 | https://www.atlassian.com/git/tutorials/install-git
16 | ```
17 | 
18 | ```
19 | $ git clone https://github.com/yennanliu/web_scraping
20 | $ cd web_scraping 
21 | $ source setup.sh
22 | ```
23 | 
24 | scrap ipeen  
25 | 
26 | ```
27 | python /ipeen/ipeen_grab.py 大安區
28 | ```
29 | or 
30 | 
31 | ```
32 | python /ipeen/ipeen_grab.py your_area
33 | ```
34 | 
35 | 
36 | ### Response
37 | 
38 | ```
39 | 	name	address	url	style	area
40 | 0	吉宏米粉湯(西門町店)	台北市萬華區昆明街140號1樓\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\...	http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店	小吃	萬華區
41 | 1	瓦法奇朵Waffogato(台北車站店)	台北市中正區信陽街29號\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t...	http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog...	異國料理	中正區
42 | ```
43 | 
44 | ```
45 | please check saving csv 
46 | 
47 | ```


--------------------------------------------------------------------------------
/legacy_project/ipeen/ipeen_grab.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import print_function
 3 | from __future__ import unicode_literals
 4 | import urllib 
 5 | #import simplejson 
 6 | from urllib.request import urlopen
 7 | from urllib.parse   import quote
 8 | import requests
 9 | from bs4 import BeautifulSoup
10 | import urllib, json
11 | import pandas as pd, numpy as np
12 | import sys ,re,time
13 | # transform chinese into web url in python 3 
14 | # https://stackoverflow.com/questions/1695183/how-to-percent-encode-url-parameters-in-python/13625238#13625238
15 | from urllib.parse import quote
16 | # parse parameter from command line to python 
17 | import argparse
18 | 
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument('nums', nargs='*')
21 | #parser.add_argument("echo")
22 | args = parser.parse_args()
23 | #print(args.echo)
24 | print(args.nums)
25 | #print (quote(args.echo))
26 | #area = quote(args.echo)
27 | search_string=" ".join(str(x) for x in args.nums)
28 | print (search_string)
29 | area = quote(search_string)
30 | 
31 | print ('===========')
32 | 
33 | def url_fix(x):
34 |     return 'http://www.ipeen.com.tw' + x
35 | 
36 | def parse_area(x):
37 | 	if x:
38 | 		return x[3:6]
39 | 	else:
40 | 		return ''
41 | 
42 | def grab_raw(area):
43 |     output = [[] for k in range(4)]
44 |     for page in range(0,171):
45 | 	    #url_='http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&p={}'
46 | 	    # 0-100-0-0 : all , 1-100-0-0 : 美食 ..
47 | 	    url_='http://www.ipeen.com.tw/search/all/000/1-100-0-0/?&baragain=1&adkw={}&p={}'
48 | 	    url_=url_.format(area,page)
49 | 	    print (url_)
50 | 	    opener=urllib.request.build_opener()
51 | 	    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
52 | 	    page = opener.open(url_)
53 | 	    soup = BeautifulSoup(page)
54 | 	    for k in soup.find_all('a', attrs={'data-label': '店名'}):
55 | 	        output[0].append(k.text)
56 | 
57 | 	    for k in soup.findAll('span',{"style":"padding-left:3em;"}):
58 | 	        output[1].append(k.get_text())
59 | 	    
60 | 	    for k in soup.find_all('a', {'class':"a37 ga_tracking"}):
61 | 	        if "/shop/" in str(k['href']):
62 | 	            output[2].append((k['href']))
63 | 	    for k in soup.find_all('a', attrs={'class': 'ga_tracking'}):
64 | 	        if "大分類" in str(k):
65 | 	            #print (k.text)
66 | 	            output[3].append((k.text))
67 | 	        
68 | 	        else:
69 | 	            pass
70 | 	    #time.sleep(1)
71 |     print (output)
72 |     return output
73 | 
74 | def grab_df():
75 | 	
76 | 	output = grab_raw(area)
77 | 	df = pd.DataFrame(output).T
78 | 	df.columns = ['name', 'address', 'url','style']
79 | 	df.url = df.url.apply(lambda x :url_fix(x) )
80 | 	df['area'] = df['address'].apply(lambda x :parse_area(x) )
81 | 	print (df.head())
82 | 	df.to_csv('ipeen_restaurant_板橋.csv')
83 | 	return df 
84 | 
85 | if __name__ == '__main__':
86 | 	grab_df()
87 | 


--------------------------------------------------------------------------------
/legacy_project/ipeen/ipeen_pivot.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | import pandas as pd
 4 | import numpy as np
 5 | 
 6 | # read all *.csv in samle file once and merge into one csv 
 7 | 
 8 | def data_prepare():
 9 |     #files = !ls *.csv # IPython magic
10 |     # using glob get all csv name in same route as list 
11 |     files = glob.glob('./*.csv')
12 |     df = pd.concat([pd.read_csv(f, index_col=0, header=None) for f in files], keys=files)
13 |     df.columns = ['name','address','url','style','area']
14 |     df_ = df.ix[1:].reset_index()
15 |     df_ = df_[['name','address','url','style','area']]
16 |     #df_.head()
17 |     return df_
18 | 
19 | def data_clean(df):
20 |     # modify ur city list here 
21 |     area_ = ['萬華區']
22 |     df = df[area_].reset_index()
23 |     return df
24 | 
25 | # group by city name and rename columns 
26 | df_ = data_prepare()
27 | df_inter = df_.groupby(['area','style']).count().reset_index()[['area','style','name']]
28 | df_inter.columns = [['area','style','count']]
29 | 
30 | # to pivot table 
31 | df_pivot = pd.pivot_table(df_inter, values='count', index=['area'],columns=['style'], aggfunc=np.sum).fillna(0).T
32 | df_pivot_ = data_clean(df_pivot)
33 | df_pivot.to_csv('df_pivot_final.csv')
34 | 


--------------------------------------------------------------------------------
/legacy_project/ipeen/ipeen_restaurant_grab_V2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 39,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "# -*- coding: utf-8 -*-"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import urllib \n",
 23 |     "#import simplejson \n",
 24 |     "from urllib.request import urlopen\n",
 25 |     "import csv\n",
 26 |     "import requests\n",
 27 |     "from bs4 import BeautifulSoup\n",
 28 |     "import lxml\n",
 29 |     "import urllib, json\n",
 30 |     "import pandas as pd, numpy as np\n",
 31 |     "import pprint\n",
 32 |     "import datetime as dt \n",
 33 |     "from urllib.parse   import quote\n",
 34 |     "import sys \n",
 35 |     "#import urllib2\n",
 36 |     "import re\n",
 37 |     "import lxml"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 63,
 43 |    "metadata": {
 44 |     "collapsed": true
 45 |    },
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# help function \n",
 49 |     "\n",
 50 |     "def url_fix(x):\n",
 51 |     "    return 'http://www.ipeen.com.tw' + x\n",
 52 |     "\n",
 53 |     "def parse_area(x):\n",
 54 |     "    return x[3:6]"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 27,
 60 |    "metadata": {
 61 |     "collapsed": true
 62 |    },
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "output = [[] for k in range(4)]"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 76,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "data": {
 75 |       "text/plain": [
 76 |        "'%E5%A4%A7%E5%AE%89%E5%8D%80'"
 77 |       ]
 78 |      },
 79 |      "execution_count": 76,
 80 |      "metadata": {},
 81 |      "output_type": "execute_result"
 82 |     }
 83 |    ],
 84 |    "source": [
 85 |     "#parse chiese into code can integrate into url \n",
 86 |     "from urllib.parse import quote\n",
 87 |     "quote('大安區')"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 49,
 93 |    "metadata": {},
 94 |    "outputs": [
 95 |     {
 96 |      "name": "stderr",
 97 |      "output_type": "stream",
 98 |      "text": [
 99 |       "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html5lib\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
100 |       "\n",
101 |       "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. To get rid of this warning, change code that looks like this:\n",
102 |       "\n",
103 |       " BeautifulSoup([your markup])\n",
104 |       "\n",
105 |       "to this:\n",
106 |       "\n",
107 |       " BeautifulSoup([your markup], \"html5lib\")\n",
108 |       "\n",
109 |       "  markup_type=markup_type))\n"
110 |      ]
111 |     }
112 |    ],
113 |    "source": [
114 |     "for page in range(1,5):\n",
115 |     "    #url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page)\n",
116 |     "    #print (url)\n",
117 |     "    url_='http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&bar={}'\n",
118 |     "    url_=url_.format(page)\n",
119 |     "    opener=urllib.request.build_opener()\n",
120 |     "    opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n",
121 |     "    page = opener.open(url_)\n",
122 |     "    soup = BeautifulSoup(page)\n",
123 |     "    for k in soup.find_all('a', attrs={'data-label': '店名'}):\n",
124 |     "        output[0].append(k.text)\n",
125 |     "\n",
126 |     "    for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n",
127 |     "        output[1].append(k.get_text())\n",
128 |     "    \n",
129 |     "    for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n",
130 |     "        if \"/shop/\" in str(k['href']):\n",
131 |     "            output[2].append((k['href']))\n",
132 |     "    for k in soup.find_all('a', attrs={'class': 'ga_tracking'}):\n",
133 |     "        if \"大分類\" in str(k):\n",
134 |     "            #print (k.text)\n",
135 |     "            output[3].append((k.text))\n",
136 |     "        \n",
137 |     "        else:\n",
138 |     "            pass"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 52,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "data": {
148 |       "text/plain": [
149 |        "'http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&bar=4'"
150 |       ]
151 |      },
152 |      "execution_count": 52,
153 |      "metadata": {},
154 |      "output_type": "execute_result"
155 |     }
156 |    ],
157 |    "source": [
158 |     "url_"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 50,
164 |    "metadata": {},
165 |    "outputs": [
166 |     {
167 |      "data": {
168 |       "text/html": [
169 |        "<div>\n",
170 |        "<table border=\"1\" class=\"dataframe\">\n",
171 |        "  <thead>\n",
172 |        "    <tr style=\"text-align: right;\">\n",
173 |        "      <th></th>\n",
174 |        "      <th>name</th>\n",
175 |        "      <th>address</th>\n",
176 |        "      <th>url</th>\n",
177 |        "      <th>style</th>\n",
178 |        "    </tr>\n",
179 |        "  </thead>\n",
180 |        "  <tbody>\n",
181 |        "    <tr>\n",
182 |        "      <th>0</th>\n",
183 |        "      <td>吉宏米粉湯(西門町店)</td>\n",
184 |        "      <td>台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
185 |        "      <td>http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店</td>\n",
186 |        "      <td>小吃</td>\n",
187 |        "    </tr>\n",
188 |        "    <tr>\n",
189 |        "      <th>1</th>\n",
190 |        "      <td>瓦法奇朵Waffogato(台北車站店)</td>\n",
191 |        "      <td>台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t...</td>\n",
192 |        "      <td>http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog...</td>\n",
193 |        "      <td>異國料理</td>\n",
194 |        "    </tr>\n",
195 |        "    <tr>\n",
196 |        "      <th>2</th>\n",
197 |        "      <td>添好運台灣 Timhowan Taiwan</td>\n",
198 |        "      <td>台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
199 |        "      <td>http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...</td>\n",
200 |        "      <td>中式料理</td>\n",
201 |        "    </tr>\n",
202 |        "    <tr>\n",
203 |        "      <th>3</th>\n",
204 |        "      <td>威靈頓街1號 粥麵茶餐廳</td>\n",
205 |        "      <td>台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
206 |        "      <td>http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳</td>\n",
207 |        "      <td>中式料理</td>\n",
208 |        "    </tr>\n",
209 |        "    <tr>\n",
210 |        "      <th>4</th>\n",
211 |        "      <td>劉山東小吃店</td>\n",
212 |        "      <td>台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...</td>\n",
213 |        "      <td>http://www.ipeen.com.tw/shop/44415-劉山東小吃店</td>\n",
214 |        "      <td>中式料理</td>\n",
215 |        "    </tr>\n",
216 |        "  </tbody>\n",
217 |        "</table>\n",
218 |        "</div>"
219 |       ],
220 |       "text/plain": [
221 |        "                    name                                            address  \\\n",
222 |        "0            吉宏米粉湯(西門町店)  台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
223 |        "1   瓦法奇朵Waffogato(台北車站店)  台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t...   \n",
224 |        "2  添好運台灣 Timhowan Taiwan  台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
225 |        "3           威靈頓街1號 粥麵茶餐廳  台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
226 |        "4                 劉山東小吃店  台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...   \n",
227 |        "\n",
228 |        "                                                 url style  \n",
229 |        "0    http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店    小吃  \n",
230 |        "1  http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog...  異國料理  \n",
231 |        "2  http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...  中式料理  \n",
232 |        "3   http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳  中式料理  \n",
233 |        "4          http://www.ipeen.com.tw/shop/44415-劉山東小吃店  中式料理  "
234 |       ]
235 |      },
236 |      "execution_count": 50,
237 |      "metadata": {},
238 |      "output_type": "execute_result"
239 |     }
240 |    ],
241 |    "source": [
242 |     "df = pd.DataFrame(output).T\n",
243 |     "df.columns = ['name', 'address', 'url','style']\n",
244 |     "df.url = df.url.apply(lambda x :url_fix(x) )\n",
245 |     "df.to_csv('ipeen_scrap.csv')\n",
246 |     "df.head()"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 79,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "#df.head(4)['address']"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 64,
261 |    "metadata": {
262 |     "collapsed": true
263 |    },
264 |    "outputs": [],
265 |    "source": [
266 |     "df['area'] = df['address'].apply(lambda x :parse_area(x) )"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 78,
272 |    "metadata": {},
273 |    "outputs": [
274 |     {
275 |      "data": {
276 |       "text/html": [
277 |        "<div>\n",
278 |        "<table border=\"1\" class=\"dataframe\">\n",
279 |        "  <thead>\n",
280 |        "    <tr style=\"text-align: right;\">\n",
281 |        "      <th></th>\n",
282 |        "      <th>name</th>\n",
283 |        "      <th>address</th>\n",
284 |        "      <th>url</th>\n",
285 |        "      <th>style</th>\n",
286 |        "      <th>area</th>\n",
287 |        "    </tr>\n",
288 |        "  </thead>\n",
289 |        "  <tbody>\n",
290 |        "    <tr>\n",
291 |        "      <th>0</th>\n",
292 |        "      <td>吉宏米粉湯(西門町店)</td>\n",
293 |        "      <td>台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
294 |        "      <td>http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店</td>\n",
295 |        "      <td>小吃</td>\n",
296 |        "      <td>萬華區</td>\n",
297 |        "    </tr>\n",
298 |        "    <tr>\n",
299 |        "      <th>1</th>\n",
300 |        "      <td>瓦法奇朵Waffogato(台北車站店)</td>\n",
301 |        "      <td>台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t...</td>\n",
302 |        "      <td>http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog...</td>\n",
303 |        "      <td>異國料理</td>\n",
304 |        "      <td>中正區</td>\n",
305 |        "    </tr>\n",
306 |        "    <tr>\n",
307 |        "      <th>2</th>\n",
308 |        "      <td>添好運台灣 Timhowan Taiwan</td>\n",
309 |        "      <td>台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
310 |        "      <td>http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...</td>\n",
311 |        "      <td>中式料理</td>\n",
312 |        "      <td>中正區</td>\n",
313 |        "    </tr>\n",
314 |        "    <tr>\n",
315 |        "      <th>3</th>\n",
316 |        "      <td>威靈頓街1號 粥麵茶餐廳</td>\n",
317 |        "      <td>台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
318 |        "      <td>http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳</td>\n",
319 |        "      <td>中式料理</td>\n",
320 |        "      <td>中正區</td>\n",
321 |        "    </tr>\n",
322 |        "    <tr>\n",
323 |        "      <th>4</th>\n",
324 |        "      <td>劉山東小吃店</td>\n",
325 |        "      <td>台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...</td>\n",
326 |        "      <td>http://www.ipeen.com.tw/shop/44415-劉山東小吃店</td>\n",
327 |        "      <td>中式料理</td>\n",
328 |        "      <td>中正區</td>\n",
329 |        "    </tr>\n",
330 |        "  </tbody>\n",
331 |        "</table>\n",
332 |        "</div>"
333 |       ],
334 |       "text/plain": [
335 |        "                    name                                            address  \\\n",
336 |        "0            吉宏米粉湯(西門町店)  台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
337 |        "1   瓦法奇朵Waffogato(台北車站店)  台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t...   \n",
338 |        "2  添好運台灣 Timhowan Taiwan  台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
339 |        "3           威靈頓街1號 粥麵茶餐廳  台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
340 |        "4                 劉山東小吃店  台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...   \n",
341 |        "\n",
342 |        "                                                 url style area  \n",
343 |        "0    http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店    小吃  萬華區  \n",
344 |        "1  http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog...  異國料理  中正區  \n",
345 |        "2  http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...  中式料理  中正區  \n",
346 |        "3   http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳  中式料理  中正區  \n",
347 |        "4          http://www.ipeen.com.tw/shop/44415-劉山東小吃店  中式料理  中正區  "
348 |       ]
349 |      },
350 |      "execution_count": 78,
351 |      "metadata": {},
352 |      "output_type": "execute_result"
353 |     }
354 |    ],
355 |    "source": [
356 |     "df.head()"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 77,
362 |    "metadata": {
363 |     "collapsed": true
364 |    },
365 |    "outputs": [],
366 |    "source": [
367 |     "#df.to_csv('ipeen_restaurant_0617.csv')"
368 |    ]
369 |   }
370 |  ],
371 |  "metadata": {
372 |   "kernelspec": {
373 |    "display_name": "Python 3",
374 |    "language": "python",
375 |    "name": "python3"
376 |   },
377 |   "language_info": {
378 |    "codemirror_mode": {
379 |     "name": "ipython",
380 |     "version": 3
381 |    },
382 |    "file_extension": ".py",
383 |    "mimetype": "text/x-python",
384 |    "name": "python",
385 |    "nbconvert_exporter": "python",
386 |    "pygments_lexer": "ipython3",
387 |    "version": "3.4.5"
388 |   }
389 |  },
390 |  "nbformat": 4,
391 |  "nbformat_minor": 2
392 | }
393 | 


--------------------------------------------------------------------------------
/legacy_project/ipeen/ipeen_restaurant_pivot_table.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "# -*- coding: utf-8 -*-"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 123,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import os\n",
 23 |     "import glob\n",
 24 |     "import pandas as pd\n",
 25 |     "import numpy as np\n",
 26 |     "#import matplotlib.pyplot as plt\n",
 27 |     "#%matplotlib inline"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 125,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# read all *.csv in samle file once and merge into one csv \n",
 37 |     "\n",
 38 |     "def data_prepare():\n",
 39 |     "    files = !ls *.csv # IPython magic\n",
 40 |     "    df = pd.concat([pd.read_csv(f, index_col=0, header=None) for f in files], keys=files)\n",
 41 |     "    df.columns = ['name','address','url','style','area']\n",
 42 |     "    df_ = df.ix[1:].reset_index()\n",
 43 |     "    df_ = df_[['name','address','url','style','area']]\n",
 44 |     "    #df_.head()\n",
 45 |     "    return df_\n",
 46 |     "\n",
 47 |     "def data_clean(df):\n",
 48 |     "    area_ = ['中和區', \n",
 49 |     "         '中山區', \n",
 50 |     "         '中正區',\n",
 51 |     "         '信義區',\n",
 52 |     "         '大同區', \n",
 53 |     "         '大安區',\n",
 54 |     "         '松山區', \n",
 55 |     "         '板橋區', \n",
 56 |     "         '永和區']\n",
 57 |     "    df = df[area_].reset_index()\n",
 58 |     "    return df "
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 72,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "df_= data_prepare()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 75,
 73 |    "metadata": {},
 74 |    "outputs": [
 75 |     {
 76 |      "data": {
 77 |       "text/html": [
 78 |        "<div>\n",
 79 |        "<table border=\"1\" class=\"dataframe\">\n",
 80 |        "  <thead>\n",
 81 |        "    <tr style=\"text-align: right;\">\n",
 82 |        "      <th></th>\n",
 83 |        "      <th>name</th>\n",
 84 |        "      <th>address</th>\n",
 85 |        "      <th>url</th>\n",
 86 |        "      <th>style</th>\n",
 87 |        "      <th>area</th>\n",
 88 |        "    </tr>\n",
 89 |        "  </thead>\n",
 90 |        "  <tbody>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>0</th>\n",
 93 |        "      <td>鹿兒島燒肉專賣店(中和中山店)</td>\n",
 94 |        "      <td>新北市中和區中山路二段28號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n...</td>\n",
 95 |        "      <td>http://www.ipeen.com.tw/shop/1128665-鹿兒島燒肉專賣店-...</td>\n",
 96 |        "      <td>燒烤類</td>\n",
 97 |        "      <td>中和區</td>\n",
 98 |        "    </tr>\n",
 99 |        "    <tr>\n",
100 |        "      <th>1</th>\n",
101 |        "      <td>青禾幸福鍋物涮涮屋(永安店)</td>\n",
102 |        "      <td>新北市中和區中和路380號2樓(永安市場捷運站旁)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
103 |        "      <td>http://www.ipeen.com.tw/shop/138215-青禾幸福鍋物涮涮屋-永安店</td>\n",
104 |        "      <td>鍋類</td>\n",
105 |        "      <td>中和區</td>\n",
106 |        "    </tr>\n",
107 |        "  </tbody>\n",
108 |        "</table>\n",
109 |        "</div>"
110 |       ],
111 |       "text/plain": [
112 |        "              name                                            address  \\\n",
113 |        "0  鹿兒島燒肉專賣店(中和中山店)  新北市中和區中山路二段28號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n...   \n",
114 |        "1   青禾幸福鍋物涮涮屋(永安店)  新北市中和區中和路380號2樓(永安市場捷運站旁)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
115 |        "\n",
116 |        "                                                 url style area  \n",
117 |        "0  http://www.ipeen.com.tw/shop/1128665-鹿兒島燒肉專賣店-...   燒烤類  中和區  \n",
118 |        "1  http://www.ipeen.com.tw/shop/138215-青禾幸福鍋物涮涮屋-永安店    鍋類  中和區  "
119 |       ]
120 |      },
121 |      "execution_count": 75,
122 |      "metadata": {},
123 |      "output_type": "execute_result"
124 |     }
125 |    ],
126 |    "source": [
127 |     "df_.head(2)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 83,
133 |    "metadata": {},
134 |    "outputs": [
135 |     {
136 |      "data": {
137 |       "text/html": [
138 |        "<div>\n",
139 |        "<table border=\"1\" class=\"dataframe\">\n",
140 |        "  <thead>\n",
141 |        "    <tr style=\"text-align: right;\">\n",
142 |        "      <th></th>\n",
143 |        "      <th>area</th>\n",
144 |        "      <th>style</th>\n",
145 |        "      <th>count</th>\n",
146 |        "    </tr>\n",
147 |        "  </thead>\n",
148 |        "  <tbody>\n",
149 |        "    <tr>\n",
150 |        "      <th>0</th>\n",
151 |        "      <td>\\t\\t\\t</td>\n",
152 |        "      <td>冷凍/冷藏包裝食品</td>\n",
153 |        "      <td>3</td>\n",
154 |        "    </tr>\n",
155 |        "    <tr>\n",
156 |        "      <th>1</th>\n",
157 |        "      <td>\\t\\t\\t</td>\n",
158 |        "      <td>常溫包裝食品</td>\n",
159 |        "      <td>6</td>\n",
160 |        "    </tr>\n",
161 |        "    <tr>\n",
162 |        "      <th>2</th>\n",
163 |        "      <td>\\t\\t\\t</td>\n",
164 |        "      <td>網購包裝食品</td>\n",
165 |        "      <td>12</td>\n",
166 |        "    </tr>\n",
167 |        "  </tbody>\n",
168 |        "</table>\n",
169 |        "</div>"
170 |       ],
171 |       "text/plain": [
172 |        "     area      style  count\n",
173 |        "0  \\t\\t\\t  冷凍/冷藏包裝食品      3\n",
174 |        "1  \\t\\t\\t     常溫包裝食品      6\n",
175 |        "2  \\t\\t\\t     網購包裝食品     12"
176 |       ]
177 |      },
178 |      "execution_count": 83,
179 |      "metadata": {},
180 |      "output_type": "execute_result"
181 |     }
182 |    ],
183 |    "source": [
184 |     "df_inter = df_.groupby(['area','style']).count().reset_index()[['area','style','name']]\n",
185 |     "df_inter.columns = [['area','style','count']]\n",
186 |     "df_inter.head(3)"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 107,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "df_pivot = pd.pivot_table(df_inter, values='count', index=['area'],\n",
196 |     "                    columns=['style'], aggfunc=np.sum).fillna(0).T"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 126,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "df_pivot_ = data_clean(df_pivot)"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 127,
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/html": [
216 |        "<div>\n",
217 |        "<table border=\"1\" class=\"dataframe\">\n",
218 |        "  <thead>\n",
219 |        "    <tr style=\"text-align: right;\">\n",
220 |        "      <th>area</th>\n",
221 |        "      <th>style</th>\n",
222 |        "      <th>中和區</th>\n",
223 |        "      <th>中山區</th>\n",
224 |        "      <th>中正區</th>\n",
225 |        "      <th>信義區</th>\n",
226 |        "      <th>大同區</th>\n",
227 |        "      <th>大安區</th>\n",
228 |        "      <th>松山區</th>\n",
229 |        "      <th>板橋區</th>\n",
230 |        "      <th>永和區</th>\n",
231 |        "    </tr>\n",
232 |        "  </thead>\n",
233 |        "  <tbody>\n",
234 |        "    <tr>\n",
235 |        "      <th>0</th>\n",
236 |        "      <td>buffet自助餐</td>\n",
237 |        "      <td>1.0</td>\n",
238 |        "      <td>18.0</td>\n",
239 |        "      <td>6.0</td>\n",
240 |        "      <td>15.0</td>\n",
241 |        "      <td>6.0</td>\n",
242 |        "      <td>9.0</td>\n",
243 |        "      <td>8.0</td>\n",
244 |        "      <td>6.0</td>\n",
245 |        "      <td>2.0</td>\n",
246 |        "    </tr>\n",
247 |        "    <tr>\n",
248 |        "      <th>1</th>\n",
249 |        "      <td>style</td>\n",
250 |        "      <td>0.0</td>\n",
251 |        "      <td>0.0</td>\n",
252 |        "      <td>0.0</td>\n",
253 |        "      <td>0.0</td>\n",
254 |        "      <td>0.0</td>\n",
255 |        "      <td>0.0</td>\n",
256 |        "      <td>0.0</td>\n",
257 |        "      <td>0.0</td>\n",
258 |        "      <td>0.0</td>\n",
259 |        "    </tr>\n",
260 |        "    <tr>\n",
261 |        "      <th>2</th>\n",
262 |        "      <td>中式料理</td>\n",
263 |        "      <td>376.0</td>\n",
264 |        "      <td>851.0</td>\n",
265 |        "      <td>667.0</td>\n",
266 |        "      <td>524.0</td>\n",
267 |        "      <td>285.0</td>\n",
268 |        "      <td>907.0</td>\n",
269 |        "      <td>618.0</td>\n",
270 |        "      <td>462.0</td>\n",
271 |        "      <td>286.0</td>\n",
272 |        "    </tr>\n",
273 |        "    <tr>\n",
274 |        "      <th>3</th>\n",
275 |        "      <td>主題特色餐廳</td>\n",
276 |        "      <td>20.0</td>\n",
277 |        "      <td>160.0</td>\n",
278 |        "      <td>63.0</td>\n",
279 |        "      <td>101.0</td>\n",
280 |        "      <td>46.0</td>\n",
281 |        "      <td>218.0</td>\n",
282 |        "      <td>73.0</td>\n",
283 |        "      <td>54.0</td>\n",
284 |        "      <td>36.0</td>\n",
285 |        "    </tr>\n",
286 |        "    <tr>\n",
287 |        "      <th>4</th>\n",
288 |        "      <td>亞洲料理</td>\n",
289 |        "      <td>76.0</td>\n",
290 |        "      <td>130.0</td>\n",
291 |        "      <td>117.0</td>\n",
292 |        "      <td>153.0</td>\n",
293 |        "      <td>31.0</td>\n",
294 |        "      <td>243.0</td>\n",
295 |        "      <td>112.0</td>\n",
296 |        "      <td>78.0</td>\n",
297 |        "      <td>50.0</td>\n",
298 |        "    </tr>\n",
299 |        "    <tr>\n",
300 |        "      <th>5</th>\n",
301 |        "      <td>其他美食</td>\n",
302 |        "      <td>46.0</td>\n",
303 |        "      <td>44.0</td>\n",
304 |        "      <td>68.0</td>\n",
305 |        "      <td>32.0</td>\n",
306 |        "      <td>14.0</td>\n",
307 |        "      <td>54.0</td>\n",
308 |        "      <td>40.0</td>\n",
309 |        "      <td>46.0</td>\n",
310 |        "      <td>28.0</td>\n",
311 |        "    </tr>\n",
312 |        "    <tr>\n",
313 |        "      <th>6</th>\n",
314 |        "      <td>冰品、飲料、甜湯</td>\n",
315 |        "      <td>99.0</td>\n",
316 |        "      <td>193.0</td>\n",
317 |        "      <td>218.0</td>\n",
318 |        "      <td>195.0</td>\n",
319 |        "      <td>131.0</td>\n",
320 |        "      <td>315.0</td>\n",
321 |        "      <td>173.0</td>\n",
322 |        "      <td>187.0</td>\n",
323 |        "      <td>103.0</td>\n",
324 |        "    </tr>\n",
325 |        "    <tr>\n",
326 |        "      <th>7</th>\n",
327 |        "      <td>冷凍/冷藏包裝食品</td>\n",
328 |        "      <td>0.0</td>\n",
329 |        "      <td>0.0</td>\n",
330 |        "      <td>0.0</td>\n",
331 |        "      <td>0.0</td>\n",
332 |        "      <td>0.0</td>\n",
333 |        "      <td>0.0</td>\n",
334 |        "      <td>0.0</td>\n",
335 |        "      <td>0.0</td>\n",
336 |        "      <td>0.0</td>\n",
337 |        "    </tr>\n",
338 |        "    <tr>\n",
339 |        "      <th>8</th>\n",
340 |        "      <td>咖啡、簡餐、茶</td>\n",
341 |        "      <td>167.0</td>\n",
342 |        "      <td>667.0</td>\n",
343 |        "      <td>495.0</td>\n",
344 |        "      <td>405.0</td>\n",
345 |        "      <td>196.0</td>\n",
346 |        "      <td>798.0</td>\n",
347 |        "      <td>435.0</td>\n",
348 |        "      <td>262.0</td>\n",
349 |        "      <td>140.0</td>\n",
350 |        "    </tr>\n",
351 |        "    <tr>\n",
352 |        "      <th>9</th>\n",
353 |        "      <td>小吃</td>\n",
354 |        "      <td>307.0</td>\n",
355 |        "      <td>352.0</td>\n",
356 |        "      <td>420.0</td>\n",
357 |        "      <td>274.0</td>\n",
358 |        "      <td>335.0</td>\n",
359 |        "      <td>525.0</td>\n",
360 |        "      <td>384.0</td>\n",
361 |        "      <td>487.0</td>\n",
362 |        "      <td>236.0</td>\n",
363 |        "    </tr>\n",
364 |        "    <tr>\n",
365 |        "      <th>10</th>\n",
366 |        "      <td>常溫包裝食品</td>\n",
367 |        "      <td>0.0</td>\n",
368 |        "      <td>0.0</td>\n",
369 |        "      <td>0.0</td>\n",
370 |        "      <td>0.0</td>\n",
371 |        "      <td>0.0</td>\n",
372 |        "      <td>0.0</td>\n",
373 |        "      <td>0.0</td>\n",
374 |        "      <td>0.0</td>\n",
375 |        "      <td>0.0</td>\n",
376 |        "    </tr>\n",
377 |        "    <tr>\n",
378 |        "      <th>11</th>\n",
379 |        "      <td>日式料理</td>\n",
380 |        "      <td>117.0</td>\n",
381 |        "      <td>578.0</td>\n",
382 |        "      <td>289.0</td>\n",
383 |        "      <td>265.0</td>\n",
384 |        "      <td>111.0</td>\n",
385 |        "      <td>547.0</td>\n",
386 |        "      <td>270.0</td>\n",
387 |        "      <td>231.0</td>\n",
388 |        "      <td>99.0</td>\n",
389 |        "    </tr>\n",
390 |        "    <tr>\n",
391 |        "      <th>12</th>\n",
392 |        "      <td>早餐</td>\n",
393 |        "      <td>73.0</td>\n",
394 |        "      <td>69.0</td>\n",
395 |        "      <td>63.0</td>\n",
396 |        "      <td>66.0</td>\n",
397 |        "      <td>21.0</td>\n",
398 |        "      <td>75.0</td>\n",
399 |        "      <td>59.0</td>\n",
400 |        "      <td>142.0</td>\n",
401 |        "      <td>48.0</td>\n",
402 |        "    </tr>\n",
403 |        "    <tr>\n",
404 |        "      <th>13</th>\n",
405 |        "      <td>烘焙、甜點、零食</td>\n",
406 |        "      <td>95.0</td>\n",
407 |        "      <td>223.0</td>\n",
408 |        "      <td>238.0</td>\n",
409 |        "      <td>221.0</td>\n",
410 |        "      <td>100.0</td>\n",
411 |        "      <td>364.0</td>\n",
412 |        "      <td>161.0</td>\n",
413 |        "      <td>166.0</td>\n",
414 |        "      <td>75.0</td>\n",
415 |        "    </tr>\n",
416 |        "    <tr>\n",
417 |        "      <th>14</th>\n",
418 |        "      <td>燒烤類</td>\n",
419 |        "      <td>74.0</td>\n",
420 |        "      <td>187.0</td>\n",
421 |        "      <td>75.0</td>\n",
422 |        "      <td>82.0</td>\n",
423 |        "      <td>23.0</td>\n",
424 |        "      <td>182.0</td>\n",
425 |        "      <td>96.0</td>\n",
426 |        "      <td>107.0</td>\n",
427 |        "      <td>55.0</td>\n",
428 |        "    </tr>\n",
429 |        "    <tr>\n",
430 |        "      <th>15</th>\n",
431 |        "      <td>異國料理</td>\n",
432 |        "      <td>109.0</td>\n",
433 |        "      <td>328.0</td>\n",
434 |        "      <td>245.0</td>\n",
435 |        "      <td>341.0</td>\n",
436 |        "      <td>80.0</td>\n",
437 |        "      <td>688.0</td>\n",
438 |        "      <td>299.0</td>\n",
439 |        "      <td>259.0</td>\n",
440 |        "      <td>94.0</td>\n",
441 |        "    </tr>\n",
442 |        "    <tr>\n",
443 |        "      <th>16</th>\n",
444 |        "      <td>素食</td>\n",
445 |        "      <td>23.0</td>\n",
446 |        "      <td>71.0</td>\n",
447 |        "      <td>50.0</td>\n",
448 |        "      <td>52.0</td>\n",
449 |        "      <td>14.0</td>\n",
450 |        "      <td>62.0</td>\n",
451 |        "      <td>45.0</td>\n",
452 |        "      <td>39.0</td>\n",
453 |        "      <td>23.0</td>\n",
454 |        "    </tr>\n",
455 |        "    <tr>\n",
456 |        "      <th>17</th>\n",
457 |        "      <td>網購包裝食品</td>\n",
458 |        "      <td>0.0</td>\n",
459 |        "      <td>0.0</td>\n",
460 |        "      <td>0.0</td>\n",
461 |        "      <td>0.0</td>\n",
462 |        "      <td>0.0</td>\n",
463 |        "      <td>0.0</td>\n",
464 |        "      <td>0.0</td>\n",
465 |        "      <td>0.0</td>\n",
466 |        "      <td>0.0</td>\n",
467 |        "    </tr>\n",
468 |        "    <tr>\n",
469 |        "      <th>18</th>\n",
470 |        "      <td>速食料理</td>\n",
471 |        "      <td>29.0</td>\n",
472 |        "      <td>52.0</td>\n",
473 |        "      <td>63.0</td>\n",
474 |        "      <td>47.0</td>\n",
475 |        "      <td>24.0</td>\n",
476 |        "      <td>73.0</td>\n",
477 |        "      <td>57.0</td>\n",
478 |        "      <td>47.0</td>\n",
479 |        "      <td>25.0</td>\n",
480 |        "    </tr>\n",
481 |        "    <tr>\n",
482 |        "      <th>19</th>\n",
483 |        "      <td>鍋類</td>\n",
484 |        "      <td>113.0</td>\n",
485 |        "      <td>205.0</td>\n",
486 |        "      <td>108.0</td>\n",
487 |        "      <td>117.0</td>\n",
488 |        "      <td>47.0</td>\n",
489 |        "      <td>227.0</td>\n",
490 |        "      <td>143.0</td>\n",
491 |        "      <td>158.0</td>\n",
492 |        "      <td>108.0</td>\n",
493 |        "    </tr>\n",
494 |        "  </tbody>\n",
495 |        "</table>\n",
496 |        "</div>"
497 |       ],
498 |       "text/plain": [
499 |        "area      style    中和區    中山區    中正區    信義區    大同區    大安區    松山區    板橋區    永和區\n",
500 |        "0     buffet自助餐    1.0   18.0    6.0   15.0    6.0    9.0    8.0    6.0    2.0\n",
501 |        "1         style    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0\n",
502 |        "2          中式料理  376.0  851.0  667.0  524.0  285.0  907.0  618.0  462.0  286.0\n",
503 |        "3        主題特色餐廳   20.0  160.0   63.0  101.0   46.0  218.0   73.0   54.0   36.0\n",
504 |        "4          亞洲料理   76.0  130.0  117.0  153.0   31.0  243.0  112.0   78.0   50.0\n",
505 |        "5          其他美食   46.0   44.0   68.0   32.0   14.0   54.0   40.0   46.0   28.0\n",
506 |        "6      冰品、飲料、甜湯   99.0  193.0  218.0  195.0  131.0  315.0  173.0  187.0  103.0\n",
507 |        "7     冷凍/冷藏包裝食品    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0\n",
508 |        "8       咖啡、簡餐、茶  167.0  667.0  495.0  405.0  196.0  798.0  435.0  262.0  140.0\n",
509 |        "9            小吃  307.0  352.0  420.0  274.0  335.0  525.0  384.0  487.0  236.0\n",
510 |        "10       常溫包裝食品    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0\n",
511 |        "11         日式料理  117.0  578.0  289.0  265.0  111.0  547.0  270.0  231.0   99.0\n",
512 |        "12           早餐   73.0   69.0   63.0   66.0   21.0   75.0   59.0  142.0   48.0\n",
513 |        "13     烘焙、甜點、零食   95.0  223.0  238.0  221.0  100.0  364.0  161.0  166.0   75.0\n",
514 |        "14          燒烤類   74.0  187.0   75.0   82.0   23.0  182.0   96.0  107.0   55.0\n",
515 |        "15         異國料理  109.0  328.0  245.0  341.0   80.0  688.0  299.0  259.0   94.0\n",
516 |        "16           素食   23.0   71.0   50.0   52.0   14.0   62.0   45.0   39.0   23.0\n",
517 |        "17       網購包裝食品    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0\n",
518 |        "18         速食料理   29.0   52.0   63.0   47.0   24.0   73.0   57.0   47.0   25.0\n",
519 |        "19           鍋類  113.0  205.0  108.0  117.0   47.0  227.0  143.0  158.0  108.0"
520 |       ]
521 |      },
522 |      "execution_count": 127,
523 |      "metadata": {},
524 |      "output_type": "execute_result"
525 |     }
526 |    ],
527 |    "source": [
528 |     "df_pivot_"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "code",
533 |    "execution_count": 128,
534 |    "metadata": {},
535 |    "outputs": [],
536 |    "source": [
537 |     "#df_pivot.to_csv('df_final.csv')"
538 |    ]
539 |   }
540 |  ],
541 |  "metadata": {
542 |   "kernelspec": {
543 |    "display_name": "Python 3",
544 |    "language": "python",
545 |    "name": "python3"
546 |   },
547 |   "language_info": {
548 |    "codemirror_mode": {
549 |     "name": "ipython",
550 |     "version": 3
551 |    },
552 |    "file_extension": ".py",
553 |    "mimetype": "text/x-python",
554 |    "name": "python",
555 |    "nbconvert_exporter": "python",
556 |    "pygments_lexer": "ipython3",
557 |    "version": "3.4.5"
558 |   }
559 |  },
560 |  "nbformat": 4,
561 |  "nbformat_minor": 2
562 | }
563 | 


--------------------------------------------------------------------------------
/legacy_project/ipeen/ipeen_scraping-final.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "- ref http://stackoverflow.com/questions/40384289/web-scraping-python-extract-data-in-class-href-tag/40384398?noredirect=1#comment68021453_40384398\n",
  8 |     "- ref http://chrisalbon.com/python/beautiful_soup_html_basics.html\n",
  9 |     "- ref https://medium.com/dualcores-studio/python-x-%E7%B6%B2%E8%B7%AF%E7%88%AC%E8%9F%B2-c30ffda0ad78#.ruh8fs4v4"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import urllib \n",
 21 |     "#import simplejson \n",
 22 |     "import sys\n",
 23 |     "from urllib.request import urlopen\n",
 24 |     "import csv\n",
 25 |     "import requests\n",
 26 |     "from bs4 import BeautifulSoup\n",
 27 |     "import lxml\n",
 28 |     "import urllib, json\n",
 29 |     "import pandas as pd, numpy as np\n",
 30 |     "import pprint\n",
 31 |     "import datetime as dt \n",
 32 |     "from urllib.parse   import quote\n",
 33 |     "import sys \n",
 34 |     "#import urllib2\n",
 35 |     "import re\n",
 36 |     "import lxml"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "name": "stderr",
 48 |      "output_type": "stream",
 49 |      "text": [
 50 |       "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
 51 |       "\n",
 52 |       "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. To get rid of this warning, change code that looks like this:\n",
 53 |       "\n",
 54 |       " BeautifulSoup([your markup])\n",
 55 |       "\n",
 56 |       "to this:\n",
 57 |       "\n",
 58 |       " BeautifulSoup([your markup], \"html.parser\")\n",
 59 |       "\n",
 60 |       "  markup_type=markup_type))\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?adkw=%E5%8F%B0%E5%8C%97'\n",
 66 |     "opener=urllib.request.build_opener()\n",
 67 |     "opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n",
 68 |     "page = opener.open(url)\n",
 69 |     "soup = BeautifulSoup(page)"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "# Restaurant name "
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 39,
 82 |    "metadata": {
 83 |     "collapsed": false
 84 |    },
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stdout",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "黑潮市集花甲蟹鍋\n",
 91 |       "添好運台灣 Timhowan Taiwan\n",
 92 |       "威靈頓街1號 粥麵茶餐廳\n",
 93 |       "劉山東小吃店\n",
 94 |       "阜杭豆漿店\n",
 95 |       "點水樓(懷寧店)\n",
 96 |       "蘇杭餐廳(濟南店)\n",
 97 |       "北平田園餡餅粥\n",
 98 |       "小南門點心世界\n",
 99 |       "叁和院 台灣風格飲食\n",
100 |       "小魏川菜餐廳\n",
101 |       "123養生雞湯\n",
102 |       "達人食社-嘉味水餃\n",
103 |       "餃先生創意手工水餃\n",
104 |       "小喬阿姨私房泡菜\n",
105 |       "中式餐廳\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "for k in soup.find_all('a', attrs={'data-label': '店名'}):\n",
111 |     "    print (k.text)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "# Restaurant address"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 42,
124 |    "metadata": {
125 |     "collapsed": false
126 |    },
127 |    "outputs": [
128 |     {
129 |      "name": "stdout",
130 |      "output_type": "stream",
131 |      "text": [
132 |       "台北市大安區光復南路692巷6號\n",
133 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
134 |       "\n",
135 |       "台北市中正區忠孝西路一段36號1樓\n",
136 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
137 |       "\n",
138 |       "台北市中正區館前路12號5樓 (UNIQLO樓上)\n",
139 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
140 |       "\n",
141 |       "台北市中正區開封街一段14巷2號\n",
142 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
143 |       "\n",
144 |       "台北市中正區忠孝東路一段108號2樓\n",
145 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
146 |       "\n",
147 |       "台北市中正區懷寧街64號\n",
148 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
149 |       "\n",
150 |       "台北市中正區濟南路一段2-1號1樓\n",
151 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
152 |       "\n",
153 |       "台北市中正區重慶南路一段5巷1號\n",
154 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
155 |       "\n",
156 |       "台北市中正區北平西路3號(台北車站2樓)\n",
157 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
158 |       "\n",
159 |       "台北市大安區忠孝東路四段101巷14號\n",
160 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
161 |       "\n",
162 |       "台北市中正區公園路13號3樓\n",
163 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
164 |       "\n",
165 |       "台灣\n",
166 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
167 |       "台灣\n",
168 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
169 |       "台灣\n",
170 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
171 |       "台灣\n",
172 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\n",
173 |       "台灣\n",
174 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\n"
175 |      ]
176 |     }
177 |    ],
178 |    "source": [
179 |     "for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n",
180 |     "    print (k.get_text())"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "# Restaurant ID"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 38,
193 |    "metadata": {
194 |     "collapsed": false
195 |    },
196 |    "outputs": [
197 |     {
198 |      "name": "stdout",
199 |      "output_type": "stream",
200 |      "text": [
201 |       "/shop/805942-黑潮市集花甲蟹鍋\n",
202 |       "/shop/965236-添好運台灣-Timhowan-Taiwan\n",
203 |       "/shop/156281-威靈頓街1號-粥麵茶餐廳\n",
204 |       "/shop/44415-劉山東小吃店\n",
205 |       "/shop/27702-阜杭豆漿店\n",
206 |       "/shop/52988-點水樓-懷寧店\n",
207 |       "/shop/22553-蘇杭餐廳-濟南店\n",
208 |       "/shop/3800-北平田園餡餅粥\n",
209 |       "/shop/39130-小南門點心世界\n",
210 |       "/shop/941562-叁和院-台灣風格飲食\n",
211 |       "/shop/6325-小魏川菜餐廳\n",
212 |       "/shop/86651-123養生雞湯\n",
213 |       "/shop/62767-達人食社-嘉味水餃\n",
214 |       "/shop/583883-餃先生創意手工水餃\n",
215 |       "/shop/711282-小喬阿姨私房泡菜\n",
216 |       "/shop/593406-中式餐廳\n"
217 |      ]
218 |     }
219 |    ],
220 |    "source": [
221 |     "for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n",
222 |     "    \n",
223 |     "    if \"/shop/\" in str(k['href']):\n",
224 |     "        \n",
225 |     "        print (k['href'])\n",
226 |     "    else:\n",
227 |     "        pass"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "#  Put all together "
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 82,
240 |    "metadata": {
241 |     "collapsed": true
242 |    },
243 |    "outputs": [],
244 |    "source": [
245 |     "output = [[] for k in range(3)]"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 50,
251 |    "metadata": {
252 |     "collapsed": false
253 |    },
254 |    "outputs": [],
255 |    "source": [
256 |     "for k in soup.find_all('a', attrs={'data-label': '店名'}):\n",
257 |     "    output[0].append(k.text)\n",
258 |     "\n",
259 |     "for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n",
260 |     "    output[1].append(k.get_text())\n",
261 |     "    \n",
262 |     "\n",
263 |     "for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n",
264 |     "    \n",
265 |     "    if \"/shop/\" in str(k['href']):\n",
266 |     "        output[2].append((k['href']))\n",
267 |     "    else:\n",
268 |     "        pass"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 65,
274 |    "metadata": {
275 |     "collapsed": true
276 |    },
277 |    "outputs": [],
278 |    "source": [
279 |     "def url_fix(x):\n",
280 |     "    return 'http://www.ipeen.com.tw' + x "
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": 67,
286 |    "metadata": {
287 |     "collapsed": false
288 |    },
289 |    "outputs": [],
290 |    "source": [
291 |     "df = pd.DataFrame(output).T\n",
292 |     "df.columns = ['name', 'address', 'url']\n",
293 |     "df.url = df.url.apply(lambda x :url_fix(x) )"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": 70,
299 |    "metadata": {
300 |     "collapsed": false
301 |    },
302 |    "outputs": [
303 |     {
304 |      "data": {
305 |       "text/html": [
306 |        "<div>\n",
307 |        "<table border=\"1\" class=\"dataframe\">\n",
308 |        "  <thead>\n",
309 |        "    <tr style=\"text-align: right;\">\n",
310 |        "      <th></th>\n",
311 |        "      <th>name</th>\n",
312 |        "      <th>address</th>\n",
313 |        "      <th>url</th>\n",
314 |        "    </tr>\n",
315 |        "  </thead>\n",
316 |        "  <tbody>\n",
317 |        "    <tr>\n",
318 |        "      <th>0</th>\n",
319 |        "      <td>黑潮市集花甲蟹鍋</td>\n",
320 |        "      <td>台北市大安區光復南路692巷6號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
321 |        "      <td>http://www.ipeen.com.tw/shop/805942-黑潮市集花甲蟹鍋</td>\n",
322 |        "    </tr>\n",
323 |        "    <tr>\n",
324 |        "      <th>1</th>\n",
325 |        "      <td>添好運台灣 Timhowan Taiwan</td>\n",
326 |        "      <td>台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
327 |        "      <td>http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...</td>\n",
328 |        "    </tr>\n",
329 |        "    <tr>\n",
330 |        "      <th>2</th>\n",
331 |        "      <td>威靈頓街1號 粥麵茶餐廳</td>\n",
332 |        "      <td>台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
333 |        "      <td>http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳</td>\n",
334 |        "    </tr>\n",
335 |        "    <tr>\n",
336 |        "      <th>3</th>\n",
337 |        "      <td>劉山東小吃店</td>\n",
338 |        "      <td>台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
339 |        "      <td>http://www.ipeen.com.tw/shop/44415-劉山東小吃店</td>\n",
340 |        "    </tr>\n",
341 |        "    <tr>\n",
342 |        "      <th>4</th>\n",
343 |        "      <td>阜杭豆漿店</td>\n",
344 |        "      <td>台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...</td>\n",
345 |        "      <td>http://www.ipeen.com.tw/shop/27702-阜杭豆漿店</td>\n",
346 |        "    </tr>\n",
347 |        "  </tbody>\n",
348 |        "</table>\n",
349 |        "</div>"
350 |       ],
351 |       "text/plain": [
352 |        "                    name                                            address  \\\n",
353 |        "0               黑潮市集花甲蟹鍋   台北市大安區光復南路692巷6號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
354 |        "1  添好運台灣 Timhowan Taiwan  台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
355 |        "2           威靈頓街1號 粥麵茶餐廳  台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
356 |        "3                 劉山東小吃店   台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
357 |        "4                  阜杭豆漿店  台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...   \n",
358 |        "\n",
359 |        "                                                 url  \n",
360 |        "0       http://www.ipeen.com.tw/shop/805942-黑潮市集花甲蟹鍋  \n",
361 |        "1  http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...  \n",
362 |        "2   http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳  \n",
363 |        "3          http://www.ipeen.com.tw/shop/44415-劉山東小吃店  \n",
364 |        "4           http://www.ipeen.com.tw/shop/27702-阜杭豆漿店  "
365 |       ]
366 |      },
367 |      "execution_count": 70,
368 |      "metadata": {},
369 |      "output_type": "execute_result"
370 |     }
371 |    ],
372 |    "source": [
373 |     "df.to_csv('ipeen_scrap.csv')\n",
374 |     "df.head()"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "markdown",
379 |    "metadata": {},
380 |    "source": [
381 |     "# Loop over 5 pages "
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 86,
387 |    "metadata": {
388 |     "collapsed": true
389 |    },
390 |    "outputs": [],
391 |    "source": [
392 |     "output = [[] for k in range(3)]"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 87,
398 |    "metadata": {
399 |     "collapsed": false
400 |    },
401 |    "outputs": [
402 |     {
403 |      "name": "stdout",
404 |      "output_type": "stream",
405 |      "text": [
406 |       "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=1&adkw=%E5%8F%B0%E5%8C%97\n"
407 |      ]
408 |     },
409 |     {
410 |      "name": "stderr",
411 |      "output_type": "stream",
412 |      "text": [
413 |       "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
414 |       "\n",
415 |       "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. To get rid of this warning, change code that looks like this:\n",
416 |       "\n",
417 |       " BeautifulSoup([your markup])\n",
418 |       "\n",
419 |       "to this:\n",
420 |       "\n",
421 |       " BeautifulSoup([your markup], \"html.parser\")\n",
422 |       "\n",
423 |       "  markup_type=markup_type))\n"
424 |      ]
425 |     },
426 |     {
427 |      "name": "stdout",
428 |      "output_type": "stream",
429 |      "text": [
430 |       "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=2&adkw=%E5%8F%B0%E5%8C%97\n",
431 |       "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=3&adkw=%E5%8F%B0%E5%8C%97\n",
432 |       "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=4&adkw=%E5%8F%B0%E5%8C%97\n"
433 |      ]
434 |     }
435 |    ],
436 |    "source": [
437 |     "for page in range(1,5):\n",
438 |     "    url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page)\n",
439 |     "    print (url)\n",
440 |     "    opener=urllib.request.build_opener()\n",
441 |     "    opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n",
442 |     "    page = opener.open(url)\n",
443 |     "    soup = BeautifulSoup(page)\n",
444 |     "    for k in soup.find_all('a', attrs={'data-label': '店名'}):\n",
445 |     "        output[0].append(k.text)\n",
446 |     "\n",
447 |     "    for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n",
448 |     "        output[1].append(k.get_text())\n",
449 |     "    \n",
450 |     "    for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n",
451 |     "        if \"/shop/\" in str(k['href']):\n",
452 |     "            output[2].append((k['href']))\n",
453 |     "        else:\n",
454 |     "            pass\n"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 89,
460 |    "metadata": {
461 |     "collapsed": false,
462 |     "scrolled": false
463 |    },
464 |    "outputs": [
465 |     {
466 |      "data": {
467 |       "text/html": [
468 |        "<div>\n",
469 |        "<table border=\"1\" class=\"dataframe\">\n",
470 |        "  <thead>\n",
471 |        "    <tr style=\"text-align: right;\">\n",
472 |        "      <th></th>\n",
473 |        "      <th>name</th>\n",
474 |        "      <th>address</th>\n",
475 |        "      <th>url</th>\n",
476 |        "    </tr>\n",
477 |        "  </thead>\n",
478 |        "  <tbody>\n",
479 |        "    <tr>\n",
480 |        "      <th>0</th>\n",
481 |        "      <td>酒食坊 Pān-toh Bistro</td>\n",
482 |        "      <td>台北市松山區光復北路7號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
483 |        "      <td>http://www.ipeen.com.tw/shop/1046690-酒食坊-Pān-t...</td>\n",
484 |        "    </tr>\n",
485 |        "    <tr>\n",
486 |        "      <th>1</th>\n",
487 |        "      <td>添好運台灣 Timhowan Taiwan</td>\n",
488 |        "      <td>台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
489 |        "      <td>http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...</td>\n",
490 |        "    </tr>\n",
491 |        "    <tr>\n",
492 |        "      <th>2</th>\n",
493 |        "      <td>威靈頓街1號 粥麵茶餐廳</td>\n",
494 |        "      <td>台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...</td>\n",
495 |        "      <td>http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳</td>\n",
496 |        "    </tr>\n",
497 |        "    <tr>\n",
498 |        "      <th>3</th>\n",
499 |        "      <td>劉山東小吃店</td>\n",
500 |        "      <td>台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n</td>\n",
501 |        "      <td>http://www.ipeen.com.tw/shop/44415-劉山東小吃店</td>\n",
502 |        "    </tr>\n",
503 |        "    <tr>\n",
504 |        "      <th>4</th>\n",
505 |        "      <td>阜杭豆漿店</td>\n",
506 |        "      <td>台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...</td>\n",
507 |        "      <td>http://www.ipeen.com.tw/shop/27702-阜杭豆漿店</td>\n",
508 |        "    </tr>\n",
509 |        "  </tbody>\n",
510 |        "</table>\n",
511 |        "</div>"
512 |       ],
513 |       "text/plain": [
514 |        "                    name                                            address  \\\n",
515 |        "0     酒食坊 Pān-toh Bistro     台北市松山區光復北路7號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
516 |        "1  添好運台灣 Timhowan Taiwan  台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
517 |        "2           威靈頓街1號 粥麵茶餐廳  台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\...   \n",
518 |        "3                 劉山東小吃店   台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n   \n",
519 |        "4                  阜杭豆漿店  台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t...   \n",
520 |        "\n",
521 |        "                                                 url  \n",
522 |        "0  http://www.ipeen.com.tw/shop/1046690-酒食坊-Pān-t...  \n",
523 |        "1  http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh...  \n",
524 |        "2   http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳  \n",
525 |        "3          http://www.ipeen.com.tw/shop/44415-劉山東小吃店  \n",
526 |        "4           http://www.ipeen.com.tw/shop/27702-阜杭豆漿店  "
527 |       ]
528 |      },
529 |      "execution_count": 89,
530 |      "metadata": {},
531 |      "output_type": "execute_result"
532 |     }
533 |    ],
534 |    "source": [
535 |     "df = pd.DataFrame(output).T\n",
536 |     "df.columns = ['name', 'address', 'url']\n",
537 |     "df.url = df.url.apply(lambda x :url_fix(x) )\n",
538 |     "df.to_csv('ipeen_scrap.csv')\n",
539 |     "df.head()"
540 |    ]
541 |   }
542 |  ],
543 |  "metadata": {
544 |   "kernelspec": {
545 |    "display_name": "Python 3",
546 |    "language": "python",
547 |    "name": "python3"
548 |   },
549 |   "language_info": {
550 |    "codemirror_mode": {
551 |     "name": "ipython",
552 |     "version": 3
553 |    },
554 |    "file_extension": ".py",
555 |    "mimetype": "text/x-python",
556 |    "name": "python",
557 |    "nbconvert_exporter": "python",
558 |    "pygments_lexer": "ipython3",
559 |    "version": "3.4.5"
560 |   }
561 |  },
562 |  "nbformat": 4,
563 |  "nbformat_minor": 1
564 | }
565 | 


--------------------------------------------------------------------------------
/legacy_project/script/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/script/__init__.py


--------------------------------------------------------------------------------
/legacy_project/script/utility_data_IO.py:
--------------------------------------------------------------------------------
 1 | import psycopg2
 2 | from sqlalchemy import create_engine
 3 | from pytz import timezone
 4 | import datetime
 5 | import os
 6 | 
 7 | european = timezone('Europe/Madrid')
 8 | now_tz = datetime.datetime.now(tz = european)
 9 | now = now_tz.replace(tzinfo = None)
10 | now = now.replace(microsecond = 0)
11 | 
12 | db_url = os.environ['db_url']
13 | print ('db_url : ' , db_url)
14 | 
15 | def write_data_to_db(df, table_name,db_url):
16 |     try:
17 |         # add insert time 
18 |         df["date_of_insert"] = now
19 |         print ('=============')
20 |         print (df.head())
21 |         print (table_name)
22 |         print ('=============')
23 |         engine = create_engine(db_url)
24 |         conn = engine.connect()
25 |         df.to_sql(name= table_name, con= engine, schema= 'rw', if_exists = "append", index = False)
26 |         # close the connection after imput data 
27 |         conn.close()
28 |         print("insert to DB ok")
29 |     except Exception as e:
30 |         print (e)
31 |         print ('fail to write to db')
32 |         


--------------------------------------------------------------------------------
/legacy_project/script/utility_operation.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver 
 2 | 
 3 | class Login():
 4 | 	def user_login(self,driver):
 5 | 		"""
 6 | 		clear form before input id / password 
 7 | 		can find element by other ways : Xpath/CSS/parent..
 8 | 		"""
 9 | 		# input user id 
10 | 		driver.find_element_by_id("idinput").clear()
11 | 		driver.find_element_by_id("idinput").send_keys("user_name")
12 | 		# iiput password 
13 | 		driver.find_element_by_id("pwdinput").clear()
14 | 		driver.find_element_by_id("pwdinput").send_keys("user_password")
15 | 		# click login button 
16 | 		driver.find_element_by_id("loginbtn").click()
17 | 		print ('### login success ###')
18 | 	def user_logout(self,driver):
19 | 		driver.find_element_by_link_text("logout").click()
20 | 		driver.quit()
21 | 		print ('### log out success ###')


--------------------------------------------------------------------------------
/legacy_project/setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # set up env
 4 | read -p "Please set your env name: " env_name
 5 | echo 'your env name :' $env_name
 6 | echo 'creating env....'
 7 | conda create --name $env_name python=3 &&
 8 | 
 9 | # install library 
10 | echo 'start install env.... '
11 | source activate $env_name &&  pip install pandas urllib3 beautifulsoup4
12 | echo 'all env library installed successfully ! '
13 | 


--------------------------------------------------------------------------------
/legacy_project/weather_scrapper/LDN_weather_scrapper_V1.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np 
  3 | import datetime
  4 | import urllib, json
  5 | from bs4 import BeautifulSoup
  6 | # UDF 
  7 | from script.utility_data_IO import * 
  8 | #from script.utility_operation import * 
  9 | #import utility_data_IO
 10 | 
 11 | cols = ['Mean Temperature', 'Max Temperature', 'Min Temperature',
 12 |         'Heating Degree Days', 'Dew Point', 'Average Humidity',
 13 |         'Maximum Humidity', 'Minimum Humidity', 'Precipitation',
 14 |         'Sea Level Pressure', 'Wind Speed', 'Max Wind Speed', 'Max Gust Speed',
 15 |         'Visibility', 'Events', 'timestamp']
 16 | 
 17 | def col_fix(df):
 18 |     for col in cols:
 19 |         if col in df.columns:
 20 |             pass
 21 |         else:
 22 |             df[col]  = None  
 23 |     return df 
 24 | 
 25 | def main_(start_date,end_date):
 26 |     output=pd.DataFrame([])
 27 |     # -------------
 28 |     print ('-----------------')
 29 |     print ('start_date : ',start_date )
 30 |     print ('end_date : ',end_date )
 31 |     print ('-----------------')
 32 |     for day in pd.date_range(start=start_date, end=end_date, freq='D'):
 33 |     #for day in pd.date_range(start_date='3/1/2017', end_date='3/5/2017', freq='D'):
 34 |         print ((day))
 35 |         date_ = str(day).split(' ')[0] 
 36 |         year_ = date_.split('-')[0]
 37 |         month_ = date_.split('-')[1]
 38 |         day_ = date_.split('-')[2]
 39 |    # -------------
 40 |         url_new = 'https://www.wunderground.com/history/airport/EGMC/{}/{}/{}/DailyHistory.html?cm_ven=localwx_history'.format(year_,month_,day_)
 41 |         print (url_new)
 42 |         
 43 |         # query the page 
 44 |         opener=urllib.request.build_opener()
 45 |         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 46 |         page = opener.open(url_new)
 47 |         soup = BeautifulSoup(page)
 48 |         trs = soup.find_all('td', attrs={'class': 'indent'})
 49 |         col=[]
 50 |         val=[]
 51 |         for tr in trs:
 52 |             if tr.text in cols:
 53 |                 tds = tr.find_next_siblings("td") # you get list
 54 |                 print (tr.text )
 55 |                 col.append(tr.text)
 56 |                 print (tds[0].text)
 57 |                 val.append(tds[0].text.strip('\n')
 58 |                     .replace('\xa0','')
 59 |                     .replace('°C','')
 60 |                     .replace('mm','')
 61 |                     .replace('hPa','')
 62 |                     .replace('km/h\n ()','')
 63 |                     .replace('km/h','')
 64 |                     .replace('kilometers','')
 65 |                     .replace('\n\t', '')
 66 |                     .replace('\t', '')
 67 |                     .replace('\n', '')
 68 |                     .replace('- ()', ''))
 69 |                     #.replace('  -', ''))
 70 |             else:
 71 |                 col.append(tr.text) 
 72 |                 val.append(None) 
 73 | 
 74 |         df = pd.DataFrame({'col':col,'val':val}).set_index('col').T.reset_index()
 75 |         df['timestamp'] = day 
 76 |         del df['index']
 77 |         df = col_fix(df)
 78 |         print ('df.columns : ' , df.columns )
 79 |         print ('cols : ' , cols  )
 80 |         #df.columns = cols 
 81 |         df = df[cols] 
 82 |         ### update output dataframe 
 83 |         output = output.append(df)
 84 |     output = output.reset_index()
 85 |     print (output)
 86 |     del output['index']
 87 |     # fix column name 
 88 |     output.columns = ['mean_temperature','max_temperature', 'min_temperature',
 89 |                      'heating_degree_days', 'dew_point', 'avg_humidity',
 90 |                      'max_humidity', 'min_humidity', 'precipitation',
 91 |                      'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed',
 92 |                      'visibility', 'events', 'timestamp']
 93 |     # re-order columns 
 94 |     output = output[['timestamp','mean_temperature','max_temperature', 'min_temperature',
 95 |                      'heating_degree_days', 'dew_point', 'avg_humidity','max_humidity', 'min_humidity', 'precipitation',
 96 |                      'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed',
 97 |                      'visibility','events']]
 98 |     # clean data 
 99 |     output=output.replace('  -', np.nan)
100 |     print (output)
101 |     return output 
102 | 
103 | if __name__ == '__main__':
104 |     df_ = main_('1/1/2016', '12/31/2017')
105 |     # dump to DB 
106 |     write_data_to_db(df_, 'weather_ldn',db_url)
107 | 


--------------------------------------------------------------------------------
/legacy_project/weather_scrapper/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | ### Quick start
 5 | 
 6 | ```bash
 7 | # get repo and install packages 
 8 | $ git clone https://github.com/yennanliu/web_scraping
 9 | $ cd web_scraping 
10 | # install library
11 | $ source setup.sh
12 | # run demo script
13 | # export pythonpath 
14 | # export PYTHONPATH=/Users/yennanliu/web_scraping
15 | $ export PYTHONPATH=$(pwd)
16 | # export db_url 
17 | $ export db_url=<ur_db_url>
18 | $ python weather_scrapper/br_weather_scrapper_V1.py 
19 | 
20 | ```
21 | 


--------------------------------------------------------------------------------
/legacy_project/weather_scrapper/br_weather_scrapper_V1.py:
--------------------------------------------------------------------------------
  1 | # ops 
  2 | import pandas as pd
  3 | import numpy as np 
  4 | import datetime
  5 | import urllib, json
  6 | from bs4 import BeautifulSoup
  7 | 
  8 | # UDF
  9 | from script.utility_data_IO import * 
 10 | #from script.utility_operation import * 
 11 | #import utility_data_IO
 12 | 
 13 | cols = ['Mean Temperature', 'Max Temperature', 'Min Temperature',
 14 |         'Heating Degree Days', 'Dew Point', 'Average Humidity',
 15 |         'Maximum Humidity', 'Minimum Humidity', 'Precipitation',
 16 |         'Sea Level Pressure', 'Wind Speed', 'Max Wind Speed', 'Max Gust Speed',
 17 |         'Visibility', 'Events', 'timestamp']
 18 | 
 19 | def col_fix(df):
 20 |     for col in cols:
 21 |         if col in df.columns:
 22 |             pass
 23 |         else:
 24 |             df[col]  = None  
 25 |     return df 
 26 | 
 27 | def main_(start_date,end_date):
 28 |     output=pd.DataFrame([])
 29 |     # -------------
 30 |     print ('-----------------')
 31 |     print ('start_date : ',start_date )
 32 |     print ('end_date : ',end_date )
 33 |     print ('-----------------')
 34 |     for day in pd.date_range(start=start_date, end=end_date, freq='D'):
 35 |     #for day in pd.date_range(start_date='3/1/2017', end_date='3/5/2017', freq='D'):
 36 |         print ((day))
 37 |         date_ = str(day).split(' ')[0] 
 38 |         year_ = date_.split('-')[0]
 39 |         month_ = date_.split('-')[1]
 40 |         day_ = date_.split('-')[2]
 41 |    # -------------
 42 |         url_new = 'https://www.wunderground.com/history/airport/EBFS/{}/{}/{}/DailyHistory.html?req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo='.format(year_,month_,day_)
 43 |         print (url_new)
 44 |         
 45 |         # query the page 
 46 |         opener=urllib.request.build_opener()
 47 |         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 48 |         page = opener.open(url_new)
 49 |         soup = BeautifulSoup(page)
 50 |         trs = soup.find_all('td', attrs={'class': 'indent'})
 51 |         col=[]
 52 |         val=[]
 53 |         for tr in trs:
 54 |             if tr.text in cols:
 55 |                 tds = tr.find_next_siblings("td") # you get list
 56 |                 print (tr.text )
 57 |                 col.append(tr.text)
 58 |                 print (tds[0].text)
 59 |                 val.append(tds[0].text.strip('\n')
 60 |                     .replace('\xa0','')
 61 |                     .replace('°C','')
 62 |                     .replace('mm','')
 63 |                     .replace('hPa','')
 64 |                     .replace('km/h\n ()','')
 65 |                     .replace('km/h','')
 66 |                     .replace('kilometers','')
 67 |                     .replace('\n\t', '')
 68 |                     .replace('\t', '')
 69 |                     .replace('\n', '')
 70 |                     .replace('- ()', ''))
 71 |                     #.replace('-', ''))
 72 |             else:
 73 |                 col.append(tr.text) 
 74 |                 val.append(None) 
 75 | 
 76 |         df = pd.DataFrame({'col':col,'val':val}).set_index('col').T.reset_index()
 77 |         df['timestamp'] = day 
 78 |         del df['index']
 79 |         df = col_fix(df)
 80 |         print ('df.columns : ' , df.columns )
 81 |         print ('cols : ' , cols  )
 82 |         #df.columns = cols 
 83 |         df = df[cols] 
 84 |         ### update output dataframe 
 85 |         output = output.append(df)
 86 |     output = output.reset_index()
 87 |     print (output)
 88 |     del output['index']
 89 |     output.columns = ['mean_temperature','max_temperature', 'min_temperature',
 90 |                      'heating_degree_days', 'dew_point', 'avg_humidity',
 91 |                      'max_humidity', 'min_humidity', 'precipitation',
 92 |                      'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed',
 93 |                      'visibility', 'events', 'timestamp']
 94 | 
 95 |     output = output[['timestamp','mean_temperature','max_temperature', 'min_temperature',
 96 |                      'heating_degree_days', 'dew_point', 'avg_humidity','max_humidity', 'min_humidity', 'precipitation',
 97 |                      'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed',
 98 |                      'visibility','events']]
 99 |     # clean data 
100 |     output=output.replace('  -', np.nan)
101 |     """
102 |     # modify data type 
103 |     output['mean_temperature']=output['mean_temperature'].astype('float')
104 |     output['max_temperature']=output['max_temperature'].astype('float')
105 |     output['min_temperature']=output['min_temperature'].astype('float')
106 |     output['heating_degree_days']=output['heating_degree_days'].astype('float')
107 |     output['dew_point']=output['dew_point'].astype('float')
108 |     output['avg_humidity']=output['avg_humidity'].astype('float')
109 |     output['max_humidity']=output['max_humidity'].astype('float')
110 |     output['min_humidity']=output['min_humidity'].astype('float')
111 |     output['precipitation']=output['precipitation'].astype('float')
112 |     output['sea_level_pressure']=output['sea_level_pressure'].astype('float')
113 |     output['wind_speed']=output['wind_speed'].astype('float')
114 |     output['max_wind_speed']=output['max_wind_speed'].astype('float')
115 |     output['max_gust_speed']=output['max_gust_speed'].astype('float')
116 |     output['events']=output['events'].astype('str')
117 | 
118 |     print (output.info())
119 |     """
120 |     print (output.head())
121 |     return output 
122 | 
123 | if __name__ == '__main__': 
124 |     df_ = main_('1/1/2018', '1/31/2018')
125 |     #df_ = main_('2/1/2016', '2/10/2016')
126 |     print (df_)
127 |     # dump to DB 
128 |     write_data_to_db(df_, 'weather_brussels',db_url)


--------------------------------------------------------------------------------
/logs/log.txt:
--------------------------------------------------------------------------------
  1 | Completed =>	New+York	data+scientist	10	0	6.766853094100952	file_1  2019-08-04-17:45:15
  2 | Completed =>	New+York	data+engineer	10	0	3.0191662311553955	file_1  2019-08-31-14:43:31
  3 | Completed =>	New+York	data+engineer	20	10	6.937615156173706	file_1  2019-08-31-14:43:31
  4 | Completed =>	New+York	data+engineer	30	20	10.762848138809204	file_1  2019-08-31-14:43:31
  5 | Completed =>	New+York	data+engineer	40	30	14.53726601600647	file_1  2019-08-31-14:43:31
  6 | Completed =>	New+York	data+engineer	50	40	17.391534090042114	file_1  2019-08-31-14:43:31
  7 | Completed =>	New+York	data+engineer	60	50	20.694072008132935	file_1  2019-08-31-14:43:31
  8 | Completed =>	New+York	data+engineer	70	60	23.58491015434265	file_1  2019-08-31-14:43:31
  9 | Completed =>	New+York	data+engineer	80	70	26.531862020492554	file_1  2019-08-31-14:43:31
 10 | Completed =>	New+York	data+engineer	90	80	29.492170095443726	file_1  2019-08-31-14:43:31
 11 | Completed =>	New+York	data+engineer	100	90	32.42632722854614	file_1  2019-08-31-14:43:31
 12 | Completed =>	New+York	data+engineer	110	100	35.59654521942139	file_1  2019-08-31-14:43:31
 13 | Completed =>	New+York	data+engineer	120	110	38.5950391292572	file_1  2019-08-31-14:43:31
 14 | Completed =>	New+York	data+engineer	130	120	41.553301095962524	file_1  2019-08-31-14:43:31
 15 | Completed =>	New+York	data+engineer	140	130	44.89548707008362	file_1  2019-08-31-14:43:31
 16 | Completed =>	New+York	data+engineer	150	140	47.64503002166748	file_1  2019-08-31-14:43:31
 17 | Completed =>	New+York	data+engineer	160	150	50.49106001853943	file_1  2019-08-31-14:43:31
 18 | Completed =>	New+York	data+engineer	170	160	53.34058403968811	file_1  2019-08-31-14:43:31
 19 | Completed =>	New+York	data+engineer	180	170	56.251546144485474	file_1  2019-08-31-14:43:31
 20 | Completed =>	New+York	data+engineer	190	180	59.024163007736206	file_1  2019-08-31-14:43:31
 21 | Completed =>	New+York	data+engineer	200	190	61.882164001464844	file_1  2019-08-31-14:43:31
 22 | Completed =>	New+York	data+engineer	210	200	64.63623714447021	file_1  2019-08-31-14:43:31
 23 | Completed =>	New+York	data+engineer	220	210	67.43612313270569	file_1  2019-08-31-14:43:31
 24 | Completed =>	New+York	data+engineer	230	220	70.1706280708313	file_1  2019-08-31-14:43:31
 25 | Completed =>	New+York	data+engineer	240	230	73.02157711982727	file_1  2019-08-31-14:43:31
 26 | Completed =>	New+York	data+engineer	250	240	75.7884030342102	file_1  2019-08-31-14:43:31
 27 | Completed =>	New+York	data+engineer	260	250	78.51018619537354	file_1  2019-08-31-14:43:31
 28 | Completed =>	New+York	data+engineer	270	260	81.20932006835938	file_1  2019-08-31-14:43:31
 29 | Completed =>	New+York	data+engineer	280	270	84.06421709060669	file_1  2019-08-31-14:43:31
 30 | Completed =>	New+York	data+engineer	290	280	86.99712109565735	file_1  2019-08-31-14:43:31
 31 | Completed =>	New+York	data+engineer	300	290	89.98329901695251	file_1  2019-08-31-14:43:31
 32 | Completed =>	Singapore	data+engineer	10	0	1.6717309951782227	file_2  2019-08-31-14:43:31
 33 | Completed =>	Singapore	data+engineer	20	10	3.3881940841674805	file_2  2019-08-31-14:43:31
 34 | Completed =>	Singapore	data+engineer	30	20	5.25773811340332	file_2  2019-08-31-14:43:31
 35 | Completed =>	Singapore	data+engineer	40	30	6.985188007354736	file_2  2019-08-31-14:43:31
 36 | Completed =>	Singapore	data+engineer	50	40	8.669186115264893	file_2  2019-08-31-14:43:31
 37 | Completed =>	Singapore	data+engineer	60	50	10.399969100952148	file_2  2019-08-31-14:43:31
 38 | Completed =>	Singapore	data+engineer	70	60	12.182042121887207	file_2  2019-08-31-14:43:31
 39 | Completed =>	Singapore	data+engineer	80	70	13.971599102020264	file_2  2019-08-31-14:43:31
 40 | Completed =>	Singapore	data+engineer	90	80	15.70194411277771	file_2  2019-08-31-14:43:31
 41 | Completed =>	Singapore	data+engineer	100	90	17.422276973724365	file_2  2019-08-31-14:43:31
 42 | Completed =>	Singapore	data+engineer	110	100	19.24924898147583	file_2  2019-08-31-14:43:31
 43 | Completed =>	Singapore	data+engineer	120	110	21.29511594772339	file_2  2019-08-31-14:43:31
 44 | Completed =>	Singapore	data+engineer	130	120	23.084742069244385	file_2  2019-08-31-14:43:31
 45 | Completed =>	Singapore	data+engineer	140	130	24.825146913528442	file_2  2019-08-31-14:43:31
 46 | Completed =>	Singapore	data+engineer	150	140	26.45589303970337	file_2  2019-08-31-14:43:31
 47 | Completed =>	Singapore	data+engineer	160	150	28.121127128601074	file_2  2019-08-31-14:43:31
 48 | Completed =>	Singapore	data+engineer	170	160	29.95150399208069	file_2  2019-08-31-14:43:31
 49 | Completed =>	Singapore	data+engineer	180	170	31.694020986557007	file_2  2019-08-31-14:43:31
 50 | Completed =>	Singapore	data+engineer	190	180	33.31903910636902	file_2  2019-08-31-14:43:31
 51 | Completed =>	Singapore	data+engineer	200	190	34.96339702606201	file_2  2019-08-31-14:43:31
 52 | Completed =>	Singapore	data+engineer	210	200	36.58315300941467	file_2  2019-08-31-14:43:31
 53 | Completed =>	Singapore	data+engineer	220	210	38.307523012161255	file_2  2019-08-31-14:43:31
 54 | Completed =>	Singapore	data+engineer	230	220	40.03498101234436	file_2  2019-08-31-14:43:31
 55 | Completed =>	Singapore	data+engineer	240	230	41.822685956954956	file_2  2019-08-31-14:43:31
 56 | Completed =>	Singapore	data+engineer	250	240	43.43740200996399	file_2  2019-08-31-14:43:31
 57 | Completed =>	Singapore	data+engineer	260	250	45.15139198303223	file_2  2019-08-31-14:43:31
 58 | Completed =>	Singapore	data+engineer	270	260	46.89502310752869	file_2  2019-08-31-14:43:31
 59 | Completed =>	Singapore	data+engineer	280	270	48.57830190658569	file_2  2019-08-31-14:43:31
 60 | Completed =>	Singapore	data+engineer	290	280	50.21092700958252	file_2  2019-08-31-14:43:31
 61 | Completed =>	Singapore	data+engineer	300	290	51.86261200904846	file_2  2019-08-31-14:43:31
 62 | Completed =>	Tokyo	data+engineer	10	0	1.824260950088501	file_3  2019-08-31-14:43:31
 63 | Completed =>	Tokyo	data+engineer	20	10	3.4073591232299805	file_3  2019-08-31-14:43:31
 64 | Completed =>	Tokyo	data+engineer	30	20	5.069762945175171	file_3  2019-08-31-14:43:31
 65 | Completed =>	Tokyo	data+engineer	40	30	6.628056049346924	file_3  2019-08-31-14:43:31
 66 | Completed =>	Tokyo	data+engineer	50	40	8.243564128875732	file_3  2019-08-31-14:43:31
 67 | Completed =>	Tokyo	data+engineer	60	50	9.836205959320068	file_3  2019-08-31-14:43:31
 68 | Completed =>	Tokyo	data+engineer	70	60	11.402924060821533	file_3  2019-08-31-14:43:31
 69 | Completed =>	Tokyo	data+engineer	80	70	12.990663051605225	file_3  2019-08-31-14:43:31
 70 | Completed =>	Tokyo	data+engineer	90	80	14.621431112289429	file_3  2019-08-31-14:43:31
 71 | Completed =>	Tokyo	data+engineer	100	90	16.241452932357788	file_3  2019-08-31-14:43:31
 72 | Completed =>	Tokyo	data+engineer	110	100	17.904778003692627	file_3  2019-08-31-14:43:31
 73 | Completed =>	Tokyo	data+engineer	120	110	19.482118129730225	file_3  2019-08-31-14:43:31
 74 | Completed =>	Tokyo	data+engineer	130	120	21.03421711921692	file_3  2019-08-31-14:43:31
 75 | Completed =>	Tokyo	data+engineer	140	130	22.70096707344055	file_3  2019-08-31-14:43:31
 76 | Completed =>	Tokyo	data+engineer	150	140	24.55684208869934	file_3  2019-08-31-14:43:31
 77 | Completed =>	Tokyo	data+engineer	160	150	26.177658081054688	file_3  2019-08-31-14:43:31
 78 | Completed =>	Tokyo	data+engineer	170	160	27.839234113693237	file_3  2019-08-31-14:43:31
 79 | Completed =>	Tokyo	data+engineer	180	170	29.469682931900024	file_3  2019-08-31-14:43:31
 80 | Completed =>	Tokyo	data+engineer	190	180	31.041446924209595	file_3  2019-08-31-14:43:31
 81 | Completed =>	Tokyo	data+engineer	200	190	32.85800004005432	file_3  2019-08-31-14:43:31
 82 | Completed =>	Tokyo	data+engineer	210	200	34.42713212966919	file_3  2019-08-31-14:43:31
 83 | Completed =>	Tokyo	data+engineer	220	210	35.96600794792175	file_3  2019-08-31-14:43:31
 84 | Completed =>	Tokyo	data+engineer	230	220	37.524291038513184	file_3  2019-08-31-14:43:31
 85 | Completed =>	Tokyo	data+engineer	240	230	39.25459814071655	file_3  2019-08-31-14:43:31
 86 | Completed =>	Tokyo	data+engineer	250	240	41.38111710548401	file_3  2019-08-31-14:43:31
 87 | Completed =>	Tokyo	data+engineer	260	250	42.982495069503784	file_3  2019-08-31-14:43:31
 88 | Completed =>	Tokyo	data+engineer	270	260	44.96594715118408	file_3  2019-08-31-14:43:31
 89 | Completed =>	Tokyo	data+engineer	280	270	46.65275692939758	file_3  2019-08-31-14:43:31
 90 | Completed =>	Tokyo	data+engineer	290	280	48.339587926864624	file_3  2019-08-31-14:43:31
 91 | Completed =>	Tokyo	data+engineer	300	290	49.98532509803772	file_3  2019-08-31-14:43:31
 92 | Completed =>	New+York	data+engineer	10	0	2.8843870162963867	file_1  2019-09-05-19:11:36
 93 | Completed =>	New+York	machine+learning+engineer	10	0	2.7446670532226562	file_2  2019-09-05-19:11:36
 94 | Completed =>	Singapore	data+engineer	10	0	1.896003007888794	file_3  2019-09-05-19:11:36
 95 | Completed =>	Singapore	machine+learning+engineer	10	0	1.860619068145752	file_4  2019-09-05-19:11:36
 96 | Completed =>	Tokyo	data+engineer	10	0	1.6421759128570557	file_5  2019-09-05-19:11:36
 97 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.54783296585083	file_6  2019-09-05-19:11:36
 98 | Completed =>	New+York	data+engineer	10	0	2.88240909576416	file_1  2019-09-05-19:13:34
 99 | Completed =>	New+York	machine+learning+engineer	10	0	2.709868907928467	file_2  2019-09-05-19:13:34
100 | Completed =>	Singapore	data+engineer	10	0	1.807615041732788	file_3  2019-09-05-19:13:34
101 | Completed =>	Singapore	machine+learning+engineer	10	0	1.8387038707733154	file_4  2019-09-05-19:13:34
102 | Completed =>	Tokyo	data+engineer	10	0	1.774501085281372	file_5  2019-09-05-19:13:34
103 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.5109920501708984	file_6  2019-09-05-19:13:34
104 | Completed =>	New+York	data+engineer	10	0	2.8072328567504883	file_1  2019-09-05-19:15:41
105 | Completed =>	New+York	machine+learning+engineer	10	0	2.7286789417266846	file_2  2019-09-05-19:15:41
106 | Completed =>	Singapore	data+engineer	10	0	1.6815550327301025	file_3  2019-09-05-19:15:41
107 | Completed =>	Singapore	machine+learning+engineer	10	0	1.6483988761901855	file_4  2019-09-05-19:15:41
108 | Completed =>	Tokyo	data+engineer	10	0	1.5782999992370605	file_5  2019-09-05-19:15:41
109 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.572638988494873	file_6  2019-09-05-19:15:41
110 | Completed =>	New+York	data+engineer	10	0	2.9347329139709473	file_1  2019-09-05-19:16:35
111 | Completed =>	New+York	machine+learning+engineer	10	0	3.2965481281280518	file_2  2019-09-05-19:16:35
112 | Completed =>	Singapore	data+engineer	10	0	1.686279058456421	file_3  2019-09-05-19:16:35
113 | Completed =>	Singapore	machine+learning+engineer	10	0	1.6427698135375977	file_4  2019-09-05-19:16:35
114 | Completed =>	Tokyo	data+engineer	10	0	1.5504868030548096	file_5  2019-09-05-19:16:35
115 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.6476211547851562	file_6  2019-09-05-19:16:35
116 | Completed =>	New+York	data+engineer	10	0	3.029778003692627	file_1  2019-09-05-19:21:41
117 | Completed =>	New+York	machine+learning+engineer	10	0	2.7649660110473633	file_2  2019-09-05-19:21:41
118 | Completed =>	San+Francisco	data+engineer	10	0	2.9623639583587646	file_3  2019-09-05-19:21:41
119 | Completed =>	San+Francisco	machine+learning+engineer	10	0	2.8309409618377686	file_4  2019-09-05-19:21:41
120 | Completed =>	Singapore	data+engineer	10	0	1.9062411785125732	file_5  2019-09-05-19:21:41
121 | Completed =>	Singapore	machine+learning+engineer	10	0	2.7771799564361572	file_6  2019-09-05-19:21:41
122 | Completed =>	Tokyo	data+engineer	10	0	2.502600908279419	file_7  2019-09-05-19:21:41
123 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.9742908477783203	file_8  2019-09-05-19:21:41
124 | Completed =>	New+York	data+engineer	10	0	2.835904836654663	file_1  2019-09-05-19:30:26
125 | Completed =>	New+York	data+engineer	20	10	5.764356851577759	file_1  2019-09-05-19:30:26
126 | Completed =>	New+York	data+engineer	30	20	8.615731000900269	file_1  2019-09-05-19:30:26
127 | Completed =>	New+York	data+engineer	40	30	11.705473899841309	file_1  2019-09-05-19:30:26
128 | Completed =>	New+York	data+engineer	50	40	14.562924861907959	file_1  2019-09-05-19:30:26
129 | Completed =>	New+York	machine+learning+engineer	10	0	2.725008010864258	file_2  2019-09-05-19:30:26
130 | Completed =>	New+York	machine+learning+engineer	20	10	5.403141021728516	file_2  2019-09-05-19:30:26
131 | Completed =>	New+York	machine+learning+engineer	30	20	8.16013503074646	file_2  2019-09-05-19:30:26
132 | Completed =>	New+York	machine+learning+engineer	40	30	10.907695055007935	file_2  2019-09-05-19:30:26
133 | Completed =>	New+York	machine+learning+engineer	50	40	13.59488296508789	file_2  2019-09-05-19:30:26
134 | Completed =>	New+York	data+scientist	10	0	2.8523430824279785	file_3  2019-09-05-19:30:26
135 | Completed =>	New+York	data+scientist	20	10	5.502207040786743	file_3  2019-09-05-19:30:26
136 | Completed =>	New+York	data+scientist	30	20	8.322864055633545	file_3  2019-09-05-19:30:26
137 | Completed =>	New+York	data+scientist	40	30	11.044306993484497	file_3  2019-09-05-19:30:26
138 | Completed =>	New+York	data+scientist	50	40	13.77783203125	file_3  2019-09-05-19:30:26
139 | Completed =>	San+Francisco	data+engineer	10	0	3.283784866333008	file_4  2019-09-05-19:30:26
140 | Completed =>	San+Francisco	data+engineer	20	10	6.079689979553223	file_4  2019-09-05-19:30:26
141 | Completed =>	San+Francisco	data+engineer	30	20	9.213353872299194	file_4  2019-09-05-19:30:26
142 | Completed =>	San+Francisco	data+engineer	40	30	11.911937952041626	file_4  2019-09-05-19:30:26
143 | Completed =>	San+Francisco	data+engineer	50	40	14.650459051132202	file_4  2019-09-05-19:30:26
144 | Completed =>	San+Francisco	data+scientist	10	0	2.6817221641540527	file_6  2019-09-05-19:30:26
145 | Completed =>	San+Francisco	data+scientist	20	10	5.355727195739746	file_6  2019-09-05-19:30:26
146 | Completed =>	San+Francisco	data+scientist	30	20	8.048684120178223	file_6  2019-09-05-19:30:26
147 | Completed =>	San+Francisco	data+scientist	40	30	10.72742509841919	file_6  2019-09-05-19:30:26
148 | Completed =>	San+Francisco	data+scientist	50	40	13.383225202560425	file_6  2019-09-05-19:30:26
149 | Completed =>	Singapore	data+engineer	10	0	1.6111540794372559	file_7  2019-09-05-19:30:26
150 | Completed =>	Singapore	data+engineer	20	10	3.2746400833129883	file_7  2019-09-05-19:30:26
151 | Completed =>	Singapore	data+engineer	30	20	4.83650803565979	file_7  2019-09-05-19:30:26
152 | Completed =>	Singapore	data+engineer	40	30	6.416918992996216	file_7  2019-09-05-19:30:26
153 | Completed =>	Singapore	data+engineer	50	40	7.974658012390137	file_7  2019-09-05-19:30:26
154 | Completed =>	Singapore	machine+learning+engineer	10	0	1.604966163635254	file_8  2019-09-05-19:30:26
155 | Completed =>	Singapore	machine+learning+engineer	20	10	3.1582190990448	file_8  2019-09-05-19:30:26
156 | Completed =>	Singapore	machine+learning+engineer	30	20	4.670161962509155	file_8  2019-09-05-19:30:26
157 | Completed =>	Singapore	machine+learning+engineer	40	30	6.392184019088745	file_8  2019-09-05-19:30:26
158 | Completed =>	Singapore	machine+learning+engineer	50	40	7.988824129104614	file_8  2019-09-05-19:30:26
159 | Completed =>	Singapore	data+scientist	10	0	1.6494929790496826	file_9  2019-09-05-19:30:26
160 | Completed =>	Singapore	data+scientist	20	10	3.2304630279541016	file_9  2019-09-05-19:30:26
161 | Completed =>	Singapore	data+scientist	30	20	4.793152093887329	file_9  2019-09-05-19:30:26
162 | Completed =>	Singapore	data+scientist	40	30	6.354517936706543	file_9  2019-09-05-19:30:26
163 | Completed =>	Singapore	data+scientist	50	40	7.877964973449707	file_9  2019-09-05-19:30:26
164 | Completed =>	Tokyo	data+engineer	10	0	1.583630084991455	file_10  2019-09-05-19:30:26
165 | Completed =>	Tokyo	data+engineer	20	10	3.121217966079712	file_10  2019-09-05-19:30:26
166 | Completed =>	Tokyo	data+engineer	30	20	4.6353020668029785	file_10  2019-09-05-19:30:26
167 | Completed =>	Tokyo	data+engineer	40	30	6.194667100906372	file_10  2019-09-05-19:30:26
168 | Completed =>	Tokyo	data+engineer	50	40	7.732399940490723	file_10  2019-09-05-19:30:26
169 | Completed =>	Tokyo	machine+learning+engineer	10	0	1.5184919834136963	file_11  2019-09-05-19:30:26
170 | Completed =>	Tokyo	machine+learning+engineer	20	10	3.030639886856079	file_11  2019-09-05-19:30:26
171 | Completed =>	Tokyo	machine+learning+engineer	30	20	4.5386269092559814	file_11  2019-09-05-19:30:26
172 | Completed =>	Tokyo	machine+learning+engineer	40	30	6.047660827636719	file_11  2019-09-05-19:30:26
173 | Completed =>	Tokyo	machine+learning+engineer	50	40	7.574220895767212	file_11  2019-09-05-19:30:26
174 | Completed =>	Tokyo	data+scientist	10	0	1.5887770652770996	file_12  2019-09-05-19:30:26
175 | Completed =>	Tokyo	data+scientist	20	10	3.1191091537475586	file_12  2019-09-05-19:30:26
176 | Completed =>	Tokyo	data+scientist	30	20	4.641216993331909	file_12  2019-09-05-19:30:26
177 | Completed =>	Tokyo	data+scientist	40	30	6.14634895324707	file_12  2019-09-05-19:30:26
178 | Completed =>	Tokyo	data+scientist	50	40	7.6380720138549805	file_12  2019-09-05-19:30:26
179 | Completed =>	New+York	data+scientist	10	0	2.875648021697998	file_1  2020-01-03-10:16:24
180 | Completed =>	New+York	data+scientist	20	10	5.757162094116211	file_1  2020-01-03-10:16:24
181 | Completed =>	New+York	data+scientist	30	20	8.728279113769531	file_1  2020-01-03-10:16:24
182 | Completed =>	New+York	data+scientist	40	30	11.757231950759888	file_1  2020-01-03-10:16:24
183 | Completed =>	New+York	data+scientist	50	40	14.588212013244629	file_1  2020-01-03-10:16:24
184 | 


--------------------------------------------------------------------------------
/output/2019-08-14_jobs_1.txt:
--------------------------------------------------------------------------------
 1 | ,unique_id,city,job_qry,job_title,company_name,location,summary,salary,link,date,full_text
 2 | 1,p_baf234a5dd0cc155,New+York,data+scientist,Data Scientist,PepsiCo,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=baf234a5dd0cc155&fccid=2973259ddc967948&vjs=3,4 hours ago,NOT_FOUND
 3 | 2,p_1942a17d2bd166e1,New+York,data+scientist,Jr. Data Scientist,Viacom,"New York, NY 10036",NOT_FOUND,NOT_FOUND,/rc/clk?jk=1942a17d2bd166e1&fccid=ae0c894528aa6eee&vjs=3,15 days ago,NOT_FOUND
 4 | 3,p_fa03c1992457a2b9,New+York,data+scientist,Data Scientist,AETNA,"New York, NY 10016 (Gramercy area)",NOT_FOUND,NOT_FOUND,/rc/clk?jk=fa03c1992457a2b9&fccid=7077d7e88049c02a&vjs=3,3 days ago,NOT_FOUND
 5 | 4,p_9c04b28a806b92ab,New+York,data+scientist,Data Scientist,Butterfly Network,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=9c04b28a806b92ab&fccid=f34adc12ba09e47f&vjs=3,1 day ago,NOT_FOUND
 6 | 5,p_f6daa5cf3e224f5b,New+York,data+scientist,Data Scientist,"AbleTo, Inc.","New York, NY 10010 (Gramercy area)",NOT_FOUND,NOT_FOUND,/rc/clk?jk=f6daa5cf3e224f5b&fccid=954e57501f6bca1f&vjs=3,10 hours ago,NOT_FOUND
 7 | 6,p_7a578b59b17acc55,New+York,data+scientist,Machine Learning Data Scientist,UBS,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=7a578b59b17acc55&fccid=1c76c3a36f6c7557&vjs=3,4 days ago,NOT_FOUND
 8 | 7,p_62769c201dded401,New+York,data+scientist,Data Scientist,WW International (formerly Weight Watchers),"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=62769c201dded401&fccid=c29a08660ede9319&vjs=3,30+ days ago,NOT_FOUND
 9 | 8,p_d19b36ecfcd4b8a7,New+York,data+scientist,Junior Data Scientist,"Remedy BPCI Partners, LLC.","New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=d19b36ecfcd4b8a7&fccid=9744e569304da4d3&vjs=3,13 days ago,NOT_FOUND
10 | 9,p_2646d9747e37d496,New+York,data+scientist,Junior Data Scientist,Remedy Partners,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=2646d9747e37d496&fccid=3f0ffcc867369bf0&vjs=3,13 days ago,NOT_FOUND
11 | 10,p_4f7dc642a14556eb,New+York,data+scientist,Data Scientist (NY),Debtsy,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=4f7dc642a14556eb&fccid=ec3c520f9d48f531&vjs=3,26 days ago,NOT_FOUND
12 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | celery==5.2.2
2 | requests
3 | pymongo==3.4.0
4 | redis
5 | beautifulsoup4
6 | pandas
7 | lxml
8 | 


--------------------------------------------------------------------------------
/script/send_mail.py:
--------------------------------------------------------------------------------
 1 | import os, smtplib 
 2 | 
 3 | class SendMail:
 4 | 
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     def send_mail(self, message, to, subject):
 9 |         # Gmail Sign In
10 |         if not os.getenv('FLASK_ENV') == 'testing':
11 |             gmail_sender = os.getenv("EMAIL")
12 |             gmail_passwd = os.getenv("EMAIL_PASS")
13 | 
14 |             try:
15 |                 server = smtplib.SMTP('smtp.gmail.com', 587)
16 |                 server.ehlo()
17 |                 server.starttls()
18 |                 server.login(gmail_sender, gmail_passwd)
19 | 
20 |                 BODY = '\r\n'.join(['To: %s' % to,
21 |                                     'From: %s' % gmail_sender,
22 |                                     'Subject: %s' % subject,
23 |                                     '', message])
24 | 
25 |                 server.sendmail(gmail_sender, [to], BODY)
26 |                 res = "sent"
27 |             except:
28 |                 res = "fail"
29 |             server.quit()
30 |             return res
31 |         return True
32 | 
33 | def main(message, to, subject):
34 |     try:
35 |         SendMail().send_mail(message, to, subject)
36 |         print ('email sent!')
37 |     except Exception as e:
38 |         print ('email sending failed', e)
39 | 
40 | if __name__ == '__main__':
41 |     message='this is test msg'
42 |     to='<accept@emial.com>'
43 |     subject='eamil from SendMail script'
44 |     main(message, to, subject)


--------------------------------------------------------------------------------
/slack_push.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | ##########################################################################################
 4 | # MODIFY FROM  https://github.com/guzzijason/slack-upload-bash/blob/master/slack-upload.sh  
 5 | ##########################################################################################
 6 | 
 7 | # This bash script makes use of the Slack API to upload files.
 8 | # I found this useful due to the fact that the attachement option
 9 | # available in incoming webhooks seems to have an upper limit of
10 | # content size, which is way too small.
11 | #
12 | # See also: https://api.slack.com/methods/files.upload
13 | 
14 | # safety first
15 | set -euf -o pipefail
16 | 
17 | echo='echo -e'
18 | 
19 | Usage() {
20 |   ${echo}
21 |   ${echo} "\tusage:\n\t\t$0 [OPTIONS]"
22 |   ${echo}
23 |   ${echo} "Required:"
24 |   ${echo} " -c CHANNEL\tSlack channel to post to"
25 |   ${echo} " -f FILENAME\tName of file to upload"
26 |   ${echo} " -s SLACK_TOKEN\tAPI auth token"
27 |   ${echo}
28 |   ${echo} "Optional:"
29 |   ${echo} " -u API_URL\tSlack API endpoint to use (default: https://slack.com/api/files.upload)"
30 |   ${echo} " -h     \tPrint help"
31 |   ${echo} " -m TYPE\tFile type (see https://api.slack.com/types/file#file_types)"
32 |   ${echo} " -n TITLE\tTitle for slack post"
33 |   ${echo} " -v     \tVerbose mode"
34 |   ${echo} " -x COMMENT\tAdd a comment to the file"
35 |   ${echo}
36 |   exit ${1:-$USAGE}
37 | }
38 | 
39 | # Exit Vars
40 | : ${HELP:=0}
41 | : ${USAGE:=1}
42 | 
43 | # Default Vars
44 | API_URL='https://slack.com/api/files.upload'
45 | CURL_OPTS='-s'
46 | 
47 | # main
48 | while getopts :c:f:s:u:hm:n:vx: OPT; do
49 |   case ${OPT} in
50 |     c)
51 |       CHANNEL="$OPTARG"
52 |       ;;
53 |     f)
54 |       FILENAME="$OPTARG"
55 |       SHORT_FILENAME=$(basename ${FILENAME})
56 |       ;;
57 |     s)
58 |       SLACK_TOKEN="$OPTARG"
59 |       ;;
60 |     u)
61 |       API_URL="$OPTARG"
62 |       ;;
63 |     h)
64 |       Usage ${HELP}
65 |       ;;
66 |     m)
67 |       CURL_OPTS="${CURL_OPTS} -F filetype=${OPTARG}"
68 |       ;;
69 |     n)
70 |       CURL_OPTS="${CURL_OPTS} -F title='${OPTARG}'"
71 |       ;;
72 |     v)
73 |       CURL_OPTS="${CURL_OPTS} -v"
74 |       ;;
75 |     x)
76 |       CURL_OPTS="${CURL_OPTS} -F initial_comment='${OPTARG}'"
77 |       ;;
78 |     \?)
79 |       echo "Invalid option: -$OPTARG" >&2
80 |       Usage ${USAGE}
81 |       ;;
82 |   esac
83 | done
84 | 
85 | if [[ ( "${CHANNEL}" != "#"* ) && ( "${CHANNEL}" != "@"* ) ]]; then
86 |   CHANNEL="#${CHANNEL}"
87 | fi
88 | 
89 | # had to use eval to avoid strange whitespace behavior in options
90 | eval curl $CURL_OPTS \
91 |   --form-string channels=${CHANNEL} \
92 |   -F file=@${FILENAME} \
93 |   -F filename=${SHORT_FILENAME} \
94 |   -F token=${SLACK_TOKEN} \
95 |   ${API_URL}
96 | 
97 | exit 0


--------------------------------------------------------------------------------
/tests/unit_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import sys
 3 | sys.path.append(".")
 4 | from bs4 import BeautifulSoup
 5 | from celery_queue.IndeedScrapper.indeed_extract import (get_soup as get_soup_,
 6 |                                                         extract_company as extract_company_,
 7 |                                                         extract_salary as extract_salary_,
 8 |                                                         extract_location as extract_location_,
 9 |                                                         extract_job_title as extract_job_title_,
10 |                                                         extract_summary as extract_summary_,
11 |                                                         extract_link as extract_link_,
12 |                                                         extract_date as extract_date_,
13 |                                                         extract_fulltext as extract_fulltext_,
14 |                                                         write_logs as write_logs_,
15 |                                                         get_full_job_link as get_full_job_link_)
16 | 
17 | 
18 | with open('tests/unittest_data.txt', 'r') as file:
19 |   html = file.read()
20 | 
21 | def test_get_soup():
22 |     text = """<p>Here's a paragraph of text!</p>"""
23 |     result = get_soup_(text)
24 |     assert result.text.strip() == "Here's a paragraph of text!"
25 | 
26 | def test_extract_company():
27 |     expected = '\\n\\nU3 INFOTECH PTE. LTD.'
28 |     soup = BeautifulSoup(html)
29 |     result = extract_company_(soup)
30 |     assert result == expected
31 | 
32 | def test_extract_salary():
33 |     expected = 'NOT_FOUND'
34 |     soup = BeautifulSoup(html)
35 |     result = extract_salary_(soup)
36 |     assert result == expected
37 | 
38 | def test_extract_location():
39 |     expected= 'Shenton Way'
40 |     soup = BeautifulSoup(html)
41 |     result = extract_location_(soup)
42 |     assert result == expected
43 | 
44 | def test_extract_job_title():
45 |     expected= 'NOT_FOUND'
46 |     soup = BeautifulSoup(html)
47 |     result = extract_job_title_(soup)
48 |     assert result == expected
49 | 
50 | def test_extract_summary():
51 |     expected= 'NOT_FOUND'
52 |     soup = BeautifulSoup(html)
53 |     result = extract_summary_(soup)
54 |     assert result == expected
55 | 
56 | def test_extract_link():
57 |     expected= 'NOT_FOUND'
58 |     soup = BeautifulSoup(html)
59 |     result = extract_link_(soup)
60 |     assert result == expected
61 | 
62 | def test_extract_date():
63 |     expected= '1 day ago'
64 |     soup = BeautifulSoup(html)
65 |     result = extract_date_(soup)
66 |     assert result == expected
67 | 
68 | def test_extract_fulltext():
69 |     expected= 'NOT_FOUND'
70 |     soup = BeautifulSoup(html)
71 |     result = extract_fulltext_(soup)
72 |     assert result == expected
73 | 
74 | def test_get_full_job_link_():
75 |     expected1 = 'https://www.indeed.com.sg/123'
76 |     expected2 = 'https://jp.indeed.com/123'
77 |     result1 = get_full_job_link_("123", city='Singapore')
78 |     result2 = get_full_job_link_("123", city='Tokyo')
79 |     assert result1 == expected1
80 |     assert result2 == expected2
81 | 
82 | 
83 | if __name__ == '__main__':
84 |     pytest.main([__file__])
85 | 


--------------------------------------------------------------------------------
/tests/unit_test_celery.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import sys
 3 | sys.path.append(".")
 4 | sys.path.append("./celery_queue")
 5 | import pytest
 6 | from unittest.mock import patch
 7 | from unittest import TestCase
 8 | from celery import chain
 9 | from celery_queue import tasks
10 | 
11 | # Ref of celery mock unit test
12 | # - https://www.distributedpython.com/2018/05/15/testing-celery-chains/
13 | # - https://www.distributedpython.com/2018/05/01/unit-testing-celery-tasks/
14 | # - http://docs.celeryproject.org/en/latest/userguide/testing.html
15 | 
16 | class TestAddTask(unittest.TestCase):
17 | 
18 |     def test_task_state_and_addition(self):
19 | 
20 |         task = tasks.add.apply(args=[3, 5])
21 |         self.assertEqual(task.status, "SUCCESS")
22 |         self.assertEqual(task.result, 8)
23 | 
24 | class TestMultiplyTask(unittest.TestCase):
25 | 
26 |     def test_task_state_and_multiply(self):
27 | 
28 |         task = tasks.multiply.apply(args=[3, 5])
29 |         self.assertEqual(task.status, "SUCCESS")
30 |         self.assertEqual(task.result, 15)
31 | 
32 | class TestScrapeTask(unittest.TestCase):
33 | 
34 |     def test_task_state_and_scrape(self):
35 | 
36 |         task = tasks.scrape.apply()
37 |         self.assertEqual(task.status, "SUCCESS")
38 |         self.assertEqual(type(task.result), str)
39 | 
40 | class TestIndeedScrapTask(unittest.TestCase):
41 | 
42 |     def test_task_indeed_scrape(self):
43 | 
44 |         task = tasks.indeed_scrape.apply()
45 |         self.assertEqual(task.status, "SUCCESS")
46 |         self.assertEqual(type(task.result), type(None))
47 | 
48 | class TestIndeedScrapAPITask(unittest.TestCase):
49 | 
50 |     def test_task_indeed_scrape_api(self):
51 | 
52 |         task = tasks.indeed_scrape_api.apply(args=["Tokyo"])
53 |         self.assertEqual(task.status, "SUCCESS")
54 |         self.assertEqual(type(task.result), type(None))
55 | 
56 | 
57 | # class TestAddTask(unittest.TestCase):
58 | #
59 | #     def setUp(self):
60 | #         self.task = add.apply_async(args=[3, 5])
61 | #         self.results = self.task.get()
62 | #
63 | #     def test_task_state(self):
64 | #         self.assertEqual(self.task.state, "SUCCESS")
65 | #
66 | #     def test_addition(self):
67 | #         self.assertEqual(self.results, 8)
68 | #
69 | # class TestMultiplyTask(unittest.TestCase):
70 | #
71 | #     def setUp(self):
72 | #         self.task = multiply.apply_async(args=[3, 5])
73 | #         self.results = self.task.get()
74 | #
75 | #     def test_task_state(self):
76 | #         self.assertEqual(self.task.state, "SUCCESS")
77 | #
78 | #     def test_multiplication(self):
79 | #         self.assertEqual(self.results, 15)
80 | #
81 | # class TestScrapeTask(unittest.TestCase):
82 | #
83 | #     def setUp(self):
84 | #         self.task = scrape.apply_async()
85 | #         self.results = self.task.get()
86 | #
87 | #     def test_task_state(self):
88 | #         self.assertEqual(self.task.state, "SUCCESS")
89 | #
90 | #     def test_scraping(self):
91 | #         self.assertEqual(type(self.results), str)
92 | 
93 | if __name__ == '__main__':
94 |     unittest.main()


--------------------------------------------------------------------------------
/travis_push_github.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | ####################################################################
 3 | # MODIFY FROM https://gist.github.com/willprice/e07efd73fb7f13f917ea 
 4 | ####################################################################
 5 | 
 6 | setup_git() {
 7 |   git init 
 8 |   git config --global user.email "travis@travis-ci.org"
 9 |   git config --global user.name "Travis CI"
10 | }
11 | 
12 | commit_website_files() {
13 |   git checkout -b gh-pages
14 |   git add . *.html
15 |   git commit --message "Travis build: $TRAVIS_BUILD_NUMBER"
16 | }
17 | 
18 | commit_output_file() {
19 |   git status 
20 |   git add output/*
21 |   git commit --m "Travis build  : $TRAVIS_BUILD_NUMBER"
22 | }
23 | 
24 | commit_new_output_file() {
25 |   d=`date +%Y-%m-%d` && echo $d 
26 |   git status 
27 |   for file in "output"/*
28 |   do 
29 |     if [[ "$file" == *"$d"* ]];then
30 |       echo "no today's new file, nothing to commit"    
31 |     else 
32 |       echo "commit new file..."
33 |       git add output/* 
34 |       git commit --m "Travis build  : $TRAVIS_BUILD_NUMBER"
35 |     fi 
36 |   done
37 | }
38 | 
39 | upload_files() {
40 |   echo 'Travis push to github'	
41 |   git push https://yennanliu:${GH_TOKEN}@${GH_REF} HEAD:master --quiet
42 | 
43 | }
44 | 
45 | GH_REF=github.com/yennanliu/web_scraping.git
46 | setup_git
47 | commit_new_output_file
48 | upload_files


--------------------------------------------------------------------------------