├── .github └── workflows │ ├── blank.yml │ └── python-app.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── Dockerfile_dev ├── README.md ├── api ├── .dockerignore ├── Dockerfile ├── app.py ├── requirements.txt └── worker.py ├── celery_queue ├── .dockerignore ├── Dockerfile ├── IndeedScrapper │ ├── README.md │ ├── __init__.py │ ├── indeed_extract.py │ └── indeed_scrapper.py ├── __init__.py ├── log.txt ├── requirements.txt └── tasks.py ├── cron_indeed_scrapping_test.py ├── cron_test.py ├── dev ├── 104 │ ├── code1.py │ ├── code2.py │ ├── code3.py │ └── index.txt └── test-1.ipynb ├── doc └── pic │ ├── architecture.jpg │ ├── architecture.svg │ ├── celery.jpg │ └── celery.svg ├── docker-compose.yml ├── legacy_project ├── archived │ ├── bank_swiftcode │ │ ├── UK_bank_swift_code_list.csv │ │ ├── grab_bank_list.py │ │ └── grab_bank_list_muitiprocess.py │ ├── booking │ │ ├── bookingcom_scrap.py │ │ └── next_page_sample.py │ ├── efish_scraping_demo.ipynb │ ├── glassdoor │ │ └── glassdoor_scrap.py │ └── spotify │ │ └── spotify_album copy.sh ├── blu_move │ ├── analysis.sql │ ├── blu_.json │ ├── blu_scrape_V1.py │ ├── blu_scrape_V1.sh │ ├── blu_scrape_V2.py │ ├── blu_scrape_V2.sh │ ├── run.sh │ ├── utility_data_IO.py │ └── utility_data_preprocess.py ├── carandclassic │ ├── README.md │ ├── analysis │ │ ├── .ipynb_checkpoints │ │ │ └── Rental_Location_EDA-checkpoint.ipynb │ │ ├── DemoLondonRentals.csv │ │ ├── README.md │ │ └── Rental_Location_EDA.ipynb │ ├── carandclassic_scrape_sample.csv │ └── cclassic_scrape_V1.py ├── carousell │ └── web_crawler copy.py ├── delivery_ │ ├── .gitignore │ ├── README.md │ ├── analysis.py │ ├── analysis.sql │ ├── data2db.py │ ├── query_test.sh │ ├── scrap.py │ ├── sqlite2csv.sh │ ├── weather.csv │ └── weather.db ├── env.md ├── es_scrapper_docker_demo │ ├── Dockerfile │ ├── README.md │ ├── app.py │ ├── docker-compose.yml │ └── requirements.txt ├── eztable │ ├── eztable_scarp.py │ ├── eztable_scrap_dev.py │ ├── eztable_scrap_dev2.py │ ├── eztable_scrap_inputword.py │ └── geckodriver.log ├── facebook_fan_page │ ├── google_scrap_fb_page_final.ipynb │ └── scrap_fb_page_test.ipynb ├── geojson.py ├── google_geodata │ ├── geopy_address_lon_lat.py │ └── gmap_address_lon_lat.py ├── ipeen │ ├── README.md │ ├── ipeen_grab.py │ ├── ipeen_pivot.py │ ├── ipeen_restaurant_grab_V2.ipynb │ ├── ipeen_restaurant_pivot_table.ipynb │ └── ipeen_scraping-final.ipynb ├── script │ ├── __init__.py │ ├── utility_data_IO.py │ └── utility_operation.py ├── setup.sh └── weather_scrapper │ ├── LDN_weather_scrapper_V1.py │ ├── README.md │ └── br_weather_scrapper_V1.py ├── logs └── log.txt ├── output └── 2019-08-14_jobs_1.txt ├── requirements.txt ├── script └── send_mail.py ├── slack_push.sh ├── tests ├── unit_test.py ├── unit_test_celery.py └── unittest_data.txt └── travis_push_github.sh /.github/workflows/blank.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Run a one-line script 13 | run: echo this is the dummy test 14 | - name: Run a multi-line script 15 | run: | 16 | echo *** build start *** 17 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more 
information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Test with pytest 32 | run: | 33 | pytest 34 | - name: Run indeed scrapping test 35 | run: | 36 | python cron_indeed_scrapping_test.py 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *__pycache__ 3 | *.DS_Store 4 | .DS_Store 5 | *.ipynb_checkpoints 6 | .ipynb_checkpoints 7 | Indeed_scrapper_nb_V1.ipynb 8 | logs/log.txt 9 | celery_queue/celerybeat-schedule.db 10 | celery_queue/celerybeat.pid -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | services: 4 | - docker 5 | branches: 6 | only: 7 | - master 8 | notifications: 9 | email: 10 | on_failure: always 11 | recipients: 12 | - f339339@gmail.com 13 | script: 14 | - echo ' ----------------- STEP 0) UNIT TEST ----------------- ' 15 | - pytest -v tests 16 | - python tests/unit_test_celery.py -v 17 | - echo ' ----------------- STEP 1) INDEED SCRAPING ----------------- ' 18 | - python cron_indeed_scrapping_test.py >> indeed_task.log 19 | - ls output && ls logs && cat logs/log.txt 20 | #- docker build -t web_scraping_env . && docker run -it web_scraping_env /bin/bash -c "python cron_test.py && bash travis_push_github.sh" 21 | after_success: 22 | - echo 'push scraped file to slack...' && current_date=$(date +'%Y-%m-%d') && for file_name in $(ls output/*.csv) ; do echo $file_name $slack_channel && bash slack_push.sh -f $file_name -c $slack_channel -s $slack_token -n "TRAVIS SLACK PUSH" -x " >>>> INDEED SCRAPING REPORT $file_name" ; done 23 | - echo 'push LOG to slack...' 
&& bash slack_push.sh -f indeed_task.log -c web_scraping_log -s $slack_token 24 | 25 | 26 | # deploy: 27 | # provider: script 28 | # script: bash travis_push_github.sh 29 | # skip-cleanup: true 30 | # target-branch: master 31 | # github-token: $GH_TOKEN 32 | # keep-history: true 33 | # verbose: true 34 | # on: 35 | # branch: master 36 | # env: 37 | # global: 38 | # secure: Yfr36/XdwtZyjUBJwYTboFAfH5qqSYRd7d1vx/vHO1fCP4XtQWqT1Lvo5pfbHXghOjiJZZcfhO72inUKJ7er9QXznsGufj6nnQUJs/dOoBbfGnLSdvSYT6lpXTe7GYMbOgUsmYtjeD8S6pyL2L8xcX1fPZzsVD7v/edG9kZo1H9+fKCbVipBNf0IXO4DaE1H4vw77UVb6ysA3npxyIprM4jXUkZW3KFb7fA7/LENpS1NPniQxYe1LuUjzOpdJAG28WIeQnC/Cb+jz16cRtIV7HgukG0WnpHdszI+Xj4Kx+46URZnXW95cpZ2cq4Oywx98XZbC5uEXn3GeB/9JgvnuNsfsYOzhdCg29Ca/JGiUyri7F/x3mFxMfl2OoJeO50R4JTnwPrAHot8m914rP/VXtGZFPJQfXjoyKQJPnHFO0Yt+IJ9ziK3r3tLcdrbYngPuoBHFEYr4f87jOjdiyn/+1x9liLYh+Z0/6UdbQJRQnsAh+ghSvs1M7FIKY4eMHPW9qKPUbfsQIRckTzC6U7lX16eiPQk+wehJ7o//FB6MFOEvbownBcDUooITJXgC0Cvtpd831ktlkxPqyJh13X9URbEyD25zG58zI9Bq7RfeCjWN8LZaa7bLyjhDR2KzAvWDfKowbUShpznlSSSo2czn81kT1GXaAa4Iz215kNCDfs= -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | LABEL maintainer "yennj12" 4 | 5 | #ENV PYTHONPATH /app 6 | ENV CELERY_BROKER_URL redis://redis:6379/0 7 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0 8 | ENV C_FORCE_ROOT true 9 | ADD requirements.txt /app/requirements.txt 10 | ADD ./test_celery/ /app/ 11 | COPY . /app 12 | WORKDIR /app/ 13 | RUN pip install -r requirements.txt 14 | ENTRYPOINT celery -A task worker --loglevel=info -------------------------------------------------------------------------------- /Dockerfile_dev: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | LABEL maintainer "yennj12" 4 | 5 | ENV HOME / 6 | WORKDIR $HOME 7 | COPY . $HOME 8 | 9 | RUN pip install --upgrade pip && \ 10 | pip install -r requirements.txt && \ 11 | pwd && ls && ls home 12 | 13 | RUN /bin/bash -c "python cron_test.py" 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web_scraping 2 | 3 | Collection of scrapper pipelines build for different purposes 4 | 5 | [![Build Status](https://travis-ci.org/yennanliu/web_scraping.svg?branch=master)](https://travis-ci.org/yennanliu/web_scraping) 6 | [![PRs](https://img.shields.io/badge/PRs-welcome-6574cd.svg)](https://github.com/yennanliu/web_scraping/pulls) 7 | 8 | 9 | ### Architecture 10 |

11 |

12 | 13 | - Architecture idea 14 | - Asynchronous tasks 15 | - Celery client : `flask` <---> `Celery client` <---> `Celery worker`. Be connected to flask to the celery task, issue the commands for the tasks 16 | - Celery worker : A process that runs tasks in background, can be a `schedulued`task (periodic task), and a `asynchronous` (when API call) one. 17 | - Massage broker : `Celery client` <--Massage broker-> `Celery worker`. The Celery client will need to via Message worker to communicate with Celery worker. Here I use `Redis` as the Message broker. 18 | 19 | ### Quick Start 20 |
21 | Quick start via docker 22 | 23 | ```bash 24 | # Run via docker 25 | $ cd ~ && git clone https://github.com/yennanliu/web_scraping 26 | $ cd ~ && cd web_scraping && docker-compose -f docker-compose.yml up 27 | ``` 28 | - visit the services via 29 | - flower UI : http://localhost:5555/ 30 | - Run "add" task : http://localhost:5001/add/1/2 31 | - Run "web scrape" task : http://localhost:5001/scrap_task 32 | - Run "indeed scrape" task : http://localhost:5001/indeed_scrap_task 33 | 34 |
35 | 36 |
37 | Quick start manually 38 | 39 | ```bash 40 | # Run manually 41 | 42 | # STEP 1) open one terminal and run celery server locally 43 | $ cd ~ && cd web_scraping/celery_queue 44 | # run task from API call 45 | $ celery -A tasks worker --loglevel=info 46 | # run cron (periodic) task 47 | $ celery -A tasks beat 48 | 49 | # STEP 2) Run radis server locally (with the other terminal) 50 | # make sure you have already installed radis 51 | $ redis-server 52 | 53 | # STEP 3) Run flower (with the other terminal) 54 | $ cd ~ && cd web_scraping/celery_queue 55 | $ celery flower -A tasks --address=127.0.0.1 --port=5555 56 | 57 | # STEP 4) Add a sample task 58 | # "add" task 59 | $ curl -X POST -d '{"args":[1,2]}' http://localhost:5555/api/task/async-apply/tasks.add 60 | 61 | # "multiply" task 62 | $ curl -X POST -d '{"args":[3,5]}' http://localhost:5555/api/task/async-apply/tasks.multiply 63 | 64 | # "scrape_task" task 65 | $ curl -X POST http://localhost:5555/api/task/async-apply/tasks.scrape_task 66 | 67 | # "scrape_task_api" task 68 | $ curl -X POST -d '{"args":["mlflow","mlflow"]}' http://localhost:5555/api/task/async-apply/tasks.scrape_task_api 69 | 70 | # "indeed_scrap_task" task 71 | $ curl -X POST http://localhost:5555/api/task/async-apply/tasks.indeed_scrap_task 72 | 73 | # "indeed_scrap_api_V1" task 74 | $ curl -X POST -d '{"args":["New+York"]}' http://localhost:5555/api/task/async-apply/tasks.indeed_scrap_api_V1 75 | 76 | ``` 77 |
78 | 79 | 80 | ### File structure 81 | 82 | ``` 83 | ├── Dockerfile 84 | ├── README.md 85 | ├── api. : Celery api (broker, job accepter(flask)) 86 | │   ├── Dockerfile : Dockerfile build celery api 87 | │   ├── app.py : Flask server accept job request(api) 88 | │   ├── requirements.txt 89 | │   └── worker.py : Celery broker, celery backend(redis) 90 | ├── celery-queue : Run main web scrapping jobs (via celery) 91 | │   ├── Dockerfile : Dockerfile build celery-queue 92 | │   ├── IndeedScrapper : Scrapper scrape Indeed.com 93 | │   ├── requirements.txt 94 | │   └── tasks.py : Celery run scrapping tasks 95 | ├── cron_indeed_scrapping_test.py 96 | ├── cron_test.py 97 | ├── docker-compose.yml : docker-compose build whole system : api, celery-queue, redis, and flower(celery job monitor) 98 | ├── legacy_project 99 | ├── logs : Save running logs 100 | ├── output : Save scraped data 101 | ├── requirements.txt 102 | └── travis_push_github.sh : Script auto push output to github via Travis 103 | 104 | ``` 105 | 106 | ### Development 107 | 108 |
109 | Development 110 | 111 | ```bash 112 | # Run Unit test # 1 113 | $ pytest -v tests/ 114 | # ================================== test session starts ================================== 115 | # platform darwin -- Python 3.6.4, pytest-5.0.1, py-1.5.2, pluggy-0.12.0 -- /Users/jerryliu/anaconda3/envs/yen_dev/bin/python 116 | # cachedir: .pytest_cache 117 | # rootdir: /Users/jerryliu/web_scraping 118 | # plugins: cov-2.7.1, celery-4.3.0 119 | # collected 10 items 120 | # tests/unit_test.py::test_get_soup PASSED [ 10%] 121 | # tests/unit_test.py::test_extract_company PASSED [ 20%] 122 | # tests/unit_test.py::test_extract_salary PASSED [ 30%] 123 | # tests/unit_test.py::test_extract_location PASSED [ 40%] 124 | # tests/unit_test.py::test_extract_job_title PASSED [ 50%] 125 | # tests/unit_test.py::test_extract_summary PASSED [ 60%] 126 | # tests/unit_test.py::test_extract_link PASSED [ 70%] 127 | # tests/unit_test.py::test_extract_date PASSED [ 80%] 128 | # tests/unit_test.py::test_extract_fulltext PASSED [ 90%] 129 | # tests/unit_test.py::test_get_full_job_link_ PASSED [100%] 130 | 131 | # Run Unit test # 2 132 | python tests/unit_test_celery.py -v 133 | # test_addition (__main__.TestAddTask) ... ok 134 | # test_task_state (__main__.TestAddTask) ... ok 135 | # test_multiplication (__main__.TestMultiplyTask) ... ok 136 | # test_task_state (__main__.TestMultiplyTask) ... ok 137 | # ---------------------------------------------------------------------- 138 | # Ran 4 tests in 0.131s 139 | # OK 140 | 141 | ``` 142 |
143 | 144 | ### Tech 145 | * [Celery](http://docs.celeryproject.org/en/latest/getting-started/first-steps-with-celery.html) : parallel/single thread python tasks management tool (celery broker/worker) 146 | * [Redis](https://redis.io/) : key-value DB save task data 147 | * [Flower](https://flower.readthedocs.io/en/latest/) : UI monitor celery tasks 148 | * [Flask](http://flask.palletsprojects.com/en/1.1.x/) : python light web framework, as project backend server here 149 | * [Docker](https://www.docker.com/get-started) : build the app environment 150 | 151 | 152 | ### Todo 153 |
154 | TODO 155 | 156 | ``` 157 | ### Project level 158 | 159 | 0. Deploy to Heroku cloud and make the scrapper as an API service 160 | 1. Dockerize the project 161 | 2. Run the scrapping (cron/paralel)jobs via Celery 162 | 4. Add test (unit/integration test) 163 | 5. Design DB model that save scrapping data systematically 164 | 165 | ### Programming level 166 | 167 | 1. Add utility scripts that can get XPATH of all objects in html 168 | 2. Workflow that automate whole processes 169 | 3. Job management 170 | - Multiprocessing 171 | - Asynchronous 172 | - Queue 173 | 4. Scrapping tutorial 174 | 5. Scrapy, Phantomjs 175 | 176 | ### Others 177 | 178 | 1. Web scrapping 101 tutorial 179 | 180 | ``` 181 |
182 | 183 | ### Ref 184 |
185 | Ref 186 | 187 | - Scraping via Celery 188 | - https://www.pythoncircle.com/post/518/scraping-10000-tweets-in-60-seconds-using-celery-rabbitmq-and-docker-cluster-with-rotating-proxy/ 189 | - http://allynh.com/blog/flask-asynchronous-background-tasks-with-celery-and-redis/ 190 | 191 | - Travis push to github 192 | - https://stackoverflow.com/questions/51925941/travis-ci-how-to-push-to-master-branch 193 | - https://medium.com/@preslavrachev/using-travis-for-secure-building-and-deployment-to-github-5a97afcac113 194 | - https://gist.github.com/willprice/e07efd73fb7f13f917ea 195 | - https://www.vinaygopinath.me/blog/tech/commit-to-master-branch-on-github-using-travis-ci/ 196 | - https://www.hidennis.tech/2015/07/07/deploy-blog-using-travis/ 197 | 198 | - Indeed scrapping 199 | - https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b 200 | - https://github.com/tarunsinghal92/indeedscrapperlatest 201 | 202 | - Distributed scrapping 203 | - https://github.com/tikazyq/crawlab 204 | 205 | - Unit test Celery 206 | - https://docs.celeryproject.org/en/latest/userguide/testing.html 207 |
-------------------------------------------------------------------------------- /api/.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | .dockerignore 3 | -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | ENV PYTHONPATH /app 4 | ENV CELERY_BROKER_URL redis://redis:6379/0 5 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0 6 | ENV C_FORCE_ROOT true 7 | 8 | ENV HOST 0.0.0.0 9 | ENV PORT 5001 10 | ENV DEBUG true 11 | 12 | COPY . /api 13 | WORKDIR /api 14 | 15 | # install requirements 16 | RUN pip install -r requirements.txt 17 | 18 | # expose the app port 19 | EXPOSE 5001 20 | 21 | RUN pip install gunicorn 22 | 23 | # run the app server 24 | CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--workers", "3", "app:app"] -------------------------------------------------------------------------------- /api/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, url_for 2 | import celery.states as states 3 | import sys 4 | sys.path.append("..") 5 | # udf 6 | from worker import celery 7 | 8 | app = Flask(__name__) 9 | 10 | @app.route('/scrap_task') 11 | def run_github_scrape(): 12 | task = celery.send_task('tasks.scrap_task',kwargs={}) 13 | response = f"check status of {task.id} " 14 | return response 15 | 16 | @app.route('/scrap_task_api//') 17 | def run_github_scrape_api(): 18 | task = celery.send_task('tasks.scrape_github_api',kwargs={}) 19 | response = f"check status of {task.id} " 20 | return response 21 | 22 | @app.route('/indeed_scrap_task') 23 | def run_indeed_scrape(): 24 | task = celery.send_task('tasks.indeed_scrap_task',kwargs={}) 25 | response = f"check status of {task.id} " 26 | return response 27 | 28 | @app.route('/indeed_scrap_api_V1/') 29 | def run_indeed_scrape_api(city_set: str): 30 | print ('city_set :', city_set) 31 | task = celery.send_task('tasks.indeed_scrap_api_V1',city_set,kwargs={}) 32 | response = f"check status of {task.id} " 33 | return response 34 | 35 | @app.route('/check/') 36 | def check_task(task_id: str) -> str: 37 | res = celery.AsyncResult(task_id) 38 | if res.state == states.PENDING: 39 | return res.state 40 | else: 41 | return str(res.result) -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | amqp==2.2.2 2 | Babel==2.9.1 3 | billiard==3.5.0.3 4 | celery==5.2.2 5 | click==6.7 6 | Flask==2.3.2 7 | itsdangerous==0.24 8 | Jinja2>=2.10.1 9 | kombu==4.2.0 10 | MarkupSafe==1.0 11 | pytz==2018.3 12 | redis==4.4.4 13 | tornado==5.0.2 14 | vine==1.1.4 15 | Werkzeug>=0.15.3 16 | beautifulsoup4 -------------------------------------------------------------------------------- /api/worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | from celery import Celery 3 | import sys 4 | sys.path.append("..") 5 | 6 | CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379'), 7 | CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379') 8 | 9 | celery = Celery('tasks', broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND) -------------------------------------------------------------------------------- /celery_queue/.dockerignore: 
-------------------------------------------------------------------------------- 1 | Dockerfile 2 | .dockerignore 3 | -------------------------------------------------------------------------------- /celery_queue/Dockerfile: -------------------------------------------------------------------------------- 1 | #FROM python:3.6-alpine 2 | FROM python:3.6-slim 3 | #FROM continuumio/miniconda3 4 | 5 | ENV PYTHONPATH /queue 6 | ENV CELERY_BROKER_URL redis://redis:6379/0 7 | ENV CELERY_RESULT_BACKEND redis://redis:6379/0 8 | ENV C_FORCE_ROOT true 9 | COPY . /queue 10 | WORKDIR /queue 11 | RUN mkdir -p output logs && pip install -r requirements.txt 12 | ENTRYPOINT celery -A tasks worker --loglevel=info -------------------------------------------------------------------------------- /celery_queue/IndeedScrapper/README.md: -------------------------------------------------------------------------------- 1 | - Modify from https://github.com/tarunsinghal92/indeedscrapperlatest -------------------------------------------------------------------------------- /celery_queue/IndeedScrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/celery_queue/IndeedScrapper/__init__.py -------------------------------------------------------------------------------- /celery_queue/IndeedScrapper/indeed_extract.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | from bs4 import BeautifulSoup 3 | 4 | # get soup object 5 | def get_soup(text): 6 | return BeautifulSoup(text, "lxml", from_encoding="utf-8") 7 | 8 | 9 | # extract company 10 | def extract_company(div): 11 | company = div.find_all(name="span", attrs={"class":"company"}) 12 | if len(company) > 0: 13 | for b in company: 14 | return (b.text.strip()) 15 | else: 16 | sec_try = div.find_all(name="span", attrs={"class":"result-link-source"}) 17 | for span in sec_try: 18 | return (span.text.strip()) 19 | return 'NOT_FOUND' 20 | 21 | 22 | # extract job salary 23 | def extract_salary(div): 24 | try: 25 | return (div.find('nobr').text) 26 | except: 27 | try: 28 | div_two = div.find(name='div', attrs={'class':'sjcl'}) 29 | div_three = div_two.find('div') 30 | salaries.append(div_three.text.strip()) 31 | except: 32 | return ('NOT_FOUND') 33 | return 'NOT_FOUND' 34 | 35 | 36 | # extract job location 37 | def extract_location(div): 38 | for span in div.findAll('span', attrs={'class': 'location'}): 39 | return (span.text) 40 | return 'NOT_FOUND' 41 | 42 | 43 | # extract job title 44 | def extract_job_title(div): 45 | for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}): 46 | return (a['title']) 47 | return('NOT_FOUND') 48 | 49 | 50 | # extract jd summary 51 | def extract_summary(div): 52 | spans = div.findAll('span', attrs={'class': 'summary'}) 53 | for span in spans: 54 | return (span.text.strip()) 55 | return 'NOT_FOUND' 56 | 57 | 58 | # extract link of job description 59 | def extract_link(div, city=None): 60 | for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}): 61 | #return (a['href']) 62 | return get_full_job_link(a['href'], city) 63 | return('NOT_FOUND') 64 | 65 | 66 | # extract date of job when it was posted 67 | def extract_date(div): 68 | try: 69 | spans = div.findAll('span', attrs={'class': 'date'}) 70 | for span in spans: 71 | return (span.text.strip()) 72 | except: 73 | return 'NOT_FOUND' 74 | return 'NOT_FOUND' 75 | 76 | 77 | # extract 
full job description from link 78 | def extract_fulltext(url): 79 | try: 80 | page = requests.get('http://www.indeed.com' + url) 81 | soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8") 82 | spans = soup.findAll('span', attrs={'class': 'summary'}) 83 | for span in spans: 84 | return (span.text.strip()) 85 | except: 86 | return 'NOT_FOUND' 87 | return 'NOT_FOUND' 88 | 89 | 90 | # write logs to file 91 | def write_logs(text): 92 | # print(text + '\n') 93 | try: 94 | f = open('logs/log.txt','a') 95 | except Exception as e: 96 | print (str(e), "logs directory not exists, save at current url instead") 97 | f = open('log.txt', 'a') 98 | f.write(text + '\n') 99 | f.close() 100 | 101 | 102 | # get full job link with country code 103 | def get_full_job_link(link, city): 104 | 105 | if city=="Singapore": 106 | return "https://www.indeed.com.sg/" + link 107 | 108 | elif city =="Tokyo": 109 | return "https://jp.indeed.com/" + link 110 | 111 | else: 112 | return "https://www.indeed.com" + link -------------------------------------------------------------------------------- /celery_queue/IndeedScrapper/indeed_scrapper.py: -------------------------------------------------------------------------------- 1 | # import packages 2 | import requests 3 | import pandas as pd 4 | import time 5 | import datetime 6 | from IndeedScrapper.indeed_extract import * 7 | 8 | def Scrape_Runner(city_set=['New+York'], job_set=['data+scientist'], max_results_per_city=50, file=1, SKIPPER=0): 9 | 10 | # current date 11 | current_time, current_date = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), datetime.datetime.now().strftime('%Y-%m-%d') 12 | 13 | # loop on all cities 14 | for city in city_set: 15 | 16 | # for each job role 17 | for job_qry in job_set: 18 | 19 | # count 20 | cnt = 0 21 | startTime = time.time() 22 | 23 | # skipper 24 | if(file > SKIPPER): 25 | 26 | # dataframe 27 | df = pd.DataFrame(columns = ['unique_id', 'city', 'job_qry','job_title', 'company_name', 'location', 'summary', 'salary', 'link', 'date', 'full_text']) 28 | 29 | # for results 30 | for start in range(0, max_results_per_city, 10): 31 | 32 | # get dom 33 | page = requests.get('http://www.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start)) 34 | 35 | #ensuring at least 1 second between page grabs 36 | time.sleep(1) 37 | 38 | #fetch data 39 | soup = get_soup(page.text) 40 | divs = soup.find_all(name="div", attrs={"class":"row"}) 41 | 42 | # if results exist 43 | if(len(divs) == 0): 44 | break 45 | 46 | # for all jobs on a page 47 | for div in divs: 48 | 49 | #specifying row num for index of job posting in dataframe 50 | num = (len(df) + 1) 51 | cnt = cnt + 1 52 | 53 | #job data after parsing 54 | job_post = [] 55 | 56 | #append unique id 57 | job_post.append(div['id']) 58 | 59 | #append city name 60 | job_post.append(city) 61 | 62 | #append job qry 63 | job_post.append(job_qry) 64 | 65 | #grabbing job title 66 | job_post.append(extract_job_title(div)) 67 | 68 | #grabbing company 69 | job_post.append(extract_company(div)) 70 | 71 | #grabbing location name 72 | job_post.append(extract_location(div)) 73 | 74 | #grabbing summary text 75 | job_post.append(extract_summary(div)) 76 | 77 | #grabbing salary 78 | job_post.append(extract_salary(div)) 79 | 80 | #grabbing link 81 | link = extract_link(div) 82 | job_post.append(link) 83 | 84 | #grabbing date 85 | job_post.append(extract_date(div)) 86 | 87 | #grabbing full_text 88 | job_post.append(extract_fulltext(link)) 89 | 90 | #appending list of job post info to dataframe 
at index num 91 | df.loc[num] = job_post 92 | 93 | #debug add 94 | write_logs(('Completed =>') + '\t' + city + '\t' + job_qry + '\t' + str(cnt) + '\t' + str(start) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + ' ' + str(current_time)) 95 | 96 | #saving df as a local csv file 97 | try: 98 | df.to_csv('output/{}_jobs_'.format(current_date) + str(file) + '.csv', encoding='utf-8') 99 | except Exception as e: 100 | print (str(e), "outout not exists, save at current url instead") 101 | df.to_csv('{}_jobs_'.format(current_date) + str(file) + '.csv', encoding='utf-8') 102 | print (df.head(3)) 103 | print ("len(df)", len(df)) 104 | 105 | else: 106 | 107 | #debug add 108 | write_logs(('Skipped =>') + '\t' + city + '\t' + job_qry + '\t' + str(-1) + '\t' + str(-1) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + ' ' + str(current_time)) 109 | 110 | # increment file 111 | file = file + 1 112 | 113 | -------------------------------------------------------------------------------- /celery_queue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/celery_queue/__init__.py -------------------------------------------------------------------------------- /celery_queue/log.txt: -------------------------------------------------------------------------------- 1 | Completed => New+York data+scientist 10 0 2.892832040786743 file_1 2020-02-02-18:29:30 2 | Completed => New+York data+scientist 20 10 5.695845127105713 file_1 2020-02-02-18:29:30 3 | Completed => New+York data+scientist 30 20 8.427727222442627 file_1 2020-02-02-18:29:30 4 | Completed => New+York data+scientist 40 30 11.270271062850952 file_1 2020-02-02-18:29:30 5 | Completed => New+York data+scientist 50 40 15.064821004867554 file_1 2020-02-02-18:29:30 6 | Completed => + data+scientist 10 0 3.3448691368103027 file_4 2020-02-02-18:29:35 7 | Completed => + data+scientist 20 10 6.482846021652222 file_4 2020-02-02-18:29:35 8 | Completed => + data+scientist 30 20 9.563256978988647 file_4 2020-02-02-18:29:35 9 | Completed => + data+scientist 40 30 12.740447044372559 file_4 2020-02-02-18:29:35 10 | Completed => + data+scientist 50 40 15.74568486213684 file_4 2020-02-02-18:29:35 11 | -------------------------------------------------------------------------------- /celery_queue/requirements.txt: -------------------------------------------------------------------------------- 1 | amqp==2.2.2 2 | Babel==2.9.1 3 | billiard==3.5.0.3 4 | celery==5.2.2 5 | flower==1.2.0 6 | kombu==4.2.0 7 | pytz==2018.3 8 | redis==4.4.4 9 | tornado==5.0.2 10 | vine==1.1.4 11 | beautifulsoup4 12 | requests 13 | pandas 14 | lxml 15 | -------------------------------------------------------------------------------- /celery_queue/tasks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | from datetime import timedelta 5 | import urllib.request as request 6 | from bs4 import BeautifulSoup 7 | from celery import Celery 8 | from celery.schedules import crontab 9 | from celery.task.base import periodic_task 10 | 11 | CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379'), 12 | CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379') 13 | celery = Celery('tasks', broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND) 14 | 15 | @celery.task(name="tasks.add") 16 | def 
add(x, y): 17 | return x+y 18 | 19 | @celery.task(name="tasks.multiply") 20 | def multiply(x, y): 21 | return x*y 22 | 23 | @periodic_task(run_every=(crontab(minute='*')),name="run_every_minute",ignore_result=True) 24 | def push_heart_beat(): 25 | print ("this is heart beat") 26 | return "this is heart beat" 27 | 28 | @celery.task(name='tasks.scrape_task') 29 | def scrape(): 30 | url = 'https://github.com/apache/spark' 31 | opener=request.build_opener() 32 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 33 | page = opener.open(url) 34 | soup = BeautifulSoup(page) 35 | print (soup.text) 36 | return soup.text 37 | 38 | @celery.task(name='tasks.scrape_task_api') 39 | def scrape_github_api(account, repo_name): 40 | url = 'https://github.com/{}/{}'.format(account, repo_name) 41 | print ("*** url", url) 42 | opener=request.build_opener() 43 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 44 | page = opener.open(url) 45 | soup = BeautifulSoup(page) 46 | print (soup.text) 47 | return soup.text 48 | 49 | @celery.task(name='tasks.indeed_scrap_task') 50 | def indeed_scrape(): 51 | sys.path.append(".") 52 | from IndeedScrapper.indeed_scrapper import Scrape_Runner 53 | Scrape_Runner() 54 | 55 | @celery.task(name='tasks.indeed_scrap_api_V1') 56 | def indeed_scrape_api(city_set): 57 | sys.path.append(".") 58 | from IndeedScrapper.indeed_scrapper import Scrape_Runner 59 | Scrape_Runner(city_set) 60 | -------------------------------------------------------------------------------- /cron_indeed_scrapping_test.py: -------------------------------------------------------------------------------- 1 | # import packages 2 | import requests 3 | import pandas as pd 4 | import time 5 | import datetime 6 | import os 7 | from celery_queue.IndeedScrapper.indeed_extract import * 8 | 9 | 10 | # current date 11 | current_time, current_date = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), datetime.datetime.now().strftime('%Y-%m-%d') 12 | 13 | # limit per sity 14 | max_results_per_city = 100 15 | 16 | # db of city 17 | city_set = ['New+York', 'San+Francisco','Singapore','Tokyo'] 18 | 19 | # job roles 20 | job_set = ['data+engineer', 'machine+learning+engineer', 'data+scientist'] 21 | 22 | # output dir 23 | output_dir='./output' 24 | 25 | # file num 26 | file = 1 27 | 28 | # from where to skip 29 | SKIPPER = 0 30 | 31 | # loop on all cities 32 | for city in city_set: 33 | 34 | # for each job role 35 | for job_qry in job_set: 36 | 37 | # count 38 | cnt = 0 39 | startTime = time.time() 40 | 41 | # skipper 42 | if(file > SKIPPER): 43 | 44 | # dataframe 45 | df = pd.DataFrame(columns = ['unique_id', 'city', 'job_qry','job_title', 'company_name', 'location', 'summary', 'salary', 'link', 'date', 'full_text']) 46 | 47 | # for results 48 | for start in range(0, max_results_per_city, 10): 49 | 50 | # get dom 51 | 52 | # hot fix here for Asia city scrapping (will optimize it then) 53 | if city=='Singapore': 54 | page = requests.get('http://www.indeed.com.sg/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start)) 55 | 56 | elif city=='Tokyo': 57 | page = requests.get('https://jp.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start)) 58 | 59 | else: 60 | page = requests.get('http://www.indeed.com/jobs?q=' + job_qry +'&l=' + str(city) + '&start=' + str(start)) 61 | 62 | #ensuring at least 1 second between page grabs 63 | time.sleep(1) 64 | 65 | #fetch data 66 | soup = get_soup(page.text) 67 | divs = soup.find_all(name="div", attrs={"class":"row"}) 68 | 69 | # if results exist 70 | 
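# editor's note (hedged): if Indeed has changed its markup since this script was
# written, the "row" divs collected above come back empty and the page loop below
# breaks immediately, so an empty output CSV usually means selector drift rather
# than a network failure. A cheap optional guard is to log the HTTP status before
# parsing, e.g.:
#     if page.status_code != 200:
#         write_logs('HTTP {} returned for {}'.format(page.status_code, page.url))
# Related: extract_fulltext() in indeed_extract.py calls requests.get() but that
# module never imports requests, so full_text silently falls back to 'NOT_FOUND'
# through its bare except; adding "import requests" at the top of
# celery_queue/IndeedScrapper/indeed_extract.py restores the full-text scrape.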
if(len(divs) == 0): 71 | break 72 | 73 | # for all jobs on a page 74 | for div in divs: 75 | 76 | #specifying row num for index of job posting in dataframe 77 | num = (len(df) + 1) 78 | cnt = cnt + 1 79 | 80 | #job data after parsing 81 | job_post = [] 82 | 83 | #append unique id 84 | job_post.append(div['id']) 85 | 86 | #append city name 87 | job_post.append(city) 88 | 89 | #append job qry 90 | job_post.append(job_qry) 91 | 92 | #grabbing job title 93 | job_post.append(extract_job_title(div)) 94 | 95 | #grabbing company 96 | job_post.append(extract_company(div)) 97 | 98 | #grabbing location name 99 | job_post.append(extract_location(div)) 100 | 101 | #grabbing summary text 102 | job_post.append(extract_summary(div)) 103 | 104 | #grabbing salary 105 | job_post.append(extract_salary(div)) 106 | 107 | #grabbing link 108 | link = extract_link(div, city) 109 | job_post.append(link) 110 | 111 | #grabbing date 112 | job_post.append(extract_date(div)) 113 | 114 | #grabbing full_text 115 | job_post.append(extract_fulltext(link)) 116 | 117 | #appending list of job post info to dataframe at index num 118 | df.loc[num] = job_post 119 | 120 | #debug add 121 | write_logs(('Completed =>') + '\t' + city + '\t' + job_qry + '\t' + str(cnt) + '\t' + str(start) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + ' ' + str(current_time)) 122 | 123 | #saving df as a local csv file 124 | if not os.path.exists(output_dir): 125 | os.mkdir(output_dir) 126 | df = df.sort_values('date') # sort the df by job post date 127 | # for QA only 128 | print (df.head(10)) 129 | df.to_csv('output/{}_jobs_{}_{}'.format(current_date, str(city).replace('+','_'), str(job_qry).replace('+','_')) + '.csv', encoding='utf-8') 130 | 131 | else: 132 | 133 | #debug add 134 | write_logs(('Skipped =>') + '\t' + city + '\t' + job_qry + '\t' + str(-1) + '\t' + str(-1) + '\t' + str(time.time() - startTime) + '\t' + ('file_' + str(file)) + ' ' + str(current_time)) 135 | 136 | # increment file 137 | file = file + 1 138 | -------------------------------------------------------------------------------- /cron_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | def main(): 4 | current_time, current_date = datetime.datetime.now(), datetime.datetime.now().strftime('%Y-%m-%d') 5 | print('current time : ', current_time) 6 | with open('output/{}.txt'.format('output-'+str(current_date)), "w") as file: 7 | file.write('* this is cron test program \n') 8 | file.write(str(current_time) + '\n') 9 | file.write('hello world') 10 | file.close() 11 | print ('write to file OK') 12 | 13 | if __name__ == '__main__': 14 | main() -------------------------------------------------------------------------------- /dev/104/code1.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from fake_useragent import UserAgent 6 | 7 | # 初始化 fake_useragent 8 | ua = UserAgent(platforms='pc') 9 | 10 | # 設定 base_url 和查詢參數 11 | base_url = "https://www.104.com.tw/jobs/search/" 12 | params = { 13 | 'keyword': 'python', 14 | 'page': 1 15 | } 16 | 17 | # 用來儲存所有工作的 URL 18 | job_urls = [] 19 | 20 | # 爬取前 150 頁 21 | for page in range(1, 151): 22 | print(f"正在抓取第 {page} 頁...") 23 | params['page'] = page 24 | 25 | # 建立隨機的 User-Agent 26 | headers = { 27 | 'User-Agent': ua.random 28 | } 29 | 30 | # 發送 GET 請求 31 | response = requests.get(base_url, headers=headers, params=params) 32 | soup = 
BeautifulSoup(response.text, 'lxml') 33 | 34 | # 找到所有的工作列表項目 35 | job_items = soup.find_all('article', class_='js-job-item') 36 | 37 | # For Loop 每個工作項目,提取工作 URL 38 | for job in job_items: 39 | job_link = job.find('a', class_='js-job-link') 40 | if job_link: 41 | job_url = job_link['href'] 42 | # 104 的 URL 需要補全 43 | full_job_url = "https:" + job_url 44 | job_urls.append(full_job_url) 45 | 46 | # 隨機等待 5 到 10 秒 47 | sleep_time = random.uniform(5, 10) 48 | print(f"等待 {sleep_time:.2f} 秒...") 49 | time.sleep(sleep_time) 50 | -------------------------------------------------------------------------------- /dev/104/code2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | import random 4 | import time 5 | from fake_useragent import UserAgent 6 | 7 | # 將 JSON 資料轉換為結構化字典 8 | def convert_job_data(original_dict): 9 | data = original_dict['data'] 10 | 11 | # 將 jobType 轉換為描述文字 12 | job_type_mapping = { 13 | 0: '全部', 14 | 1: '全職', 15 | 2: '兼職', 16 | 3: '高薪', 17 | 4: '派遣' 18 | } 19 | 20 | # 將 remoteWork 轉換為描述文字 21 | remote_work_mapping = { 22 | 1: '完全遠端', 23 | 2: '部分遠端' 24 | } 25 | 26 | # 建立包含工作資訊的字典 27 | job_info = { 28 | '職缺名稱': data['header']['jobName'], 29 | '公司名稱': data['header']['custName'], 30 | '公司網址': data['header']['custUrl'], 31 | '發佈日期': data['header']['appearDate'], 32 | '職缺分析網址': 'https:' + data['header']['analysisUrl'], 33 | '上班地區': data['jobDetail']['addressRegion'], 34 | '上班地點': data['jobDetail']['addressDetail'], 35 | '工作待遇': data['jobDetail']['salary'], 36 | '最低薪資': data['jobDetail']['salaryMin'], 37 | '最高薪資': data['jobDetail']['salaryMax'], 38 | '工作性質': job_type_mapping.get(data['jobDetail']['jobType'], '未知'), 39 | '上班時段': data['jobDetail']['workPeriod'], 40 | '假期政策': data['jobDetail']['vacationPolicy'], 41 | '工作經歷': data['condition']['workExp'], 42 | '學歷要求': data['condition']['edu'], 43 | '擅長工具': [specialty['description'] for specialty in data['condition']['specialty']], 44 | '工作技能': [skill['description'] for skill in data['condition']['skill']], 45 | '產業類別': data['industry'], 46 | '職務類別': [category['description'] for category in data.get('jobDetail', {}).get('jobCategory', [])], 47 | '出差外派': data['jobDetail']['businessTrip'], 48 | '遠端工作': remote_work_mapping.get((data['jobDetail'].get('remoteWork') or {}).get('type', 0), '無'), 49 | '公司人數': '' if data.get('employees') == '暫不提供' else data.get('employees', '').replace('人', ''), 50 | '管理責任': data['jobDetail']['manageResp'] 51 | } 52 | return job_info 53 | 54 | # 單獨抓取某一職缺的詳細資料 55 | def fetch_job_detail(job_id): 56 | 57 | try: 58 | ua = UserAgent(platforms='pc') 59 | 60 | url = f'https://www.104.com.tw/job/ajax/content/{job_id}' 61 | headers = { 62 | 'User-Agent': ua.random, 63 | 'Referer': f'https://www.104.com.tw/job/{job_id}' 64 | } 65 | 66 | response = requests.get(url, headers=headers) 67 | response.raise_for_status() # 檢查 HTTP 回應狀態 68 | 69 | data = response.json() 70 | job_info = convert_job_data(data) 71 | job_info['連結'] = f'https://www.104.com.tw/job/{job_id}' 72 | 73 | return job_info 74 | 75 | except Exception as e: 76 | print(f"處理職缺 {job_id} 時出錯: {e}") 77 | return None 78 | -------------------------------------------------------------------------------- /dev/104/code3.py: -------------------------------------------------------------------------------- 1 | # 取得所有職缺詳細信息並存入 DataFrame 2 | def fetch_all_job_details(job_urls): 3 | 4 | job_details = [] 5 | 6 | for index, original_url in enumerate(job_urls): 7 | job_id = 
original_url.split('/job/')[1].split('?')[0] 8 | job_info = fetch_job_detail(job_id) 9 | 10 | if job_info: 11 | job_details.append(job_info) 12 | print(f"已完成 {index + 1} / {len(job_urls)} : {job_info['職缺名稱']}") 13 | 14 | sleep_time = random.uniform(3, 8) 15 | time.sleep(sleep_time) 16 | 17 | df = pd.DataFrame(job_details) 18 | return df 19 | 20 | # 取得職缺詳細信息並存入 DataFrame 21 | df = fetch_all_job_details(job_urls) 22 | df.to_excel('104_jobs.xlsx') 23 | -------------------------------------------------------------------------------- /dev/104/index.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/pic/architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/doc/pic/architecture.jpg -------------------------------------------------------------------------------- /doc/pic/celery.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/doc/pic/celery.jpg -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | web: 4 | build: 5 | context: ./api 6 | dockerfile: Dockerfile 7 | restart: always 8 | ports: 9 | - "5001:5001" 10 | depends_on: 11 | - redis 12 | worker: 13 | build: 14 | context: ./celery_queue 15 | dockerfile: Dockerfile 16 | depends_on: 17 | - redis 18 | monitor: 19 | build: 20 | context: ./celery_queue 21 | dockerfile: Dockerfile 22 | ports: 23 | - "5555:5555" 24 | entrypoint: flower 25 | command: -A tasks --port=5555 --broker=redis://redis:6379/0 26 | depends_on: 27 | - redis 28 | redis: 29 | image: redis 30 | ports: 31 | - "6379:6379" 32 | mongodb: 33 | image: mongo:latest 34 | ports: 35 | - "27017:27017" 36 | container_name: "mongodb" 37 | environment: 38 | - MONGO_DATA_DIR=/data/db 39 | - MONGO_LOG_DIR=/dev/null 40 | - MONGODB_USER="mongo" 41 | - MONGODB_PASS="password" 42 | volumes: 43 | - ./data/db:/data/db 44 | #command: mongod --smallfiles --logpath=/dev/null # --quiet -------------------------------------------------------------------------------- /legacy_project/archived/bank_swiftcode/grab_bank_list.py: -------------------------------------------------------------------------------- 1 | # python 3 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import urllib 5 | # help function 6 | def parse_swift_code(swift_url): 7 | try: 8 | opener=urllib.request.build_opener() 9 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 10 | page = opener.open(swift_url) 11 | soup = BeautifulSoup(page,"html.parser") 12 | for k,j in enumerate(soup.find_all('a',{'href': True})): 13 | if k == 7: 14 | print (k,j.text) 15 | return j.text 16 | else: 17 | pass 18 | #return j.text 19 | except: 20 | return None 21 | 22 | def clean_df(df): 23 | # drop any col, row with null value 24 | #df_ = df.dropna() 25 | df_ = df_[(df_.bank_name != 'SWIFT Code Databse') | 26 | (df_.bank_name != 'Countries List') | 27 | (df_.bank_name != 'Home') | 28 | (df_.bank_name != 'Next') | 29 | (df_.bank_name != 'Last') | 30 | (df_.bank_name != 'Privacy policy') | 31 | (df_.bank_name != 'DMCA Policy') | 32 | (df_.bank_name != 'Contact Us') ] 33 | return df_ 34 | 35 | 
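# editor's note (hedged sketch): as written, clean_df() above raises NameError
# because df_ is referenced before assignment (the dropna line at the top of the
# function is commented out), and OR-ing the "!=" conditions keeps every row,
# since no row can equal all of those strings at once. One way to get the
# intended filtering:
#     df_ = df.dropna()
#     df_ = df_[~df_.bank_name.isin(['SWIFT Code Databse', 'Countries List',
#                                    'Home', 'Next', 'Last', 'Privacy policy',
#                                    'DMCA Policy', 'Contact Us'])]
#     return df_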
def main_(): 36 | 37 | #url="http://www.swiftcodelist.com/banks/united-kingdom-1.html" 38 | 39 | output = [[] for k in range(3)] 40 | 41 | for x in range(1,43): 42 | url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x) 43 | print (url) 44 | 45 | opener=urllib.request.build_opener() 46 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 47 | page = opener.open(url) 48 | soup = BeautifulSoup(page,"html.parser") 49 | anchors = soup.find_all('a', {'href': True}) 50 | 51 | for k in anchors: 52 | 53 | if len(k.text) < 3: 54 | print (k.text) 55 | output[0].append(None) 56 | print (k['href']) 57 | output[1].append(k['href']) 58 | output[2].append(None) 59 | else: 60 | output[0].append(k.text) 61 | print (k.text) 62 | output[1].append(k['href']) 63 | print (k['href']) 64 | output[2].append(parse_swift_code(k['href'])) 65 | 66 | df_ = pd.DataFrame(output).T 67 | cols=['bank_name','url','swift_code'] 68 | df_.columns = [cols] 69 | print (df_) 70 | #df_ =clean_df(df_) 71 | df_.to_csv('UK_bank_swift_code_list.csv') 72 | 73 | if __name__ == '__main__': 74 | main_() -------------------------------------------------------------------------------- /legacy_project/archived/bank_swiftcode/grab_bank_list_muitiprocess.py: -------------------------------------------------------------------------------- 1 | # python 3 2 | # credit 3 | # https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/2-add/ 4 | # https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/3-queue/ 5 | 6 | from bs4 import BeautifulSoup 7 | import pandas as pd 8 | import urllib 9 | # multiprocessing 10 | import multiprocessing as mp 11 | import sys 12 | 13 | sys.setrecursionlimit(10000) # 10000 is an example, try with different values 14 | 15 | # help function 16 | def parse_swift_code(swift_url): 17 | try: 18 | opener=urllib.request.build_opener() 19 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 20 | page = opener.open(swift_url) 21 | soup = BeautifulSoup(page,"html.parser") 22 | # need to fix here ( find -> find_all) 23 | for k,j in enumerate(soup.find('a',{'href': True})): 24 | if k == 7: 25 | print (k,j.text) 26 | return j.text 27 | else: 28 | pass 29 | #return j.text 30 | except: 31 | return None 32 | 33 | # main scrape function 34 | # url list 35 | url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html" 36 | url_ = [url.format(x) for x in range(1,2)] 37 | 38 | def crawl(url): 39 | print ('-------------') 40 | print (url) 41 | print ('-------------') 42 | #url="http://www.swiftcodelist.com/banks/united-kingdom-1.html" 43 | #output = [[] for k in range(3)] 44 | #for x in range(1,43): 45 | # url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x) 46 | # print (url) 47 | opener=urllib.request.build_opener() 48 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 49 | page = opener.open(url) 50 | soup = BeautifulSoup(page,"html.parser") 51 | # need to fix here ( find -> find_all) 52 | anchors = soup.find('a', {'href': True}) 53 | return anchors 54 | 55 | 56 | def parse(anchors): 57 | for k in anchors: 58 | if len(k.text) < 3: 59 | print (k.text) 60 | #output[0].append(None) 61 | print (k['href']) 62 | #output[1].append(k['href']) 63 | #output[2].append(None) 64 | else: 65 | #output[0].append(k.text) 66 | print (k.text) 67 | #output[1].append(k['href']) 68 | print (k['href']) 69 | #output[2].append(parse_swift_code(k['href'])) 70 | 71 | 72 | 73 | def main_(url): 74 | print ('-------------') 75 | print (url) 76 | print ('-------------') 77 | 
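# editor's note (hedged): the author's "need to fix here ( find -> find_all)"
# comments in this file point at a real issue -- soup.find('a', {'href': True})
# returns a single Tag, so the "for k in anchors" loops iterate over that tag's
# children instead of over every link on the page. The likely intended call is:
#     anchors = soup.find_all('a', {'href': True})
# (the same applies in crawl() above and in parse_swift_code() at the top of
# this file).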
#url="http://www.swiftcodelist.com/banks/united-kingdom-1.html" 78 | output = [[] for k in range(3)] 79 | #for x in range(1,43): 80 | # url="http://www.swiftcodelist.com/banks/united-kingdom-{}.html".format(x) 81 | # print (url) 82 | opener=urllib.request.build_opener() 83 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 84 | page = opener.open(url) 85 | soup = BeautifulSoup(page,"html.parser") 86 | # need to fix here ( find -> find_all) 87 | anchors = soup.find('a', {'href': True}) 88 | 89 | for k in anchors: 90 | 91 | if len(k.text) < 3: 92 | print (k.text) 93 | output[0].append(None) 94 | print (k['href']) 95 | output[1].append(k['href']) 96 | output[2].append(None) 97 | else: 98 | output[0].append(k.text) 99 | print (k.text) 100 | output[1].append(k['href']) 101 | print (k['href']) 102 | output[2].append(parse_swift_code(k['href'])) 103 | 104 | df_ = pd.DataFrame(output).T 105 | cols=['bank_name','url','swift_code'] 106 | df_.columns = [cols] 107 | print (df_) 108 | return df_ 109 | 110 | # parse job 111 | def multi_scrap(): 112 | #count =0 113 | pool = mp.Pool(2) 114 | while True: 115 | # htmls = [crawl(url) for url in unseen] 116 | # ---> 117 | crawl_jobs = [pool.apply_async(main_, args=(url,)) for url in url_] 118 | output = [j.get() for j in crawl_jobs] 119 | print (output) 120 | # results = [parse(html) for html in htmls] 121 | # ---> 122 | #parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls] 123 | #results = [j.get() for j in parse_jobs] 124 | 125 | 126 | def multi_scrap_(): 127 | #count =0 128 | pool = mp.Pool(2) 129 | while True: 130 | # htmls = [crawl(url) for url in unseen] 131 | # ---> 132 | crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in url_] 133 | data = [j.get() for j in crawl_jobs] 134 | print (data) 135 | #results = [parse(html) for html in htmls] 136 | # ---> 137 | parse_jobs = [pool.apply_async(parse, args=(data,)) for a in data] 138 | results = [j.get() for j in parse_jobs] 139 | 140 | if __name__ == '__main__': 141 | multi_scrap_() -------------------------------------------------------------------------------- /legacy_project/archived/booking/bookingcom_scrap.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import Select 5 | from selenium.common.exceptions import NoSuchElementException 6 | from selenium.common.exceptions import NoAlertPresentException 7 | # https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path 8 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 9 | import unittest, time, re 10 | from bs4 import BeautifulSoup 11 | import argparse 12 | 13 | # parse parameter from command line to python 14 | # https://docs.python.org/3/howto/argparse.html 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("echo") 17 | args = parser.parse_args() 18 | print(args.echo) 19 | print ('===========') 20 | # open firefox as browser 21 | browser = webdriver.Firefox() 22 | # set up site url 23 | 
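# editor's note (hedged): this legacy script is Python 2 (bare print statements
# further down) and uses the old find_element_by_link_text API that Selenium 4
# removed, so running it today would mean porting those calls, roughly:
#     browser.find_element(By.LINK_TEXT, u"下一頁").click()
# (By is already imported at the top of this file), plus having geckodriver on
# PATH for webdriver.Firefox().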
#base_url="https://www.booking.com/searchresults.zh-tw.html?aid=304142&label=gen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw&sid=fc93df7eb22345d0203784b4d254c349&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.zh-tw.html%3Faid%3D304142%3Blabel%3Dgen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw%3Bsid%3Dfc93df7eb22345d0203784b4d254c349%3Bcheckin_month%3D6%3Bcheckin_monthday%3D16%3Bcheckin_year%3D2017%3Bcheckout_month%3D6%3Bcheckout_monthday%3D17%3Bcheckout_year%3D2017%3Bclass_interval%3D1%3Bdest_id%3D17%3Bdest_type%3Dairport%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Blabel_click%3Dundef%3Bmap%3D1%3Bmih%3D0%3Bno_rooms%3D1%3Boffset%3D33%3Braw_dest_type%3Dairport%3Broom1%3DA%252CA%3Brows%3D33%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bss%3D%25E9%25A6%2599%25E6%25B8%25AF%25E8%25B5%25A4%25E9%25B1%25B2%25E8%25A7%2592%25E5%259C%258B%25E9%259A%259B%25E6%25A9%259F%25E5%25A0%25B4%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%3Bss_raw%3Dhk%3Bssb%3Dempty%26%3B&ss=NYC&ssne=%E8%B5%A4%E9%B1%B2%E8%A7%92&ssne_untouched=%E8%B5%A4%E9%B1%B2%E8%A7%92&checkin_year=2017&checkin_month=6&checkin_monthday=16&checkout_year=2017&checkout_month=6&checkout_monthday=17&room1=A%2CA&group_adults=2&group_children=0&no_rooms=1&highlighted_hotels=&dest_id=&dest_type=&search_pageview_id=024242b9a80c0437&search_selected=false" 24 | base_url="https://www.booking.com/searchresults.zh-tw.html?aid=304142&label=gen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw&sid=fc93df7eb22345d0203784b4d254c349&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.zh-tw.html%3Faid%3D304142%3Blabel%3Dgen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AEB6AEB-AEMkgIBeagCAw%3Bsid%3Dfc93df7eb22345d0203784b4d254c349%3Bcheckin_month%3D6%3Bcheckin_monthday%3D16%3Bcheckin_year%3D2017%3Bcheckout_month%3D6%3Bcheckout_monthday%3D17%3Bcheckout_year%3D2017%3Bclass_interval%3D1%3Bdest_id%3D17%3Bdest_type%3Dairport%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Blabel_click%3Dundef%3Bmap%3D1%3Bmih%3D0%3Bno_rooms%3D1%3Boffset%3D33%3Braw_dest_type%3Dairport%3Broom1%3DA%252CA%3Brows%3D33%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bss%3D%25E9%25A6%2599%25E6%25B8%25AF%25E8%25B5%25A4%25E9%25B1%25B2%25E8%25A7%2592%25E5%259C%258B%25E9%259A%259B%25E6%25A9%259F%25E5%25A0%25B4%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%252C%2520%25E9%25A6%2599%25E6%25B8%25AF%3Bss_raw%3Dhk%3Bssb%3Dempty%26%3B&ss={}&ssne=%E8%B5%A4%E9%B1%B2%E8%A7%92&ssne_untouched=%E8%B5%A4%E9%B1%B2%E8%A7%92&checkin_year=2017&checkin_month=6&checkin_monthday=16&checkout_year=2017&checkout_month=6&checkout_monthday=17&room1=A%2CA&group_adults=2&group_children=0&no_rooms=1&highlighted_hotels=&dest_id=&dest_type=&search_pageview_id=024242b9a80c0437&search_selected=false" 25 | print (base_url) 26 | base_url = base_url.format(args.echo) 27 | browser.get(base_url) 28 | page = 0 29 | 30 | #while len(soup.select('.paging-start')) > 0: 31 | while page < 2: 32 | page += 1 33 | try: 34 | #'======== start parse ========' 35 | soup = BeautifulSoup(browser.page_source,"html.parser") 36 | #for ele in soup.find_all('h3'): 37 | for ele in soup.findAll("span", { "class" : "sr-hotel__name" }): 38 | print ele.text 39 | # next page 40 | browser.find_element_by_link_text(u"下一頁").click() 41 | print 'page =' , page 42 | time.sleep(1) 43 | except Exception as e: 44 | print 
e, 'something failed' -------------------------------------------------------------------------------- /legacy_project/archived/booking/next_page_sample.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import Select 5 | from selenium.common.exceptions import NoSuchElementException 6 | from selenium.common.exceptions import NoAlertPresentException 7 | import unittest, time, re 8 | 9 | class BookingcomNextPage(unittest.TestCase): 10 | def setUp(self): 11 | self.driver = webdriver.Firefox() 12 | self.driver.implicitly_wait(30) 13 | self.base_url = "https://www.booking.com/" 14 | self.verificationErrors = [] 15 | self.accept_next_alert = True 16 | 17 | def test_bookingcom_next_page(self): 18 | driver = self.driver 19 | driver.get(self.base_url + "/index.zh-tw.html?label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmaOcBiAEBmAEwuAEHyAEP2AED6AEBkgIBeagCAw;sid=869c62621e3d43712c1fbc29cfed3288;sb_price_type=total&") 20 | driver.find_element_by_id("ss").click() 21 | driver.find_element_by_id("ss").clear() 22 | driver.find_element_by_id("ss").send_keys("hk") 23 | driver.find_element_by_xpath("//form[@id='frm']/div[2]/div/div/ul/li").click() 24 | driver.find_element_by_css_selector("button.sb-searchbox__button.").click() 25 | driver.find_element_by_id("close_map_lightbox").click() 26 | driver.find_element_by_link_text(u"下一頁").click() 27 | driver.find_element_by_link_text(u"下一頁").click() 28 | driver.find_element_by_link_text(u"下一頁").click() 29 | 30 | def is_element_present(self, how, what): 31 | try: self.driver.find_element(by=how, value=what) 32 | except NoSuchElementException as e: return False 33 | return True 34 | 35 | def is_alert_present(self): 36 | try: self.driver.switch_to_alert() 37 | except NoAlertPresentException as e: return False 38 | return True 39 | 40 | def close_alert_and_get_its_text(self): 41 | try: 42 | alert = self.driver.switch_to_alert() 43 | alert_text = alert.text 44 | if self.accept_next_alert: 45 | alert.accept() 46 | else: 47 | alert.dismiss() 48 | return alert_text 49 | finally: self.accept_next_alert = True 50 | 51 | def tearDown(self): 52 | self.driver.quit() 53 | self.assertEqual([], self.verificationErrors) 54 | 55 | if __name__ == "__main__": 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /legacy_project/archived/glassdoor/glassdoor_scrap.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import Select 5 | from selenium.common.exceptions import NoSuchElementException 6 | from selenium.common.exceptions import NoAlertPresentException 7 | # https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path 8 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 9 | import unittest, time, re 10 | from bs4 import BeautifulSoup 11 | import argparse 12 | 13 | # parse parameter from command line to python 14 | # https://docs.python.org/3/howto/argparse.html 15 | # open firefox as browser 16 | browser = webdriver.Firefox() 17 | # set up site url 18 | #base_url = 
"https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+&sc.keyword=data+&locT=C&locId=2671300&jobType=" 19 | base_url = "https://www.glassdoor.com/Job/london-data-jobs-SRCH_IL.0,6_IC2671300_KE7,11.htm" 20 | print (base_url) 21 | browser.get(base_url) -------------------------------------------------------------------------------- /legacy_project/archived/spotify/spotify_album copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # http://stackoverflow.com/questions/41566971/how-to-get-spotify-artist-id-for-the-spotify-endpoint-url 3 | # input artist name 4 | echo 'plz enter artist name ' 5 | read varname 6 | echo 'varname = ' $varname 7 | 8 | # modify usl 9 | url="https://api.spotify.com/v1/search?q=${varname}&type=artist" 10 | echo $url 11 | 12 | # query 13 | API_ARTIST_URL=$(curl -s $url | jq -r '.artists.items[0].href') 14 | 15 | echo 'API_ARTIST_URL : ' $API_ARTIST_URL 16 | echo 'ALBUM' 17 | echo '==================' 18 | 19 | # print album 20 | curl -s "$API_ARTIST_URL/top-tracks?country=US" | jq -r '.tracks[].name' -------------------------------------------------------------------------------- /legacy_project/blu_move/analysis.sql: -------------------------------------------------------------------------------- 1 | -- 1. Duration period 2 | -- get duration period (minute) per booking per car 3 | 4 | SELECT id, 5 | end_reservation, 6 | start_reservation, 7 | EXTRACT (epoch 8 | FROM (end_reservation - start_reservation))::integer/60 AS duration_min 9 | FROM 10 | (SELECT DISTINCT id, 11 | end_reservation, 12 | start_reservation 13 | FROM 14 | WHERE end_reservation IS NOT NULL 15 | AND start_reservation IS NOT NULL ) sub 16 | ORDER BY id, 17 | end_reservation 18 | 19 | 20 | -- 2. utilization (day) 21 | -- get utilization per day 22 | 23 | 24 | 25 | WITH booking AS 26 | (SELECT date(start_reservation) AS date, 27 | count(DISTINCT id) AS booked_car 28 | FROM 29 | GROUP BY 1), 30 | all_ AS 31 | (SELECT date(date_of_insert) AS date, 32 | count(DISTINCT id) AS all_car 33 | FROM 34 | GROUP BY 1) 35 | SELECT booking.*, 36 | all_.all_car, 37 | booking.booked_car::NUMERIC/all_.all_car::NUMERIC AS utilization 38 | FROM booking 39 | INNER JOIN all_ ON booking.date = all_.date 40 | ORDER BY booking.date 41 | 42 | 43 | 44 | -- 3. utilization (hour) 45 | -- get utilization per hour 46 | 47 | 48 | WITH booking AS 49 | (SELECT TO_TIMESTAMP(cast(start_reservation AS TEXT),'yyyy-mm-dd HH24') AS date, 50 | count(DISTINCT id) AS booked_car 51 | FROM 52 | GROUP BY 1), 53 | all_ AS 54 | (SELECT TO_TIMESTAMP(cast(date_of_insert AS TEXT),'yyyy-mm-dd HH24') AS date, 55 | count(DISTINCT id) AS all_car 56 | FROM 57 | GROUP BY 1) 58 | SELECT booking.*, 59 | all_.all_car, 60 | booking.booked_car::NUMERIC/all_.all_car::NUMERIC AS utilization 61 | FROM booking 62 | INNER JOIN all_ ON booking.date = all_.date 63 | ORDER BY booking.date 64 | 65 | 66 | 67 | -- 4. utilization (using hour / 24 hour) 68 | ### fix the "duration across day problem" 69 | ### e.g. 
start : 2017-01-01 23:00, end : 2017-01-02 07:00 70 | # hour of using / 24 hour for each car (V1) 71 | 72 | 73 | WITH dates AS 74 | ( SELECT DISTINCT generate_series(min(b.start_reservation::date) OVER (PARTITION BY b.id)::TIMESTAMP, now()::date - '1 day'::interval, '1 day'::interval)::date AS date, 75 | b.id, 76 | 24 AS capacity_hours, 77 | 1 AS capacity_days 78 | FROM rw.blue_move b 79 | WHERE date(b.start_reservation) >= '2018-01-12' ), 80 | get_last_log AS 81 | (SELECT b.*, 82 | ROW_NUMBER() OVER (PARTITION BY id, 83 | date(date_of_insert) 84 | ORDER BY date(date_of_insert), 85 | date_of_insert DESC) AS row_id, 86 | ROW_NUMBER() OVER (PARTITION BY id, 87 | start_reservation 88 | ORDER BY date_of_insert DESC) AS row_id_ 89 | FROM rw.blue_move b 90 | WHERE start_reservation IS NOT NULL 91 | AND end_reservation IS NOT NULL ) 92 | SELECT d_1.date, 93 | last_log.id, 94 | last_log.start_reservation, 95 | last_log.end_reservation, 96 | CASE 97 | WHEN last_log.start_reservation < d_1.date 98 | AND last_log.end_reservation::date > d_1.date THEN 24::double precision 99 | WHEN last_log.start_reservation < d_1.date THEN date_part('hour'::text, last_log.end_reservation) + date_part('minute'::text, last_log.end_reservation) / 60::double precision 100 | WHEN last_log.start_reservation::date = d_1.date 101 | AND last_log.end_reservation::date > d_1.date THEN date_part('epoch'::text, d_1.date + '1 day'::interval - last_log.start_reservation) / 3600::double precision 102 | WHEN last_log.start_reservation::date = d_1.date 103 | AND last_log.end_reservation::date = d_1.date THEN date_part('epoch'::text, last_log.end_reservation - last_log.start_reservation) / 3600::double precision 104 | ELSE 0::double precision 105 | END AS service_hours 106 | FROM get_last_log last_log 107 | RIGHT JOIN dates d_1 ON (last_log.start_reservation::date <= d_1.date 108 | AND last_log.end_reservation::date >= d_1.date 109 | OR last_log.start_reservation::date = d_1.date) 110 | AND d_1.id::text = last_log.id::text 111 | WHERE row_id = 1 112 | AND row_id_ = 1 113 | ORDER BY id, 114 | start_reservation, date 115 | -------------------------------------------------------------------------------- /legacy_project/blu_move/blu_scrape_V1.py: -------------------------------------------------------------------------------- 1 | # credit : https://ianlondon.github.io/blog/web-scraping-discovering-hidden-apis/ 2 | # scrape in public home page 3 | #import library 4 | from bs4 import BeautifulSoup 5 | import urllib, json 6 | import pandas as pd 7 | import sys ,re,time 8 | 9 | url="https://app.bluemove.es/api/public/locations/list?cityId=100&accountId=1" 10 | 11 | def extract_data(): 12 | pass 13 | 14 | 15 | def main(): 16 | print (url) 17 | opener=urllib.request.build_opener() 18 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 19 | page = opener.open(url) 20 | soup = BeautifulSoup(page) 21 | geo_data = dict(json.loads(soup.text)) 22 | geo_data_ = geo_data['data']['locations'] 23 | print (geo_data_) 24 | print ('length of data :',len(geo_data_) ) 25 | 26 | 27 | 28 | def main_(): 29 | print (url) 30 | opener=urllib.request.build_opener() 31 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 32 | page = opener.open(url) 33 | soup = BeautifulSoup(page) 34 | geo_data = dict(json.loads(soup.text)) 35 | geo_data_ = geo_data['data']['locations'] 36 | print (geo_data_) 37 | print ('length of data :',len(geo_data_) ) 38 | 39 | # transfer to dataframe 40 | output = [[] for k in range(5)] 41 | for count in 
range(len(geo_data['data']['locations'])): 42 | scraped_data = geo_data['data']['locations'][count] 43 | for k,j in enumerate(scraped_data['Location']['vehicles']): 44 | #print (k) 45 | output[0].append(scraped_data['Location']['vehicles'][k]['id']) 46 | output[1].append(scraped_data['Location']['vehicles'][k]['gpslat']) 47 | output[2].append(scraped_data['Location']['vehicles'][k]['gpslong']) 48 | output[3].append(scraped_data['Location']['vehicles'][k]['gps_timestamp']) 49 | output[4].append(scraped_data['Location']['vehicles'][k]['status']) 50 | df_ = pd.DataFrame(output).T 51 | df_.columns = [['id','gpslat','gpslong','gps_timestamp','status']] 52 | print (df_) 53 | 54 | if __name__ == '__main__': 55 | main_() 56 | -------------------------------------------------------------------------------- /legacy_project/blu_move/blu_scrape_V1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # scraper V1 on user booking page 3 | curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&start=2017-12-22+22%3A00%3A00&end=2017-12-22+23%3A00%3A00&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed | jq -------------------------------------------------------------------------------- /legacy_project/blu_move/blu_scrape_V2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import sys ,re,time 4 | import os 5 | # user defined function 6 | from utility_data_IO import * 7 | 8 | db_url = os.environ['db_url'] 9 | print ('db_url : ' , db_url) 10 | 11 | def get_json(): 12 | # read json from shell scraper (blu_scrape_V2.sh) 13 | with open("blu_.json") as json_file: 14 | blu_data = json.load(json_file) 15 | return blu_data 16 | 17 | def main_(write_to_db=False): 18 | blu_data = get_json() 19 | # prepare data, parese needed columns 20 | # for loop 21 | output = [[] for k in range(11)] 22 | 23 | for loc_index in range(len(blu_data['data']['locations'])): 24 | for k in range(len(blu_data['data']['locations'][loc_index]['Location']['vehicles'])): 25 | data_ =blu_data['data']['locations'][loc_index]['Location']['vehicles'][k]['data'] 26 | print (data_['id']) 27 | # car data 28 | # gat car ID, lat, lon, status, gps_timestamp 29 | output[0].append(data_['id']) 30 | output[1].append(data_['gpslat']) 31 | output[2].append(data_['gpslong']) 32 | output[3].append(data_['gps_timestamp']) 33 | output[4].append(data_['status']) 34 | # reservation data 35 | # get reservation : end, end_block, end_reservation, start, start_block, start_reservation 36 | data_reserve = blu_data['data']['locations'][loc_index]['Location']['vehicles'][k]['occupation']['allReservations'] 37 | if len(data_reserve) == 0: 38 | output[5].append(None) 39 | output[6].append(None) 40 | output[7].append(None) 41 | output[8].append(None) 42 | output[9].append(None) 43 | 
output[10].append(None) 44 | 45 | else: 46 | #pd.to_datetime(data_reserve[0]['end'], format='%d/%m/%y %H:%M:%S') 47 | output[5].append(pd.to_datetime(data_reserve[0]['end'], format='%d/%m/%Y %H:%M:%S')) 48 | output[6].append(pd.to_datetime(data_reserve[0]['end_block'], format='%d/%m/%Y %H:%M:%S')) 49 | output[7].append(pd.to_datetime(data_reserve[0]['end_reservation'], format='%d/%m/%y %H:%M:%S')) 50 | output[8].append(pd.to_datetime(data_reserve[0]['start'], format='%d/%m/%Y %H:%M:%S')) 51 | output[9].append(pd.to_datetime(data_reserve[0]['start_block'], format='%d/%m/%Y %H:%M:%S')) 52 | output[10].append(pd.to_datetime(data_reserve[0]['start_reservation'], format='%d/%m/%y %H:%M:%S')) 53 | #print (data_reserve) 54 | #print ('=====') 55 | 56 | df_ = pd.DataFrame(output).T 57 | cols=['id', 'gpslat', 'gpslong', 'gps_timestamp', 'status', 'end', 58 | 'end_block', 'end_reservation', 'start', 'start_block', 59 | 'start_reservation'] 60 | 61 | df_.columns = [cols] 62 | # hot fix here 63 | #df_ = df_.drop('Unnamed: 0', 1) 64 | df_.to_csv('blu_.csv',index=False) 65 | #print (df_) 66 | 67 | if write_to_db == True: 68 | print("insert to DB....") 69 | print ('############') 70 | print (df_) 71 | print ('############') 72 | # hot fix here 73 | df_2 = pd.read_csv('blu_.csv') 74 | write_data_to_db(df_2,'blue_move',db_url) 75 | 76 | return df_ 77 | 78 | if __name__ == '__main__': 79 | main_(write_to_db = True) 80 | -------------------------------------------------------------------------------- /legacy_project/blu_move/blu_scrape_V2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # scraper V2 on user booking page : remove start & end time parameter, since the output looks the same 3 | # whatever the value of start & end 4 | #curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed | jq 5 | curl 'https://rest.bluemove.es/api/fleet/availability' -H 'pragma: no-cache' -H 'origin: https://webapp.bluemove.es' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6' -H 'user-agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36' -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'cache-control: no-cache' -H 'authority: rest.bluemove.es' -H 'referer: https://webapp.bluemove.es/en/my-bluemove' --data 'cityId=100&userId=142961&token=549mNphfCEefL2iYCwdM96GMFqqnTj56UhHLE70V21idilcfl3&product=cs&usageReason=private' --compressed > blu_.json -------------------------------------------------------------------------------- /legacy_project/blu_move/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo 
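As an aside on the two Bluemove scrapers above: the JSON endpoint can be decoded without wrapping the response in BeautifulSoup, and the eleven parallel lists can be replaced by a list of per-vehicle dicts that pandas turns into a frame directly. A minimal sketch with the standard library plus pandas; the endpoint and the `data -> locations -> Location -> vehicles` layout are copied from `blu_scrape_V1.py`, and the exact fields returned today are not guaranteed.

```python
import json
import urllib.request

import pandas as pd

URL = "https://app.bluemove.es/api/public/locations/list?cityId=100&accountId=1"

# fetch and decode the JSON payload directly
req = urllib.request.Request(URL, headers={"User-agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as resp:
    payload = json.loads(resp.read().decode("utf-8"))

# flatten every vehicle into one record per row
records = []
for loc in payload["data"]["locations"]:
    for vehicle in loc["Location"]["vehicles"]:
        records.append(
            {
                "id": vehicle["id"],
                "gpslat": vehicle["gpslat"],
                "gpslong": vehicle["gpslong"],
                "gps_timestamp": vehicle["gps_timestamp"],
                "status": vehicle["status"],
            }
        )

df = pd.DataFrame(records)
print(len(df), "vehicles scraped")
```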
"scraping via shell ...." 3 | bash blu_scrape_V2.sh 4 | echo "prepare data python ...." 5 | # 2 python script run way in case of different local dev envs 6 | source activate zip_dev && python blu_scrape_V2.py || /Users/yennanliu/anaconda3/envs/ds_dash/bin/python blu_scrape_V2.py || python /home/ubuntu/yen_dev/blu_move/blu_scrape_V2.py 7 | #echo "clean file..." -------------------------------------------------------------------------------- /legacy_project/blu_move/utility_data_IO.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sqlalchemy import create_engine 3 | from pytz import timezone 4 | import datetime 5 | import os 6 | 7 | european = timezone('Europe/Madrid') 8 | now_tz = datetime.datetime.now(tz = european) 9 | now = now_tz.replace(tzinfo = None) 10 | now = now.replace(microsecond = 0) 11 | db_url = os.environ['db_url'] 12 | print ('db_url : ' , db_url) 13 | 14 | def write_data_to_db(df, table_name,db_url): 15 | try: 16 | # add insert time 17 | df["date_of_insert"] = now 18 | print ('=============') 19 | print (df.head()) 20 | print (table_name) 21 | print ('=============') 22 | engine = create_engine(db_url) 23 | conn = engine.connect() 24 | df.to_sql(name= table_name, con= engine, schema= 'rw', if_exists = "append", index = False) 25 | # close the connection after imput data 26 | conn.close() 27 | print("insert to DB ok") 28 | except Exception as e: 29 | print (e) 30 | print ('fail to write to db') 31 | -------------------------------------------------------------------------------- /legacy_project/blu_move/utility_data_preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def data_prepare(): 4 | df = pd.read_csv('blu.csv') 5 | #df = df 6 | #print (df.head()) 7 | cols = ['start', 'start_block','start_reservation','end','end_block','end_reservation'] 8 | for col in cols: 9 | df[col] = pd.to_datetime(df[col]) 10 | # maybe need to modify time form 11 | # 04/01/2018 17:45:00 -> 2018-01-04 17:45:00 for example 12 | # df.col = df.col.timestrip("%Y-%M-%D hr:mm:ss") 13 | # end_date_relative = now.date().strftime("%d/%m/%Y") 14 | 15 | df['reservation_time'] = df['end_reservation'] - df['start_reservation'] 16 | print (df.head(3)) 17 | return df 18 | 19 | if __name__ == '__main__': 20 | data_prepare() 21 | -------------------------------------------------------------------------------- /legacy_project/carandclassic/README.md: -------------------------------------------------------------------------------- 1 | # carandclassic 2 | Collect car data at https://www.carandclassic.co.uk/ 3 | 4 | ## Tech 5 | python3, urllib, BeautifulSoup 6 | 7 | ## Demo 8 | ```bash 9 | # demo of cclassic_scrape_V1.py 10 | $ git clone https://github.com/yennanliu/web_scraping 11 | $ cd web_scraping/carandclassic 12 | $ python cclassic_scrape_V1.py 13 | 14 | # output 15 | ['/car/C1018332', '/car/C983211', '/car/C1018314', '/car/C1018313', '/car/C1018311', '/car/C901161', '/car/C305537', '/car/C1018308', '/car/C994875', '/car/C990970', '/car/C1018297', '/car/C1018296', '/car/C1018294', '/car/C998769', '/car/C1009081', '/car/C1018284', '/car/C1018283', '/car/C1018281', '/car/C887797', '/car/C1018280', '/car/C1018279', '/car/C1005573', '/car/C1018274', '/car/C1018272', '/car/C1018269', '/car/C1018268', '/car/C1018266', '/car/C387007', '/car/C1018263', '/car/C1018262', '/car/C1018257', '/car/C1018251', '/car/C1018249', '/car/C1018247', '/car/C1018236', '/car/C1018220'] 16 | url_ : 
https://www.carandclassic.co.uk/car/C1018332 17 | k_next £3999 As stated 18 | k_next Classic Cars 19 | k_next Austin Healey 20 | k_next Sprite 21 | k_next 1968 22 | k_next UK 23 | k_next 07043 229662 24 | k_next 23-Jul-2018 25 | k_next C1018332 26 | ... 27 | Price Category Make Model Year \ 28 | 0 £3999 Classic Cars Austin Healey Sprite 1968 29 | 1 £27950 Classic Cars Audi Quattro 1984 30 | 2 £3495 Classic Cars Morris Minor 1968 31 | 3 £6500 Classic Cars Volkswagen Beetle 1971 32 | 4 £12500 Classic Cars MG MGB Roadster 1973 33 | 5 £40000 Classic Cars Buick redfern saloon tourer 1937 34 | ... 35 | Country Telephone Date Ref 36 | 0 UK 07043 229662 23-Jul-2018 C1018332 37 | 1 UK 07043 216048 23-Jul-2018 C983211 38 | 2 UK 07043 225499 22-Jul-2018 C1018314 39 | 3 UK 07043 217556 22-Jul-2018 C1018313 40 | 4 UK 07043 215436 22-Jul-2018 C1018311 41 | 5 UK 07043 228310 22-Jul-2018 C901161 42 | .... 43 | 44 | 45 | 46 | 47 | ``` 48 | 49 | -------------------------------------------------------------------------------- /legacy_project/carandclassic/analysis/README.md: -------------------------------------------------------------------------------- 1 | # analysis 2 | ## demo 3 | * [Rental_Location_EDA](https://nbviewer.jupyter.org/github/yennanliu/web_scraping/blob/master/carandclassic/analysis/Rental_Location_EDA.ipynb) - notebook demo explore rental geo data -------------------------------------------------------------------------------- /legacy_project/carandclassic/carandclassic_scrape_sample.csv: -------------------------------------------------------------------------------- 1 | ,Price,Category,Make,Model,Year,Country,Telephone,Date,Ref 2 | 0,£8995,Classic Cars,Fiat,Coupe,1999,UK,07043 217757,23-Jul-2018,C921382 3 | 1,£49000,Classic Cars,Lancia,Belna,1934,Italy, 393420953091,23-Jul-2018,C980353 4 | 2,£11200,Classic Cars,Volkswagen,Beetle,1960,Netherlands,0031615231265,23-Jul-2018,C834642 5 | 3,£20000,Classic Cars,Peugeot,RCZ R,2014,UK,07043 235688,23-Jul-2018,C979402 6 | 4,£7995,Classic Cars,Volkswagen,Corrado,1996,UK,07957 430966,23-Jul-2018,C999085 7 | 5,£3999,Classic Cars,Austin Healey,Sprite,1968,UK,07043 229662,23-Jul-2018,C1018332 8 | 6,£3900,Classic Cars,Volvo,PV,1948,Sweden,0046705384888,23-Jul-2018,C1018331 9 | 7,£29500,Classic Cars,Ford,Mustang,1966,USA,210 913 8353,23-Jul-2018,C929401 10 | 8,£60000,Classic Cars,Land Rover,Range Rover,1982,United Arab Emirates,00971504593964,23-Jul-2018,C969349 11 | 9,£44000,Classic Cars,AC,Ace,1961,New Zealand,0279234902,23-Jul-2018,C963690 12 | 10,£3250,Classic Cars,Austin Healey,Frogeye,1959,USA,001 (619) 561-3182,23-Jul-2018,C1018327 13 | 11,£17490,Classic Cars,Mercedes,500,2000,UK,07043 228934,23-Jul-2018,C1018325 14 | 12,£3250,Classic Cars,Jaguar,X300 V12 6 LITRES SALOON,1995,UK,07043 227855,23-Jul-2018,C1018324 15 | 13,£6250,Classic Cars,Jaguar,XJ8,2003,UK,07043 225889,23-Jul-2018,C1018323 16 | 14,£27950,Classic Cars,Audi,Quattro,1984,UK,07043 216048,23-Jul-2018,C983211 17 | 15,£19700,Classic Cars,Volkswagen,Karmann Ghia,1962,Spain,670032847,23-Jul-2018,C998835 18 | 16,£2375,Classic Cars,Volvo,460 gle,1990,UK,07043 225863,22-Jul-2018,C1018321 19 | 17,£3750,Classic Cars,BMW,5 Series,2000,UK,07043 225834,22-Jul-2018,C1018320 20 | 18,£1995,Classic Cars,BMW,3 Series,2001,UK,01455 271345,22-Jul-2018,C1018316 21 | -------------------------------------------------------------------------------- /legacy_project/carandclassic/cclassic_scrape_V1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 
import datetime 3 | import urllib, json 4 | from bs4 import BeautifulSoup 5 | 6 | def get_html_data(url): 7 | opener=urllib.request.build_opener() 8 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 9 | page = opener.open(url) 10 | soup = BeautifulSoup(page) 11 | return soup 12 | 13 | def fix_price(x): 14 | return x.split(' ')[0] 15 | 16 | def main_(): 17 | # ----------- collect classic car ID ----------- 18 | url='https://www.carandclassic.co.uk/cat/3/' 19 | soup = get_html_data(url) 20 | content=soup.find_all('div',attrs={'class': 'item'}) 21 | car_list = [] 22 | for i in range(len(soup.find_all('div',attrs={'class': 'item'}))): 23 | # ----------- only get car ID with not null price ----------- 24 | if len(content[i].find('li',attrs={'class':'price'}).text.replace('£','')) > 0: 25 | car_id = content[i].find('a').attrs['href'] 26 | car_list.append(car_id) 27 | else: 28 | pass 29 | print (car_list) 30 | #car_list = ['/car/C1017959', '/car/C1017957','/car/C1017957'] 31 | # ----------- go through every car page, grab the car profile information ----------- 32 | output=[[] for i in range(len(car_list))] 33 | for i,car in enumerate(car_list): 34 | url_ = 'https://www.carandclassic.co.uk' + str(car) 35 | print ('url_ : ', url_) 36 | soup = get_html_data(url_) 37 | # ----------- collect needed columns ----------- 38 | # Make, Model, Date, Ref, Telephone 39 | k_list = ['Price','Category','Make','Model','Year','Country','Telephone','Date','Ref'] 40 | content=soup.find_all('td',attrs={'class':'caption'}) 41 | for k in content: 42 | if k.text in k_list: 43 | print ('k_next' , k.find_next_siblings("td")[0].text) 44 | output[i].append(k.find_next_siblings("td")[0].text) 45 | else: 46 | pass 47 | print (output) 48 | # ----------- output scrape data as dataframe and fix column value ----------- 49 | data = pd.DataFrame(output,columns =k_list ) 50 | data['Price'] = data['Price'].apply(lambda x : fix_price(x)) 51 | print (data) 52 | return data 53 | 54 | if __name__ == '__main__': 55 | main_() 56 | -------------------------------------------------------------------------------- /legacy_project/carousell/web_crawler copy.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests, re 3 | import pandas as pd 4 | import urllib 5 | import random 6 | import os 7 | import sys 8 | import subprocess 9 | import json 10 | 11 | def regular_chat(): 12 | sample_response = ['HI THERE', 'WAZZA UP','R U KIDDING ME', '..?'] 13 | response = sample_response[random.randint(0,3)] 14 | print (response) 15 | return response 16 | 17 | def general_intro(): 18 | sample_response = """ 19 | ######## 20 | 21 | for Carousell product survey, please type "!caro prosuctname" \n 22 | for asking, please type "ask" \n 23 | for main application, please type anything \n 24 | have fun :) \n 25 | 26 | ######## 27 | 28 | """ 29 | print (sample_response) 30 | return sample_response 31 | 32 | # Spotify 33 | def spotify_album(artist): 34 | # make sure artist name feat spotify API query form 35 | artist = artist.replace (" ", "+") 36 | print (artist) 37 | url="https://api.spotify.com/v1/search?q=${}&type=artist".format(artist) 38 | 39 | command = """ 40 | 41 | API_ARTIST_URL=$(curl -s "{}" | jq -r '.artists.items[0].href') 42 | curl -s "$API_ARTIST_URL/top-tracks?country=US" > spotify_data.json 43 | 44 | """.format(url) 45 | print (command) 46 | os.system(command) 47 | album = '' 48 | try: 49 | data_spotify = json.loads(open('spotify_data.json').read()) 50 | for k in 
range(0,len(data_spotify['tracks'])): 51 | print (data_spotify['tracks'][k]['name']) 52 | album += data_spotify['tracks'][k]['name'] + "\n\n" 53 | except: 54 | album = 'no feat artist, return null data' 55 | print (album) 56 | # remove intermediate json 57 | os.system('rm spotify_data.json') 58 | return album 59 | 60 | # Carousell 61 | def Caro_grab_(query): 62 | url = 'https://tw.carousell.com/search/products/?query={}' 63 | url=url.format(query) 64 | opener=urllib.request.build_opener() 65 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 66 | page = opener.open(url) 67 | soup = BeautifulSoup(page,"html.parser") 68 | anchors = soup.find_all('a', {'class': 'pdt-card-thumbnail', 'href': True}) 69 | content='' 70 | url_refix = 'https://tw.carousell.com/p/' 71 | for anchor in anchors: 72 | for k in re.findall('\d+', anchor['href']): 73 | if len(k) > 3: 74 | url = url_refix + k 75 | content += anchor.find('img')['alt'] + "\n" + str(url) + "\n\n" 76 | 77 | print (content) 78 | return content[:600] 79 | 80 | def Caro_grab(): 81 | url = 'https://tw.carousell.com/?hl=en' 82 | opener=urllib.request.build_opener() 83 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 84 | page = opener.open(url) 85 | soup = BeautifulSoup(page,"html.parser") 86 | anchors = soup.find_all('a', {'class': 'pdt-card-thumbnail', 'href': True}) 87 | content='' 88 | url_refix = 'https://tw.carousell.com/p/' 89 | for anchor in anchors: 90 | for k in re.findall('\d+', anchor['href']): 91 | if len(k) > 3: 92 | url = url_refix + k 93 | content += anchor.find('img')['alt'] + "\n" + str(url) + "\n\n" 94 | 95 | print (content) 96 | return content[:600] 97 | 98 | ### ipeen 99 | def ipeen_grab(): 100 | output = [[] for k in range(2)] 101 | for page in range(1,5): 102 | url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page) 103 | print (url) 104 | opener=urllib.request.build_opener() 105 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 106 | page = opener.open(url) 107 | soup = BeautifulSoup(page) 108 | for k in soup.find_all('a', attrs={'data-label': '店名'}): 109 | output[0].append(k.text) 110 | 111 | for k in soup.findAll('span',{"style":"padding-left:3em;"}): 112 | output[1].append(k.get_text()) 113 | data = '' 114 | for k, m in zip(output[0],output[1]): 115 | data += str(k) + str(m) 116 | # limit number of query response here, since there may be limit in msg length 117 | return data[:600] 118 | 119 | ### ptt beauty 120 | def ptt_beauty(): 121 | url = 'https://www.ptt.cc/bbs/Beauty/index.html' 122 | rs = requests.session() 123 | res = rs.get('https://www.ptt.cc/bbs/Beauty/index.html', verify=False) 124 | soup = BeautifulSoup(res.text, 'html.parser') 125 | #ALLpageURL = soup.select('.btn.wide')[1]['href'] 126 | content='' 127 | # limit number of query response here, since there may be limit in msg length 128 | for k in soup.find_all('a',href=True)[:15]: 129 | 130 | try: 131 | if len(k['href']) < 30: 132 | pass 133 | else: 134 | print ("https://www.ptt.cc/"+ k['href'], k.text) 135 | content += k.text + "\n" + 'https://www.ptt.cc%s'%(k['href']) + "\n\n" 136 | except: 137 | pass 138 | 139 | print ('==================') 140 | print (content) 141 | return content 142 | 143 | if __name__ == "__main__": 144 | spotify_album('pete rock') 145 | -------------------------------------------------------------------------------- /legacy_project/delivery_/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | 
*.config 4 | *.log 5 | *.pyc 6 | -------------------------------------------------------------------------------- /legacy_project/delivery_/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Tech 4 | 5 | Python 3.4.5, Pandas 0.20.3, numpy , BeautifulSoup, urllib, sqlite3 6 | 7 | 8 | ## File Structure 9 | 10 | ``` 11 | ├── README.md 12 | ├── analysis.py : calcluate average temperature, best to swim day 13 | ├── data2db.py : dump needed data to sqlite 14 | ├── scrap.py : scrap weather data from wunderground.com 15 | └── weather.db : sqlite db save whole daily weather data in 2014 16 | ``` 17 | 18 | 19 | ## QUICK START 20 | 21 | 22 | ```Bash 23 | cd web_scraping 24 | # scrap and dump data to db 25 | python data2db.py 26 | # get needed analysis output 27 | python analysis.py 28 | 29 | ``` 30 | 31 | ```Bash 32 | # output 33 | 34 | BeautifulSoup([your markup], "html5lib") 35 | 36 | markup_type=markup_type)) 37 | type temp_max temp_min CET 38 | 0 Actual: 4° -1° 2014-01-01 39 | 1 Average: 2° -2° 2014-01-01 40 | 2 Actual: 7° 1° 2014-01-02 41 | 3 Average: 3° -2° 2014-01-02 42 | ..... 43 | 44 | ------------------------------- 45 | 46 | SELECT avg(temp_max) AS avg_max_temp, 47 | avg(temp_min) AS avg_min_temp, 48 | 49 | (SELECT ((avg(temp_max)+avg(temp_min)))/2 50 | FROM weather_data) AS avg_all_temp 51 | FROM weather_data 52 | WHERE TYPE = 'Actual:' 53 | 54 | 55 | avg_max_temp avg_min_temp avg_all_temp 56 | 0 15.260274 7.312329 10.304795 57 | ------------------------------- 58 | 59 | SELECT date(CET) AS best_swim_date, 60 | (temp_min+ temp_min)/2 AS avg_day_temp 61 | FROM weather_data 62 | WHERE avg_day_temp = 63 | (SELECT max((temp_min+ temp_min)/2) AS max_mean_temp 64 | FROM weather_data 65 | WHERE TYPE = 'Actual:') 66 | AND TYPE = 'Actual:' 67 | 68 | 69 | best_swim_date avg_day_temp 70 | 0 2014-07-05 21 71 | 1 2014-07-21 21 72 | ``` 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /legacy_project/delivery_/analysis.py: -------------------------------------------------------------------------------- 1 | from scrap import * 2 | from data2db import * 3 | 4 | def get_values(): 5 | 6 | sql0=""" 7 | SELECT * 8 | FROM weather_data 9 | LIMIT 10 ; 10 | """ 11 | sql =""" 12 | SELECT avg(temp_max) AS avg_max_temp, 13 | avg(temp_min) AS avg_min_temp, 14 | 15 | (SELECT ((avg(temp_max)+avg(temp_min)))/2 16 | FROM weather_data) AS avg_all_temp 17 | FROM weather_data 18 | WHERE TYPE = 'Actual:' 19 | 20 | """ 21 | 22 | sql2 =""" 23 | SELECT date(CET) AS best_swim_date, 24 | (temp_min+ temp_min)/2 AS avg_day_temp 25 | FROM weather_data 26 | WHERE avg_day_temp = 27 | (SELECT max((temp_min+ temp_min)/2) AS max_mean_temp 28 | FROM weather_data 29 | WHERE TYPE = 'Actual:') 30 | AND TYPE = 'Actual:' 31 | 32 | """ 33 | print (sql0) 34 | outcome0 = pd.read_sql(sql0, con ='sqlite:///weather.db' ) 35 | print (outcome0) 36 | print ('-------------------------------') 37 | print (sql) 38 | outcome = pd.read_sql(sql, con ='sqlite:///weather.db' ) 39 | print (outcome) 40 | print ('-------------------------------') 41 | print (sql2) 42 | outcome2 = pd.read_sql(sql2, con ='sqlite:///weather.db' ) 43 | print (outcome2) 44 | 45 | if __name__ == '__main__': 46 | get_values() 47 | -------------------------------------------------------------------------------- /legacy_project/delivery_/analysis.sql: -------------------------------------------------------------------------------- 1 
| # http://www.codedata.com.tw/database/mysql-tutorial-13-stored-routines/ 2 | 3 | # https://www.a2hosting.co.uk/kb/developer-corner/mysql/mysql-stored-functions-and-procedures 4 | 5 | # stored function 6 | DELIMITER $$ 7 | CREATE FUNCTION plus(temp_max FLOAT, temp_min FLOAT) RETURNS DECIMAL(9,2) 8 | BEGIN 9 | DECLARE tem__ DECIMAL(9,2); 10 | SET tem__ = temp_max + temp_min; 11 | RETURN tem__; 12 | END$$ 13 | DELIMITER ; 14 | 15 | 16 | # sql 17 | SELECT *, plus(temp_max,temp_min) AS tem_test FROM weather_data; 18 | 19 | 20 | # stored function 21 | DELIMITER $$ 22 | CREATE PROCEDURE procedureTest() 23 | BEGIN 24 | SELECT CET FROM weather_data; 25 | END$$ 26 | DELIMITER ; 27 | 28 | # execute 29 | CALL procedureTest() \G 30 | -------------------------------------------------------------------------------- /legacy_project/delivery_/data2db.py: -------------------------------------------------------------------------------- 1 | from scrap import * 2 | 3 | def get_data(): 4 | output = pd.DataFrame() 5 | for month in range(1,13): 6 | df = get_weather_data('2014',month) 7 | print (df) 8 | output= output.append(df) 9 | return output 10 | 11 | 12 | def dump_db(): 13 | try: 14 | df = get_data() 15 | df.to_sql('weather_data', if_exists='fail',con='sqlite:///weather.db') 16 | print ('dump to DB ok') 17 | except: 18 | print ('dump DB failed') 19 | 20 | 21 | def update_db(): 22 | try: 23 | df = get_data() 24 | df.to_sql('weather_data',if_exists='append',con='sqlite:///weather.db') 25 | print ('update to DB ok') 26 | except: 27 | print ('update DB failed') 28 | 29 | 30 | #============================== 31 | 32 | 33 | class db_manipulation: 34 | def __init__(self, *args, **kwargs): 35 | self.df = get_data() 36 | self.con = 'sqlite:///weather.db' 37 | def test(self): 38 | print (self.con) 39 | 40 | def dumb2db(self): 41 | try: 42 | self.df.to_sql('weather_data',if_exists='fail',con=self.con) 43 | print ('dump to DB ok') 44 | except: 45 | print ('dump DB failed') 46 | 47 | def update2db(self): 48 | try: 49 | df = self.df 50 | df.to_sql('weather_data',if_exists='append',con=self.con) 51 | print ('update to DB ok') 52 | except: 53 | print ('dump DB failed') 54 | 55 | if __name__ == '__main__': 56 | db_job = db_manipulation() 57 | db_job.dumb2db() 58 | #dump_db() -------------------------------------------------------------------------------- /legacy_project/delivery_/query_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | sqlite3 weather.db "select * from weather_data limit 3;" 4 | echo '---------------------' 5 | echo '' 6 | sqlite3 weather.db "select CET from weather_data limit 3;" -------------------------------------------------------------------------------- /legacy_project/delivery_/scrap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime 4 | import urllib, json 5 | from bs4 import BeautifulSoup 6 | 7 | def get_weather_data(year,month): 8 | #url_ = "https://www.wunderground.com/history/airport/EDDT/2014/12/01/MonthlyCalendar.html?req_city=Werftpfuhl&req_statename=Germany&reqdb.zip=00000&reqdb.magic=46&reqdb.wmo=10389#calendar" 9 | url_ = "https://www.wunderground.com/history/airport/EDDT/{}/{}/01/MonthlyCalendar.html?req_city=Werftpfuhl&req_statename=Germany&reqdb.zip=00000&reqdb.magic=46&reqdb.wmo=10389#calendar".format(year,month) 10 | print (url_) 11 | # query the page 12 | opener=urllib.request.build_opener() 13 | opener.addheaders = 
[('User-agent', 'Mozilla/5.0')] 14 | page = opener.open(url_) 15 | soup = BeautifulSoup(page) 16 | # set up output and filter data by attrs 17 | output = [[] for k in range(3)] 18 | for k in soup.find_all('td', attrs={'class': 'value-header'}): 19 | output[0].append(k.text) 20 | 21 | for k in soup.find_all('span', attrs={'class': 'high'}): 22 | output[1].append(k.text) 23 | 24 | for k in soup.find_all('span', attrs={'class': 'low'}): 25 | output[2].append(k.text) 26 | 27 | output_ =pd.DataFrame(output).T 28 | output_.columns = ['type','temp_max','temp_min'] 29 | # get day list in 2014 30 | sample_dates = pd.date_range(start='2014-01-01',end='2014-12-31', freq='d') 31 | datetimelist = [] 32 | # get day list in specific month 33 | for x in sample_dates: 34 | # '1'.zfill(2) = 01 , '11'.zfill(2) = 11 35 | month_ = str(month).zfill(2) 36 | if str(x)[:7] == '{}-{}'.format(year,month_): 37 | #print (str(x)) 38 | datetimelist.append(pd.to_datetime(x)) 39 | else: 40 | pass 41 | datetime_ = pd.DataFrame(datetimelist) 42 | datetime_.columns=['CET'] 43 | # duplicate datetime data, since there are Actual, and Average weather data 44 | datetime_ = pd.concat([datetime_]*2).sort_values('CET').reset_index() 45 | #datetime_.head() 46 | output_['CET'] = np.array(datetime_.CET) 47 | print (output_) 48 | return output_ -------------------------------------------------------------------------------- /legacy_project/delivery_/sqlite2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #sqlite3 -header -csv weather.db "select * from weather_data;" > weather.csv 3 | 4 | # enter db name 5 | echo 'db list : ' 6 | echo '---------------' 7 | ls *.db 8 | echo '---------------' 9 | echo 'plz enter db file name ?' 10 | read dbname 11 | 12 | # enter csv name 13 | echo 'plz enter csv file name ?' 14 | read csvname 15 | 16 | # enter table name 17 | echo 'table list : ' 18 | echo '---------------' 19 | sqlite3 $dbname.db ".table" 20 | echo '---------------' 21 | 22 | echo 'plz enter table name ?' 
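Worth noting for `scrap.py` above (and for the other scrapers in this folder): calling `BeautifulSoup(page)` without a parser argument is what produces the "No parser was explicitly specified" warning fragments visible in the README output. A small sketch of the same fetch with an explicit parser; the URL is a trimmed version of the one built in `get_weather_data`, and `html.parser` ships with Python while `html5lib`/`lxml` must be installed separately.

```python
import urllib.request

from bs4 import BeautifulSoup

url = "https://www.wunderground.com/history/airport/EDDT/2014/1/01/MonthlyCalendar.html"

req = urllib.request.Request(url, headers={"User-agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as page:
    # naming the parser silences the bs4 warning and pins the parsing behaviour
    soup = BeautifulSoup(page.read(), "html.parser")

print(len(soup.find_all("td", attrs={"class": "value-header"})))
```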
23 | read tablename 24 | 25 | echo 'extract' $dbname with $tablename 'to' $csvname 26 | 27 | sqlite3 -header -csv $dbname.db "select * from $tablename ;" > $csvname.csv 28 | -------------------------------------------------------------------------------- /legacy_project/delivery_/weather.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/delivery_/weather.db -------------------------------------------------------------------------------- /legacy_project/env.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | *** EC2 env 5 | 6 | 7 | - Chrome driver : 8 | ChromeDriver 2.25.426924 (649f9b868f6783ec9de71c123212b908bf3b232e) on port 9515 9 | 10 | - Chrome browser : 11 | 12 | Google Chrome 58.0.3029.110 13 | 14 | 15 | *** local mac env 16 | 17 | 18 | - Chrome driver : 19 | Starting ChromeDriver 2.34.522932 20 | 21 | - Chrome browser : 22 | Google Chrome 63.0.3239.108 23 | 24 | 25 | 26 | *** Chrome driver download 27 | 28 | https://sites.google.com/a/chromium.org/chromedriver/downloads 29 | https://chromedriver.storage.googleapis.com/index.html 30 | 31 | 32 | 33 | ** Chrome browser download 34 | https://www.slimjet.com/chrome/google-chrome-old-version.php 35 | http://www.geocities.jp/ecvcn/exam/chrome_installer.html 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /legacy_project/es_scrapper_docker_demo/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | ENV ESLATICSEARCH_URL http://localhost:9200/ 4 | 5 | ADD requirements.txt /app/requirements.txt 6 | ADD . /app/ 7 | WORKDIR /app/ 8 | RUN pip install -r requirements.txt 9 | ENTRYPOINT python app.py -------------------------------------------------------------------------------- /legacy_project/es_scrapper_docker_demo/README.md: -------------------------------------------------------------------------------- 1 | ### Quick start 2 | ```bash 3 | #docker build -t es_scrapper_docker_instance . 
&& docker run -it -t es_scrapper_docker_instance 4 | 5 | docker-compose -f docker-compose.yml up 6 | 7 | ``` 8 | 9 | ### Modify from 10 | - https://sysadmins.co.za/scraping-websites-with-python-and-beautiful-soup-and-ingesting-into-elasticsearch/ 11 | 12 | - https://sysadmins.co.za/building-a-search-engine-for-our-scraped-data-on-elasticsearch-part-2/ -------------------------------------------------------------------------------- /legacy_project/es_scrapper_docker_demo/app.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from elasticsearch import Elasticsearch 6 | 7 | # change to local host default ip 8 | #es_client = Elasticsearch(['http://localhost:9200']) 9 | es_client = Elasticsearch(['http://127.0.0.1:9200']) 10 | 11 | drop_index = es_client.indices.create(index='blog-sysadmins', ignore=400) 12 | create_index = es_client.indices.delete(index='blog-sysadmins', ignore=[400, 404]) 13 | 14 | def urlparser(title, url): 15 | # scrape title 16 | p = {} 17 | post = title 18 | page = requests.get(post).content 19 | soup = BeautifulSoup(page, 'lxml') 20 | title_name = soup.title.string 21 | 22 | # scrape tags 23 | tag_names = [] 24 | desc = soup.findAll(attrs={"property":"article:tag"}) 25 | for x in range(len(desc)): 26 | tag_names.append(desc[x-1]['content'].encode('utf-8')) 27 | 28 | # payload for elasticsearch 29 | doc = { 30 | 'date': time.strftime("%Y-%m-%d"), 31 | 'title': title_name, 32 | 'tags': tag_names, 33 | 'url': url 34 | } 35 | # ingest payload into elasticsearch 36 | res = es_client.index(index="blog-sysadmins", doc_type="docs", body=doc) 37 | time.sleep(0.5) 38 | 39 | sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml' 40 | page = requests.get(sitemap_feed) 41 | sitemap_index = BeautifulSoup(page.content, 'html.parser') 42 | urls = [element.text for element in sitemap_index.findAll('loc')] 43 | 44 | for i in range(3): 45 | for x in urls: 46 | print ('x :', x ) 47 | urlparser(x, x) 48 | time.sleep(5) 49 | -------------------------------------------------------------------------------- /legacy_project/es_scrapper_docker_demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | app: 4 | build: 5 | context: . 
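A remark on `app.py` above: the two `indices` calls are bound to swapped names (`drop_index` holds the `create` call, `create_index` holds the `delete`), and `create` runs before `delete`, so the freshly created index is dropped again before ingestion. A minimal sketch of the usual reset-then-ingest order with the same `elasticsearch` client; the index name and document shape follow the script, the document values are placeholders, and the `doc_type` argument matches the 6.x client used with the compose file's Elasticsearch 6.3.2 image.

```python
import time

from elasticsearch import Elasticsearch

es_client = Elasticsearch(["http://127.0.0.1:9200"])
index_name = "blog-sysadmins"

# drop any previous copy of the index, then recreate it empty
es_client.indices.delete(index=index_name, ignore=[400, 404])
es_client.indices.create(index=index_name, ignore=400)

doc = {
    "date": time.strftime("%Y-%m-%d"),
    "title": "example post",
    "tags": ["docker", "elasticsearch"],
    "url": "https://sysadmins.co.za/example",
}
es_client.index(index=index_name, doc_type="docs", body=doc)
```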
6 | dockerfile: Dockerfile 7 | networks: 8 | - docker-elk 9 | depends_on: 10 | - elasticsearch 11 | elasticsearch: 12 | image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2 13 | container_name: elasticsearch 14 | environment: 15 | - node.name=es01 16 | - cluster.name=docker-cluster 17 | - bootstrap.memory_lock=true 18 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 19 | ulimits: 20 | memlock: 21 | soft: -1 22 | hard: -1 23 | networks: 24 | - docker-elk 25 | privileged: true 26 | ports: 27 | - "9200:9200" 28 | - "9300:9300" 29 | kibana: 30 | image: docker.elastic.co/kibana/kibana:6.3.2 31 | container_name: kibana 32 | environment: 33 | SERVER_NAME: localhost 34 | ELASTICSEARCH_URL: http://elasticsearch:9200" 35 | networks: 36 | - docker-elk 37 | ports: 38 | - "5601:5601" 39 | depends_on: 40 | - elasticsearch 41 | networks: 42 | docker-elk: 43 | driver: bridge -------------------------------------------------------------------------------- /legacy_project/es_scrapper_docker_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | elasticsearch 3 | requests -------------------------------------------------------------------------------- /legacy_project/eztable/eztable_scarp.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time, re 3 | from bs4 import BeautifulSoup 4 | 5 | # grab "steak restaurant " 6 | url = 'https://tw.eztable.com/search?q=%E7%89%9B%E6%8E%92' 7 | # set up selenium driver (via firefox) 8 | driver = webdriver.Firefox() 9 | driver.implicitly_wait(3) 10 | driver.get(url) 11 | # set grab 50 pages 12 | for i in range(1,50): 13 | driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') 14 | ### wait 1 sec tull JS/ajax load the all contents 15 | time.sleep(1) 16 | 17 | # analysis web page generate by JS via BeautifulSoup 18 | soup = BeautifulSoup(driver.page_source, "html5lib") 19 | for block in soup.find_all('h5'): 20 | # print restaurant name 21 | print (block.text) 22 | -------------------------------------------------------------------------------- /legacy_project/eztable/eztable_scrap_dev.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time, re 3 | from bs4 import BeautifulSoup 4 | 5 | def enter_text(xpath, text, driver): 6 | textbox = driver.find_element_by_xpath(xpath) 7 | textbox.send_keys(text) 8 | 9 | # grab "steak restaurant " 10 | url = 'https://tw.eztable.com/' 11 | # set up selenium driver (via firefox) 12 | driver = webdriver.Firefox() 13 | driver.implicitly_wait(3) 14 | driver.get(url) 15 | 16 | xpath_ = ".//input[@class='search-input']" 17 | enter_text(xpath_, "japan" , driver) -------------------------------------------------------------------------------- /legacy_project/eztable/eztable_scrap_dev2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time, re 3 | from bs4 import BeautifulSoup 4 | from urllib.request import urlopen 5 | 6 | # grab "steak restaurant " 7 | url = 'http://www.google.com' 8 | # set up selenium driver (via firefox) 9 | driver = webdriver.Firefox() 10 | #driver.get(url) 11 | #element = driver.find_element_by_xpath("//input[@id='lst-ib']") 12 | #element.send_keys('abcde') 13 | 14 | keywords = ['sss','jp','usa'] 15 | keywords = ['日本'] 16 | 17 | for keyword in keywords: 18 | # access to website 19 | driver.get(url) 20 | 
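On the scroll loop in `eztable_scarp.py`: rather than always scrolling a fixed 50 times with a one-second sleep, a common pattern is to keep scrolling until `document.body.scrollHeight` stops growing. A rough sketch under the assumption that the page keeps appending results as you scroll; the two-second pause is a guess at the site's load time.

```python
import time

from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://tw.eztable.com/search?q=%E7%89%9B%E6%8E%92")

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the AJAX results time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # no new content appeared, stop scrolling
    last_height = new_height
```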
element = driver.find_element_by_xpath("//input[@id='lst-ib']") 21 | element.send_keys(keyword) 22 | button = driver.find_element_by_xpath("//div[@class='jsb']/center/input[1]") 23 | button.click() 24 | # analyze website elements 25 | current_url = driver.current_url 26 | page = urlopen(current_url) 27 | #html = driver.page_source 28 | #soup = BeautifulSoup(html) 29 | soup = BeautifulSoup(page, 'html.parser') 30 | for item in soup.find_all('b'): 31 | print (item.text) 32 | driver.implicitly_wait(10) 33 | #driver.get(url) -------------------------------------------------------------------------------- /legacy_project/eztable/eztable_scrap_inputword.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time, re 3 | from bs4 import BeautifulSoup 4 | from urllib.request import urlopen 5 | import time 6 | 7 | # grab "steak restaurant " 8 | url = 'https://tw.eztable.com/search?q=' 9 | # set up selenium driver (via firefox) 10 | driver = webdriver.Firefox() 11 | 12 | keywords = ['港式','牛排','中餐'] 13 | 14 | for keyword in keywords: 15 | # access to website 16 | driver.get(url) 17 | # xpath for input word 18 | element = driver.find_element_by_xpath("//input[@class='search-input']") 19 | element.send_keys(keyword) 20 | # xpath for search button 21 | button = driver.find_element_by_class_name("search-btn") 22 | button.click() 23 | # get current url 24 | print ('current_url : ', driver.current_url) 25 | driver.get(driver.current_url) 26 | # analyze html 27 | soup = BeautifulSoup(driver.page_source, "html5lib") 28 | for block in soup.find_all('h5'): 29 | print (block.text) 30 | time.sleep(3) 31 | -------------------------------------------------------------------------------- /legacy_project/eztable/geckodriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/eztable/geckodriver.log -------------------------------------------------------------------------------- /legacy_project/geojson.py: -------------------------------------------------------------------------------- 1 | toy_json = { 2 | "type": "MultiPolygon", 3 | "coordinates": [ 4 | 5 | [ 6 | [ [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], 7 | [100.0, 1.0], [100.0, 0.0] ] 8 | ], 9 | [ 10 | [ [200.0, 0.0], [201.0, 0.0], [201.0, 1.0], 11 | [200.0, 1.0], [200.0, 0.0] ] 12 | ] 13 | ] 14 | } 15 | 16 | 17 | def make_multiple_polygon(coordinates_set1,coordinates_set2): 18 | pass -------------------------------------------------------------------------------- /legacy_project/google_geodata/geopy_address_lon_lat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ref 3 | # https://pypi.python.org/pypi/geopy 4 | # todo : deal with geopy request limit 5 | # https://stackoverflow.com/questions/30108786/how-to-deal-with-geopys-query-limit 6 | # install Nominatim on server 7 | # https://wiki.openstreetmap.org/wiki/Nominatim/Installation 8 | # work around 9 | # https://www.shanelynn.ie/batch-geocoding-in-python-with-google-geocoding-api/ 10 | import numpy as np 11 | import time 12 | import requests 13 | import os 14 | from geopy.geocoders import Nominatim 15 | 16 | def address_2_lonlat(x): 17 | print (x) 18 | time.sleep(1) # let's see if sleep 1 per epoch is OK for limitation 19 | try: 20 | geolocator = Nominatim() 21 | location = geolocator.geocode(x) 22 | 
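On the geopy request-limit TODO noted at the top of `geopy_address_lon_lat.py`: newer geopy releases require a `user_agent` for Nominatim and ship a `RateLimiter` helper that spaces out calls and retries on transient errors. A small sketch under those assumptions; the one-second delay mirrors the `time.sleep(1)` used in the script, and Nominatim's own usage policy still applies.

```python
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# a user_agent string is mandatory in recent geopy releases
geolocator = Nominatim(user_agent="web_scraping_demo")

# wrap geocode so consecutive calls are at least one second apart
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=2)

location = geocode("10 Downing Street, London")
if location is not None:
    print(location.latitude, location.longitude)
```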
print(location.latitude, location.longitude) 23 | return [location.latitude, location.longitude] 24 | except Exception as e: 25 | print (e) 26 | print ('fail to convert address to lon & lat ') 27 | return [None,None] 28 | 29 | def address_2_lonlat_hack(x): 30 | """ 31 | in case frequent request would make script get block 32 | here is a mini hack : make script sleep until the API is able to response 33 | then do the run again 34 | """ 35 | print (x) 36 | time.sleep(1) # let's see if sleep 1 per epoch is OK for limitation 37 | try: 38 | geolocator = Nominatim() 39 | location = geolocator.geocode(x) 40 | print(location.latitude, location.longitude) 41 | return [location.latitude, location.longitude] 42 | except Exception as e: 43 | print (e) 44 | if str(e) == '[Errno 61] Connection refused': 45 | print ('meet API request limit, try again...') 46 | print ('sleep 1 min ...') 47 | time.sleep(60) 48 | address_2_lonlat_hack(x) 49 | else: 50 | print ('fail to convert address to lon & lat ') 51 | return [None,None] 52 | 53 | def split_lat(x): 54 | try: 55 | return x[0] 56 | except: 57 | return None 58 | 59 | def split_lon(x): 60 | try: 61 | return x[1] 62 | except: 63 | return None 64 | 65 | def run_hack(df): 66 | """ 67 | df : 68 | id, address zipcode , lat lon 69 | """ 70 | pass -------------------------------------------------------------------------------- /legacy_project/google_geodata/gmap_address_lon_lat.py: -------------------------------------------------------------------------------- 1 | import urllib.request, json 2 | import pandas as pd 3 | import numpy as np 4 | import requests 5 | import urllib, json 6 | import os 7 | # ref 8 | # https://developers.google.com/maps/documentation/geocoding/start 9 | 10 | gmap_api = os.environ['gmap_api'] 11 | print ('gmap_api : ' , gmap_api) 12 | 13 | 14 | def gmap_url(address_): 15 | address_fix = address_.replace(' ','+') 16 | print (address_fix) 17 | g_map_url='https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(address_fix,gmap_api) 18 | print (g_map_url) 19 | return g_map_url 20 | 21 | def address_2_lonlat(g_map_url): 22 | with urllib.request.urlopen(g_map_url) as url: 23 | try: 24 | data = json.loads(url.read().decode()) 25 | print(data) 26 | #data['results'][0]['geometry']['location'] 27 | return data 28 | except Exception as e: 29 | print (e) 30 | print ('fail to convert address to lon & lat ') 31 | return None 32 | 33 | def get_lon_lat(address_): 34 | g_map_url = gmap_url(address_) 35 | try: 36 | # to do : fix utf-8 python 3 encodin problem 37 | data = address_2_lonlat(g_map_url) 38 | result = data['results'][0]['geometry']['location'] 39 | except: 40 | result = None,None 41 | return result 42 | 43 | def split_lat(x): 44 | try: 45 | return str(x['lat']) 46 | except: 47 | return None 48 | 49 | def split_lon(x): 50 | try: 51 | return str(x['lng']) 52 | except: 53 | return None -------------------------------------------------------------------------------- /legacy_project/ipeen/README.md: -------------------------------------------------------------------------------- 1 | # web_scraping 2 | 3 | 4 | ### Tech 5 | 6 | - Python 3 7 | 8 | 9 | ### Quick start 10 | 11 | install git 12 | 13 | ``` 14 | https://git-scm.com/book/en/v2/Getting-Started-Installing-Git 15 | https://www.atlassian.com/git/tutorials/install-git 16 | ``` 17 | 18 | ``` 19 | $ git clone https://github.com/yennanliu/web_scraping 20 | $ cd web_scraping 21 | $ source setup.sh 22 | ``` 23 | 24 | scrap ipeen 25 | 26 | ``` 27 | python /ipeen/ipeen_grab.py 大安區 28 | 
``` 29 | or 30 | 31 | ``` 32 | python /ipeen/ipeen_grab.py your_area 33 | ``` 34 | 35 | 36 | ### Response 37 | 38 | ``` 39 | name address url style area 40 | 0 吉宏米粉湯(西門町店) 台北市萬華區昆明街140號1樓\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\... http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店 小吃 萬華區 41 | 1 瓦法奇朵Waffogato(台北車站店) 台北市中正區信陽街29號\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t... http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog... 異國料理 中正區 42 | ``` 43 | 44 | ``` 45 | please check saving csv 46 | 47 | ``` -------------------------------------------------------------------------------- /legacy_project/ipeen/ipeen_grab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import urllib 5 | #import simplejson 6 | from urllib.request import urlopen 7 | from urllib.parse import quote 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import urllib, json 11 | import pandas as pd, numpy as np 12 | import sys ,re,time 13 | # transform chinese into web url in python 3 14 | # https://stackoverflow.com/questions/1695183/how-to-percent-encode-url-parameters-in-python/13625238#13625238 15 | from urllib.parse import quote 16 | # parse parameter from command line to python 17 | import argparse 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('nums', nargs='*') 21 | #parser.add_argument("echo") 22 | args = parser.parse_args() 23 | #print(args.echo) 24 | print(args.nums) 25 | #print (quote(args.echo)) 26 | #area = quote(args.echo) 27 | search_string=" ".join(str(x) for x in args.nums) 28 | print (search_string) 29 | area = quote(search_string) 30 | 31 | print ('===========') 32 | 33 | def url_fix(x): 34 | return 'http://www.ipeen.com.tw' + x 35 | 36 | def parse_area(x): 37 | if x: 38 | return x[3:6] 39 | else: 40 | return '' 41 | 42 | def grab_raw(area): 43 | output = [[] for k in range(4)] 44 | for page in range(0,171): 45 | #url_='http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&p={}' 46 | # 0-100-0-0 : all , 1-100-0-0 : 美食 .. 
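To make the category segment mentioned in the comment above easier to change, one option is a small lookup table feeding the same URL template. A sketch under the assumption that only the two codes noted in the comment are needed; any other category codes would have to be read off the site.

```python
from urllib.parse import quote

# category slugs taken from the comment above
CATEGORY_CODES = {
    "all": "0-100-0-0",
    "美食": "1-100-0-0",
}

def build_search_url(area, category="美食", page=0):
    """Build an iPeen search URL for the given area, category and page."""
    code = CATEGORY_CODES[category]
    return (
        "http://www.ipeen.com.tw/search/all/000/{}/?&baragain=1&adkw={}&p={}"
        .format(code, quote(area), page)
    )

print(build_search_url("大安區", page=1))
```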
47 | url_='http://www.ipeen.com.tw/search/all/000/1-100-0-0/?&baragain=1&adkw={}&p={}' 48 | url_=url_.format(area,page) 49 | print (url_) 50 | opener=urllib.request.build_opener() 51 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 52 | page = opener.open(url_) 53 | soup = BeautifulSoup(page) 54 | for k in soup.find_all('a', attrs={'data-label': '店名'}): 55 | output[0].append(k.text) 56 | 57 | for k in soup.findAll('span',{"style":"padding-left:3em;"}): 58 | output[1].append(k.get_text()) 59 | 60 | for k in soup.find_all('a', {'class':"a37 ga_tracking"}): 61 | if "/shop/" in str(k['href']): 62 | output[2].append((k['href'])) 63 | for k in soup.find_all('a', attrs={'class': 'ga_tracking'}): 64 | if "大分類" in str(k): 65 | #print (k.text) 66 | output[3].append((k.text)) 67 | 68 | else: 69 | pass 70 | #time.sleep(1) 71 | print (output) 72 | return output 73 | 74 | def grab_df(): 75 | 76 | output = grab_raw(area) 77 | df = pd.DataFrame(output).T 78 | df.columns = ['name', 'address', 'url','style'] 79 | df.url = df.url.apply(lambda x :url_fix(x) ) 80 | df['area'] = df['address'].apply(lambda x :parse_area(x) ) 81 | print (df.head()) 82 | df.to_csv('ipeen_restaurant_板橋.csv') 83 | return df 84 | 85 | if __name__ == '__main__': 86 | grab_df() 87 | -------------------------------------------------------------------------------- /legacy_project/ipeen/ipeen_pivot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import numpy as np 5 | 6 | # read all *.csv in samle file once and merge into one csv 7 | 8 | def data_prepare(): 9 | #files = !ls *.csv # IPython magic 10 | # using glob get all csv name in same route as list 11 | files = glob.glob('./*.csv') 12 | df = pd.concat([pd.read_csv(f, index_col=0, header=None) for f in files], keys=files) 13 | df.columns = ['name','address','url','style','area'] 14 | df_ = df.ix[1:].reset_index() 15 | df_ = df_[['name','address','url','style','area']] 16 | #df_.head() 17 | return df_ 18 | 19 | def data_clean(df): 20 | # modify ur city list here 21 | area_ = ['萬華區'] 22 | df = df[area_].reset_index() 23 | return df 24 | 25 | # group by city name and rename columns 26 | df_ = data_prepare() 27 | df_inter = df_.groupby(['area','style']).count().reset_index()[['area','style','name']] 28 | df_inter.columns = [['area','style','count']] 29 | 30 | # to pivot table 31 | df_pivot = pd.pivot_table(df_inter, values='count', index=['area'],columns=['style'], aggfunc=np.sum).fillna(0).T 32 | df_pivot_ = data_clean(df_pivot) 33 | df_pivot.to_csv('df_pivot_final.csv') 34 | -------------------------------------------------------------------------------- /legacy_project/ipeen/ipeen_restaurant_grab_V2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 39, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# -*- coding: utf-8 -*-" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import urllib \n", 23 | "#import simplejson \n", 24 | "from urllib.request import urlopen\n", 25 | "import csv\n", 26 | "import requests\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "import lxml\n", 29 | "import urllib, json\n", 30 | "import pandas as pd, numpy as np\n", 31 | "import pprint\n", 32 | "import datetime as dt \n", 33 | "from urllib.parse 
import quote\n", 34 | "import sys \n", 35 | "#import urllib2\n", 36 | "import re\n", 37 | "import lxml" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 63, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# help function \n", 49 | "\n", 50 | "def url_fix(x):\n", 51 | " return 'http://www.ipeen.com.tw' + x\n", 52 | "\n", 53 | "def parse_area(x):\n", 54 | " return x[3:6]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 27, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "output = [[] for k in range(4)]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 76, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "'%E5%A4%A7%E5%AE%89%E5%8D%80'" 77 | ] 78 | }, 79 | "execution_count": 76, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "#parse chiese into code can integrate into url \n", 86 | "from urllib.parse import quote\n", 87 | "quote('大安區')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 49, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html5lib\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 100 | "\n", 101 | "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. 
To get rid of this warning, change code that looks like this:\n", 102 | "\n", 103 | " BeautifulSoup([your markup])\n", 104 | "\n", 105 | "to this:\n", 106 | "\n", 107 | " BeautifulSoup([your markup], \"html5lib\")\n", 108 | "\n", 109 | " markup_type=markup_type))\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "for page in range(1,5):\n", 115 | " #url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page)\n", 116 | " #print (url)\n", 117 | " url_='http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&bar={}'\n", 118 | " url_=url_.format(page)\n", 119 | " opener=urllib.request.build_opener()\n", 120 | " opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n", 121 | " page = opener.open(url_)\n", 122 | " soup = BeautifulSoup(page)\n", 123 | " for k in soup.find_all('a', attrs={'data-label': '店名'}):\n", 124 | " output[0].append(k.text)\n", 125 | "\n", 126 | " for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n", 127 | " output[1].append(k.get_text())\n", 128 | " \n", 129 | " for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n", 130 | " if \"/shop/\" in str(k['href']):\n", 131 | " output[2].append((k['href']))\n", 132 | " for k in soup.find_all('a', attrs={'class': 'ga_tracking'}):\n", 133 | " if \"大分類\" in str(k):\n", 134 | " #print (k.text)\n", 135 | " output[3].append((k.text))\n", 136 | " \n", 137 | " else:\n", 138 | " pass" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 52, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "'http://www.ipeen.com.tw/search/all/000/0-100-0-0/?adkw=%E5%A4%A7%E5%AE%89%E5%8D%80&bar=4'" 150 | ] 151 | }, 152 | "execution_count": 52, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "url_" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 50, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
" 219 | ], 220 | "text/plain": [ 221 | " name address \\\n", 222 | "0 吉宏米粉湯(西門町店) 台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 223 | "1 瓦法奇朵Waffogato(台北車站店) 台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t... \n", 224 | "2 添好運台灣 Timhowan Taiwan 台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 225 | "3 威靈頓街1號 粥麵茶餐廳 台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 226 | "4 劉山東小吃店 台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... \n", 227 | "\n", 228 | " url style \n", 229 | "0 http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店 小吃 \n", 230 | "1 http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog... 異國料理 \n", 231 | "2 http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... 中式料理 \n", 232 | "3 http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 中式料理 \n", 233 | "4 http://www.ipeen.com.tw/shop/44415-劉山東小吃店 中式料理 " 234 | ] 235 | }, 236 | "execution_count": 50, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df = pd.DataFrame(output).T\n", 243 | "df.columns = ['name', 'address', 'url','style']\n", 244 | "df.url = df.url.apply(lambda x :url_fix(x) )\n", 245 | "df.to_csv('ipeen_scrap.csv')\n", 246 | "df.head()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 79, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "#df.head(4)['address']" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 64, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "df['area'] = df['address'].apply(lambda x :parse_area(x) )" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 78, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/html": [ 277 | "
" 333 | ], 334 | "text/plain": [ 335 | " name address \\\n", 336 | "0 吉宏米粉湯(西門町店) 台北市萬華區昆明街140號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 337 | "1 瓦法奇朵Waffogato(台北車站店) 台北市中正區信陽街29號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t... \n", 338 | "2 添好運台灣 Timhowan Taiwan 台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 339 | "3 威靈頓街1號 粥麵茶餐廳 台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 340 | "4 劉山東小吃店 台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... \n", 341 | "\n", 342 | " url style area \n", 343 | "0 http://www.ipeen.com.tw/shop/1082330-吉宏米粉湯-西門町店 小吃 萬華區 \n", 344 | "1 http://www.ipeen.com.tw/shop/979300-瓦法奇朵Waffog... 異國料理 中正區 \n", 345 | "2 http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... 中式料理 中正區 \n", 346 | "3 http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 中式料理 中正區 \n", 347 | "4 http://www.ipeen.com.tw/shop/44415-劉山東小吃店 中式料理 中正區 " 348 | ] 349 | }, 350 | "execution_count": 78, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "df.head()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 77, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "#df.to_csv('ipeen_restaurant_0617.csv')" 368 | ] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.4.5" 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 2 392 | } 393 | -------------------------------------------------------------------------------- /legacy_project/ipeen/ipeen_restaurant_pivot_table.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# -*- coding: utf-8 -*-" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 123, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "import glob\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "#import matplotlib.pyplot as plt\n", 27 | "#%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 125, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# read all *.csv in samle file once and merge into one csv \n", 37 | "\n", 38 | "def data_prepare():\n", 39 | " files = !ls *.csv # IPython magic\n", 40 | " df = pd.concat([pd.read_csv(f, index_col=0, header=None) for f in files], keys=files)\n", 41 | " df.columns = ['name','address','url','style','area']\n", 42 | " df_ = df.ix[1:].reset_index()\n", 43 | " df_ = df_[['name','address','url','style','area']]\n", 44 | " #df_.head()\n", 45 | " return df_\n", 46 | "\n", 47 | "def data_clean(df):\n", 48 | " area_ = ['中和區', \n", 49 | " '中山區', \n", 50 | " '中正區',\n", 51 | " '信義區',\n", 52 | " '大同區', \n", 53 | " '大安區',\n", 54 | " '松山區', \n", 55 | " '板橋區', \n", 56 | " '永和區']\n", 57 | " df = df[area_].reset_index()\n", 58 | " return df " 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 72, 64 | "metadata": {}, 
65 | "outputs": [], 66 | "source": [ 67 | "df_= data_prepare()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 75, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/html": [ 78 | "
" 110 | ], 111 | "text/plain": [ 112 | " name address \\\n", 113 | "0 鹿兒島燒肉專賣店(中和中山店) 新北市中和區中山路二段28號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n... \n", 114 | "1 青禾幸福鍋物涮涮屋(永安店) 新北市中和區中和路380號2樓(永安市場捷運站旁)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 115 | "\n", 116 | " url style area \n", 117 | "0 http://www.ipeen.com.tw/shop/1128665-鹿兒島燒肉專賣店-... 燒烤類 中和區 \n", 118 | "1 http://www.ipeen.com.tw/shop/138215-青禾幸福鍋物涮涮屋-永安店 鍋類 中和區 " 119 | ] 120 | }, 121 | "execution_count": 75, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "df_.head(2)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 83, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/html": [ 138 | "
" 170 | ], 171 | "text/plain": [ 172 | " area style count\n", 173 | "0 \\t\\t\\t 冷凍/冷藏包裝食品 3\n", 174 | "1 \\t\\t\\t 常溫包裝食品 6\n", 175 | "2 \\t\\t\\t 網購包裝食品 12" 176 | ] 177 | }, 178 | "execution_count": 83, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "df_inter = df_.groupby(['area','style']).count().reset_index()[['area','style','name']]\n", 185 | "df_inter.columns = [['area','style','count']]\n", 186 | "df_inter.head(3)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 107, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "df_pivot = pd.pivot_table(df_inter, values='count', index=['area'],\n", 196 | " columns=['style'], aggfunc=np.sum).fillna(0).T" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 126, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df_pivot_ = data_clean(df_pivot)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 127, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
" 497 | ], 498 | "text/plain": [ 499 | "area style 中和區 中山區 中正區 信義區 大同區 大安區 松山區 板橋區 永和區\n", 500 | "0 buffet自助餐 1.0 18.0 6.0 15.0 6.0 9.0 8.0 6.0 2.0\n", 501 | "1 style 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n", 502 | "2 中式料理 376.0 851.0 667.0 524.0 285.0 907.0 618.0 462.0 286.0\n", 503 | "3 主題特色餐廳 20.0 160.0 63.0 101.0 46.0 218.0 73.0 54.0 36.0\n", 504 | "4 亞洲料理 76.0 130.0 117.0 153.0 31.0 243.0 112.0 78.0 50.0\n", 505 | "5 其他美食 46.0 44.0 68.0 32.0 14.0 54.0 40.0 46.0 28.0\n", 506 | "6 冰品、飲料、甜湯 99.0 193.0 218.0 195.0 131.0 315.0 173.0 187.0 103.0\n", 507 | "7 冷凍/冷藏包裝食品 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n", 508 | "8 咖啡、簡餐、茶 167.0 667.0 495.0 405.0 196.0 798.0 435.0 262.0 140.0\n", 509 | "9 小吃 307.0 352.0 420.0 274.0 335.0 525.0 384.0 487.0 236.0\n", 510 | "10 常溫包裝食品 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n", 511 | "11 日式料理 117.0 578.0 289.0 265.0 111.0 547.0 270.0 231.0 99.0\n", 512 | "12 早餐 73.0 69.0 63.0 66.0 21.0 75.0 59.0 142.0 48.0\n", 513 | "13 烘焙、甜點、零食 95.0 223.0 238.0 221.0 100.0 364.0 161.0 166.0 75.0\n", 514 | "14 燒烤類 74.0 187.0 75.0 82.0 23.0 182.0 96.0 107.0 55.0\n", 515 | "15 異國料理 109.0 328.0 245.0 341.0 80.0 688.0 299.0 259.0 94.0\n", 516 | "16 素食 23.0 71.0 50.0 52.0 14.0 62.0 45.0 39.0 23.0\n", 517 | "17 網購包裝食品 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n", 518 | "18 速食料理 29.0 52.0 63.0 47.0 24.0 73.0 57.0 47.0 25.0\n", 519 | "19 鍋類 113.0 205.0 108.0 117.0 47.0 227.0 143.0 158.0 108.0" 520 | ] 521 | }, 522 | "execution_count": 127, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "df_pivot_" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 128, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "#df_pivot.to_csv('df_final.csv')" 538 | ] 539 | } 540 | ], 541 | "metadata": { 542 | "kernelspec": { 543 | "display_name": "Python 3", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": "text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.4.5" 558 | } 559 | }, 560 | "nbformat": 4, 561 | "nbformat_minor": 2 562 | } 563 | -------------------------------------------------------------------------------- /legacy_project/ipeen/ipeen_scraping-final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "- ref http://stackoverflow.com/questions/40384289/web-scraping-python-extract-data-in-class-href-tag/40384398?noredirect=1#comment68021453_40384398\n", 8 | "- ref http://chrisalbon.com/python/beautiful_soup_html_basics.html\n", 9 | "- ref https://medium.com/dualcores-studio/python-x-%E7%B6%B2%E8%B7%AF%E7%88%AC%E8%9F%B2-c30ffda0ad78#.ruh8fs4v4" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import urllib \n", 21 | "#import simplejson \n", 22 | "import sys\n", 23 | "from urllib.request import urlopen\n", 24 | "import csv\n", 25 | "import requests\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "import lxml\n", 28 | "import urllib, json\n", 29 | "import pandas as pd, numpy as np\n", 30 | "import pprint\n", 31 | "import datetime as dt \n", 32 | "from urllib.parse import quote\n", 33 | "import sys \n", 34 | "#import 
urllib2\n", 35 | "import re\n", 36 | "import lxml" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stderr", 48 | "output_type": "stream", 49 | "text": [ 50 | "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 51 | "\n", 52 | "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. To get rid of this warning, change code that looks like this:\n", 53 | "\n", 54 | " BeautifulSoup([your markup])\n", 55 | "\n", 56 | "to this:\n", 57 | "\n", 58 | " BeautifulSoup([your markup], \"html.parser\")\n", 59 | "\n", 60 | " markup_type=markup_type))\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?adkw=%E5%8F%B0%E5%8C%97'\n", 66 | "opener=urllib.request.build_opener()\n", 67 | "opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n", 68 | "page = opener.open(url)\n", 69 | "soup = BeautifulSoup(page)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Restaurant name " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 39, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "黑潮市集花甲蟹鍋\n", 91 | "添好運台灣 Timhowan Taiwan\n", 92 | "威靈頓街1號 粥麵茶餐廳\n", 93 | "劉山東小吃店\n", 94 | "阜杭豆漿店\n", 95 | "點水樓(懷寧店)\n", 96 | "蘇杭餐廳(濟南店)\n", 97 | "北平田園餡餅粥\n", 98 | "小南門點心世界\n", 99 | "叁和院 台灣風格飲食\n", 100 | "小魏川菜餐廳\n", 101 | "123養生雞湯\n", 102 | "達人食社-嘉味水餃\n", 103 | "餃先生創意手工水餃\n", 104 | "小喬阿姨私房泡菜\n", 105 | "中式餐廳\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "for k in soup.find_all('a', attrs={'data-label': '店名'}):\n", 111 | " print (k.text)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "# Restaurant address" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 42, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "台北市大安區光復南路692巷6號\n", 133 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 134 | "\n", 135 | "台北市中正區忠孝西路一段36號1樓\n", 136 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 137 | "\n", 138 | "台北市中正區館前路12號5樓 (UNIQLO樓上)\n", 139 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 140 | "\n", 141 | "台北市中正區開封街一段14巷2號\n", 142 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 143 | "\n", 144 | "台北市中正區忠孝東路一段108號2樓\n", 145 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 146 | "\n", 147 | "台北市中正區懷寧街64號\n", 148 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 149 | "\n", 150 | "台北市中正區濟南路一段2-1號1樓\n", 151 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 152 | "\n", 153 | "台北市中正區重慶南路一段5巷1號\n", 154 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 155 | "\n", 156 | "台北市中正區北平西路3號(台北車站2樓)\n", 157 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 158 | "\n", 159 | "台北市大安區忠孝東路四段101巷14號\n", 160 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 161 | "\n", 162 | "台北市中正區公園路13號3樓\n", 163 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 164 | "\n", 165 | "台灣\n", 166 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 167 | "台灣\n", 168 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 169 | "台灣\n", 170 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 
171 | "台灣\n", 172 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 173 | "台灣\n", 174 | "\t\t\t\t\t\t\t\t\t\t\t\t\t\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n", 180 | " print (k.get_text())" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "# Restaurant ID" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 38, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "/shop/805942-黑潮市集花甲蟹鍋\n", 202 | "/shop/965236-添好運台灣-Timhowan-Taiwan\n", 203 | "/shop/156281-威靈頓街1號-粥麵茶餐廳\n", 204 | "/shop/44415-劉山東小吃店\n", 205 | "/shop/27702-阜杭豆漿店\n", 206 | "/shop/52988-點水樓-懷寧店\n", 207 | "/shop/22553-蘇杭餐廳-濟南店\n", 208 | "/shop/3800-北平田園餡餅粥\n", 209 | "/shop/39130-小南門點心世界\n", 210 | "/shop/941562-叁和院-台灣風格飲食\n", 211 | "/shop/6325-小魏川菜餐廳\n", 212 | "/shop/86651-123養生雞湯\n", 213 | "/shop/62767-達人食社-嘉味水餃\n", 214 | "/shop/583883-餃先生創意手工水餃\n", 215 | "/shop/711282-小喬阿姨私房泡菜\n", 216 | "/shop/593406-中式餐廳\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n", 222 | " \n", 223 | " if \"/shop/\" in str(k['href']):\n", 224 | " \n", 225 | " print (k['href'])\n", 226 | " else:\n", 227 | " pass" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "# Put all together " 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 82, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "output = [[] for k in range(3)]" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 50, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "for k in soup.find_all('a', attrs={'data-label': '店名'}):\n", 257 | " output[0].append(k.text)\n", 258 | "\n", 259 | "for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n", 260 | " output[1].append(k.get_text())\n", 261 | " \n", 262 | "\n", 263 | "for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n", 264 | " \n", 265 | " if \"/shop/\" in str(k['href']):\n", 266 | " output[2].append((k['href']))\n", 267 | " else:\n", 268 | " pass" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 65, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "def url_fix(x):\n", 280 | " return 'http://www.ipeen.com.tw' + x " 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 67, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "df = pd.DataFrame(output).T\n", 292 | "df.columns = ['name', 'address', 'url']\n", 293 | "df.url = df.url.apply(lambda x :url_fix(x) )" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 70, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
" 350 | ], 351 | "text/plain": [ 352 | " name address \\\n", 353 | "0 黑潮市集花甲蟹鍋 台北市大安區光復南路692巷6號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 354 | "1 添好運台灣 Timhowan Taiwan 台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 355 | "2 威靈頓街1號 粥麵茶餐廳 台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 356 | "3 劉山東小吃店 台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 357 | "4 阜杭豆漿店 台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... \n", 358 | "\n", 359 | " url \n", 360 | "0 http://www.ipeen.com.tw/shop/805942-黑潮市集花甲蟹鍋 \n", 361 | "1 http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... \n", 362 | "2 http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 \n", 363 | "3 http://www.ipeen.com.tw/shop/44415-劉山東小吃店 \n", 364 | "4 http://www.ipeen.com.tw/shop/27702-阜杭豆漿店 " 365 | ] 366 | }, 367 | "execution_count": 70, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "df.to_csv('ipeen_scrap.csv')\n", 374 | "df.head()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Loop over 5 pages " 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 86, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "output = [[] for k in range(3)]" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 87, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=1&adkw=%E5%8F%B0%E5%8C%97\n" 407 | ] 408 | }, 409 | { 410 | "name": "stderr", 411 | "output_type": "stream", 412 | "text": [ 413 | "//anaconda/envs/g_dash/lib/python3.4/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 414 | "\n", 415 | "The code that caused this warning is on line 170 of the file /anaconda/envs/g_dash/lib/python3.4/runpy.py. 
To get rid of this warning, change code that looks like this:\n", 416 | "\n", 417 | " BeautifulSoup([your markup])\n", 418 | "\n", 419 | "to this:\n", 420 | "\n", 421 | " BeautifulSoup([your markup], \"html.parser\")\n", 422 | "\n", 423 | " markup_type=markup_type))\n" 424 | ] 425 | }, 426 | { 427 | "name": "stdout", 428 | "output_type": "stream", 429 | "text": [ 430 | "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=2&adkw=%E5%8F%B0%E5%8C%97\n", 431 | "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=3&adkw=%E5%8F%B0%E5%8C%97\n", 432 | "http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p=4&adkw=%E5%8F%B0%E5%8C%97\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "for page in range(1,5):\n", 438 | " url ='http://www.ipeen.com.tw/search/all/000/0-100-0-0/%E4%B8%AD%E5%BC%8F/?p={}&adkw=%E5%8F%B0%E5%8C%97'.format(page)\n", 439 | " print (url)\n", 440 | " opener=urllib.request.build_opener()\n", 441 | " opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n", 442 | " page = opener.open(url)\n", 443 | " soup = BeautifulSoup(page)\n", 444 | " for k in soup.find_all('a', attrs={'data-label': '店名'}):\n", 445 | " output[0].append(k.text)\n", 446 | "\n", 447 | " for k in soup.findAll('span',{\"style\":\"padding-left:3em;\"}):\n", 448 | " output[1].append(k.get_text())\n", 449 | " \n", 450 | " for k in soup.find_all('a', {'class':\"a37 ga_tracking\"}):\n", 451 | " if \"/shop/\" in str(k['href']):\n", 452 | " output[2].append((k['href']))\n", 453 | " else:\n", 454 | " pass\n" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 89, 460 | "metadata": { 461 | "collapsed": false, 462 | "scrolled": false 463 | }, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/html": [ 468 | "
" 512 | ], 513 | "text/plain": [ 514 | " name address \\\n", 515 | "0 酒食坊 Pān-toh Bistro 台北市松山區光復北路7號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 516 | "1 添好運台灣 Timhowan Taiwan 台北市中正區忠孝西路一段36號1樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 517 | "2 威靈頓街1號 粥麵茶餐廳 台北市中正區館前路12號5樓 (UNIQLO樓上)\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\... \n", 518 | "3 劉山東小吃店 台北市中正區開封街一段14巷2號\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n \n", 519 | "4 阜杭豆漿店 台北市中正區忠孝東路一段108號2樓\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t... \n", 520 | "\n", 521 | " url \n", 522 | "0 http://www.ipeen.com.tw/shop/1046690-酒食坊-Pān-t... \n", 523 | "1 http://www.ipeen.com.tw/shop/965236-添好運台灣-Timh... \n", 524 | "2 http://www.ipeen.com.tw/shop/156281-威靈頓街1號-粥麵茶餐廳 \n", 525 | "3 http://www.ipeen.com.tw/shop/44415-劉山東小吃店 \n", 526 | "4 http://www.ipeen.com.tw/shop/27702-阜杭豆漿店 " 527 | ] 528 | }, 529 | "execution_count": 89, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "df = pd.DataFrame(output).T\n", 536 | "df.columns = ['name', 'address', 'url']\n", 537 | "df.url = df.url.apply(lambda x :url_fix(x) )\n", 538 | "df.to_csv('ipeen_scrap.csv')\n", 539 | "df.head()" 540 | ] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "Python 3", 546 | "language": "python", 547 | "name": "python3" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": { 551 | "name": "ipython", 552 | "version": 3 553 | }, 554 | "file_extension": ".py", 555 | "mimetype": "text/x-python", 556 | "name": "python", 557 | "nbconvert_exporter": "python", 558 | "pygments_lexer": "ipython3", 559 | "version": "3.4.5" 560 | } 561 | }, 562 | "nbformat": 4, 563 | "nbformat_minor": 1 564 | } 565 | -------------------------------------------------------------------------------- /legacy_project/script/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yennanliu/web_scraping/5ed0b340f114b14218c7e9c0c1d157551b9ff208/legacy_project/script/__init__.py -------------------------------------------------------------------------------- /legacy_project/script/utility_data_IO.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sqlalchemy import create_engine 3 | from pytz import timezone 4 | import datetime 5 | import os 6 | 7 | european = timezone('Europe/Madrid') 8 | now_tz = datetime.datetime.now(tz = european) 9 | now = now_tz.replace(tzinfo = None) 10 | now = now.replace(microsecond = 0) 11 | 12 | db_url = os.environ['db_url'] 13 | print ('db_url : ' , db_url) 14 | 15 | def write_data_to_db(df, table_name,db_url): 16 | try: 17 | # add insert time 18 | df["date_of_insert"] = now 19 | print ('=============') 20 | print (df.head()) 21 | print (table_name) 22 | print ('=============') 23 | engine = create_engine(db_url) 24 | conn = engine.connect() 25 | df.to_sql(name= table_name, con= engine, schema= 'rw', if_exists = "append", index = False) 26 | # close the connection after imput data 27 | conn.close() 28 | print("insert to DB ok") 29 | except Exception as e: 30 | print (e) 31 | print ('fail to write to db') 32 | -------------------------------------------------------------------------------- /legacy_project/script/utility_operation.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | class Login(): 4 | def user_login(self,driver): 5 | """ 6 | clear form before input id / password 7 | can 
find element by other ways : Xpath/CSS/parent.. 8 | """ 9 | # input user id 10 | driver.find_element_by_id("idinput").clear() 11 | driver.find_element_by_id("idinput").send_keys("user_name") 12 | # iiput password 13 | driver.find_element_by_id("pwdinput").clear() 14 | driver.find_element_by_id("pwdinput").send_keys("user_password") 15 | # click login button 16 | driver.find_element_by_id("loginbtn").click() 17 | print ('### login success ###') 18 | def user_logout(self,driver): 19 | driver.find_element_by_link_text("logout").click() 20 | driver.quit() 21 | print ('### log out success ###') -------------------------------------------------------------------------------- /legacy_project/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # set up env 4 | read -p "Please set your env name: " env_name 5 | echo 'your env name :' $env_name 6 | echo 'creating env....' 7 | conda create --name $env_name python=3 && 8 | 9 | # install library 10 | echo 'start install env.... ' 11 | source activate $env_name && pip install pandas urllib3 beautifulsoup4 12 | echo 'all env library installed successfully ! ' 13 | -------------------------------------------------------------------------------- /legacy_project/weather_scrapper/LDN_weather_scrapper_V1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime 4 | import urllib, json 5 | from bs4 import BeautifulSoup 6 | # UDF 7 | from script.utility_data_IO import * 8 | #from script.utility_operation import * 9 | #import utility_data_IO 10 | 11 | cols = ['Mean Temperature', 'Max Temperature', 'Min Temperature', 12 | 'Heating Degree Days', 'Dew Point', 'Average Humidity', 13 | 'Maximum Humidity', 'Minimum Humidity', 'Precipitation', 14 | 'Sea Level Pressure', 'Wind Speed', 'Max Wind Speed', 'Max Gust Speed', 15 | 'Visibility', 'Events', 'timestamp'] 16 | 17 | def col_fix(df): 18 | for col in cols: 19 | if col in df.columns: 20 | pass 21 | else: 22 | df[col] = None 23 | return df 24 | 25 | def main_(start_date,end_date): 26 | output=pd.DataFrame([]) 27 | # ------------- 28 | print ('-----------------') 29 | print ('start_date : ',start_date ) 30 | print ('end_date : ',end_date ) 31 | print ('-----------------') 32 | for day in pd.date_range(start=start_date, end=end_date, freq='D'): 33 | #for day in pd.date_range(start_date='3/1/2017', end_date='3/5/2017', freq='D'): 34 | print ((day)) 35 | date_ = str(day).split(' ')[0] 36 | year_ = date_.split('-')[0] 37 | month_ = date_.split('-')[1] 38 | day_ = date_.split('-')[2] 39 | # ------------- 40 | url_new = 'https://www.wunderground.com/history/airport/EGMC/{}/{}/{}/DailyHistory.html?cm_ven=localwx_history'.format(year_,month_,day_) 41 | print (url_new) 42 | 43 | # query the page 44 | opener=urllib.request.build_opener() 45 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 46 | page = opener.open(url_new) 47 | soup = BeautifulSoup(page) 48 | trs = soup.find_all('td', attrs={'class': 'indent'}) 49 | col=[] 50 | val=[] 51 | for tr in trs: 52 | if tr.text in cols: 53 | tds = tr.find_next_siblings("td") # you get list 54 | print (tr.text ) 55 | col.append(tr.text) 56 | print (tds[0].text) 57 | val.append(tds[0].text.strip('\n') 58 | .replace('\xa0','') 59 | .replace('°C','') 60 | .replace('mm','') 61 | .replace('hPa','') 62 | .replace('km/h\n ()','') 63 | .replace('km/h','') 64 | .replace('kilometers','') 65 | .replace('\n\t', '') 66 | .replace('\t', '') 67 | 
.replace('\n', '') 68 | .replace('- ()', '')) 69 | #.replace(' -', '')) 70 | else: 71 | col.append(tr.text) 72 | val.append(None) 73 | 74 | df = pd.DataFrame({'col':col,'val':val}).set_index('col').T.reset_index() 75 | df['timestamp'] = day 76 | del df['index'] 77 | df = col_fix(df) 78 | print ('df.columns : ' , df.columns ) 79 | print ('cols : ' , cols ) 80 | #df.columns = cols 81 | df = df[cols] 82 | ### update output dataframe 83 | output = output.append(df) 84 | output = output.reset_index() 85 | print (output) 86 | del output['index'] 87 | # fix column name 88 | output.columns = ['mean_temperature','max_temperature', 'min_temperature', 89 | 'heating_degree_days', 'dew_point', 'avg_humidity', 90 | 'max_humidity', 'min_humidity', 'precipitation', 91 | 'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed', 92 | 'visibility', 'events', 'timestamp'] 93 | # re-order columns 94 | output = output[['timestamp','mean_temperature','max_temperature', 'min_temperature', 95 | 'heating_degree_days', 'dew_point', 'avg_humidity','max_humidity', 'min_humidity', 'precipitation', 96 | 'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed', 97 | 'visibility','events']] 98 | # clean data 99 | output=output.replace(' -', np.nan) 100 | print (output) 101 | return output 102 | 103 | if __name__ == '__main__': 104 | df_ = main_('1/1/2016', '12/31/2017') 105 | # dump to DB 106 | write_data_to_db(df_, 'weather_ldn',db_url) 107 | -------------------------------------------------------------------------------- /legacy_project/weather_scrapper/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ### Quick start 5 | 6 | ```bash 7 | # get repo and install packages 8 | $ git clone https://github.com/yennanliu/web_scraping 9 | $ cd web_scraping 10 | # install library 11 | $ source setup.sh 12 | # run demo script 13 | # export pythonpath 14 | # export PYTHONPATH=/Users/yennanliu/web_scraping 15 | $ export PYTHONPATH=$(pwd) 16 | # export db_url 17 | $ export db_url= 18 | $ python weather_scrapper/br_weather_scrapper_V1.py 19 | 20 | ``` 21 | -------------------------------------------------------------------------------- /legacy_project/weather_scrapper/br_weather_scrapper_V1.py: -------------------------------------------------------------------------------- 1 | # ops 2 | import pandas as pd 3 | import numpy as np 4 | import datetime 5 | import urllib, json 6 | from bs4 import BeautifulSoup 7 | 8 | # UDF 9 | from script.utility_data_IO import * 10 | #from script.utility_operation import * 11 | #import utility_data_IO 12 | 13 | cols = ['Mean Temperature', 'Max Temperature', 'Min Temperature', 14 | 'Heating Degree Days', 'Dew Point', 'Average Humidity', 15 | 'Maximum Humidity', 'Minimum Humidity', 'Precipitation', 16 | 'Sea Level Pressure', 'Wind Speed', 'Max Wind Speed', 'Max Gust Speed', 17 | 'Visibility', 'Events', 'timestamp'] 18 | 19 | def col_fix(df): 20 | for col in cols: 21 | if col in df.columns: 22 | pass 23 | else: 24 | df[col] = None 25 | return df 26 | 27 | def main_(start_date,end_date): 28 | output=pd.DataFrame([]) 29 | # ------------- 30 | print ('-----------------') 31 | print ('start_date : ',start_date ) 32 | print ('end_date : ',end_date ) 33 | print ('-----------------') 34 | for day in pd.date_range(start=start_date, end=end_date, freq='D'): 35 | #for day in pd.date_range(start_date='3/1/2017', end_date='3/5/2017', freq='D'): 36 | print ((day)) 37 | date_ = str(day).split(' ')[0] 38 | year_ = date_.split('-')[0] 39 
| month_ = date_.split('-')[1] 40 | day_ = date_.split('-')[2] 41 | # ------------- 42 | url_new = 'https://www.wunderground.com/history/airport/EBFS/{}/{}/{}/DailyHistory.html?req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo='.format(year_,month_,day_) 43 | print (url_new) 44 | 45 | # query the page 46 | opener=urllib.request.build_opener() 47 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 48 | page = opener.open(url_new) 49 | soup = BeautifulSoup(page) 50 | trs = soup.find_all('td', attrs={'class': 'indent'}) 51 | col=[] 52 | val=[] 53 | for tr in trs: 54 | if tr.text in cols: 55 | tds = tr.find_next_siblings("td") # you get list 56 | print (tr.text ) 57 | col.append(tr.text) 58 | print (tds[0].text) 59 | val.append(tds[0].text.strip('\n') 60 | .replace('\xa0','') 61 | .replace('°C','') 62 | .replace('mm','') 63 | .replace('hPa','') 64 | .replace('km/h\n ()','') 65 | .replace('km/h','') 66 | .replace('kilometers','') 67 | .replace('\n\t', '') 68 | .replace('\t', '') 69 | .replace('\n', '') 70 | .replace('- ()', '')) 71 | #.replace('-', '')) 72 | else: 73 | col.append(tr.text) 74 | val.append(None) 75 | 76 | df = pd.DataFrame({'col':col,'val':val}).set_index('col').T.reset_index() 77 | df['timestamp'] = day 78 | del df['index'] 79 | df = col_fix(df) 80 | print ('df.columns : ' , df.columns ) 81 | print ('cols : ' , cols ) 82 | #df.columns = cols 83 | df = df[cols] 84 | ### update output dataframe 85 | output = output.append(df) 86 | output = output.reset_index() 87 | print (output) 88 | del output['index'] 89 | output.columns = ['mean_temperature','max_temperature', 'min_temperature', 90 | 'heating_degree_days', 'dew_point', 'avg_humidity', 91 | 'max_humidity', 'min_humidity', 'precipitation', 92 | 'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed', 93 | 'visibility', 'events', 'timestamp'] 94 | 95 | output = output[['timestamp','mean_temperature','max_temperature', 'min_temperature', 96 | 'heating_degree_days', 'dew_point', 'avg_humidity','max_humidity', 'min_humidity', 'precipitation', 97 | 'sea_level_pressure', 'wind_speed', 'max_wind_speed', 'max_gust_speed', 98 | 'visibility','events']] 99 | # clean data 100 | output=output.replace(' -', np.nan) 101 | """ 102 | # modify data type 103 | output['mean_temperature']=output['mean_temperature'].astype('float') 104 | output['max_temperature']=output['max_temperature'].astype('float') 105 | output['min_temperature']=output['min_temperature'].astype('float') 106 | output['heating_degree_days']=output['heating_degree_days'].astype('float') 107 | output['dew_point']=output['dew_point'].astype('float') 108 | output['avg_humidity']=output['avg_humidity'].astype('float') 109 | output['max_humidity']=output['max_humidity'].astype('float') 110 | output['min_humidity']=output['min_humidity'].astype('float') 111 | output['precipitation']=output['precipitation'].astype('float') 112 | output['sea_level_pressure']=output['sea_level_pressure'].astype('float') 113 | output['wind_speed']=output['wind_speed'].astype('float') 114 | output['max_wind_speed']=output['max_wind_speed'].astype('float') 115 | output['max_gust_speed']=output['max_gust_speed'].astype('float') 116 | output['events']=output['events'].astype('str') 117 | 118 | print (output.info()) 119 | """ 120 | print (output.head()) 121 | return output 122 | 123 | if __name__ == '__main__': 124 | df_ = main_('1/1/2018', '1/31/2018') 125 | #df_ = main_('2/1/2016', '2/10/2016') 126 | print (df_) 127 | # dump to DB 128 | write_data_to_db(df_, 
'weather_brussels',db_url) -------------------------------------------------------------------------------- /logs/log.txt: -------------------------------------------------------------------------------- 1 | Completed => New+York data+scientist 10 0 6.766853094100952 file_1 2019-08-04-17:45:15 2 | Completed => New+York data+engineer 10 0 3.0191662311553955 file_1 2019-08-31-14:43:31 3 | Completed => New+York data+engineer 20 10 6.937615156173706 file_1 2019-08-31-14:43:31 4 | Completed => New+York data+engineer 30 20 10.762848138809204 file_1 2019-08-31-14:43:31 5 | Completed => New+York data+engineer 40 30 14.53726601600647 file_1 2019-08-31-14:43:31 6 | Completed => New+York data+engineer 50 40 17.391534090042114 file_1 2019-08-31-14:43:31 7 | Completed => New+York data+engineer 60 50 20.694072008132935 file_1 2019-08-31-14:43:31 8 | Completed => New+York data+engineer 70 60 23.58491015434265 file_1 2019-08-31-14:43:31 9 | Completed => New+York data+engineer 80 70 26.531862020492554 file_1 2019-08-31-14:43:31 10 | Completed => New+York data+engineer 90 80 29.492170095443726 file_1 2019-08-31-14:43:31 11 | Completed => New+York data+engineer 100 90 32.42632722854614 file_1 2019-08-31-14:43:31 12 | Completed => New+York data+engineer 110 100 35.59654521942139 file_1 2019-08-31-14:43:31 13 | Completed => New+York data+engineer 120 110 38.5950391292572 file_1 2019-08-31-14:43:31 14 | Completed => New+York data+engineer 130 120 41.553301095962524 file_1 2019-08-31-14:43:31 15 | Completed => New+York data+engineer 140 130 44.89548707008362 file_1 2019-08-31-14:43:31 16 | Completed => New+York data+engineer 150 140 47.64503002166748 file_1 2019-08-31-14:43:31 17 | Completed => New+York data+engineer 160 150 50.49106001853943 file_1 2019-08-31-14:43:31 18 | Completed => New+York data+engineer 170 160 53.34058403968811 file_1 2019-08-31-14:43:31 19 | Completed => New+York data+engineer 180 170 56.251546144485474 file_1 2019-08-31-14:43:31 20 | Completed => New+York data+engineer 190 180 59.024163007736206 file_1 2019-08-31-14:43:31 21 | Completed => New+York data+engineer 200 190 61.882164001464844 file_1 2019-08-31-14:43:31 22 | Completed => New+York data+engineer 210 200 64.63623714447021 file_1 2019-08-31-14:43:31 23 | Completed => New+York data+engineer 220 210 67.43612313270569 file_1 2019-08-31-14:43:31 24 | Completed => New+York data+engineer 230 220 70.1706280708313 file_1 2019-08-31-14:43:31 25 | Completed => New+York data+engineer 240 230 73.02157711982727 file_1 2019-08-31-14:43:31 26 | Completed => New+York data+engineer 250 240 75.7884030342102 file_1 2019-08-31-14:43:31 27 | Completed => New+York data+engineer 260 250 78.51018619537354 file_1 2019-08-31-14:43:31 28 | Completed => New+York data+engineer 270 260 81.20932006835938 file_1 2019-08-31-14:43:31 29 | Completed => New+York data+engineer 280 270 84.06421709060669 file_1 2019-08-31-14:43:31 30 | Completed => New+York data+engineer 290 280 86.99712109565735 file_1 2019-08-31-14:43:31 31 | Completed => New+York data+engineer 300 290 89.98329901695251 file_1 2019-08-31-14:43:31 32 | Completed => Singapore data+engineer 10 0 1.6717309951782227 file_2 2019-08-31-14:43:31 33 | Completed => Singapore data+engineer 20 10 3.3881940841674805 file_2 2019-08-31-14:43:31 34 | Completed => Singapore data+engineer 30 20 5.25773811340332 file_2 2019-08-31-14:43:31 35 | Completed => Singapore data+engineer 40 30 6.985188007354736 file_2 2019-08-31-14:43:31 36 | Completed => Singapore data+engineer 50 40 8.669186115264893 file_2 
2019-08-31-14:43:31 37 | Completed => Singapore data+engineer 60 50 10.399969100952148 file_2 2019-08-31-14:43:31 38 | Completed => Singapore data+engineer 70 60 12.182042121887207 file_2 2019-08-31-14:43:31 39 | Completed => Singapore data+engineer 80 70 13.971599102020264 file_2 2019-08-31-14:43:31 40 | Completed => Singapore data+engineer 90 80 15.70194411277771 file_2 2019-08-31-14:43:31 41 | Completed => Singapore data+engineer 100 90 17.422276973724365 file_2 2019-08-31-14:43:31 42 | Completed => Singapore data+engineer 110 100 19.24924898147583 file_2 2019-08-31-14:43:31 43 | Completed => Singapore data+engineer 120 110 21.29511594772339 file_2 2019-08-31-14:43:31 44 | Completed => Singapore data+engineer 130 120 23.084742069244385 file_2 2019-08-31-14:43:31 45 | Completed => Singapore data+engineer 140 130 24.825146913528442 file_2 2019-08-31-14:43:31 46 | Completed => Singapore data+engineer 150 140 26.45589303970337 file_2 2019-08-31-14:43:31 47 | Completed => Singapore data+engineer 160 150 28.121127128601074 file_2 2019-08-31-14:43:31 48 | Completed => Singapore data+engineer 170 160 29.95150399208069 file_2 2019-08-31-14:43:31 49 | Completed => Singapore data+engineer 180 170 31.694020986557007 file_2 2019-08-31-14:43:31 50 | Completed => Singapore data+engineer 190 180 33.31903910636902 file_2 2019-08-31-14:43:31 51 | Completed => Singapore data+engineer 200 190 34.96339702606201 file_2 2019-08-31-14:43:31 52 | Completed => Singapore data+engineer 210 200 36.58315300941467 file_2 2019-08-31-14:43:31 53 | Completed => Singapore data+engineer 220 210 38.307523012161255 file_2 2019-08-31-14:43:31 54 | Completed => Singapore data+engineer 230 220 40.03498101234436 file_2 2019-08-31-14:43:31 55 | Completed => Singapore data+engineer 240 230 41.822685956954956 file_2 2019-08-31-14:43:31 56 | Completed => Singapore data+engineer 250 240 43.43740200996399 file_2 2019-08-31-14:43:31 57 | Completed => Singapore data+engineer 260 250 45.15139198303223 file_2 2019-08-31-14:43:31 58 | Completed => Singapore data+engineer 270 260 46.89502310752869 file_2 2019-08-31-14:43:31 59 | Completed => Singapore data+engineer 280 270 48.57830190658569 file_2 2019-08-31-14:43:31 60 | Completed => Singapore data+engineer 290 280 50.21092700958252 file_2 2019-08-31-14:43:31 61 | Completed => Singapore data+engineer 300 290 51.86261200904846 file_2 2019-08-31-14:43:31 62 | Completed => Tokyo data+engineer 10 0 1.824260950088501 file_3 2019-08-31-14:43:31 63 | Completed => Tokyo data+engineer 20 10 3.4073591232299805 file_3 2019-08-31-14:43:31 64 | Completed => Tokyo data+engineer 30 20 5.069762945175171 file_3 2019-08-31-14:43:31 65 | Completed => Tokyo data+engineer 40 30 6.628056049346924 file_3 2019-08-31-14:43:31 66 | Completed => Tokyo data+engineer 50 40 8.243564128875732 file_3 2019-08-31-14:43:31 67 | Completed => Tokyo data+engineer 60 50 9.836205959320068 file_3 2019-08-31-14:43:31 68 | Completed => Tokyo data+engineer 70 60 11.402924060821533 file_3 2019-08-31-14:43:31 69 | Completed => Tokyo data+engineer 80 70 12.990663051605225 file_3 2019-08-31-14:43:31 70 | Completed => Tokyo data+engineer 90 80 14.621431112289429 file_3 2019-08-31-14:43:31 71 | Completed => Tokyo data+engineer 100 90 16.241452932357788 file_3 2019-08-31-14:43:31 72 | Completed => Tokyo data+engineer 110 100 17.904778003692627 file_3 2019-08-31-14:43:31 73 | Completed => Tokyo data+engineer 120 110 19.482118129730225 file_3 2019-08-31-14:43:31 74 | Completed => Tokyo data+engineer 130 120 21.03421711921692 file_3 
2019-08-31-14:43:31 75 | Completed => Tokyo data+engineer 140 130 22.70096707344055 file_3 2019-08-31-14:43:31 76 | Completed => Tokyo data+engineer 150 140 24.55684208869934 file_3 2019-08-31-14:43:31 77 | Completed => Tokyo data+engineer 160 150 26.177658081054688 file_3 2019-08-31-14:43:31 78 | Completed => Tokyo data+engineer 170 160 27.839234113693237 file_3 2019-08-31-14:43:31 79 | Completed => Tokyo data+engineer 180 170 29.469682931900024 file_3 2019-08-31-14:43:31 80 | Completed => Tokyo data+engineer 190 180 31.041446924209595 file_3 2019-08-31-14:43:31 81 | Completed => Tokyo data+engineer 200 190 32.85800004005432 file_3 2019-08-31-14:43:31 82 | Completed => Tokyo data+engineer 210 200 34.42713212966919 file_3 2019-08-31-14:43:31 83 | Completed => Tokyo data+engineer 220 210 35.96600794792175 file_3 2019-08-31-14:43:31 84 | Completed => Tokyo data+engineer 230 220 37.524291038513184 file_3 2019-08-31-14:43:31 85 | Completed => Tokyo data+engineer 240 230 39.25459814071655 file_3 2019-08-31-14:43:31 86 | Completed => Tokyo data+engineer 250 240 41.38111710548401 file_3 2019-08-31-14:43:31 87 | Completed => Tokyo data+engineer 260 250 42.982495069503784 file_3 2019-08-31-14:43:31 88 | Completed => Tokyo data+engineer 270 260 44.96594715118408 file_3 2019-08-31-14:43:31 89 | Completed => Tokyo data+engineer 280 270 46.65275692939758 file_3 2019-08-31-14:43:31 90 | Completed => Tokyo data+engineer 290 280 48.339587926864624 file_3 2019-08-31-14:43:31 91 | Completed => Tokyo data+engineer 300 290 49.98532509803772 file_3 2019-08-31-14:43:31 92 | Completed => New+York data+engineer 10 0 2.8843870162963867 file_1 2019-09-05-19:11:36 93 | Completed => New+York machine+learning+engineer 10 0 2.7446670532226562 file_2 2019-09-05-19:11:36 94 | Completed => Singapore data+engineer 10 0 1.896003007888794 file_3 2019-09-05-19:11:36 95 | Completed => Singapore machine+learning+engineer 10 0 1.860619068145752 file_4 2019-09-05-19:11:36 96 | Completed => Tokyo data+engineer 10 0 1.6421759128570557 file_5 2019-09-05-19:11:36 97 | Completed => Tokyo machine+learning+engineer 10 0 1.54783296585083 file_6 2019-09-05-19:11:36 98 | Completed => New+York data+engineer 10 0 2.88240909576416 file_1 2019-09-05-19:13:34 99 | Completed => New+York machine+learning+engineer 10 0 2.709868907928467 file_2 2019-09-05-19:13:34 100 | Completed => Singapore data+engineer 10 0 1.807615041732788 file_3 2019-09-05-19:13:34 101 | Completed => Singapore machine+learning+engineer 10 0 1.8387038707733154 file_4 2019-09-05-19:13:34 102 | Completed => Tokyo data+engineer 10 0 1.774501085281372 file_5 2019-09-05-19:13:34 103 | Completed => Tokyo machine+learning+engineer 10 0 1.5109920501708984 file_6 2019-09-05-19:13:34 104 | Completed => New+York data+engineer 10 0 2.8072328567504883 file_1 2019-09-05-19:15:41 105 | Completed => New+York machine+learning+engineer 10 0 2.7286789417266846 file_2 2019-09-05-19:15:41 106 | Completed => Singapore data+engineer 10 0 1.6815550327301025 file_3 2019-09-05-19:15:41 107 | Completed => Singapore machine+learning+engineer 10 0 1.6483988761901855 file_4 2019-09-05-19:15:41 108 | Completed => Tokyo data+engineer 10 0 1.5782999992370605 file_5 2019-09-05-19:15:41 109 | Completed => Tokyo machine+learning+engineer 10 0 1.572638988494873 file_6 2019-09-05-19:15:41 110 | Completed => New+York data+engineer 10 0 2.9347329139709473 file_1 2019-09-05-19:16:35 111 | Completed => New+York machine+learning+engineer 10 0 3.2965481281280518 file_2 2019-09-05-19:16:35 112 | Completed => Singapore 
data+engineer 10 0 1.686279058456421 file_3 2019-09-05-19:16:35 113 | Completed => Singapore machine+learning+engineer 10 0 1.6427698135375977 file_4 2019-09-05-19:16:35 114 | Completed => Tokyo data+engineer 10 0 1.5504868030548096 file_5 2019-09-05-19:16:35 115 | Completed => Tokyo machine+learning+engineer 10 0 1.6476211547851562 file_6 2019-09-05-19:16:35 116 | Completed => New+York data+engineer 10 0 3.029778003692627 file_1 2019-09-05-19:21:41 117 | Completed => New+York machine+learning+engineer 10 0 2.7649660110473633 file_2 2019-09-05-19:21:41 118 | Completed => San+Francisco data+engineer 10 0 2.9623639583587646 file_3 2019-09-05-19:21:41 119 | Completed => San+Francisco machine+learning+engineer 10 0 2.8309409618377686 file_4 2019-09-05-19:21:41 120 | Completed => Singapore data+engineer 10 0 1.9062411785125732 file_5 2019-09-05-19:21:41 121 | Completed => Singapore machine+learning+engineer 10 0 2.7771799564361572 file_6 2019-09-05-19:21:41 122 | Completed => Tokyo data+engineer 10 0 2.502600908279419 file_7 2019-09-05-19:21:41 123 | Completed => Tokyo machine+learning+engineer 10 0 1.9742908477783203 file_8 2019-09-05-19:21:41 124 | Completed => New+York data+engineer 10 0 2.835904836654663 file_1 2019-09-05-19:30:26 125 | Completed => New+York data+engineer 20 10 5.764356851577759 file_1 2019-09-05-19:30:26 126 | Completed => New+York data+engineer 30 20 8.615731000900269 file_1 2019-09-05-19:30:26 127 | Completed => New+York data+engineer 40 30 11.705473899841309 file_1 2019-09-05-19:30:26 128 | Completed => New+York data+engineer 50 40 14.562924861907959 file_1 2019-09-05-19:30:26 129 | Completed => New+York machine+learning+engineer 10 0 2.725008010864258 file_2 2019-09-05-19:30:26 130 | Completed => New+York machine+learning+engineer 20 10 5.403141021728516 file_2 2019-09-05-19:30:26 131 | Completed => New+York machine+learning+engineer 30 20 8.16013503074646 file_2 2019-09-05-19:30:26 132 | Completed => New+York machine+learning+engineer 40 30 10.907695055007935 file_2 2019-09-05-19:30:26 133 | Completed => New+York machine+learning+engineer 50 40 13.59488296508789 file_2 2019-09-05-19:30:26 134 | Completed => New+York data+scientist 10 0 2.8523430824279785 file_3 2019-09-05-19:30:26 135 | Completed => New+York data+scientist 20 10 5.502207040786743 file_3 2019-09-05-19:30:26 136 | Completed => New+York data+scientist 30 20 8.322864055633545 file_3 2019-09-05-19:30:26 137 | Completed => New+York data+scientist 40 30 11.044306993484497 file_3 2019-09-05-19:30:26 138 | Completed => New+York data+scientist 50 40 13.77783203125 file_3 2019-09-05-19:30:26 139 | Completed => San+Francisco data+engineer 10 0 3.283784866333008 file_4 2019-09-05-19:30:26 140 | Completed => San+Francisco data+engineer 20 10 6.079689979553223 file_4 2019-09-05-19:30:26 141 | Completed => San+Francisco data+engineer 30 20 9.213353872299194 file_4 2019-09-05-19:30:26 142 | Completed => San+Francisco data+engineer 40 30 11.911937952041626 file_4 2019-09-05-19:30:26 143 | Completed => San+Francisco data+engineer 50 40 14.650459051132202 file_4 2019-09-05-19:30:26 144 | Completed => San+Francisco data+scientist 10 0 2.6817221641540527 file_6 2019-09-05-19:30:26 145 | Completed => San+Francisco data+scientist 20 10 5.355727195739746 file_6 2019-09-05-19:30:26 146 | Completed => San+Francisco data+scientist 30 20 8.048684120178223 file_6 2019-09-05-19:30:26 147 | Completed => San+Francisco data+scientist 40 30 10.72742509841919 file_6 2019-09-05-19:30:26 148 | Completed => San+Francisco data+scientist 50 
40 13.383225202560425 file_6 2019-09-05-19:30:26 149 | Completed => Singapore data+engineer 10 0 1.6111540794372559 file_7 2019-09-05-19:30:26 150 | Completed => Singapore data+engineer 20 10 3.2746400833129883 file_7 2019-09-05-19:30:26 151 | Completed => Singapore data+engineer 30 20 4.83650803565979 file_7 2019-09-05-19:30:26 152 | Completed => Singapore data+engineer 40 30 6.416918992996216 file_7 2019-09-05-19:30:26 153 | Completed => Singapore data+engineer 50 40 7.974658012390137 file_7 2019-09-05-19:30:26 154 | Completed => Singapore machine+learning+engineer 10 0 1.604966163635254 file_8 2019-09-05-19:30:26 155 | Completed => Singapore machine+learning+engineer 20 10 3.1582190990448 file_8 2019-09-05-19:30:26 156 | Completed => Singapore machine+learning+engineer 30 20 4.670161962509155 file_8 2019-09-05-19:30:26 157 | Completed => Singapore machine+learning+engineer 40 30 6.392184019088745 file_8 2019-09-05-19:30:26 158 | Completed => Singapore machine+learning+engineer 50 40 7.988824129104614 file_8 2019-09-05-19:30:26 159 | Completed => Singapore data+scientist 10 0 1.6494929790496826 file_9 2019-09-05-19:30:26 160 | Completed => Singapore data+scientist 20 10 3.2304630279541016 file_9 2019-09-05-19:30:26 161 | Completed => Singapore data+scientist 30 20 4.793152093887329 file_9 2019-09-05-19:30:26 162 | Completed => Singapore data+scientist 40 30 6.354517936706543 file_9 2019-09-05-19:30:26 163 | Completed => Singapore data+scientist 50 40 7.877964973449707 file_9 2019-09-05-19:30:26 164 | Completed => Tokyo data+engineer 10 0 1.583630084991455 file_10 2019-09-05-19:30:26 165 | Completed => Tokyo data+engineer 20 10 3.121217966079712 file_10 2019-09-05-19:30:26 166 | Completed => Tokyo data+engineer 30 20 4.6353020668029785 file_10 2019-09-05-19:30:26 167 | Completed => Tokyo data+engineer 40 30 6.194667100906372 file_10 2019-09-05-19:30:26 168 | Completed => Tokyo data+engineer 50 40 7.732399940490723 file_10 2019-09-05-19:30:26 169 | Completed => Tokyo machine+learning+engineer 10 0 1.5184919834136963 file_11 2019-09-05-19:30:26 170 | Completed => Tokyo machine+learning+engineer 20 10 3.030639886856079 file_11 2019-09-05-19:30:26 171 | Completed => Tokyo machine+learning+engineer 30 20 4.5386269092559814 file_11 2019-09-05-19:30:26 172 | Completed => Tokyo machine+learning+engineer 40 30 6.047660827636719 file_11 2019-09-05-19:30:26 173 | Completed => Tokyo machine+learning+engineer 50 40 7.574220895767212 file_11 2019-09-05-19:30:26 174 | Completed => Tokyo data+scientist 10 0 1.5887770652770996 file_12 2019-09-05-19:30:26 175 | Completed => Tokyo data+scientist 20 10 3.1191091537475586 file_12 2019-09-05-19:30:26 176 | Completed => Tokyo data+scientist 30 20 4.641216993331909 file_12 2019-09-05-19:30:26 177 | Completed => Tokyo data+scientist 40 30 6.14634895324707 file_12 2019-09-05-19:30:26 178 | Completed => Tokyo data+scientist 50 40 7.6380720138549805 file_12 2019-09-05-19:30:26 179 | Completed => New+York data+scientist 10 0 2.875648021697998 file_1 2020-01-03-10:16:24 180 | Completed => New+York data+scientist 20 10 5.757162094116211 file_1 2020-01-03-10:16:24 181 | Completed => New+York data+scientist 30 20 8.728279113769531 file_1 2020-01-03-10:16:24 182 | Completed => New+York data+scientist 40 30 11.757231950759888 file_1 2020-01-03-10:16:24 183 | Completed => New+York data+scientist 50 40 14.588212013244629 file_1 2020-01-03-10:16:24 184 | -------------------------------------------------------------------------------- /output/2019-08-14_jobs_1.txt: 
-------------------------------------------------------------------------------- 1 | ,unique_id,city,job_qry,job_title,company_name,location,summary,salary,link,date,full_text 2 | 1,p_baf234a5dd0cc155,New+York,data+scientist,Data Scientist,PepsiCo,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=baf234a5dd0cc155&fccid=2973259ddc967948&vjs=3,4 hours ago,NOT_FOUND 3 | 2,p_1942a17d2bd166e1,New+York,data+scientist,Jr. Data Scientist,Viacom,"New York, NY 10036",NOT_FOUND,NOT_FOUND,/rc/clk?jk=1942a17d2bd166e1&fccid=ae0c894528aa6eee&vjs=3,15 days ago,NOT_FOUND 4 | 3,p_fa03c1992457a2b9,New+York,data+scientist,Data Scientist,AETNA,"New York, NY 10016 (Gramercy area)",NOT_FOUND,NOT_FOUND,/rc/clk?jk=fa03c1992457a2b9&fccid=7077d7e88049c02a&vjs=3,3 days ago,NOT_FOUND 5 | 4,p_9c04b28a806b92ab,New+York,data+scientist,Data Scientist,Butterfly Network,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=9c04b28a806b92ab&fccid=f34adc12ba09e47f&vjs=3,1 day ago,NOT_FOUND 6 | 5,p_f6daa5cf3e224f5b,New+York,data+scientist,Data Scientist,"AbleTo, Inc.","New York, NY 10010 (Gramercy area)",NOT_FOUND,NOT_FOUND,/rc/clk?jk=f6daa5cf3e224f5b&fccid=954e57501f6bca1f&vjs=3,10 hours ago,NOT_FOUND 7 | 6,p_7a578b59b17acc55,New+York,data+scientist,Machine Learning Data Scientist,UBS,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=7a578b59b17acc55&fccid=1c76c3a36f6c7557&vjs=3,4 days ago,NOT_FOUND 8 | 7,p_62769c201dded401,New+York,data+scientist,Data Scientist,WW International (formerly Weight Watchers),"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=62769c201dded401&fccid=c29a08660ede9319&vjs=3,30+ days ago,NOT_FOUND 9 | 8,p_d19b36ecfcd4b8a7,New+York,data+scientist,Junior Data Scientist,"Remedy BPCI Partners, LLC.","New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=d19b36ecfcd4b8a7&fccid=9744e569304da4d3&vjs=3,13 days ago,NOT_FOUND 10 | 9,p_2646d9747e37d496,New+York,data+scientist,Junior Data Scientist,Remedy Partners,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=2646d9747e37d496&fccid=3f0ffcc867369bf0&vjs=3,13 days ago,NOT_FOUND 11 | 10,p_4f7dc642a14556eb,New+York,data+scientist,Data Scientist (NY),Debtsy,"New York, NY",NOT_FOUND,NOT_FOUND,/rc/clk?jk=4f7dc642a14556eb&fccid=ec3c520f9d48f531&vjs=3,26 days ago,NOT_FOUND 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | celery==5.2.2 2 | requests 3 | pymongo==3.4.0 4 | redis 5 | beautifulsoup4 6 | pandas 7 | lxml 8 | -------------------------------------------------------------------------------- /script/send_mail.py: -------------------------------------------------------------------------------- 1 | import os, smtplib 2 | 3 | class SendMail: 4 | 5 | def __init__(self): 6 | pass 7 | 8 | def send_mail(self, message, to, subject): 9 | # Gmail Sign In 10 | if not os.getenv('FLASK_ENV') == 'testing': 11 | gmail_sender = os.getenv("EMAIL") 12 | gmail_passwd = os.getenv("EMAIL_PASS") 13 | 14 | try: 15 | server = smtplib.SMTP('smtp.gmail.com', 587) 16 | server.ehlo() 17 | server.starttls() 18 | server.login(gmail_sender, gmail_passwd) 19 | 20 | BODY = '\r\n'.join(['To: %s' % to, 21 | 'From: %s' % gmail_sender, 22 | 'Subject: %s' % subject, 23 | '', message]) 24 | 25 | server.sendmail(gmail_sender, [to], BODY) 26 | res = "sent" 27 | except: 28 | res = "fail" 29 | server.quit() 30 | return res 31 | return True 32 | 33 | def main(message, to, subject): 34 | try: 35 | SendMail().send_mail(message, to, subject) 36 | print ('email sent!') 37 | except Exception as e: 38 | 
print ('email sending failed', e) 39 | 40 | if __name__ == '__main__': 41 | message='this is test msg' 42 | to='' 43 | subject='eamil from SendMail script' 44 | main(message, to, subject) -------------------------------------------------------------------------------- /slack_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########################################################################################## 4 | # MODIFY FROM https://github.com/guzzijason/slack-upload-bash/blob/master/slack-upload.sh 5 | ########################################################################################## 6 | 7 | # This bash script makes use of the Slack API to upload files. 8 | # I found this useful due to the fact that the attachement option 9 | # available in incoming webhooks seems to have an upper limit of 10 | # content size, which is way too small. 11 | # 12 | # See also: https://api.slack.com/methods/files.upload 13 | 14 | # safety first 15 | set -euf -o pipefail 16 | 17 | echo='echo -e' 18 | 19 | Usage() { 20 | ${echo} 21 | ${echo} "\tusage:\n\t\t$0 [OPTIONS]" 22 | ${echo} 23 | ${echo} "Required:" 24 | ${echo} " -c CHANNEL\tSlack channel to post to" 25 | ${echo} " -f FILENAME\tName of file to upload" 26 | ${echo} " -s SLACK_TOKEN\tAPI auth token" 27 | ${echo} 28 | ${echo} "Optional:" 29 | ${echo} " -u API_URL\tSlack API endpoint to use (default: https://slack.com/api/files.upload)" 30 | ${echo} " -h \tPrint help" 31 | ${echo} " -m TYPE\tFile type (see https://api.slack.com/types/file#file_types)" 32 | ${echo} " -n TITLE\tTitle for slack post" 33 | ${echo} " -v \tVerbose mode" 34 | ${echo} " -x COMMENT\tAdd a comment to the file" 35 | ${echo} 36 | exit ${1:-$USAGE} 37 | } 38 | 39 | # Exit Vars 40 | : ${HELP:=0} 41 | : ${USAGE:=1} 42 | 43 | # Default Vars 44 | API_URL='https://slack.com/api/files.upload' 45 | CURL_OPTS='-s' 46 | 47 | # main 48 | while getopts :c:f:s:u:hm:n:vx: OPT; do 49 | case ${OPT} in 50 | c) 51 | CHANNEL="$OPTARG" 52 | ;; 53 | f) 54 | FILENAME="$OPTARG" 55 | SHORT_FILENAME=$(basename ${FILENAME}) 56 | ;; 57 | s) 58 | SLACK_TOKEN="$OPTARG" 59 | ;; 60 | u) 61 | API_URL="$OPTARG" 62 | ;; 63 | h) 64 | Usage ${HELP} 65 | ;; 66 | m) 67 | CURL_OPTS="${CURL_OPTS} -F filetype=${OPTARG}" 68 | ;; 69 | n) 70 | CURL_OPTS="${CURL_OPTS} -F title='${OPTARG}'" 71 | ;; 72 | v) 73 | CURL_OPTS="${CURL_OPTS} -v" 74 | ;; 75 | x) 76 | CURL_OPTS="${CURL_OPTS} -F initial_comment='${OPTARG}'" 77 | ;; 78 | \?) 
79 | echo "Invalid option: -$OPTARG" >&2 80 | Usage ${USAGE} 81 | ;; 82 | esac 83 | done 84 | 85 | if [[ ( "${CHANNEL}" != "#"* ) && ( "${CHANNEL}" != "@"* ) ]]; then 86 | CHANNEL="#${CHANNEL}" 87 | fi 88 | 89 | # had to use eval to avoid strange whitespace behavior in options 90 | eval curl $CURL_OPTS \ 91 | --form-string channels=${CHANNEL} \ 92 | -F file=@${FILENAME} \ 93 | -F filename=${SHORT_FILENAME} \ 94 | -F token=${SLACK_TOKEN} \ 95 | ${API_URL} 96 | 97 | exit 0 -------------------------------------------------------------------------------- /tests/unit_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | sys.path.append(".") 4 | from bs4 import BeautifulSoup 5 | from celery_queue.IndeedScrapper.indeed_extract import (get_soup as get_soup_, 6 | extract_company as extract_company_, 7 | extract_salary as extract_salary_, 8 | extract_location as extract_location_, 9 | extract_job_title as extract_job_title_, 10 | extract_summary as extract_summary_, 11 | extract_link as extract_link_, 12 | extract_date as extract_date_, 13 | extract_fulltext as extract_fulltext_, 14 | write_logs as write_logs_, 15 | get_full_job_link as get_full_job_link_) 16 | 17 | 18 | with open('tests/unittest_data.txt', 'r') as file: 19 | html = file.read() 20 | 21 | def test_get_soup(): 22 | text = """
<html><body><p>Here's a paragraph of text!</p></body></html>
""" 23 | result = get_soup_(text) 24 | assert result.text.strip() == "Here's a paragraph of text!" 25 | 26 | def test_extract_company(): 27 | expected = '\\n\\nU3 INFOTECH PTE. LTD.' 28 | soup = BeautifulSoup(html) 29 | result = extract_company_(soup) 30 | assert result == expected 31 | 32 | def test_extract_salary(): 33 | expected = 'NOT_FOUND' 34 | soup = BeautifulSoup(html) 35 | result = extract_salary_(soup) 36 | assert result == expected 37 | 38 | def test_extract_location(): 39 | expected= 'Shenton Way' 40 | soup = BeautifulSoup(html) 41 | result = extract_location_(soup) 42 | assert result == expected 43 | 44 | def test_extract_job_title(): 45 | expected= 'NOT_FOUND' 46 | soup = BeautifulSoup(html) 47 | result = extract_job_title_(soup) 48 | assert result == expected 49 | 50 | def test_extract_summary(): 51 | expected= 'NOT_FOUND' 52 | soup = BeautifulSoup(html) 53 | result = extract_summary_(soup) 54 | assert result == expected 55 | 56 | def test_extract_link(): 57 | expected= 'NOT_FOUND' 58 | soup = BeautifulSoup(html) 59 | result = extract_link_(soup) 60 | assert result == expected 61 | 62 | def test_extract_date(): 63 | expected= '1 day ago' 64 | soup = BeautifulSoup(html) 65 | result = extract_date_(soup) 66 | assert result == expected 67 | 68 | def test_extract_fulltext(): 69 | expected= 'NOT_FOUND' 70 | soup = BeautifulSoup(html) 71 | result = extract_fulltext_(soup) 72 | assert result == expected 73 | 74 | def test_get_full_job_link_(): 75 | expected1 = 'https://www.indeed.com.sg/123' 76 | expected2 = 'https://jp.indeed.com/123' 77 | result1 = get_full_job_link_("123", city='Singapore') 78 | result2 = get_full_job_link_("123", city='Tokyo') 79 | assert result1 == expected1 80 | assert result2 == expected2 81 | 82 | 83 | if __name__ == '__main__': 84 | pytest.main([__file__]) 85 | -------------------------------------------------------------------------------- /tests/unit_test_celery.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | sys.path.append(".") 4 | sys.path.append("./celery_queue") 5 | import pytest 6 | from unittest.mock import patch 7 | from unittest import TestCase 8 | from celery import chain 9 | from celery_queue import tasks 10 | 11 | # Ref of celery mock unit test 12 | # - https://www.distributedpython.com/2018/05/15/testing-celery-chains/ 13 | # - https://www.distributedpython.com/2018/05/01/unit-testing-celery-tasks/ 14 | # - http://docs.celeryproject.org/en/latest/userguide/testing.html 15 | 16 | class TestAddTask(unittest.TestCase): 17 | 18 | def test_task_state_and_addition(self): 19 | 20 | task = tasks.add.apply(args=[3, 5]) 21 | self.assertEqual(task.status, "SUCCESS") 22 | self.assertEqual(task.result, 8) 23 | 24 | class TestMultiplyTask(unittest.TestCase): 25 | 26 | def test_task_state_and_multiply(self): 27 | 28 | task = tasks.multiply.apply(args=[3, 5]) 29 | self.assertEqual(task.status, "SUCCESS") 30 | self.assertEqual(task.result, 15) 31 | 32 | class TestScrapeTask(unittest.TestCase): 33 | 34 | def test_task_state_and_scrape(self): 35 | 36 | task = tasks.scrape.apply() 37 | self.assertEqual(task.status, "SUCCESS") 38 | self.assertEqual(type(task.result), str) 39 | 40 | class TestIndeedScrapTask(unittest.TestCase): 41 | 42 | def test_task_indeed_scrape(self): 43 | 44 | task = tasks.indeed_scrape.apply() 45 | self.assertEqual(task.status, "SUCCESS") 46 | self.assertEqual(type(task.result), type(None)) 47 | 48 | class TestIndeedScrapAPITask(unittest.TestCase): 49 | 50 | def 
test_task_indeed_scrape_api(self): 51 | 52 | task = tasks.indeed_scrape_api.apply(args=["Tokyo"]) 53 | self.assertEqual(task.status, "SUCCESS") 54 | self.assertEqual(type(task.result), type(None)) 55 | 56 | 57 | # class TestAddTask(unittest.TestCase): 58 | # 59 | # def setUp(self): 60 | # self.task = add.apply_async(args=[3, 5]) 61 | # self.results = self.task.get() 62 | # 63 | # def test_task_state(self): 64 | # self.assertEqual(self.task.state, "SUCCESS") 65 | # 66 | # def test_addition(self): 67 | # self.assertEqual(self.results, 8) 68 | # 69 | # class TestMultiplyTask(unittest.TestCase): 70 | # 71 | # def setUp(self): 72 | # self.task = multiply.apply_async(args=[3, 5]) 73 | # self.results = self.task.get() 74 | # 75 | # def test_task_state(self): 76 | # self.assertEqual(self.task.state, "SUCCESS") 77 | # 78 | # def test_multiplication(self): 79 | # self.assertEqual(self.results, 15) 80 | # 81 | # class TestScrapeTask(unittest.TestCase): 82 | # 83 | # def setUp(self): 84 | # self.task = scrape.apply_async() 85 | # self.results = self.task.get() 86 | # 87 | # def test_task_state(self): 88 | # self.assertEqual(self.task.state, "SUCCESS") 89 | # 90 | # def test_scraping(self): 91 | # self.assertEqual(type(self.results), str) 92 | 93 | if __name__ == '__main__': 94 | unittest.main() -------------------------------------------------------------------------------- /travis_push_github.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #################################################################### 3 | # MODIFY FROM https://gist.github.com/willprice/e07efd73fb7f13f917ea 4 | #################################################################### 5 | 6 | setup_git() { 7 | git init 8 | git config --global user.email "travis@travis-ci.org" 9 | git config --global user.name "Travis CI" 10 | } 11 | 12 | commit_website_files() { 13 | git checkout -b gh-pages 14 | git add . *.html 15 | git commit --message "Travis build: $TRAVIS_BUILD_NUMBER" 16 | } 17 | 18 | commit_output_file() { 19 | git status 20 | git add output/* 21 | git commit --m "Travis build : $TRAVIS_BUILD_NUMBER" 22 | } 23 | 24 | commit_new_output_file() { 25 | d=`date +%Y-%m-%d` && echo $d 26 | git status 27 | for file in "output"/* 28 | do 29 | if [[ "$file" == *"$d"* ]];then 30 | echo "no today's new file, nothing to commit" 31 | else 32 | echo "commit new file..." 33 | git add output/* 34 | git commit --m "Travis build : $TRAVIS_BUILD_NUMBER" 35 | fi 36 | done 37 | } 38 | 39 | upload_files() { 40 | echo 'Travis push to github' 41 | git push https://yennanliu:${GH_TOKEN}@${GH_REF} HEAD:master --quiet 42 | 43 | } 44 | 45 | GH_REF=github.com/yennanliu/web_scraping.git 46 | setup_git 47 | commit_new_output_file 48 | upload_files --------------------------------------------------------------------------------
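
The scraper's run log (logs/log.txt above) uses a fixed space-separated layout, which makes it easy to read back and sanity-check how long each Indeed query took. Below is a minimal sketch that parses those lines; the field names (city, job query, end/start result offsets, cumulative elapsed seconds, output file tag, run timestamp) are my reading of the sample entries rather than anything documented in the repo, so treat this as an illustration, not part of the project code.

# A minimal sketch (not part of the repo) for reading logs/log.txt back into
# structured records. Field meanings are inferred from sample lines such as
#   Completed => New+York data+engineer 20 10 6.93 file_1 2019-08-31-14:43:31
from collections import defaultdict


def parse_log_line(line):
    parts = line.split()
    # Expected shape: "Completed => <city> <query> <end> <start> <elapsed> <file_tag> <timestamp>"
    if len(parts) < 9 or parts[0] != "Completed":
        return None  # skip lines that do not match the expected layout
    return {
        "city": parts[2],                # e.g. "New+York"
        "job_qry": parts[3],             # e.g. "data+engineer"
        "end": int(parts[4]),            # result offset the batch stopped at (assumed)
        "start": int(parts[5]),          # result offset the batch started at (assumed)
        "elapsed_sec": float(parts[6]),  # appears to be cumulative seconds within a run
        "file_tag": parts[7],            # e.g. "file_1"
        "run_ts": parts[8],              # e.g. "2019-08-31-14:43:31"
    }


def run_durations(log_path="logs/log.txt"):
    """Largest elapsed value per (run timestamp, city, query) -- a rough proxy
    for how long each query took in a run, assuming the elapsed column is
    cumulative (it increases batch by batch in the sample log)."""
    longest = defaultdict(float)
    with open(log_path) as f:
        for line in f:
            rec = parse_log_line(line)
            if rec:
                key = (rec["run_ts"], rec["city"], rec["job_qry"])
                longest[key] = max(longest[key], rec["elapsed_sec"])
    return dict(longest)


if __name__ == "__main__":
    for (run_ts, city, qry), secs in sorted(run_durations().items()):
        print(f"{run_ts}  {city:15s} {qry:30s} {secs:7.2f}s")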
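
The job links stored in the output CSVs (e.g. output/2019-08-14_jobs_1.txt) are relative paths such as /rc/clk?jk=..., and tests/unit_test.py only pins the per-city Indeed domain for Singapore (https://www.indeed.com.sg) and Tokyo (https://jp.indeed.com). The sketch below rebuilds absolute URLs from a scraped CSV using those two mappings; the fallback domain for other cities (www.indeed.com) is an assumption on my part, not something the repo confirms.

# Minimal, assumption-laden sketch for turning the relative "link" column of a
# scraped CSV into absolute job URLs. The Singapore/Tokyo domains come from the
# expectations in tests/unit_test.py; DEFAULT_DOMAIN is a guess for other cities.
import csv

CITY_DOMAIN = {
    "Singapore": "https://www.indeed.com.sg",  # matches test_get_full_job_link_
    "Tokyo": "https://jp.indeed.com",          # matches test_get_full_job_link_
}
DEFAULT_DOMAIN = "https://www.indeed.com"      # assumed for the remaining cities


def absolute_links(csv_path="output/2019-08-14_jobs_1.txt"):
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            domain = CITY_DOMAIN.get(row["city"], DEFAULT_DOMAIN)
            yield row["job_title"], row["company_name"], domain + row["link"]


if __name__ == "__main__":
    for title, company, url in absolute_links():
        print(f"{title} | {company} | {url}")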