# Repository layout (2018_datayanolja_webcrawling)
#
# ├── 2018_datayanolja__more_effective_web_crawling.pdf
# ├── README.md
# └── source
#     ├── google_trend_realtime.py
#     ├── google_trend_realtime_webdriver.py
#     ├── time_test.py
#     ├── vogue_crawler.py
#     └── vogue_crawler_webdriver.py
#
# ------------------------------------------------------------------------------
# /2018_datayanolja__more_effective_web_crawling.pdf:
# ------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BbChip0103/2018_datayanolja_webcrawling/f4eb58d35f4862165f4facd77932e00f49cf94b5/2018_datayanolja__more_effective_web_crawling.pdf
#
# ------------------------------------------------------------------------------
# /README.md:
# ------------------------------------------------------------------------------
# # 2018_datayanolja_webcrawling
# ### Materials for the "Doing web crawling a bit better" session at
# ### 2018 Datayanolja.
# ### 'source/' holds the example sources that were tested in Case5.
#
# ### https://www.slideshare.net/wangwonLee/2018-datayanolja-moreeffectivewebcrawling
#
# ------------------------------------------------------------------------------
# /source/google_trend_realtime.py:
# ------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json

import time

TRENDS_API = 'https://trends.google.co.kr/trends/api'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# The original code notes that the summary endpoint accepts at most 40 ids.
MAX_SUMMARY_IDS = 40


def res_to_dict(text):
    """Decode a Trends API response body into a Python object.

    The body is JSON preceded by a short junk prefix such as ``)]}',``.
    Rather than assuming the prefix is exactly five characters long, parsing
    starts at the first ``{`` or ``[``, so bodies with a shorter prefix or
    with no prefix at all are decoded as well.

    Args:
        text: Raw response text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    starts = [pos for pos in (text.find('{'), text.find('[')) if pos != -1]
    # No bracket at all: fall back to the historical fixed-width strip.
    payload = text[min(starts):] if starts else text[5:]
    return json.loads(payload)


def utf8_to_euckr(unicode_string):
    """Return *unicode_string* with non-EUC-KR characters replaced by ``?``.

    The text is round-tripped through the ``euc-kr`` codec using the
    ``replace`` error handler.
    """
    return unicode_string.encode('euc-kr', 'replace').decode('euc-kr')


def _fetch_trends_json(url, params):
    """GET *url* with *params* and return the decoded JSON body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have the HTTP client installed.
    import requests

    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return res_to_dict(response.text)


def get_keyword_id_list(hl='ko', geo='US', category='all'):
    """Return the ``trendingStoryIds`` list of the realtime-trends endpoint.

    Args:
        hl: Value sent as the ``hl`` query parameter.
        geo: Value sent as the ``geo`` query parameter.
        category: Value sent as the ``cat`` query parameter.
    """
    query = {'hl': hl, 'geo': geo, 'cat': category, 'sort': '0',
             'tz': '-540', 'fi': '0', 'fs': '0', 'ri': '300', 'rs': '20'}
    keyword_dict = _fetch_trends_json(TRENDS_API + '/realtimetrends', query)
    return keyword_dict['trendingStoryIds']


def get_realtime_keyword_data(keyword):
    """Return the decoded ``stories/<keyword>`` response for one story id."""
    query = {'hl': 'ko', 'tz': '-540'}
    return _fetch_trends_json(TRENDS_API + '/stories/' + keyword, query)


def get_reltime_keword_summary(keyword_list):
    """Return the decoded ``stories/summary`` response for several ids.

    Args:
        keyword_list: Story ids; each one is sent as its own ``id``
            query parameter.
    """
    # A list of pairs keeps the repeated ``id`` keys (and their order) while
    # letting requests do the URL encoding.
    query = [('id', keyword) for keyword in keyword_list]
    query += [('hl', 'ko'), ('tz', '-540'), ('cat', 'all')]
    return _fetch_trends_json(TRENDS_API + '/stories/summary', query)


def google_trend_title_no_webdriver():
    """Collect the realtime trend titles using plain HTTP requests.

    Returns:
        A list of title strings, passed through :func:`utf8_to_euckr`.
    """
    id_list = get_keyword_id_list()

    id_list_chunked = [id_list[i:i + MAX_SUMMARY_IDS]
                       for i in range(0, len(id_list), MAX_SUMMARY_IDS)]

    # The chunks are independent, so this loop could be distributed.
    title_list = []
    for chunk in id_list_chunked:
        summary_dict = get_reltime_keword_summary(chunk)
        title_list += [utf8_to_euckr(story['title'])
                       for story in summary_dict['trendingStories']]

    return title_list


if __name__ == '__main__':
    google_trend_title_no_webdriver()

# ------------------------------------------------------------------------------
# /source/google_trend_realtime_webdriver.py:
# ------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time


def convert_utf8_to_euckr(unicode_string):
    """Return *unicode_string* with non-EUC-KR characters replaced by ``?``."""
    return unicode_string.encode('euc-kr', 'replace').decode('euc-kr')


def load_all_contents(driver, timeout=20):
    """Click the "load more" button until it stops appearing.

    Best effort: the loop ends quietly when the button is not found within
    *timeout* seconds, or when any other WebDriver error occurs (for example
    the click fails).

    Args:
        driver: A Selenium WebDriver positioned on the trends page.
        timeout: Seconds to wait for the button before giving up.
    """
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    button_present = EC.presence_of_element_located(
        (By.CLASS_NAME, 'feed-load-more-button'))
    wait = WebDriverWait(driver, timeout)
    try:
        while True:
            # until() hands back the located element, so no second lookup
            # is needed before clicking.
            wait.until(button_present).click()
            time.sleep(1)
    except WebDriverException:
        # TimeoutException is a WebDriverException too.  Unlike the former
        # bare ``except:``, KeyboardInterrupt and programming errors now
        # propagate.
        return


def google_trend_title_webdriver():
    """Collect the realtime trend titles by driving a Chrome browser.

    Returns:
        A list of title strings, passed through
        :func:`convert_utf8_to_euckr`.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.get("https://trends.google.com/trends/trendingsearches/realtime?geo=US&category=all")
        load_all_contents(driver)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

        tags = driver.find_elements(By.CLASS_NAME, 'details-top')
        return [convert_utf8_to_euckr(tag.text) for tag in tags]
    finally:
        # quit() also stops the chromedriver process, and the finally block
        # guarantees that even when scraping raises.
        driver.quit()


if __name__ == '__main__':
    google_trend_title_webdriver()

# ------------------------------------------------------------------------------
# /source/time_test.py:
# ------------------------------------------------------------------------------
import time
from functools import partial


def _format_hms(seconds):
    """Format a whole number of seconds as ``HH:MM:SS``."""
    return '{:02d}:{:02d}:{:02d}'.format(seconds // 3600,
                                         seconds % 3600 // 60,
                                         seconds % 60)


def check_time(func):
    """Run *func* once, print a preview of its result and the elapsed time.

    Args:
        func: Zero-argument callable returning a sliceable sequence.

    Returns:
        Elapsed time in whole seconds (truncated).
    """
    # perf_counter() is monotonic; time.time() can jump if the clock is set.
    start_time = time.perf_counter()
    result = func()
    e = int(time.perf_counter() - start_time)
    print(result[:5])
    print(_format_hms(e))
    return e


def _run_benchmark(label, func, test_cnt):
    """Time *func* *test_cnt* times and print the average under *label*."""
    if test_cnt < 1:
        raise ValueError('test_cnt must be at least 1')
    print('--{}--'.format(label))
    total_time = sum(check_time(func) for _ in range(test_cnt))
    average_sec = int(total_time / test_cnt)
    print('average_time:', _format_hms(average_sec), ',', average_sec, '[sec]')
    print()


def google_trend_time_test(test_cnt=1):
    """Benchmark the webdriver and the plain-HTTP Google Trends crawlers.

    Args:
        test_cnt: Number of runs averaged per crawler (at least 1).
    """
    # Imported on demand so that timing one site does not require the other
    # site's crawler dependencies.
    from google_trend_realtime_webdriver import google_trend_title_webdriver
    from google_trend_realtime import google_trend_title_no_webdriver

    _run_benchmark('google_trend_title_webdriver',
                   google_trend_title_webdriver, test_cnt)
    _run_benchmark('google_trend_title_no_webdriver',
                   google_trend_title_no_webdriver, test_cnt)


def vogue_korea_time_test(test_cnt=1, page_numb=1):
    """Benchmark the webdriver and the plain-HTTP Vogue Korea crawlers.

    Args:
        test_cnt: Number of runs averaged per crawler (at least 1).
        page_numb: Forwarded to both crawlers as ``page_numb``.
    """
    from vogue_crawler_webdriver import vogue_korea_title_webdriver
    from vogue_crawler import vogue_korea_title_no_webdriver

    vogue_webd = partial(vogue_korea_title_webdriver, page_numb=page_numb)
    vogue_no_webd = partial(vogue_korea_title_no_webdriver, page_numb=page_numb)

    _run_benchmark('vogue_korea_title_webdriver', vogue_webd, test_cnt)
    _run_benchmark('vogue_korea_title_no_webdriver', vogue_no_webd, test_cnt)


if __name__ == '__main__':
    google_trend_time_test(10)
    vogue_korea_time_test(10, 10)

# ------------------------------------------------------------------------------
# /source/vogue_crawler.py:
# ------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import sys
import time
import re

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Characters stripped from titles before the EUC-KR round trip.  Compiled
# once here instead of on every call.
_STRIP_PATTERN = re.compile('\xc2|\xa0')


def get_page(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported here so the text helpers below stay importable on machines
    # without the crawler's third-party dependencies.
    import requests
    from bs4 import BeautifulSoup

    headers = {'Content-Type': 'application/json; charset=utf-8',
               'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    try:
        r.raise_for_status()
        return BeautifulSoup(r.content, 'lxml', from_encoding='utf-8')
    finally:
        # Release the connection even when the status check or parsing fails.
        r.close()


def get_post_list(soup):
    """Return the post-title link elements selected from *soup*."""
    return soup.select('h2.entry-title.fusion-post-title > a')


def utf8_to_euckr(unicode_string):
    """Clean a title and replace non-EUC-KR characters with ``?``.

    U+00C2 and U+00A0 characters are removed first; the remainder is
    round-tripped through the ``euc-kr`` codec with the ``replace`` handler.
    """
    text = _STRIP_PATTERN.sub('', unicode_string)
    return text.encode('euc-kr', 'replace').decode('euc-kr')


def vogue_korea_title_no_webdriver(page_numb):
    """Collect post titles from the first *page_numb* fashion pages.

    Args:
        page_numb: Number of listing pages to fetch, starting at page 1.
            Zero or less fetches nothing.

    Returns:
        A list of title strings, passed through :func:`utf8_to_euckr`.
    """
    base_url = 'http://www.vogue.co.kr/category/fashion/page/{}/?noCache'

    # The pages are independent, so this loop could be distributed.
    title_list = []
    for page in range(1, page_numb + 1):
        soup = get_page(base_url.format(page))
        title_list += [utf8_to_euckr(post.get_text(strip=True))
                       for post in get_post_list(soup)]

    return title_list


if __name__ == '__main__':
    page_numb = 10
    vogue_korea_title_no_webdriver(page_numb)

# ------------------------------------------------------------------------------
# /source/vogue_crawler_webdriver.py:
# ------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time

# The crawler counts and slices posts in multiples of 12 per page.
POSTS_PER_PAGE = 12
POST_LINK_SELECTOR = 'h2.entry-title.fusion-post-title > a'
SCROLL_TRIGGER_CLASS = 'fusion-infinite-scroll-trigger'
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight)"


def convert_utf8_to_euckr(unicode_string):
    """Return *unicode_string* with non-EUC-KR characters replaced by ``?``."""
    return unicode_string.encode('euc-kr', 'replace').decode('euc-kr')


def load_all_contents(driver, timeout=20):
    """Keep scrolling to the bottom while the scroll trigger is present.

    Best effort: returns after printing a message when the trigger element
    is not found within *timeout* seconds or another WebDriver error occurs.

    Args:
        driver: A Selenium WebDriver positioned on the listing page.
        timeout: Seconds to wait for the trigger element on each pass.
    """
    from selenium.common.exceptions import TimeoutException
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    trigger_present = EC.presence_of_element_located(
        (By.CLASS_NAME, SCROLL_TRIGGER_CLASS))
    wait = WebDriverWait(driver, timeout)
    try:
        while True:
            wait.until(trigger_present)
            driver.execute_script(SCROLL_TO_BOTTOM_JS)
            time.sleep(1)
    except TimeoutException:
        print ("Timed out waiting for page to load")
        return
    except WebDriverException:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # programming errors are no longer swallowed.
        print('???')
        return


def load_contents_upto_pagenumb(driver, page_numb, timeout=20):
    """Scroll until more than ``12 * page_numb`` posts are loaded.

    Also stops when the post count has not changed for *timeout* consecutive
    one-second polls, so a listing with fewer posts than requested no longer
    loops forever.

    Args:
        driver: A Selenium WebDriver positioned on the listing page.
        page_numb: Number of 12-post pages wanted.
        timeout: Seconds to wait for the trigger element, number of stalled
            polls tolerated, and length of the final pause.
    """
    from selenium.common.exceptions import TimeoutException
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    trigger_present = EC.presence_of_element_located(
        (By.CLASS_NAME, SCROLL_TRIGGER_CLASS))
    wait = WebDriverWait(driver, timeout)
    wanted = POSTS_PER_PAGE * page_numb
    previous_cnt = -1
    stalled_polls = 0
    try:
        while True:
            wait.until(trigger_present)
            driver.execute_script(SCROLL_TO_BOTTOM_JS)

            post_cnt = len(driver.find_elements(By.CSS_SELECTOR,
                                                POST_LINK_SELECTOR))
            if post_cnt > wanted:
                # NOTE(review): the original pauses a full `timeout` seconds
                # here, presumably to let the last batch finish rendering;
                # confirm it is intended, since it adds to every timed run.
                time.sleep(timeout)
                return

            if post_cnt == previous_cnt:
                stalled_polls += 1
                if stalled_polls >= timeout:
                    print ("Timed out waiting for page to load")
                    return
            else:
                previous_cnt = post_cnt
                stalled_polls = 0

            time.sleep(1)
    except TimeoutException:
        print ("Timed out waiting for page to load")
        return
    except WebDriverException:
        print('???')
        return


def vogue_korea_title_webdriver(page_numb=1):
    """Collect post titles by driving a Chrome browser.

    Args:
        page_numb: Number of 12-post pages wanted.

    Returns:
        At most ``12 * page_numb`` title strings, passed through
        :func:`convert_utf8_to_euckr`.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.get("http://www.vogue.co.kr/category/fashion/")
        load_contents_upto_pagenumb(driver, page_numb)

        tags = driver.find_elements(By.CSS_SELECTOR, POST_LINK_SELECTOR)
        # Slice before reading .text so surplus posts are never queried.
        wanted_tags = tags[:(POSTS_PER_PAGE * page_numb)]
        return [convert_utf8_to_euckr(tag.text) for tag in wanted_tags]
    finally:
        # quit() also stops the chromedriver process, and the finally block
        # guarantees that even when scraping raises.
        driver.quit()


if __name__ == '__main__':
    # The original called vogue_korea_title_no_webdriver, which is not
    # defined in this module and raised NameError.
    vogue_korea_title_webdriver(10)
# ------------------------------------------------------------------------------