├── 2013301027_박춘소(데이터크롤링).pptx ├── README.md ├── term-project.py └── 결과 ├── CAD분야_베스트셀러_list.csv ├── IT일반분야_베스트셀러_list.csv ├── OA_사무자동화분야_베스트셀러_list.csv ├── OS분야_베스트셀러_list.csv ├── WordCloud.png ├── 개발방법론분야_베스트셀러_list.csv ├── 게임분야_베스트셀러_list.csv ├── 그래픽분야_베스트셀러_list.csv ├── 네트워크분야_베스트셀러_list.csv ├── 대학교재분야_베스트셀러_list.csv ├── 데이터베이스분야_베스트셀러_list.csv ├── 멀티미디어분야_베스트셀러_list.csv ├── 모바일프로그래밍분야_베스트셀러_list.csv ├── 보안_해킹분야_베스트셀러_list.csv ├── 웹사이트분야_베스트셀러_list.csv ├── 웹프로그래밍분야_베스트셀러_list.csv ├── 자격증_수험서분야_베스트셀러_list.csv ├── 전산통계_해석분야_베스트셀러_list.csv ├── 컴퓨터공학분야_베스트셀러_list.csv ├── 컴퓨터입문_활용분야_베스트셀러_list.csv └── 프로그래밍언어분야_베스트셀러_list.csv /2013301027_박춘소(데이터크롤링).pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/2013301027_박춘소(데이터크롤링).pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 데이터 크롤링 기말 프로젝트 2 | - term-project.py: 파이썬 코드 3 | > 실행시 NotoSansCJKkr-Medium.otf 폰트 pytagcloud에 넣고 시작해야합니다. 
"""Crawl best-seller lists from kyobobook.co.kr and summarise them.

For every category in ``category_dic`` the script pages through the
best-seller list with a Selenium-driven Chrome, stores one CSV file per
category, and draws a word cloud from the nouns found in the book titles.

Setup note (from the project README): the ``NotoSansCJKkr-Medium.otf`` font
has to be added to pytagcloud before running; the font is available from
https://www.google.com/get/noto/#/family/noto-sans-kore
"""
import os
import pickle
import random
import time
import webbrowser
from collections import Counter
from itertools import chain
from itertools import count

import pandas as pd

# Category id used in the site's ``linkClass`` query parameter -> label used
# in the output file names.
category_dic = {
    3301: '컴퓨터공학', 3302: 'IT일반', 3303: '컴퓨터입문_활용',
    3305: '전산통계_해석', 3307: 'OS', 3309: '네트워크',
    3310: '보안_해킹', 3311: '데이터베이스', 3312: '개발방법론',
    3313: '게임', 3314: '웹프로그래밍', 3315: '프로그래밍언어',
    3316: '모바일프로그래밍', 3317: 'OA_사무자동화', 3319: '웹사이트',
    3321: '그래픽', 3323: '멀티미디어', 3325: 'CAD',
    3328: '자격증_수험서', 3329: '대학교재',
}

# Crawl order. Derived from the mapping so the two can never drift apart
# (dicts keep insertion order, so this is the same order as before).
category_num = list(category_dic)

# Default chromedriver location (machine specific); override it through the
# ``driver_path`` argument of get_kobomungo_data().
CHROME_DRIVER_PATH = 'C:/Users/chunso/AppData/Local/Programs/Python/Python37/webDriver/chromedriver.exe'

KYOBO_CATEGORY_URL = "http://www.kyobobook.co.kr/categoryRenewal/categoryMain.laf?linkClass=%s&mallGb=KOR&orderClick=sgx"

# Maps both round brackets to a space in a single pass.
_BRACKET_TABLE = str.maketrans("()", "  ")


def r():
    """Return a random colour channel value in the range 0..255."""
    return random.randint(0, 255)


def color():
    """Return a random ``(red, green, blue)`` tuple."""
    return (r(), r(), r())


def _find(parent, name, css_class=None):
    """Return ``parent.find(...)``, or None when ``parent`` itself is None."""
    if parent is None:
        return None
    if css_class is None:
        return parent.find(name)
    return parent.find(name, attrs={'class': css_class})


def _tag_string(tag):
    """Return ``tag.string``, or None when the tag was not found."""
    return None if tag is None else tag.string


def _extract_book(book):
    """Turn one ``<li class="id_detailli">`` element into a CSV row.

    Returns ``[rank, title, author, publisher, summary]`` as plain strings.
    A field whose tag is missing becomes an empty string instead of raising
    AttributeError and aborting the whole crawl.
    """
    pub_info = _find(book, 'div', 'pub_info')
    rank = _tag_string(_find(_find(book, 'em', 'best_flag'), 'span'))
    title = _tag_string(_find(_find(book, 'div', 'title'), 'strong'))
    author = _tag_string(_find(pub_info, 'span', 'author'))
    publication = _tag_string(_find(pub_info, 'span', 'publication'))
    summary = _tag_string(_find(_find(book, 'div', 'info'), 'span'))
    return [
        # The rank is stored without bracket removal; str() detaches it from
        # the parsed document.
        '' if rank is None else str(rank),
        remove_bracket(title),
        remove_bracket(author),
        remove_bracket(publication),
        remove_bracket(summary),
    ]


def get_kobomungo_data(driver_path=CHROME_DRIVER_PATH):
    """Crawl every category in ``category_num``.

    Args:
        driver_path: location of the chromedriver executable.

    Returns:
        A list with one ``[category_id, rows]`` entry per category, where
        ``rows`` is a list of ``[rank, title, author, publisher, summary]``.
    """
    # Imported here so the pure helpers of this module stay importable on a
    # machine without a browser driver.
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException

    result = []
    wd = webdriver.Chrome(driver_path)
    try:
        for category_idx in category_num:
            wd.get(KYOBO_CATEGORY_URL % str(category_idx))
            print("Category Index [%s] Called" % (str(category_idx)))
            time.sleep(5)  # give the category page time to load
            rows = []
            previous_page = None

            for page_idx in count():
                try:
                    wd.execute_script("_go_targetPage('%s')" % str(page_idx + 1))
                    print("PageIndex [%s] Called" % (str(page_idx + 1)))
                except WebDriverException as e:
                    # A failing page switch is treated as "no more pages".
                    print("PageIndex [%s] Stopped: %s" % (str(page_idx + 1), e))
                    break

                time.sleep(3)  # give the page switch time to render
                soup = BeautifulSoup(wd.page_source, 'html.parser')
                page_rows = [
                    _extract_book(book)
                    for book in soup.findAll('li', attrs={'class': 'id_detailli'})
                ]
                # Without this guard the loop would never end if the site
                # accepts an out-of-range page number without a script error.
                if not page_rows or page_rows == previous_page:
                    break

                for row in page_rows:
                    print(row[0], row[1])
                rows.extend(page_rows)
                previous_page = page_rows
            result.append([category_idx, rows])
    finally:
        # Always close the browser, also when the crawl fails half-way.
        wd.quit()
    return result


def remove_bracket(s):
    """Return ``s`` as a string with ``(`` and ``)`` replaced by spaces.

    ``None`` (a field that was not found on the page) becomes an empty
    string rather than the text ``"None"``; any other non-string value is
    converted with ``str()`` first.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    return s.translate(_BRACKET_TABLE)


def abstract_title(book_list):
    """Collect the title column of every row in every category.

    Args:
        book_list: the structure returned by get_kobomungo_data().

    Returns:
        A flat list of titles in crawl order.
    """
    return [book[1] for category in book_list for book in category[1]]


def text_mining(title_list, ntags=50, multiplier=1):
    """Count the nouns in the titles and build the word-cloud tag list.

    Args:
        title_list: iterable of title strings.
        ntags: number of most frequent nouns to keep.
        multiplier: scale factor applied to every tag size.

    Returns:
        A list of ``{'color', 'tag', 'size'}`` dicts, most frequent noun
        first; ``size`` is ``int(occurrences * multiplier * 0.5)``.
    """
    from konlpy.tag import Hannanum

    h = Hannanum()
    # Blank titles (missing fields) carry no nouns; skip them rather than
    # handing an empty phrase to the analyser.
    nouns = chain.from_iterable(
        h.nouns(title) for title in title_list if title and title.strip()
    )
    noun_counts = Counter(nouns)

    return [
        {'color': color(), 'tag': noun, 'size': int(occurrences * multiplier * 0.5)}
        for noun, occurrences in noun_counts.most_common(ntags)
    ]


def draw_wordcloud(tags, filename, fontname='Noto Sans CJK', size1=(1300, 800)):
    """Render ``tags`` to the image ``filename`` and open it.

    Args:
        tags: tag dicts as produced by text_mining().
        filename: path of the image file to create.
        fontname: font name passed to pytagcloud.
        size1: image size passed to pytagcloud.
    """
    import pytagcloud

    pytagcloud.create_tag_image(tags, filename, fontname=fontname, size=size1)
    webbrowser.open(filename)


def save_data(book_list, output_dir='.'):
    """Write one cp949-encoded CSV file per crawled category.

    Args:
        book_list: the structure returned by get_kobomungo_data().
        output_dir: directory that receives the files (default: the current
            directory, as before).

    Raises:
        KeyError: if a category id is not present in ``category_dic``.
    """
    for category in book_list:
        category_name = category_dic[category[0]]
        book_table = pd.DataFrame(
            category[1], columns=['순위', '제목', '저자', '출판사', '요약']
        )
        file_name = "%s분야_베스트셀러_list.csv" % str(category_name)
        book_table.to_csv(
            os.path.join(output_dir, file_name),
            encoding="cp949", mode='w', index=False,
        )


def main():
    """Run the crawl, the title text mining and the result export in order."""
    # The third-party packages are imported inside the functions that use
    # them; import them once up front so that a missing package is reported
    # before the slow crawl starts instead of after it.
    import bs4  # noqa: F401
    import konlpy  # noqa: F401
    import pytagcloud  # noqa: F401
    import selenium  # noqa: F401

    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print('!!!!!!!!!!!!!!!!!!!!!PARKCHUNSO TERM PROJECT START!!!!!!!!!!!!!!!!!!!!!')
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    print('')
    print('CRAWLING START')
    book_list = get_kobomungo_data()
    print('CRAWLING FINISH')
    print('')

    print('')
    print('TEXTMINING START')
    title_list = abstract_title(book_list)
    draw_data = text_mining(title_list)
    print('TEXTMINING FINISH')
    print('')

    print('')
    print('RESULT SAVE START')
    draw_wordcloud(draw_data, "WordCloud.png")
    save_data(book_list)
    print('RESULT SAVE FINISH')
    print('')

    print('FINISHED')


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/OS분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/WordCloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/WordCloud.png -------------------------------------------------------------------------------- /결과/개발방법론분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/개발방법론분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/게임분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/게임분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/그래픽분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/그래픽분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/네트워크분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/네트워크분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/대학교재분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/대학교재분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/데이터베이스분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/데이터베이스분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/멀티미디어분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/멀티미디어분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/모바일프로그래밍분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/모바일프로그래밍분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/보안_해킹분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/보안_해킹분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/웹사이트분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/웹사이트분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/웹프로그래밍분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/웹프로그래밍분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/자격증_수험서분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/자격증_수험서분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/전산통계_해석분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/전산통계_해석분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/컴퓨터공학분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/컴퓨터공학분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/컴퓨터입문_활용분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/컴퓨터입문_활용분야_베스트셀러_list.csv -------------------------------------------------------------------------------- /결과/프로그래밍언어분야_베스트셀러_list.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParkChunSo/term-project-by-python/2bb74ecc8c0eed5fa20a4c9d534f1c83b74d0501/결과/프로그래밍언어분야_베스트셀러_list.csv --------------------------------------------------------------------------------