├── .gitignore
├── README.md
├── crawling_final_kimyunsang_presentation.pptx
├── ko_stopwords.csv
├── naver_web_crawler.py
├── resultData
    ├── naver_it_20191216.png
    ├── naver_it_20191217.png
    ├── naver_politics_20191216.png
    ├── naver_society_20191216.png
    └── naver_world_20191216.png
└── test.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 | 
3 | .idea
4 | venv
5 | test.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/README.md


--------------------------------------------------------------------------------
/crawling_final_kimyunsang_presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/crawling_final_kimyunsang_presentation.pptx


--------------------------------------------------------------------------------
/ko_stopwords.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/ko_stopwords.csv


--------------------------------------------------------------------------------
/naver_web_crawler.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | import urllib.request
  3 | import random
  4 | import webbrowser
  5 | import sys
  6 | from bs4 import BeautifulSoup
  7 | import pandas as pd
  8 | import datetime
  9 | from konlpy.tag import Okt
 10 | from collections import Counter
 11 | import pytagcloud
 12 | from datetime import date
 13 | 
 14 | # 랜덤한 색을 지정하기 위한 함수
 15 | r = lambda: random.randint(0, 255)
 16 | color = lambda: (r(), r(), r())
 17 | 
 18 | # argv로 받은 섹션값을 코드로 변환하기 위한 딕셔너리
 19 | # 정치, 경제,  사회,  문화,  세계,  IT
 20 | section_dic = {'politics': 100, 'economy': 101, 'society': 102, 'culture': 103, 'world': 104, 'it': 105}
 21 | 
 22 | # 섹션 코드 지정
 23 | if len(sys.argv) > 1:
 24 |     section_string = sys.argv[1]
 25 | else:
 26 |     section_string = 'it'
 27 | section = section_dic[section_string]
 28 | 
 29 | # 날짜 지정
 30 | today = date.today()
 31 | if len(sys.argv) > 2:
 32 |     date = sys.argv[2]
 33 | else:
 34 |     date = today.strftime("%Y%m%d")
 35 | 
 36 | # URL에 접근해서 태그들 가져오는 함수
 37 | def get_request_url(url, enc='euc-kr'):
 38 |     # 리퀘스트 객체 생성
 39 |     req = urllib.request.Request(url)
 40 |     try:
 41 |         # url 접근
 42 |         response = urllib.request.urlopen(req)
 43 |         if response.getcode() == 200:
 44 |             try:
 45 |                 # 데이터 가져와서 디코딩
 46 |                 rcv = response.read()
 47 |                 ret = rcv.decode(enc)
 48 |             except UnicodeDecodeError:
 49 |                 ret = rcv.decode(enc, 'replace')
 50 |             return ret
 51 | 
 52 |     # 에러 발생 시 로그 생성
 53 |     except Exception as e:
 54 |         print(e)
 55 |         print("[%s] Error for URL : %s" % (datetime.datetime.now(), url))
 56 |         return None
 57 | 
 58 | # 크롤링 한 뉴스 데이터에서 명사를 추출하고
 59 | # 나온 횟수 구하는 함수
 60 | def get_tags(text, ntags=50):
 61 |     # Okt 객체 생성
 62 |     spliter = Okt()
 63 |     # 명사 추출
 64 |     nouns = spliter.nouns(text)
 65 |     # 명사 빈도수 구하기
 66 |     count = Counter(nouns)
 67 | 
 68 |     # 명사와 빈도수, 색을 딕셔너리 객체에 저장
 69 |     return_list = []
 70 |     for n, c in count.most_common(ntags):
 71 |         temp = {'color': color(), 'tag': n, 'size': c}
 72 |         return_list.append(temp)
 73 | 
 74 |     return return_list
 75 | 
# Data visualisation
def draw_cloud(tags, filename, fontname='Noto Sans CJK', size=(800, 600)):
    """Write a tag-cloud image for *tags* to *filename*, then open it.

    Args:
        tags: Tag entries passed unchanged to ``pytagcloud.create_tag_image``
            (presumably the colour/tag/size dictionaries built by
            ``get_tags`` -- confirm against caller).
        filename: Output path of the image; the same value is then handed to
            ``webbrowser.open``.
        fontname: Font name forwarded to pytagcloud.
        size: Size tuple forwarded to pytagcloud (presumably width and height
            in pixels -- TODO confirm).
    """
    # Render the image file.
    pytagcloud.create_tag_image(tags, filename, fontname=fontname, size=size)
    # Open the generated file.
    webbrowser.open(filename)
 82 | 
 83 | # 명사 추출 데이터에서 불용어 삭제하기
 84 | def remove_stopword(tags):
 85 |     # 불용어 데이터 가져오기
 86 |     ko_stopwords = pd.read_csv('./ko_stopwords.csv', encoding='cp949', engine='python')
 87 |     # 불용어 데이터 리스트 변환
 88 |     ko_stopwords = ko_stopwords.STOPWORDS.tolist()
 89 |     # print(str(ko_stopwords))
 90 | 
 91 |     # 태그들 리스트에서 불용어 제거
 92 |     for w in ko_stopwords:
 93 |         for item in tags:
 94 |             if item['tag'] == w:
 95 |                 tags.remove(item)
 96 | 
 97 |     return tags
 98 | 
 99 | # 데이터 크롤링
100 | def getNews():
101 |     # 네이버 뉴스 url
102 |     naver_url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=%s&date=%s' \
103 |                 % (section, date)
104 | 
105 |     print(naver_url)
106 | 
107 |     # 뉴스 페이지 데이터 가져오기
108 |     rcv_data = get_request_url(naver_url)
109 |     # html 파서로 데이터 변환
110 |     soup_data = BeautifulSoup(rcv_data, 'html.parser')
111 |     # 뉴스 리스트 가져오기
112 |     news_list = soup_data.find('ol', attrs={'class': 'ranking_list'})
113 |     # print(len(news_list))
114 | 
115 |     body_text = ''
116 |     # 각 뉴스 접근
117 |     for news in news_list.findAll('li'):
118 |         # 뉴스의 헤드라인 정보 가져오기
119 |         news_data = news.find('div', attrs={'class': 'ranking_headline'})
120 |         # 해당 뉴스 링크 가져오기
121 |         href_data = news_data.find('a')['href']
122 |         # 해당 링크 접근
123 |         article_data = get_request_url('https://news.naver.com/' + href_data)
124 |         # html 파서로 데이터 변환
125 |         soup_data = BeautifulSoup(article_data, 'html.parser')
126 | 
127 |         body = []
128 |         # 뉴스 본문의 내용 모두 가져오기
129 |         for item in soup_data.findAll('div', attrs={'class': '_article_body_contents'}):
130 |             body.append(item.findAll(text=True, recursive=False))
131 | 
132 |         # 모든 뉴스 본문 내용 하나로 합치기
133 |         for item in body[0]:
134 |             if item == "\n":
135 |                 continue
136 |             body_text = body_text + ' ' + item
137 | 
138 |     # print(body_text)
139 |     # 본문 내용 명사 & 빈도수 추출
140 |     result_list = get_tags(body_text)
141 | 
142 |     return result_list
143 | 
# Main entry point
def naver_crawler():
    """Run the whole pipeline: crawl, drop stopwords, draw the word cloud.

    The image is written under ``./resultData/`` with a file name built from
    the module-level ``section_string`` and ``date`` values.
    """
    print('NAVER CRAWLING START')

    # Crawl the articles, then filter stopwords out of the noun counts.
    filtered_tags = remove_stopword(getNews())
    print(str(filtered_tags))

    # Render the word cloud to the per-section, per-date output file.
    output_path = './resultData/naver_' + section_string + '_' + date + '.png'
    draw_cloud(filtered_tags, output_path)
    print('FINISHED')


if __name__ == '__main__':
    naver_crawler()


--------------------------------------------------------------------------------
/resultData/naver_it_20191216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/resultData/naver_it_20191216.png


--------------------------------------------------------------------------------
/resultData/naver_it_20191217.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/resultData/naver_it_20191217.png


--------------------------------------------------------------------------------
/resultData/naver_politics_20191216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/resultData/naver_politics_20191216.png


--------------------------------------------------------------------------------
/resultData/naver_society_20191216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/resultData/naver_society_20191216.png


--------------------------------------------------------------------------------
/resultData/naver_world_20191216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KimYunsang-v/python_crawling_final/ebabf306ab7664eaf120593a634fd7083e05e141/resultData/naver_world_20191216.png


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from konlpy.tag import Twitter
2 | from collections import Counter


--------------------------------------------------------------------------------