├── update.sh ├── .gitignore ├── Pipfile ├── LICENSE ├── README.md ├── Pipfile.lock └── petition.py /update.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python petition.py 3 | aws s3 sync data s3://data10902/petition 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specifics 2 | data/ 3 | 4 | # Python 5 | .venv/ 6 | *.py[co] 7 | 8 | # IDEs 9 | .idea/ 10 | 11 | # OS 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pytest = "*" 8 | "html5lib" = "*" 9 | "beautifulsoup4" = "*" 10 | 11 | [dev-packages] 12 | 13 | [requires] 14 | python_version = "3.6" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Alan Kang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [청와대 국민청원](https://www1.president.go.kr/petitions) 사이트의 2 | [만료된 청원](https://www1.president.go.kr/petitions?only=finished) 데이터 모음. 3 | 4 | ## 데이터 5 | 6 | [petition.csv](https://s3.ap-northeast-2.amazonaws.com/data10902/petition/petition.csv) 7 | 8 | * 전체 데이터 9 | 10 | [petition_corrupted.csv](https://s3.ap-northeast-2.amazonaws.com/data10902/petition/petition_corrupted.csv) 11 | 12 | * 전체 행 중에서 5%는 임의 필드 1개에 결측치 삽입 13 | * 범주(category)가 '육아/교육'이고 투표수(votes)가 50건 초과이면 20% 확률로 투표수에 결측치 넣기 14 | * 나머지는 전체 데이터와 동일 15 | 16 | [petition_sampled.csv](https://s3.ap-northeast-2.amazonaws.com/data10902/petition/petition_sampled.csv) 17 | 18 | * 전체 데이터 중 5%만 임의추출한 데이터 19 | 20 | [petition_corrupted_sampled.csv](https://s3.ap-northeast-2.amazonaws.com/data10902/petition/petition_corrupted_sampled.csv) 21 | 22 | * 결측치가 삽입된 샘플 데이터 23 | * ``petition_corrupted.csv`` 파일에서 5%만 임의추출하여 생성 24 | 25 | ## 저작권 26 | 27 | CSV 데이터의 저작권은 [KOGL 제1유형](http://www.kogl.or.kr/info/license.do)을 따름. 28 | 29 | * 출처표시 30 | * 상업적, 비상업적 이용가능 31 | * 변형 등 2차적 저작물 작성 가능 32 | 33 | 소스 코드는 [MIT License](LICENSE)를 따름. 34 | 35 | ## 설치 및 실행 36 | 37 | 소스코드 받기: 38 | 39 | git clone https://github.com/akngs/petitions.git 40 | cd petitions 41 | 42 | 설치 ([pipenv](https://github.com/pypa/pipenv)가 설치되어 있어야 합니다): 43 | 44 | pipenv install 45 | 46 | 실행: 47 | 48 | pipenv shell 49 | python petition.py 50 | 51 | 생성된 데이터 확인: 52 | 53 | tail data/*.csv 54 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "9ae3efeb5bb7be5f7557d9484609089781ee18c2adf5d4b263fa696e12f8c843" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.python.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | "sha256:1c7960ccfd6a005cd9f7ba884e6316b5e430a3f1a6c37c5f87d8b43f83b54ec9", 22 | "sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450" 23 | ], 24 | "version": "==17.4.0" 25 | }, 26 | "beautifulsoup4": { 27 | "hashes": [ 28 | "sha256:11a9a27b7d3bddc6d86f59fb76afb70e921a25ac2d6cc55b40d072bd68435a76", 29 | "sha256:7015e76bf32f1f574636c4288399a6de66ce08fb7b2457f628a8d70c0fbabb11", 30 | "sha256:808b6ac932dccb0a4126558f7dfdcf41710dd44a4ef497a0bb59a77f9f078e89" 31 | ], 32 | "index": "pypi", 33 | "version": "==4.6.0" 34 | }, 35 | "html5lib": { 36 | "hashes": [ 37 | "sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3", 38 | "sha256:66cb0dcfdbbc4f9c3ba1a63fdb511ffdbd4f513b2b6d81b80cd26ce6b3fb3736" 39 | ], 40 | "index": "pypi", 41 | "version": "==1.0.1" 42 | }, 43 | "more-itertools": { 44 | "hashes": [ 45 | "sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea", 46 | "sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e", 47 | "sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44" 48 | ], 49 | "version": "==4.1.0" 50 | }, 51 | "pluggy": { 52 | "hashes": [ 53 | "sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff", 54 | "sha256:d345c8fe681115900d6da8d048ba67c25df42973bda370783cd58826442dcd7c", 55 | "sha256:e160a7fcf25762bb60efc7e171d4497ff1d8d2d75a3d0df7a21b76821ecbf5c5" 56 | ], 57 | "version": "==0.6.0" 58 | }, 59 | "py": { 60 | "hashes": [ 61 | "sha256:29c9fab495d7528e80ba1e343b958684f4ace687327e6f789a94bf3d1915f881", 62 | "sha256:983f77f3331356039fdd792e9220b7b8ee1aa6bd2b25f567a963ff1de5a64f6a" 63 | ], 64 | "version": "==1.5.3" 65 | }, 66 | "pytest": { 67 | "hashes": [ 68 | "sha256:54713b26c97538db6ff0703a12b19aeaeb60b5e599de542e7fca0ec83b9038e8", 69 | "sha256:829230122facf05a5f81a6d4dfe6454a04978ea3746853b2b84567ecf8e5c526" 70 | ], 71 | "index": "pypi", 72 | "version": "==3.5.1" 73 | }, 74 | "six": { 75 | "hashes": [ 76 | "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", 77 | "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" 78 | ], 79 | "version": "==1.11.0" 80 | }, 81 | "webencodings": { 82 | "hashes": [ 83 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 84 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 85 | ], 86 | "version": "==0.5.1" 87 | } 88 | }, 89 | "develop": {} 90 | } 91 | -------------------------------------------------------------------------------- /petition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import csv 4 | import logging 5 | import os 6 | import random 7 | import re 8 | import time 9 | from concurrent.futures import ThreadPoolExecutor 10 | from typing import Dict 11 | from urllib import request 12 | from urllib.error import HTTPError 13 | 14 | from bs4 import BeautifulSoup 15 | 16 | DATA_DIR = 'data' 17 | CSV_WHOLE = os.path.join(DATA_DIR, 'petition.csv') 18 | CSV_CORRUPT = os.path.join(DATA_DIR, 'petition_corrupted.csv') 19 | CSV_SAMPLE = os.path.join(DATA_DIR, 'petition_sampled.csv') 20 | CSV_CORRUPT_SAMPLE = os.path.join(DATA_DIR, 'petition_corrupted_sampled.csv') 21 | SAMPLE_RATE = 0.05 22 | FIELDS_TO_CORRUPT = ['category', 'votes', 'start', 'end'] 23 | 24 | logging.basicConfig(level=logging.INFO) 25 | 26 | 27 | def main(): 28 | while True: 29 | try: 30 | run() 31 | break 32 | except: 33 | # 너무 많은 내용을 한 번에 가져가려고 하면 간혹 일시적으로 차단됨. 34 | # 5초 쉬었다가 다시 시도. 35 | print('Retrying after 5 seconds...') 36 | time.sleep(5) 37 | 38 | # 결측치 넣은 파일 생성 39 | generate_modified_file(CSV_WHOLE, CSV_CORRUPT, False, True) 40 | # 샘플링한 파일 생성 41 | generate_modified_file(CSV_WHOLE, CSV_SAMPLE, True, False) 42 | # 결측치 넣고 샘플링한 파일 생성 43 | generate_modified_file(CSV_WHOLE, CSV_CORRUPT_SAMPLE, True, True) 44 | 45 | 46 | def run(): 47 | # 데이터 저장 디렉터리 생성 48 | try: 49 | os.mkdir(DATA_DIR) 50 | except FileExistsError: 51 | pass 52 | 53 | # 추가로 만료된 청원을 수집하여 기존 CSV 파일에 덧붙이기 54 | latest_id = get_latest_article_id() 55 | next_id = get_latest_saved_article_id() + 1 56 | 57 | logging.info( 58 | f'From {next_id} to {latest_id}: ' 59 | f'about {latest_id - next_id} articles to go...' 60 | ) 61 | 62 | # 동시에 두 개씩 병렬로 처리. workers를 더 늘리면 더 자주 차단됨. 63 | with ThreadPoolExecutor(max_workers=2) as exe: 64 | for article in exe.map(fetch_article, range(next_id, latest_id)): 65 | if article is None: 66 | continue 67 | save_article(article) 68 | logging.info( 69 | f'{article["article_id"]} of {latest_id}: {article["title"]} ' 70 | f'https://www1.president.go.kr/petitions/' 71 | f'{article["article_id"]}' 72 | ) 73 | 74 | 75 | def generate_modified_file(src, dst, sample, corrupt): 76 | """원본 파일을 샘플링하고 결측치 넣은 새 파일 생성""" 77 | 78 | # 랜덤 시드 고정. 매번 동일한 결과가 보장되도록. 79 | random.seed(0) 80 | with open(src, 'r') as fr: 81 | with open(dst, 'w') as fw: 82 | csvr = csv.DictReader(fr) 83 | csvw = csv.DictWriter(fw, csvr.fieldnames) 84 | 85 | csvw.writeheader() 86 | 87 | rows = csvr 88 | 89 | # 샘플링 90 | if sample: 91 | rows = (row for row in rows if random.random() <= SAMPLE_RATE) 92 | # 결측치 추가 93 | if corrupt: 94 | rows = (corrupt_row(row) for row in rows) 95 | 96 | csvw.writerows(rows) 97 | 98 | 99 | def corrupt_row(row): 100 | # 범주가 '육아/교육'이고 투표수가 50건 초과이면 20% 확률로 투표수에 결측치 넣기 101 | category = row['category'] == '육아/교육' 102 | votes = int(row['votes']) > 50 103 | if category and votes and random.random() <= 0.2: 104 | row['votes'] = '' 105 | # 각 행마다 5% 확률로 특정 필드에 결측치 넣기 106 | if random.random() <= 0.05: 107 | key = random.choice(FIELDS_TO_CORRUPT) 108 | row[key] = '' 109 | return row 110 | 111 | 112 | def get_latest_article_id() -> int: 113 | """만료된 청원 목록 페이지를 분석하여 가장 최근에 만료된 글번호를 가져오기""" 114 | html = fetch_html('https://www1.president.go.kr/petitions?only=finished') 115 | soup = BeautifulSoup(html, "html5lib") 116 | href = soup.select_one('.bl_body .bl_wrap .bl_subject a')['href'] 117 | article_id = int(re.match(r'.+/petitions/(\d+).*', href).group(1)) 118 | return article_id 119 | 120 | 121 | def get_latest_saved_article_id() -> int: 122 | """이미 저장한 가장 최근 글번호를 가져오기. 저장된 글이 없으면 0을 반환""" 123 | # 글이 없으면 0 124 | if not os.path.isfile(CSV_WHOLE): 125 | return 0 126 | 127 | # 파일 끝 부분에서 몇 줄 읽어온 뒤 마지막 줄의 첫 칼럼(article_id) 반환 128 | with open(CSV_WHOLE, 'rb') as f: 129 | # 마지막 줄을 빠르게 찾기 위해 "거의" 끝 부분으로 이동 130 | f.seek(0, os.SEEK_END) 131 | f.seek(-min([f.tell(), 1024 * 100]), os.SEEK_CUR) 132 | 133 | # 마지막 줄에서 article id 추출 134 | last_line = f.readlines()[-1].decode('utf-8') 135 | article_id = int(last_line.split(',')[0]) 136 | 137 | return article_id 138 | 139 | 140 | def fetch_article(article_id: int) -> Dict[str, any] or None: 141 | """글번호에 해당하는 글의 HTML 텍스트를 가져와서 파싱. 해당 글이 없으면 None""" 142 | url = f'https://www1.president.go.kr/petitions/{article_id}' 143 | 144 | try: 145 | html = fetch_html(url) 146 | except ValueError: 147 | return None 148 | 149 | soup = BeautifulSoup(html, "html5lib") 150 | 151 | title = query(soup, '.petitionsView_title') 152 | votes = int(query(soup, '.petitionsView_count .counter').replace(',', '')) 153 | category = query(soup, '.petitionsView_info_list li:nth-of-type(1)')[4:] 154 | start = query(soup, '.petitionsView_info_list li:nth-of-type(2)')[4:] 155 | end = query(soup, '.petitionsView_info_list li:nth-of-type(3)')[4:] 156 | 157 | answered = query(soup, '.petitionsView_progress h4') == '브리핑' 158 | if answered: 159 | content_selector = '.petitionsView_write > div:nth-of-type(4)' 160 | else: 161 | content_selector = '.petitionsView_write > div:nth-of-type(2)' 162 | 163 | content = remove_whitespaces(query(soup, content_selector)) \ 164 | .replace('\n', '\\n') \ 165 | .replace('\t', '\\t') 166 | 167 | return { 168 | 'article_id': article_id, 169 | 'title': title, 170 | 'votes': votes, 171 | 'answered': 1 if answered else 0, 172 | 'category': category, 173 | 'start': start, 174 | 'end': end, 175 | 'content': content, 176 | } 177 | 178 | 179 | def save_article(article: Dict[str, any]) -> None: 180 | """글을 CSV 형태로 저장한다""" 181 | cols = [ 182 | 'article_id', 'start', 'end', 'answered', 'votes', 'category', 'title', 183 | 'content' 184 | ] 185 | 186 | # 파일이 없으면 새로 만들고 칼럼 이름 저장 187 | if not os.path.isfile(CSV_WHOLE): 188 | with open(CSV_WHOLE, 'w', newline='', encoding='utf-8') as f: 189 | w = csv.writer(f) 190 | w.writerow(cols) 191 | 192 | # 새로운 행 추가 193 | with open(CSV_WHOLE, 'a', newline='', encoding='utf-8') as f: 194 | w = csv.writer(f) 195 | w.writerow(article[col] for col in cols) 196 | 197 | 198 | def fetch_html(url: str) -> str: 199 | """웹에서 HTML 문서를 읽어서 반환""" 200 | try: 201 | with request.urlopen(url) as f: 202 | if f.getcode() != 200: 203 | raise ValueError(f'Invalid status code: {f.getcode()}') 204 | html = f.read().decode('utf-8') 205 | return html 206 | except HTTPError as e: 207 | if e.code == 404: 208 | raise ValueError(f'Not found: {url}') 209 | else: 210 | raise e 211 | 212 | 213 | def query(soup: BeautifulSoup, selector: str) -> str: 214 | """CSS selector로 요소를 찾은 뒤 텍스트 컨텐츠를 반환""" 215 | return soup.select_one(selector).text 216 | 217 | 218 | def remove_whitespaces(text: str) -> str: 219 | """본문 텍스트에서 불필요한 공백 문자들 제거""" 220 | lines = text.split('\n') 221 | lines = (l.strip() for l in lines) 222 | lines = (l for l in lines if len(l) > 0) 223 | return '\n'.join(lines) 224 | 225 | 226 | if __name__ == '__main__': 227 | main() 228 | --------------------------------------------------------------------------------