├── .github └── workflows │ └── setting.yml ├── .gitignore ├── README.md ├── crawling.py ├── incruit.py ├── preprocessing.py ├── requirements.txt ├── setting_github.py ├── thinkgood.py └── utils.py /.github/workflows/setting.yml: -------------------------------------------------------------------------------- 1 | name: Auto crawl & issue management action 2 | 3 | on: 4 | schedule: 5 | - cron: '0 21 */3 * *' 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.7' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 20 | - name: Step1_crawling 21 | run: | 22 | python crawling.py 23 | - name: Step2_preprocessing 24 | run: | 25 | python preprocessing.py 26 | - name: Step3_setting github 27 | env: 28 | MY_GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }} 29 | run: | 30 | python setting_github.py 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | venv 3 | .idea 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Realtime-IT-Contest-notification 2 | IT 공모전 실시간 알림 repository.
3 | 3일에 한 번, KST 06:00 에 공모전 리스트를 업데이트 합니다. 4 | 5 | ### File Structure 6 | ```shell 7 | . 8 | ├─ .github 9 | │ └─ workflows 10 | │ └─ setting.yml 11 | ├─ .gitignore 12 | ├─ README.md 13 | ├─ crawling.py 14 | ├─ incruit.py 15 | ├─ preprocessing.py 16 | ├─ requirements.txt 17 | ├─ setting_github.py 18 | ├─ thinkgood.py 19 | └─ utils.py 20 | ``` 21 | 22 | ### Contest Site 23 | * 씽굿: https://www.thinkcontest.com/ 24 | * 인크루트 공모전: http://gongmo.incruit.com/ 25 | 26 | ### Developers 27 | * [@Jinho Kim](https://github.com/kimjinho1) 28 | * [@Ruby Kim](https://github.com/ruby-kim) 29 | * [@moonhyeok song](https://github.com/mike2ox) 30 | * [@xifoxy]() 31 | 32 | -------------------------------------------------------------------------------- /crawling.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from incruit import Incruit 4 | from thinkgood import Thinkgood 5 | 6 | 7 | if __name__ == "__main__": 8 | """ init setting """ 9 | incruit = Incruit() 10 | thinkgood = Thinkgood() 11 | 12 | """ crawling & save contests data""" 13 | incruit.crawling(), incruit.save_result() 14 | thinkgood.crawling(), thinkgood.save_result() 15 | -------------------------------------------------------------------------------- /incruit.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import json 5 | 6 | 7 | class Incruit: 8 | def __init__(self): 9 | self.base_url = 'http://gongmo.incruit.com/list/gongmolist.asp' 10 | self.categories = ['?ct=1&category=10', '?ct=1&category=11'] 11 | self.contests = dict() 12 | 13 | def crawling(self): 14 | """ 카테고리별 공모전 리스트 크롤링 """ 15 | print("===== [Incruit] Start Crawling data... =====") 16 | for category in self.categories: 17 | req = requests.get(self.base_url + category) 18 | soup = BeautifulSoup(req.content, "html.parser") 19 | data_list = soup.find(id='tbdyGmScrap').find_all('a') 20 | self.scraping(data_list) 21 | print("===== [Incruit] Finish Crawling data... =====") 22 | 23 | def scraping(self, data_list): 24 | """ 25 | 공모전 세부 정보 크롤링 & dict 형태로 데이터 저장 26 | 공모전 이름(title): [기간(host), 27 | 분류(classify) - 과학/공학, 소프트웨어 항목만, 28 | 주최자(host), 29 | 사이트링크(link)] 30 | """ 31 | for data in data_list: 32 | req = requests.get(data.get('href')) 33 | soup = BeautifulSoup(req.content, "html.parser") 34 | tmp = soup.find(class_='tBrd1Gray').find_all('td') 35 | 36 | title = soup.find(class_='job_new_top_title').get_text() 37 | term = tmp[3].get_text() 38 | classify = tmp[0].get_text().replace("
", ",") 39 | host = tmp[1].get_text() 40 | link = tmp[4].find('a').get('href').replace('\t', '') 41 | self.contests[title] = [term, classify, host, link] 42 | 43 | def save_result(self): 44 | base_dir = os.path.dirname(os.path.abspath(__file__)) 45 | with open(os.path.join(base_dir, 'incruit.json'), 'w+', encoding='utf-8') as json_file: 46 | json.dump(self.contests, json_file, ensure_ascii=False, indent='\t') 47 | assert "===== [Incruit] Save data... =====\n" 48 | 49 | def check_result(self): 50 | for key, value in self.contests.items(): 51 | print(key, ":", value) 52 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import os 4 | import re 5 | import math 6 | from collections import Counter 7 | from pytz import timezone 8 | from datetime import datetime 9 | 10 | from utils import save, load 11 | from copy import deepcopy 12 | 13 | 14 | def text2vec(text): 15 | Word = re.compile(r'\w+') 16 | words = Word.findall(text) 17 | return Counter(words) 18 | 19 | 20 | def get_cosine(vec1, vec2): 21 | intersection = set(vec1.keys()) & set(vec2.keys()) 22 | numerator = sum([vec1[x] * vec2[x] for x in intersection]) 23 | 24 | sum1 = sum([vec1[x] ** 2 for x in vec1.keys()]) 25 | sum2 = sum([vec2[x] ** 2 for x in vec2.keys()]) 26 | denominator = math.sqrt(sum1) * math.sqrt(sum2) 27 | 28 | if not denominator: 29 | return 0.0 30 | else: 31 | return float(numerator) / denominator 32 | 33 | 34 | if __name__ == "__main__": 35 | """ init settings """ 36 | base_dir = os.path.dirname(os.path.abspath(__file__)) 37 | KST = str(datetime.now(timezone('Asia/Seoul')))[:10] 38 | 39 | """ load data """ 40 | contests_incruit = load(base_dir, "/incruit.json") 41 | contests_thinkgood = load(base_dir, "/thinkgood.json") 42 | 43 | """ check duplicate titles """ 44 | result = deepcopy(contests_thinkgood) 45 | for thinkgood_key, thinkgood_val in contests_thinkgood.items(): 46 | vec1 = text2vec(thinkgood_key) 47 | flag = 0 48 | key, value = None, None 49 | for incruit_key, incruit_val in contests_incruit.items(): 50 | vec2 = text2vec(incruit_key) 51 | cosine = get_cosine(vec1, vec2) 52 | if cosine >= 0.5: 53 | continue 54 | else: 55 | key = incruit_key 56 | value = incruit_val 57 | flag = 1 58 | if flag: 59 | result[key] = value 60 | 61 | """ separate data based on DATE """ 62 | open = dict() 63 | close = dict() 64 | for key, val in result.items(): 65 | if val[0][13:] < KST: 66 | close[key] = val 67 | else: 68 | open[key] = val 69 | 70 | """ save data """ 71 | save(base_dir, open, 'open.json') 72 | save(base_dir, close, 'close.json') 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyGithub 2 | beautifulsoup4==4.9.1 3 | certifi==2020.6.20 4 | chardet==3.0.4 5 | idna==2.10 6 | requests==2.24.0 7 | soupsieve==2.0.1 8 | urllib3==1.25.10 9 | wincertstore==0.2 10 | pytz==2018.9 11 | -------------------------------------------------------------------------------- /setting_github.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import os 4 | from github import Github 5 | from pytz import timezone 6 | from datetime import datetime 7 | 8 | from utils import save, load 9 | 10 | 11 | def get_github_repo(access_token, repository_name): 12 | g = Github(access_token) 13 | repo = g.get_user().get_repo(repository_name) 14 | return repo 15 | 16 | 17 | def upload_github_issue(repo, open): 18 | already_opened = [elem.title for elem in repo.get_issues(state='open')] 19 | for key, val in open.items(): 20 | if key not in already_opened: 21 | issue_title = key 22 | upload_contents = "* 기간: %s\n * 분류: %s\n * 주최자: %s\n * 사이트링크: %s" % (val[0], val[1], val[2], val[3]) 23 | repo.create_issue(title=issue_title, body=upload_contents) 24 | 25 | 26 | def close_github_issue(repo, close): 27 | open_issues = repo.get_issues(state='open') 28 | KST = str(datetime.now(timezone('Asia/Seoul')))[:10] 29 | need_to_close = list(close.keys()) 30 | for issue in open_issues: 31 | if issue.title in need_to_close: 32 | issue.edit(state='closed') 33 | if issue.body[19:29] < KST: 34 | issue.edit(state='closed') 35 | 36 | 37 | 38 | if __name__ == "__main__": 39 | """ init settings """ 40 | base_dir = os.path.dirname(os.path.abspath(__file__)) 41 | access_token = os.environ['MY_GITHUB_TOKEN'] 42 | repository_name = "Realtime-IT-Contest-notification" 43 | repo = get_github_repo(access_token, repository_name) 44 | 45 | """ load data """ 46 | open = load(base_dir, "/open.json") 47 | close = load(base_dir, "/close.json") 48 | 49 | """ setting issues """ 50 | close_github_issue(repo, close) 51 | upload_github_issue(repo, open) 52 | -------------------------------------------------------------------------------- /thinkgood.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import json 5 | 6 | 7 | class Thinkgood: 8 | def __init__(self): 9 | self.base_url = 'https://www.thinkcontest.com/Contest' 10 | self.categories = ['/CateField.html?page=1&c=11', '/CateField.html?page=1&c=12'] 11 | self.select = 'div#wrapper > div#trunk > div#main > div.container >' \ 12 | 'div.body.contest-cate > div.all-contest > table.type-2.mg-t-5.contest-table >' \ 13 | 'tbody > tr > td.txt-left > div.contest-title > a ' 14 | self.contests = dict() 15 | 16 | def crawling(self): 17 | """ 카테고리별 공모전 리스트 크롤링 """ 18 | print("===== [Thinkgood] Start Crawling data... =====") 19 | for category in self.categories: 20 | req = requests.get(self.base_url + category) 21 | soup = BeautifulSoup(req.content, "html.parser") 22 | data_list = soup.select(self.select) 23 | self.scraping(data_list) 24 | print("===== [Thinkgood] Finish Crawling data... =====") 25 | 26 | def scraping(self, data_list): 27 | """ 28 | 공모전 세부 정보 크롤링 & dict 형태로 데이터 저장 29 | 공모전 이름(title): [기간(term), 30 | 분류(classify) - 과학/공학, 소프트웨어 항목만, 31 | 주최자(host), 32 | 사이트링크(link)] 33 | """ 34 | for data in data_list: 35 | req = requests.get('https://www.thinkcontest.com' + data.get('href')) 36 | soup = BeautifulSoup(req.content, "html.parser") 37 | tmp = soup.find(class_='body contest-detail') 38 | val_tmp = tmp.select('div.contest-overview > table.type-5 > tbody > tr') 39 | 40 | title = soup.find(class_='body contest-detail').find(class_='title').get_text() 41 | values = ['0'] * 4 42 | for elem in val_tmp: 43 | elem = str(elem.get_text()) 44 | if "접수기간" in elem: # term 45 | values[0] = elem.replace("접수기간\n", "").replace("\n", "") 46 | elif "응모분야" in elem: # classify 47 | values[1] = elem.replace("응모분야\n", "").replace("\n", ", ")[6:-6] 48 | elif "주최" in elem: # host 49 | values[2] = elem.replace("주최\n", "").replace("\n", "") 50 | elif "주관" in elem: 51 | values[2] += ": " + elem.replace("주관", "").replace("\n", "") 52 | if values[2][0] == ":": 53 | values[2].replace(": ", "") 54 | 55 | values[3] = tmp.find(class_="linker").get('href') if tmp.find(class_="linker") is not None \ 56 | else "(링크 미지원)" # link 57 | self.contests[title] = values 58 | 59 | def save_result(self): 60 | base_dir = os.path.dirname(os.path.abspath(__file__)) 61 | with open(os.path.join(base_dir, 'thinkgood.json'), 'w+', encoding='utf-8') as json_file: 62 | json.dump(self.contests, json_file, ensure_ascii=False, indent='\t') 63 | assert "===== [Thinkgood] Save data... =====\n" 64 | 65 | def check_result(self): 66 | for key, value in self.contests.items(): 67 | print(key, ":", value) 68 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def save(base_dir, data, filename): 6 | with open(os.path.join(base_dir, filename), 'w', encoding='utf-8') as json_file: 7 | json.dump(data, json_file, ensure_ascii=False, indent='\t') 8 | print("===== Finish saving data... =====") 9 | 10 | 11 | def load(base_dir, filename): 12 | with open(base_dir + filename, encoding='utf-8', errors='ignore') as data: 13 | json_file = json.load(data, strict=False) 14 | return json_file 15 | --------------------------------------------------------------------------------