├── .github
│   └── workflows
│       └── setting.yml
├── .gitignore
├── README.md
├── crawling.py
├── incruit.py
├── preprocessing.py
├── requirements.txt
├── setting_github.py
├── thinkgood.py
└── utils.py
/.github/workflows/setting.yml:
--------------------------------------------------------------------------------
1 | name: Auto crawl & issue management action
2 |
3 | on:
4 | schedule:
5 |     - cron: '0 21 */3 * *'  # 21:00 UTC = 06:00 KST (next day), every 3 days
6 |
7 | jobs:
8 | build:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - name: Set up Python
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: '3.7'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
20 | - name: Step1_crawling
21 | run: |
22 | python crawling.py
23 | - name: Step2_preprocessing
24 | run: |
25 | python preprocessing.py
26 | - name: Step3_setting github
27 | env:
28 | MY_GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }}
29 | run: |
30 | python setting_github.py
31 |
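32 | # NOTE: MY_GITHUB_TOKEN is a personal access token stored as a repository
33 | # secret (Settings > Secrets); setting_github.py uses it through PyGithub
34 | # to create and close issues.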
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | venv
3 | .idea
4 | .DS_Store
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Realtime-IT-Contest-notification
2 | A repository for real-time IT contest notifications.
3 | The contest list is updated once every 3 days at 06:00 KST.
4 |
5 | ### File Structure
6 | ```shell
7 | .
8 | ├─ .github
9 | │ └─ workflows
10 | │ └─ setting.yml
11 | ├─ .gitignore
12 | ├─ README.md
13 | ├─ crawling.py
14 | ├─ incruit.py
15 | ├─ preprocessing.py
16 | ├─ requirements.txt
17 | ├─ setting_github.py
18 | ├─ thinkgood.py
19 | └─ utils.py
20 | ```
21 |
22 | ### Contest Site
23 | * Thinkgood: https://www.thinkcontest.com/
24 | * Incruit contests: http://gongmo.incruit.com/
25 |
26 | ### Developers
27 | * [@Jinho Kim](https://github.com/kimjinho1)
28 | * [@Ruby Kim](https://github.com/ruby-kim)
29 | * [@moonhyeok song](https://github.com/mike2ox)
30 | * [@xifoxy]()
31 |
32 |
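33 | ### Pipeline
34 | The scheduled GitHub Action runs the same three steps you can run locally
35 | (a sketch; the last step assumes a `MY_GITHUB_TOKEN` secret with access to this repository):
36 |
37 | ```shell
38 | python crawling.py        # scrape both sites into incruit.json / thinkgood.json
39 | python preprocessing.py   # deduplicate and split into open.json / close.json
40 | python setting_github.py  # open new issues and close expired ones
41 | ```
42 |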
--------------------------------------------------------------------------------
/crawling.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 |
3 | from incruit import Incruit
4 | from thinkgood import Thinkgood
5 |
6 |
7 | if __name__ == "__main__":
8 | """ init setting """
9 | incruit = Incruit()
10 | thinkgood = Thinkgood()
11 |
12 |     """ crawling & save contests data """
13 |     incruit.crawling()
14 |     incruit.save_result()
15 |     thinkgood.crawling()
16 |     thinkgood.save_result()
17 |
--------------------------------------------------------------------------------
/incruit.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import requests
4 | from bs4 import BeautifulSoup
5 |
6 |
7 | class Incruit:
8 | def __init__(self):
9 | self.base_url = 'http://gongmo.incruit.com/list/gongmolist.asp'
10 | self.categories = ['?ct=1&category=10', '?ct=1&category=11']
11 | self.contests = dict()
12 |
13 | def crawling(self):
14 |         """ Crawl the contest list for each category """
15 | print("===== [Incruit] Start Crawling data... =====")
16 | for category in self.categories:
17 | req = requests.get(self.base_url + category)
18 | soup = BeautifulSoup(req.content, "html.parser")
19 | data_list = soup.find(id='tbdyGmScrap').find_all('a')
20 | self.scraping(data_list)
21 | print("===== [Incruit] Finish Crawling data... =====")
22 |
23 | def scraping(self, data_list):
24 |         """
25 |         Crawl contest details and store each one in a dict as
26 |         contest title: [period (term),
27 |                         category (classify) - science/engineering and software only,
28 |                         organizer (host),
29 |                         site link (link)]
30 |         """
31 | for data in data_list:
32 | req = requests.get(data.get('href'))
33 | soup = BeautifulSoup(req.content, "html.parser")
34 | tmp = soup.find(class_='tBrd1Gray').find_all('td')
35 |
36 | title = soup.find(class_='job_new_top_title').get_text()
37 | term = tmp[3].get_text()
38 |             classify = tmp[0].get_text().replace("\n", ",")
39 | host = tmp[1].get_text()
40 | link = tmp[4].find('a').get('href').replace('\t', '')
41 | self.contests[title] = [term, classify, host, link]
42 |
43 | def save_result(self):
44 | base_dir = os.path.dirname(os.path.abspath(__file__))
45 | with open(os.path.join(base_dir, 'incruit.json'), 'w+', encoding='utf-8') as json_file:
46 | json.dump(self.contests, json_file, ensure_ascii=False, indent='\t')
47 |         print("===== [Incruit] Save data... =====")
48 |
49 | def check_result(self):
50 | for key, value in self.contests.items():
51 | print(key, ":", value)
52 |
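53 |
54 | if __name__ == "__main__":
55 |     # Minimal standalone run (a sketch; assumes network access to incruit.com):
56 |     # crawl both categories, print the parsed entries, then write incruit.json.
57 |     incruit = Incruit()
58 |     incruit.crawling()
59 |     incruit.check_result()
60 |     incruit.save_result()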
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 |
3 | import os
4 | import re
5 | import math
6 | from collections import Counter
7 | from pytz import timezone
8 | from datetime import datetime
9 |
10 | from utils import save, load
11 | from copy import deepcopy
12 |
13 |
14 | def text2vec(text):
15 | Word = re.compile(r'\w+')
16 | words = Word.findall(text)
17 | return Counter(words)
18 |
19 |
20 | def get_cosine(vec1, vec2):
21 | intersection = set(vec1.keys()) & set(vec2.keys())
22 | numerator = sum([vec1[x] * vec2[x] for x in intersection])
23 |
24 | sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
25 | sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
26 | denominator = math.sqrt(sum1) * math.sqrt(sum2)
27 |
28 | if not denominator:
29 | return 0.0
30 | else:
31 | return float(numerator) / denominator
32 |
33 |
34 | if __name__ == "__main__":
35 | """ init settings """
36 | base_dir = os.path.dirname(os.path.abspath(__file__))
37 |     KST = str(datetime.now(timezone('Asia/Seoul')))[:10]  # today as "YYYY-MM-DD"
38 |
39 | """ load data """
40 | contests_incruit = load(base_dir, "/incruit.json")
41 | contests_thinkgood = load(base_dir, "/thinkgood.json")
42 |
43 | """ check duplicate titles """
44 | result = deepcopy(contests_thinkgood)
45 |     # A contest can appear on both sites under a nearly identical title.
46 |     # Titles with cosine similarity >= 0.5 are treated as duplicates, and
47 |     # the Thinkgood entry is kept in that case.
48 |     for incruit_key, incruit_val in contests_incruit.items():
49 |         vec1 = text2vec(incruit_key)
50 |         is_duplicate = False
51 |         for thinkgood_key in contests_thinkgood:
52 |             vec2 = text2vec(thinkgood_key)
53 |             cosine = get_cosine(vec1, vec2)
54 |             if cosine >= 0.5:
55 |                 is_duplicate = True
56 |                 break
57 |         # merge Incruit-only contests into the combined result
58 |         if not is_duplicate:
59 |             result[incruit_key] = incruit_val
60 |
61 | """ separate data based on DATE """
62 | open = dict()
63 | close = dict()
64 | for key, val in result.items():
65 | if val[0][13:] < KST:
66 | close[key] = val
67 | else:
68 | open[key] = val
69 |
70 | """ save data """
71 | save(base_dir, open, 'open.json')
72 | save(base_dir, close, 'close.json')
73 |
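74 | # Sanity check for the similarity test above (hypothetical titles):
75 | #   get_cosine(text2vec("2020 AI 공모전"), text2vec("AI 공모전 2020")) == 1.0
76 | #   (identical words, different order)
77 | #   get_cosine(text2vec("AI 공모전"), text2vec("디자인 공모전")) == 0.5
78 | #   (one shared word out of two on each side: 1 / (sqrt(2) * sqrt(2)))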
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyGithub
2 | beautifulsoup4==4.9.1
3 | certifi==2020.6.20
4 | chardet==3.0.4
5 | idna==2.10
6 | requests==2.24.0
7 | soupsieve==2.0.1
8 | urllib3==1.25.10
9 | wincertstore==0.2; sys_platform == "win32"
10 | pytz==2018.9
11 |
--------------------------------------------------------------------------------
/setting_github.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 |
3 | import os
4 | from github import Github
5 | from pytz import timezone
6 | from datetime import datetime
7 |
8 | from utils import load
9 |
10 |
11 | def get_github_repo(access_token, repository_name):
12 | g = Github(access_token)
13 | repo = g.get_user().get_repo(repository_name)
14 | return repo
15 |
16 |
17 | def upload_github_issue(repo, open_contests):
18 |     already_opened = [issue.title for issue in repo.get_issues(state='open')]
19 |     for key, val in open_contests.items():
20 |         if key not in already_opened:
21 |             issue_title = key
22 |             upload_contents = "* 기간: %s\n * 분류: %s\n * 주최자: %s\n * 사이트링크: %s" % (val[0], val[1], val[2], val[3])
23 |             repo.create_issue(title=issue_title, body=upload_contents)
24 |
25 |
26 | def close_github_issue(repo, close_contests):
27 |     open_issues = repo.get_issues(state='open')
28 |     KST = str(datetime.now(timezone('Asia/Seoul')))[:10]  # today as "YYYY-MM-DD"
29 |     need_to_close = list(close_contests.keys())
30 |     for issue in open_issues:
31 |         # close when the contest is listed in close.json, or when the end date
32 |         # embedded in the issue body (body[19:29], see note below) has passed
33 |         if issue.title in need_to_close or issue.body[19:29] < KST:
34 |             issue.edit(state='closed')
35 |
36 |
37 |
38 | if __name__ == "__main__":
39 | """ init settings """
40 | base_dir = os.path.dirname(os.path.abspath(__file__))
41 | access_token = os.environ['MY_GITHUB_TOKEN']
42 | repository_name = "Realtime-IT-Contest-notification"
43 | repo = get_github_repo(access_token, repository_name)
44 |
45 | """ load data """
46 | open = load(base_dir, "/open.json")
47 | close = load(base_dir, "/close.json")
48 |
49 | """ setting issues """
50 | close_github_issue(repo, close)
51 | upload_github_issue(repo, open)
52 |
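53 | # The body[19:29] slice in close_github_issue relies on the exact layout
54 | # written by upload_github_issue, e.g. (hypothetical contest):
55 | #   "* 기간: 2020-08-01 ~ 2020-08-31\n * 분류: ..."
56 | # "* 기간: " is 6 characters and "2020-08-01 ~ " is 13 more, so the
57 | # closing date always occupies body[19:29].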
--------------------------------------------------------------------------------
/thinkgood.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import requests
4 | from bs4 import BeautifulSoup
5 |
6 |
7 | class Thinkgood:
8 | def __init__(self):
9 | self.base_url = 'https://www.thinkcontest.com/Contest'
10 | self.categories = ['/CateField.html?page=1&c=11', '/CateField.html?page=1&c=12']
11 |         self.select = 'div#wrapper > div#trunk > div#main > div.container > ' \
12 |                       'div.body.contest-cate > div.all-contest > table.type-2.mg-t-5.contest-table > ' \
13 |                       'tbody > tr > td.txt-left > div.contest-title > a'
14 | self.contests = dict()
15 |
16 | def crawling(self):
17 | """ 카테고리별 공모전 리스트 크롤링 """
18 | print("===== [Thinkgood] Start Crawling data... =====")
19 | for category in self.categories:
20 | req = requests.get(self.base_url + category)
21 | soup = BeautifulSoup(req.content, "html.parser")
22 | data_list = soup.select(self.select)
23 | self.scraping(data_list)
24 | print("===== [Thinkgood] Finish Crawling data... =====")
25 |
26 | def scraping(self, data_list):
27 | """
28 | 공모전 세부 정보 크롤링 & dict 형태로 데이터 저장
29 | 공모전 이름(title): [기간(term),
30 | 분류(classify) - 과학/공학, 소프트웨어 항목만,
31 | 주최자(host),
32 | 사이트링크(link)]
33 | """
34 | for data in data_list:
35 | req = requests.get('https://www.thinkcontest.com' + data.get('href'))
36 | soup = BeautifulSoup(req.content, "html.parser")
37 | tmp = soup.find(class_='body contest-detail')
38 | val_tmp = tmp.select('div.contest-overview > table.type-5 > tbody > tr')
39 |
40 | title = soup.find(class_='body contest-detail').find(class_='title').get_text()
41 | values = ['0'] * 4
42 | for elem in val_tmp:
43 | elem = str(elem.get_text())
44 | if "접수기간" in elem: # term
45 | values[0] = elem.replace("접수기간\n", "").replace("\n", "")
46 | elif "응모분야" in elem: # classify
47 | values[1] = elem.replace("응모분야\n", "").replace("\n", ", ")[6:-6]
48 | elif "주최" in elem: # host
49 | values[2] = elem.replace("주최\n", "").replace("\n", "")
50 |                 elif "주관" in elem:  # co-organizer, appended to the host field
51 |                     values[2] += ": " + elem.replace("주관", "").replace("\n", "")
52 |                     if values[2][0] == ":":
53 |                         values[2] = values[2].replace(": ", "", 1)  # strip the stray leading separator
54 |
55 | values[3] = tmp.find(class_="linker").get('href') if tmp.find(class_="linker") is not None \
56 | else "(링크 미지원)" # link
57 | self.contests[title] = values
58 |
59 | def save_result(self):
60 | base_dir = os.path.dirname(os.path.abspath(__file__))
61 | with open(os.path.join(base_dir, 'thinkgood.json'), 'w+', encoding='utf-8') as json_file:
62 | json.dump(self.contests, json_file, ensure_ascii=False, indent='\t')
63 |         print("===== [Thinkgood] Save data... =====")
64 |
65 | def check_result(self):
66 | for key, value in self.contests.items():
67 | print(key, ":", value)
68 |
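69 | # Shape of the saved thinkgood.json (hypothetical entry):
70 | # {
71 | #     "제5회 OO 소프트웨어 아이디어 공모전": [
72 | #         "2020-08-01 ~ 2020-08-31",    # term
73 | #         "소프트웨어, 과학/공학",        # classify
74 | #         "OO재단: OO연구원",             # host ("주최: 주관")
75 | #         "(링크 미지원)"                 # link, when no .linker element exists
76 | #     ]
77 | # }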
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 |
5 | def save(base_dir, data, filename):
6 | with open(os.path.join(base_dir, filename), 'w', encoding='utf-8') as json_file:
7 | json.dump(data, json_file, ensure_ascii=False, indent='\t')
8 | print("===== Finish saving data... =====")
9 |
10 |
11 | def load(base_dir, filename):
12 | with open(base_dir + filename, encoding='utf-8', errors='ignore') as data:
13 | json_file = json.load(data, strict=False)
14 | return json_file
15 |
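16 | # Usage, as in the other scripts (note the leading slash passed to load):
17 | #   contests = load(base_dir, "/incruit.json")
18 | #   save(base_dir, contests, 'open.json')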
--------------------------------------------------------------------------------