├── requirements.txt ├── img ├── 6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png └── 69E70949-0F7D-4B70-9B49-B9D290B82BAE.png ├── src ├── config.py └── spider.py ├── LICENSE ├── README.md └── .gitignore /requirements.txt: -------------------------------------------------------------------------------- 1 | pysqlite3 2 | urllib3 3 | beautifulsoup4 4 | requests -------------------------------------------------------------------------------- /img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hu-chi/arxiv-feishu-bot/HEAD/img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png -------------------------------------------------------------------------------- /img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hu-chi/arxiv-feishu-bot/HEAD/img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # Set new submission url of subject 2 | # Other options: 'https://arxiv.org/list/cs/new', 3 | # 'https://arxiv.org/list/cs.AI/new' 4 | # 'https://arxiv.org/list/cs.CV/new' 5 | NEW_SUB_URL = 'https://arxiv.org/list/cs.CL/new' 6 | 7 | # Keywords to search 8 | KEYWORD_LIST = ["sentiment analysis", "aspect based", "augmentation"] 9 | 10 | # WEBHOOK_URL in feishu bot, Please replace the fake URL and secret with yours 11 | WEBHOOK_URL = "https://open.feishu.cn/open-apis/bot/v2/hook/#" 12 | WEBHOOK_SECRET = "#" 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 huchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of 
this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arxiv-feishu-bot 2 | A simple Feishu bot script that pushes the latest arXiv articles every day. The 3 | result looks like this: 4 | ![](img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png) 5 | 6 | Of course, you can also use other websites to follow the latest arXiv 7 | papers, such as https://deeplearn.org/ . 8 | 9 | Our purpose in making this is to facilitate discussion of the latest 10 | articles in Feishu; we hope this script is helpful to you. 11 | 12 | Our code is mainly based on this 13 | [project](https://github.com/kobiso/get-daily-arxiv-noti) . 14 | 15 | The next section explains how to use it. 16 | 17 | ## How to use 18 | 19 | 1. 
Create a Feishu bot and put its webhook URL and secret in 20 | `src/config.py` 21 | ![](img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png) 22 | 23 | 2. Run `pip install -r requirements.txt`. 24 | 25 | 3. (Optional) Rewrite the function `filter_paper` in `src/spider.py` to suit your 26 | needs. 27 | 28 | 4. Schedule the script with `crontab` or a similar tool. 29 | 30 | ```bash 31 | 30 10 * * * python3 $Absolute path$ 32 | ``` 33 | 34 | The bot will then push the news at 10:30 every day. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | # User-specific stuff 7 | .idea/**/workspace.xml 8 | .idea/**/tasks.xml 9 | .idea/**/usage.statistics.xml 10 | .idea/**/dictionaries 11 | .idea/**/shelf 12 | 13 | # Generated files 14 | .idea/**/contentModel.xml 15 | 16 | # Sensitive or high-churn files 17 | .idea/**/dataSources/ 18 | .idea/**/dataSources.ids 19 | .idea/**/dataSources.local.xml 20 | .idea/**/sqlDataSources.xml 21 | .idea/**/dynamic.xml 22 | .idea/**/uiDesigner.xml 23 | .idea/**/dbnavigator.xml 24 | 25 | # Gradle 26 | .idea/**/gradle.xml 27 | .idea/**/libraries 28 | 29 | # Gradle and Maven with auto-import 30 | # When using Gradle or Maven with auto-import, you should exclude module files, 31 | # since they will be recreated, and may cause churn. Uncomment if using 32 | # auto-import. 
33 | # .idea/modules.xml 34 | # .idea/*.iml 35 | # .idea/modules 36 | # *.iml 37 | # *.ipr 38 | 39 | # CMake 40 | cmake-build-*/ 41 | 42 | # Mongo Explorer plugin 43 | .idea/**/mongoSettings.xml 44 | 45 | # File-based project format 46 | *.iws 47 | 48 | # IntelliJ 49 | out/ 50 | 51 | # mpeltonen/sbt-idea plugin 52 | .idea_modules/ 53 | 54 | # JIRA plugin 55 | atlassian-ide-plugin.xml 56 | 57 | # Cursive Clojure plugin 58 | .idea/replstate.xml 59 | 60 | # Crashlytics plugin (for Android Studio and IntelliJ) 61 | com_crashlytics_export_strings.xml 62 | crashlytics.properties 63 | crashlytics-build.properties 64 | fabric.properties 65 | 66 | # Editor-based Rest Client 67 | .idea/httpRequests 68 | 69 | # Android studio 3.1+ serialized cache file 70 | .idea/caches/build_file_checksums.ser 71 | 72 | ### macOS template 73 | # General 74 | .DS_Store 75 | .AppleDouble 76 | .LSOverride 77 | 78 | # Icon must end with two \r 79 | Icon 80 | 81 | # Thumbnails 82 | ._* 83 | 84 | # Files that might appear in the root of a volume 85 | .DocumentRevisions-V100 86 | .fseventsd 87 | .Spotlight-V100 88 | .TemporaryItems 89 | .Trashes 90 | .VolumeIcon.icns 91 | .com.apple.timemachine.donotpresent 92 | 93 | # Directories potentially created on remote AFP share 94 | .AppleDB 95 | .AppleDesktop 96 | Network Trash Folder 97 | Temporary Items 98 | .apdisk 99 | 100 | ### Python template 101 | # Byte-compiled / optimized / DLL files 102 | __pycache__/ 103 | *.py[cod] 104 | *$py.class 105 | 106 | # C extensions 107 | *.so 108 | 109 | # Distribution / packaging 110 | .Python 111 | build/ 112 | develop-eggs/ 113 | dist/ 114 | downloads/ 115 | eggs/ 116 | .eggs/ 117 | lib/ 118 | lib64/ 119 | parts/ 120 | sdist/ 121 | var/ 122 | wheels/ 123 | pip-wheel-metadata/ 124 | share/python-wheels/ 125 | *.egg-info/ 126 | .installed.cfg 127 | *.egg 128 | MANIFEST 129 | 130 | # PyInstaller 131 | # Usually these files are written by a python script from a template 132 | # before PyInstaller builds the exe, so 
as to inject date/other infos into it. 133 | *.manifest 134 | *.spec 135 | 136 | # Installer logs 137 | pip-log.txt 138 | pip-delete-this-directory.txt 139 | 140 | # Unit test / coverage reports 141 | htmlcov/ 142 | .tox/ 143 | .nox/ 144 | .coverage 145 | .coverage.* 146 | .cache 147 | nosetests.xml 148 | coverage.xml 149 | *.cover 150 | .hypothesis/ 151 | .pytest_cache/ 152 | 153 | # Translations 154 | *.mo 155 | *.pot 156 | 157 | # Django stuff: 158 | *.log 159 | local_settings.py 160 | db.sqlite3 161 | db.sqlite3-journal 162 | 163 | # Flask stuff: 164 | instance/ 165 | .webassets-cache 166 | 167 | # Scrapy stuff: 168 | .scrapy 169 | 170 | # Sphinx documentation 171 | docs/_build/ 172 | 173 | # PyBuilder 174 | target/ 175 | 176 | # Jupyter Notebook 177 | .ipynb_checkpoints 178 | 179 | # IPython 180 | profile_default/ 181 | ipython_config.py 182 | 183 | # pyenv 184 | .python-version 185 | 186 | # pipenv 187 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 188 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 189 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 190 | # install all needed dependencies. 
# encoding: utf-8
# based on https://github.com/kobiso/get-daily-arxiv-noti
"""Push today's keyword-matching arXiv submissions to a Feishu bot.

The script downloads the listing at ``NEW_SUB_URL``, keeps the papers whose
title or abstract mentions one of ``KEYWORD_LIST``, skips titles already
stored in the local ``paper.db`` SQLite file, and posts the remainder to the
Feishu webhook as an interactive card.
"""
import base64
import hashlib
import hmac
import sqlite3
import urllib.request
from datetime import datetime

import requests
from bs4 import BeautifulSoup

from config import NEW_SUB_URL, KEYWORD_LIST, WEBHOOK_SECRET, WEBHOOK_URL

# Taken once at import time so that the signature and the request payload
# built later in the same run carry the same timestamp.
timestamp = int(datetime.now().timestamp())

_ARXIV_ABS_BASE = "https://arxiv.org/abs/"
_ARXIV_PDF_BASE = "https://arxiv.org/pdf/"


def gen_sign(secret: str) -> str:
    """Return the base64-encoded HMAC-SHA256 signature for the webhook.

    The HMAC key is ``"{timestamp}\\n{secret}"`` and the message is empty;
    ``timestamp`` is the module-level value that is also sent in the payload.

    :param secret: the signing secret configured for the bot.
    :return: the signature as an ASCII string.
    """
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(
        string_to_sign.encode("utf-8"),
        digestmod=hashlib.sha256
    ).digest()
    return base64.b64encode(hmac_code).decode('utf-8')


def send_to_bot(content: dict) -> bool:
    """POST an interactive card to ``WEBHOOK_URL``.

    :param content: the card body placed under the ``"card"`` key.
    :return: ``True`` when the response carries no non-zero ``code``,
        ``False`` otherwise (the reported message is printed).
    :raises requests.HTTPError: when the webhook answers with an HTTP error.
    """
    params = {
        "timestamp": timestamp,
        "msg_type": "interactive",
        "sign": gen_sign(WEBHOOK_SECRET),
        "card": content,
    }

    # A timeout keeps an unattended (cron) run from hanging forever.
    resp = requests.post(WEBHOOK_URL, json=params, timeout=30)
    resp.raise_for_status()
    result = resp.json()
    if result.get("code"):
        # .get() so an error body without "msg" cannot raise KeyError here.
        print("Message Send Error For: %s" % result.get("msg"))
        return False
    return True


def filter_paper(paper, keyword_list):
    """Tell whether *paper* should be filtered OUT.

    :param paper: dict with at least the ``"title"`` and ``"abstract"`` keys.
    :param keyword_list: keywords matched case-insensitively as substrings.
    :return: ``False`` when any keyword occurs in the title or abstract
        (the paper is kept), ``True`` when none does (the paper is dropped).
    """
    title = paper["title"].lower()
    abstract = paper["abstract"].lower()
    matched = any(
        keyword.lower() in abstract or keyword.lower() in title
        for keyword in keyword_list
    )
    return not matched


def _parse_paper(dt, dd) -> dict:
    """Build a paper dict from one ``<dt>``/``<dd>`` pair of the listing."""
    # NOTE(review): relies on the third space-separated token of the <dt>
    # text being "arXiv:<id>" - confirm against the current page layout.
    paper_number = dt.text.strip().split(" ")[2].split(":")[-1]

    title = dd.find(
        "div", {"class": "list-title mathjax"}
    ).text.replace("Title: ", "").strip()
    authors = dd.find(
        "div", {"class": "list-authors"}
    ).text.replace("Authors:\n", "").replace("\n", "").strip()
    subjects = dd.find(
        "div", {"class": "list-subjects"}
    ).text.replace("Subjects: ", "").strip()
    abstract = dd.find(
        "p", {"class": "mathjax"}
    ).text.replace("\n", " ").strip()

    return {
        'main_page': _ARXIV_ABS_BASE + paper_number,
        'pdf': _ARXIV_PDF_BASE + paper_number,
        'title': title,
        'authors': authors,
        'subjects': subjects,
        'abstract': abstract,
    }


def _build_card(paper_list: list) -> dict:
    """Build the interactive card listing *paper_list* plus a link button."""
    elements = []
    last = len(paper_list) - 1
    for idx, paper in enumerate(paper_list):
        elements.append({
            "tag": "div",
            "text": {
                "content":
                    "[{}] [{}]({})\n - **Authors:** {}\n - **Subjects:** {}\n ".format(
                        idx, paper['title'], paper['main_page'],
                        paper['authors'], paper['subjects']),
                "tag": "lark_md"
            }
        })
        # Separator between entries, but not after the last one.
        if idx != last:
            elements.append({"tag": "hr"})

    if not paper_list:
        elements.append({
            "tag": "div",
            "text": {
                "content": "No papers to follow today.",
                "tag": "lark_md"
            }
        })

    elements.append({
        "actions": [
            {
                "tag": "button",
                "text": {
                    "content": "查看今日全部论文",
                    "tag": "plain_text"
                },
                "type": "primary",
                "url": NEW_SUB_URL
            }
        ],
        "tag": "action"
    })

    return {
        "config": {
            "wide_screen_mode": True,
            "enable_forward": True
        },
        "elements": elements,
        "header": {
            "template": "blue",
            "title": {
                "content": "今日论文推荐",
                "tag": "plain_text"
            }
        }
    }


def main():
    """Fetch the listing, select new matching papers and push them.

    :raises ValueError: when the listing has unequal numbers of ``<dt>`` and
        ``<dd>`` entries and therefore cannot be paired up.
    """
    # BeautifulSoup reads the whole response in its constructor, so the
    # connection can be closed right after parsing.
    with urllib.request.urlopen(NEW_SUB_URL) as page:
        soup = BeautifulSoup(page, 'html.parser')
    content = soup.body.find("div", {'id': 'content'})

    dt_list = content.dl.find_all("dt")
    dd_list = content.dl.find_all("dd")
    if len(dt_list) != len(dd_list):
        # Explicit raise instead of assert: asserts are stripped under -O.
        raise ValueError(
            "arXiv listing has {} <dt> but {} <dd> entries".format(
                len(dt_list), len(dd_list)))

    # Relative path: the database is created in the current working directory.
    conn = sqlite3.connect('paper.db')
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS PAPER(NAME TEXT NOT NULL);")
        conn.commit()

        def check_in_sql(paper):
            # Bound parameter: titles may contain quotes.
            cursor = conn.execute(
                "SELECT NAME FROM PAPER WHERE NAME = ?", (paper['title'],)
            )
            return cursor.fetchone() is not None

        paper_list = []
        for dt, dd in zip(dt_list, dd_list):
            paper = _parse_paper(dt, dd)
            if not filter_paper(paper, KEYWORD_LIST) \
                    and not check_in_sql(paper):
                paper_list.append(paper)

        # Record titles only after a successful push, so a failed delivery
        # is retried on the next run instead of being silently dropped.
        if send_to_bot(_build_card(paper_list)):
            conn.executemany(
                "INSERT INTO PAPER (NAME) VALUES (?)",
                [(paper['title'],) for paper in paper_list]
            )
            conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    main()