├── requirements.txt
├── img
    ├── 6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png
    └── 69E70949-0F7D-4B70-9B49-B9D290B82BAE.png
├── src
    ├── config.py
    └── spider.py
├── LICENSE
├── README.md
└── .gitignore


/requirements.txt:
--------------------------------------------------------------------------------
1 | pysqlite3
2 | urllib3
3 | beautifulsoup4
4 | requests


--------------------------------------------------------------------------------
/img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hu-chi/arxiv-feishu-bot/HEAD/img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png


--------------------------------------------------------------------------------
/img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hu-chi/arxiv-feishu-bot/HEAD/img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png


--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
# URL of the arXiv "new submissions" listing page that spider.py downloads.
# Other options: 'https://arxiv.org/list/cs/new',
#                'https://arxiv.org/list/cs.AI/new'
#                'https://arxiv.org/list/cs.CV/new'
NEW_SUB_URL = 'https://arxiv.org/list/cs.CL/new'

# Keywords to search for. spider.py's filter_paper matches each keyword
# case-insensitively as a substring of a paper's title or abstract.
KEYWORD_LIST = ["sentiment analysis", "aspect based", "augmentation"]

# Webhook URL and signing secret of the Feishu bot. The values below are
# placeholders: replace both with the ones of your own bot.
WEBHOOK_URL = "https://open.feishu.cn/open-apis/bot/v2/hook/#"
WEBHOOK_SECRET = "#"
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 huchi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # arxiv-feishu-bot
 2 | A simple Feishu bot script that pushes the latest arXiv articles every day. The
 3 |  result looks like this:
 4 |  ![](img/6193CD9D-1324-4E65-85B3-3EFA23B6FD0B.png)
 5 | 
 6 | Of course, you can also use other websites to follow the latest arXiv papers,
 7 |  such as https://deeplearn.org/ .
 8 |  
 9 | Our purpose in making this is to make it easier to discuss the latest articles
10 |   in Feishu. We hope this script is helpful to you.
11 |   
12 | Our code is mainly based on this 
13 | [project](https://github.com/kobiso/get-daily-arxiv-noti) .
14 | 
15 | The following section explains how to use it.
16 | 
17 | ## How to use 
18 | 
19 | 1. Create a Feishu bot and replace the Feishu webhook URL and secret in
20 |  `src/config.py`
21 | ![](img/69E70949-0F7D-4B70-9B49-B9D290B82BAE.png)
22 | 
23 | 2. pip install -r requirements.txt
24 | 
25 | 3. (Optional) Rewrite the function `filter_paper` in `src/spider.py` to suit your
26 |  needs.
27 |  
28 | 4. Schedule the script to run periodically with `crontab` or a similar tool.
29 | 
30 | ```bash
31 | 30 10 * * * python3 $Absolute path$
32 | ```
33 | 
34 | And then you can see the news pushed at 10:30 every day.


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### JetBrains template
  3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
  4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  5 | 
  6 | # User-specific stuff
  7 | .idea/**/workspace.xml
  8 | .idea/**/tasks.xml
  9 | .idea/**/usage.statistics.xml
 10 | .idea/**/dictionaries
 11 | .idea/**/shelf
 12 | 
 13 | # Generated files
 14 | .idea/**/contentModel.xml
 15 | 
 16 | # Sensitive or high-churn files
 17 | .idea/**/dataSources/
 18 | .idea/**/dataSources.ids
 19 | .idea/**/dataSources.local.xml
 20 | .idea/**/sqlDataSources.xml
 21 | .idea/**/dynamic.xml
 22 | .idea/**/uiDesigner.xml
 23 | .idea/**/dbnavigator.xml
 24 | 
 25 | # Gradle
 26 | .idea/**/gradle.xml
 27 | .idea/**/libraries
 28 | 
 29 | # Gradle and Maven with auto-import
 30 | # When using Gradle or Maven with auto-import, you should exclude module files,
 31 | # since they will be recreated, and may cause churn.  Uncomment if using
 32 | # auto-import.
 33 | # .idea/modules.xml
 34 | # .idea/*.iml
 35 | # .idea/modules
 36 | # *.iml
 37 | # *.ipr
 38 | 
 39 | # CMake
 40 | cmake-build-*/
 41 | 
 42 | # Mongo Explorer plugin
 43 | .idea/**/mongoSettings.xml
 44 | 
 45 | # File-based project format
 46 | *.iws
 47 | 
 48 | # IntelliJ
 49 | out/
 50 | 
 51 | # mpeltonen/sbt-idea plugin
 52 | .idea_modules/
 53 | 
 54 | # JIRA plugin
 55 | atlassian-ide-plugin.xml
 56 | 
 57 | # Cursive Clojure plugin
 58 | .idea/replstate.xml
 59 | 
 60 | # Crashlytics plugin (for Android Studio and IntelliJ)
 61 | com_crashlytics_export_strings.xml
 62 | crashlytics.properties
 63 | crashlytics-build.properties
 64 | fabric.properties
 65 | 
 66 | # Editor-based Rest Client
 67 | .idea/httpRequests
 68 | 
 69 | # Android studio 3.1+ serialized cache file
 70 | .idea/caches/build_file_checksums.ser
 71 | 
 72 | ### macOS template
 73 | # General
 74 | .DS_Store
 75 | .AppleDouble
 76 | .LSOverride
 77 | 
 78 | # Icon must end with two \r
 79 | Icon
 80 | 
 81 | # Thumbnails
 82 | ._*
 83 | 
 84 | # Files that might appear in the root of a volume
 85 | .DocumentRevisions-V100
 86 | .fseventsd
 87 | .Spotlight-V100
 88 | .TemporaryItems
 89 | .Trashes
 90 | .VolumeIcon.icns
 91 | .com.apple.timemachine.donotpresent
 92 | 
 93 | # Directories potentially created on remote AFP share
 94 | .AppleDB
 95 | .AppleDesktop
 96 | Network Trash Folder
 97 | Temporary Items
 98 | .apdisk
 99 | 
100 | ### Python template
101 | # Byte-compiled / optimized / DLL files
102 | __pycache__/
103 | *.py[cod]
104 | *$py.class
105 | 
106 | # C extensions
107 | *.so
108 | 
109 | # Distribution / packaging
110 | .Python
111 | build/
112 | develop-eggs/
113 | dist/
114 | downloads/
115 | eggs/
116 | .eggs/
117 | lib/
118 | lib64/
119 | parts/
120 | sdist/
121 | var/
122 | wheels/
123 | pip-wheel-metadata/
124 | share/python-wheels/
125 | *.egg-info/
126 | .installed.cfg
127 | *.egg
128 | MANIFEST
129 | 
130 | # PyInstaller
131 | #  Usually these files are written by a python script from a template
132 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
133 | *.manifest
134 | *.spec
135 | 
136 | # Installer logs
137 | pip-log.txt
138 | pip-delete-this-directory.txt
139 | 
140 | # Unit test / coverage reports
141 | htmlcov/
142 | .tox/
143 | .nox/
144 | .coverage
145 | .coverage.*
146 | .cache
147 | nosetests.xml
148 | coverage.xml
149 | *.cover
150 | .hypothesis/
151 | .pytest_cache/
152 | 
153 | # Translations
154 | *.mo
155 | *.pot
156 | 
157 | # Django stuff:
158 | *.log
159 | local_settings.py
160 | db.sqlite3
161 | db.sqlite3-journal
162 | 
163 | # Flask stuff:
164 | instance/
165 | .webassets-cache
166 | 
167 | # Scrapy stuff:
168 | .scrapy
169 | 
170 | # Sphinx documentation
171 | docs/_build/
172 | 
173 | # PyBuilder
174 | target/
175 | 
176 | # Jupyter Notebook
177 | .ipynb_checkpoints
178 | 
179 | # IPython
180 | profile_default/
181 | ipython_config.py
182 | 
183 | # pyenv
184 | .python-version
185 | 
186 | # pipenv
187 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
188 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
189 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
190 | #   install all needed dependencies.
191 | #Pipfile.lock
192 | 
193 | # celery beat schedule file
194 | celerybeat-schedule
195 | 
196 | # SageMath parsed files
197 | *.sage.py
198 | 
199 | # Environments
200 | .env
201 | .venv
202 | env/
203 | venv/
204 | ENV/
205 | env.bak/
206 | venv.bak/
207 | 
208 | # Spyder project settings
209 | .spyderproject
210 | .spyproject
211 | 
212 | # Rope project settings
213 | .ropeproject
214 | 
215 | # mkdocs documentation
216 | /site
217 | 
218 | # mypy
219 | .mypy_cache/
220 | .dmypy.json
221 | dmypy.json
222 | 
223 | # Pyre type checker
224 | .pyre/
225 | 
226 | /src/paper.db
227 | 


--------------------------------------------------------------------------------
/src/spider.py:
--------------------------------------------------------------------------------
  1 | # encoding: utf-8
  2 | # based on https://github.com/kobiso/get-daily-arxiv-noti
  3 | import base64
  4 | import hashlib
  5 | import hmac
  6 | import sqlite3
  7 | import urllib.request
  8 | from datetime import datetime
  9 | 
 10 | import requests
 11 | from bs4 import BeautifulSoup
 12 | 
 13 | from config import NEW_SUB_URL, KEYWORD_LIST, WEBHOOK_SECRET, WEBHOOK_URL
 14 | 
 15 | timestamp = int(datetime.now().timestamp())
 16 | 
 17 | 
 18 | def gen_sign(secret: str):
 19 |     string_to_sign = '{}\n{}'.format(timestamp, secret)
 20 |     hmac_code = hmac.new(
 21 |         string_to_sign.encode("utf-8"),
 22 |         digestmod=hashlib.sha256
 23 |     ).digest()
 24 |     sign = base64.b64encode(hmac_code).decode('utf-8')
 25 |     return sign
 26 | 
 27 | 
 28 | def send_to_bot(content: dict):
 29 |     sign = gen_sign(WEBHOOK_SECRET)
 30 | 
 31 |     params = {
 32 |         "timestamp": timestamp,
 33 |         "msg_type": "interactive",
 34 |         "sign": sign,
 35 |         "card": content
 36 |     }
 37 | 
 38 |     resp = requests.post(WEBHOOK_URL, json=params)
 39 |     resp.raise_for_status()
 40 |     result = resp.json()
 41 |     if result.get("code") and result.get("code") != 0:
 42 |         print(f"Message Send Error For: %s" % result['msg'])
 43 |         return False
 44 |     return True
 45 | 
 46 | 
 47 | def filter_paper(paper, keyword_list):
 48 |     for keyword in keyword_list:
 49 |         if keyword.lower() in paper["abstract"].lower() \
 50 |                 or keyword.lower() in paper["title"].lower():
 51 |             return False
 52 |     return True
 53 | 
 54 | 
 55 | def main():
 56 |     page = urllib.request.urlopen(NEW_SUB_URL)
 57 |     soup = BeautifulSoup(page, 'html.parser')
 58 |     content = soup.body.find("div", {'id': 'content'})
 59 | 
 60 |     issue_title = content.find("h3").text
 61 |     dt_list = content.dl.find_all("dt")
 62 |     dd_list = content.dl.find_all("dd")
 63 |     arxiv_base = "https://arxiv.org/abs/"
 64 | 
 65 |     assert len(dt_list) == len(dd_list)
 66 | 
 67 |     keyword_list = KEYWORD_LIST
 68 |     paper_list = []
 69 | 
 70 |     conn = sqlite3.connect('paper.db')
 71 |     c = conn.cursor()
 72 |     try:
 73 |         c.execute('''CREATE TABLE PAPER(NAME  TEXT    NOT NULL);''')
 74 |         conn.commit()
 75 |     except Exception:
 76 |         pass
 77 | 
 78 |     def check_in_sql(paper):
 79 |         cursor = c.execute(
 80 |             "SELECT NAME FROM PAPER WHERE NAME = '{}'".format(paper['title'])
 81 |         )
 82 |         return len(cursor.fetchall()) > 0
 83 | 
 84 |     for i in range(len(dt_list)):
 85 |         paper = {}
 86 |         paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
 87 |         paper['main_page'] = arxiv_base + paper_number
 88 |         paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
 89 | 
 90 |         paper['title'] = dd_list[i].find(
 91 |             "div", {"class": "list-title mathjax"}
 92 |         ).text.replace("Title: ", "").strip()
 93 | 
 94 |         paper['authors'] = dd_list[i].find(
 95 |             "div", {"class": "list-authors"}
 96 |         ).text.replace("Authors:\n", "").replace("\n", "").strip()
 97 | 
 98 |         paper['subjects'] = dd_list[i].find(
 99 |             "div", {"class": "list-subjects"}
100 |         ).text.replace("Subjects: ", "").strip()
101 | 
102 |         paper['abstract'] = dd_list[i].find(
103 |             "p", {"class": "mathjax"}
104 |         ).text.replace("\n", " ").strip()
105 | 
106 |         if not filter_paper(paper, keyword_list) and not check_in_sql(paper):
107 |             paper_list.append(paper)
108 | 
109 |     card_content = {
110 |         "config": {
111 |             "wide_screen_mode": True,
112 |             "enable_forward": True
113 |         },
114 |         "elements": [],
115 |         "header": {
116 |             "template": "blue",
117 |             "title": {
118 |                 "content": "今日论文推荐",
119 |                 "tag": "plain_text"
120 |             }
121 |         }
122 |     }
123 | 
124 |     for idx, paper in enumerate(paper_list):
125 |         card_content["elements"].append({
126 |             "tag": "div",
127 |             "text": {
128 |                 "content":
129 |                     "[{}] [{}]({})\n - **Authors:** {}\n - **Subjects:** {}\n ".format(
130 |                         idx, paper['title'], paper['main_page'],
131 |                         paper['authors'], paper['subjects']),
132 |                 "tag": "lark_md"
133 |             }
134 |         })
135 |         if idx != len(paper_list) - 1:
136 |             card_content["elements"].append({
137 |                 "tag": "hr"
138 |             })
139 |         c.execute(
140 |             "INSERT INTO PAPER (NAME) VALUES ('{}')".format(paper['title'])
141 |         )
142 | 
143 |     conn.commit()
144 |     conn.close()
145 | 
146 |     if len(paper_list) == 0:
147 |         card_content["elements"].append({
148 |             "tag": "div",
149 |             "text": {
150 |                 "content": "No papers to follow today.",
151 |                 "tag": "lark_md"
152 |             }
153 |         })
154 |     card_content["elements"].append({
155 |         "actions": [
156 |             {
157 |                 "tag": "button",
158 |                 "text": {
159 |                     "content": "查看今日全部论文",
160 |                     "tag": "plain_text"
161 |                 },
162 |                 "type": "primary",
163 |                 "url": NEW_SUB_URL
164 |             }
165 |         ],
166 |         "tag": "action"
167 |     })
168 | 
169 |     send_to_bot(card_content)
170 | 
171 | 
# Run the scrape-and-notify job only when executed as a script, not on import.
if __name__ == '__main__':
    main()
174 | 


--------------------------------------------------------------------------------