├── docs
│   ├── wechat.md
│   ├── nlp-arxiv-daily-wechat.json
│   └── _config.yml
├── .github
│   └── workflows
│       └── main.yml
└── daily_arxiv.py

/docs/wechat.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docs/nlp-arxiv-daily-wechat.json:
--------------------------------------------------------------------------------
{}
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
title: NLP Arxiv Daily
description: Automatically Update Papers Daily using GitHub Actions (Updated Every 12 Hours)
show_downloads: true
#theme: jekyll-theme-slate

remote_theme: pages-themes/slate@v0.2.0
plugins:
  - jekyll-remote-theme

github:
  zip_url: https://github.com/LearnNLP/nlp-arxiv-daily
  another_url: https://github.com/LearnNLP/nlp-arxiv-daily
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
# This is a basic workflow to help you get started with Actions

name: Run Arxiv Papers Daily

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  schedule:
    # at minute 0 every 12th hour (UTC); the previous "* 0/12 * * *" would
    # have matched every minute of hours 00 and 12
    - cron: "0 */12 * * *"
  # Triggers the workflow on push or pull request events but only for the main branch
  # push:
  #   branches:
  #     - main

env:

  GITHUB_USER_NAME: LearnNLP
  GITHUB_USER_EMAIL: qxk554@126.com


# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    name: update
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Python Env
        uses: actions/setup-python@v1
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install arxiv
          pip install requests

      - name: Run daily arxiv
        run: |
          python daily_arxiv.py

      - name: Push updated files
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Github Action Automatic Update NLP Arxiv Papers"
          # include the wechat outputs, which daily_arxiv.py also writes
          files: README.md nlp-arxiv-daily.json docs/nlp-arxiv-daily-web.json docs/index.md docs/nlp-arxiv-daily-wechat.json docs/wechat.md
          rebase: 'true'
          name: ${{ env.GITHUB_USER_NAME }}
          email: ${{ env.GITHUB_USER_EMAIL }}
--------------------------------------------------------------------------------
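The corrected cron expression above is worth sanity-checking before relying on it. A minimal sketch, assuming the third-party croniter package (`pip install croniter`), which is not one of this repo's dependencies:

```python
# Sketch: confirm when "0 */12 * * *" fires.
# Assumes the third-party croniter package (pip install croniter).
from datetime import datetime
from croniter import croniter

it = croniter("0 */12 * * *", datetime(2024, 1, 1))
for _ in range(4):
    # prints 2024-01-01 12:00, 2024-01-02 00:00, 12:00, 2024-01-03 00:00
    print(it.get_next(datetime))
```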
/daily_arxiv.py:
--------------------------------------------------------------------------------
import datetime
import requests
import json
import arxiv
import os

base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"

def get_authors(authors, first_author = False):
    # join all authors into one string, or return just the first author
    if first_author == False:
        output = ", ".join(str(author) for author in authors)
    else:
        output = str(authors[0])
    return output

def sort_papers(papers):
    # newest arXiv ids first
    output = dict()
    keys = list(papers.keys())
    keys.sort(reverse=True)
    for key in keys:
        output[key] = papers[key]
    return output

def get_daily_papers(topic, query, max_results=2):
    """
    @param topic: str
    @param query: str
    @return data, data_web: dict, dict
    """

    # output
    content = dict()
    content_to_web = dict()

    search_engine = arxiv.Search(
        query = query,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    cnt = 0

    for result in search_engine.results():

        paper_id = result.get_short_id()
        paper_title = result.title
        paper_url = result.entry_id
        code_url = base_url + paper_id
        paper_abstract = result.summary.replace("\n", " ")
        paper_authors = get_authors(result.authors)
        # the full author list is kept on purpose; pass first_author=True
        # to get_authors for only the first author
        paper_first_author = paper_authors #get_authors(result.authors,first_author = True)
        primary_category = result.primary_category
        publish_time = result.published.date()
        update_time = result.updated.date()
        comments = result.comment

        print("Time = ", update_time,
              " title = ", paper_title,
              " author = ", paper_first_author)

        # eg: 2108.09112v1 -> 2108.09112
        ver_pos = paper_id.find('v')
        if ver_pos == -1:
            paper_key = paper_id
        else:
            paper_key = paper_id[0:ver_pos]

        try:
            r = requests.get(code_url).json()
            # source code link
            if "official" in r and r["official"]:
                cnt += 1
                repo_url = r["official"]["url"]
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|**[link]({repo_url})**|\n"
                content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url}), Code: **[{repo_url}]({repo_url})**"
            else:
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|null|\n"
                content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url})"

            # TODO: select useful comments
            comments = None  # comments are disabled until useful ones can be filtered
            if comments != None:
                content_to_web[paper_key] = content_to_web[paper_key] + f", {comments}\n"
            else:
                content_to_web[paper_key] = content_to_web[paper_key] + f"\n"

        except Exception as e:
            print(f"exception: {e} with id: {paper_key}")

    data = {topic: content}
    data_web = {topic: content_to_web}
    return data, data_web

def update_json_file(filename, data_all):
    # merge newly fetched papers into the existing JSON store
    with open(filename, "r") as f:
        content = f.read()
        if len(content.strip()) < 2:
            m = {}
        else:
            m = json.loads(content)

    json_data = m.copy()

    # update papers under each keyword
    for data in data_all:
        for keyword in data.keys():
            papers = data[keyword]

            if keyword in json_data.keys():
                json_data[keyword].update(papers)
            else:
                json_data[keyword] = papers

    with open(filename, "w") as f:
        json.dump(json_data, f)

def json_to_md(filename, md_filename,
               to_web = False,
               use_title = True,
               use_tc = True,
               show_badge = True):
    """
    @param filename: str
    @param md_filename: str
    @return None
    """

    DateNow = datetime.date.today()
    DateNow = str(DateNow)
    DateNow = DateNow.replace('-', '.')

    with open(filename, "r") as f:
        content = f.read()
        if not content:
            data = {}
        else:
            data = json.loads(content)

    # clean README.md if daily already exists, else create it
    with open(md_filename, "w+") as f:
        pass

    # write data into README.md
    with open(md_filename, "a+") as f:

        if (use_title == True) and (to_web == True):
            f.write("---\n" + "layout: default\n" + "---\n\n")

        if show_badge == True:
            f.write(f"[![Contributors][contributors-shield]][contributors-url]\n")
            f.write(f"[![Forks][forks-shield]][forks-url]\n")
            f.write(f"[![Stargazers][stars-shield]][stars-url]\n")
            f.write(f"[![Issues][issues-shield]][issues-url]\n\n")

        if use_title == True:
            f.write("## Updated on " + DateNow + "\n\n")
        else:
            f.write("> Updated on " + DateNow + "\n\n")

        # Add: table of contents (collapsible HTML block in the markdown output)
        if use_tc == True:
            f.write("<details>\n")
            f.write("  <summary>Table of Contents</summary>\n")
            f.write("  <ol>\n")
            for keyword in data.keys():
                day_content = data[keyword]
                if not day_content:
                    continue
                kw = keyword.replace(' ', '-')
                f.write(f"    <li><a href=#{kw.lower()}>{keyword}</a></li>\n")
            f.write("  </ol>\n")
            f.write("</details>\n\n")

        for keyword in data.keys():
            day_content = data[keyword]
            if not day_content:
                continue
            # the head of each part
            f.write(f"## {keyword}\n\n")

            if use_title == True:
                if to_web == False:
                    f.write("|Publish Date|Title|Authors|PDF|Code|\n" + "|---|---|---|---|---|\n")
                else:
                    f.write("| Publish Date | Title | Authors | PDF | Code |\n")
                    f.write("|:---------|:-----------------------|:---------|:------|:------|\n")

            # sort papers by date
            day_content = sort_papers(day_content)

            for _, v in day_content.items():
                if v is not None:
                    f.write(v)

            f.write(f"\n")

        # Add: back to top (anchor derived from the "Updated on" heading)
        top_info = f"#Updated on {DateNow}"
        top_info = top_info.replace(' ', '-').replace('.', '')
        f.write(f"<p align=\"right\">(<a href={top_info}>back to top</a>)</p>\n\n")

        if show_badge == True:
            f.write(f"[contributors-shield]: https://img.shields.io/github/contributors/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[contributors-url]: https://github.com/LearnNLP/nlp-arxiv-daily/graphs/contributors\n")
            f.write(f"[forks-shield]: https://img.shields.io/github/forks/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[forks-url]: https://github.com/LearnNLP/nlp-arxiv-daily/network/members\n")
            f.write(f"[stars-shield]: https://img.shields.io/github/stars/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[stars-url]: https://github.com/LearnNLP/nlp-arxiv-daily/stargazers\n")
            f.write(f"[issues-shield]: https://img.shields.io/github/issues/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[issues-url]: https://github.com/LearnNLP/nlp-arxiv-daily/issues\n\n")

    print("finished")


if __name__ == "__main__":

    data_collector = []
    data_collector_web = []

    # note: the " OR " separator needs surrounding spaces, otherwise the
    # concatenated query reads ...translation"ORti:... and arXiv misparses it
    keywords = dict()
    keywords["Speech Translation"] = "ti:\"speech translation\"" + " OR " + "ti:\"speech-to-text translation\""
    keywords["Speech Recognition"] = "ti:\"speech recognition\""
    keywords["Audio Forensics"] = "ti:\"Audio Splicing Detection\"" + " OR " + "ti:\"Speech Deepfake Detection\""
    keywords["Legal"] = "ti:\"legal\""

    for topic, keyword in keywords.items():

        # topic = keyword.replace("\"","")
        print("Keyword: " + topic)

        data, data_web = get_daily_papers(topic, query=keyword, max_results=100)
        data_collector.append(data)
        data_collector_web.append(data_web)

        print("\n")

    # 1. update README.md file
    json_file = "nlp-arxiv-daily.json"
    md_file = "README.md"
    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file, md_file)

    # 2. update docs/index.md file
    json_file = "./docs/nlp-arxiv-daily-web.json"
    md_file = "./docs/index.md"
    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file, md_file, to_web=True)

    # 3. update docs/wechat.md file
    json_file = "./docs/nlp-arxiv-daily-wechat.json"
    md_file = "./docs/wechat.md"
    # update json data
    update_json_file(json_file, data_collector_web)
    # json data to markdown
    json_to_md(json_file, md_file, to_web=False, use_title=False)
--------------------------------------------------------------------------------
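A quick way to vet a new keyword query before wiring it into `daily_arxiv.py` is to dry-run the same calls the script makes. A minimal sketch, assuming the same unpinned `arxiv` and `requests` packages the workflow installs (newer `arxiv` releases deprecate `Search.results()` in favor of `arxiv.Client().results(search)`; the query string below is just an example):

```python
# Sketch: dry-run one query the way get_daily_papers() does.
import arxiv
import requests

BASE_URL = "https://arxiv.paperswithcode.com/api/v0/papers/"

search = arxiv.Search(
    query='ti:"speech translation" OR ti:"speech-to-text translation"',
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for result in search.results():
    # same lookup the script performs for an official code link
    info = requests.get(BASE_URL + result.get_short_id()).json()
    official = info.get("official") or {}
    print(result.updated.date(), "|", result.title, "|", official.get("url", "no code"))
```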
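For reference when inspecting `nlp-arxiv-daily.json` by hand: `update_json_file()` maintains one dict per topic, keyed by versionless arXiv id, with a pre-rendered markdown table row as the value. A made-up illustration of that shape (the id, date, title, and author below are invented):

```python
# Illustration only: shows the JSON store layout, not real data.
import json

store = {
    "Speech Translation": {
        "2108.09112": "|**2021-08-20**|**An Example Title**|A. Author et.al."
                      "|[2108.09112v1](http://arxiv.org/abs/2108.09112v1)|null|\n"
    }
}
print(json.dumps(store, indent=2))
```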