├── docs
│   ├── wechat.md
│   ├── nlp-arxiv-daily-wechat.json
│   └── _config.yml
├── .github
│   └── workflows
│       └── main.yml
└── daily_arxiv.py

/docs/wechat.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docs/nlp-arxiv-daily-wechat.json:
--------------------------------------------------------------------------------
{}
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
title: NLP Arxiv Daily
description: Automatically Update Papers Daily using GitHub Actions (Updated Every 12 Hours)
show_downloads: true
#theme: jekyll-theme-slate

remote_theme: pages-themes/slate@v0.2.0
plugins:
  - jekyll-remote-theme

github:
  zip_url: https://github.com/LearnNLP/nlp-arxiv-daily
  another_url: https://github.com/LearnNLP/nlp-arxiv-daily
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
# This is a basic workflow to help you get started with Actions

name: Run Arxiv Papers Daily

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  schedule:
    # at minute 0 every 12th hour (UTC); the previous "* 0/12 * * *" would
    # have matched every minute of hours 00 and 12
    - cron: "0 */12 * * *"
  # Triggers the workflow on push or pull request events but only for the main branch
  # push:
  #   branches:
  #     - main

env:

  GITHUB_USER_NAME: LearnNLP
  GITHUB_USER_EMAIL: qxk554@126.com


# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    name: update
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Python Env
        uses: actions/setup-python@v1
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install arxiv
          pip install requests

      - name: Run daily arxiv
        run: |
          python daily_arxiv.py

      - name: Push updated files
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Github Action Automatic Update NLP Arxiv Papers"
          # include the wechat outputs, which daily_arxiv.py also writes
          files: README.md nlp-arxiv-daily.json docs/nlp-arxiv-daily-web.json docs/index.md docs/nlp-arxiv-daily-wechat.json docs/wechat.md
          rebase: 'true'
          name: ${{ env.GITHUB_USER_NAME }}
          email: ${{ env.GITHUB_USER_EMAIL }}
--------------------------------------------------------------------------------
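The corrected cron expression above is worth sanity-checking before relying on it. A minimal sketch, assuming the third-party croniter package (`pip install croniter`), which is not one of this repo's dependencies:

```python
# Sketch: confirm when "0 */12 * * *" fires.
# Assumes the third-party croniter package (pip install croniter).
from datetime import datetime
from croniter import croniter

it = croniter("0 */12 * * *", datetime(2024, 1, 1))
for _ in range(4):
    # prints 2024-01-01 12:00, 2024-01-02 00:00, 12:00, 2024-01-03 00:00
    print(it.get_next(datetime))
```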
/daily_arxiv.py:
--------------------------------------------------------------------------------
import datetime
import requests
import json
import arxiv
import os

base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"

def get_authors(authors, first_author = False):
    # join all authors into one string, or return just the first author
    if first_author == False:
        output = ", ".join(str(author) for author in authors)
    else:
        output = str(authors[0])
    return output

def sort_papers(papers):
    # newest arXiv ids first
    output = dict()
    keys = list(papers.keys())
    keys.sort(reverse=True)
    for key in keys:
        output[key] = papers[key]
    return output

def get_daily_papers(topic, query, max_results=2):
    """
    @param topic: str
    @param query: str
    @return data, data_web: dict, dict
    """

    # output
    content = dict()
    content_to_web = dict()

    search_engine = arxiv.Search(
        query = query,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    cnt = 0

    for result in search_engine.results():

        paper_id = result.get_short_id()
        paper_title = result.title
        paper_url = result.entry_id
        code_url = base_url + paper_id
        paper_abstract = result.summary.replace("\n", " ")
        paper_authors = get_authors(result.authors)
        # the full author list is kept on purpose; pass first_author=True
        # to get_authors for only the first author
        paper_first_author = paper_authors #get_authors(result.authors,first_author = True)
        primary_category = result.primary_category
        publish_time = result.published.date()
        update_time = result.updated.date()
        comments = result.comment

        print("Time = ", update_time,
              " title = ", paper_title,
              " author = ", paper_first_author)

        # eg: 2108.09112v1 -> 2108.09112
        ver_pos = paper_id.find('v')
        if ver_pos == -1:
            paper_key = paper_id
        else:
            paper_key = paper_id[0:ver_pos]

        try:
            r = requests.get(code_url).json()
            # source code link
            if "official" in r and r["official"]:
                cnt += 1
                repo_url = r["official"]["url"]
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|**[link]({repo_url})**|\n"
                content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url}), Code: **[{repo_url}]({repo_url})**"
            else:
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|null|\n"
                content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url})"

            # TODO: select useful comments
            comments = None  # comments are disabled until useful ones can be filtered
            if comments != None:
                content_to_web[paper_key] = content_to_web[paper_key] + f", {comments}\n"
            else:
                content_to_web[paper_key] = content_to_web[paper_key] + f"\n"

        except Exception as e:
            print(f"exception: {e} with id: {paper_key}")

    data = {topic: content}
    data_web = {topic: content_to_web}
    return data, data_web

def update_json_file(filename, data_all):
    # merge newly fetched papers into the existing JSON store
    with open(filename, "r") as f:
        content = f.read()
        if len(content.strip()) < 2:
            m = {}
        else:
            m = json.loads(content)

    json_data = m.copy()

    # update papers under each keyword
    for data in data_all:
        for keyword in data.keys():
            papers = data[keyword]

            if keyword in json_data.keys():
                json_data[keyword].update(papers)
            else:
                json_data[keyword] = papers

    with open(filename, "w") as f:
        json.dump(json_data, f)

def json_to_md(filename, md_filename,
               to_web = False,
               use_title = True,
               use_tc = True,
               show_badge = True):
    """
    @param filename: str
    @param md_filename: str
    @return None
    """

    DateNow = datetime.date.today()
    DateNow = str(DateNow)
    DateNow = DateNow.replace('-', '.')

    with open(filename, "r") as f:
        content = f.read()
        if not content:
            data = {}
        else:
            data = json.loads(content)

    # clean README.md if daily already exists, else create it
    with open(md_filename, "w+") as f:
        pass

    # write data into README.md
    with open(md_filename, "a+") as f:

        if (use_title == True) and (to_web == True):
            f.write("---\n" + "layout: default\n" + "---\n\n")

        if show_badge == True:
            f.write(f"[![Contributors][contributors-shield]][contributors-url]\n")
            f.write(f"[![Forks][forks-shield]][forks-url]\n")
            f.write(f"[![Stargazers][stars-shield]][stars-url]\n")
            f.write(f"[![Issues][issues-shield]][issues-url]\n\n")

        if use_title == True:
            f.write("## Updated on " + DateNow + "\n\n")
        else:
            f.write("> Updated on " + DateNow + "\n\n")

        # Add: table of contents (collapsible HTML block in the markdown output)
        if use_tc == True:
            f.write("<details>\n")
            f.write("  <summary>Table of Contents</summary>\n")
            f.write("  <ol>\n")
            for keyword in data.keys():
                day_content = data[keyword]
                if not day_content:
                    continue
                kw = keyword.replace(' ', '-')
                f.write(f"    <li><a href=#{kw.lower()}>{keyword}</a></li>\n")
            f.write("  </ol>\n")
            f.write("</details>\n\n")

        for keyword in data.keys():
            day_content = data[keyword]
            if not day_content:
                continue
            # the head of each part
            f.write(f"## {keyword}\n\n")

            if use_title == True:
                if to_web == False:
                    f.write("|Publish Date|Title|Authors|PDF|Code|\n" + "|---|---|---|---|---|\n")
                else:
                    f.write("| Publish Date | Title | Authors | PDF | Code |\n")
                    f.write("|:---------|:-----------------------|:---------|:------|:------|\n")

            # sort papers by date
            day_content = sort_papers(day_content)

            for _, v in day_content.items():
                if v is not None:
                    f.write(v)

            f.write(f"\n")

        # Add: back to top (anchor derived from the "Updated on" heading)
        top_info = f"#Updated on {DateNow}"
        top_info = top_info.replace(' ', '-').replace('.', '')
        f.write(f"<p align=\"right\">(<a href={top_info}>back to top</a>)</p>\n\n")

        if show_badge == True:
            f.write(f"[contributors-shield]: https://img.shields.io/github/contributors/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[contributors-url]: https://github.com/LearnNLP/nlp-arxiv-daily/graphs/contributors\n")
            f.write(f"[forks-shield]: https://img.shields.io/github/forks/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[forks-url]: https://github.com/LearnNLP/nlp-arxiv-daily/network/members\n")
            f.write(f"[stars-shield]: https://img.shields.io/github/stars/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[stars-url]: https://github.com/LearnNLP/nlp-arxiv-daily/stargazers\n")
            f.write(f"[issues-shield]: https://img.shields.io/github/issues/LearnNLP/nlp-arxiv-daily.svg?style=for-the-badge\n")
            f.write(f"[issues-url]: https://github.com/LearnNLP/nlp-arxiv-daily/issues\n\n")

    print("finished")


if __name__ == "__main__":

    data_collector = []
    data_collector_web = []

    # note: the " OR " separator needs surrounding spaces, otherwise the
    # concatenated query reads ...translation"ORti:... and arXiv misparses it
    keywords = dict()
    keywords["Speech Translation"] = "ti:\"speech translation\"" + " OR " + "ti:\"speech-to-text translation\""
    keywords["Speech Recognition"] = "ti:\"speech recognition\""
    keywords["Audio Forensics"] = "ti:\"Audio Splicing Detection\"" + " OR " + "ti:\"Speech Deepfake Detection\""
    keywords["Legal"] = "ti:\"legal\""

    for topic, keyword in keywords.items():

        # topic = keyword.replace("\"","")
        print("Keyword: " + topic)

        data, data_web = get_daily_papers(topic, query=keyword, max_results=100)
        data_collector.append(data)
        data_collector_web.append(data_web)

        print("\n")

    # 1. update README.md file
    json_file = "nlp-arxiv-daily.json"
    md_file = "README.md"
    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file, md_file)

    # 2. update docs/index.md file
    json_file = "./docs/nlp-arxiv-daily-web.json"
    md_file = "./docs/index.md"
    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file, md_file, to_web=True)

    # 3. update docs/wechat.md file
    json_file = "./docs/nlp-arxiv-daily-wechat.json"
    md_file = "./docs/wechat.md"
    # update json data
    update_json_file(json_file, data_collector_web)
    # json data to markdown
    json_to_md(json_file, md_file, to_web=False, use_title=False)
--------------------------------------------------------------------------------
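A quick way to vet a new keyword query before wiring it into `daily_arxiv.py` is to dry-run the same calls the script makes. A minimal sketch, assuming the same unpinned `arxiv` and `requests` packages the workflow installs (newer `arxiv` releases deprecate `Search.results()` in favor of `arxiv.Client().results(search)`; the query string below is just an example):

```python
# Sketch: dry-run one query the way get_daily_papers() does.
import arxiv
import requests

BASE_URL = "https://arxiv.paperswithcode.com/api/v0/papers/"

search = arxiv.Search(
    query='ti:"speech translation" OR ti:"speech-to-text translation"',
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for result in search.results():
    # same lookup the script performs for an official code link
    info = requests.get(BASE_URL + result.get_short_id()).json()
    official = info.get("official") or {}
    print(result.updated.date(), "|", result.title, "|", official.get("url", "no code"))
```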
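For reference when inspecting `nlp-arxiv-daily.json` by hand: `update_json_file()` maintains one dict per topic, keyed by versionless arXiv id, with a pre-rendered markdown table row as the value. A made-up illustration of that shape (the id, date, title, and author below are invented):

```python
# Illustration only: shows the JSON store layout, not real data.
import json

store = {
    "Speech Translation": {
        "2108.09112": "|**2021-08-20**|**An Example Title**|A. Author et.al."
                      "|[2108.09112v1](http://arxiv.org/abs/2108.09112v1)|null|\n"
    }
}
print(json.dumps(store, indent=2))
```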