├── docs
│   ├── wechat.md
│   ├── nlp-arxiv-daily-wechat.json
│   └── _config.yml
├── .github
│   └── workflows
│       └── main.yml
└── daily_arxiv.py
/docs/wechat.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/nlp-arxiv-daily-wechat.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | title: NLP Arxiv Daily
2 | description: Automatically Update Papers Daily using GitHub Actions (Updated Every 12 Hours)
3 | show_downloads: true
4 | #theme: jekyll-theme-slate
5 |
6 | remote_theme: pages-themes/slate@v0.2.0
7 | plugins:
8 | - jekyll-remote-theme
9 |
10 | github:
11 | zip_url: https://github.com/LearnNLP/nlp-arxiv-daily
12 | another_url: https://github.com/LearnNLP/nlp-arxiv-daily
13 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow to help you get started with Actions
2 |
3 | name: Run Arxiv Papers Daily
4 |
5 | # Controls when the workflow will run
6 | on:
7 | # Allows you to run this workflow manually from the Actions tab
8 | workflow_dispatch:
9 | schedule:
10 |     - cron: "0 0/12 * * *" #'*/60 * * * *'
11 | # Triggers the workflow on push or pull request events but only for the main branch
12 | # push:
13 | # branches:
14 | # - main
15 |
16 | env:
17 |
18 | GITHUB_USER_NAME: LearnNLP
19 | GITHUB_USER_EMAIL: qxk554@126.com
20 |
21 |
22 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
23 | jobs:
24 | # This workflow contains a single job called "build"
25 | build:
26 | name: update
27 | # The type of runner that the job will run on
28 | runs-on: ubuntu-latest
29 |
30 | # Steps represent a sequence of tasks that will be executed as part of the job
31 | steps:
32 | - name: Checkout
33 | uses: actions/checkout@v3
34 |
35 | - name: Set up Python Env
36 | uses: actions/setup-python@v1
37 | with:
38 | python-version: 3.8
39 |
40 | - name: Install dependencies
41 | run: |
42 | python -m pip install --upgrade pip
43 | pip install arxiv
44 | pip install requests
45 |
46 | - name: Run daily arxiv
47 | run: |
48 | python daily_arxiv.py
49 |
50 | - name: Push new nlp-arxiv-daily.md
51 | uses: github-actions-x/commit@v2.9
52 | with:
53 | github-token: ${{ secrets.GITHUB_TOKEN }}
54 | commit-message: "Github Action Automatic Update NLP Arxiv Papers"
55 | files: README.md nlp-arxiv-daily.json docs/nlp-arxiv-daily-web.json docs/index.md
56 | rebase: 'true'
57 | name: ${{ env.GITHUB_USER_NAME }}
58 | email: ${{ env.GITHUB_USER_EMAIL }}
59 |
--------------------------------------------------------------------------------
/daily_arxiv.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import requests
3 | import json
4 | import arxiv
5 | import os
6 |
7 | base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"
8 |
9 | def get_authors(authors, first_author = False):
10 | output = str()
11 | if first_author == False:
12 | output = ", ".join(str(author) for author in authors)
13 | else:
14 |         output = str(authors[0])
15 | return output
16 | def sort_papers(papers):
17 | output = dict()
18 | keys = list(papers.keys())
19 | keys.sort(reverse=True)
20 | for key in keys:
21 | output[key] = papers[key]
22 | return output
23 |
24 | def get_daily_papers(topic,query, max_results=2):
25 | """
26 | @param topic: str
27 | @param query: str
28 | @return paper_with_code: dict
29 | """
30 |
31 | # output
32 | content = dict()
33 | content_to_web = dict()
34 |
35 | # content
36 | output = dict()
37 |
38 | search_engine = arxiv.Search(
39 | query = query,
40 | max_results = max_results,
41 | sort_by = arxiv.SortCriterion.SubmittedDate
42 | )
43 |
44 | cnt = 0
45 |
46 | for result in search_engine.results():
47 |
48 | paper_id = result.get_short_id()
49 | paper_title = result.title
50 | paper_url = result.entry_id
51 | code_url = base_url + paper_id
52 | paper_abstract = result.summary.replace("\n"," ")
53 | paper_authors = get_authors(result.authors)
54 | paper_first_author = paper_authors #get_authors(result.authors,first_author = True)
55 | primary_category = result.primary_category
56 | publish_time = result.published.date()
57 | update_time = result.updated.date()
58 | comments = result.comment
59 |
60 |
61 |
62 | print("Time = ", update_time ,
63 | " title = ", paper_title,
64 | " author = ", paper_first_author)
65 |
66 | # eg: 2108.09112v1 -> 2108.09112
67 | ver_pos = paper_id.find('v')
68 | if ver_pos == -1:
69 | paper_key = paper_id
70 | else:
71 | paper_key = paper_id[0:ver_pos]
72 |
73 | try:
74 | r = requests.get(code_url).json()
75 | # source code link
76 | if "official" in r and r["official"]:
77 | cnt += 1
78 | repo_url = r["official"]["url"]
79 | content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|**[link]({repo_url})**|\n"
80 | content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url}), Code: **[{repo_url}]({repo_url})**"
81 |
82 | else:
83 | content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|null|\n"
84 | content_to_web[paper_key] = f"- {update_time}, **{paper_title}**, {paper_first_author} et.al., Paper: [{paper_url}]({paper_url})"
85 |
86 | # TODO: select useful comments
87 | comments = None
88 | if comments != None:
89 | content_to_web[paper_key] = content_to_web[paper_key] + f", {comments}\n"
90 | else:
91 | content_to_web[paper_key] = content_to_web[paper_key] + f"\n"
92 |
93 | except Exception as e:
94 | print(f"exception: {e} with id: {paper_key}")
95 |
96 | data = {topic:content}
97 | data_web = {topic:content_to_web}
98 | return data,data_web
99 |
100 | def update_json_file(filename,data_all):
101 | with open(filename,"r") as f:
102 | content = f.read()
103 | if len(content.strip())<2:
104 | m = {}
105 | else:
106 | m = json.loads(content)
107 |
108 | json_data = m.copy()
109 |
110 | # update papers in each keywords
111 | for data in data_all:
112 | for keyword in data.keys():
113 | papers = data[keyword]
114 |
115 | if keyword in json_data.keys():
116 | json_data[keyword].update(papers)
117 | else:
118 | json_data[keyword] = papers
119 |
120 | with open(filename,"w") as f:
121 | json.dump(json_data,f)
122 |
123 | def json_to_md(filename,md_filename,
124 | to_web = False,
125 | use_title = True,
126 | use_tc = True,
127 | show_badge = True):
128 | """
129 | @param filename: str
130 | @param md_filename: str
131 | @return None
132 | """
133 |
134 | DateNow = datetime.date.today()
135 | DateNow = str(DateNow)
136 | DateNow = DateNow.replace('-','.')
137 |
138 | with open(filename,"r") as f:
139 | content = f.read()
140 | if not content:
141 | data = {}
142 | else:
143 | data = json.loads(content)
144 |
145 |     # clear the target markdown file (e.g. README.md) if it already exists, otherwise create it
146 | with open(md_filename,"w+") as f:
147 | pass
148 |
149 | # write data into README.md
150 | with open(md_filename,"a+") as f:
151 |
152 | if (use_title == True) and (to_web == True):
153 | f.write("---\n" + "layout: default\n" + "---\n\n")
154 |
155 | if show_badge == True:
156 | f.write(f"[![Contributors][contributors-shield]][contributors-url]\n")
157 | f.write(f"[![Forks][forks-shield]][forks-url]\n")
158 | f.write(f"[![Stargazers][stars-shield]][stars-url]\n")
159 | f.write(f"[![Issues][issues-shield]][issues-url]\n\n")
160 |
161 | if use_title == True:
162 | f.write("## Updated on " + DateNow + "\n\n")
163 | else:
164 | f.write("> Updated on " + DateNow + "\n\n")
165 |
166 | #Add: table of contents
167 | if use_tc == True:
168 |             f.write("<details>\n")
169 |             f.write("  <summary>Table of Contents</summary>\n")
170 |             f.write("  <ol>\n")
171 |             for keyword in data.keys():
172 |                 day_content = data[keyword]
173 |                 if not day_content:
174 |                     continue
175 |                 kw = keyword.replace(' ','-')
176 |                 f.write(f"    <li><a href=#{kw}>{keyword}</a></li>\n")
177 |             f.write("  </ol>\n")
178 |             f.write("</details>\n\n")