├── .gitignore
├── LICENSE
├── README.md
├── arxivpy
│   ├── __init__.py
│   └── arxiv.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### OSX ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Titipat Achakulvisut

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Arxivpy

[![License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://github.com/titipata/arxivpy/blob/master/LICENSE)

Python wrapper for the [arXiv API](http://arxiv.org/help/api/index).
Here are some related libraries and repositories: [arxiv.py](https://github.com/lukasschwab/arxiv.py),
[python_arXiv_parsing_example.py](https://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt)
and [arxiv-sanity-preserver](https://github.com/karpathy/arxiv-sanity-preserver).
[arXiv](http://arxiv.org/) is an open-access repository hosting 1M+ e-prints in
Physics, Mathematics, Computer Science, Quantitative Biology,
Quantitative Finance and Statistics.

## Example

Here is an example of how to use `arxivpy`.

```python
import arxivpy
articles = arxivpy.query(search_query=['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'],
                         start_index=0, max_index=200, results_per_iteration=100,
                         wait_time=5.0, sort_by='lastUpdatedDate') # grab 200 articles
```

The input `search_query` can be a list of [categories](https://github.com/titipata/arxivpy/wiki)
or a string in arXiv query format. The output is a list of dictionaries parsed from the arXiv XML response.
This example fetches the 200 most recently updated papers (index 0 to 200), 100 at a time, waiting
around 5 seconds between requests (see the **note** below if scraping many papers).

### Queries

You can use other search queries, for example:

```python
search_query=['cs.DB', 'cs.IR']
search_query='cs.DB' # select only Databases papers
search_query='au:kording' # author name includes Kording
search_query='ti:deep+AND+ti:learning' # title with `deep` and `learning`
search_query='abs:%22deep+learning%22' # deep learning as a phrase
```

Or you can build a simple search query using `arxivpy.generate_query`:

```python
search_query = arxivpy.generate_query(terms=['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'],
                                      prefix='category', boolean='OR')
```

Or convert plain text to an arXiv query using `arxivpy.generate_query_from_text`:

```python
query = arxivpy.generate_query_from_text("author k kording & author achakulvisut & title science & abstract recommendation") # awesome paper
articles = arxivpy.query(search_query=query)
```

More available search query prefixes, booleans and categories are listed on the
[wiki page](https://github.com/titipata/arxivpy/wiki). More example queries
can be found in the [arXiv user manual](http://arxiv.org/help/api/user-manual).

### Download PDF

You can also use `arxivpy.download` to download the articles to a given directory.
Here is a snippet to do that.

```python
arxivpy.download(articles, path='arxiv_pdf')
```

**Note from the API**

- The maximum number of results returned from a single call (`max_index`)
  is limited to 30000, in slices of at most 2000 at a time (see the paging sketch below).
- In cases where the API needs to be called multiple times in a row,
  we encourage you to play nice and incorporate a 3-second delay in your code.

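For larger harvests, a minimal paging sketch looks like the following. The category, slice size and total count here are illustrative values chosen to respect the limits quoted above, not defaults of the library.

```python
import arxivpy

# Fetch 6000 machine-learning e-prints in slices of 2000, pausing between calls.
# `query` issues one request per slice and sleeps wait_time plus a small random
# extra delay between requests.
articles = arxivpy.query(search_query='cat:stat.ML',
                         start_index=0,
                         max_index=6000,              # example total, well below the 30000 cap
                         results_per_iteration=2000,  # at most 2000 per request
                         wait_time=5.0,
                         sort_by='submittedDate',
                         sort_order='descending')
print('%d articles fetched' % len(articles))
```
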
## Installation

The easiest way is to use `pip`:

```bash
pip install git+https://github.com/titipata/arxivpy
```

You can also install manually by cloning the repository and running `setup.py`:

```bash
git clone https://github.com/titipata/arxivpy
cd arxivpy
python setup.py install
```

## Dependencies

- [feedparser](https://github.com/kurtmckee/feedparser)
- [dateutil](https://github.com/dateutil/dateutil)
--------------------------------------------------------------------------------
/arxivpy/__init__.py:
--------------------------------------------------------------------------------
from .arxiv import query, generate_query, generate_query_from_text, download
--------------------------------------------------------------------------------
/arxivpy/arxiv.py:
--------------------------------------------------------------------------------
import os
import sys
import re
import time
import urllib
import feedparser
import random
from dateutil import parser

if sys.version_info[0] == 3:
    from urllib.request import urlretrieve, urlopen
else:
    from urllib import urlretrieve, urlopen

categories = ['cs.', 'stat.', 'q-bio.', 'nlin.', 'math.',
              'astro-ph', 'cond-mat.', 'gr-qc', 'hep-ex',
              'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nucl-ex',
              'nucl-th', 'physics.', 'quant-ph']


def query(search_query=['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'],
          start_index=0,
          max_index=100,
          results_per_iteration=100,
          wait_time=5.0,
          sort_by='lastUpdatedDate',
          sort_order=None,
          verbose=False):
    """
    Query the arXiv API and parse the returned XML.
    See http://arxiv.org/help/api/index for more information.
    The function returns the basic information contained in the XML response.

    Parameters
    ==========
    search_query: list or str, list of categories or plain search query string
        see the end of this page http://arxiv.org/help/api/user-manual#python_simple_example
        for more publication categories see https://github.com/titipata/arxivpy/wiki

        example:
            search_query=['cs.DB', 'cs.IR']
            search_query='cs.DB' # the 'cat:' prefix is not needed for a bare category
            search_query='au:kording'
            search_query='au:kording+AND+ti:science'
            search_query='au:Kording_K'
        Search query prefixes include ti (title), au (author), abs (abstract) and more.
        See the repository wiki page for more information, including search query booleans.

    start_index: int, start index

    max_index: int, end or max index

    results_per_iteration: int, number of articles to parse per iteration;
        this controls how many articles are requested at once

    wait_time: float, waiting time between iterations when scraping more than
        results_per_iteration articles; the code waits wait_time + uniform(0, 3) seconds

    sort_by: str, either 'relevance', 'lastUpdatedDate', 'submittedDate' or None

    sort_order: str, either 'ascending', 'descending' or None

    Returns
    =======
    articles_all: list of dictionaries, each containing the following keys
        id: url.split('/abs/')[-1]
        term: category terms
        main_author: main author of the article
        authors: list of authors separated by commas
        url: url of the article
        pdf_url: pdf url of the article
        title: title of the article
        abstract: abstract of the article
        publish_date: publish date in datetime format
        comment: comment of the article if available
        journal_ref: reference to the journal if it exists
    """

    base_url = 'http://export.arxiv.org/api/query?'

    if isinstance(search_query, list):
        # assume a list of categories is given
        search_query = '+OR+'.join(['cat:%s' % c for c in search_query])
    elif isinstance(search_query, str) and any([c for c in categories if c in search_query]) and ('cat:' not in search_query):
        search_query = 'cat:%s' % search_query
    else:
        search_query = search_query
    search_query_string = 'search_query=%s' % search_query

    if results_per_iteration is None or results_per_iteration > (max_index - start_index):
        results_per_iteration = max_index - start_index

    if sort_by is not None:
        sort_by_query = 'sortBy=%s' % sort_by
    else:
        sort_by_query = ''

    if sort_order is not None:
        sort_order_query = 'sortOrder=%s' % sort_order
    else:
        sort_order_query = ''

    articles_all = list()
    for i in range(start_index, max_index, results_per_iteration):
        start_query = 'start=%i' % int(i)
        max_results_query = 'max_results=%i' % int(results_per_iteration)

        ql = [search_query_string, sort_by_query, sort_order_query, start_query, max_results_query]
        query_list = [q for q in ql if q != '']
        query = '&'.join(query_list)

        if verbose:
            print('start index = %i, end index = %i' % (int(i), int(i + results_per_iteration)))
            print('arXiv query: \n %s' % (base_url + query))

        articles = list()
        response = urlopen(base_url + query).read()
        entries = feedparser.parse(response)
        for entry in entries['entries']:
            if entry['title'] == 'Error':
                print('Error %s' % entry['summary'])
                print('Check whether the query %s returns anything on the website' % (base_url + query))
            main_term = entry['arxiv_primary_category']['term']
            terms = '|'.join([tag['term'] for tag in entry['tags']])
            main_author = entry['author']
            update_date = parser.parse(entry['updated'])
            authors = ', '.join([author['name'].strip() for author in entry['authors']])
            url = entry['link']
            # fall back to the canonical pdf location, override it if a pdf link is listed
            pdf_url = 'http://arxiv.org/pdf/%s' % url.split('/abs/')[-1]
            for e in entry['links']:
                if 'title' in e.keys() and e['title'] == 'pdf':
                    pdf_url = e['href']
            if 'arxiv_comment' in entry.keys():
                comment = entry['arxiv_comment']
            else:
                comment = 'No comment found'
            if 'journal_ref' in entry.keys():
                journal_ref = entry['journal_ref']
            else:
                journal_ref = 'No journal ref found'

            title = entry['title_detail']['value'].replace('\n', ' ').strip()
            abstract = entry['summary'].replace('\n', ' ')
            publish_date = parser.parse(entry['published'])
            article = {'id': url.split('/abs/')[-1],
                       'term': main_term,
                       'terms': terms,
                       'main_author': main_author,
                       'authors': authors,
                       'url': url,
                       'pdf_url': pdf_url,
                       'title': title,
                       'abstract': abstract,
                       'update_date': update_date,
                       'publish_date': publish_date,
                       'comment': comment,
                       'journal_ref': journal_ref}
            articles.append(article)
        if i > start_index:
            time.sleep(wait_time + random.uniform(0, 3))
        articles_all.extend(articles)
    return articles_all


def generate_query(terms, prefix='category', boolean='OR', group_bool=False):
    """
    Generate a simple arXiv query from a given list of terms.

    example:
        >> title = arxivpy.generate_query(['neural network', 'deep learning'], prefix='title', boolean='AND')
        >> cat = arxivpy.generate_query(['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'], prefix='category', boolean='OR', group_bool=True)
        >> search_query = title + '+AND+' + cat
        >> articles = arxivpy.query(search_query=search_query)

        >> search_query = arxivpy.generate_query(['k kording', 't achakulvisut'], prefix='author', boolean='AND')
        >> articles = arxivpy.query(search_query=search_query)

    Parameters
    ==========
    terms: list, list of terms related to the specified prefix

    prefix: string, prefix of the query, one of
        'title', 'abstract', 'author' or 'category'

    boolean: string, boolean operator between terms,
        one of 'OR', 'AND' or 'ANDNOT'

    group_bool: boolean, default False
        if True, the returned query is wrapped in URL-encoded parentheses %28 ... %29
        if False, a plain query is returned

    Returns
    =======
    query: string, output query for the given prefix

    """
    if isinstance(terms, str):
        terms = [terms]  # convert to list
    if boolean not in ['OR', 'AND', 'ANDNOT']:
        print("Boolean should only be one of OR, AND or ANDNOT")
    if prefix not in ['title', 'abstract', 'author', 'category']:
        print("Prefix should only be one of 'title', 'abstract', 'author' or 'category'")
    boolean_str = '+%s+' % boolean

    if prefix in ('title', 'abstract'):
        terms_ = []
        for term in terms:
            if ' ' in term:
                phrase = "%%22%s%%22" % '+'.join(term.split(' '))
                terms_.append(phrase)
            else:
                terms_.append(term)
        if prefix == 'abstract':
            query = boolean_str.join(['abs:%s' % t for t in terms_])
        elif prefix == 'title':
            query = boolean_str.join(['ti:%s' % t for t in terms_])

    elif prefix == 'author':
        terms_ = []
        for term in terms:
            if ' ' in term:
                terms_.append('_'.join(term.split(' ')[::-1]))
            else:
                terms_.append(term)
        query = boolean_str.join('au:%s' % t for t in terms_)

    elif prefix == 'category':
        query = boolean_str.join(['cat:%s' % t for t in terms])

    else:
        query = ''  # return an empty query if the prefix is not recognized

    if group_bool:
        query = '%28' + query + '%29'
    return query


def generate_query_from_text(query_text):
    """
    Generate an arXiv query from a plain, intuitive query string.
    Each part of the string starts with 'title', 'abstract',
    'cat' or 'author' followed by the query terms. For categories,
    specify all categories separated by '|' e.g.
    "cat stat.ML|cs.CV"

    Parts are separated by either & (AND) or &! (ANDNOT).
    (work in progress)


    Parameters
    ==========
    query_text: str, query text as a plain string
        example strings:
        >> "author konrad kording & title neural nets & cat stat.ML|cs.CV"
        >> "author kording & author achakulvisut"

    Returns
    =======
    query_arxiv: str, query in arXiv format
    """
    keys = ['title ', 'abstract ', 'author ', 'cat ']
    q_out_list = list()
    queries = re.split('&!|&', query_text)
    for query in queries:
        for k in keys:
            if k in query:
                k = k.strip()
                q = query.replace(k, '').strip()
                if k == 'author':
                    if ' ' in q:
                        q_out = '_'.join(q.split(' ')[::-1])
                    else:
                        q_out = q
                    q_out_list.append('au:' + q_out)
                elif k in ('title', 'abstract'):
                    if ' ' in q:
                        q_out = '%%22%s%%22' % '+'.join(q.split(' '))
                    else:
                        q_out = q
                    if k == 'title':
                        q_out_list.append('ti:' + q_out)
                    elif k == 'abstract':
                        q_out_list.append('abs:' + q_out)
                elif k == 'cat':
                    cs = q.split('|')
                    q_out = '+OR+'.join(['cat:%s' % c for c in cs])
                    if len(cs) > 1:
                        q_out = '%28' + q_out + '%29'
                    q_out_list.append(q_out)

    seperators = list()
    for sep in re.findall("&!|&", query_text):
        if sep == '&':
            seperators.append('+AND+')
        elif sep == '&!':
            seperators.append('+ANDNOT+')
        elif sep == '|':
            seperators.append('+OR+')
        else:
            seperators.append('+AND+')

    for i, j in zip(range(1, 2*len(q_out_list)+1, 2), range(len(seperators))):
        q_out_list.insert(i, seperators[j])

    query_arxiv = ''.join(q_out_list)
    return query_arxiv


def download(articles, path='arxiv_pdf'):
    """
    Given a list of parsed arXiv article dictionaries, download the PDFs to the given path.
    Each file is saved as '<article id>.pdf'.

    Parameters
    ==========
    articles: list, list of dictionaries parsed from the arXiv API

    path: str, path or directory to save the PDFs to
    """
    if not os.path.isdir(path):
        os.mkdir(path)
    if len(articles) >= 1:
        for article in articles:
            if article['pdf_url']:
                try:
                    filename = article['id'] + '.pdf'
                    urlretrieve(article['pdf_url'], os.path.join(path, filename))
                except:
                    print('Error downloading: %s' % filename)
            else:
                print("No pdf available for arXiv article %s" % article['id'])
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
feedparser
python-dateutil
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
from setuptools import setup

descr = '''Python wrapper for Arxiv API'''

if __name__ == "__main__":
    setup(
        name='arxivpy',
        version='0.1.dev',
        description='Python wrapper for Arxiv API',
        long_description=open('README.md').read(),
        url='https://github.com/titipata/arxivpy',
        author='Titipat Achakulvisut',
        author_email='titipata@u.northwestern.edu',
        license='MIT',
        keywords='arxiv,xml,pdf',
        install_requires=['feedparser', 'python-dateutil'],
        packages=['arxivpy'],
    )
--------------------------------------------------------------------------------
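For reference, a minimal end-to-end sketch of the workflow the files above implement, assuming the package is installed as described in the README. The categories, counts and output directory are illustrative choices, not defaults.

```python
import arxivpy

# Illustrative run: fetch recent machine-learning e-prints and save their PDFs.
articles = arxivpy.query(search_query=['cs.LG', 'stat.ML'],
                         start_index=0, max_index=100,
                         results_per_iteration=100,
                         sort_by='submittedDate', sort_order='descending')

# Peek at the first few results before downloading.
for article in articles[:5]:
    print(article['id'], '-', article['title'])

arxivpy.download(articles, path='arxiv_pdf')
```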