├── keywords.txt
├── get_pubs.sh
├── get_pubs_multiple.sh
├── README.md
├── utils.py
└── parse.py

/keywords.txt:
--------------------------------------------------------------------------------
# Keywords should be separated by newlines. To make the search for a keyword case sensitive, append " *c*" at the end of its line.

Case Sensitive *c*
not case sensitive
example
another example
--------------------------------------------------------------------------------

/get_pubs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# filter yesterday's publications

set -e

yesterday=$(date -d "yesterday 13:00" '+%Y-%m-%d')
kwfile='keywords.txt'

python parse.py "$yesterday" "$kwfile"
(okular "./summaries/${yesterday}.md") &
--------------------------------------------------------------------------------

/get_pubs_multiple.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# filter multiple dates at once
# the first argument is the start date
# the second argument is the end date; if empty, the current date is used

set -e

kwfile='keywords.txt'

from=$(date -d "${1}" '+%Y-%m-%d')
to=$(date -d "${2}" '+%Y-%m-%d')

datum=$(date -d "$from-1 days" '+%Y-%m-%d')

while true; do
    datum=$(date -d "$datum+1 days" '+%Y-%m-%d')
    python parse.py "$datum" "$kwfile"

    (okular "./summaries/${datum}.md") &

    if [[ "$datum" == "$to" ]]; then
        break
    fi
done
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# arxiv_parser

Basic tool that uses the arXiv API to filter the latest cs.CV publications by keyword.

### Requirements

Make sure to use feedparser version 5.x, e.g. by running
```
conda install -c anaconda feedparser
```
The code uses `okular` for displaying the `.md` output files,
but you can view them with the viewer or text editor of your choice, e.g. `vim`.
If you do not use `okular`, simply comment out the corresponding lines in `get_pubs.sh` and `get_pubs_multiple.sh`.

### Usage

Start by entering your custom keywords in `keywords.txt`.

Then you can filter yesterday's publications by running
```
./get_pubs.sh
```
This creates a folder `summaries` and a file `./summaries/YYYY-MM-DD.md` with the summary of
yesterday's filtered publications.

For processing multiple dates you can run
```
./get_pubs_multiple.sh FROM TO
```
where `FROM` is the earliest date in the format `YYYY-MM-DD` and `TO` (same format) is the latest date.
`TO` is optional and defaults to the current date.
Note that values of `FROM` more than a few days in the past currently make the code very slow.
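
You can also invoke the parser for a single date directly (this is what both shell scripts do under the hood); the date below is only a placeholder:
```
python parse.py 2024-01-15 keywords.txt
```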
--------------------------------------------------------------------------------

/utils.py:
--------------------------------------------------------------------------------
from datetime import datetime
import re


def get_date(entry):
    # publication date of a feed entry, truncated to midnight
    y, m, d = entry.published.split('T')[0].split('-')
    return datetime(int(y), int(m), int(d), 0, 0, 0, 0)


def entries_on(sorted_entries, date):
    # slice out the entries published exactly on `date`
    # (entries are expected to be sorted by date, newest first)
    date = date.replace(minute=0, hour=0, second=0, microsecond=0)
    first, last = None, None
    for i, entry in enumerate(sorted_entries):
        entry_date = get_date(entry)
        if entry_date == date and first is None:
            first = i
        elif entry_date < date:
            last = i
            break

    if first is None:  # no entries published on the requested date
        return []
    return sorted_entries[first:last]


def contains(string, keywords, flaglist=None):
    # return the keywords whose regex matches `string`;
    # `flaglist` holds the optional re flags for each keyword
    if isinstance(keywords, str):
        keywords = (keywords,)

    if flaglist is None:
        flaglist = [()] * len(keywords)

    positives = filter(lambda args: bool(re.search(args[0], string, *args[1])), zip(keywords, flaglist))

    return list(map(lambda x: x[0], positives))


def clean_whitespaces(string):
    # collapse consecutive whitespace (including newlines) into single spaces
    return ' '.join(string.split())


def write_in_color(string, color):
    # wrap the string in an inline HTML span so markdown viewers render it in color
    return '<span style="color:%s"> %s </span>' % (color, string)
--------------------------------------------------------------------------------
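
A quick sketch of how the helpers above fit together (the abstract text, keyword list, and flags here are made up purely for illustration):
```
import re
from utils import contains, clean_whitespaces

abstract = "We propose a  novel   Transformer for dense prediction."
keywords = ['transformer', 'GAN']
flaglist = [[re.IGNORECASE], []]  # first keyword is case insensitive, second is case sensitive

print(clean_whitespaces(abstract))             # collapses the repeated spaces
print(contains(abstract, keywords, flaglist))  # -> ['transformer']
```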
/parse.py:
--------------------------------------------------------------------------------
import argparse
import urllib.request
import feedparser
import datetime
import re
import os

from utils import get_date, entries_on, contains, clean_whitespaces, write_in_color


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser(
        description='Parse cs.CV entries from arxiv.'
    )
    parser.add_argument('date', type=str, help='date from which articles are parsed, format: yyyy-mm-dd')
    parser.add_argument('keyword_file', type=str,
                        help='.txt file containing the keywords')

    args = parser.parse_args()

    # configs
    output_dir = 'summaries'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, '%s.md' % args.date)
    log_file = os.path.join(output_dir, '.log_%s.txt' % args.date)
    with open(output_file, 'w') as f:  # create empty file and add header
        f.write(write_in_color('Filtered arxiv publications from %s' % args.date, 'gray') + '\n\n\n\n')

    # read keywords from file
    keywords = []
    flaglist = []
    with open(args.keyword_file, "r") as f:
        for line in f:
            if line.startswith('#') or line == '\n':  # ignore comments and empty lines
                continue

            flag = []
            if line.endswith('*c*\n'):  # keyword marked as case sensitive
                line = line[:-5]
            else:
                flag.append(re.IGNORECASE)

            keywords.append(line.strip())
            flaglist.append(flag)

    # save used keywords to log file
    with open(log_file, 'w') as f:
        f.write(', '.join(keywords))

    # date for which to get publications
    date = [int(n) for n in args.date.split('-')]
    date = datetime.datetime(*date, minute=0, hour=0, second=0, microsecond=0)

    # Base api query url
    base_url = 'http://export.arxiv.org/api/query?'

    # Search parameters
    search_query = 'cat:cs.CV'  # cs.CV mailing list
    start = 0  # index of the first result to retrieve
    max_results = 100  # grows until the feed reaches back to the requested date
    # TODO: adjust the start value to enable starting from far in the past

    print('Search for papers from %s...' % str(date))
    while True:
        query = 'search_query=%s&' \
                'start=%i&' \
                'max_results=%i&' \
                'sortBy=submittedDate&' \
                'sortOrder=descending' % (search_query, start, max_results)

        # Opensearch metadata such as totalResults, startIndex,
        # and itemsPerPage live in the opensearch namespace.
        # Some entry metadata lives in the arXiv namespace.
        # This is a hack to expose both of these namespaces in feedparser.
        feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
        feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

        # perform a GET request using the base_url and query
        response = urllib.request.urlopen(base_url + query).read()

        # parse the response using feedparser
        feed = feedparser.parse(response)

        # stop enlarging the query once the oldest returned entry predates the requested date
        if get_date(feed.entries[-1]) < date:
            break

        max_results += 100

    feed.entries = entries_on(feed.entries, date)

    print('Found %d papers.' % len(feed.entries))

    # Run through each entry and filter for keywords
    for entry in feed.entries:
        abstract, title = entry.summary, entry.title

        matches = set(contains(title, keywords, flaglist) + contains(abstract, keywords, flaglist))
        if len(matches) == 0:
            continue

        authors = ', '.join(author.name for author in entry.authors)

        with open(output_file, 'a') as f:
            f.write('## %s \n\n' % clean_whitespaces(title))
            f.write('%s \n\n' % authors)
            f.write('### ' + write_in_color('Keywords: ' + ', '.join(matches), 'gray') + '\n\n')
            f.write(abstract + '\n\n')

        # get the links to the abs page and pdf for this e-print
        for link in entry.links:
            if link.rel == 'alternate':
                with open(output_file, 'a') as f:
                    f.write('abs page link: %s' % link.href + '\n\n')
            elif link.title == 'pdf':
                with open(output_file, 'a') as f:
                    f.write('pdf link: %s' % link.href + '\n\n')
--------------------------------------------------------------------------------
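
For reference, the first request that `parse.py` issues expands to a URL of the following form (parameter values taken from the defaults above; `max_results` grows by 100 on each further iteration):
```
http://export.arxiv.org/api/query?search_query=cat:cs.CV&start=0&max_results=100&sortBy=submittedDate&sortOrder=descending
```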