├── keywords.txt
├── get_pubs.sh
├── get_pubs_multiple.sh
├── README.md
├── utils.py
└── parse.py

/keywords.txt:
--------------------------------------------------------------------------------
# Keywords should be separated by newlines. To make the search for a keyword case sensitive, append " *c*" at the end of its line.

Case Sensitive *c*
not case sensitive
example
another example
--------------------------------------------------------------------------------

/get_pubs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# filter yesterday's publications

set -e

yesterday=$(date -d "yesterday 13:00" '+%Y-%m-%d')
kwfile='keywords.txt'

python parse.py "$yesterday" "$kwfile"
(okular "./summaries/${yesterday}.md") &
--------------------------------------------------------------------------------

/get_pubs_multiple.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# filter multiple dates at once
# the first argument is the start date
# the second argument is the end date; if empty, the current date is used

set -e

kwfile='keywords.txt'

from=$(date -d "${1}" '+%Y-%m-%d')
to=$(date -d "${2}" '+%Y-%m-%d')

datum=$(date -d "$from-1 days" '+%Y-%m-%d')

while true; do
    datum=$(date -d "$datum+1 days" '+%Y-%m-%d')
    python parse.py "$datum" "$kwfile"

    (okular "./summaries/${datum}.md") &

    if [[ "$datum" == "$to" ]]; then
        break
    fi
done
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# arxiv_parser

Basic tool that uses the arXiv API to filter the latest cs.CV publications by keyword.

### Requirements

Make sure to use feedparser version 5.x, e.g. by running
```
conda install -c anaconda feedparser
```
The code uses `okular` for displaying the `.md` output files,
but you can view them with the viewer or text editor of your choice, e.g. `vim`.
If you do not use `okular`, simply comment out the corresponding lines in `get_pubs.sh` and `get_pubs_multiple.sh`.

### Usage

Start by entering your custom keywords in `keywords.txt`.

Then you can filter yesterday's publications by running
```
./get_pubs.sh
```
This creates a folder `summaries` and a file `./summaries/YYYY-MM-DD.md` with the summary of
yesterday's filtered publications.

For processing multiple dates you can run
```
./get_pubs_multiple.sh FROM TO
```
where `FROM` is the earliest date in the format `YYYY-MM-DD` and `TO` (same format) is the latest date.
`TO` is optional and defaults to the current date.
Note that values of `FROM` more than a few days in the past currently make the code very slow.
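
You can also invoke the parser for a single date directly (this is what both shell scripts do under the hood); the date below is only a placeholder:
```
python parse.py 2024-01-15 keywords.txt
```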
--------------------------------------------------------------------------------

/utils.py:
--------------------------------------------------------------------------------
from datetime import datetime
import re


def get_date(entry):
    # publication date of a feed entry, truncated to midnight
    y, m, d = entry.published.split('T')[0].split('-')
    return datetime(int(y), int(m), int(d), 0, 0, 0, 0)


def entries_on(sorted_entries, date):
    # slice out the entries published exactly on `date`
    # (entries are expected to be sorted by date, newest first)
    date = date.replace(minute=0, hour=0, second=0, microsecond=0)
    first, last = None, None
    for i, entry in enumerate(sorted_entries):
        entry_date = get_date(entry)
        if entry_date == date and first is None:
            first = i
        elif entry_date < date:
            last = i
            break

    if first is None:  # no entries published on the requested date
        return []
    return sorted_entries[first:last]


def contains(string, keywords, flaglist=None):
    # return the keywords whose regex matches `string`;
    # `flaglist` holds the optional re flags for each keyword
    if isinstance(keywords, str):
        keywords = (keywords,)

    if flaglist is None:
        flaglist = [()] * len(keywords)

    positives = filter(lambda args: bool(re.search(args[0], string, *args[1])), zip(keywords, flaglist))

    return list(map(lambda x: x[0], positives))


def clean_whitespaces(string):
    # collapse consecutive whitespace (including newlines) into single spaces
    return ' '.join(string.split())


def write_in_color(string, color):
    # wrap the string in an inline HTML span so markdown viewers render it in color
    return '<span style="color:%s"> %s </span>' % (color, string)
--------------------------------------------------------------------------------
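
A quick sketch of how the helpers above fit together (the abstract text, keyword list, and flags here are made up purely for illustration):
```
import re
from utils import contains, clean_whitespaces

abstract = "We propose a  novel   Transformer for dense prediction."
keywords = ['transformer', 'GAN']
flaglist = [[re.IGNORECASE], []]  # first keyword is case insensitive, second is case sensitive

print(clean_whitespaces(abstract))             # collapses the repeated spaces
print(contains(abstract, keywords, flaglist))  # -> ['transformer']
```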
/parse.py:
--------------------------------------------------------------------------------
import argparse
import urllib.request
import feedparser
import datetime
import re
import os

from utils import get_date, entries_on, contains, clean_whitespaces, write_in_color


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser(
        description='Parse cs.CV entries from arxiv.'
    )
    parser.add_argument('date', type=str, help='date from which articles are parsed, format: yyyy-mm-dd')
    parser.add_argument('keyword_file', type=str,
                        help='.txt file containing the keywords')

    args = parser.parse_args()

    # configs
    output_dir = 'summaries'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, '%s.md' % args.date)
    log_file = os.path.join(output_dir, '.log_%s.txt' % args.date)
    with open(output_file, 'w') as f:  # create empty file and add header
        f.write(write_in_color('Filtered arxiv publications from %s' % args.date, 'gray') + '\n\n\n\n')

    # read keywords from file
    keywords = []
    flaglist = []
    with open(args.keyword_file, "r") as f:
        for line in f:
            if line.startswith('#') or line == '\n':  # ignore comments and empty lines
                continue

            flag = []
            if line.endswith('*c*\n'):  # keyword marked as case sensitive
                line = line[:-5]
            else:
                flag.append(re.IGNORECASE)

            keywords.append(line.strip())
            flaglist.append(flag)

    # save used keywords to log file
    with open(log_file, 'w') as f:
        f.write(', '.join(keywords))

    # date for which to get publications
    date = [int(n) for n in args.date.split('-')]
    date = datetime.datetime(*date, minute=0, hour=0, second=0, microsecond=0)

    # Base api query url
    base_url = 'http://export.arxiv.org/api/query?'

    # Search parameters
    search_query = 'cat:cs.CV'  # cs.CV mailing list
    start = 0  # index of the first result to retrieve
    max_results = 100  # grows until the feed reaches back to the requested date
    # TODO: adjust the start value to enable starting from far in the past

    print('Search for papers from %s...' % str(date))
    while True:
        query = 'search_query=%s&' \
                'start=%i&' \
                'max_results=%i&' \
                'sortBy=submittedDate&' \
                'sortOrder=descending' % (search_query, start, max_results)

        # Opensearch metadata such as totalResults, startIndex,
        # and itemsPerPage live in the opensearch namespace.
        # Some entry metadata lives in the arXiv namespace.
        # This is a hack to expose both of these namespaces in feedparser.
        feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
        feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

        # perform a GET request using the base_url and query
        response = urllib.request.urlopen(base_url + query).read()

        # parse the response using feedparser
        feed = feedparser.parse(response)

        # stop enlarging the query once the oldest returned entry predates the requested date
        if get_date(feed.entries[-1]) < date:
            break

        max_results += 100

    feed.entries = entries_on(feed.entries, date)

    print('Found %d papers.' % len(feed.entries))

    # Run through each entry and filter for keywords
    for entry in feed.entries:
        abstract, title = entry.summary, entry.title

        matches = set(contains(title, keywords, flaglist) + contains(abstract, keywords, flaglist))
        if len(matches) == 0:
            continue

        authors = ', '.join(author.name for author in entry.authors)

        with open(output_file, 'a') as f:
            f.write('## %s \n\n' % clean_whitespaces(title))
            f.write('%s \n\n' % authors)
            f.write('### ' + write_in_color('Keywords: ' + ', '.join(matches), 'gray') + '\n\n')
            f.write(abstract + '\n\n')

        # get the links to the abs page and pdf for this e-print
        for link in entry.links:
            if link.rel == 'alternate':
                with open(output_file, 'a') as f:
                    f.write('abs page link: %s' % link.href + '\n\n')
            elif link.title == 'pdf':
                with open(output_file, 'a') as f:
                    f.write('pdf link: %s' % link.href + '\n\n')
--------------------------------------------------------------------------------
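
For reference, the first request that `parse.py` issues expands to a URL of the following form (parameter values taken from the defaults above; `max_results` grows by 100 on each further iteration):
```
http://export.arxiv.org/api/query?search_query=cat:cs.CV&start=0&max_results=100&sortBy=submittedDate&sortOrder=descending
```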