├── README.md
├── requirements.txt
├── setup.py
└── sotawhat
    ├── __init__.py
    └── sotawhat.py

/README.md:
--------------------------------------------------------------------------------
# sotawhat

[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)

Read more about SOTAWHAT [here](https://huyenchip.com/2018/10/04/sotawhat.html).

You can use sotawhat through a web interface [here](https://sotawhat.herokuapp.com/#/). Thanks hmchuong!

This script runs on Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow these steps:

Step 1: clone this repo and go inside it:
```bash
$ git clone [HTTPS or SSH link to this repo]
$ cd sotawhat
```

Step 2: install using pip

```bash
$ pip3 install .
```

On Windows, the script may run into encoding errors on the command line. It is recommended to run `pip install win-unicode-console --upgrade` before launching the script. If you get a `UnicodeEncodeError`, you *must* install the package above.

On macOS, you may get the following SSL error:

```
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed>
```

This can be fixed by reinstalling the certificates:
```shell
$ /Applications/Python\ 3.x/Install\ Certificates.command
```
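If the error persists after that, you can also trigger the download yourself; this one-liner uses ``nltk``'s standard download function, the same call the script makes on first run:

```bash
$ python3 -c "import nltk; nltk.download('punkt')"
```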
# Usage
This project adds the `sotawhat` script for you to run globally from the terminal or command line.

To query for a certain keyword, run:

```bash
$ sotawhat [keyword] [number of results]
```

For example:

```bash
$ sotawhat perplexity 10
```

or

```bash
$ sotawhat language model 10
```

If you don't specify the number of results, the script returns 5 results by default. Each result contains the title of the paper with the first author and publication date, a summary of the abstract, and a link to the paper.

We've found that this script works well with keywords that are:
+ a model (e.g. transformer, wavenet, ...)
+ a dataset (e.g. wikitext, imagenet, ...)
+ a task (e.g. language model, machine translation, fuzzing, ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk
six
pyspellchecker
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import sotawhat

setup(
    name='sotawhat',
    version=str(sotawhat.__VERSION__),
    packages=find_packages(),
    description='arXiv query script',
    long_description=str('SOTAwhat is a script to query arXiv for the latest '
                         'abstracts and extract summaries from them.'),
    url='https://huyenchip.com/2018/10/04/sotawhat.html',
    license='MIT',
    install_requires=['six', 'nltk', 'pyspellchecker'],
    entry_points={
        'console_scripts': ['sotawhat=sotawhat.sotawhat:main'],
    }
)
--------------------------------------------------------------------------------
/sotawhat/__init__.py:
--------------------------------------------------------------------------------
__VERSION__ = '0.0.1'
--------------------------------------------------------------------------------
/sotawhat/sotawhat.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import urllib.error
import urllib.request
import warnings

import nltk
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser
from spellchecker import SpellChecker

# Download the punkt tokenizer on first run if it is not already available.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

h = HTMLParser()

# HTML markers used to locate fields in the arXiv search results page.
AUTHOR_TAG = '<a class="descriptor">Authors:</a>'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'


def get_authors(lines, i):
    authors = []
    while True:
        if not lines[i].startswith('<a'):  # past the last author link
            break
        idx = lines[i].find('>')
        if lines[i].endswith(','):
            authors.append(lines[i][idx + 1: -5])  # strip the trailing '</a>,'
        else:
            authors.append(lines[i][idx + 1: -4])  # strip the trailing '</a>'
        i += 1
    return authors, i


def get_next_result(lines, start):
    """
    Extract a paper from the HTML lines obtained from arXiv search.

    Each paper is a dict that contains:
    + 'title': str
    + 'main_page': str
    + 'pdf': str
    + 'authors': list of str
    + 'abstract': str
    + 'date': str
    """

    result = {}
    idx = lines[start + 3][10:].find('"')
    result['main_page'] = lines[start + 3][9:10 + idx]
    idx = lines[start + 4][23:].find('"')
    result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'

    start += 4

    while lines[start].strip() != TITLE_TAG:
        start += 1

    title = lines[start + 1].strip()
    title = title.replace('<span class="search-hit mathjax">', '')
    title = title.replace('</span>', '')
    result['title'] = title

    authors, start = get_authors(lines, start + 5)  # orig: add 8

    while not lines[start].strip().startswith(ABSTRACT_TAG):
        start += 1
    abstract = lines[start + 1]
    abstract = abstract.replace('<span class="search-hit mathjax">', '')
    abstract = abstract.replace('</span>', '')
    result['abstract'] = abstract

    result['authors'] = authors

    while not lines[start].strip().startswith(DATE_TAG):
        start += 1

    idx = lines[start].find('</span>')
    end = lines[start][idx:].find(';')

    result['date'] = lines[start][idx + 8: idx + end]

    return result, start
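# Illustrative shape of a parsed result (hypothetical values, not real output):
# {
#     'main_page': 'https://arxiv.org/abs/1810.00001',
#     'pdf': 'https://arxiv.org/pdf/1810.00001.pdf',
#     'title': 'Some Paper Title',
#     'authors': ['First Author', 'Second Author'],
#     'abstract': 'The full abstract text ...',
#     'date': '1 October, 2018',
# }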
def clean_empty_lines(lines):
    cleaned = []
    for line in lines:
        line = line.strip()
        if line:
            cleaned.append(line)
    return cleaned


def is_float(token):
    return re.match(r"^\d+?\.\d+?$", token) is not None


def is_citation_year(tokens, i):
    """Return True if tokens[i] is a 4-digit year inside a citation, e.g. '(2018)'."""
    if len(tokens[i]) != 4:
        return False
    if re.match(r'[12][0-9]{3}', tokens[i]) is None:
        return False
    if i == 0 or i == len(tokens) - 1:
        return False
    if (tokens[i - 1] == ',' or tokens[i - 1] == '(') and tokens[i + 1] == ')':
        return True
    return False


def is_list_number(tokens, i, value):
    """Return True if tokens[i] is a small list marker such as the '1' in '1)'."""
    if value < 1 or value > 4:
        return False
    if i == len(tokens) - 1:
        return False

    if (i == 0 or tokens[i - 1] in set(['(', '.', ':'])) and tokens[i + 1] == ')':
        return True
    return False


def has_number(sent):
    """Return True if the sentence contains a number that looks like a result."""
    tokens = word_tokenize(sent)
    for i, token in enumerate(tokens):
        if token.endswith('\\'):
            token = token[:-2]
        if token.endswith('x'):  # sometimes people write numbers as 1.7x
            token = token[:-1]
        if token.startswith('x'):  # sometimes people write numbers as x1.7
            token = token[1:]
        if token.startswith('$') and token.endswith('$'):
            token = token[1:-1]
        if is_float(token):
            return True
        try:
            value = int(token)
        except ValueError:
            continue
        if (not is_citation_year(tokens, i)) and (not is_list_number(tokens, i, value)):
            return True

    return False


def contains_sota(sent):
    return 'state-of-the-art' in sent or 'state of the art' in sent or 'SOTA' in sent


def extract_line(abstract, keyword, limit):
    """Pick the most informative sentences around the keyword, preferring
    sentences with numbers or SOTA claims. Returns (extract, has_number).
    The limit argument is currently unused."""
    lines = []
    numbered_lines = []
    kw_mentioned = False
    abstract = abstract.replace("et. al", "et al.")
    sentences = abstract.split('. ')
    kw_sentences = []
    for i, sent in enumerate(sentences):
        if keyword in sent.lower():
            kw_mentioned = True
            if has_number(sent):
                numbered_lines.append(sent)
            elif contains_sota(sent):
                numbered_lines.append(sent)
            else:
                kw_sentences.append(sent)
                lines.append(sent)
            continue

        if kw_mentioned and has_number(sent):
            if not numbered_lines:
                numbered_lines.append(kw_sentences[-1])
            numbered_lines.append(sent)
        if kw_mentioned and contains_sota(sent):
            lines.append(sent)

    if len(numbered_lines) > 0:
        return '. '.join(numbered_lines), True
    return '. '.join(lines[-2:]), False
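# Example of extract_line's behavior (an untested sketch; assumes the punkt
# tokenizer is available so has_number can tokenize the sentences):
#   extract_line('We study X. Our model improves BLEU by 2.3 points.', 'bleu', 200)
# keeps the numbered keyword sentence and returns:
#   ('Our model improves BLEU by 2.3 points.', True)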
def get_report(paper, keyword):
    if keyword in paper['abstract'].lower():
        title = h.unescape(paper['title'])
        headline = '{} ({} - {})\n'.format(title, paper['authors'][0], paper['date'])
        abstract = h.unescape(paper['abstract'])
        extract, has_number = extract_line(abstract, keyword, 280 - len(headline))
        if extract:
            report = headline + extract + '\nLink: {}'.format(paper['main_page'])
            return report, has_number
    return '', False


def txt2reports(txt, keyword, num_to_show):
    found = False
    txt = ''.join(chr(c) for c in txt)  # decode the response bytes into a str
    lines = txt.split('\n')
    lines = clean_empty_lines(lines)
    unshown = []

    for i in range(len(lines)):
        if num_to_show <= 0:
            return unshown, num_to_show, found

        line = lines[i].strip()
        if len(line) == 0:
            continue
        if line == '<li class="arxiv-result">':
            found = True
            paper, i = get_next_result(lines, i)
            report, has_number = get_report(paper, keyword)

            if has_number:
                print(report)
                print('====================================================')
                num_to_show -= 1
            elif report:
                unshown.append(report)
        if line == '</ol>':
            break
    return unshown, num_to_show, found


def get_papers(keyword, num_results=5):
    """
    If the keyword is made up of known English words, search in the CS category
    only, to avoid papers from other categories that its ambiguity would bring in.
    """

    if keyword in set(['GAN', 'bpc']):
        query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        keyword = keyword.lower()
    else:
        keyword = keyword.lower()
        words = keyword.split()
        d = SpellChecker()
        if not d.unknown(words):  # all words are in the dictionary
            query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        else:
            query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
    keyword_q = keyword.replace(' ', '+')
    page = 0
    per_page = 200
    num_to_show = num_results
    all_unshown = []

    while num_to_show > 0:
        query = query_temp.format(keyword_q, str(per_page), str(per_page * page))

        req = urllib.request.Request(query)
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:
            print('Error {}: problem accessing the server'.format(e.code))
            return

        txt = response.read()
        unshown, num_to_show, found = txt2reports(txt, keyword, num_to_show)
        if not found and not all_unshown and num_to_show == num_results:
            print('Sorry, we were unable to find any abstract with the word {}'.format(keyword))
            return

        if num_to_show < num_results / 2 or not found:
            for report in all_unshown[:num_to_show]:
                print(report)
                print('====================================================')
            if not found:
                return
            num_to_show -= len(all_unshown)
        else:
            all_unshown.extend(unshown)
        page += 1
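# Example of the query URL built above (pure string formatting; 'wavenet' is
# assumed to be unknown to the spellchecker, so the general template is used):
#   query_temp.format('wavenet', '200', '0')
#   -> 'https://arxiv.org/search/?searchtype=all&query=wavenet&abstracts=show&size=200&order=-announced_date_first&start=0'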
def main():
    if 'nt' in os.name:
        try:
            import win_unicode_console
            win_unicode_console.enable()
        except ImportError:
            warnings.warn('On Windows, encoding errors may arise when displaying the data.\n'
                          'If such errors occur, please install `win_unicode_console` via \n'
                          'the command `pip install win-unicode-console`.')

    if len(sys.argv) < 2:
        raise ValueError('You must specify a keyword')

    try:
        num_results = int(sys.argv[-1])
        assert num_results > 0, 'You must choose to show a positive number of results'
        keyword = ' '.join(sys.argv[1:-1])

    except ValueError:
        keyword = ' '.join(sys.argv[1:])
        num_results = 5

    get_papers(keyword, num_results)


if __name__ == '__main__':
    main()
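# Quick smoke test (illustrative; requires network access):
#   $ sotawhat perplexity 3
# or, without installing the package, from the repo root:
#   $ python3 -m sotawhat.sotawhat perplexity 3
--------------------------------------------------------------------------------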