├── .gitignore ├── Makefile.PL ├── Procfile ├── README.md ├── mendeleyparser ├── __init__.py └── mendeleyparser.py ├── parsecv.py ├── parsecv_tests.py ├── referenceparser ├── __init__.py └── referenceparser.py ├── requirements.txt ├── sample.json ├── settings.py ├── training.py └── utils ├── __init__.py ├── get_url.py ├── jsonify.py └── ratelimit.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | # Various 38 | env 39 | venv 40 | .hg 41 | .hgignore 42 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use ExtUtils::MakeMaker; 5 | 6 | WriteMakefile( 7 | NAME => 'app.pl', 8 | VERSION => '1.0', 9 | AUTHOR => 'Marc ', 10 | EXE_FILES => ['app.pl'], 11 | PREREQ_PM => {'Mojolicious' => '2.0', 12 | 'Class::Struct', 13 | 'Getopt::Long', 14 | 'Getopt::Std', 15 | 'File::Basename', 16 | 'File::Spec', 17 | 'FindBin', 18 | 'HTML::Entities', 19 | 'IO::File', 20 | 'POSIX', 21 | 'XML::Parser', 22 | 'XML::Twig', 23 | 'XML::Writer', 24 | 'XML::Writer::String'}, 25 | test => {TESTS => 't/*.t'} 26 | ); 27 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn parsecv:app 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cv-parser 2 | ========= 3 | 4 | NO LONGER MAINTAINED. 5 | 6 | 7 | *** 8 | 9 | An API to parse a CV, in particular the elements of its publication list. 10 | 11 | To deploy, a custom buildpack used to work (this may no longer be the correct syntax for Heroku): 12 | heroku config:set BUILDPACK_URL=https://github.com/stochastic-technologies/impactstory-buildpack.git -a heroku_app_name 13 | 14 | Accepts POST requests to /parsecv/ with either a "url" field or a "file" field for PDF files. 15 | 16 | URLs can point to a Mendeley user profile or a custom HTML CV. 17 | The only accepted file format is PDF. Multicolumn PDFs produce unpredictable results. 18 | Non-standard citation formats are parsed with less accuracy because the training dataset 19 | consists mostly of standard citations. 20 | 21 | Rate limiting is currently turned off because there is no Redis server 22 | on the test Heroku instance. 23 | 24 | To re-enable it, uncomment the lines before and after the parse_request() method.
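Example request with the "url" field, assuming an instance running locally at http://localhost:5000 (the host, port and profile URL here are illustrative only):

    import json
    import urllib
    import urllib2

    data = urllib.urlencode({"url": "http://www.mendeley.com/profiles/some-user/"})
    response = urllib2.urlopen("http://localhost:5000/parsecv/", data)
    print json.loads(response.read())

On success the response is a JSON list of BibJSON-style citation dicts; on failure it is a {"status": "error", "message": ...} object returned with HTTP status 422.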
25 | -------------------------------------------------------------------------------- /mendeleyparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ourresearch/cv-parser/87be9c84dd2b712860e3b95aaebcb46d1ffd7c76/mendeleyparser/__init__.py -------------------------------------------------------------------------------- /mendeleyparser/mendeleyparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import lxml.html as ET 3 | from utils import get_url 4 | import urlparse 5 | 6 | 7 | def parse_mendeley_html(base_url): 8 | """ 9 | Takes a Mendeley profile URL. 10 | 11 | Returns the user's publications in BibJSON format. 12 | """ 13 | # We know that the parsed_url's path starts with /profiles/ 14 | # and the scheme for mendeley urls is 15 | # www.mendeley.com/profiles/userid/other_things 16 | # so if we split the path by /, then user_id is split_path[2]. 17 | purl = urlparse.urlparse(base_url) 18 | split_path = purl.path.split("/") 19 | user_id = split_path[2] 20 | sanitized_path = "/profiles/" + user_id + "/publications/journal/" 21 | new_url_tuple = (purl.scheme, purl.netloc, sanitized_path, "", "", "") 22 | url = urlparse.urlunparse(new_url_tuple) 23 | 24 | page = get_url(url) 25 | soup = ET.fromstring(page.getvalue()) 26 | try: 27 | pagination = soup.get_element_by_id("user-publications").find_class("right")[0] 28 | except KeyError: 29 | return [] 30 | 31 | num_pages = len(pagination.cssselect("div.pagemenu > ul > li")) 32 | if num_pages == 0: 33 | num_pages = 1 34 | 35 | citation_dict = {} 36 | for i in range(num_pages): 37 | page_url = "/".join([url, str(i)]) 38 | page = get_url(page_url) 39 | soup = ET.fromstring(page.getvalue()) 40 | citation_dict.update(parse_citation_page(soup)) 41 | 42 | citation_list = [item for item in citation_dict.itervalues()] 43 | return citation_list 44 | 45 | 46 | def parse_citation_page(soup): 47 | root = soup.get_element_by_id("user-publications") 48 | try: 49 | articles = root.get_element_by_id("user-publications").find_class("document-desc") 50 | except KeyError: 51 | return [] 52 | 53 | bibjson_dict = {} 54 | for article in articles: 55 | try: 56 | data_text = [line.strip() for line in article.text.strip().split("\n")] 57 | except IndexError: 58 | data_text = [] 59 | try: 60 | authors = [author.strip() for author in data_text[0].split(",")] 61 | except IndexError: 62 | authors = [] 63 | 64 | try: 65 | title = article.cssselect("a")[0] 66 | except IndexError: 67 | title = None 68 | try: 69 | journal = article.cssselect("em")[0] 70 | except IndexError: 71 | journal = None 72 | 73 | try: 74 | vol_issue = article.cssselect("span")[0] 75 | except IndexError: 76 | vol_issue = None 77 | 78 | if len(authors) and title is not None: 79 | article_id = article.get("id") 80 | try: 81 | year = data_text[1].strip("()") 82 | except IndexError: 83 | year = None 84 | 85 | bibjson = {} 86 | bibjson["authors"] = authors 87 | bibjson["title"] = title.text 88 | if journal is not None: 89 | bibjson["journal"] = journal.text 90 | if year is not None: 91 | bibjson["year"] = year 92 | if vol_issue is not None and vol_issue.text is not None: 93 | vol_issue_re = re.match("(?P<volume>\d+)*\s*(\((?P<issue>.*)\))*", vol_issue.text.strip()) 94 | matched_items = vol_issue_re.groupdict() 95 | if matched_items["issue"] is not None: 96 | bibjson["issue"] = matched_items["issue"] 97 | if matched_items["volume"] is not None: 98 | bibjson["volume"] = matched_items["volume"] 99 | 100 |
bibjson_dict[article_id] = bibjson 101 | else: 102 | print article.text_content() 103 | 104 | return bibjson_dict 105 | -------------------------------------------------------------------------------- /parsecv.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from urlparse import urlparse 3 | 4 | from referenceparser import referenceparser 5 | from mendeleyparser import mendeleyparser 6 | import lxml.html as ET 7 | 8 | from utils import ratelimit, jsonify, get_view_rate_limit, get_url 9 | 10 | from flask import Flask, request 11 | from werkzeug.datastructures import FileStorage 12 | 13 | from pdfminer.pdfdocument import PDFParser, PDFDocument 14 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine 15 | from pdfminer.converter import PDFPageAggregator 16 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 17 | from pdfminer.pdfdevice import PDFDevice 18 | 19 | app = Flask(__name__) 20 | app.config.from_object('settings') 21 | 22 | 23 | def extract_resource_from_request(): 24 | """Extracts and returns a python file type object from POST field data.""" 25 | 26 | if not request.form and not request.files: 27 | raise ValueError("Received no data.") 28 | 29 | if request.form: 30 | input_file = get_url(request.form["url"]) 31 | return input_file 32 | else: 33 | if not isinstance(request.files["file"], FileStorage): 34 | raise ValueError("Invalid file type.") 35 | return request.files["file"] 36 | 37 | 38 | def is_pdf(resource): 39 | """Function to determine whether the input datatype is in PDF format.""" 40 | resource.seek(0) 41 | magic_number = resource.read(4) 42 | resource.seek(0) 43 | if magic_number == "%PDF": 44 | return True 45 | else: 46 | return False 47 | 48 | 49 | def pdf_from_resource(resource): 50 | """ 51 | Builds PDF mining objects from input data. 52 | 53 | This function attempts to open a PDF file for processing. 54 | """ 55 | parser = PDFParser(resource) 56 | document = PDFDocument() 57 | parser.set_document(document) 58 | 59 | document.set_parser(parser) 60 | document.initialize() 61 | 62 | return document 63 | 64 | 65 | def pdf_to_text(pdf): 66 | """ 67 | Takes pdfminer PDFDocument and converts to plaintext. 68 | 69 | Returns a string. 
70 | """ 71 | output = "" 72 | # create PDFMiner objects for data extraction 73 | rsrcmgr = PDFResourceManager() 74 | device = PDFDevice(rsrcmgr) 75 | interpreter = PDFPageInterpreter(rsrcmgr, device) 76 | laparams = LAParams() 77 | device = PDFPageAggregator(rsrcmgr, laparams=laparams) 78 | interpreter = PDFPageInterpreter(rsrcmgr, device) 79 | 80 | # iterate over all pages, select textbox objects and extract plaintext 81 | for page in pdf.get_pages(): 82 | interpreter.process_page(page) 83 | layout = device.get_result() 84 | for element in layout: 85 | if isinstance(element, LTTextBox) or isinstance(element, LTTextLine): 86 | output += element.get_text() 87 | return output 88 | 89 | 90 | def html_to_plaintext(resource): 91 | """Takes a file object containing HTML and returns all text elements.""" 92 | data = ET.fromstring(resource.getvalue()) 93 | text = data.text_content() 94 | 95 | return text 96 | 97 | 98 | def parse_references(text): 99 | return referenceparser.parse_plaintext(text) 100 | 101 | 102 | def is_mendeley_profile(url): 103 | purl = urlparse(url) 104 | return purl.netloc.endswith("mendeley.com") and purl.path.startswith("/profiles") 105 | 106 | 107 | @app.route('/parsecv/', methods=['POST']) 108 | #@ratelimit(limit=app.config["REQUESTS_PER_MINUTE"], per=60) 109 | @jsonify 110 | def parse_request(): 111 | """ 112 | Process HTTP requests with associated POST data. 113 | 114 | Expected POST fields are: 115 | file -- an attached PDF file 116 | url -- full URL 117 | """ 118 | 119 | text = "" 120 | need_parsing = 1 121 | 122 | try: 123 | if not request.form and not request.files: 124 | raise ValueError("Received no data.") 125 | 126 | if request.form: 127 | if is_mendeley_profile(request.form["url"]): 128 | text = mendeleyparser.parse_mendeley_html(request.form["url"]) 129 | need_parsing = 0 130 | 131 | else: 132 | input_file = get_url(request.form["url"]) 133 | text = html_to_plaintext(input_file) 134 | else: 135 | input_file = request.files["file"] 136 | 137 | if is_pdf(input_file): 138 | try: 139 | pdf_file = pdf_from_resource(input_file) 140 | except Exception, e: 141 | return {"status": "error", "message": str(e)} 142 | 143 | try: 144 | text = pdf_to_text(pdf_file) 145 | except Exception, e: 146 | return {"status": "error", "message": str(e)} 147 | else: 148 | return {"status": "error", "message": "Unsupported file format."} 149 | 150 | try: 151 | if need_parsing: 152 | references = parse_references(text) 153 | else: 154 | references = text 155 | except Exception, e: 156 | return {"status": "error", "message": str(e)} 157 | 158 | except ValueError, e: 159 | return {"status": "error", "message": str(e)} 160 | except urllib2.HTTPError, e: 161 | return {"status": "error", "message": str(e)} 162 | 163 | return references 164 | 165 | #@app.after_request 166 | #def inject_x_rate_headers(response): 167 | # limit = get_view_rate_limit() 168 | # if limit and limit.send_x_headers: 169 | # h = response.headers 170 | # h.add('X-RateLimit-Remaining', str(limit.remaining)) 171 | # h.add('X-RateLimit-Limit', str(limit.limit)) 172 | # h.add('X-RateLimit-Reset', str(limit.reset)) 173 | # return response 174 | 175 | 176 | if __name__ == '__main__': 177 | app.run() 178 | -------------------------------------------------------------------------------- /parsecv_tests.py: -------------------------------------------------------------------------------- 1 | import parsecv 2 | import unittest 3 | from StringIO import StringIO 4 | import json 5 | 6 | 7 | class ParseCVTestCase(unittest.TestCase): 8 | 
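    # The 422 status codes asserted below come from the jsonify decorator in
    # utils/jsonify.py, which maps any view return value of {"status": "error", ...}
    # to HTTP 422; the 413 in the oversized-upload test is Flask's standard
    # response once MAX_CONTENT_LENGTH (settings.py) is exceeded.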
9 | def setUp(self): 10 | parsecv.app.config['TESTING'] = True 11 | self.app = parsecv.app.test_client() 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def test_get_request(self): 17 | result = self.app.get("parsecv/") 18 | self.assertEqual(result.status_code, 405) 19 | 20 | def test_post_wrongurl(self): 21 | result = self.app.post("/") 22 | self.assertEqual(result.status_code, 404) 23 | 24 | def test_post_empty(self): 25 | result = self.app.post("parsecv/") 26 | self.assertEquals(result.status_code, 422) 27 | data = json.loads(result.data) 28 | self.assertIn("Received no data", data["message"]) 29 | 30 | def test_bogus_file(self): 31 | result = self.app.post("parsecv/", data={"file": (StringIO("hello"), "hello.txt")}) 32 | self.assertEqual(result.status_code, 422) 33 | data = json.loads(result.data) 34 | self.assertIn("Unsupported file format", data["message"]) 35 | 36 | def test_ridiculously_large_file(self): 37 | file_length_limit = parsecv.app.config["MAX_CONTENT_LENGTH"] 38 | result = self.app.post("parsecv/", data={"file": (StringIO("a" * (file_length_limit + 1)), "hello.txt")}) 39 | self.assertEqual(result.status_code, 413) 40 | 41 | def test_wrong_mendeley_url(self): 42 | result = self.app.post("parsecv/", data={"url": "http://www.mendeley.com/profiles/adfsafsf-zbyasudgasbby"}) 43 | data = json.loads(result.data) 44 | self.assertIn("404", data["message"]) 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /referenceparser/__init__.py: -------------------------------------------------------------------------------- 1 | from subprocess import call 2 | STUB = [ 3 | { 4 | "bibnumber": "3.1", 5 | "school": "Dept. Prob. and Stat., University of Sheffield", 6 | "title": "Stopping time identities and limit theorems for Markov chains", 7 | "author": [ 8 | "Pitman, J W" 9 | ], 10 | "collection": "pitnoid", 11 | "id": "p74t", 12 | "year": "1974", 13 | "keywords": [ 14 | "Stopping time", 15 | "Identities", 16 | "Markov chain", 17 | "Occupation time", 18 | "Rate of convergence", 19 | "Transition probabilities", 20 | "Coupling" 21 | ], 22 | "type": "phdthesis" 23 | }, 24 | { 25 | "bibnumber": "9", 26 | "title": "Birth, death and conditioning of Markov chains", 27 | "journal": "Annals of Probability", 28 | "author": [ 29 | "Jacobsen, M", 30 | "Pitman, J W" 31 | ], 32 | "mrclass": "60J10", 33 | "collection": "pitnoid", 34 | "volume": "5", 35 | "id": "jp77", 36 | "mrnumber": "MR0445613", 37 | "year": "1977", 38 | "keywords": [ 39 | "Path decomposition", 40 | "Conditioned process", 41 | "Conditional independence", 42 | "Markov chain", 43 | "Birth time", 44 | "Death time" 45 | ], 46 | "type": "article", 47 | "pages": "430 to 450", 48 | "znumber": "0363.60052" 49 | }, 50 | ] 51 | -------------------------------------------------------------------------------- /referenceparser/referenceparser.py: -------------------------------------------------------------------------------- 1 | import shortuuid 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import codecs 5 | import re 6 | import envoy 7 | from unidecode import unidecode 8 | from settings import PARSER_STRICTNESS 9 | 10 | 11 | def parse_plaintext(body): 12 | """Parse plaintext and return references in a BibJSON-like dict.""" 13 | tmpin = os.path.join("/tmp", shortuuid.uuid()) 14 | #tmpin = shortuuid.uuid() 15 | body = unidecode(body) 16 | #print body 17 | preprocessed_body = preprocess_body(body) 18 | #preprocessed_body = "References\n\n" + 
group_citations(preprocessed_body) 19 | #print preprocessed_body 20 | file = open(tmpin, "w") 21 | file.write(codecs.BOM_UTF8) 22 | file.write(preprocessed_body.encode("utf-8")) 23 | file.close() 24 | try: 25 | parsing = envoy.run("perl ParsCit/bin/citeExtract.pl -m extract_citations %s" % tmpin) 26 | bibjson_citations = xml_to_bibjson(parsing.std_out) 27 | except IOError: 28 | return {"status": "error", "message": "Could not run parser on file"} 29 | try: 30 | os.remove(tmpin) 31 | os.remove(tmpin + ".cite") 32 | os.remove(tmpin + ".body") 33 | except OSError: 34 | pass 35 | 36 | return bibjson_citations 37 | 38 | 39 | def preprocess_body(body): 40 | body = re.sub("[ \t\r\f\v]+", " ", body) 41 | list_of_ways_to_say_publications = ( 42 | "Refereed Research Publications", 43 | "Peer-Reviewed Publications", 44 | "Peer-Reviewed Journals", 45 | "Journal papers", 46 | "Published and Accepted Papers", 47 | "Articles", 48 | "Publications", 49 | "Publication", 50 | ) 51 | 52 | list_of_things_people_write_about_after_publications = ( 53 | "Book Chapters", 54 | "Workshops", 55 | "Workshop Papers", 56 | "Conference publications", 57 | "Conference Publications", 58 | "Seminars", 59 | "Selected Media", 60 | "Research Grants", 61 | "Research Interests", 62 | "Teaching:", 63 | "Professional Service", 64 | "Manuscripts", 65 | "Other Papers", 66 | "Workshop Presentations", 67 | "Referee Activity", 68 | "Talks", 69 | "Invited Talks", 70 | "Posters", 71 | ) 72 | 73 | lines = body.split("\n") 74 | # in CVs, "References" refers to something different than in scientific 75 | # articles. ParsCit expects "References" to refer to published papers, 76 | # not people. 77 | for index, line in enumerate(lines): 78 | if "References" in line or "REFERENCES" in line: 79 | lines[index] = "\n" 80 | norm_volume = re.search("(?P<volume>\d+)\s*(?:\((?P<issue>\d+)\)|)\s*:\s*(?P<start_page>[a-zA-Z]?\d+)(-(?P<end_page>[a-zA-Z]?\d+))?", line) 81 | if norm_volume: 82 | gd = norm_volume.groupdict() 83 | if gd["issue"] is not None: 84 | if gd["end_page"] is not None: 85 | replacement_string = "%s (%s), pp %s-%s" % (gd["volume"], gd["issue"], gd["start_page"], gd["end_page"]) 86 | else: 87 | replacement_string = "%s (%s), p. %s" % (gd["volume"], gd["issue"], gd["start_page"]) 88 | 89 | else: 90 | if gd["end_page"] is not None: 91 | replacement_string = "%s, pp %s-%s" % (gd["volume"], gd["start_page"], gd["end_page"]) 92 | else: 93 | replacement_string = "%s, p. %s" % (gd["volume"], gd["start_page"]) 94 | 95 | lines[index] = re.sub("\d+\s*(?:\(\d+\)|)\s*:\s*[a-zA-Z]?\d+(-[a-zA-Z]?\d+)?", replacement_string, lines[index]) 96 | 97 | marker = re.search("(?P<marker>^\d+)\.(?P<other>\D)", lines[index]) 98 | if marker: 99 | gd = marker.groupdict() 100 | replaced = gd["marker"] + "- %s" % gd["other"] 101 | lines[index] = re.sub("^\d+\.[^0-9\s]", replaced, lines[index]) 102 | 103 | found_reference_marker = 0 104 | found_end_marker = 0 105 | reference_marker = 0 106 | end_marker = 0 107 | 108 | # Preprocess the body to fit the heuristic rules used by Parscit. 109 | # Parscit starts parsing references when it sees a section called 110 | # "References" and stops when it sees "Acknowledgements". It also 111 | # looks for many other keywords but those were the simplest.
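    # For example, a CV heading such as "PEER-REVIEWED PUBLICATIONS" is
    # rewritten to "References" below so ParsCit starts extracting there, and a
    # later heading such as "Invited Talks" is rewritten to "Appendix" and used
    # as the cut-off point for the reference section.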
112 | for substring in list_of_ways_to_say_publications: 113 | for index, line in enumerate(lines): 114 | if not found_reference_marker: 115 | if substring in line or substring.upper() in line: 116 | #print "Beginning Marker:", line 117 | lines[index] = "References" 118 | reference_marker = index 119 | found_reference_marker = 1 120 | break 121 | if found_reference_marker: 122 | break 123 | 124 | for index, line in enumerate(lines): 125 | if found_reference_marker and not found_end_marker: 126 | if index > reference_marker + 3: 127 | for substring in list_of_things_people_write_about_after_publications: 128 | if substring in line or substring.upper() in line: 129 | if index > reference_marker + 3: 130 | #print "End Marker:", line 131 | lines[index] = "Appendix" 132 | end_marker = index 133 | found_end_marker = 1 134 | break 135 | if not found_end_marker: 136 | end_marker = len(lines) 137 | 138 | if found_reference_marker: 139 | body = "\n".join(lines[reference_marker:end_marker]) 140 | else: 141 | body = "References\n" + "\n".join(lines) 142 | 143 | return body 144 | 145 | 146 | def make_bibjson_citation(citation): 147 | cit = {} 148 | for element in citation: 149 | if element.tag == "authors": 150 | author_list = [] 151 | for author in element: 152 | author_list.append({"name": author.text}) 153 | cit["authors"] = author_list 154 | elif element.tag == "date": 155 | cit["year"] = element.text 156 | else: 157 | cit[element.tag] = element.text 158 | return cit 159 | 160 | 161 | def xml_to_bibjson(xml_string): 162 | #print xml_string 163 | try: 164 | root = ET.fromstring(xml_string) 165 | except IOError: 166 | return {"status": "error", "message": "CV caused error in parsing process."} 167 | except ET.ParseError: 168 | return {"status": "error", "message": "Parser did not produce an output."} 169 | 170 | alg = root[0] 171 | cit_list = alg[0] 172 | bibjson_citations = [] 173 | 174 | # PARSER_STRICTNESS = 0: Accept anything parsed by Parscit. 175 | # PARSER_STRICTNESS = 1: Accept anything determined "valid" 176 | # by the automatic Parscit tester. 177 | # PARSER_STRICTNESS = 2: Accept only citations with a parsed 178 | # "Journal" entry. 179 | for citation in cit_list: 180 | cit = make_bibjson_citation(citation) 181 | if PARSER_STRICTNESS == 1: 182 | if citation.attrib["valid"] == "true": 183 | if PARSER_STRICTNESS == 2: 184 | if "journal" in cit: 185 | bibjson_citations.append(cit) 186 | else: 187 | bibjson_citations.append(cit) 188 | else: 189 | bibjson_citations.append(cit) 190 | 191 | return bibjson_citations 192 | 193 | 194 | def group_citations(body): 195 | pdf_lines = body.split("\n") 196 | matched_citations = [] 197 | concatenated_lines = [] 198 | for line in pdf_lines: 199 | stripped_line = line.strip() 200 | # Check for a blank line. If line is blank and concatenated_lines 201 | # is non-empty, then concatenated_lines should be merged. 202 | if len(stripped_line) == 0: 203 | if len(concatenated_lines) > 0: 204 | matched_citations.append("\n".join(concatenated_lines)) 205 | concatenated_lines = [] 206 | continue 207 | 208 | # If concatenated_lines is empty and the line is smaller 209 | # than N characters, then it's probably garbage. 
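        # (The cut-off used below is 30 characters; such short isolated lines
        # are usually page numbers or running headers left over from PDF
        # extraction rather than the start of a citation.)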
210 | if len(concatenated_lines) == 0: 211 | if len(stripped_line) < 30: 212 | continue 213 | 214 | concatenated_lines.append(stripped_line) 215 | 216 | if len(concatenated_lines) > 1: 217 | if 2 < len(stripped_line) < 0.70 * len(concatenated_lines[-2]): 218 | matched_citations.append("\n".join(concatenated_lines)) 219 | concatenated_lines = [] 220 | continue 221 | if len(stripped_line) > 4.0 * len(concatenated_lines[-2]): 222 | del concatenated_lines[-2] 223 | 224 | return "\n".join(matched_citations) 225 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.9 2 | gunicorn==0.17.2 3 | pdfminer 4 | redis==2.7.2 5 | envoy 6 | shortuuid 7 | unidecode 8 | lxml 9 | cssselect 10 | -------------------------------------------------------------------------------- /sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "bibnumber": "3.1", 4 | "school": "Dept. Prob. and Stat., University of Sheffield", 5 | "title": "Stopping time identities and limit theorems for Markov chains", 6 | "author": [ 7 | "Pitman, J W" 8 | ], 9 | "collection": "pitnoid", 10 | "id": "p74t", 11 | "year": "1974", 12 | "keywords": [ 13 | "Stopping time", 14 | "Identities", 15 | "Markov chain", 16 | "Occupation time", 17 | "Rate of convergence", 18 | "Transition probabilities", 19 | "Coupling" 20 | ], 21 | "type": "phdthesis" 22 | }, 23 | { 24 | "bibnumber": "9", 25 | "title": "Birth, death and conditioning of Markov chains", 26 | "journal": "Annals of Probability", 27 | "author": [ 28 | "Jacobsen, M", 29 | "Pitman, J W" 30 | ], 31 | "mrclass": "60J10", 32 | "collection": "pitnoid", 33 | "volume": "5", 34 | "id": "jp77", 35 | "mrnumber": "MR0445613", 36 | "year": "1977", 37 | "keywords": [ 38 | "Path decomposition", 39 | "Conditioned process", 40 | "Conditional independence", 41 | "Markov chain", 42 | "Birth time", 43 | "Death time" 44 | ], 45 | "type": "article", 46 | "pages": "430 to 450", 47 | "znumber": "0363.60052" 48 | }, 49 | { 50 | "bibnumber": "10", 51 | "title": "An extension of de Finetti's theorem", 52 | "journal": "Advances in Applied Probability", 53 | "author": [ 54 | "Pitman, J" 55 | ], 56 | "collection": "pitnoid", 57 | "volume": "10", 58 | "id": "p78", 59 | "year": "1978", 60 | "type": "article", 61 | "pages": "268 to 270" 62 | }, 63 | { 64 | "bibnumber": "11", 65 | "title": "A pointwise ergodic theorem for the group of rational rotations", 66 | "journal": "Trans. Amer. Math. Soc.", 67 | "author": [ 68 | "Dubins, Lester E", 69 | "Pitman, Jim" 70 | ], 71 | "mrclass": "60G42 (28D99)", 72 | "collection": "pitnoid", 73 | "volume": "251", 74 | "id": "dp79", 75 | "mrnumber": "MR531981", 76 | "year": "1980", 77 | "keywords": [ 78 | "Ergodic theory", 79 | "Rational rotation" 80 | ], 81 | "type": "article", 82 | "pages": "299 to 308" 83 | }, 84 | { 85 | "bibnumber": "23", 86 | "title": "A decomposition of Bessel bridges", 87 | "journal": "Z. Wahrsch. Verw. 
Gebiete", 88 | "author": [ 89 | "Pitman, Jim", 90 | "Yor, Marc" 91 | ], 92 | "mrclass": "60J60 (60J55)", 93 | "collection": "pitnoid", 94 | "volume": "59", 95 | "id": "py82", 96 | "mrnumber": "MR656509", 97 | "year": "1982", 98 | "keywords": [ 99 | "Bessel bridge", 100 | "Levy Khintchine representation", 101 | "Local time", 102 | "Ray Knight theorem", 103 | "Markov excursion", 104 | "Occupation time" 105 | ], 106 | "type": "article", 107 | "pages": "425 to 457", 108 | "znumber": "0484.60062" 109 | }, 110 | { 111 | "bibnumber": "32", 112 | "title": "Comple\u0301ments a\u0300 l'e\u0301tude asymptotique des nombres de tours du mouvement brownien complexe autour d'un nombre fini de points", 113 | "journal": "C.R. Acad. Sc. Paris, Se\u0301rie I", 114 | "author": [ 115 | "Pitman, Jim", 116 | "Yor, Marc" 117 | ], 118 | "mrclass": "60J65", 119 | "collection": "pitnoid", 120 | "volume": "305", 121 | "id": "py87", 122 | "mrnumber": "MR921145", 123 | "year": "1987", 124 | "keywords": [ 125 | "Planar Brownian motion", 126 | "Winding", 127 | "Asymptotic law" 128 | ], 129 | "type": "article", 130 | "pages": "757 to 760" 131 | }, 132 | { 133 | "bibnumber": "35", 134 | "links": [ 135 | "http://stat.berkeley.edu/users/pitman/further.asym.pdf" 136 | ], 137 | "title": "Further asymptotic laws of planar Brownian motion", 138 | "journal": "Annals of Probability", 139 | "author": [ 140 | "Pitman, Jim", 141 | "Yor, Marc" 142 | ], 143 | "mrclass": "60J65 (60F05 60G44)", 144 | "collection": "pitnoid", 145 | "volume": "17", 146 | "id": "py89", 147 | "mrnumber": "MR1009441", 148 | "year": "1989", 149 | "keywords": [ 150 | "Planar Brownian motion", 151 | "Winding", 152 | "Asymptotic law" 153 | ], 154 | "type": "article", 155 | "pages": "965 to 1011", 156 | "znumber": "0686.60085" 157 | }, 158 | { 159 | "bibnumber": "34", 160 | "links": [ 161 | "http://stat.berkeley.edu/users/pitman/arc.pdf" 162 | ], 163 | "title": "The shortest planar arc of width one", 164 | "journal": "Amer. Math. Monthly", 165 | "author": [ 166 | "Adhikari, Ani", 167 | "Pitman, Jim" 168 | ], 169 | "mrclass": "52A40", 170 | "collection": "pitnoid", 171 | "volume": "96, No 4", 172 | "id": "ap89", 173 | "mrnumber": "MR992078", 174 | "year": "1989", 175 | "keywords": [ 176 | "Planar arc", 177 | "Worm problem" 178 | ], 179 | "type": "article", 180 | "pages": "309 to 327", 181 | "znumber": "0692.52001" 182 | }, 183 | { 184 | "bibnumber": "46.1", 185 | "title": "The two-parameter generalization of Ewens' random partition structure", 186 | "author": [ 187 | "Pitman, J" 188 | ], 189 | "number": "345", 190 | "collection": "pitnoid", 191 | "id": "jp.ew", 192 | "year": "1992", 193 | "keywords": [ 194 | "Ewens sampling formula", 195 | "Two parameter family of partition structures", 196 | "Partition structure" 197 | ], 198 | "type": "techreport", 199 | "institution": "Dept. Statistics, U.C. Berkeley" 200 | }, 201 | { 202 | "bibnumber": "48", 203 | "title": "Dilatations d'espace-temps, re\u0301arrangements des trajectoires browniennes, et quelques extensions d'une identite\u0301 de Knight", 204 | "journal": "C.R. Acad. Sci. Paris", 205 | "author": [ 206 | "Pitman, James W", 207 | "Yor, Marc" 208 | ], 209 | "mrclass": "60J65", 210 | "collection": "pitnoid", 211 | "volume": "t. 
316, Se\u0301rie I", 212 | "id": "py93c", 213 | "mrnumber": "MR1214423", 214 | "year": "1993", 215 | "keywords": [ 216 | "Random scaling", 217 | "Path rearrangement", 218 | "Knights identity" 219 | ], 220 | "type": "article", 221 | "pages": "723 to 726" 222 | } 223 | ] 224 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | MAX_CONTENT_LENGTH = 4 * 1024 ** 2 # Maximum upload/download size in bytes. 2 | DEBUG = True 3 | REQUESTS_PER_MINUTE = 60 # Used for rate-limiting. 4 | PARSER_STRICTNESS = 1 5 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import numpy 4 | 5 | from sklearn.naive_bayes import GaussianNB 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn import cross_validation 8 | 9 | 10 | def calculate_features(element): 11 | length = float(len(element)) 12 | punctuation_rate = len(re.findall("[,.]", element)) / length 13 | numbers_rate = len(re.findall("\d", element)) / length 14 | uppercase = sum(1 for character in element if character.isupper()) / length 15 | words = len(re.split("\s+", element)) 16 | return [punctuation_rate, uppercase, words, numbers_rate, length] 17 | 18 | 19 | def calculate_set_features(items): 20 | """Calculate the features of every item in the set and return them, along with the labels.""" 21 | features = [] 22 | labels = [] 23 | for item in items: 24 | for label in ("author", "title", "journal"): 25 | data = item.get(label) 26 | if not data: 27 | continue 28 | features.append(calculate_features(data)) 29 | labels.append(label) 30 | 31 | return features, labels 32 | 33 | input_dataset = json.loads(unicode(open("./publications.json").read(), "ISO-8859-1")) 34 | 35 | features, labels = calculate_set_features(input_dataset) 36 | 37 | # Cross-validate. 
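# Each sample is the five-element vector returned by calculate_features()
# ([punctuation_rate, uppercase, words, numbers_rate, length]) and each label is
# one of "author", "title" or "journal"; both classifiers below are scored with
# 10-fold cross-validation over the same feature matrix.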
38 | for classifier in ( 39 | GaussianNB(), 40 | GradientBoostingClassifier(n_estimators=100, learning_rate=1.4, max_depth=1, random_state=0), 41 | ): 42 | scores = cross_validation.cross_val_score(classifier, features, numpy.array(labels), cv=cross_validation.KFold(len(features), n_folds=10)) 43 | print "Final accuracy for %s: %0.3f (+/- %0.2f)" % (classifier.__class__.__name__, scores.mean(), scores.std() / 2) 44 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from jsonify import jsonify 2 | from ratelimit import ratelimit, get_view_rate_limit 3 | from get_url import get_url 4 | -------------------------------------------------------------------------------- /utils/get_url.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import StringIO 3 | from settings import MAX_CONTENT_LENGTH 4 | 5 | 6 | def get_url(url): 7 | resource_size = 0 8 | page = StringIO.StringIO() 9 | http_response = urllib2.urlopen(url) 10 | 11 | maximum_size = MAX_CONTENT_LENGTH 12 | # open resource but keep track of the size, throw exception if size exceeded 13 | byte = True 14 | while (byte): 15 | byte = http_response.read(10240) 16 | page.write(byte) 17 | resource_size += 10240 18 | if resource_size > maximum_size: 19 | byte = False 20 | raise ValueError("File size threshold exceeded.") 21 | return page 22 | -------------------------------------------------------------------------------- /utils/jsonify.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from flask import Response 4 | from functools import wraps 5 | 6 | 7 | def jsonify(f): 8 | """Decorator to set appropriate mimetype and response code.""" 9 | @wraps(f) 10 | def inner(*args, **kwargs): 11 | output = f(*args, **kwargs) 12 | if isinstance(output, dict) and output.get("status") == "error": 13 | response_code = 422 14 | else: 15 | response_code = 200 16 | return Response(json.dumps(output), mimetype='application/json', status=response_code) 17 | return inner 18 | -------------------------------------------------------------------------------- /utils/ratelimit.py: -------------------------------------------------------------------------------- 1 | import time 2 | from functools import update_wrapper 3 | from flask import request, g, Response 4 | from redis import Redis 5 | redis = Redis() 6 | 7 | 8 | class RateLimit(object): 9 | expiration_window = 10 10 | 11 | def __init__(self, key_prefix, limit, per, send_x_headers): 12 | self.reset = (int(time.time()) // per) * per + per 13 | self.key = key_prefix + str(self.reset) 14 | self.limit = limit 15 | self.per = per 16 | self.send_x_headers = send_x_headers 17 | p = redis.pipeline() 18 | p.incr(self.key) 19 | p.expireat(self.key, self.reset + self.expiration_window) 20 | self.current = min(p.execute()[0], limit) 21 | 22 | remaining = property(lambda x: x.limit - x.current) 23 | over_limit = property(lambda x: x.current >= x.limit) 24 | 25 | 26 | def get_view_rate_limit(): 27 | return getattr(g, '_view_rate_limit', None) 28 | 29 | 30 | def on_over_limit(limit): 31 | return Response('{"status": "error", "message": "Too many requests."}', mimetype='application/json', status=429) 32 | 33 | 34 | def ratelimit(limit, per=300, send_x_headers=True, 35 | over_limit=on_over_limit, 36 | scope_func=lambda: request.remote_addr, 37 | key_func=lambda: request.endpoint): 38 | 
def decorator(f): 39 | def rate_limited(*args, **kwargs): 40 | key = 'rate-limit/%s/%s/' % (key_func(), scope_func()) 41 | rlimit = RateLimit(key, limit, per, send_x_headers) 42 | g._view_rate_limit = rlimit 43 | if over_limit is not None and rlimit.over_limit: 44 | return over_limit(rlimit) 45 | return f(*args, **kwargs) 46 | return update_wrapper(rate_limited, f) 47 | return decorator 48 | --------------------------------------------------------------------------------
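For reference, re-enabling rate limiting in parsecv.py amounts to uncommenting the lines mentioned in the README; the result looks like the sketch below (taken from the commented-out code already in that file, so every name exists in this repo). The ratelimit decorator sits outside jsonify, so over-limit requests are rejected with a 429 before the view body runs:

    @app.route('/parsecv/', methods=['POST'])
    @ratelimit(limit=app.config["REQUESTS_PER_MINUTE"], per=60)
    @jsonify
    def parse_request():
        # ... existing body unchanged ...

    @app.after_request
    def inject_x_rate_headers(response):
        limit = get_view_rate_limit()
        if limit and limit.send_x_headers:
            h = response.headers
            h.add('X-RateLimit-Remaining', str(limit.remaining))
            h.add('X-RateLimit-Limit', str(limit.limit))
            h.add('X-RateLimit-Reset', str(limit.reset))
        return response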