├── .gitignore ├── Makefile.PL ├── Procfile ├── README.md ├── mendeleyparser ├── __init__.py └── mendeleyparser.py ├── parsecv.py ├── parsecv_tests.py ├── referenceparser ├── __init__.py └── referenceparser.py ├── requirements.txt ├── sample.json ├── settings.py ├── training.py └── utils ├── __init__.py ├── get_url.py ├── jsonify.py └── ratelimit.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | # Various 38 | env 39 | venv 40 | .hg 41 | .hgignore 42 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use ExtUtils::MakeMaker; 5 | 6 | WriteMakefile( 7 | NAME => 'app.pl', 8 | VERSION => '1.0', 9 | AUTHOR => 'Marc ', 10 | EXE_FILES => ['app.pl'], 11 | PREREQ_PM => {'Mojolicious' => '2.0', 12 | 'Class::Struct', 13 | 'Getopt::Long', 14 | 'Getopt::Std', 15 | 'File::Basename', 16 | 'File::Spec', 17 | 'FindBin', 18 | 'HTML::Entities', 19 | 'IO::File', 20 | 'POSIX', 21 | 'XML::Parser', 22 | 'XML::Twig', 23 | 'XML::Writer', 24 | 'XML::Writer::String'}, 25 | test => {TESTS => 't/*.t'} 26 | ); 27 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn parsecv:app 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cv-parser 2 | ========= 3 | 4 | NO LONGER MAINTAINED. 5 | 6 | 7 | *** 8 | 9 | An API to parse a CV, in particular the elements of its publication list. 10 | 11 | To deploy, a custom buildpack used to work (this may no longer be the correct syntax for Heroku): 12 | heroku config:set BUILDPACK_URL=https://github.com/stochastic-technologies/impactstory-buildpack.git -a heroku_app_name 13 | 14 | Accepts POST requests to /parsecv/ with either a "url" field or a "file" field for PDF files. 15 | 16 | URLs can point to a Mendeley user profile or a custom HTML CV. 17 | The only accepted file format is PDF. Multicolumn PDFs produce unpredictable results. 18 | Non-standard citation formats are parsed with less accuracy because the training dataset 19 | consists mostly of standard citations. 20 | 21 | Rate limiting is currently turned off because there is no Redis server 22 | on the test Heroku instance. 23 | 24 | To re-enable it, uncomment the lines before and after the parse_request() method.
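Example request with the "url" field, assuming an instance running locally at http://localhost:5000 (the host, port and profile URL here are illustrative only):

    import json
    import urllib
    import urllib2

    data = urllib.urlencode({"url": "http://www.mendeley.com/profiles/some-user/"})
    response = urllib2.urlopen("http://localhost:5000/parsecv/", data)
    print json.loads(response.read())

On success the response is a JSON list of BibJSON-style citation dicts; on failure it is a {"status": "error", "message": ...} object returned with HTTP status 422.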
25 | -------------------------------------------------------------------------------- /mendeleyparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ourresearch/cv-parser/87be9c84dd2b712860e3b95aaebcb46d1ffd7c76/mendeleyparser/__init__.py -------------------------------------------------------------------------------- /mendeleyparser/mendeleyparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import lxml.html as ET 3 | from utils import get_url 4 | import urlparse 5 | 6 | 7 | def parse_mendeley_html(base_url): 8 | """ 9 | Takes a Mendeley profile URL. 10 | 11 | Returns the user's publications in BibJSON format. 12 | """ 13 | # We know that the parsed_url's path starts with /profiles/ 14 | # and the scheme for mendeley urls is 15 | # www.mendeley.com/profiles/userid/other_things 16 | # so if we split the path by /, then user_id is split_path[2]. 17 | purl = urlparse.urlparse(base_url) 18 | split_path = purl.path.split("/") 19 | user_id = split_path[2] 20 | sanitized_path = "/profiles/" + user_id + "/publications/journal/" 21 | new_url_tuple = (purl.scheme, purl.netloc, sanitized_path, "", "", "") 22 | url = urlparse.urlunparse(new_url_tuple) 23 | 24 | page = get_url(url) 25 | soup = ET.fromstring(page.getvalue()) 26 | try: 27 | pagination = soup.get_element_by_id("user-publications").find_class("right")[0] 28 | except KeyError: 29 | return [] 30 | 31 | num_pages = len(pagination.cssselect("div.pagemenu > ul > li")) 32 | if num_pages == 0: 33 | num_pages = 1 34 | 35 | citation_dict = {} 36 | for i in range(num_pages): 37 | page_url = "/".join([url, str(i)]) 38 | page = get_url(page_url) 39 | soup = ET.fromstring(page.getvalue()) 40 | citation_dict.update(parse_citation_page(soup)) 41 | 42 | citation_list = [item for item in citation_dict.itervalues()] 43 | return citation_list 44 | 45 | 46 | def parse_citation_page(soup): 47 | root = soup.get_element_by_id("user-publications") 48 | try: 49 | articles = root.get_element_by_id("user-publications").find_class("document-desc") 50 | except KeyError: 51 | return [] 52 | 53 | bibjson_dict = {} 54 | for article in articles: 55 | try: 56 | data_text = [line.strip() for line in article.text.strip().split("\n")] 57 | except IndexError: 58 | data_text = [] 59 | try: 60 | authors = [author.strip() for author in data_text[0].split(",")] 61 | except IndexError: 62 | authors = [] 63 | 64 | try: 65 | title = article.cssselect("a")[0] 66 | except IndexError: 67 | title = None 68 | try: 69 | journal = article.cssselect("em")[0] 70 | except IndexError: 71 | journal = None 72 | 73 | try: 74 | vol_issue = article.cssselect("span")[0] 75 | except IndexError: 76 | vol_issue = None 77 | 78 | if len(authors) and title is not None: 79 | article_id = article.get("id") 80 | try: 81 | year = data_text[1].strip("()") 82 | except IndexError: 83 | year = None 84 | 85 | bibjson = {} 86 | bibjson["authors"] = authors 87 | bibjson["title"] = title.text 88 | if journal is not None: 89 | bibjson["journal"] = journal.text 90 | if year is not None: 91 | bibjson["year"] = year 92 | if vol_issue is not None and vol_issue.text is not None: 93 | vol_issue_re = re.match("(?P<volume>\d+)*\s*(\((?P<issue>.*)\))*", vol_issue.text.strip()) 94 | matched_items = vol_issue_re.groupdict() 95 | if matched_items["issue"] is not None: 96 | bibjson["issue"] = matched_items["issue"] 97 | if matched_items["volume"] is not None: 98 | bibjson["volume"] = matched_items["volume"] 99 | 100 |
bibjson_dict[article_id] = bibjson 101 | else: 102 | print article.text_content() 103 | 104 | return bibjson_dict 105 | -------------------------------------------------------------------------------- /parsecv.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from urlparse import urlparse 3 | 4 | from referenceparser import referenceparser 5 | from mendeleyparser import mendeleyparser 6 | import lxml.html as ET 7 | 8 | from utils import ratelimit, jsonify, get_view_rate_limit, get_url 9 | 10 | from flask import Flask, request 11 | from werkzeug.datastructures import FileStorage 12 | 13 | from pdfminer.pdfdocument import PDFParser, PDFDocument 14 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine 15 | from pdfminer.converter import PDFPageAggregator 16 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 17 | from pdfminer.pdfdevice import PDFDevice 18 | 19 | app = Flask(__name__) 20 | app.config.from_object('settings') 21 | 22 | 23 | def extract_resource_from_request(): 24 | """Extracts and returns a python file type object from POST field data.""" 25 | 26 | if not request.form and not request.files: 27 | raise ValueError("Received no data.") 28 | 29 | if request.form: 30 | input_file = get_url(request.form["url"]) 31 | return input_file 32 | else: 33 | if not isinstance(request.files["file"], FileStorage): 34 | raise ValueError("Invalid file type.") 35 | return request.files["file"] 36 | 37 | 38 | def is_pdf(resource): 39 | """Function to determine whether the input datatype is in PDF format.""" 40 | resource.seek(0) 41 | magic_number = resource.read(4) 42 | resource.seek(0) 43 | if magic_number == "%PDF": 44 | return True 45 | else: 46 | return False 47 | 48 | 49 | def pdf_from_resource(resource): 50 | """ 51 | Builds PDF mining objects from input data. 52 | 53 | This function attempts to open a PDF file for processing. 54 | """ 55 | parser = PDFParser(resource) 56 | document = PDFDocument() 57 | parser.set_document(document) 58 | 59 | document.set_parser(parser) 60 | document.initialize() 61 | 62 | return document 63 | 64 | 65 | def pdf_to_text(pdf): 66 | """ 67 | Takes pdfminer PDFDocument and converts to plaintext. 68 | 69 | Returns a string. 
70 | """ 71 | output = "" 72 | # create PDFMiner objects for data extraction 73 | rsrcmgr = PDFResourceManager() 74 | device = PDFDevice(rsrcmgr) 75 | interpreter = PDFPageInterpreter(rsrcmgr, device) 76 | laparams = LAParams() 77 | device = PDFPageAggregator(rsrcmgr, laparams=laparams) 78 | interpreter = PDFPageInterpreter(rsrcmgr, device) 79 | 80 | # iterate over all pages, select textbox objects and extract plaintext 81 | for page in pdf.get_pages(): 82 | interpreter.process_page(page) 83 | layout = device.get_result() 84 | for element in layout: 85 | if isinstance(element, LTTextBox) or isinstance(element, LTTextLine): 86 | output += element.get_text() 87 | return output 88 | 89 | 90 | def html_to_plaintext(resource): 91 | """Takes a file object containing HTML and returns all text elements.""" 92 | data = ET.fromstring(resource.getvalue()) 93 | text = data.text_content() 94 | 95 | return text 96 | 97 | 98 | def parse_references(text): 99 | return referenceparser.parse_plaintext(text) 100 | 101 | 102 | def is_mendeley_profile(url): 103 | purl = urlparse(url) 104 | return purl.netloc.endswith("mendeley.com") and purl.path.startswith("/profiles") 105 | 106 | 107 | @app.route('/parsecv/', methods=['POST']) 108 | #@ratelimit(limit=app.config["REQUESTS_PER_MINUTE"], per=60) 109 | @jsonify 110 | def parse_request(): 111 | """ 112 | Process HTTP requests with associated POST data. 113 | 114 | Expected POST fields are: 115 | file -- an attached PDF file 116 | url -- full URL 117 | """ 118 | 119 | text = "" 120 | need_parsing = 1 121 | 122 | try: 123 | if not request.form and not request.files: 124 | raise ValueError("Received no data.") 125 | 126 | if request.form: 127 | if is_mendeley_profile(request.form["url"]): 128 | text = mendeleyparser.parse_mendeley_html(request.form["url"]) 129 | need_parsing = 0 130 | 131 | else: 132 | input_file = get_url(request.form["url"]) 133 | text = html_to_plaintext(input_file) 134 | else: 135 | input_file = request.files["file"] 136 | 137 | if is_pdf(input_file): 138 | try: 139 | pdf_file = pdf_from_resource(input_file) 140 | except Exception, e: 141 | return {"status": "error", "message": str(e)} 142 | 143 | try: 144 | text = pdf_to_text(pdf_file) 145 | except Exception, e: 146 | return {"status": "error", "message": str(e)} 147 | else: 148 | return {"status": "error", "message": "Unsupported file format."} 149 | 150 | try: 151 | if need_parsing: 152 | references = parse_references(text) 153 | else: 154 | references = text 155 | except Exception, e: 156 | return {"status": "error", "message": str(e)} 157 | 158 | except ValueError, e: 159 | return {"status": "error", "message": str(e)} 160 | except urllib2.HTTPError, e: 161 | return {"status": "error", "message": str(e)} 162 | 163 | return references 164 | 165 | #@app.after_request 166 | #def inject_x_rate_headers(response): 167 | # limit = get_view_rate_limit() 168 | # if limit and limit.send_x_headers: 169 | # h = response.headers 170 | # h.add('X-RateLimit-Remaining', str(limit.remaining)) 171 | # h.add('X-RateLimit-Limit', str(limit.limit)) 172 | # h.add('X-RateLimit-Reset', str(limit.reset)) 173 | # return response 174 | 175 | 176 | if __name__ == '__main__': 177 | app.run() 178 | -------------------------------------------------------------------------------- /parsecv_tests.py: -------------------------------------------------------------------------------- 1 | import parsecv 2 | import unittest 3 | from StringIO import StringIO 4 | import json 5 | 6 | 7 | class ParseCVTestCase(unittest.TestCase): 8 | 
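    # The 422 status codes asserted below come from the jsonify decorator in
    # utils/jsonify.py, which maps any view return value of {"status": "error", ...}
    # to HTTP 422; the 413 in the oversized-upload test is Flask's standard
    # response once MAX_CONTENT_LENGTH (settings.py) is exceeded.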
9 | def setUp(self): 10 | parsecv.app.config['TESTING'] = True 11 | self.app = parsecv.app.test_client() 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def test_get_request(self): 17 | result = self.app.get("parsecv/") 18 | self.assertEqual(result.status_code, 405) 19 | 20 | def test_post_wrongurl(self): 21 | result = self.app.post("/") 22 | self.assertEqual(result.status_code, 404) 23 | 24 | def test_post_empty(self): 25 | result = self.app.post("parsecv/") 26 | self.assertEquals(result.status_code, 422) 27 | data = json.loads(result.data) 28 | self.assertIn("Received no data", data["message"]) 29 | 30 | def test_bogus_file(self): 31 | result = self.app.post("parsecv/", data={"file": (StringIO("hello"), "hello.txt")}) 32 | self.assertEqual(result.status_code, 422) 33 | data = json.loads(result.data) 34 | self.assertIn("Unsupported file format", data["message"]) 35 | 36 | def test_ridiculously_large_file(self): 37 | file_length_limit = parsecv.app.config["MAX_CONTENT_LENGTH"] 38 | result = self.app.post("parsecv/", data={"file": (StringIO("a" * (file_length_limit + 1)), "hello.txt")}) 39 | self.assertEqual(result.status_code, 413) 40 | 41 | def test_wrong_mendeley_url(self): 42 | result = self.app.post("parsecv/", data={"url": "http://www.mendeley.com/profiles/adfsafsf-zbyasudgasbby"}) 43 | data = json.loads(result.data) 44 | self.assertIn("404", data["message"]) 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /referenceparser/__init__.py: -------------------------------------------------------------------------------- 1 | from subprocess import call 2 | STUB = [ 3 | { 4 | "bibnumber": "3.1", 5 | "school": "Dept. Prob. and Stat., University of Sheffield", 6 | "title": "Stopping time identities and limit theorems for Markov chains", 7 | "author": [ 8 | "Pitman, J W" 9 | ], 10 | "collection": "pitnoid", 11 | "id": "p74t", 12 | "year": "1974", 13 | "keywords": [ 14 | "Stopping time", 15 | "Identities", 16 | "Markov chain", 17 | "Occupation time", 18 | "Rate of convergence", 19 | "Transition probabilities", 20 | "Coupling" 21 | ], 22 | "type": "phdthesis" 23 | }, 24 | { 25 | "bibnumber": "9", 26 | "title": "Birth, death and conditioning of Markov chains", 27 | "journal": "Annals of Probability", 28 | "author": [ 29 | "Jacobsen, M", 30 | "Pitman, J W" 31 | ], 32 | "mrclass": "60J10", 33 | "collection": "pitnoid", 34 | "volume": "5", 35 | "id": "jp77", 36 | "mrnumber": "MR0445613", 37 | "year": "1977", 38 | "keywords": [ 39 | "Path decomposition", 40 | "Conditioned process", 41 | "Conditional independence", 42 | "Markov chain", 43 | "Birth time", 44 | "Death time" 45 | ], 46 | "type": "article", 47 | "pages": "430 to 450", 48 | "znumber": "0363.60052" 49 | }, 50 | ] 51 | -------------------------------------------------------------------------------- /referenceparser/referenceparser.py: -------------------------------------------------------------------------------- 1 | import shortuuid 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import codecs 5 | import re 6 | import envoy 7 | from unidecode import unidecode 8 | from settings import PARSER_STRICTNESS 9 | 10 | 11 | def parse_plaintext(body): 12 | """Parse plaintext and return references in a BibJSON-like dict.""" 13 | tmpin = os.path.join("/tmp", shortuuid.uuid()) 14 | #tmpin = shortuuid.uuid() 15 | body = unidecode(body) 16 | #print body 17 | preprocessed_body = preprocess_body(body) 18 | #preprocessed_body = "References\n\n" + 
group_citations(preprocessed_body) 19 | #print preprocessed_body 20 | file = open(tmpin, "w") 21 | file.write(codecs.BOM_UTF8) 22 | file.write(preprocessed_body.encode("utf-8")) 23 | file.close() 24 | try: 25 | parsing = envoy.run("perl ParsCit/bin/citeExtract.pl -m extract_citations %s" % tmpin) 26 | bibjson_citations = xml_to_bibjson(parsing.std_out) 27 | except IOError: 28 | return {"status": "error", "message": "Could not run parser on file"} 29 | try: 30 | os.remove(tmpin) 31 | os.remove(tmpin + ".cite") 32 | os.remove(tmpin + ".body") 33 | except OSError: 34 | pass 35 | 36 | return bibjson_citations 37 | 38 | 39 | def preprocess_body(body): 40 | body = re.sub("[ \t\r\f\v]+", " ", body) 41 | list_of_ways_to_say_publications = ( 42 | "Refereed Research Publications", 43 | "Peer-Reviewed Publications", 44 | "Peer-Reviewed Journals", 45 | "Journal papers", 46 | "Published and Accepted Papers", 47 | "Articles", 48 | "Publications", 49 | "Publication", 50 | ) 51 | 52 | list_of_things_people_write_about_after_publications = ( 53 | "Book Chapters", 54 | "Workshops", 55 | "Workshop Papers", 56 | "Conference publications", 57 | "Conference Publications", 58 | "Seminars", 59 | "Selected Media", 60 | "Research Grants", 61 | "Research Interests", 62 | "Teaching:", 63 | "Professional Service", 64 | "Manuscripts", 65 | "Other Papers", 66 | "Workshop Presentations", 67 | "Referee Activity", 68 | "Talks", 69 | "Invited Talks", 70 | "Posters", 71 | ) 72 | 73 | lines = body.split("\n") 74 | # in CVs, "References" refers to something different than in scientific 75 | # articles. ParsCit expects "References" to refer to published papers, 76 | # not people. 77 | for index, line in enumerate(lines): 78 | if "References" in line or "REFERENCES" in line: 79 | lines[index] = "\n" 80 | norm_volume = re.search("(?P<volume>\d+)\s*(?:\((?P<issue>\d+)\)|)\s*:\s*(?P<start_page>[a-zA-Z]?\d+)(-(?P<end_page>[a-zA-Z]?\d+))?", line) 81 | if norm_volume: 82 | gd = norm_volume.groupdict() 83 | if gd["issue"] is not None: 84 | if gd["end_page"] is not None: 85 | replacement_string = "%s (%s), pp %s-%s" % (gd["volume"], gd["issue"], gd["start_page"], gd["end_page"]) 86 | else: 87 | replacement_string = "%s (%s), p. %s" % (gd["volume"], gd["issue"], gd["start_page"]) 88 | 89 | else: 90 | if gd["end_page"] is not None: 91 | replacement_string = "%s, pp %s-%s" % (gd["volume"], gd["start_page"], gd["end_page"]) 92 | else: 93 | replacement_string = "%s, p. %s" % (gd["volume"], gd["start_page"]) 94 | 95 | lines[index] = re.sub("\d+\s*(?:\(\d+\)|)\s*:\s*[a-zA-Z]?\d+(-[a-zA-Z]?\d+)?", replacement_string, lines[index]) 96 | 97 | marker = re.search("(?P<marker>^\d+)\.(?P<other>\D)", lines[index]) 98 | if marker: 99 | gd = marker.groupdict() 100 | replaced = gd["marker"] + "- %s" % gd["other"] 101 | lines[index] = re.sub("^\d+\.[^0-9\s]", replaced, lines[index]) 102 | 103 | found_reference_marker = 0 104 | found_end_marker = 0 105 | reference_marker = 0 106 | end_marker = 0 107 | 108 | # Preprocess the body to fit the heuristic rules used by Parscit. 109 | # Parscit starts parsing references when it sees a section called 110 | # "References" and stops when it sees "Acknowledgements". It also 111 | # looks for many other keywords but those were the simplest.
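    # For example, a CV heading such as "PEER-REVIEWED PUBLICATIONS" is
    # rewritten to "References" below so ParsCit starts extracting there, and a
    # later heading such as "Invited Talks" is rewritten to "Appendix" and used
    # as the cut-off point for the reference section.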
112 | for substring in list_of_ways_to_say_publications: 113 | for index, line in enumerate(lines): 114 | if not found_reference_marker: 115 | if substring in line or substring.upper() in line: 116 | #print "Beginning Marker:", line 117 | lines[index] = "References" 118 | reference_marker = index 119 | found_reference_marker = 1 120 | break 121 | if found_reference_marker: 122 | break 123 | 124 | for index, line in enumerate(lines): 125 | if found_reference_marker and not found_end_marker: 126 | if index > reference_marker + 3: 127 | for substring in list_of_things_people_write_about_after_publications: 128 | if substring in line or substring.upper() in line: 129 | if index > reference_marker + 3: 130 | #print "End Marker:", line 131 | lines[index] = "Appendix" 132 | end_marker = index 133 | found_end_marker = 1 134 | break 135 | if not found_end_marker: 136 | end_marker = len(lines) 137 | 138 | if found_reference_marker: 139 | body = "\n".join(lines[reference_marker:end_marker]) 140 | else: 141 | body = "References\n" + "\n".join(lines) 142 | 143 | return body 144 | 145 | 146 | def make_bibjson_citation(citation): 147 | cit = {} 148 | for element in citation: 149 | if element.tag == "authors": 150 | author_list = [] 151 | for author in element: 152 | author_list.append({"name": author.text}) 153 | cit["authors"] = author_list 154 | elif element.tag == "date": 155 | cit["year"] = element.text 156 | else: 157 | cit[element.tag] = element.text 158 | return cit 159 | 160 | 161 | def xml_to_bibjson(xml_string): 162 | #print xml_string 163 | try: 164 | root = ET.fromstring(xml_string) 165 | except IOError: 166 | return {"status": "error", "message": "CV caused error in parsing process."} 167 | except ET.ParseError: 168 | return {"status": "error", "message": "Parser did not produce an output."} 169 | 170 | alg = root[0] 171 | cit_list = alg[0] 172 | bibjson_citations = [] 173 | 174 | # PARSER_STRICTNESS = 0: Accept anything parsed by Parscit. 175 | # PARSER_STRICTNESS = 1: Accept anything determined "valid" 176 | # by the automatic Parscit tester. 177 | # PARSER_STRICTNESS = 2: Accept only citations with a parsed 178 | # "Journal" entry. 179 | for citation in cit_list: 180 | cit = make_bibjson_citation(citation) 181 | if PARSER_STRICTNESS == 1: 182 | if citation.attrib["valid"] == "true": 183 | if PARSER_STRICTNESS == 2: 184 | if "journal" in cit: 185 | bibjson_citations.append(cit) 186 | else: 187 | bibjson_citations.append(cit) 188 | else: 189 | bibjson_citations.append(cit) 190 | 191 | return bibjson_citations 192 | 193 | 194 | def group_citations(body): 195 | pdf_lines = body.split("\n") 196 | matched_citations = [] 197 | concatenated_lines = [] 198 | for line in pdf_lines: 199 | stripped_line = line.strip() 200 | # Check for a blank line. If line is blank and concatenated_lines 201 | # is non-empty, then concatenated_lines should be merged. 202 | if len(stripped_line) == 0: 203 | if len(concatenated_lines) > 0: 204 | matched_citations.append("\n".join(concatenated_lines)) 205 | concatenated_lines = [] 206 | continue 207 | 208 | # If concatenated_lines is empty and the line is smaller 209 | # than N characters, then it's probably garbage. 
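        # (The cut-off used below is 30 characters; such short isolated lines
        # are usually page numbers or running headers left over from PDF
        # extraction rather than the start of a citation.)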
210 | if len(concatenated_lines) == 0: 211 | if len(stripped_line) < 30: 212 | continue 213 | 214 | concatenated_lines.append(stripped_line) 215 | 216 | if len(concatenated_lines) > 1: 217 | if 2 < len(stripped_line) < 0.70 * len(concatenated_lines[-2]): 218 | matched_citations.append("\n".join(concatenated_lines)) 219 | concatenated_lines = [] 220 | continue 221 | if len(stripped_line) > 4.0 * len(concatenated_lines[-2]): 222 | del concatenated_lines[-2] 223 | 224 | return "\n".join(matched_citations) 225 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.9 2 | gunicorn==0.17.2 3 | pdfminer 4 | redis==2.7.2 5 | envoy 6 | shortuuid 7 | unidecode 8 | lxml 9 | cssselect 10 | -------------------------------------------------------------------------------- /sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "bibnumber": "3.1", 4 | "school": "Dept. Prob. and Stat., University of Sheffield", 5 | "title": "Stopping time identities and limit theorems for Markov chains", 6 | "author": [ 7 | "Pitman, J W" 8 | ], 9 | "collection": "pitnoid", 10 | "id": "p74t", 11 | "year": "1974", 12 | "keywords": [ 13 | "Stopping time", 14 | "Identities", 15 | "Markov chain", 16 | "Occupation time", 17 | "Rate of convergence", 18 | "Transition probabilities", 19 | "Coupling" 20 | ], 21 | "type": "phdthesis" 22 | }, 23 | { 24 | "bibnumber": "9", 25 | "title": "Birth, death and conditioning of Markov chains", 26 | "journal": "Annals of Probability", 27 | "author": [ 28 | "Jacobsen, M", 29 | "Pitman, J W" 30 | ], 31 | "mrclass": "60J10", 32 | "collection": "pitnoid", 33 | "volume": "5", 34 | "id": "jp77", 35 | "mrnumber": "MR0445613", 36 | "year": "1977", 37 | "keywords": [ 38 | "Path decomposition", 39 | "Conditioned process", 40 | "Conditional independence", 41 | "Markov chain", 42 | "Birth time", 43 | "Death time" 44 | ], 45 | "type": "article", 46 | "pages": "430 to 450", 47 | "znumber": "0363.60052" 48 | }, 49 | { 50 | "bibnumber": "10", 51 | "title": "An extension of de Finetti's theorem", 52 | "journal": "Advances in Applied Probability", 53 | "author": [ 54 | "Pitman, J" 55 | ], 56 | "collection": "pitnoid", 57 | "volume": "10", 58 | "id": "p78", 59 | "year": "1978", 60 | "type": "article", 61 | "pages": "268 to 270" 62 | }, 63 | { 64 | "bibnumber": "11", 65 | "title": "A pointwise ergodic theorem for the group of rational rotations", 66 | "journal": "Trans. Amer. Math. Soc.", 67 | "author": [ 68 | "Dubins, Lester E", 69 | "Pitman, Jim" 70 | ], 71 | "mrclass": "60G42 (28D99)", 72 | "collection": "pitnoid", 73 | "volume": "251", 74 | "id": "dp79", 75 | "mrnumber": "MR531981", 76 | "year": "1980", 77 | "keywords": [ 78 | "Ergodic theory", 79 | "Rational rotation" 80 | ], 81 | "type": "article", 82 | "pages": "299 to 308" 83 | }, 84 | { 85 | "bibnumber": "23", 86 | "title": "A decomposition of Bessel bridges", 87 | "journal": "Z. Wahrsch. Verw. 
Gebiete", 88 | "author": [ 89 | "Pitman, Jim", 90 | "Yor, Marc" 91 | ], 92 | "mrclass": "60J60 (60J55)", 93 | "collection": "pitnoid", 94 | "volume": "59", 95 | "id": "py82", 96 | "mrnumber": "MR656509", 97 | "year": "1982", 98 | "keywords": [ 99 | "Bessel bridge", 100 | "Levy Khintchine representation", 101 | "Local time", 102 | "Ray Knight theorem", 103 | "Markov excursion", 104 | "Occupation time" 105 | ], 106 | "type": "article", 107 | "pages": "425 to 457", 108 | "znumber": "0484.60062" 109 | }, 110 | { 111 | "bibnumber": "32", 112 | "title": "Comple\u0301ments a\u0300 l'e\u0301tude asymptotique des nombres de tours du mouvement brownien complexe autour d'un nombre fini de points", 113 | "journal": "C.R. Acad. Sc. Paris, Se\u0301rie I", 114 | "author": [ 115 | "Pitman, Jim", 116 | "Yor, Marc" 117 | ], 118 | "mrclass": "60J65", 119 | "collection": "pitnoid", 120 | "volume": "305", 121 | "id": "py87", 122 | "mrnumber": "MR921145", 123 | "year": "1987", 124 | "keywords": [ 125 | "Planar Brownian motion", 126 | "Winding", 127 | "Asymptotic law" 128 | ], 129 | "type": "article", 130 | "pages": "757 to 760" 131 | }, 132 | { 133 | "bibnumber": "35", 134 | "links": [ 135 | "http://stat.berkeley.edu/users/pitman/further.asym.pdf" 136 | ], 137 | "title": "Further asymptotic laws of planar Brownian motion", 138 | "journal": "Annals of Probability", 139 | "author": [ 140 | "Pitman, Jim", 141 | "Yor, Marc" 142 | ], 143 | "mrclass": "60J65 (60F05 60G44)", 144 | "collection": "pitnoid", 145 | "volume": "17", 146 | "id": "py89", 147 | "mrnumber": "MR1009441", 148 | "year": "1989", 149 | "keywords": [ 150 | "Planar Brownian motion", 151 | "Winding", 152 | "Asymptotic law" 153 | ], 154 | "type": "article", 155 | "pages": "965 to 1011", 156 | "znumber": "0686.60085" 157 | }, 158 | { 159 | "bibnumber": "34", 160 | "links": [ 161 | "http://stat.berkeley.edu/users/pitman/arc.pdf" 162 | ], 163 | "title": "The shortest planar arc of width one", 164 | "journal": "Amer. Math. Monthly", 165 | "author": [ 166 | "Adhikari, Ani", 167 | "Pitman, Jim" 168 | ], 169 | "mrclass": "52A40", 170 | "collection": "pitnoid", 171 | "volume": "96, No 4", 172 | "id": "ap89", 173 | "mrnumber": "MR992078", 174 | "year": "1989", 175 | "keywords": [ 176 | "Planar arc", 177 | "Worm problem" 178 | ], 179 | "type": "article", 180 | "pages": "309 to 327", 181 | "znumber": "0692.52001" 182 | }, 183 | { 184 | "bibnumber": "46.1", 185 | "title": "The two-parameter generalization of Ewens' random partition structure", 186 | "author": [ 187 | "Pitman, J" 188 | ], 189 | "number": "345", 190 | "collection": "pitnoid", 191 | "id": "jp.ew", 192 | "year": "1992", 193 | "keywords": [ 194 | "Ewens sampling formula", 195 | "Two parameter family of partition structures", 196 | "Partition structure" 197 | ], 198 | "type": "techreport", 199 | "institution": "Dept. Statistics, U.C. Berkeley" 200 | }, 201 | { 202 | "bibnumber": "48", 203 | "title": "Dilatations d'espace-temps, re\u0301arrangements des trajectoires browniennes, et quelques extensions d'une identite\u0301 de Knight", 204 | "journal": "C.R. Acad. Sci. Paris", 205 | "author": [ 206 | "Pitman, James W", 207 | "Yor, Marc" 208 | ], 209 | "mrclass": "60J65", 210 | "collection": "pitnoid", 211 | "volume": "t. 
316, Se\u0301rie I", 212 | "id": "py93c", 213 | "mrnumber": "MR1214423", 214 | "year": "1993", 215 | "keywords": [ 216 | "Random scaling", 217 | "Path rearrangement", 218 | "Knights identity" 219 | ], 220 | "type": "article", 221 | "pages": "723 to 726" 222 | } 223 | ] 224 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | MAX_CONTENT_LENGTH = 4 * 1024 ** 2 # Maximum upload/download size in bytes. 2 | DEBUG = True 3 | REQUESTS_PER_MINUTE = 60 # Used for rate-limiting. 4 | PARSER_STRICTNESS = 1 5 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import numpy 4 | 5 | from sklearn.naive_bayes import GaussianNB 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn import cross_validation 8 | 9 | 10 | def calculate_features(element): 11 | length = float(len(element)) 12 | punctuation_rate = len(re.findall("[,.]", element)) / length 13 | numbers_rate = len(re.findall("\d", element)) / length 14 | uppercase = sum(1 for character in element if character.isupper()) / length 15 | words = len(re.split("\s+", element)) 16 | return [punctuation_rate, uppercase, words, numbers_rate, length] 17 | 18 | 19 | def calculate_set_features(items): 20 | """Calculate the features of every item in the set and return them, along with the labels.""" 21 | features = [] 22 | labels = [] 23 | for item in items: 24 | for label in ("author", "title", "journal"): 25 | data = item.get(label) 26 | if not data: 27 | continue 28 | features.append(calculate_features(data)) 29 | labels.append(label) 30 | 31 | return features, labels 32 | 33 | input_dataset = json.loads(unicode(open("./publications.json").read(), "ISO-8859-1")) 34 | 35 | features, labels = calculate_set_features(input_dataset) 36 | 37 | # Cross-validate. 
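# Each sample is the five-element vector returned by calculate_features()
# ([punctuation_rate, uppercase, words, numbers_rate, length]) and each label is
# one of "author", "title" or "journal"; both classifiers below are scored with
# 10-fold cross-validation over the same feature matrix.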
38 | for classifier in ( 39 | GaussianNB(), 40 | GradientBoostingClassifier(n_estimators=100, learning_rate=1.4, max_depth=1, random_state=0), 41 | ): 42 | scores = cross_validation.cross_val_score(classifier, features, numpy.array(labels), cv=cross_validation.KFold(len(features), n_folds=10)) 43 | print "Final accuracy for %s: %0.3f (+/- %0.2f)" % (classifier.__class__.__name__, scores.mean(), scores.std() / 2) 44 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from jsonify import jsonify 2 | from ratelimit import ratelimit, get_view_rate_limit 3 | from get_url import get_url 4 | -------------------------------------------------------------------------------- /utils/get_url.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import StringIO 3 | from settings import MAX_CONTENT_LENGTH 4 | 5 | 6 | def get_url(url): 7 | resource_size = 0 8 | page = StringIO.StringIO() 9 | http_response = urllib2.urlopen(url) 10 | 11 | maximum_size = MAX_CONTENT_LENGTH 12 | # open resource but keep track of the size, throw exception if size exceeded 13 | byte = True 14 | while (byte): 15 | byte = http_response.read(10240) 16 | page.write(byte) 17 | resource_size += 10240 18 | if resource_size > maximum_size: 19 | byte = False 20 | raise ValueError("File size threshold exceeded.") 21 | return page 22 | -------------------------------------------------------------------------------- /utils/jsonify.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from flask import Response 4 | from functools import wraps 5 | 6 | 7 | def jsonify(f): 8 | """Decorator to set appropriate mimetype and response code.""" 9 | @wraps(f) 10 | def inner(*args, **kwargs): 11 | output = f(*args, **kwargs) 12 | if isinstance(output, dict) and output.get("status") == "error": 13 | response_code = 422 14 | else: 15 | response_code = 200 16 | return Response(json.dumps(output), mimetype='application/json', status=response_code) 17 | return inner 18 | -------------------------------------------------------------------------------- /utils/ratelimit.py: -------------------------------------------------------------------------------- 1 | import time 2 | from functools import update_wrapper 3 | from flask import request, g, Response 4 | from redis import Redis 5 | redis = Redis() 6 | 7 | 8 | class RateLimit(object): 9 | expiration_window = 10 10 | 11 | def __init__(self, key_prefix, limit, per, send_x_headers): 12 | self.reset = (int(time.time()) // per) * per + per 13 | self.key = key_prefix + str(self.reset) 14 | self.limit = limit 15 | self.per = per 16 | self.send_x_headers = send_x_headers 17 | p = redis.pipeline() 18 | p.incr(self.key) 19 | p.expireat(self.key, self.reset + self.expiration_window) 20 | self.current = min(p.execute()[0], limit) 21 | 22 | remaining = property(lambda x: x.limit - x.current) 23 | over_limit = property(lambda x: x.current >= x.limit) 24 | 25 | 26 | def get_view_rate_limit(): 27 | return getattr(g, '_view_rate_limit', None) 28 | 29 | 30 | def on_over_limit(limit): 31 | return Response('{"status": "error", "message": "Too many requests."}', mimetype='application/json', status=429) 32 | 33 | 34 | def ratelimit(limit, per=300, send_x_headers=True, 35 | over_limit=on_over_limit, 36 | scope_func=lambda: request.remote_addr, 37 | key_func=lambda: request.endpoint): 38 | 
def decorator(f): 39 | def rate_limited(*args, **kwargs): 40 | key = 'rate-limit/%s/%s/' % (key_func(), scope_func()) 41 | rlimit = RateLimit(key, limit, per, send_x_headers) 42 | g._view_rate_limit = rlimit 43 | if over_limit is not None and rlimit.over_limit: 44 | return over_limit(rlimit) 45 | return f(*args, **kwargs) 46 | return update_wrapper(rate_limited, f) 47 | return decorator 48 | --------------------------------------------------------------------------------
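For reference, re-enabling rate limiting in parsecv.py amounts to uncommenting the lines mentioned in the README; the result looks like the sketch below (taken from the commented-out code already in that file, so every name exists in this repo). The ratelimit decorator sits outside jsonify, so over-limit requests are rejected with a 429 before the view body runs:

    @app.route('/parsecv/', methods=['POST'])
    @ratelimit(limit=app.config["REQUESTS_PER_MINUTE"], per=60)
    @jsonify
    def parse_request():
        # ... existing body unchanged ...

    @app.after_request
    def inject_x_rate_headers(response):
        limit = get_view_rate_limit()
        if limit and limit.send_x_headers:
            h = response.headers
            h.add('X-RateLimit-Remaining', str(limit.remaining))
            h.add('X-RateLimit-Limit', str(limit.limit))
            h.add('X-RateLimit-Reset', str(limit.reset))
        return response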