├── bibtagger ├── __init__.py ├── fixeddata │ ├── journals.txt.gz │ ├── common-words.txt.gz │ ├── common-surnames.txt.gz │ ├── common-given-names.txt.gz │ └── README.md ├── print_tokens.py ├── given_names.py ├── test_given_names.py ├── test_chunker.py ├── tokenizer.py ├── chunker.py └── featurize.py ├── model.crfsuite ├── frontend ├── static │ ├── favicon.ico │ ├── base.css │ ├── colorize-output.css │ ├── selecttext.js │ ├── sticky-footer.css │ ├── index.html │ └── file.js └── app.py ├── requirements.txt ├── retreive-data ├── downloaded │ ├── jcp-tagged.jsonl.gz │ ├── references.jsonl.gz │ ├── plos-tagged.jsonl.gz │ └── pnas-tagged.jsonl.gz ├── crossref │ ├── expand-crossref.py │ ├── download-crossref.py │ └── tag-crossref.py ├── README.md ├── download-tag-jcp.py ├── download-tag-plos.py └── download-tag-pnas.py ├── update-site.sh ├── setup.py ├── README.md ├── .gitignore ├── training ├── feature_extract.py └── train-model.ipynb └── LICENSE /bibtagger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model.crfsuite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/model.crfsuite -------------------------------------------------------------------------------- /frontend/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/frontend/static/favicon.ico -------------------------------------------------------------------------------- /bibtagger/fixeddata/journals.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/journals.txt.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | marisa-trie 2 | python-crfsuite 3 | unidecode 4 | requests 5 | nltk 6 | tornado 7 | titlecase 8 | streql 9 | -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-words.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-words.txt.gz -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-surnames.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-surnames.txt.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/jcp-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/jcp-tagged.jsonl.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/references.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/references.jsonl.gz -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-given-names.txt.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-given-names.txt.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/plos-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/plos-tagged.jsonl.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/pnas-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/pnas-tagged.jsonl.gz -------------------------------------------------------------------------------- /update-site.sh: -------------------------------------------------------------------------------- 1 | # Update remote site to the git master 2 | # and restart the supervisor 3 | ssh reftag.rmcgibbo.org \ 4 | 'cd /home/rmcgibbo/reftagger/ && 5 | /home/rmcgibbo/venv/bin/pip install -r requirements.txt && 6 | git pull origin master && 7 | cd /home/rmcgibbo/ && 8 | supervisorctl restart tornado-5000' 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='reftag', 5 | author='Robert McGibbon', 6 | author_email='rmcgibbo@gmail.com', 7 | url='http://github.com/rmcgibbo/reftagger', 8 | packages=find_packages(), 9 | package_data={'bibtagger': ['fixeddata/*']}, 10 | zip_safe=False, 11 | ) 12 | -------------------------------------------------------------------------------- /frontend/static/base.css: -------------------------------------------------------------------------------- 1 | .highlight { 2 | padding:9px 14px; 3 | margin:-15px -15px 15px; 4 | margin-bottom:14px; 5 | background-color:#f7f7f9; 6 | border:1px solid #e1e1e8; 7 | border-radius:4px 8 | } 9 | 10 | .highlight pre { 11 | padding:0; 12 | margin-top:0; 13 | margin-bottom:0; 14 | word-break:normal; 15 | white-space:nowrap; 16 | background-color: 17 | transparent;border:0 18 | } 19 | -------------------------------------------------------------------------------- /frontend/static/colorize-output.css: -------------------------------------------------------------------------------- 1 | #output-zone > span.fam { 2 | color: #B02826; 3 | } 4 | #output-zone > span.given { 5 | color: #003399; 6 | } 7 | #output-zone > span.title { 8 | color: #859900; 9 | } 10 | #output-zone > span.year { 11 | color: #2aa198; 12 | } 13 | #output-zone > span.vol { 14 | color: #6c71c4; 15 | } 16 | #output-zone > span.page { 17 | color: #cb4b16; 18 | } 19 | #output-zone > span.journ { 20 | color: #d33682; 21 | } 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /frontend/static/selecttext.js: -------------------------------------------------------------------------------- 1 | /* Select an element (as if the user had selected a chunk of text w/ mouse) 2 | */ 3 | function SelectText(element) { 4 | var doc = document 5 | , text = doc.getElementById(element) 6 | , range, selection 7 | ; 8 | if (doc.body.createTextRange) { 9 | range = document.body.createTextRange(); 10 | range.moveToElementText(text); 11 | range.select(); 12 | } else if 
(window.getSelection) { 13 | selection = window.getSelection(); 14 | range = document.createRange(); 15 | range.selectNodeContents(text); 16 | selection.removeAllRanges(); 17 | selection.addRange(range); 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /frontend/static/sticky-footer.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer styles 2 | -------------------------------------------------- */ 3 | html { 4 | position: relative; 5 | min-height: 100%; 6 | } 7 | body { 8 | /* Margin bottom by footer height */ 9 | margin-bottom: 60px; 10 | } 11 | .footer { 12 | position: absolute; 13 | bottom: 0; 14 | width: 100%; 15 | /* Set the fixed height of the footer here */ 16 | height: 60px; 17 | background-color: #f5f5f5; 18 | } 19 | 20 | 21 | /* Custom page CSS 22 | -------------------------------------------------- */ 23 | /* Not required for template or sticky footer method. */ 24 | 25 | .container { 26 | width: auto; 27 | max-width: 740px; 28 | padding: 0 15px; 29 | } 30 | .container .text-muted { 31 | margin: 20px 0; 32 | } -------------------------------------------------------------------------------- /bibtagger/print_tokens.py: -------------------------------------------------------------------------------- 1 | from termcolor import colored 2 | 3 | COLORMAP = { 4 | 'page': 'red', 5 | 'vol': 'magenta', 6 | 'year': 'cyan', 7 | 'journ': 'yellow', 8 | 'given': 'magenta', 9 | 'fam': 'red', 10 | None: 'white', 11 | 'title': 'green', 12 | 'issue': 'green', 13 | } 14 | 15 | 16 | def render_tokens(tags_and_tokens): 17 | line = [] 18 | n_tokens = len(tags_and_tokens) 19 | for i in range(n_tokens): 20 | tag, tok = tags_and_tokens[i] 21 | line.append(colored(tok, color=COLORMAP[tag])) 22 | if tok == '(': 23 | continue 24 | if i < n_tokens-1: 25 | tok1 = tags_and_tokens[i+1][1] 26 | if tok1 not in [',', ')', ';', ':', '.']: 27 | line.append(' ') 28 | return ''.join(line) 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Project unmaintained][unmaintained-image]][unmaintained-url] 2 | 3 | [unmaintained-url]: .github/ISSUE_TEMPLATE.md 4 | [unmaintained-image]: https://img.shields.io/badge/project-unmaintained-red.svg 5 | 6 | Reference Tagger 7 | ================ 8 | *Parse and tag unstructured academic citations.* 9 | 10 | A system that identifies, parses and formats unstructured academic citations 11 | using conditional random fields. 12 | 13 | It can take a raw string like _"Wang, L.-P.; Titov, A.; McGibbon, R.; Liu, F.; 14 | Pande, V. S.; Martinez, T. J. Nature Chemistry 2014, 6, 1044-1048."_ and 15 | format it into a structured BibTeX record for example. 16 | 17 | License: AGPL. Runtime: python 3.4. 
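Under the hood, a citation string is tokenized, featurized, and run through a
trained CRF model. As a rough sketch of how those pieces fit together (this
mirrors what `frontend/app.py` does; it assumes the package and the
dependencies in `requirements.txt` are installed, that the NLTK WordNet corpus
used by the featurizer has been downloaded, and that the trained
`model.crfsuite` from the repository root is in the working directory):

```python
import pycrfsuite
from unidecode import unidecode

from bibtagger.tokenizer import tokenize
from bibtagger.featurize import featurize

tagger = pycrfsuite.Tagger()
tagger.open("model.crfsuite")

citation = ("Wang, L.-P.; Titov, A.; McGibbon, R.; Liu, F.; Pande, V. S.; "
            "Martinez, T. J. Nature Chemistry 2014, 6, 1044-1048.")

tokens = tokenize(unidecode(citation))  # simple regex-based word tokenizer
features = featurize(tokens)            # per-token feature lists for the CRF
tags = tagger.tag(features)             # labels like 'fam', 'given', 'journ', 'vol', 'page', 'year'

print(list(zip(tags, tokens)))
```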
18 | 19 | ------------------------- 20 | 21 | **This is just to let you know that this project is unmaintained.** 22 | 23 | **If you'd like to adopt this repo, please open a few PRs and I'll happily hand 24 | it over.** 25 | -------------------------------------------------------------------------------- /bibtagger/given_names.py: -------------------------------------------------------------------------------- 1 | import re 2 | import itertools 3 | 4 | 5 | def abbreviations(given, only_period=False): 6 | split = given.split() 7 | 8 | 9 | if len(split) > 1: 10 | #a0 = abbreviations(split[0]) 11 | abrvs = ( 12 | abbreviations(s, only_period=i>0) 13 | for i, s in enumerate(split)) 14 | prod = itertools.product(*abrvs) 15 | out = {' '.join(item) for item in prod} 16 | 17 | extra = set() 18 | for o in out: 19 | if re.search('\.\s\w\.', o): 20 | extra.add(o.replace('. ', '.')) 21 | out.update(extra) 22 | return out 23 | 24 | 25 | if len(split) == 1: 26 | item = split[0] 27 | first_letter = item[0] 28 | 29 | if only_period: 30 | return {item, first_letter+'.'} 31 | else: 32 | return {item, first_letter, first_letter+'.'} 33 | 34 | raise ValueError(given) 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | .ipynb_checkpoints/ 57 | retreive-data/.DS_Store 58 | -------------------------------------------------------------------------------- /bibtagger/test_given_names.py: -------------------------------------------------------------------------------- 1 | from .given_names import abbreviations 2 | 3 | 4 | def test_1(): 5 | 6 | assert abbreviations('J') == {'J', 'J.'} 7 | assert abbreviations('Miguel') == {'M', 'M.', 'Miguel'} 8 | assert abbreviations('Miguel Thomas') == {'Miguel Thomas', 'M T.', 'M.T.', 'M. Thomas', 'M. T.', 'Miguel T.', 'M Thomas'} 9 | assert abbreviations('S.') == {'S', 'S.'} 10 | assert abbreviations('B. I.') == {'B. I.', 'B.I.', 'B I.'} 11 | 12 | assert abbreviations('John A. T.') == {'J A.T.', 'John A.T.', 'J.A.T.', 'John A. T.', 'J. A. T.', 'J A. T.'} 13 | assert abbreviations('R. I. C. C.') == {'R I. C. C.', 'R I.C.C.', 'R.I.C.C.', 'R. I. C. C.'} 14 | assert abbreviations('Radboud J. Duintjer') == {'R.J.D.', 'R.J.Duintjer', 'R. J. D.', 'Radboud J. Duintjer', 'Radboud J. D.', 'R. J. Duintjer', 'Radboud J.D.', 'R J.D.', 'R J. Duintjer', 'R J. D.'} 15 | assert abbreviations('Karl A. Von') == {'K A.V.', 'Karl A. Von', 'K.A.V.', 'K.A.Von', 'Karl A.V.', 'K A. V.', 'Karl A. V.', 'K A. Von', 'K. A. Von', 'K. A. 
V.'} 16 | -------------------------------------------------------------------------------- /bibtagger/fixeddata/README.md: -------------------------------------------------------------------------------- 1 | Auxiliary Data 2 | ============== 3 | 4 | These are data files that are not _exactly_ part of the training set, since they 5 | they're not parsed citations. They're used in the token featurization to provide 6 | semantically rich features that should make the classification more accurate. 7 | 8 | 1. `common-surnames.txt.gz` 9 | 10 | All surnames appearing 100 or more times in the 2000 US cencus. There are 11 | 151671 of them, written in all caps. They're stored in flat text with 12 | newline separators. They were downloaded from this census.gov website 13 | 14 | http://www.census.gov/topics/population/genealogy/data/2000_surnames.html 15 | 16 | 17 | 2. `common-given-names.txt.gz` 18 | 19 | Common given (first) names, from the US Social Security Administration. I 20 | summed the counts accross the years of birth and took the the 1000 most 21 | common names. The data is from 22 | 23 | http://www.ssa.gov/oact/babynames/limits.html 24 | 25 | 26 | 3. `common-words.txt.gz` 27 | 28 | 5000 most common english words. Newline separated. Data is from 29 | 30 | http://norvig.com/ngrams/count_1w.txt 31 | 32 | 33 | 4. `journals.txt.gz` 34 | 35 | List of journal titles, including both the full name, MedLine abbreviation 36 | and ISO abbreviation. There are a total of 52274 unique entries. The data 37 | comes from the PubMed and NCBI Molecular Biology Database Journals list. 38 | 39 | http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.pubmedhelptable45/ 40 | 41 | 42 | 5. [WordNet](http://wordnet.princeton.edu/) 43 | 44 | We also use wordnet, through the `nltk` interface. See `featurize.py`. 45 | 46 | -------------------------------------------------------------------------------- /training/feature_extract.py: -------------------------------------------------------------------------------- 1 | """This script takes tagged citations (training data) as produced by 2 | `retreive-data/tag-citations.py` and does feature extraction, producing 3 | the direct input data for the CRF model. 4 | 5 | The features are word prefixes and suffixes, whether or not they 6 | contain digits or dots, their lengths, and their relationship to the 7 | words forward and backward in the sequence. 8 | """ 9 | import re 10 | import sys 11 | import json 12 | import pickle 13 | import argparse 14 | from os.path import dirname, abspath, join, isfile 15 | from collections import Counter 16 | 17 | PROJECT_ROOT = join(dirname(abspath(__file__)), '..') 18 | sys.path.insert(0, PROJECT_ROOT) 19 | from bibtagger.featurize import featurize 20 | 21 | 22 | def main(): 23 | p = argparse.ArgumentParser(description=__doc__, 24 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 25 | p.add_argument('source', nargs='+', help='Tokenize and tagged citations (jsonlines format)') 26 | p.add_argument('dest', help='Featurized training data (pkl format)') 27 | p.add_argument('-n', '--n-rare', help='Minimum number of occurances of a ' 28 | 'token to label it \'rare\'.', type=int, default=10) 29 | 30 | args = p.parse_args() 31 | if isfile(args.dest): 32 | p.error('File exists. 
%s' % args.dest) 33 | 34 | phrases, y = [], [] 35 | 36 | for source in args.source: 37 | with open(source, 'r') as f: 38 | for i, line in enumerate(f): 39 | item = json.loads(line)['tagged'] 40 | if len(item) > 0: 41 | yy, xx = zip(*item) 42 | phrases.append(xx) 43 | y.append([str(tag) for tag in yy]) 44 | 45 | print('Featurizing') 46 | X = [] 47 | for i, phrase in enumerate(phrases): 48 | if i % 100 == 0: 49 | print('%d/%d' % (i, len(phrases))) 50 | X.append(featurize(phrase)) 51 | 52 | #print('len(X)', len(X)) 53 | #print('len(y)', len(y)) 54 | #print(X[0]) 55 | #print(y[0]) 56 | 57 | 58 | with open(args.dest, 'wb') as fout: 59 | pickle.dump({ 60 | 'X': X, 61 | 'y': y, 62 | }, fout) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | 68 | -------------------------------------------------------------------------------- /retreive-data/crossref/expand-crossref.py: -------------------------------------------------------------------------------- 1 | """Create _new_ styled entries from data download by `download-citations.py` 2 | from each variant of the `container-title` entry. 3 | 4 | For example, say we download the following AIP styled citation: 5 | 6 | { 7 | 'container-title': ['Pesq. agropec. bras.', 8 | 'Pesquisa Agropecuaria Brasileira'], 9 | 'styled': [ 10 | {'style': 'american-institute-of-physics', 11 | 'value': 'N.P. Stamford, C.E. de R. e S. Santos, R. Medeiros, ' 12 | 'and A.D.S. de Freitas, Pesquisa Agropecuaria ' 13 | 'Brasileira 34, 1831 (1999).'}] 14 | } 15 | 16 | This script will create a new entry in the `styled` list containing the 17 | reference with the other `container-title`. 18 | 19 | """ 20 | import json 21 | import copy 22 | import argparse 23 | from unidecode import unidecode 24 | from pprint import pprint 25 | 26 | 27 | def main(): 28 | p = argparse.ArgumentParser(description=__doc__, 29 | formatter_class=argparse.RawDescriptionHelpFormatter) 30 | p.add_argument('source', help='Input (jsonlines)') 31 | p.add_argument('dest', help='Output (jsonlines)') 32 | args = p.parse_args() 33 | 34 | with open(args.source, 'r') as fin, open(args.dest, 'w') as fout: 35 | for i, line in enumerate(fin): 36 | if (i % 100) == 0: 37 | print('LINE %d' % i) 38 | 39 | newcit = expand_journal_abbreviations(json.loads(line)) 40 | json.dump(newcit, fout) 41 | fout.write('\n') 42 | 43 | 44 | def expand_journal_abbreviations(cit): 45 | if len(cit['container-title']) <= 1: 46 | return cit 47 | 48 | container_titles = list(map(unidecode, cit['container-title'])) 49 | new_styled = [] 50 | 51 | for s in cit['styled']: 52 | for ct in container_titles: 53 | if s['value'].find(ct) != -1: 54 | for jj, ot in enumerate(container_titles): 55 | new_value = s['value'].replace(ct, ot) 56 | if all(new_value != ss['value'] for ss in cit['styled']): 57 | 58 | new_styled.append({ 59 | 'value': new_value, 60 | 'style': s['style'] + '-abbrev-%d' % jj 61 | }) 62 | 63 | cit['styled'].extend(new_styled) 64 | return cit 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /retreive-data/README.md: -------------------------------------------------------------------------------- 1 | Scripts for Acquiring Training Data 2 | ----------------------------------- 3 | 4 | These are the scripts for getting training data. They download citations from the web, 5 | and then parse and format them in such a way that that they're suitable for learning 6 | from. 7 | 8 | 1. 
`download-citations.py` 9 | 10 | This script retreives the main source of training data, the CrossRef records associated 11 | with random DOIs. It uses the `sample` endpoint from the [CrossRef API](https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md). It also pulls down 12 | a styled version of the reference, formatted according to the `american-chemical-society`, 13 | `apa`, or whatever. 14 | 15 | Each record looks something like this: 16 | 17 | `{"URL": "http://dx.doi.org/10.1016/s0300-483x(96)03593-7", "score": 1.0, "issued": {"date-parts": [[1997, 4, 11]]}, "issue": "1", "volume": "119", "ISSN": ["0300-483X"], "prefix": "http://id.crossref.org/prefix/10.1016", "title": ["Animal models in autoimmune disease in immunotoxicity assessment"], "deposited": {"date-parts": [[2011, 7, 11]], "timestamp": 1310342400000}, "member": "http://id.crossref.org/member/78", "container-title": ["Toxicology"], "author": [{"given": "J", "family": "Farine"}], "source": "CrossRef", "subtitle": [], "type": "journal-article", "reference-count": 0, "indexed": {"date-parts": [[2015, 2, 6]], "timestamp": 1423247513443}, "DOI": "10.1016/s0300-483x(96)03593-7", "publisher": "Elsevier BV", "styled": [{"value": "Farine, J. (1997). Animal models in autoimmune disease in immunotoxicity assessment. Toxicology, 119(1), 29-35. doi:10.1016/s0300-483x(96)03593-7", "style": "apa"}, {"value": "Farine, J. Toxicology 1997, 119, 29-35.", "style": "american-chemical-society"}], "page": "29-35", "subject": ["Toxicology"]}` 18 | 19 | These contain both annotated information about a paper, like the journal, 20 | title, authors, etc, and also the styled reference, as it would be written 21 | in a paper. 22 | 23 | 2. `expand-citations.py` 24 | 25 | The styled references typically only include 1 version of the journal title. 26 | Usually this is the long version (Journal of Organic Chemistry) as opposed 27 | to the abbreviated title (J. Org. Chem.). 28 | 29 | We want our classifer to be able to handle both, so `expand-citations.py` 30 | adds new synthetic styled references to each of the entries, by 31 | find-and-replacing on the journal title and substituting its other variants. 32 | 33 | The output JSON format of `expand-citations.py` and `download-citations.py` 34 | are the same. `expand-citations.py` just adds a couple entries to the 35 | list of styled references in each citation. 36 | 37 | 3. `tag-citations.py` 38 | 39 | This script takes as input the result of `expand-citations.py` (or 40 | `download-citations.py`) 41 | 42 | It prroduces jsonlines output containing styled citations that have been 43 | tokenized and tagged. 44 | 45 | `{"value": "J. Fransson, A. Talamelli, L. Brandt, and C. Cossu, Phys. Rev. Lett. 
96, (2006).", "tagged": [["J.", "given"], ["Fransson", "fam"], [",", "None"], ["A.", "given"], ["Talamelli", "fam"], [",", "None"], ["L.", "given"], ["Brandt", "fam"], [",", "None"], ["and", "None"], ["C.", "given"], ["Cossu", "fam"], [",", "None"], ["Phys.", "journ"], ["Rev.", "journ"], ["Lett.", "journ"], ["96", "vol"], [",", "None"], ["(", "None"], ["2006", "year"], [").", "None"]]}` 46 | -------------------------------------------------------------------------------- /bibtagger/test_chunker.py: -------------------------------------------------------------------------------- 1 | from .chunker import greedy_label, tokenize_and_tag 2 | 3 | 4 | def test_1(): 5 | text = 'hello hello ; world' 6 | chunks = ['hello', 'hello ;', 'world', 'sdf'] 7 | assert greedy_label(text, chunks) == [(0, 0), (1, 6), (2, 14)] 8 | 9 | 10 | def test_2(): 11 | text = 'A.B.; J. Chem. Phys.' 12 | chunk_sets = { 13 | 'label_AB': ['A.B.'], 14 | 'label_J': ['J. Chem. Phys.'],} 15 | 16 | tokens, tags = tokenize_and_tag(text, chunk_sets) 17 | assert tokens == ['A', '.', 'B', '.', ';', 'J', '.', 'Chem', '.', 'Phys', '.'] 18 | assert tags == ['label_AB', 'label_AB', 'label_AB', 'label_AB', None, 'label_J', 'label_J', 'label_J', 'label_J', 'label_J', 'label_J'] 19 | 20 | 21 | def test_3(): 22 | text = 'a A.B.; J. Chem. Phys. b' 23 | chunk_sets = { 24 | 'label_AB': ['A.B.'], 25 | 'label_J': ['J. Chem. Phys.'],} 26 | 27 | tokens, tags = tokenize_and_tag(text, chunk_sets) 28 | z = list(zip(tokens, tags)) 29 | assert z == [ 30 | ('a', None), ('A', 'label_AB'), 31 | ('.', 'label_AB'), ('B', 'label_AB'), 32 | ('.', 'label_AB'), (';', None), 33 | ('J', 'label_J'), ('.', 'label_J'), 34 | ('Chem', 'label_J'), ('.', 'label_J'), 35 | ('Phys', 'label_J'), ('.', 'label_J'), 36 | ('b', None)] 37 | 38 | 39 | def test_4(): 40 | text = 'Farine, J. (1997). Title. Toxicology, 119(1), 29-35.' 41 | chunk_sets = { 42 | 'family': ['Farine'], 43 | 'given': ['J.'], 44 | 'year': ['1997'], 45 | 'title': ['Title'], 46 | 'journal': ['Toxicology'], 47 | } 48 | chunk_sets = { 49 | 'page': ['29-35', '29', '35'], 50 | 'year': ['1997'], 51 | 'fam': ['Farine'], 52 | 'journ': ['Toxicology'], 53 | 'vol': ['119'], 54 | 'given': ['J.'], 55 | 'title': ['Title'] 56 | } 57 | tokens, tags = tokenize_and_tag(text, chunk_sets) 58 | z = list(zip(tokens, tags)) 59 | print(z) 60 | 61 | 62 | def test_5(): 63 | text = 'Jafelicci Jr . , M . , & Loh , W . ( 1999 ) . Editorial . Journal of the Brazilian Chemical Society , 10 ( 5 ) .' 64 | chunks = ['10', 'Editorial', 'Jafelicci', 'Jr .', 'Loh', 'Braz', 'Chem', 'Soc', 'Journal', 'of', 'the', 'Brazilian', 'Chemical', 'Society', '', '', 'M .', 'W .', '1999'] 65 | greedy_label(text, chunks) 66 | 67 | 68 | def test_6(): 69 | text = '03' 70 | chunk_sets = {0: ['0'], 3: ['3']} 71 | tokenize_and_tag(text, chunk_sets) 72 | 73 | 74 | def test_7(): 75 | text = 'A . Dow and R . Pichardo - Mendoza , Topology and Its Applications 160 , 2207 ( 2013 ) .' 
76 | chunk_sets = { 77 | 'title': ["Efimov's problem and Boolean algebras"], 78 | 'given': {'A', 'A.', 'Alan', 'R', 'R.', 'Roberto'}, 79 | 'fam': ['Dow', 'Pichardo-Mendoza'], 80 | 'journ': ['Topology and Its Applications'], 81 | 'vol': ['160'], 82 | 'page': ['2207-2231', '2207'], 83 | 'issue': ['17'], 84 | 'year': ['2013'] 85 | } 86 | tokens, tags = tokenize_and_tag(text, chunk_sets) 87 | z = list(zip(tokens, tags)) 88 | assert z == [('A', 'given'), ('.', 'given'), ('Dow', 'fam'), 89 | ('and', None), ('R', 'given'), ('.', 'given'), 90 | ('Pichardo', 'fam'), ('-', 'fam'), ('Mendoza', 'fam'), 91 | (',', None), ('Topology', 'journ'), ('and', 'journ'), 92 | ('Its', 'journ'), ('Applications', 'journ'), ('160', 'vol'), 93 | (',', None), ('2207', 'page'), ('(', None), ('2013', 'year'), 94 | (')', None), ('.', None)] 95 | -------------------------------------------------------------------------------- /bibtagger/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def tokenize(text): 5 | """Very simple word tokenizer. 6 | """ 7 | 8 | # punctuation 9 | text = re.sub(r'\.\.\.|\.', r' \g<0> ', text) 10 | text = re.sub(r'[;@#$%&:,?!\(\)"\']', r' \g<0> ', text) 11 | 12 | #parens, brackets, etc. 13 | text = re.sub(r'--', r' -- ', text) 14 | text = re.sub(r'([^\s])-([^\s])', r'\g<1> - \g<2>', text) 15 | 16 | #add extra space to make things easier 17 | text = " " + text + " " 18 | 19 | split = text.split() 20 | return split 21 | 22 | 23 | def tokenize_with_pos(text): 24 | """Variant of ``tokenize`` that also returns the indices in 25 | ``text`` where each of the tokens begin. 26 | """ 27 | 28 | WHITESPACE = set('\t\r\n ') 29 | SPECIAL_TOKENS = set(('...', ';', '@', '#', '$', '%', '&', ':', ',', 30 | '?', '!', '(', ')', '.', '\"', '\'', '--', '-')) 31 | LONGEST_SPECIAL_TOKEN = max(len(e) for e in SPECIAL_TOKENS) 32 | SHORTEST_SPECIAL_TOKEN = min(len(e) for e in SPECIAL_TOKENS) 33 | 34 | 35 | def inner(): 36 | current_token_start = None 37 | current_token = [] 38 | 39 | i = 0 40 | while i < len(text): 41 | matched_special = False 42 | for n in range(LONGEST_SPECIAL_TOKEN, SHORTEST_SPECIAL_TOKEN - 1, -1): 43 | if text[i:i+n] in SPECIAL_TOKENS: 44 | matched_special = True 45 | break 46 | 47 | if text[i] in WHITESPACE: 48 | if current_token_start is not None: 49 | yield (''.join(current_token), current_token_start) 50 | current_token_start = None 51 | current_token = [] 52 | elif matched_special: 53 | if current_token_start is not None: 54 | yield (''.join(current_token), current_token_start) 55 | yield text[i:i+n], i 56 | i += n-1 57 | current_token_start = None 58 | current_token = [] 59 | else: 60 | if current_token_start is None: 61 | current_token_start = i 62 | current_token.append(text[i]) 63 | 64 | i += 1 65 | 66 | if current_token_start is not None: 67 | yield (''.join(current_token), current_token_start) 68 | 69 | return list(zip(*inner())) 70 | 71 | 72 | def untokenize(tokens, positions=None): 73 | if positions is not None: 74 | return untokenize_with_positions(tokens, positions) 75 | return untokenize_heuristic(tokens) 76 | 77 | 78 | def untokenize_with_positions(tokens, positions): 79 | with_whitespace = [] 80 | length = 0 81 | 82 | for tok, pos in zip(tokens, positions): 83 | gap = pos - length 84 | if gap > 0: 85 | with_whitespace.append(' ' * gap) 86 | length += gap 87 | with_whitespace.append(tok) 88 | length += len(tok) 89 | 90 | return ''.join(with_whitespace) 91 | 92 | 93 | def untokenize_heuristic(tokens): 94 | 
with_whitespace = [] 95 | for i in range(len(tokens)): 96 | tok = tokens[i] 97 | with_whitespace.append(tok) 98 | 99 | if tok != '(' and i < len(tokens) - 1: 100 | if tokens[i+1] not in ('?', ')', ';', '!', ':', ',', '.', '...'): 101 | with_whitespace.append(' ') 102 | 103 | return ''.join(with_whitespace) 104 | 105 | 106 | 107 | def test_1(): 108 | s = 'Hello Wo-rld... . sdf; sdf--ddd one.two' 109 | out1 = tokenize(s) 110 | out2, pos2 = list(tokenize_with_pos(s)) 111 | 112 | assert untokenize(out2, pos2) == s 113 | assert list(out1) == list(out2) 114 | assert untokenize(out1) == 'Hello Wo - rld.... sdf; sdf -- ddd one. two' 115 | -------------------------------------------------------------------------------- /retreive-data/download-tag-jcp.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random J. Chem. Phys. papers 3 | ''' 4 | import json 5 | import argparse 6 | import requests 7 | import traceback 8 | from bs4 import BeautifulSoup, Tag 9 | 10 | from unidecode import unidecode 11 | from bibtagger.tokenizer import tokenize 12 | from bibtagger.print_tokens import render_tokens 13 | 14 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 15 | 16 | 17 | def main(): 18 | p = argparse.ArgumentParser() 19 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 20 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 21 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 22 | args = p.parse_args() 23 | 24 | with open(args.dest, 'a') as fout: 25 | for doi, article in sample_jcp(args.n): 26 | print('\n===== %s ======' % doi) 27 | for cit in article.find_all('div', {'class': 'citation'}): 28 | try: 29 | tokens = list(itertokens(cit)) 30 | if args.verbose: 31 | print(render_tokens(tokens)) 32 | json.dump({'tagged': tokens}, fout) 33 | fout.write('\n') 34 | except UnexpectedTagError as e: 35 | print() 36 | print(e) 37 | print() 38 | 39 | 40 | class UnexpectedTagError(Exception): 41 | pass 42 | 43 | 44 | def itertokens(citation_node): 45 | xmltag_to_ourtag = { 46 | 'reference-volume': 'vol', 47 | 'reference-fpage': 'page', 48 | 'reference-year':'year', 49 | 'reference-source': 'journ', 50 | 'reference-surname': 'fam', 51 | 'reference-given-names': 'given', 52 | 'reference-issue': 'issue', 53 | 'reference-suffix': 'fam', 54 | 'reference-article-title': 'title', 55 | } 56 | 57 | tags_seen = set() 58 | children = list(citation_node.children) 59 | while len(children) > 0: 60 | part = children.pop(0) 61 | 62 | if isinstance(part, Tag): 63 | try: 64 | klass = part['class'][0] 65 | except: 66 | raise UnexpectedTagError(str(citation_node), part) 67 | 68 | 69 | if klass in ('citation-label', 'group-citation-label'): 70 | pass 71 | elif klass == 'reference-fpage': 72 | fpage = part.text 73 | if len(children) > 1 and children[1]['class'][0] == 'reference-lpage': 74 | middle = children.pop(0) 75 | part = children.pop(0) 76 | lpage = part.text 77 | yield from (('page', t) for t in unitokenize(fpage + middle + lpage)) 78 | else: 79 | yield from (('page', t) for t in unitokenize(fpage)) 80 | else: 81 | if klass not in xmltag_to_ourtag: 82 | raise UnexpectedTagError(str(citation_node), klass) 83 | else: 84 | tags_seen.add(xmltag_to_ourtag[klass]) 85 | 86 | yield from ((xmltag_to_ourtag[klass], t) for t in unitokenize(part.text)) 87 | else: 88 | if 'given' in 
tags_seen: 89 | yield from ((None, t) for t in unitokenize(part)) 90 | 91 | 92 | def sample_jcp(n_articles): 93 | # issn for J. Chem. Phys. 94 | r = requests.get('http://api.crossref.org/works', 95 | params={'sample': n_articles, 'filter': 'issn:1089-7690'}) 96 | 97 | dois = (e['DOI'] for e in r.json()['message']['items']) 98 | 99 | for doi in dois: 100 | r = requests.get('http://dx.doi.org/%s' % doi) 101 | yield doi, BeautifulSoup(r.content) 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /frontend/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import pickle 4 | import functools 5 | import hashlib 6 | import ssl 7 | import hmac 8 | import base64 9 | import time 10 | from uuid import uuid4 11 | from os.path import dirname, abspath, join 12 | 13 | from tornado.options import options, define 14 | import tornado.ioloop 15 | import tornado.web 16 | import streql 17 | import pycrfsuite 18 | from unidecode import unidecode 19 | 20 | from bibtagger.tokenizer import tokenize 21 | from bibtagger.featurize import featurize 22 | 23 | 24 | def build_signature(json_obj): 25 | msg = json.dumps(json_obj, sort_keys=True).encode('utf-8') 26 | sig = hmac.new(SECRET_KEY, msg, digestmod=hashlib.sha256).digest() 27 | return base64.b64encode(sig).decode('utf-8') 28 | 29 | 30 | class ResolveHandler(tornado.web.RequestHandler): 31 | def get(self): 32 | q = self.get_argument('q', default=None) 33 | result = {'request-id': str(uuid4()), 'tokens': [], 'tags': [], 'q': ''} 34 | if q is not None: 35 | result.update(self._tag(q)) 36 | 37 | signature = build_signature(result) 38 | result['signature'] = signature 39 | self.write(result) 40 | 41 | @functools.lru_cache(maxsize=1024) 42 | def _tag(self, q): 43 | q = unidecode(q) 44 | tokens = tokenize(q) 45 | 46 | if len(tokens) == 0: 47 | return {'tokens': [], 'tags': [], 'q': ''} 48 | 49 | X = featurize(tokens) 50 | tags = TAGGER.tag(X) 51 | return {'tokens': tokens, 'tags': tags, 'q': q} 52 | 53 | 54 | class FeedbackHandler(tornado.web.RequestHandler): 55 | def post(self, status): 56 | assert status in ('accept', 'reject') 57 | 58 | data = json.loads(self.request.body.decode('utf-8')) 59 | received_signature = data.pop('signature') 60 | signature = build_signature(data) 61 | 62 | if streql.equals(received_signature, signature): 63 | with open(FEEDBACK_LOG_PATH, 'a') as f: 64 | json.dump({ 65 | 'status': status, 66 | 'time': time.time(), 67 | 'data': data 68 | }, f) 69 | f.write('\n') 70 | 71 | 72 | class NoCacheStaticFileHandler(tornado.web.StaticFileHandler): 73 | def set_extra_headers(self, path): 74 | self.set_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0') 75 | 76 | 77 | class IndexHandler(tornado.web.RequestHandler): 78 | def get(self): 79 | with open(join(STATIC_PATH, 'index.html'), 'r') as f: 80 | self.write(f.read()) 81 | 82 | def set_extra_headers(self, path): 83 | self.set_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0') 84 | 85 | 86 | if __name__ == "__main__": 87 | define("host", default="localhost", help="app host", type=str) 88 | define("port", default=5000, help="app port", type=int) 89 | define("feedbackdir", default=".", help='directory for feedback log file', type=str) 90 | options.parse_command_line() 91 | 92 | # load up the tokenizer 93 | PROJECT_DIR = join(dirname(abspath(__file__)), '..') 94 | STATIC_PATH = 
join(dirname(abspath(__file__)), 'static') 95 | FEEDBACK_LOG_PATH = join(options.feedbackdir, 'feedback.jsonl') 96 | TAGGER = pycrfsuite.Tagger() 97 | TAGGER.open(join(PROJECT_DIR, 'model.crfsuite')) 98 | SECRET_KEY = ssl.RAND_bytes(16) 99 | 100 | application = tornado.web.Application([ 101 | (r"/resolve", ResolveHandler), 102 | (r"/feedback/(accept|reject)", FeedbackHandler), 103 | 104 | # in production, these are handled by nginx, so here we just have 105 | # them as non-caching routes for dev. 106 | (r"/", IndexHandler), 107 | (r"/static/(.*)", NoCacheStaticFileHandler, {"path": STATIC_PATH}) 108 | ]) 109 | 110 | application.listen(options.port, options.host) 111 | tornado.ioloop.IOLoop.instance().start() 112 | -------------------------------------------------------------------------------- /bibtagger/chunker.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import tokenize 2 | 3 | 4 | def tokenize_and_tag(text, chunk_sets): 5 | # all tokens separated by whitespace 6 | text = ' '.join(tokenize(text)) + ' ' 7 | tokens = text.split() 8 | 9 | 10 | chunks = [] 11 | chunk2tag = [] 12 | 13 | for k in chunk_sets: 14 | for c in chunk_sets[k]: 15 | tc = tokenize(c) 16 | 17 | # only use chunks if every token within the chunk is actually 18 | # one of our tokens 19 | if all(t in tokens for t in tc): 20 | chunks.append(' '.join(tc) + ' ') 21 | chunk2tag.append(k) 22 | 23 | labels = greedy_label(text, chunks) 24 | start_end = [] 25 | tags = [] 26 | 27 | # print(text) 28 | # print(chunks) 29 | # print() 30 | # for i, _ in labels: 31 | # print(chunks[i]) 32 | # print() 33 | # 34 | # print(tokens) 35 | # print() 36 | 37 | for i in range(len(labels)): 38 | # print([(tags[t], tokens[t]) for t in range(len(tags))]) 39 | 40 | t, start = labels[i] 41 | end = start + len(chunks[t]) 42 | if i == 0: 43 | tags.extend([None for _ in tokenize(text[:start])]) 44 | 45 | tags.extend([chunk2tag[t] for _ in tokenize(text[start:end])]) 46 | 47 | if i < len(labels)-1: 48 | # interior space between matched blocks 49 | next_start = labels[i+1][1] 50 | interspace = text[end:next_start] 51 | tags.extend([None for _ in tokenize(interspace)]) 52 | 53 | if i == len(labels)-1: 54 | # text after the last matched block 55 | tags.extend([None for _ in tokenize(text[end:])]) 56 | 57 | if len(labels) == 0: 58 | tags = [None] 59 | 60 | 61 | 62 | 63 | if len(tokens) != len(tags): 64 | print(text) 65 | print(len(labels)) 66 | print(chunk_sets) 67 | print('chunks', chunks) 68 | print('tokens', tokens) 69 | print('tags', tags) 70 | print(list(zip(tags, tokens))) 71 | assert False 72 | 73 | 74 | return tokens, tags 75 | 76 | 77 | def greedy_label(text, chunks): 78 | """Find non-overlapping chunks in text. 79 | 80 | Parameters 81 | ---------- 82 | text : str 83 | chunks : list of str 84 | 85 | Returns 86 | ------- 87 | matches : list of 2-tuples 88 | Each element in the returned list is a length-2 tuple `(i, j)` s.t. 89 | `i` is the index of the matching chunk and `j` is the index in 90 | `text` where the substring match begins. 91 | 92 | Example 93 | ------- 94 | >>> text = 'hello hello; world' 95 | >>> greedy_label(text, ['hello', 'world']) 96 | [(0, 0), (0, 6), (1, 13)] 97 | 98 | # the semantics of the return value is that chunk[0] matches begining 99 | # at text[0], then chunk[0] matches again at beggining text[6], and then 100 | # chunk[1] matches beginning at text[13]. 
101 | """ 102 | stack = [] 103 | 104 | p = 0 105 | while True: 106 | gap = {} 107 | matchlength = {} 108 | 109 | # for label, ch in chunks.items(): 110 | for label, ch in enumerate(chunks): 111 | if len(ch) > 0: 112 | i = text.find(ch, p) 113 | if i > -1: 114 | gap[label] = i-p 115 | matchlength[label] = len(ch) 116 | 117 | if len(gap) == 0: 118 | # we're at the end of the text, with no more 119 | # matching chunks in text[p:] 120 | break 121 | 122 | # sort the chunks that match text[p:]. we want to pick the one that 123 | # introduces the smallest gap. if two chunks both introduce the 124 | # same gap, then we take the one that's longest. 125 | label = min(gap.keys(), key=lambda k: (gap[k], -matchlength[k])) 126 | stack.append((label, p+gap[label])) 127 | p += gap[label] + matchlength[label] 128 | 129 | return stack 130 | 131 | -------------------------------------------------------------------------------- /retreive-data/download-tag-plos.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random PLoS ONE papers 3 | ''' 4 | import json 5 | import argparse 6 | import requests 7 | import traceback 8 | from xml.etree import ElementTree as ET 9 | 10 | from unidecode import unidecode 11 | from bibtagger.tokenizer import tokenize 12 | from bibtagger.print_tokens import render_tokens 13 | 14 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 15 | 16 | 17 | def main(): 18 | # itertokens 19 | p = argparse.ArgumentParser(description=__doc__, 20 | formatter_class=argparse.RawDescriptionHelpFormatter) 21 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 22 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 23 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 24 | args = p.parse_args() 25 | 26 | with open(args.dest, 'a') as fout: 27 | # pull random articles 28 | for doi, article in sample_plos_xml(args.n): 29 | # get all of the citations from the articles 30 | references = article.findall('back/ref-list/ref/mixed-citation') 31 | 32 | if args.verbose: 33 | print('\n\n==== PULLING FROM DOI %s ====\n' % doi) 34 | 35 | for ref in references: 36 | if ref.get('publication-type') != 'journal': 37 | continue 38 | 39 | # tokenize each citation 40 | try: 41 | tokens = list(itertokens(ref)) 42 | if args.verbose: 43 | print(render_tokens(tokens)) 44 | 45 | json.dump({'tagged': tokens}, fout) 46 | fout.write('\n') 47 | except UnexpectedTagError: 48 | traceback.print_exc() 49 | 50 | 51 | class UnexpectedTagError(Exception): 52 | pass 53 | 54 | 55 | def itertokens(citation_node): 56 | xmltag_to_ourtag = { 57 | 'volume': 'vol', 58 | 'year':'year', 59 | 'source': 'journ', 60 | 'surname': 'fam', 61 | 'given-names': 'given', 62 | 'suffix': 'given', 63 | 'article-title': 'title', 64 | 'etal': None, 65 | 'issue': 'issue', 66 | 'issue-id': 'issue', 67 | } 68 | 69 | children = citation_node.getchildren() 70 | while len(children) > 0: 71 | part = children.pop(0) 72 | if part.tag == 'person-group': 73 | children = part.getchildren() + children 74 | part = children.pop(0) 75 | 76 | if part.tag in ('name'): 77 | for name_part in part.getchildren(): 78 | assert name_part.tail is None 79 | for tok in unitokenize(name_part.text): 80 | yield (xmltag_to_ourtag[name_part.tag], tok) 81 | 82 | elif part.tag in ('year', 'article-title', 'source', 'volume', 'etal', 'issue'): 83 
| for tok in unitokenize(part.text): 84 | yield (xmltag_to_ourtag[part.tag], tok) 85 | 86 | elif part.tag == 'fpage': 87 | fpage, middle = part.text, part.tail 88 | if len(children) > 0 and children[0].tag == 'lpage': 89 | part = children.pop(0) 90 | lpage = part.text 91 | for tok in unitokenize(fpage + middle + lpage): 92 | yield ('page', tok) 93 | else: 94 | for tok in unitokenize(fpage): 95 | yield ('page', tok) 96 | elif part.tag == 'comment': 97 | pass 98 | 99 | else: 100 | ET.dump(citation_node) 101 | raise UnexpectedTagError('unexpected tag', part.tag) 102 | 103 | 104 | if part.tail is not None: 105 | for tok in unitokenize(part.tail): 106 | yield (None, tok) 107 | 108 | 109 | def sample_plos_xml(n_articles): 110 | # issn for PLoS One 111 | r = requests.get('http://api.crossref.org/works', 112 | params={'sample': n_articles, 'filter': 'issn:1932-6203'}) 113 | 114 | dois = (e['DOI'] for e in r.json()['message']['items']) 115 | 116 | for doi in dois: 117 | # print(doi) 118 | r = requests.get('http://journals.plos.org/plosone/article/asset', 119 | params={'id': doi+'.XML'}) 120 | yield doi, ET.fromstring(r.content) 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /retreive-data/crossref/download-crossref.py: -------------------------------------------------------------------------------- 1 | """Download random citations from the CrossRef DOI system (over their API), 2 | including both the JSON metadata for the reference _and_ the styled citation 3 | in one or more formats (American Chemical Society, AIP, APA, etc). 4 | 5 | Each citation will be written as a line to the outputfile in JSON format 6 | (jsonlines). 7 | """ 8 | 9 | # Example record 10 | # {'DOI': '10.1016/s0300-483x(96)03593-7', 11 | # 'prefix': 'http://id.crossref.org/prefix/10.1016', 12 | # 'member': 'http://id.crossref.org/member/78', 13 | # 'indexed': {'date-parts': [[2015, 2, 6]], 'timestamp': 1423247513443}, 14 | # 'deposited': {'date-parts': [[2011, 7, 11]], 'timestamp': 1310342400000}, 15 | # 'publisher': 'Elsevier BV', 16 | # 'title': ['Animal models in autoimmune disease in immunotoxicity assessment'], 17 | # 'ISSN': ['0300-483X'], 18 | # 'score': 1.0, 19 | # 'container-title': ['Toxicology'], 20 | # 'subject': ['Toxicology'], 21 | # 'reference-count': 0, 22 | # 'author': [{'given': 'J', 'family': 'Farine'}], 23 | # 'URL': 'http://dx.doi.org/10.1016/s0300-483x(96)03593-7', 24 | # 'issue': '1', 25 | # 'volume': '119', 26 | # 'issued': {'date-parts': [[1997, 4, 11]]}, 27 | # 'subtitle': [], 28 | # 'styled': [{'value': 'Farine, J. (1997). Animal models in autoimmune disease in immunotoxicity assessment. Toxicology, 119(1), 29-35. doi:10.1016/s0300-483x(96)03593-7', 29 | # 'style': 'apa'}, 30 | # {'value': 'Farine, J. Toxicology 1997, 119, 29-35.', 31 | # 'style': 'american-chemical-society'}], 32 | # 'page': '29-35', 33 | # 'type': 'journal-article', 34 | # 'source': 'CrossRef'} 35 | 36 | import re 37 | import json 38 | import argparse 39 | import requests 40 | from pprint import pprint 41 | from unidecode import unidecode 42 | 43 | 44 | def main(): 45 | p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 46 | p.add_argument('file', help='File to append citations to (jsonlines format)') 47 | p.add_argument('-n', help='Number of citations to download', type=int, default=100) 48 | args = p.parse_args() 49 | 50 | # might want to customize this later. 
51 | # STYLES = ['nature', 'american-chemical-society', 'american-institute-of-physics'] 52 | # STYLES = ['organization'] 53 | STYLES = ['society-of-biblical-literature-fullnote-bibliography'] 54 | 55 | with open(args.file, 'a') as f: 56 | for item in sample(n=args.n, styles=STYLES): 57 | json.dump(item, f, sort_keys=True) 58 | f.write('\n') 59 | f.flush() 60 | 61 | 62 | def get_styled(doi, styles=['american-chemical-society']): 63 | """Get the styled citation for a given DOI from CrossRef 64 | 65 | Example 66 | ------- 67 | >>> get_styled('10.1016/s0300-483x(96)03593-7') 68 | [{'style': 'american-chemical-society', 'value': 'Farine, J. Toxicology 1997, 119, 29-35.'}] 69 | """ 70 | 71 | out = [] 72 | for style in styles: 73 | fmt = 'text/x-bibliography; style=%s' % style 74 | r = requests.get('http://dx.doi.org/%s' % doi, headers={'Accept': fmt}) 75 | # content is utf-8 encoded. To simplify the downstream stuff, we convert 76 | # non-ascii unicode characters to "nearest" ascii characters using 77 | # unidecode. 78 | c = unidecode(r.content.decode('utf-8')) 79 | 80 | strip_prefixes = ['(1)', '1.', '1'] 81 | for p in strip_prefixes: 82 | if c.startswith(p): 83 | c = c[len(p):] 84 | 85 | c = c.replace('Online: http://dx.doi.org/' , '') 86 | c = c.replace(doi + '.', '') 87 | c = c.replace(' No pages.', '') 88 | 89 | out.append({'style': style, 'value': c.strip()}) 90 | 91 | return out 92 | 93 | 94 | def sample(n=10, styles=['american-chemical-society', 'apa', 'nature']): 95 | def skip(item): 96 | if item['subtitle'] != []: 97 | # skip everything with a subtitle 98 | return True 99 | 100 | if len(item['title']) != 1: 101 | # title should be a list containing 1 string 102 | return True 103 | 104 | if len(item['title'][0].split()) < 2: 105 | # skip 1 word titles 106 | return True 107 | return False 108 | 109 | 110 | r = requests.get('http://api.crossref.org/works', 111 | params={'sample': n, 'filter': 'type:journal-article'}) 112 | items = r.json()['message']['items'] 113 | items = [x for x in items if not skip(x)] 114 | for item in items: 115 | item['styled'] = get_styled(item['DOI'], styles) 116 | pprint(item) 117 | print() 118 | yield item 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /bibtagger/featurize.py: -------------------------------------------------------------------------------- 1 | import string 2 | import gzip 3 | import functools 4 | from pprint import pprint 5 | from os.path import join, dirname, abspath 6 | from pkg_resources import resource_filename 7 | 8 | import marisa_trie 9 | from titlecase import titlecase 10 | from nltk.corpus import wordnet as wn 11 | from bibtagger.tokenizer import untokenize, tokenize 12 | 13 | def _load_reference(fname, trans=None): 14 | with gzip.open(resource_filename('bibtagger', join('fixeddata', fname))) as f: 15 | items = set() 16 | for line in f: 17 | line = line.strip().decode('utf-8') 18 | if trans is not None: 19 | line = trans(line) 20 | items.add(line) 21 | return frozenset(items) 22 | 23 | 24 | DIGITS = set([str(e) for e in range(10)]) 25 | UPPERCASE = set(string.ascii_uppercase) 26 | COMMON_GIVEN_NAMES = _load_reference('common-given-names.txt.gz') 27 | COMMON_SURNAMES = _load_reference('common-surnames.txt.gz', trans=str.lower) 28 | COMMON_WORDS = _load_reference('common-words.txt.gz') 29 | JOURNAL_SET = _load_reference('journals.txt.gz', trans=titlecase) 30 | JOURNAL_TRIE = marisa_trie.Trie(JOURNAL_SET) 31 | 32 | 33 | def 
common_hypernym(synsets): 34 | """Walk up the hypernym tree above a collection of WordNet synsets 35 | finding the first (hyper) synset that's in COMMON_WORDS. 36 | """ 37 | if len(synsets) == 0: 38 | return '' 39 | 40 | names = {l.name().split('_')[0].lower() for syn in synsets for l in syn.lemmas()} 41 | intersect = names.intersection(COMMON_WORDS) 42 | if len(intersect) > 0: 43 | # just deterministically pick one of the words to use. we'll 44 | # take the shortest. 45 | return min(intersect, key=len) 46 | else: 47 | hypersets = [hyper for s in synsets for hyper in s.hypernyms()] 48 | return common_hypernym(hypersets) 49 | 50 | 51 | def featurize(phrase): 52 | @functools.lru_cache(maxsize=1024) 53 | def get_local_features(word): 54 | word_lower = word.lower() 55 | 56 | hypernym = '' 57 | in_wordnet = 'NA' 58 | if len(word) > 4: 59 | synsets = wn.synsets(word) 60 | in_wordnet = len(synsets) > 0 61 | hypernym = common_hypernym(wn.synsets(word)) 62 | 63 | return { 64 | 'word': word, 65 | 'prefix4': word[:4], 66 | 'hypernym': hypernym, 67 | 'in_wordnet': in_wordnet, 68 | 'common_given': word_lower in COMMON_GIVEN_NAMES, 69 | 'common_surname': word_lower in COMMON_SURNAMES, 70 | 'cont_num': len(set(word).intersection(DIGITS)) > 0, 71 | 'all_num': all(l in DIGITS for l in word), 72 | } 73 | 74 | 75 | n = len(phrase) 76 | 77 | local_features = [dict() for _ in phrase] 78 | shift_features = [dict() for _ in local_features] 79 | for i, word in enumerate(phrase): 80 | local_features[i].update(get_local_features(word)) 81 | 82 | for i in range(n): 83 | local_features[i]['known_journal'] = False 84 | 85 | for i in range(n): 86 | matches = JOURNAL_TRIE.prefixes(titlecase(untokenize(phrase[i:]))) 87 | if len(matches) == 0: 88 | continue 89 | 90 | match = max(matches, key=len) 91 | t = tokenize(match) 92 | # only deal with multitoken matches. 
for single token journals, there 93 | # are a lot of false positives, and they can presumably be handled 94 | # easily by the model in training 95 | if len(t) > 2: 96 | for j, tok in enumerate(t): 97 | local_features[i+j]['known_journal'] = True 98 | 99 | # print(untokenize(phrase)) 100 | # print([(i, phrase[i]) for i, e in enumerate(local_features) if e['known_journal']]) 101 | # print() 102 | 103 | 104 | for i in range(1, n): 105 | shift_features[i].update({k+'[-1]': v for k, v in local_features[i-1].items()}) 106 | for i in range(n-1): 107 | shift_features[i].update({k+'[+1]': v for k, v in local_features[i+1].items()}) 108 | 109 | features = [] 110 | for i in range(n): 111 | features.append( 112 | ['%s=%s' % (k,v) for k, v in local_features[i].items()] + 113 | ['%s=%s' % (k,v) for k, v in shift_features[i].items()] 114 | ) 115 | features[0].append('__BOS__') 116 | features[-1].append('__EOS__') 117 | 118 | 119 | return features 120 | 121 | 122 | if __name__ == '__main__': 123 | from pprint import pprint 124 | item = {"tagged": [ 125 | ["fam", "Massague"], ["given", "J"], [None, ","], ["fam", "Seoane"], 126 | ["given", "J"], [None, ","], ["fam", "Wotton"], ["given", "D"], 127 | [None, "("], ["year", "2005"], [None, ")"], ["title", "Smad"], 128 | ["title", "transcription"], ["title", "factors"], [None, "."], 129 | ["journ", "Genes"], ["journ", "Dev"], ["vol", "19"], [None, ":"], 130 | ["page", "2783"], ["page", "-"], ["page", "2810"], [None, "."]] 131 | } 132 | features = featurize([e[1] for e in item['tagged']]) 133 | pprint(features[15]) 134 | -------------------------------------------------------------------------------- /retreive-data/download-tag-pnas.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random J. Chem. Phys. 
papers 3 | ''' 4 | import time 5 | import sys 6 | import json 7 | import argparse 8 | import requests 9 | import traceback 10 | from bs4 import BeautifulSoup, Tag 11 | 12 | from unidecode import unidecode 13 | from bibtagger.tokenizer import tokenize 14 | from bibtagger.print_tokens import render_tokens 15 | 16 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' 18 | 19 | 20 | def main(): 21 | p = argparse.ArgumentParser() 22 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 23 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 24 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 25 | args = p.parse_args() 26 | 27 | with open(args.dest, 'a') as fout: 28 | for doi, article in sample_pnas(args.n): 29 | print('\n===== %s ======' % doi) 30 | for cit in article.find_all('div', {'class': 'cit-metadata'}): 31 | try: 32 | tokens = list(itertokens(cit)) 33 | if any(len(tok)>5 and tag is None for (tag, tok) in tokens): 34 | print('ERROR LONG TOKEN NOT MATCHED IN', render_tokens(tokens), file=sys.stderr) 35 | continue 36 | 37 | if args.verbose: 38 | print(render_tokens(tokens)) 39 | json.dump({'tagged': tokens}, fout) 40 | fout.write('\n') 41 | except UnexpectedTagError as e: 42 | print() 43 | print(e) 44 | print() 45 | # exit(1) 46 | 47 | 48 | class UnexpectedTagError(Exception): 49 | pass 50 | 51 | 52 | def itertokens(citation_node): 53 | htmlclass_to_tag = { 54 | 'cit-vol': 'vol', 55 | 'cit-fpage': 'page', 56 | 'cit-lpage': 'page', 57 | 'cit-pub-date':'year', 58 | # 'reference-source': 'journ', 59 | 'cit-name-surname': 'fam', 60 | 'cit-name-given-names': 'given', 61 | 'cit-jnl-abbrev': 'journ', 62 | 'cit-issue': 'issue', 63 | 'cit-name-suffix': 'fam', 64 | 'cit-article-title': 'title', 65 | } 66 | 67 | for part in citation_node.children: 68 | if isinstance(part, str): 69 | yield from ((None, t) for t in unitokenize(part)) 70 | elif isinstance(part, Tag) and part.name == 'ol': 71 | for li in part.find_all('li'): 72 | for part in li.children: 73 | if isinstance(part, Tag): 74 | for auth_part in part.find_all('span'): 75 | yield from ((htmlclass_to_tag[auth_part['class'][0]], t) for t in unitokenize(auth_part.text)) 76 | else: 77 | yield from ((None, t) for t in unitokenize(part)) 78 | 79 | elif isinstance(part, Tag) and part.name == 'cite': 80 | last_class = None 81 | for item in part.children: 82 | if isinstance(item, str): 83 | tag = 'page' if last_class == 'cit-fpage' else None 84 | yield from ((tag, t) for t in unitokenize(item)) 85 | else: 86 | if ('class' not in item.attrs) or item['class'][0] not in htmlclass_to_tag: 87 | raise UnexpectedTagError(citation_node, item) 88 | last_class = item['class'][0] 89 | yield from ((htmlclass_to_tag[item['class'][0]], t) for t in unitokenize(item.text)) 90 | 91 | 92 | # print(list(citation_node.children)[2]) 93 | # exit(1) 94 | # klass = part['class'][0] 95 | 96 | # exit(1) 97 | # cit-auth-list 98 | 99 | def sample_pnas(n_articles): 100 | #soup = BeautifulSoup(open('172.full', encoding='utf-8').read(), 'html.parser') 101 | ## soup = BeautifulSoup(open('untitled.txt', encoding='utf-8').read()) 102 | #yield '172.full', soup 103 | # issn for PNAS 104 | r = requests.get('http://api.crossref.org/works', 105 | params={'sample': n_articles, 'filter': 'issn:1091-6490'}) 106 | dois = 
(e['DOI'] for e in r.json()['message']['items']) 107 | 108 | for doi in dois: 109 | r = requests.get('http://dx.doi.org/%s' % doi, headers={'User-Agent': USER_AGENT}) 110 | soup = BeautifulSoup(r.content, 'html.parser') 111 | full_text_link = soup.find('a', {'rel': 'view-full-text'}) 112 | if full_text_link is None: 113 | print(r.url) 114 | print(r.content) 115 | print('Skipping. No full text HTML availavle') 116 | time.sleep(4) 117 | continue 118 | 119 | r2 = requests.get('http://www.pnas.org' + full_text_link['href']) 120 | print(r2.url) 121 | yield doi, BeautifulSoup(r2.content) 122 | time.sleep(4) 123 | 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | -------------------------------------------------------------------------------- /frontend/static/index.html: -------------------------------------------------------------------------------- 1 | 2 |
[index.html markup not captured; visible page copy:] "Take an arbitrary unstructured academic citation (e.g. copy-pasted from a paper), and automatically parse and format it into a BibTeX record."
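For local development, a minimal sketch of querying the app's `/resolve` endpoint (this assumes `frontend/app.py` is running with its defaults, `localhost:5000`; the field names come from `ResolveHandler`, and the example citation string is arbitrary):

```python
import requests

# Hypothetical query; any unstructured citation string works.
resp = requests.get(
    "http://localhost:5000/resolve",
    params={"q": "Farine, J. Toxicology 1997, 119, 29-35."},
)
data = resp.json()

# The handler returns parallel lists of tokens and predicted CRF tags,
# plus the echoed query, a request id, and an HMAC signature that the
# /feedback endpoint later verifies.
for tag, token in zip(data["tags"], data["tokens"]):
    print(tag, token)
```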