├── bibtagger ├── __init__.py ├── fixeddata │ ├── journals.txt.gz │ ├── common-words.txt.gz │ ├── common-surnames.txt.gz │ ├── common-given-names.txt.gz │ └── README.md ├── print_tokens.py ├── given_names.py ├── test_given_names.py ├── test_chunker.py ├── tokenizer.py ├── chunker.py └── featurize.py ├── model.crfsuite ├── frontend ├── static │ ├── favicon.ico │ ├── base.css │ ├── colorize-output.css │ ├── selecttext.js │ ├── sticky-footer.css │ ├── index.html │ └── file.js └── app.py ├── requirements.txt ├── retreive-data ├── downloaded │ ├── jcp-tagged.jsonl.gz │ ├── references.jsonl.gz │ ├── plos-tagged.jsonl.gz │ └── pnas-tagged.jsonl.gz ├── crossref │ ├── expand-crossref.py │ ├── download-crossref.py │ └── tag-crossref.py ├── README.md ├── download-tag-jcp.py ├── download-tag-plos.py └── download-tag-pnas.py ├── update-site.sh ├── setup.py ├── README.md ├── .gitignore ├── training ├── feature_extract.py └── train-model.ipynb └── LICENSE /bibtagger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model.crfsuite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/model.crfsuite -------------------------------------------------------------------------------- /frontend/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/frontend/static/favicon.ico -------------------------------------------------------------------------------- /bibtagger/fixeddata/journals.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/journals.txt.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | marisa-trie 2 | python-crfsuite 3 | unidecode 4 | requests 5 | nltk 6 | tornado 7 | titlecase 8 | streql 9 | -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-words.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-words.txt.gz -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-surnames.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-surnames.txt.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/jcp-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/jcp-tagged.jsonl.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/references.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/references.jsonl.gz -------------------------------------------------------------------------------- /bibtagger/fixeddata/common-given-names.txt.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/bibtagger/fixeddata/common-given-names.txt.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/plos-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/plos-tagged.jsonl.gz -------------------------------------------------------------------------------- /retreive-data/downloaded/pnas-tagged.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmcgibbo/reftagger/HEAD/retreive-data/downloaded/pnas-tagged.jsonl.gz -------------------------------------------------------------------------------- /update-site.sh: -------------------------------------------------------------------------------- 1 | # Update remote site to the git master 2 | # and restart the supervisor 3 | ssh reftag.rmcgibbo.org \ 4 | 'cd /home/rmcgibbo/reftagger/ && 5 | /home/rmcgibbo/venv/bin/pip install -r requirements.txt && 6 | git pull origin master && 7 | cd /home/rmcgibbo/ && 8 | supervisorctl restart tornado-5000' 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='reftag', 5 | author='Robert McGibbon', 6 | author_email='rmcgibbo@gmail.com', 7 | url='http://github.com/rmcgibbo/reftagger', 8 | packages=find_packages(), 9 | package_data={'bibtagger': ['fixeddata/*']}, 10 | zip_safe=False, 11 | ) 12 | -------------------------------------------------------------------------------- /frontend/static/base.css: -------------------------------------------------------------------------------- 1 | .highlight { 2 | padding:9px 14px; 3 | margin:-15px -15px 15px; 4 | margin-bottom:14px; 5 | background-color:#f7f7f9; 6 | border:1px solid #e1e1e8; 7 | border-radius:4px 8 | } 9 | 10 | .highlight pre { 11 | padding:0; 12 | margin-top:0; 13 | margin-bottom:0; 14 | word-break:normal; 15 | white-space:nowrap; 16 | background-color: 17 | transparent;border:0 18 | } 19 | -------------------------------------------------------------------------------- /frontend/static/colorize-output.css: -------------------------------------------------------------------------------- 1 | #output-zone > span.fam { 2 | color: #B02826; 3 | } 4 | #output-zone > span.given { 5 | color: #003399; 6 | } 7 | #output-zone > span.title { 8 | color: #859900; 9 | } 10 | #output-zone > span.year { 11 | color: #2aa198; 12 | } 13 | #output-zone > span.vol { 14 | color: #6c71c4; 15 | } 16 | #output-zone > span.page { 17 | color: #cb4b16; 18 | } 19 | #output-zone > span.journ { 20 | color: #d33682; 21 | } 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /frontend/static/selecttext.js: -------------------------------------------------------------------------------- 1 | /* Select an element (as if the user had selected a chunk of text w/ mouse) 2 | */ 3 | function SelectText(element) { 4 | var doc = document 5 | , text = doc.getElementById(element) 6 | , range, selection 7 | ; 8 | if (doc.body.createTextRange) { 9 | range = document.body.createTextRange(); 10 | range.moveToElementText(text); 11 | range.select(); 12 | } else if 
(window.getSelection) { 13 | selection = window.getSelection(); 14 | range = document.createRange(); 15 | range.selectNodeContents(text); 16 | selection.removeAllRanges(); 17 | selection.addRange(range); 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /frontend/static/sticky-footer.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer styles 2 | -------------------------------------------------- */ 3 | html { 4 | position: relative; 5 | min-height: 100%; 6 | } 7 | body { 8 | /* Margin bottom by footer height */ 9 | margin-bottom: 60px; 10 | } 11 | .footer { 12 | position: absolute; 13 | bottom: 0; 14 | width: 100%; 15 | /* Set the fixed height of the footer here */ 16 | height: 60px; 17 | background-color: #f5f5f5; 18 | } 19 | 20 | 21 | /* Custom page CSS 22 | -------------------------------------------------- */ 23 | /* Not required for template or sticky footer method. */ 24 | 25 | .container { 26 | width: auto; 27 | max-width: 740px; 28 | padding: 0 15px; 29 | } 30 | .container .text-muted { 31 | margin: 20px 0; 32 | } -------------------------------------------------------------------------------- /bibtagger/print_tokens.py: -------------------------------------------------------------------------------- 1 | from termcolor import colored 2 | 3 | COLORMAP = { 4 | 'page': 'red', 5 | 'vol': 'magenta', 6 | 'year': 'cyan', 7 | 'journ': 'yellow', 8 | 'given': 'magenta', 9 | 'fam': 'red', 10 | None: 'white', 11 | 'title': 'green', 12 | 'issue': 'green', 13 | } 14 | 15 | 16 | def render_tokens(tags_and_tokens): 17 | line = [] 18 | n_tokens = len(tags_and_tokens) 19 | for i in range(n_tokens): 20 | tag, tok = tags_and_tokens[i] 21 | line.append(colored(tok, color=COLORMAP[tag])) 22 | if tok == '(': 23 | continue 24 | if i < n_tokens-1: 25 | tok1 = tags_and_tokens[i+1][1] 26 | if tok1 not in [',', ')', ';', ':', '.']: 27 | line.append(' ') 28 | return ''.join(line) 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Project unmaintained][unmaintained-image]][unmaintained-url] 2 | 3 | [unmaintained-url]: .github/ISSUE_TEMPLATE.md 4 | [unmaintained-image]: https://img.shields.io/badge/project-unmaintained-red.svg 5 | 6 | Reference Tagger 7 | ================ 8 | *Parse and tag unstructured academic citations.* 9 | 10 | A system that identifies, parses and formats unstructured academic citations 11 | using conditional random fields. 12 | 13 | It can take a raw string like _"Wang, L.-P.; Titov, A.; McGibbon, R.; Liu, F.; 14 | Pande, V. S.; Martinez, T. J. Nature Chemistry 2014, 6, 1044-1048."_ and 15 | format it into a structured BibTeX record for example. 16 | 17 | License: AGPL. Runtime: python 3.4. 
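Under the hood, a citation string is tokenized, featurized, and run through a
trained CRF model. As a rough sketch of how those pieces fit together (this
mirrors what `frontend/app.py` does; it assumes the package and the
dependencies in `requirements.txt` are installed, that the NLTK WordNet corpus
used by the featurizer has been downloaded, and that the trained
`model.crfsuite` from the repository root is in the working directory):

```python
import pycrfsuite
from unidecode import unidecode

from bibtagger.tokenizer import tokenize
from bibtagger.featurize import featurize

tagger = pycrfsuite.Tagger()
tagger.open("model.crfsuite")

citation = ("Wang, L.-P.; Titov, A.; McGibbon, R.; Liu, F.; Pande, V. S.; "
            "Martinez, T. J. Nature Chemistry 2014, 6, 1044-1048.")

tokens = tokenize(unidecode(citation))  # simple regex-based word tokenizer
features = featurize(tokens)            # per-token feature lists for the CRF
tags = tagger.tag(features)             # labels like 'fam', 'given', 'journ', 'vol', 'page', 'year'

print(list(zip(tags, tokens)))
```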
18 | 19 | ------------------------- 20 | 21 | **This is just to let you know that this project is unmaintained.** 22 | 23 | **If you'd like to adopt this repo, please open a few PRs and I'll happily hand 24 | it over.** 25 | -------------------------------------------------------------------------------- /bibtagger/given_names.py: -------------------------------------------------------------------------------- 1 | import re 2 | import itertools 3 | 4 | 5 | def abbreviations(given, only_period=False): 6 | split = given.split() 7 | 8 | 9 | if len(split) > 1: 10 | #a0 = abbreviations(split[0]) 11 | abrvs = ( 12 | abbreviations(s, only_period=i>0) 13 | for i, s in enumerate(split)) 14 | prod = itertools.product(*abrvs) 15 | out = {' '.join(item) for item in prod} 16 | 17 | extra = set() 18 | for o in out: 19 | if re.search('\.\s\w\.', o): 20 | extra.add(o.replace('. ', '.')) 21 | out.update(extra) 22 | return out 23 | 24 | 25 | if len(split) == 1: 26 | item = split[0] 27 | first_letter = item[0] 28 | 29 | if only_period: 30 | return {item, first_letter+'.'} 31 | else: 32 | return {item, first_letter, first_letter+'.'} 33 | 34 | raise ValueError(given) 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | .ipynb_checkpoints/ 57 | retreive-data/.DS_Store 58 | -------------------------------------------------------------------------------- /bibtagger/test_given_names.py: -------------------------------------------------------------------------------- 1 | from .given_names import abbreviations 2 | 3 | 4 | def test_1(): 5 | 6 | assert abbreviations('J') == {'J', 'J.'} 7 | assert abbreviations('Miguel') == {'M', 'M.', 'Miguel'} 8 | assert abbreviations('Miguel Thomas') == {'Miguel Thomas', 'M T.', 'M.T.', 'M. Thomas', 'M. T.', 'Miguel T.', 'M Thomas'} 9 | assert abbreviations('S.') == {'S', 'S.'} 10 | assert abbreviations('B. I.') == {'B. I.', 'B.I.', 'B I.'} 11 | 12 | assert abbreviations('John A. T.') == {'J A.T.', 'John A.T.', 'J.A.T.', 'John A. T.', 'J. A. T.', 'J A. T.'} 13 | assert abbreviations('R. I. C. C.') == {'R I. C. C.', 'R I.C.C.', 'R.I.C.C.', 'R. I. C. C.'} 14 | assert abbreviations('Radboud J. Duintjer') == {'R.J.D.', 'R.J.Duintjer', 'R. J. D.', 'Radboud J. Duintjer', 'Radboud J. D.', 'R. J. Duintjer', 'Radboud J.D.', 'R J.D.', 'R J. Duintjer', 'R J. D.'} 15 | assert abbreviations('Karl A. Von') == {'K A.V.', 'Karl A. Von', 'K.A.V.', 'K.A.Von', 'Karl A.V.', 'K A. V.', 'Karl A. V.', 'K A. Von', 'K. A. Von', 'K. A. 
V.'} 16 | -------------------------------------------------------------------------------- /bibtagger/fixeddata/README.md: -------------------------------------------------------------------------------- 1 | Auxiliary Data 2 | ============== 3 | 4 | These are data files that are not _exactly_ part of the training set, since they 5 | they're not parsed citations. They're used in the token featurization to provide 6 | semantically rich features that should make the classification more accurate. 7 | 8 | 1. `common-surnames.txt.gz` 9 | 10 | All surnames appearing 100 or more times in the 2000 US cencus. There are 11 | 151671 of them, written in all caps. They're stored in flat text with 12 | newline separators. They were downloaded from this census.gov website 13 | 14 | http://www.census.gov/topics/population/genealogy/data/2000_surnames.html 15 | 16 | 17 | 2. `common-given-names.txt.gz` 18 | 19 | Common given (first) names, from the US Social Security Administration. I 20 | summed the counts accross the years of birth and took the the 1000 most 21 | common names. The data is from 22 | 23 | http://www.ssa.gov/oact/babynames/limits.html 24 | 25 | 26 | 3. `common-words.txt.gz` 27 | 28 | 5000 most common english words. Newline separated. Data is from 29 | 30 | http://norvig.com/ngrams/count_1w.txt 31 | 32 | 33 | 4. `journals.txt.gz` 34 | 35 | List of journal titles, including both the full name, MedLine abbreviation 36 | and ISO abbreviation. There are a total of 52274 unique entries. The data 37 | comes from the PubMed and NCBI Molecular Biology Database Journals list. 38 | 39 | http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.pubmedhelptable45/ 40 | 41 | 42 | 5. [WordNet](http://wordnet.princeton.edu/) 43 | 44 | We also use wordnet, through the `nltk` interface. See `featurize.py`. 45 | 46 | -------------------------------------------------------------------------------- /training/feature_extract.py: -------------------------------------------------------------------------------- 1 | """This script takes tagged citations (training data) as produced by 2 | `retreive-data/tag-citations.py` and does feature extraction, producing 3 | the direct input data for the CRF model. 4 | 5 | The features are word prefixes and suffixes, whether or not they 6 | contain digits or dots, their lengths, and their relationship to the 7 | words forward and backward in the sequence. 8 | """ 9 | import re 10 | import sys 11 | import json 12 | import pickle 13 | import argparse 14 | from os.path import dirname, abspath, join, isfile 15 | from collections import Counter 16 | 17 | PROJECT_ROOT = join(dirname(abspath(__file__)), '..') 18 | sys.path.insert(0, PROJECT_ROOT) 19 | from bibtagger.featurize import featurize 20 | 21 | 22 | def main(): 23 | p = argparse.ArgumentParser(description=__doc__, 24 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 25 | p.add_argument('source', nargs='+', help='Tokenize and tagged citations (jsonlines format)') 26 | p.add_argument('dest', help='Featurized training data (pkl format)') 27 | p.add_argument('-n', '--n-rare', help='Minimum number of occurances of a ' 28 | 'token to label it \'rare\'.', type=int, default=10) 29 | 30 | args = p.parse_args() 31 | if isfile(args.dest): 32 | p.error('File exists. 
%s' % args.dest) 33 | 34 | phrases, y = [], [] 35 | 36 | for source in args.source: 37 | with open(source, 'r') as f: 38 | for i, line in enumerate(f): 39 | item = json.loads(line)['tagged'] 40 | if len(item) > 0: 41 | yy, xx = zip(*item) 42 | phrases.append(xx) 43 | y.append([str(tag) for tag in yy]) 44 | 45 | print('Featurizing') 46 | X = [] 47 | for i, phrase in enumerate(phrases): 48 | if i % 100 == 0: 49 | print('%d/%d' % (i, len(phrases))) 50 | X.append(featurize(phrase)) 51 | 52 | #print('len(X)', len(X)) 53 | #print('len(y)', len(y)) 54 | #print(X[0]) 55 | #print(y[0]) 56 | 57 | 58 | with open(args.dest, 'wb') as fout: 59 | pickle.dump({ 60 | 'X': X, 61 | 'y': y, 62 | }, fout) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | 68 | -------------------------------------------------------------------------------- /retreive-data/crossref/expand-crossref.py: -------------------------------------------------------------------------------- 1 | """Create _new_ styled entries from data download by `download-citations.py` 2 | from each variant of the `container-title` entry. 3 | 4 | For example, say we download the following AIP styled citation: 5 | 6 | { 7 | 'container-title': ['Pesq. agropec. bras.', 8 | 'Pesquisa Agropecuaria Brasileira'], 9 | 'styled': [ 10 | {'style': 'american-institute-of-physics', 11 | 'value': 'N.P. Stamford, C.E. de R. e S. Santos, R. Medeiros, ' 12 | 'and A.D.S. de Freitas, Pesquisa Agropecuaria ' 13 | 'Brasileira 34, 1831 (1999).'}] 14 | } 15 | 16 | This script will create a new entry in the `styled` list containing the 17 | reference with the other `container-title`. 18 | 19 | """ 20 | import json 21 | import copy 22 | import argparse 23 | from unidecode import unidecode 24 | from pprint import pprint 25 | 26 | 27 | def main(): 28 | p = argparse.ArgumentParser(description=__doc__, 29 | formatter_class=argparse.RawDescriptionHelpFormatter) 30 | p.add_argument('source', help='Input (jsonlines)') 31 | p.add_argument('dest', help='Output (jsonlines)') 32 | args = p.parse_args() 33 | 34 | with open(args.source, 'r') as fin, open(args.dest, 'w') as fout: 35 | for i, line in enumerate(fin): 36 | if (i % 100) == 0: 37 | print('LINE %d' % i) 38 | 39 | newcit = expand_journal_abbreviations(json.loads(line)) 40 | json.dump(newcit, fout) 41 | fout.write('\n') 42 | 43 | 44 | def expand_journal_abbreviations(cit): 45 | if len(cit['container-title']) <= 1: 46 | return cit 47 | 48 | container_titles = list(map(unidecode, cit['container-title'])) 49 | new_styled = [] 50 | 51 | for s in cit['styled']: 52 | for ct in container_titles: 53 | if s['value'].find(ct) != -1: 54 | for jj, ot in enumerate(container_titles): 55 | new_value = s['value'].replace(ct, ot) 56 | if all(new_value != ss['value'] for ss in cit['styled']): 57 | 58 | new_styled.append({ 59 | 'value': new_value, 60 | 'style': s['style'] + '-abbrev-%d' % jj 61 | }) 62 | 63 | cit['styled'].extend(new_styled) 64 | return cit 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /retreive-data/README.md: -------------------------------------------------------------------------------- 1 | Scripts for Acquiring Training Data 2 | ----------------------------------- 3 | 4 | These are the scripts for getting training data. They download citations from the web, 5 | and then parse and format them in such a way that that they're suitable for learning 6 | from. 7 | 8 | 1. 
`download-citations.py` 9 | 10 | This script retreives the main source of training data, the CrossRef records associated 11 | with random DOIs. It uses the `sample` endpoint from the [CrossRef API](https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md). It also pulls down 12 | a styled version of the reference, formatted according to the `american-chemical-society`, 13 | `apa`, or whatever. 14 | 15 | Each record looks something like this: 16 | 17 | `{"URL": "http://dx.doi.org/10.1016/s0300-483x(96)03593-7", "score": 1.0, "issued": {"date-parts": [[1997, 4, 11]]}, "issue": "1", "volume": "119", "ISSN": ["0300-483X"], "prefix": "http://id.crossref.org/prefix/10.1016", "title": ["Animal models in autoimmune disease in immunotoxicity assessment"], "deposited": {"date-parts": [[2011, 7, 11]], "timestamp": 1310342400000}, "member": "http://id.crossref.org/member/78", "container-title": ["Toxicology"], "author": [{"given": "J", "family": "Farine"}], "source": "CrossRef", "subtitle": [], "type": "journal-article", "reference-count": 0, "indexed": {"date-parts": [[2015, 2, 6]], "timestamp": 1423247513443}, "DOI": "10.1016/s0300-483x(96)03593-7", "publisher": "Elsevier BV", "styled": [{"value": "Farine, J. (1997). Animal models in autoimmune disease in immunotoxicity assessment. Toxicology, 119(1), 29-35. doi:10.1016/s0300-483x(96)03593-7", "style": "apa"}, {"value": "Farine, J. Toxicology 1997, 119, 29-35.", "style": "american-chemical-society"}], "page": "29-35", "subject": ["Toxicology"]}` 18 | 19 | These contain both annotated information about a paper, like the journal, 20 | title, authors, etc, and also the styled reference, as it would be written 21 | in a paper. 22 | 23 | 2. `expand-citations.py` 24 | 25 | The styled references typically only include 1 version of the journal title. 26 | Usually this is the long version (Journal of Organic Chemistry) as opposed 27 | to the abbreviated title (J. Org. Chem.). 28 | 29 | We want our classifer to be able to handle both, so `expand-citations.py` 30 | adds new synthetic styled references to each of the entries, by 31 | find-and-replacing on the journal title and substituting its other variants. 32 | 33 | The output JSON format of `expand-citations.py` and `download-citations.py` 34 | are the same. `expand-citations.py` just adds a couple entries to the 35 | list of styled references in each citation. 36 | 37 | 3. `tag-citations.py` 38 | 39 | This script takes as input the result of `expand-citations.py` (or 40 | `download-citations.py`) 41 | 42 | It prroduces jsonlines output containing styled citations that have been 43 | tokenized and tagged. 44 | 45 | `{"value": "J. Fransson, A. Talamelli, L. Brandt, and C. Cossu, Phys. Rev. Lett. 
96, (2006).", "tagged": [["J.", "given"], ["Fransson", "fam"], [",", "None"], ["A.", "given"], ["Talamelli", "fam"], [",", "None"], ["L.", "given"], ["Brandt", "fam"], [",", "None"], ["and", "None"], ["C.", "given"], ["Cossu", "fam"], [",", "None"], ["Phys.", "journ"], ["Rev.", "journ"], ["Lett.", "journ"], ["96", "vol"], [",", "None"], ["(", "None"], ["2006", "year"], [").", "None"]]}` 46 | -------------------------------------------------------------------------------- /bibtagger/test_chunker.py: -------------------------------------------------------------------------------- 1 | from .chunker import greedy_label, tokenize_and_tag 2 | 3 | 4 | def test_1(): 5 | text = 'hello hello ; world' 6 | chunks = ['hello', 'hello ;', 'world', 'sdf'] 7 | assert greedy_label(text, chunks) == [(0, 0), (1, 6), (2, 14)] 8 | 9 | 10 | def test_2(): 11 | text = 'A.B.; J. Chem. Phys.' 12 | chunk_sets = { 13 | 'label_AB': ['A.B.'], 14 | 'label_J': ['J. Chem. Phys.'],} 15 | 16 | tokens, tags = tokenize_and_tag(text, chunk_sets) 17 | assert tokens == ['A', '.', 'B', '.', ';', 'J', '.', 'Chem', '.', 'Phys', '.'] 18 | assert tags == ['label_AB', 'label_AB', 'label_AB', 'label_AB', None, 'label_J', 'label_J', 'label_J', 'label_J', 'label_J', 'label_J'] 19 | 20 | 21 | def test_3(): 22 | text = 'a A.B.; J. Chem. Phys. b' 23 | chunk_sets = { 24 | 'label_AB': ['A.B.'], 25 | 'label_J': ['J. Chem. Phys.'],} 26 | 27 | tokens, tags = tokenize_and_tag(text, chunk_sets) 28 | z = list(zip(tokens, tags)) 29 | assert z == [ 30 | ('a', None), ('A', 'label_AB'), 31 | ('.', 'label_AB'), ('B', 'label_AB'), 32 | ('.', 'label_AB'), (';', None), 33 | ('J', 'label_J'), ('.', 'label_J'), 34 | ('Chem', 'label_J'), ('.', 'label_J'), 35 | ('Phys', 'label_J'), ('.', 'label_J'), 36 | ('b', None)] 37 | 38 | 39 | def test_4(): 40 | text = 'Farine, J. (1997). Title. Toxicology, 119(1), 29-35.' 41 | chunk_sets = { 42 | 'family': ['Farine'], 43 | 'given': ['J.'], 44 | 'year': ['1997'], 45 | 'title': ['Title'], 46 | 'journal': ['Toxicology'], 47 | } 48 | chunk_sets = { 49 | 'page': ['29-35', '29', '35'], 50 | 'year': ['1997'], 51 | 'fam': ['Farine'], 52 | 'journ': ['Toxicology'], 53 | 'vol': ['119'], 54 | 'given': ['J.'], 55 | 'title': ['Title'] 56 | } 57 | tokens, tags = tokenize_and_tag(text, chunk_sets) 58 | z = list(zip(tokens, tags)) 59 | print(z) 60 | 61 | 62 | def test_5(): 63 | text = 'Jafelicci Jr . , M . , & Loh , W . ( 1999 ) . Editorial . Journal of the Brazilian Chemical Society , 10 ( 5 ) .' 64 | chunks = ['10', 'Editorial', 'Jafelicci', 'Jr .', 'Loh', 'Braz', 'Chem', 'Soc', 'Journal', 'of', 'the', 'Brazilian', 'Chemical', 'Society', '', '', 'M .', 'W .', '1999'] 65 | greedy_label(text, chunks) 66 | 67 | 68 | def test_6(): 69 | text = '03' 70 | chunk_sets = {0: ['0'], 3: ['3']} 71 | tokenize_and_tag(text, chunk_sets) 72 | 73 | 74 | def test_7(): 75 | text = 'A . Dow and R . Pichardo - Mendoza , Topology and Its Applications 160 , 2207 ( 2013 ) .' 
76 | chunk_sets = { 77 | 'title': ["Efimov's problem and Boolean algebras"], 78 | 'given': {'A', 'A.', 'Alan', 'R', 'R.', 'Roberto'}, 79 | 'fam': ['Dow', 'Pichardo-Mendoza'], 80 | 'journ': ['Topology and Its Applications'], 81 | 'vol': ['160'], 82 | 'page': ['2207-2231', '2207'], 83 | 'issue': ['17'], 84 | 'year': ['2013'] 85 | } 86 | tokens, tags = tokenize_and_tag(text, chunk_sets) 87 | z = list(zip(tokens, tags)) 88 | assert z == [('A', 'given'), ('.', 'given'), ('Dow', 'fam'), 89 | ('and', None), ('R', 'given'), ('.', 'given'), 90 | ('Pichardo', 'fam'), ('-', 'fam'), ('Mendoza', 'fam'), 91 | (',', None), ('Topology', 'journ'), ('and', 'journ'), 92 | ('Its', 'journ'), ('Applications', 'journ'), ('160', 'vol'), 93 | (',', None), ('2207', 'page'), ('(', None), ('2013', 'year'), 94 | (')', None), ('.', None)] 95 | -------------------------------------------------------------------------------- /bibtagger/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def tokenize(text): 5 | """Very simple word tokenizer. 6 | """ 7 | 8 | # punctuation 9 | text = re.sub(r'\.\.\.|\.', r' \g<0> ', text) 10 | text = re.sub(r'[;@#$%&:,?!\(\)"\']', r' \g<0> ', text) 11 | 12 | #parens, brackets, etc. 13 | text = re.sub(r'--', r' -- ', text) 14 | text = re.sub(r'([^\s])-([^\s])', r'\g<1> - \g<2>', text) 15 | 16 | #add extra space to make things easier 17 | text = " " + text + " " 18 | 19 | split = text.split() 20 | return split 21 | 22 | 23 | def tokenize_with_pos(text): 24 | """Variant of ``tokenize`` that also returns the indices in 25 | ``text`` where each of the tokens begin. 26 | """ 27 | 28 | WHITESPACE = set('\t\r\n ') 29 | SPECIAL_TOKENS = set(('...', ';', '@', '#', '$', '%', '&', ':', ',', 30 | '?', '!', '(', ')', '.', '\"', '\'', '--', '-')) 31 | LONGEST_SPECIAL_TOKEN = max(len(e) for e in SPECIAL_TOKENS) 32 | SHORTEST_SPECIAL_TOKEN = min(len(e) for e in SPECIAL_TOKENS) 33 | 34 | 35 | def inner(): 36 | current_token_start = None 37 | current_token = [] 38 | 39 | i = 0 40 | while i < len(text): 41 | matched_special = False 42 | for n in range(LONGEST_SPECIAL_TOKEN, SHORTEST_SPECIAL_TOKEN - 1, -1): 43 | if text[i:i+n] in SPECIAL_TOKENS: 44 | matched_special = True 45 | break 46 | 47 | if text[i] in WHITESPACE: 48 | if current_token_start is not None: 49 | yield (''.join(current_token), current_token_start) 50 | current_token_start = None 51 | current_token = [] 52 | elif matched_special: 53 | if current_token_start is not None: 54 | yield (''.join(current_token), current_token_start) 55 | yield text[i:i+n], i 56 | i += n-1 57 | current_token_start = None 58 | current_token = [] 59 | else: 60 | if current_token_start is None: 61 | current_token_start = i 62 | current_token.append(text[i]) 63 | 64 | i += 1 65 | 66 | if current_token_start is not None: 67 | yield (''.join(current_token), current_token_start) 68 | 69 | return list(zip(*inner())) 70 | 71 | 72 | def untokenize(tokens, positions=None): 73 | if positions is not None: 74 | return untokenize_with_positions(tokens, positions) 75 | return untokenize_heuristic(tokens) 76 | 77 | 78 | def untokenize_with_positions(tokens, positions): 79 | with_whitespace = [] 80 | length = 0 81 | 82 | for tok, pos in zip(tokens, positions): 83 | gap = pos - length 84 | if gap > 0: 85 | with_whitespace.append(' ' * gap) 86 | length += gap 87 | with_whitespace.append(tok) 88 | length += len(tok) 89 | 90 | return ''.join(with_whitespace) 91 | 92 | 93 | def untokenize_heuristic(tokens): 94 | 
with_whitespace = [] 95 | for i in range(len(tokens)): 96 | tok = tokens[i] 97 | with_whitespace.append(tok) 98 | 99 | if tok != '(' and i < len(tokens) - 1: 100 | if tokens[i+1] not in ('?', ')', ';', '!', ':', ',', '.', '...'): 101 | with_whitespace.append(' ') 102 | 103 | return ''.join(with_whitespace) 104 | 105 | 106 | 107 | def test_1(): 108 | s = 'Hello Wo-rld... . sdf; sdf--ddd one.two' 109 | out1 = tokenize(s) 110 | out2, pos2 = list(tokenize_with_pos(s)) 111 | 112 | assert untokenize(out2, pos2) == s 113 | assert list(out1) == list(out2) 114 | assert untokenize(out1) == 'Hello Wo - rld.... sdf; sdf -- ddd one. two' 115 | -------------------------------------------------------------------------------- /retreive-data/download-tag-jcp.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random J. Chem. Phys. papers 3 | ''' 4 | import json 5 | import argparse 6 | import requests 7 | import traceback 8 | from bs4 import BeautifulSoup, Tag 9 | 10 | from unidecode import unidecode 11 | from bibtagger.tokenizer import tokenize 12 | from bibtagger.print_tokens import render_tokens 13 | 14 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 15 | 16 | 17 | def main(): 18 | p = argparse.ArgumentParser() 19 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 20 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 21 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 22 | args = p.parse_args() 23 | 24 | with open(args.dest, 'a') as fout: 25 | for doi, article in sample_jcp(args.n): 26 | print('\n===== %s ======' % doi) 27 | for cit in article.find_all('div', {'class': 'citation'}): 28 | try: 29 | tokens = list(itertokens(cit)) 30 | if args.verbose: 31 | print(render_tokens(tokens)) 32 | json.dump({'tagged': tokens}, fout) 33 | fout.write('\n') 34 | except UnexpectedTagError as e: 35 | print() 36 | print(e) 37 | print() 38 | 39 | 40 | class UnexpectedTagError(Exception): 41 | pass 42 | 43 | 44 | def itertokens(citation_node): 45 | xmltag_to_ourtag = { 46 | 'reference-volume': 'vol', 47 | 'reference-fpage': 'page', 48 | 'reference-year':'year', 49 | 'reference-source': 'journ', 50 | 'reference-surname': 'fam', 51 | 'reference-given-names': 'given', 52 | 'reference-issue': 'issue', 53 | 'reference-suffix': 'fam', 54 | 'reference-article-title': 'title', 55 | } 56 | 57 | tags_seen = set() 58 | children = list(citation_node.children) 59 | while len(children) > 0: 60 | part = children.pop(0) 61 | 62 | if isinstance(part, Tag): 63 | try: 64 | klass = part['class'][0] 65 | except: 66 | raise UnexpectedTagError(str(citation_node), part) 67 | 68 | 69 | if klass in ('citation-label', 'group-citation-label'): 70 | pass 71 | elif klass == 'reference-fpage': 72 | fpage = part.text 73 | if len(children) > 1 and children[1]['class'][0] == 'reference-lpage': 74 | middle = children.pop(0) 75 | part = children.pop(0) 76 | lpage = part.text 77 | yield from (('page', t) for t in unitokenize(fpage + middle + lpage)) 78 | else: 79 | yield from (('page', t) for t in unitokenize(fpage)) 80 | else: 81 | if klass not in xmltag_to_ourtag: 82 | raise UnexpectedTagError(str(citation_node), klass) 83 | else: 84 | tags_seen.add(xmltag_to_ourtag[klass]) 85 | 86 | yield from ((xmltag_to_ourtag[klass], t) for t in unitokenize(part.text)) 87 | else: 88 | if 'given' in 
tags_seen: 89 | yield from ((None, t) for t in unitokenize(part)) 90 | 91 | 92 | def sample_jcp(n_articles): 93 | # issn for J. Chem. Phys. 94 | r = requests.get('http://api.crossref.org/works', 95 | params={'sample': n_articles, 'filter': 'issn:1089-7690'}) 96 | 97 | dois = (e['DOI'] for e in r.json()['message']['items']) 98 | 99 | for doi in dois: 100 | r = requests.get('http://dx.doi.org/%s' % doi) 101 | yield doi, BeautifulSoup(r.content) 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /frontend/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import pickle 4 | import functools 5 | import hashlib 6 | import ssl 7 | import hmac 8 | import base64 9 | import time 10 | from uuid import uuid4 11 | from os.path import dirname, abspath, join 12 | 13 | from tornado.options import options, define 14 | import tornado.ioloop 15 | import tornado.web 16 | import streql 17 | import pycrfsuite 18 | from unidecode import unidecode 19 | 20 | from bibtagger.tokenizer import tokenize 21 | from bibtagger.featurize import featurize 22 | 23 | 24 | def build_signature(json_obj): 25 | msg = json.dumps(json_obj, sort_keys=True).encode('utf-8') 26 | sig = hmac.new(SECRET_KEY, msg, digestmod=hashlib.sha256).digest() 27 | return base64.b64encode(sig).decode('utf-8') 28 | 29 | 30 | class ResolveHandler(tornado.web.RequestHandler): 31 | def get(self): 32 | q = self.get_argument('q', default=None) 33 | result = {'request-id': str(uuid4()), 'tokens': [], 'tags': [], 'q': ''} 34 | if q is not None: 35 | result.update(self._tag(q)) 36 | 37 | signature = build_signature(result) 38 | result['signature'] = signature 39 | self.write(result) 40 | 41 | @functools.lru_cache(maxsize=1024) 42 | def _tag(self, q): 43 | q = unidecode(q) 44 | tokens = tokenize(q) 45 | 46 | if len(tokens) == 0: 47 | return {'tokens': [], 'tags': [], 'q': ''} 48 | 49 | X = featurize(tokens) 50 | tags = TAGGER.tag(X) 51 | return {'tokens': tokens, 'tags': tags, 'q': q} 52 | 53 | 54 | class FeedbackHandler(tornado.web.RequestHandler): 55 | def post(self, status): 56 | assert status in ('accept', 'reject') 57 | 58 | data = json.loads(self.request.body.decode('utf-8')) 59 | received_signature = data.pop('signature') 60 | signature = build_signature(data) 61 | 62 | if streql.equals(received_signature, signature): 63 | with open(FEEDBACK_LOG_PATH, 'a') as f: 64 | json.dump({ 65 | 'status': status, 66 | 'time': time.time(), 67 | 'data': data 68 | }, f) 69 | f.write('\n') 70 | 71 | 72 | class NoCacheStaticFileHandler(tornado.web.StaticFileHandler): 73 | def set_extra_headers(self, path): 74 | self.set_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0') 75 | 76 | 77 | class IndexHandler(tornado.web.RequestHandler): 78 | def get(self): 79 | with open(join(STATIC_PATH, 'index.html'), 'r') as f: 80 | self.write(f.read()) 81 | 82 | def set_extra_headers(self, path): 83 | self.set_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0') 84 | 85 | 86 | if __name__ == "__main__": 87 | define("host", default="localhost", help="app host", type=str) 88 | define("port", default=5000, help="app port", type=int) 89 | define("feedbackdir", default=".", help='directory for feedback log file', type=str) 90 | options.parse_command_line() 91 | 92 | # load up the tokenizer 93 | PROJECT_DIR = join(dirname(abspath(__file__)), '..') 94 | STATIC_PATH = 
join(dirname(abspath(__file__)), 'static') 95 | FEEDBACK_LOG_PATH = join(options.feedbackdir, 'feedback.jsonl') 96 | TAGGER = pycrfsuite.Tagger() 97 | TAGGER.open(join(PROJECT_DIR, 'model.crfsuite')) 98 | SECRET_KEY = ssl.RAND_bytes(16) 99 | 100 | application = tornado.web.Application([ 101 | (r"/resolve", ResolveHandler), 102 | (r"/feedback/(accept|reject)", FeedbackHandler), 103 | 104 | # in production, these are handled by nginx, so here we just have 105 | # them as non-caching routes for dev. 106 | (r"/", IndexHandler), 107 | (r"/static/(.*)", NoCacheStaticFileHandler, {"path": STATIC_PATH}) 108 | ]) 109 | 110 | application.listen(options.port, options.host) 111 | tornado.ioloop.IOLoop.instance().start() 112 | -------------------------------------------------------------------------------- /bibtagger/chunker.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import tokenize 2 | 3 | 4 | def tokenize_and_tag(text, chunk_sets): 5 | # all tokens separated by whitespace 6 | text = ' '.join(tokenize(text)) + ' ' 7 | tokens = text.split() 8 | 9 | 10 | chunks = [] 11 | chunk2tag = [] 12 | 13 | for k in chunk_sets: 14 | for c in chunk_sets[k]: 15 | tc = tokenize(c) 16 | 17 | # only use chunks if every token within the chunk is actually 18 | # one of our tokens 19 | if all(t in tokens for t in tc): 20 | chunks.append(' '.join(tc) + ' ') 21 | chunk2tag.append(k) 22 | 23 | labels = greedy_label(text, chunks) 24 | start_end = [] 25 | tags = [] 26 | 27 | # print(text) 28 | # print(chunks) 29 | # print() 30 | # for i, _ in labels: 31 | # print(chunks[i]) 32 | # print() 33 | # 34 | # print(tokens) 35 | # print() 36 | 37 | for i in range(len(labels)): 38 | # print([(tags[t], tokens[t]) for t in range(len(tags))]) 39 | 40 | t, start = labels[i] 41 | end = start + len(chunks[t]) 42 | if i == 0: 43 | tags.extend([None for _ in tokenize(text[:start])]) 44 | 45 | tags.extend([chunk2tag[t] for _ in tokenize(text[start:end])]) 46 | 47 | if i < len(labels)-1: 48 | # interior space between matched blocks 49 | next_start = labels[i+1][1] 50 | interspace = text[end:next_start] 51 | tags.extend([None for _ in tokenize(interspace)]) 52 | 53 | if i == len(labels)-1: 54 | # text after the last matched block 55 | tags.extend([None for _ in tokenize(text[end:])]) 56 | 57 | if len(labels) == 0: 58 | tags = [None] 59 | 60 | 61 | 62 | 63 | if len(tokens) != len(tags): 64 | print(text) 65 | print(len(labels)) 66 | print(chunk_sets) 67 | print('chunks', chunks) 68 | print('tokens', tokens) 69 | print('tags', tags) 70 | print(list(zip(tags, tokens))) 71 | assert False 72 | 73 | 74 | return tokens, tags 75 | 76 | 77 | def greedy_label(text, chunks): 78 | """Find non-overlapping chunks in text. 79 | 80 | Parameters 81 | ---------- 82 | text : str 83 | chunks : list of str 84 | 85 | Returns 86 | ------- 87 | matches : list of 2-tuples 88 | Each element in the returned list is a length-2 tuple `(i, j)` s.t. 89 | `i` is the index of the matching chunk and `j` is the index in 90 | `text` where the substring match begins. 91 | 92 | Example 93 | ------- 94 | >>> text = 'hello hello; world' 95 | >>> greedy_label(text, ['hello', 'world']) 96 | [(0, 0), (0, 6), (1, 13)] 97 | 98 | # the semantics of the return value is that chunk[0] matches begining 99 | # at text[0], then chunk[0] matches again at beggining text[6], and then 100 | # chunk[1] matches beginning at text[13]. 
101 | """ 102 | stack = [] 103 | 104 | p = 0 105 | while True: 106 | gap = {} 107 | matchlength = {} 108 | 109 | # for label, ch in chunks.items(): 110 | for label, ch in enumerate(chunks): 111 | if len(ch) > 0: 112 | i = text.find(ch, p) 113 | if i > -1: 114 | gap[label] = i-p 115 | matchlength[label] = len(ch) 116 | 117 | if len(gap) == 0: 118 | # we're at the end of the text, with no more 119 | # matching chunks in text[p:] 120 | break 121 | 122 | # sort the chunks that match text[p:]. we want to pick the one that 123 | # introduces the smallest gap. if two chunks both introduce the 124 | # same gap, then we take the one that's longest. 125 | label = min(gap.keys(), key=lambda k: (gap[k], -matchlength[k])) 126 | stack.append((label, p+gap[label])) 127 | p += gap[label] + matchlength[label] 128 | 129 | return stack 130 | 131 | -------------------------------------------------------------------------------- /retreive-data/download-tag-plos.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random PLoS ONE papers 3 | ''' 4 | import json 5 | import argparse 6 | import requests 7 | import traceback 8 | from xml.etree import ElementTree as ET 9 | 10 | from unidecode import unidecode 11 | from bibtagger.tokenizer import tokenize 12 | from bibtagger.print_tokens import render_tokens 13 | 14 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 15 | 16 | 17 | def main(): 18 | # itertokens 19 | p = argparse.ArgumentParser(description=__doc__, 20 | formatter_class=argparse.RawDescriptionHelpFormatter) 21 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 22 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 23 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 24 | args = p.parse_args() 25 | 26 | with open(args.dest, 'a') as fout: 27 | # pull random articles 28 | for doi, article in sample_plos_xml(args.n): 29 | # get all of the citations from the articles 30 | references = article.findall('back/ref-list/ref/mixed-citation') 31 | 32 | if args.verbose: 33 | print('\n\n==== PULLING FROM DOI %s ====\n' % doi) 34 | 35 | for ref in references: 36 | if ref.get('publication-type') != 'journal': 37 | continue 38 | 39 | # tokenize each citation 40 | try: 41 | tokens = list(itertokens(ref)) 42 | if args.verbose: 43 | print(render_tokens(tokens)) 44 | 45 | json.dump({'tagged': tokens}, fout) 46 | fout.write('\n') 47 | except UnexpectedTagError: 48 | traceback.print_exc() 49 | 50 | 51 | class UnexpectedTagError(Exception): 52 | pass 53 | 54 | 55 | def itertokens(citation_node): 56 | xmltag_to_ourtag = { 57 | 'volume': 'vol', 58 | 'year':'year', 59 | 'source': 'journ', 60 | 'surname': 'fam', 61 | 'given-names': 'given', 62 | 'suffix': 'given', 63 | 'article-title': 'title', 64 | 'etal': None, 65 | 'issue': 'issue', 66 | 'issue-id': 'issue', 67 | } 68 | 69 | children = citation_node.getchildren() 70 | while len(children) > 0: 71 | part = children.pop(0) 72 | if part.tag == 'person-group': 73 | children = part.getchildren() + children 74 | part = children.pop(0) 75 | 76 | if part.tag in ('name'): 77 | for name_part in part.getchildren(): 78 | assert name_part.tail is None 79 | for tok in unitokenize(name_part.text): 80 | yield (xmltag_to_ourtag[name_part.tag], tok) 81 | 82 | elif part.tag in ('year', 'article-title', 'source', 'volume', 'etal', 'issue'): 83 
| for tok in unitokenize(part.text): 84 | yield (xmltag_to_ourtag[part.tag], tok) 85 | 86 | elif part.tag == 'fpage': 87 | fpage, middle = part.text, part.tail 88 | if len(children) > 0 and children[0].tag == 'lpage': 89 | part = children.pop(0) 90 | lpage = part.text 91 | for tok in unitokenize(fpage + middle + lpage): 92 | yield ('page', tok) 93 | else: 94 | for tok in unitokenize(fpage): 95 | yield ('page', tok) 96 | elif part.tag == 'comment': 97 | pass 98 | 99 | else: 100 | ET.dump(citation_node) 101 | raise UnexpectedTagError('unexpected tag', part.tag) 102 | 103 | 104 | if part.tail is not None: 105 | for tok in unitokenize(part.tail): 106 | yield (None, tok) 107 | 108 | 109 | def sample_plos_xml(n_articles): 110 | # issn for PLoS One 111 | r = requests.get('http://api.crossref.org/works', 112 | params={'sample': n_articles, 'filter': 'issn:1932-6203'}) 113 | 114 | dois = (e['DOI'] for e in r.json()['message']['items']) 115 | 116 | for doi in dois: 117 | # print(doi) 118 | r = requests.get('http://journals.plos.org/plosone/article/asset', 119 | params={'id': doi+'.XML'}) 120 | yield doi, ET.fromstring(r.content) 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /retreive-data/crossref/download-crossref.py: -------------------------------------------------------------------------------- 1 | """Download random citations from the CrossRef DOI system (over their API), 2 | including both the JSON metadata for the reference _and_ the styled citation 3 | in one or more formats (American Chemical Society, AIP, APA, etc). 4 | 5 | Each citation will be written as a line to the outputfile in JSON format 6 | (jsonlines). 7 | """ 8 | 9 | # Example record 10 | # {'DOI': '10.1016/s0300-483x(96)03593-7', 11 | # 'prefix': 'http://id.crossref.org/prefix/10.1016', 12 | # 'member': 'http://id.crossref.org/member/78', 13 | # 'indexed': {'date-parts': [[2015, 2, 6]], 'timestamp': 1423247513443}, 14 | # 'deposited': {'date-parts': [[2011, 7, 11]], 'timestamp': 1310342400000}, 15 | # 'publisher': 'Elsevier BV', 16 | # 'title': ['Animal models in autoimmune disease in immunotoxicity assessment'], 17 | # 'ISSN': ['0300-483X'], 18 | # 'score': 1.0, 19 | # 'container-title': ['Toxicology'], 20 | # 'subject': ['Toxicology'], 21 | # 'reference-count': 0, 22 | # 'author': [{'given': 'J', 'family': 'Farine'}], 23 | # 'URL': 'http://dx.doi.org/10.1016/s0300-483x(96)03593-7', 24 | # 'issue': '1', 25 | # 'volume': '119', 26 | # 'issued': {'date-parts': [[1997, 4, 11]]}, 27 | # 'subtitle': [], 28 | # 'styled': [{'value': 'Farine, J. (1997). Animal models in autoimmune disease in immunotoxicity assessment. Toxicology, 119(1), 29-35. doi:10.1016/s0300-483x(96)03593-7', 29 | # 'style': 'apa'}, 30 | # {'value': 'Farine, J. Toxicology 1997, 119, 29-35.', 31 | # 'style': 'american-chemical-society'}], 32 | # 'page': '29-35', 33 | # 'type': 'journal-article', 34 | # 'source': 'CrossRef'} 35 | 36 | import re 37 | import json 38 | import argparse 39 | import requests 40 | from pprint import pprint 41 | from unidecode import unidecode 42 | 43 | 44 | def main(): 45 | p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 46 | p.add_argument('file', help='File to append citations to (jsonlines format)') 47 | p.add_argument('-n', help='Number of citations to download', type=int, default=100) 48 | args = p.parse_args() 49 | 50 | # might want to customize this later. 
51 | # STYLES = ['nature', 'american-chemical-society', 'american-institute-of-physics'] 52 | # STYLES = ['organization'] 53 | STYLES = ['society-of-biblical-literature-fullnote-bibliography'] 54 | 55 | with open(args.file, 'a') as f: 56 | for item in sample(n=args.n, styles=STYLES): 57 | json.dump(item, f, sort_keys=True) 58 | f.write('\n') 59 | f.flush() 60 | 61 | 62 | def get_styled(doi, styles=['american-chemical-society']): 63 | """Get the styled citation for a given DOI from CrossRef 64 | 65 | Example 66 | ------- 67 | >>> get_styled('10.1016/s0300-483x(96)03593-7') 68 | [{'style': 'american-chemical-society', 'value': 'Farine, J. Toxicology 1997, 119, 29-35.'}] 69 | """ 70 | 71 | out = [] 72 | for style in styles: 73 | fmt = 'text/x-bibliography; style=%s' % style 74 | r = requests.get('http://dx.doi.org/%s' % doi, headers={'Accept': fmt}) 75 | # content is utf-8 encoded. To simplify the downstream stuff, we convert 76 | # non-ascii unicode characters to "nearest" ascii characters using 77 | # unidecode. 78 | c = unidecode(r.content.decode('utf-8')) 79 | 80 | strip_prefixes = ['(1)', '1.', '1'] 81 | for p in strip_prefixes: 82 | if c.startswith(p): 83 | c = c[len(p):] 84 | 85 | c = c.replace('Online: http://dx.doi.org/' , '') 86 | c = c.replace(doi + '.', '') 87 | c = c.replace(' No pages.', '') 88 | 89 | out.append({'style': style, 'value': c.strip()}) 90 | 91 | return out 92 | 93 | 94 | def sample(n=10, styles=['american-chemical-society', 'apa', 'nature']): 95 | def skip(item): 96 | if item['subtitle'] != []: 97 | # skip everything with a subtitle 98 | return True 99 | 100 | if len(item['title']) != 1: 101 | # title should be a list containing 1 string 102 | return True 103 | 104 | if len(item['title'][0].split()) < 2: 105 | # skip 1 word titles 106 | return True 107 | return False 108 | 109 | 110 | r = requests.get('http://api.crossref.org/works', 111 | params={'sample': n, 'filter': 'type:journal-article'}) 112 | items = r.json()['message']['items'] 113 | items = [x for x in items if not skip(x)] 114 | for item in items: 115 | item['styled'] = get_styled(item['DOI'], styles) 116 | pprint(item) 117 | print() 118 | yield item 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /bibtagger/featurize.py: -------------------------------------------------------------------------------- 1 | import string 2 | import gzip 3 | import functools 4 | from pprint import pprint 5 | from os.path import join, dirname, abspath 6 | from pkg_resources import resource_filename 7 | 8 | import marisa_trie 9 | from titlecase import titlecase 10 | from nltk.corpus import wordnet as wn 11 | from bibtagger.tokenizer import untokenize, tokenize 12 | 13 | def _load_reference(fname, trans=None): 14 | with gzip.open(resource_filename('bibtagger', join('fixeddata', fname))) as f: 15 | items = set() 16 | for line in f: 17 | line = line.strip().decode('utf-8') 18 | if trans is not None: 19 | line = trans(line) 20 | items.add(line) 21 | return frozenset(items) 22 | 23 | 24 | DIGITS = set([str(e) for e in range(10)]) 25 | UPPERCASE = set(string.ascii_uppercase) 26 | COMMON_GIVEN_NAMES = _load_reference('common-given-names.txt.gz') 27 | COMMON_SURNAMES = _load_reference('common-surnames.txt.gz', trans=str.lower) 28 | COMMON_WORDS = _load_reference('common-words.txt.gz') 29 | JOURNAL_SET = _load_reference('journals.txt.gz', trans=titlecase) 30 | JOURNAL_TRIE = marisa_trie.Trie(JOURNAL_SET) 31 | 32 | 33 | def 
common_hypernym(synsets): 34 | """Walk up the hypernym tree above a collection of WordNet synsets 35 | finding the first (hyper) synset that's in COMMON_WORDS. 36 | """ 37 | if len(synsets) == 0: 38 | return '' 39 | 40 | names = {l.name().split('_')[0].lower() for syn in synsets for l in syn.lemmas()} 41 | intersect = names.intersection(COMMON_WORDS) 42 | if len(intersect) > 0: 43 | # just deterministically pick one of the words to use. we'll 44 | # take the shortest. 45 | return min(intersect, key=len) 46 | else: 47 | hypersets = [hyper for s in synsets for hyper in s.hypernyms()] 48 | return common_hypernym(hypersets) 49 | 50 | 51 | def featurize(phrase): 52 | @functools.lru_cache(maxsize=1024) 53 | def get_local_features(word): 54 | word_lower = word.lower() 55 | 56 | hypernym = '' 57 | in_wordnet = 'NA' 58 | if len(word) > 4: 59 | synsets = wn.synsets(word) 60 | in_wordnet = len(synsets) > 0 61 | hypernym = common_hypernym(wn.synsets(word)) 62 | 63 | return { 64 | 'word': word, 65 | 'prefix4': word[:4], 66 | 'hypernym': hypernym, 67 | 'in_wordnet': in_wordnet, 68 | 'common_given': word_lower in COMMON_GIVEN_NAMES, 69 | 'common_surname': word_lower in COMMON_SURNAMES, 70 | 'cont_num': len(set(word).intersection(DIGITS)) > 0, 71 | 'all_num': all(l in DIGITS for l in word), 72 | } 73 | 74 | 75 | n = len(phrase) 76 | 77 | local_features = [dict() for _ in phrase] 78 | shift_features = [dict() for _ in local_features] 79 | for i, word in enumerate(phrase): 80 | local_features[i].update(get_local_features(word)) 81 | 82 | for i in range(n): 83 | local_features[i]['known_journal'] = False 84 | 85 | for i in range(n): 86 | matches = JOURNAL_TRIE.prefixes(titlecase(untokenize(phrase[i:]))) 87 | if len(matches) == 0: 88 | continue 89 | 90 | match = max(matches, key=len) 91 | t = tokenize(match) 92 | # only deal with multitoken matches. 
for single token journals, there 93 | # are a lot of false positives, and they can presumably be handled 94 | # easily by the model in training 95 | if len(t) > 2: 96 | for j, tok in enumerate(t): 97 | local_features[i+j]['known_journal'] = True 98 | 99 | # print(untokenize(phrase)) 100 | # print([(i, phrase[i]) for i, e in enumerate(local_features) if e['known_journal']]) 101 | # print() 102 | 103 | 104 | for i in range(1, n): 105 | shift_features[i].update({k+'[-1]': v for k, v in local_features[i-1].items()}) 106 | for i in range(n-1): 107 | shift_features[i].update({k+'[+1]': v for k, v in local_features[i+1].items()}) 108 | 109 | features = [] 110 | for i in range(n): 111 | features.append( 112 | ['%s=%s' % (k,v) for k, v in local_features[i].items()] + 113 | ['%s=%s' % (k,v) for k, v in shift_features[i].items()] 114 | ) 115 | features[0].append('__BOS__') 116 | features[-1].append('__EOS__') 117 | 118 | 119 | return features 120 | 121 | 122 | if __name__ == '__main__': 123 | from pprint import pprint 124 | item = {"tagged": [ 125 | ["fam", "Massague"], ["given", "J"], [None, ","], ["fam", "Seoane"], 126 | ["given", "J"], [None, ","], ["fam", "Wotton"], ["given", "D"], 127 | [None, "("], ["year", "2005"], [None, ")"], ["title", "Smad"], 128 | ["title", "transcription"], ["title", "factors"], [None, "."], 129 | ["journ", "Genes"], ["journ", "Dev"], ["vol", "19"], [None, ":"], 130 | ["page", "2783"], ["page", "-"], ["page", "2810"], [None, "."]] 131 | } 132 | features = featurize([e[1] for e in item['tagged']]) 133 | pprint(features[15]) 134 | -------------------------------------------------------------------------------- /retreive-data/download-tag-pnas.py: -------------------------------------------------------------------------------- 1 | '''Download and tag training data from the refence section 2 | of random J. Chem. Phys. 
papers 3 | ''' 4 | import time 5 | import sys 6 | import json 7 | import argparse 8 | import requests 9 | import traceback 10 | from bs4 import BeautifulSoup, Tag 11 | 12 | from unidecode import unidecode 13 | from bibtagger.tokenizer import tokenize 14 | from bibtagger.print_tokens import render_tokens 15 | 16 | unitokenize = lambda x: tokenize(unidecode(x)) if x is not None else [] 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' 18 | 19 | 20 | def main(): 21 | p = argparse.ArgumentParser() 22 | p.add_argument('-n', help='Number of articles to scrape', type=int, default=2) 23 | p.add_argument('-v', '--verbose', action="store_true", help='Print out the tagged citations in ASCII colors as they\'re tagged.') 24 | p.add_argument('dest', help='tokenized and tagged citations (jsonlines)') 25 | args = p.parse_args() 26 | 27 | with open(args.dest, 'a') as fout: 28 | for doi, article in sample_pnas(args.n): 29 | print('\n===== %s ======' % doi) 30 | for cit in article.find_all('div', {'class': 'cit-metadata'}): 31 | try: 32 | tokens = list(itertokens(cit)) 33 | if any(len(tok)>5 and tag is None for (tag, tok) in tokens): 34 | print('ERROR LONG TOKEN NOT MATCHED IN', render_tokens(tokens), file=sys.stderr) 35 | continue 36 | 37 | if args.verbose: 38 | print(render_tokens(tokens)) 39 | json.dump({'tagged': tokens}, fout) 40 | fout.write('\n') 41 | except UnexpectedTagError as e: 42 | print() 43 | print(e) 44 | print() 45 | # exit(1) 46 | 47 | 48 | class UnexpectedTagError(Exception): 49 | pass 50 | 51 | 52 | def itertokens(citation_node): 53 | htmlclass_to_tag = { 54 | 'cit-vol': 'vol', 55 | 'cit-fpage': 'page', 56 | 'cit-lpage': 'page', 57 | 'cit-pub-date':'year', 58 | # 'reference-source': 'journ', 59 | 'cit-name-surname': 'fam', 60 | 'cit-name-given-names': 'given', 61 | 'cit-jnl-abbrev': 'journ', 62 | 'cit-issue': 'issue', 63 | 'cit-name-suffix': 'fam', 64 | 'cit-article-title': 'title', 65 | } 66 | 67 | for part in citation_node.children: 68 | if isinstance(part, str): 69 | yield from ((None, t) for t in unitokenize(part)) 70 | elif isinstance(part, Tag) and part.name == 'ol': 71 | for li in part.find_all('li'): 72 | for part in li.children: 73 | if isinstance(part, Tag): 74 | for auth_part in part.find_all('span'): 75 | yield from ((htmlclass_to_tag[auth_part['class'][0]], t) for t in unitokenize(auth_part.text)) 76 | else: 77 | yield from ((None, t) for t in unitokenize(part)) 78 | 79 | elif isinstance(part, Tag) and part.name == 'cite': 80 | last_class = None 81 | for item in part.children: 82 | if isinstance(item, str): 83 | tag = 'page' if last_class == 'cit-fpage' else None 84 | yield from ((tag, t) for t in unitokenize(item)) 85 | else: 86 | if ('class' not in item.attrs) or item['class'][0] not in htmlclass_to_tag: 87 | raise UnexpectedTagError(citation_node, item) 88 | last_class = item['class'][0] 89 | yield from ((htmlclass_to_tag[item['class'][0]], t) for t in unitokenize(item.text)) 90 | 91 | 92 | # print(list(citation_node.children)[2]) 93 | # exit(1) 94 | # klass = part['class'][0] 95 | 96 | # exit(1) 97 | # cit-auth-list 98 | 99 | def sample_pnas(n_articles): 100 | #soup = BeautifulSoup(open('172.full', encoding='utf-8').read(), 'html.parser') 101 | ## soup = BeautifulSoup(open('untitled.txt', encoding='utf-8').read()) 102 | #yield '172.full', soup 103 | # issn for PNAS 104 | r = requests.get('http://api.crossref.org/works', 105 | params={'sample': n_articles, 'filter': 'issn:1091-6490'}) 106 | dois = 
(e['DOI'] for e in r.json()['message']['items']) 107 | 108 | for doi in dois: 109 | r = requests.get('http://dx.doi.org/%s' % doi, headers={'User-Agent': USER_AGENT}) 110 | soup = BeautifulSoup(r.content, 'html.parser') 111 | full_text_link = soup.find('a', {'rel': 'view-full-text'}) 112 | if full_text_link is None: 113 | print(r.url) 114 | print(r.content) 115 | print('Skipping. No full text HTML availavle') 116 | time.sleep(4) 117 | continue 118 | 119 | r2 = requests.get('http://www.pnas.org' + full_text_link['href']) 120 | print(r2.url) 121 | yield doi, BeautifulSoup(r2.content) 122 | time.sleep(4) 123 | 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | -------------------------------------------------------------------------------- /frontend/static/index.html: -------------------------------------------------------------------------------- 1 | 2 |
[index.html markup not captured; visible page copy:] "Take an arbitrary unstructured academic citation (e.g. copy-pasted from a paper), and automatically parse and format it into a BibTeX record."
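For local development, a minimal sketch of querying the app's `/resolve` endpoint (this assumes `frontend/app.py` is running with its defaults, `localhost:5000`; the field names come from `ResolveHandler`, and the example citation string is arbitrary):

```python
import requests

# Hypothetical query; any unstructured citation string works.
resp = requests.get(
    "http://localhost:5000/resolve",
    params={"q": "Farine, J. Toxicology 1997, 119, 29-35."},
)
data = resp.json()

# The handler returns parallel lists of tokens and predicted CRF tags,
# plus the echoed query, a request id, and an HMAC signature that the
# /feedback endpoint later verifies.
for tag, token in zip(data["tags"], data["tokens"]):
    print(tag, token)
```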