├── MANIFEST.in ├── biblib ├── __init__.py ├── messages.py ├── test.py ├── algo.py └── bib.py ├── .gitignore ├── setup.py ├── LICENSE ├── examples ├── bib2bib └── bibparse └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /biblib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = 'bib algo messages'.split() 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | # Files created by setup.py 4 | /build/ 5 | /dist/ 6 | MANIFEST 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='biblib', 5 | version='0.1.0', 6 | description='Simple, correct BibTeX parser and algorithms', 7 | url='https://github.com/aclements/biblib', 8 | author='Austin Clements', 9 | author_email='aclements@csail.mit.edu', 10 | packages=['biblib'], 11 | keywords=['bibtex', 'tex'], 12 | classifiers=[ 13 | 'Development Status :: 4 - Beta', 14 | 'Intended Audience :: Developers', 15 | 'Intended Audience :: Science/Research', 16 | 'License :: OSI Approved :: MIT License', 17 | 'Programming Language :: Python', 18 | 'Programming Language :: Python :: 3', 19 | 'Topic :: Database', 20 | 'Topic :: Text Processing', 21 | ], 22 | long_description=open('README.md').read(), 23 | ) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Austin Clements 2 | 3 | Permission is hereby granted, free of charge, 
to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /examples/bib2bib: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import biblib.bib 4 | import argparse 5 | import sys 6 | 7 | def main(): 8 | arg_parser = argparse.ArgumentParser( 9 | description='Flatten macros, combine, and pretty-print .bib database(s)') 10 | arg_parser.add_argument('bib', nargs='+', help='.bib file(s) to process', 11 | type=open) 12 | arg_parser.add_argument('--min-crossrefs', type=int, 13 | help='minimum number of cross-referencing entries' 14 | ' required to expand a crossref; if omitted, no' 15 | ' expansion occurs', default=None) 16 | args = arg_parser.parse_args() 17 | 18 | try: 19 | # Load databases 20 | db = biblib.bib.Parser().parse(args.bib, log_fp=sys.stderr).get_entries() 21 | 22 | # Optionally resolve cross-references 23 | if args.min_crossrefs is not None: 24 | db = biblib.bib.resolve_crossrefs( 25 | db, min_crossrefs=args.min_crossrefs) 26 | except biblib.messages.InputError: 27 | sys.exit(1) 28 | 29 | # Pretty-print entries 30 | for ent in db.values(): 31 | print(ent.to_bib()) 32 | print() 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /examples/bibparse: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import biblib.bib 4 | import biblib.messages 5 | import biblib.algo 6 | import argparse 7 | import sys 8 | import re 9 | 10 | MONTHS = 'January February March April May June July August September October November December'.split() 11 | 12 | def main(): 13 | arg_parser = argparse.ArgumentParser( 14 | description='Parse .bib database(s) and print basic fields as text') 15 | arg_parser.add_argument('bib', nargs='+', help='.bib file(s) to process', 16 | type=open) 17 | args = arg_parser.parse_args() 18 | 19 | try: 20 | # Load 
databases 21 | db = biblib.bib.Parser().parse(args.bib, log_fp=sys.stderr).get_entries() 22 | 23 | # Resolve cross-references 24 | db = biblib.bib.resolve_crossrefs(db) 25 | 26 | # Print entries 27 | recoverer = biblib.messages.InputErrorRecoverer() 28 | for ent in db.values(): 29 | with recoverer: 30 | print_entry(ent) 31 | recoverer.reraise() 32 | except biblib.messages.InputError: 33 | sys.exit(1) 34 | 35 | def print_entry(ent): 36 | print('{ent.key} ({ent.typ}):'.format(ent=ent)) 37 | if 'title' in ent: 38 | print(' ' + biblib.algo.tex_to_unicode(biblib.algo.title_case( 39 | ent['title'], pos=ent.field_pos['title']))) 40 | 41 | if 'author' in ent: 42 | authors = [ 43 | biblib.algo.tex_to_unicode(author.pretty(), 44 | pos=ent.field_pos['author']) 45 | for author in ent.authors()] 46 | if len(authors) == 0: 47 | author = None 48 | elif len(authors) == 1: 49 | author = authors[0] 50 | else: 51 | author = ', '.join(authors[:-1]) 52 | if len(authors) > 2: 53 | author += ',' 54 | if ent.authors()[-1].is_others(): 55 | author += ' et al.' 56 | else: 57 | author += ' and ' + authors[-1] 58 | if author: 59 | print(' By ' + author) 60 | 61 | if 'year' in ent: 62 | if 'month' in ent: 63 | mnum = ent.month_num() 64 | print(' {} {}'.format(MONTHS[mnum - 1], ent['year'])) 65 | else: 66 | print(' {}'.format(ent['year'])) 67 | 68 | print() 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Biblib provides a simple, standalone Python3 package for parsing 2 | BibTeX bibliographic databases, as well as algorithms for manipulating 3 | BibTeX entries in BibTeX-y ways. 4 | 5 | There are a lot of BibTeX parsers out there. Most of them are 6 | complete nonsense based on some imaginary grammar made up by the 7 | module's author that is almost, but not quite, entirely unlike 8 | BibTeX's actual grammar. 
*BibTeX has a grammar*. It's even pretty 9 | simple, though it's probably not what you think it is. The hardest 10 | part of BibTeX's grammar is that it's only written down in one place: 11 | the BibTeX source code. 12 | 13 | Biblib's parser is derived directly from the WEB source code for 14 | BibTeX and hence (barring bugs in translation) should be fully 15 | compatible with BibTeX's own parser. 16 | 17 | 18 | Features 19 | -------- 20 | 21 | * BibTeX-compatible `.bib` file parser 22 | 23 | * BibTeX-compatible name parser for fields like `author` 24 | 25 | * Crossref resolution 26 | 27 | * BibTeX-compatible title casing 28 | 29 | * Translator for common TeX markup (like accents) to Unicode (which 30 | can, in turn, be used in HTML and other formats). 31 | 32 | 33 | Installation 34 | ------------ 35 | 36 | Since biblib has no external dependencies or C modules, you can use 37 | biblib in your project by simply unpacking it under your source tree 38 | and adding 39 | 40 | sys.path.append('biblib') 41 | 42 | before importing it. 43 | 44 | Biblib can also be installed system-wide with 45 | 46 | python3 setup.py install 47 | 48 | 49 | Examples 50 | -------- 51 | 52 | There are a few simple examples of biblib's use in `examples/`. To 53 | run these dircetly from the source tree, use, for example 54 | 55 | PYTHONPATH=$PWD ./examples/bibparse test.bib 56 | 57 | 58 | Recognized grammar 59 | ------------------ 60 | 61 | For reference, the `.bib` parser implements a grammar equivalent to 62 | the following PEG. All literals are matched case-*insensitively*. 
63 | 64 | bib_db = comment (command_or_entry comment)* 65 | 66 | comment = [^@]* 67 | 68 | ws = [ \t\n]* 69 | 70 | ident = ![0-9] (![ \t"#%'(),={}] [\x20-\x7f])+ 71 | 72 | command_or_entry = '@' ws (comment / preamble / string / entry) 73 | 74 | comment = 'comment' 75 | 76 | preamble = 'preamble' ws ( '{' ws preamble_body ws '}' 77 | / '(' ws preamble_body ws ')' ) 78 | 79 | preamble_body = value 80 | 81 | string = 'string' ws ( '{' ws string_body ws '}' 82 | / '(' ws string_body ws ')' ) 83 | 84 | string_body = ident ws '=' ws value 85 | 86 | entry = ident ws ( '{' ws key ws entry_body? ws '}' 87 | / '(' ws key_paren ws entry_body? ws ')' ) 88 | 89 | key = [^, \t}\n]* 90 | 91 | key_paren = [^, \t\n]* 92 | 93 | entry_body = (',' ws ident ws '=' ws value ws)* ','? 94 | 95 | value = piece (ws '#' ws piece)* 96 | 97 | piece 98 | = [0-9]+ 99 | / '{' balanced* '}' 100 | / '"' (!'"' balanced)* '"' 101 | / ident 102 | 103 | balanced 104 | = '{' balanced* '}' 105 | / [^{}] 106 | -------------------------------------------------------------------------------- /biblib/messages.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import threading 3 | import warnings 4 | 5 | class Pos(collections.namedtuple('Pos', 'fname line col log_fp')): 6 | """A position in a file. 7 | 8 | This also optionally tracks a file-like object for logging 9 | warnings and errors associated with this file. 10 | """ 11 | 12 | def __str__(self): 13 | return '{}:{}:{}'.format(self.fname, self.line, self.col) 14 | 15 | def warn(self, msg): 16 | """Log msg to this Pos's logger. 17 | 18 | If log_fp is None, the warning is silently discarded. 
19 | """ 20 | if self.log_fp is not None: 21 | self.log_fp.write('{}: warning: {}\n'.format(self, msg)) 22 | 23 | def raise_error(self, msg): 24 | """Log and raise InputError([(self, msg)]).""" 25 | if self.log_fp is not None: 26 | self.log_fp.write('{}: error: {}\n'.format(self, msg)) 27 | raise InputError([(self, msg)]) 28 | 29 | Pos.unknown = Pos('', 1, 0, None) 30 | 31 | class PosFactory: 32 | """A factory that translates character offsets to Pos instances.""" 33 | 34 | def __init__(self, fname, string, log_fp=None): 35 | self.__fname = fname 36 | self.__string = string 37 | self.__log_fp = log_fp 38 | self.__cache = (0, 1, 0) 39 | 40 | def offset_to_pos(self, offset): 41 | last_off, last_line, last_col = self.__cache 42 | if last_off < offset: 43 | last_off, last_line, last_col = 0, 1, 0 44 | 45 | line = self.__string.count('\n', last_off, offset) + last_line 46 | lastnl = self.__string.rfind('\n', last_off, offset) 47 | if lastnl == -1: 48 | col = last_col + (offset - last_off) 49 | else: 50 | col = offset - lastnl - 1 51 | self.__cache = (offset, line, col) 52 | 53 | return Pos(self.__fname, line, col, self.__log_fp) 54 | 55 | class InputError(ValueError): 56 | """One or more errors with associated Pos instances. 57 | 58 | An InputError representing multiple errors is a "bundled" error. 59 | These can be created when recovering from errors with an 60 | InputErrorRecoverer. 61 | """ 62 | 63 | def __init__(self, list_of_pos_msg): 64 | super().__init__(list_of_pos_msg) 65 | 66 | def __str__(self): 67 | list_of_msg_pos = self.args[0] 68 | if len(list_of_msg_pos) == 1: 69 | return '1 error' 70 | return '{} errors'.format(len(list_of_msg_pos)) 71 | 72 | class InputErrorRecoverer: 73 | """A context manager for recovering from and bundling InputErrors. 74 | 75 | This context manager catches and collects InputErrors, effectively 76 | recovering from InputErrors at the end of the with block. 
An 77 | InputErrorRecoverer can be used several times, after which 78 | collected errors can be re-raised as a bundled InputError. For 79 | example, a caller that wishes to parse several files without 80 | stopping because of an error in one file can do something like 81 | this: 82 | 83 | recoverer = InputErrorRecoverer() 84 | for filename in filenames: 85 | with recoverer: 86 | parse(filename) 87 | recoverer.reraise() 88 | 89 | An InputErrorRecoverer *must* be either reraised or disposed (even 90 | if no errors occurred). Otherwise a UserWarning will be issued. 91 | """ 92 | 93 | def __init__(self): 94 | self.__errors = [] 95 | 96 | def __enter__(self): 97 | return None 98 | 99 | def __exit__(self, exc_type, exc_value, traceback): 100 | if self.__errors is None: 101 | raise ValueError('InputErrorRecoverer already disposed') 102 | if isinstance(exc_value, InputError): 103 | self.__errors.extend(exc_value.args) 104 | return True 105 | 106 | def __del__(self): 107 | if self.__errors is not None: 108 | try: 109 | warnings.warn('InputErrorRecoverer must be reraised or disposed', 110 | stacklevel=2) 111 | except TypeError as e: 112 | # If Python is exiting, warnings.warn has a habit of 113 | # raising TypeError("'NoneType' object is not 114 | # iterable",). Ignore it. 115 | pass 116 | 117 | def reraise(self): 118 | """If any errors have been collected, raise a bundled InputError.""" 119 | errors = self.__errors 120 | self.dispose() 121 | if errors: 122 | raise InputError(errors) 123 | 124 | def dispose(self): 125 | """Discard all collected errors.""" 126 | self.__errors = None 127 | -------------------------------------------------------------------------------- /biblib/test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import collections 3 | import io 4 | from .bib import * 5 | from .algo import * 6 | from .messages import * 7 | from . 
import algo 8 | 9 | def od(*args): 10 | return collections.OrderedDict(zip(args[::2], args[1::2])) 11 | 12 | def ent(typ, key, fields): 13 | return Entry(fields, typ, key) 14 | 15 | class BibParserTest(unittest.TestCase): 16 | def __test_parse(self, string, ents): 17 | # Parse string and get entries 18 | got = list(Parser().parse(string).get_entries().values()) 19 | self.assertEqual(got, ents) 20 | 21 | def test_basic(self): 22 | self.__test_parse( 23 | '@misc{x, title="title", author={author}}\n@misc{y, title=123,}', 24 | [ent('misc', 'x', od('title', 'title', 25 | 'author', 'author')), 26 | ent('misc', 'y', od('title', '123'))]) 27 | 28 | def test_balanced(self): 29 | self.__test_parse( 30 | '@misc{x, title={a{b}c}, author="x{y}z"}', 31 | [ent('misc', 'x', od('title', 'a{b}c', 32 | 'author', 'x{y}z'))]) 33 | 34 | def test_whitespace(self): 35 | self.__test_parse( 36 | ' @ misc { x , title = {a} , }', 37 | [ent('misc', 'x', od('title', 'a'))]) 38 | 39 | def test_compress(self): 40 | self.__test_parse( 41 | '@misc{x, title={ a\t b\n c }}', 42 | [ent('misc', 'x', od('title', 'a b c'))]) 43 | 44 | def test_funny_keys(self): 45 | self.__test_parse( 46 | '@misc{@"#%\'()=, title="a"}', 47 | [ent('misc', '@"#%\'()=', od('title', 'a'))]) 48 | for string in ['@misc{}', '@misc{,}', '@misc{\n}']: 49 | self.__test_parse(string, [ent('misc', '', od())]) 50 | self.__test_parse( 51 | '@misc{,title="a"}', 52 | [ent('misc', '', od('title', 'a'))]) 53 | self.__test_parse( 54 | '@misc(k)ey, title="a")\n@misc{k{ey, title="a"}', 55 | [ent('misc', 'k)ey', od('title', 'a')), 56 | ent('misc', 'k{ey', od('title', 'a'))]) 57 | 58 | def test_string(self): 59 | self.__test_parse( 60 | '@string{foo = {a}}\n@misc{x, title = foo # "b" # foo # 2}', 61 | [ent('misc', 'x', od('title', 'aba2'))]) 62 | 63 | def test_comment(self): 64 | self.__test_parse( 65 | # Braces intentionally unbalanced, everything on one line 66 | '@comment{abc@misc{x}', 67 | [ent('misc', 'x', od())]) 68 | 69 | class 
EntryTest(unittest.TestCase): 70 | def test_to_bib(self): 71 | entry = Entry([('author', 'An Author'), 72 | ('title', 'This is a ' + 'really '*10 + 'long title'), 73 | ('month', 'November'), ('year', '2013')], 74 | typ='misc', key='key', field_pos={'month': Pos.unknown}) 75 | self.assertEqual( 76 | entry.to_bib(), 77 | '''\ 78 | @misc{key, 79 | author = {An Author}, 80 | title = {This is a really really really really really really 81 | really really really really long title}, 82 | month = nov, 83 | year = 2013, 84 | }''') 85 | self.assertEqual( 86 | entry.to_bib(month_to_macro=False, wrap_width=None), 87 | '''\ 88 | @misc{key, 89 | author = {An Author}, 90 | title = {This is a really really really really really really really really really really long title}, 91 | month = {November}, 92 | year = 2013, 93 | }''') 94 | 95 | def test_month_num(self): 96 | def test(string, expect): 97 | entry = Entry([('month', string)], field_pos={'month': Pos.unknown}) 98 | self.assertEqual(entry.month_num(), expect) 99 | for i, name in enumerate(['Jan.','Feb.','Mar.','Apr.','May','June', 100 | 'July','Aug.','Sept.','Oct.','Nov.','Dec.']): 101 | test(name, i+1) 102 | for i, name in enumerate(['January','February','March','April', 103 | 'May','June','July','August', 104 | 'September','October','November','December']): 105 | test(name, i+1) 106 | self.assertRaises(InputError, test, 'Foo', None) 107 | self.assertRaises(InputError, test, 'Janruary', None) 108 | 109 | def test_date_key(self): 110 | def test(year, month, expect): 111 | fields = [] 112 | if year: fields.append(('year', year)) 113 | if month: fields.append(('month', month)) 114 | entry = Entry(fields, field_pos={'year' : Pos.unknown, 115 | 'month' : Pos.unknown}) 116 | got = entry.date_key() 117 | if expect is not None: 118 | self.assertEqual(got, expect) 119 | test(None, None, ()) 120 | test('2013', None, (2013,)) 121 | test('2013', 'jan', (2013, 1)) 122 | self.assertRaises(InputError, test, 'x', None, None) 123 | 
self.assertRaises(InputError, test, None, 'jan', None) 124 | self.assertRaises(InputError, test, '2013', 'foo', None) 125 | 126 | class CrossRefTest(unittest.TestCase): 127 | def setUp(self): 128 | self.parser = Parser().parse("""\ 129 | @misc{ent1, title={Title 1}, crossref={ent2}} 130 | @misc{ent2, title={Title 2}, booktitle={Book title 2}}""") 131 | 132 | def test_basic(self): 133 | db = resolve_crossrefs(self.parser.get_entries()) 134 | self.assertEqual( 135 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 136 | ('booktitle', 'Book title 2')])), 137 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 138 | ('booktitle', 'Book title 2')]))], 139 | list(db.items())) 140 | 141 | def test_min_crossrefs(self): 142 | db = resolve_crossrefs(self.parser.get_entries(), min_crossrefs=1) 143 | self.assertEqual( 144 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 145 | ('crossref', 'ent2')])), 146 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 147 | ('booktitle', 'Book title 2')]))], 148 | list(db.items())) 149 | 150 | db = resolve_crossrefs(self.parser.get_entries(), min_crossrefs=2) 151 | self.assertEqual( 152 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 153 | ('booktitle', 'Book title 2')])), 154 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 155 | ('booktitle', 'Book title 2')]))], 156 | list(db.items())) 157 | 158 | def test_self_crossref(self): 159 | # This is accepted, believe it or not (though BibTeX warns) 160 | log = io.StringIO() 161 | self.parser.parse("@misc{ent3, title={Title 3}, crossref={ent3}}", 162 | name='', log_fp=log) 163 | resolve_crossrefs(self.parser.get_entries()) 164 | self.assertIn(':1:', log.getvalue()) 165 | 166 | def test_bad_order(self): 167 | self.parser.parse("""\ 168 | @misc{ent3, title={Title 3}, crossref={ent2}}""") 169 | with self.assertRaises(InputError) as ar: 170 | resolve_crossrefs(self.parser.get_entries()) 171 | self.assertEqual(1, len(ar.exception.args[0])) 172 | 173 | class 
NameParserTest(unittest.TestCase): 174 | def test_first_char(self): 175 | p = algo.NameParser() 176 | for check, expect in [('abc', 'a'), ('ABC', 'A'), (' abc', 'a'), 177 | ('\\abc', 'a'), ('{a} bc', 'b'), 178 | ('{\\`a}bc', 'a'), ('{\\aa}bc', 'å'), 179 | ('{\\a}bc', 'a')]: 180 | self.assertEqual(p._first_char(check), expect) 181 | 182 | def test_and(self): 183 | p = parse_names 184 | self.assertEqual(p('A B and C D'), 185 | [Name('A', '', 'B', ''), Name('C', '', 'D', '')]) 186 | self.assertEqual(p('A B AND C D'), 187 | [Name('A', '', 'B', ''), Name('C', '', 'D', '')]) 188 | self.assertEqual(p('A B and and C D'), 189 | [Name('A', '', 'B', ''), Name('', '', '', ''), 190 | Name('C', '', 'D', '')]) 191 | self.assertEqual(p('A B and and'), 192 | [Name('A', '', 'B', ''), Name('', '', 'and', '')]) 193 | self.assertEqual(p('A B { and } C D'), 194 | [Name('A B { and } C', '', 'D', '')]) 195 | self.assertEqual(p('A B {\\ and } C D'), 196 | [Name('A B', '{\\ and }', 'C D', '')]) 197 | 198 | def __test_names(self, *tests): 199 | for test in tests: 200 | if len(test) == 4: 201 | test = test + ('',) 202 | names = parse_names(test[0]) 203 | self.assertEqual(names, [Name(*test[1:])], 204 | 'parsing {!r}'.format(test[0])) 205 | 206 | def test_first_von_last(self): 207 | # Examples mostly from Nicolas Markey's Tame the BeaST 208 | self.__test_names( 209 | ('jean de la fontaine', '', 'jean de la', 'fontaine'), 210 | ('Jean de la fontaine', 'Jean', 'de la', 'fontaine'), 211 | ('Jean De La fontaine', 'Jean De La', '', 'fontaine'), 212 | ('Jean {de} la fontaine', 'Jean {de}', 'la', 'fontaine'), 213 | ('jean {de} {la} fontaine', '', 'jean', '{de} {la} fontaine'), 214 | ('Jean {de} {la} fontaine', 'Jean {de} {la}', '', 'fontaine'), 215 | ('Jean De La Fontaine', 'Jean De La', '', 'Fontaine'), 216 | ('jean De la Fontaine', '', 'jean De la', 'Fontaine'), 217 | ('Jean de La Fontaine', 'Jean', 'de', 'La Fontaine'), 218 | ('Jean-Baptiste Poquelin', 'Jean-Baptiste', '', 'Poquelin'), 219 | 
('Jean-Baptiste-Poquelin', '', '', 'Jean-Baptiste-Poquelin'), 220 | ('Jean- Baptiste-Poquelin', '', '', 'Jean-Baptiste-Poquelin'), 221 | ('Jean Baptiste Poquelin', 'Jean Baptiste', '', 'Poquelin'), 222 | ('Jean Baptiste-Poquelin', 'Jean', '', 'Baptiste-Poquelin'), 223 | ('Jean Baptiste~Poquelin', 'Jean Baptiste', '', 'Poquelin'), 224 | ('Jean-baptiste Poquelin', 'Jean', 'baptiste', 'Poquelin')) 225 | 226 | def test_von_last_first(self): 227 | self.__test_names( 228 | ('de la fontaine, Jean', 'Jean', 'de la', 'fontaine'), 229 | ('De La Fontaine, Jean', 'Jean', '', 'De La Fontaine'), 230 | ('De la Fontaine, Jean', 'Jean', 'De la', 'Fontaine'), 231 | ('de La Fontaine, Jean', 'Jean', 'de', 'La Fontaine'), 232 | ('{D}e {L}a Cruz, Maria', 'Maria', '{D}e {L}a', 'Cruz')) 233 | 234 | def test_von_last_jr_first(self): 235 | self.__test_names( 236 | ('de la fontaine, Jean, Jr', 'Jean', 'de la', 'fontaine', 'Jr'), 237 | ('De La Fontaine, Jean, Jr', 'Jean', '', 'De La Fontaine', 'Jr'), 238 | ('De la Fontaine, Jean, Jr', 'Jean', 'De la', 'Fontaine', 'Jr'), 239 | ('de La Fontaine, Jean, Jr', 'Jean', 'de', 'La Fontaine', 'Jr')) 240 | 241 | class NamePrettyTest(unittest.TestCase): 242 | def __test(self, template, gen): 243 | for i in range(0x10): 244 | name = Name('f' if i & 1 else '', 245 | 'v' if i & 2 else '', 246 | 'l' if i & 4 else '', 247 | 'j' if i & 8 else '') 248 | got = name.pretty(template) 249 | self.assertEqual(gen(name), got) 250 | 251 | def __clean(self, string): 252 | import re 253 | string = re.sub(' +', ' ', string) 254 | string = re.sub(' *(, )+', ', ', string) 255 | return string.strip(' ,') 256 | 257 | def test_basic(self): 258 | self.__test( 259 | '{first}{von}{jr}{last}', 260 | lambda n: n.first+n.von+n.jr+n.last) 261 | self.__test( 262 | '{first} {von} {last} {jr}', 263 | lambda n: self.__clean(' '.join([n.first, n.von, n.last, n.jr]))) 264 | self.__test( 265 | '{von} {last}, {first}', 266 | lambda n: self.__clean('{0.von} {0.last}, {0.first}'.format(n))) 
267 | self.__test( 268 | '{von} {last}, {first}, {jr}', 269 | lambda n: self.__clean('{0.von} {0.last}, {0.first}, {0.jr}'.format(n))) 270 | 271 | def test_before_and_after(self): 272 | self.__test( 273 | 'a{first}{von}{jr}{last}b', 274 | lambda n: 'a'+n.first+n.von+n.jr+n.last+'b') 275 | self.__test( 276 | 'a{first} {von} {last} {jr}b', 277 | lambda n: 'a'+self.__clean(' '.join([n.first, n.von, n.last, n.jr]))+'b') 278 | 279 | class CaseTest(unittest.TestCase): 280 | def __test(self, *tests): 281 | for string, want in tests: 282 | self.assertEqual(title_case(string), want, 283 | 'title-casing {!r}'.format(string)) 284 | 285 | def test_basic(self): 286 | self.__test( 287 | ('ABC DEF', 'Abc def'), 288 | ('abc def', 'abc def'), 289 | ('ABC {DEF} GHI', 'Abc {DEF} ghi'), 290 | ('ABC D{E}F GHI', 'Abc d{E}f ghi')) 291 | 292 | def test_colons(self): 293 | self.__test( 294 | ('ABC DEF: GHI JKL', 'Abc def: Ghi jkl'), 295 | ('ABC DEF: GHI JKL', 'Abc def: Ghi jkl'), 296 | ('ABC DEF:GHI JKL', 'Abc def:ghi jkl')) 297 | 298 | def test_special(self): 299 | self.__test( 300 | # Brace groups beginning with special characters are 301 | # lower-cased throughout, even under deeper braces 302 | (r'x {\AE X {X \AE}}', r'x {\ae x {x \ae}}'), 303 | # Unknown control sequences also trigger brace group 304 | # lower-casing, but are themselves left alone 305 | (r'x {\LaTeX X {X} \AE \LaTeX}', r'x {\LaTeX x {x} \ae \LaTeX}'), 306 | # Special characters are only interpreted at level 1 307 | (r'x {{\AE}}', r'x {{\AE}}'), 308 | # If a brace group does not start with a slash, it doesn't 309 | # get touched. 
310 | (r'x {AE X \AE}', r'x {AE X \AE}'), 311 | # Special characters that start at position 0 or after a 312 | # colon are untouched 313 | (r'{\AE X {X} \AE} X', r'{\AE X {X} \AE} x'), 314 | (r'X: {\AE X {X} \AE}', r'X: {\AE X {X} \AE}'), 315 | (r'X:{\ae x {x} \ae}', r'X:{\ae x {x} \ae}')) 316 | 317 | class TeXToUnicodeTest(unittest.TestCase): 318 | def test_simple(self): 319 | self.assertEqual(tex_to_unicode(r'~\%\&\#\$'), '\u00A0%&#$') 320 | self.assertEqual(tex_to_unicode(r'x\ss y\i'), 'xßyı') 321 | 322 | def test_accents(self): 323 | self.assertEqual(tex_to_unicode(r'{\`a}\^{e}'), 'àê') 324 | self.assertEqual(tex_to_unicode(r'\`i\`\i'), 'ìì') 325 | 326 | def test_ligatures(self): 327 | self.assertEqual(tex_to_unicode(r'a--b---c-{-}d'), 'a\u2013b\u2014c--d') 328 | -------------------------------------------------------------------------------- /biblib/algo.py: -------------------------------------------------------------------------------- 1 | """Algorithms for manipulating BibTeX data. 2 | 3 | This module implements various algorithms supplied by BibTeX to style 4 | files, as well as some algorithms to make BibTeX data more accessible 5 | to Python. 6 | """ 7 | 8 | __all__ = ('Name parse_names ' + 9 | 'parse_month ' + 10 | 'title_case ' + 11 | 'TeXProcessor TeXToUnicode tex_to_unicode').split() 12 | 13 | import re 14 | import collections 15 | import unicodedata 16 | import string 17 | 18 | from . import messages 19 | 20 | # Control sequences (defined as "control_seq_ilk" in bibtex) and their 21 | # Unicode translations. This is similar to, but slightly different 22 | # from the TeX definitions (of course). 
23 | _CONTROL_SEQS = { 24 | '\\i': 'ı', '\\j': 'ȷ', '\\oe': 'œ', '\\OE': 'Œ', 25 | '\\ae': 'æ', '\\AE': 'Æ', '\\aa': 'å', '\\AA': 'Å', 26 | '\\o': 'ø', '\\O': 'Ø', '\\l': 'ł', '\\L': 'Ł', '\\ss': 'ß' 27 | } 28 | 29 | class NameParser: 30 | def __init__(self): 31 | pass 32 | 33 | def __depth(self, data): 34 | depth, depths = 0, [0] * len(data) 35 | for pos, ch in enumerate(data): 36 | depths[pos] = depth 37 | if ch == '{': 38 | depth += 1 39 | depths[pos] = depth 40 | elif ch == '}': 41 | depth -= 1 42 | return depths 43 | 44 | def __split_depth0(self, regexp, data, flags=0): 45 | regexp = re.compile(regexp, flags=flags) 46 | depths = self.__depth(data) 47 | parts, last = [], 0 48 | for m in regexp.finditer(data): 49 | if depths[m.start()] == 0: 50 | parts.append(data[last:m.start()]) 51 | last = m.end() 52 | if regexp.groups: 53 | parts.extend(m.groups()) 54 | parts.append(data[last:]) 55 | return parts 56 | 57 | def _first_char(self, data): 58 | """Return the first character of data (in bibtex's sense).""" 59 | # XXX Should this be pulled out as some generic algorithm? 
        # NOTE(review): this is the tail of a NameParser method whose
        # signature precedes this chunk (it is invoked below as
        # self._first_char).  It scans `data' for the first
        # case-bearing character of a BibTeX name token, honoring
        # special characters ({\...} groups).
        pos = 0
        depths = self.__depth(data)
        while True:
            if pos == len(data):
                # No case-bearing character found
                return ''
            elif data[pos].isalpha():
                return data[pos]
            elif data.startswith('{\\', pos):
                # Special character
                pos += 1
                m = re.compile(r'\\[a-zA-Z]+').match(data, pos)
                if m and m.group() in _CONTROL_SEQS:
                    # Known bibtex control sequence
                    return _CONTROL_SEQS[m.group()]
                # Scan for the first alphabetic character
                while pos < len(data) and depths[pos]:
                    if data[pos].isalpha():
                        return data[pos]
                    pos += 1
            elif data[pos] == '{':
                # Skip brace group
                while pos < len(data) and depths[pos]:
                    pos += 1
            else:
                pos += 1

    def __split_von_last(self, toks):
        """Split toks into a ([von tokens], [last tokens]) pair.

        toks alternates name tokens and the connectors between them
        (see the capturing separator passed to __split_depth0 in
        parse), so name tokens sit at even indexes and the scans here
        step by 2.
        """
        # See von_name_ends_and_last_name_starts_stuff
        for von_end in range(len(toks) - 1, 1, -2):
            if self._first_char(toks[von_end - 2]).islower():
                # toks[von_end - 2] is the last lower-case (von)
                # token; the connector at von_end - 1 is dropped
                return (toks[:von_end-1], toks[von_end:])
        # No lower-case token: everything is the last name
        return ([], toks)

    def parse(self, string, pos):
        """Parse a BibTeX name list.

        Returns a list of Name objects.  Raises InputError if there is
        a syntax error.
        """

        # See x_format_name

        # Split names (see name_scan_for_and)
        name_strings = [n.strip() for n in self.__split_depth0(
            '[ \t]and(?=[ \t])', string, flags=re.IGNORECASE)]

        # Process each name
        names = []
        for name_string in name_strings:
            # Remove leading and trailing white space, ~, and -, and
            # trailing commas.
            name_string = name_trailing = name_string.lstrip('-~ \t')
            name_string = name_string.rstrip('-~ \t,')
            if ',' in name_trailing[len(name_string):]:
                # BibTeX warns about this because it often indicates a
                # bigger syntax problem
                pos.warn('trailing comma after name `{}\''.format(name_string))

            # Split on depth-0 commas and further split tokens in each
            # part, keeping only the first connector between each
            # token.
            parts = [self.__split_depth0('([-~ \t])[-~ \t]*', part.strip())
                     for part in self.__split_depth0(',', name_string)]

            # Process name depending on how many commas there were.
            # (These four variables are only ever rebound below, never
            # mutated, so binding them to one shared empty list is
            # safe.)
            first = von = last = jr = []
            if len(parts) == 1:
                # "First von Last"
                toks = parts[0]
                # The von tokens start with the first lower-case token
                # (but cannot start at the last token)
                for von_start in range(0, len(toks) - 2, 2):
                    if self._first_char(toks[von_start]).islower():
                        # Found beginning; now find the end
                        first = toks[:max(0, von_start-1)]
                        von, last = self.__split_von_last(toks[von_start:])
                        break
                else:
                    # No von tokens.  Find hyphen-connected last name
                    # tokens.
                    for last_start in range(len(toks) - 1, -1, -2):
                        if last_start and toks[last_start-1] != '-':
                            break
                    first = toks[:max(0, last_start-1)]
                    last = toks[last_start:]
            elif 2 <= len(parts) <= 3:
                # "von Last, First[, Jr]"
                von, last = self.__split_von_last(parts[0])
                first = parts[1]
                if len(parts) == 3:
                    jr = parts[2]
            else:
                pos.raise_error(
                    'too many commas in name `{}\''.format(name_string))

            names.append(Name(''.join(first), ''.join(von),
                              ''.join(last), ''.join(jr)))
        return names

class Name(collections.namedtuple('Name', 'first von last jr')):
    """A parsed name.

    The name is parsed into first name, "von", last name, and the
    complement (or "jr").  Each component is in uninterpreted form
    (e.g., TeX syntax).  Missing components are set to the empty
    string.
    """

    def is_others(self):
        # True for BibTeX's "and others" sentinel name
        return self.first == '' and self.von == '' and \
            self.last == 'others' and self.jr == ''

    def pretty(self, template='{first} {von} {last} {jr}'):
        """Pretty-print author according to template.

        The template is a 'format' template with the added feature
        that literal text surrounding fields that expand to empty
        strings is prioritized, rather than concatenated.
        Specifically, of the literal text snippets between two
        non-null fields, only the first of the highest priority is
        kept, where non-white space outranks white space outranks the
        empty string.  Literal text before and after the first and
        last fields is always kept.

        Hence, if the template is '{von} {last}, {first}, {jr}' and
        the name has a last and a jr but no von or first, then the
        first comma will be kept and the space and second dropped.  If
        the name has only a von and a last, then both commas will be
        dropped.  If the name has only a last, then all separators
        will be dropped.
        """

        # XXX BibTeX's own format.name$ templates are more
        # sophisticated than this, and it's not clear these are easier
        # to use.  These do have the (dubious) benefit of having
        # access to the usual format machinery.
196 | 197 | def priority(string): 198 | if not string: 199 | return 0 200 | elif string.isspace(): 201 | return 1 202 | return 2 203 | fields = {'first': self.first, 'von': self.von, 204 | 'last': self.last, 'jr': self.jr} 205 | f = string.Formatter() 206 | pieces = [''] 207 | first_field, last_field = 0, -1 208 | leading = trailing = '' 209 | for i, (literal_text, field_name, format_spec, conv) in \ 210 | enumerate(f.parse(template)): 211 | if i == 0: 212 | # Always keep leading text 213 | leading = literal_text 214 | elif field_name is None: 215 | # Always keep trailing test 216 | trailing = literal_text 217 | elif priority(literal_text) > priority(pieces[-1]): 218 | # Overrides previous piece 219 | pieces[-1] = literal_text 220 | 221 | if field_name is not None: 222 | obj, _ = f.get_field(field_name, (), fields) 223 | if not obj: 224 | continue 225 | obj = f.convert_field(obj, conv) 226 | if first_field == 0: 227 | first_field = len(pieces) 228 | last_field = len(pieces) 229 | pieces.extend([f.format_field(obj, format_spec), '']) 230 | # Only keep the pieces between non-null fields 231 | pieces = pieces[first_field:last_field + 1] 232 | return leading + ''.join(pieces) + trailing 233 | 234 | def parse_names(string, pos=messages.Pos.unknown): 235 | """Parse a BibTeX name list (e.g., an author or editor field). 236 | 237 | Returns a list of Name objects. The parsing is equivalent to 238 | BibTeX's built-in "format.name$" function. Raises InputError if 239 | there is a syntax error. 240 | """ 241 | return NameParser().parse(string, pos) 242 | 243 | _MONTHS = 'January February March April May June July August September October November December'.lower().split() 244 | 245 | def parse_month(string, pos=messages.Pos.unknown): 246 | """Parse a BibTeX month field. 247 | 248 | This performs fairly fuzzy parsing that supports all standard 249 | month macro styles (and then some). 250 | 251 | Raises InputError if the field cannot be parsed. 
252 | """ 253 | val = string.strip().rstrip('.').lower() 254 | for i, name in enumerate(_MONTHS): 255 | if name.startswith(val) and len(val) >= 3: 256 | return i + 1 257 | pos.raise_error('invalid month `{}\''.format(string)) 258 | 259 | CS_RE = re.compile(r'\\[a-zA-Z]+') 260 | 261 | def title_case(string, pos=messages.Pos.unknown): 262 | """Convert to title case (like BibTeX's built-in "change.case$"). 263 | 264 | Raises InputError if the title string contains syntax errors. 265 | """ 266 | 267 | # See "@" 268 | out = [] 269 | level, prev_colon, pos = 0, False, 0 270 | while pos < len(string): 271 | keep = (pos == 0 or (prev_colon and string[pos-1] in ' \t\n')) 272 | 273 | if level == 0 and string.startswith('{\\', pos) and not keep: 274 | # Special character 275 | out.append(string[pos]) 276 | pos += 1 277 | level += 1 278 | 279 | while level and pos < len(string): 280 | if string[pos] == '\\': 281 | m = CS_RE.match(string, pos) 282 | if m: 283 | if m.group() in _CONTROL_SEQS: 284 | # Lower case control sequence 285 | out.append(m.group().lower()) 286 | else: 287 | # Unknown control sequence, keep case 288 | out.append(m.group()) 289 | pos = m.end() 290 | continue 291 | elif string[pos] == '{': 292 | level += 1 293 | elif string[pos] == '}': 294 | level -= 1 295 | 296 | # Lower-case non-control sequence 297 | out.append(string[pos].lower()) 298 | pos += 1 299 | 300 | prev_colon = False 301 | continue 302 | 303 | # Handle braces 304 | char = string[pos] 305 | if char == '{': 306 | level += 1 307 | elif char == '}': 308 | if level == 0: 309 | pos.raise_error('unexpected }') 310 | level -= 1 311 | 312 | # Handle colon state 313 | if char == ':': 314 | prev_colon = True 315 | elif char not in ' \t\n': 316 | prev_colon = False 317 | 318 | # Change case of a regular character 319 | if level > 0 or keep: 320 | out.append(string[pos]) 321 | else: 322 | out.append(string[pos].lower()) 323 | pos += 1 324 | 325 | return ''.join(out) 326 | 327 | # A TeX control sequence is 
#
# 1) an active character (subsequent white space is NOT ignored) or,
# 2) a \ followed by either
# 2.1) a sequence of letter-category characters (subsequent white
#      space is ignored), or
# 2.2) a single space-category character (subsequent white space is
#      ignored), or
# 2.3) a single other character (subsequent white space is NOT
#      ignored).
#
# This regexp assumes plain TeX's initial category codes.  Technically
# only ~ and \f are active characters, but we include several other
# special characters that we want to abort on.
#
# Group 1 is the whole token; group 2 matches only cases 2.1/2.2, and
# the (?(2)...) conditional eats trailing white space only for those.
tex_cs_re = re.compile(
    r'([~\f$&#^_]|(\\[a-zA-Z]+|\\[ \t\r\n])|\\.)(?(2)[ \t\r\n]*)')

class TeXProcessor:
    """Base class for simple TeX macro processors.

    This assumes the initial category codes set up by plain.tex (and,
    likewise, LaTeX).
    """

    def process(self, string, pos):
        """Expand active characters and macros in string.

        Raises InputError if it encounters an active character or
        macro it doesn't recognize.
        """

        self.__data = string
        self.__off = 0
        self.__pos = pos

        # Process macros
        while True:
            m = tex_cs_re.search(self.__data, self.__off)
            if not m:
                break
            self.__off = m.end()
            macro = m.group(1)
            # _expand may call _scan_argument, which advances __off
            # past the macro's argument
            nval = self._expand(macro)
            if nval is None:
                if macro.startswith('\\'):
                    pos.raise_error('unknown macro `{}\''.format(macro))
                pos.raise_error(
                    'unknown special character `{}\''.format(macro))
            # Splice the expansion into the buffer and resume
            # scanning just after it (expansions themselves are not
            # re-expanded)
            self.__data = self.__data[:m.start()] + nval + \
                self.__data[self.__off:]
            self.__off = m.start() + len(nval)

        return self.__data

    def _scan_argument(self):
        """Scan and return a macro argument.

        The argument is a brace group (returned without the outer
        braces), a control sequence, or a single character.
        """
        if self.__off >= len(self.__data):
            self.__pos.raise_error('macro argument expected')
        if self.__data[self.__off] == '{':
            start = self.__off
            depth = 0
            # NOTE(review): assumes the group is closed before the end
            # of the buffer; an unbalanced `{' would index past the
            # end — confirm callers guarantee balance.
            while depth or self.__off == start:
                if self.__data[self.__off] == '{':
                    depth += 1
                elif self.__data[self.__off] == '}':
                    depth -= 1
                self.__off += 1
            return self.__data[start + 1:self.__off - 1]
        elif self.__data[self.__off] == '\\':
            m = tex_cs_re.match(self.__data, self.__off)
            self.__off = m.end()
            return m.group(1)
        else:
            arg = self.__data[self.__off]
            self.__off += 1
            return arg

    def _expand(self, cs):
        """Return the expansion of an active character or control sequence.

        Returns None if the sequence is unknown.  This should be
        overridden by sub-classes.
        """
        return None

class TeXToUnicode(TeXProcessor):
    """A simple TeX-to-unicode converter.

    This interprets accents and other special tokens like '--' and
    eliminates braces.
417 | """ 418 | 419 | # Simple TeX-to-Unicode replacements 420 | _SIMPLE = { 421 | # Active characters 422 | '~': '\u00A0', 423 | # chardefs from plain.tex 424 | '\\%': '%', '\\&': '&', '\\#': '#', '\\$': '$', '\\ss': 'ß', 425 | '\\ae': 'æ', '\\oe': 'œ', '\\o': 'ø', 426 | '\\AE': 'Æ', '\\OE': 'Œ', '\\O': 'Ø', 427 | '\\i': 'ı', '\\j': 'ȷ', 428 | '\\aa': 'å', '\\AA': 'Å', '\\l': 'ł', '\\L': 'Ł', 429 | # Other defs from plain.tex 430 | '\\_': '_', '\\dag': '†', '\\ddag': '‡', '\\S': '§', '\\P': '¶', 431 | } 432 | 433 | # TeX accent control sequences to Unicode combining characters 434 | _ACCENTS = { 435 | # Accents defined in plain.tex 436 | '\\`': '\u0300', "\\'": '\u0301', '\\v': '\u030C', '\\u': '\u0306', 437 | '\\=': '\u0304', '\\^': '\u0302', '\\.': '\u0307', '\\H': '\u030B', 438 | '\\~': '\u0303', '\\"': '\u0308', 439 | '\\d': '\u0323', '\\b': '\u0331', '\\c': '\u0327', 440 | # Other accents that seem to be standard, but I can't find 441 | # their definitions 442 | '\\r': '\u030A', '\\k': '\u0328' 443 | } 444 | 445 | def process(self, string, pos): 446 | string = super().process(string, pos) 447 | 448 | # Handle ligatures that are unique to TeX. This must be done 449 | # after macro expansion, but before brace removal because 450 | # braces inhibit ligatures. 451 | string = string.replace('---', '\u2014').replace('--', '\u2013') 452 | 453 | # Remove braces 454 | return string.replace('{', '').replace('}', '') 455 | 456 | def _expand(self, cs): 457 | if cs in self._SIMPLE: 458 | return self._SIMPLE[cs] 459 | if cs in self._ACCENTS: 460 | arg = self._scan_argument() 461 | if len(arg) == 0: 462 | seq, rest = ' ' + self._ACCENTS[cs], '' 463 | elif arg.startswith('\\i') or arg.startswith('\\j'): 464 | # Unicode combining marks should be applied to the 465 | # regular i, not the dotless i. 
466 | seq, rest = arg[1] + self._ACCENTS[cs], arg[2:] 467 | else: 468 | seq, rest = arg[0] + self._ACCENTS[cs], arg[1:] 469 | return unicodedata.normalize('NFC', seq) + rest 470 | return None 471 | 472 | def tex_to_unicode(string, pos=messages.Pos.unknown): 473 | """Convert a BibTeX field value written in TeX to Unicode. 474 | 475 | This interprets accents and other special tokens like '--' and 476 | eliminates braces. Raises InputError if it encounters a macro it 477 | doesn't understand. 478 | 479 | Note that BibTeX's internal understanding of accented characters 480 | (e.g., purify$ and change.case$) is much more limited than TeX's. 481 | This implements something closer to TeX on the assumption that the 482 | goal is to display the string. 483 | """ 484 | 485 | return TeXToUnicode().process(string, pos) 486 | -------------------------------------------------------------------------------- /biblib/bib.py: -------------------------------------------------------------------------------- 1 | """Parser and representation for BibTeX .bib databases. 2 | 3 | This parser is derived directly from the WEB source code for BibTeX -- 4 | especially section "Reading the database file(s)" -- and hence 5 | (barring bugs in translation) should be fully compatible with BibTeX's 6 | own parser. 7 | """ 8 | 9 | __all__ = 'Parser Entry FieldError resolve_crossrefs'.split() 10 | 11 | import sys 12 | import re 13 | import collections 14 | import textwrap 15 | 16 | from . 
# Match sequences of legal identifier characters, except that the
# first is not allowed to be a digit (see id_class)
ID_RE = re.compile('(?![0-9])(?:(?![ \t"#%\'(),={}])[\x20-\x7f])+')
# BibTeX only considers space, tab, and newline to be white space (see
# lex_class)
SPACE_RE = re.compile('[ \t\n]*')

class ParseError(Exception):
    # NOTE(review): not raised anywhere in this chunk; errors are
    # reported through messages.Pos.raise_error instead.
    pass

class Parser:
    """A parser for .bib BibTeX database files."""

    def __init__(self, *, month_style='full'):
        """Initialize an empty database.

        This also initializes standard month macros (which are usually
        provided by the style file).  month_style may be 'full' to get
        full names, 'abbrv' to get abbrv.bst-style abbreviated names,
        or None to not initialize month macros.

        The database should be populated by calling parse one or more
        times.  The final contents of the database can be retrieved by
        calling finalize.
        """

        self.__log, self.__errors = [], False
        self.__entries = collections.OrderedDict()

        if month_style == 'full':
            self.__macros = {'jan': 'January', 'feb': 'February',
                             'mar': 'March', 'apr': 'April',
                             'may': 'May', 'jun': 'June',
                             'jul': 'July', 'aug': 'August',
                             'sep': 'September', 'oct': 'October',
                             'nov': 'November', 'dec': 'December'}
        elif month_style == 'abbrv':
            self.__macros = {'jan': 'Jan.', 'feb': 'Feb.',
                             'mar': 'Mar.', 'apr': 'Apr.',
                             'may': 'May', 'jun': 'June',
                             'jul': 'July', 'aug': 'Aug.',
                             'sep': 'Sept.', 'oct': 'Oct.',
                             'nov': 'Nov.', 'dec': 'Dec.'}
        elif month_style is None:
            # Fixed: compare to None with `is', not `=='
            self.__macros = {}
        else:
            raise ValueError('Unknown month style {}'.format(month_style))

    def string(self, name, value):
        """Declare a macro, just like an @string command."""
        self.__macros[name] = value

    def parse(self, str_or_fp_or_iter, name=None, *, log_fp=None):
        """Parse the contents of str_or_fp_or_iter and return self.

        str_or_fp_or_iter must be a string, a file-like object, or an
        iterable of string or file-like objects to parse in
        succession.  If name is not None, it is used as the file name.
        Otherwise, a name is constructed in a type-appropriate way.

        If log_fp is not None, it must be a file-like object to which
        warnings and InputErrors will be logged.  This logger will be
        attached to all Pos instances created from the file being
        parsed, so any warnings or InputErrors raised from later
        operations on derived objects (like entries or field values)
        will also be logged to log_fp.

        If there are any errors in the input, raises a (potentially
        bundled) InputError.

        Parse can be called multiple times to parse subsequent .bib
        files.  Later files will have access to, for example, strings
        defined in earlier files.
        """

        # Fixed: the collections.Iterable alias was removed in Python
        # 3.10; use the ABC from collections.abc instead.
        from collections.abc import Iterable

        recoverer = messages.InputErrorRecoverer()
        if isinstance(str_or_fp_or_iter, str):
            self.__data = str_or_fp_or_iter
            # NOTE(review): upstream presumably used an angle-bracket
            # placeholder here (e.g. '<string>'); this copy shows the
            # empty string — confirm against upstream before changing.
            fname = name or ''
        elif isinstance(str_or_fp_or_iter, Iterable) and \
                not hasattr(str_or_fp_or_iter, 'read'):
            # An iterable of inputs: parse each in turn, collecting
            # errors so one bad input doesn't stop the rest
            for obj in str_or_fp_or_iter:
                with recoverer:
                    self.parse(obj, name=name, log_fp=log_fp)
            recoverer.reraise()
            return self
        else:
            self.__data = str_or_fp_or_iter.read()
            try:
                fname = name or str_or_fp_or_iter.name
            except AttributeError:
                fname = ''
        self.__off = 0

        # Remove trailing whitespace from lines in data (see input_ln
        # in bibtex.web)
        self.__data = re.sub('[ \t]+$', '', self.__data, flags=re.MULTILINE)
        self.__pos_factory = messages.PosFactory(fname, self.__data, log_fp)

        # Parse entries
        while self.__off < len(self.__data):
            # Just continue to the next entry if there's an error
            with recoverer:
                self._scan_command_or_entry()
        recoverer.reraise()
        return self

    def get_entries(self):
        """Return the entry database.

        The database is an ordered dictionary mapping from lower-cased
        keys to Entry objects.
        """
        return self.__entries

    def _fail(self, msg, off=None):
        # Raise an InputError at offset off (default: the current
        # parse position)
        if off is None:
            off = self.__off
        self.__pos_factory.offset_to_pos(off).raise_error(msg)

    def _warn(self, msg, off=None):
        # Log a warning at offset off (default: the current parse
        # position)
        if off is None:
            off = self.__off
        self.__pos_factory.offset_to_pos(off).warn(msg)

    # Base parsers.  These are the only methods that directly
    # manipulate self.__data.

    def _try_tok(self, regexp, skip_space=True):
        """Scan regexp followed by white space.

        Returns the matched text, or None if the match failed."""
        if isinstance(regexp, str):
            regexp = re.compile(regexp)
        m = regexp.match(self.__data, self.__off)
        if m is None:
            return None
        self.__off = m.end()
        if skip_space:
            self._skip_space()
        return m.group(0)

    def _scan_balanced_text(self, term):
        """Scan brace-balanced text terminated with character term."""
        start, level = self.__off, 0
        while self.__off < len(self.__data):
            char = self.__data[self.__off]
            if level == 0 and char == term:
                # Found the terminator at depth 0; consume it but
                # return only the text before it
                text = self.__data[start:self.__off]
                self.__off += 1
                self._skip_space()
                return text
            elif char == '{':
                level += 1
            elif char == '}':
                level -= 1
                if level < 0:
                    self._fail('unexpected }')
            self.__off += 1
        self._fail('unterminated string')

    def _skip_space(self):
        # This is equivalent to eat_bib_white_space, except that we do
        # it automatically after every token, whereas bibtex carefully
        # and explicitly does it between every token.
        self.__off = SPACE_RE.match(self.__data, self.__off).end()

    # Helpers

    def _tok(self, regexp, fail=None):
        """Scan token regexp or fail with the given message."""
        res = self._try_tok(regexp)
        if res is None:
            assert fail
            self._fail(fail)
        return res

    # Productions

    def _scan_identifier(self):
        return self._tok(ID_RE, 'expected identifier')

    def _scan_command_or_entry(self):
        # See get_bib_command_or_entry_and_process

        # Skip to the next database entry or command
        self._tok('[^@]*')
        pos = self.__pos_factory.offset_to_pos(self.__off)
        if not self._try_tok('@'):
            # No more entries
            return None

        # Scan command or entry type
        typ = self._scan_identifier().lower()

        if typ == 'comment':
            # Believe it or not, BibTeX doesn't do anything with what
            # comes after an @comment, treating it like any other
            # inter-entry noise.
            return None

        left = self._tok('[{(]', 'expected { or ( after entry type')
        right, right_re = (')', '\\)') if left == '(' else ('}', '}')

        if typ == 'preamble':
            # Parse the preamble, but ignore it
            self._scan_field_value()
            self._tok(right_re, 'expected '+right)
            return None

        if typ == 'string':
            name = self._scan_identifier().lower()
            if name in self.__macros:
                self._warn('macro `{}\' redefined'.format(name))
            self._tok('=', 'expected = after string name')
            value = self._scan_field_value()
            self._tok(right_re, 'expected '+right)
            self.__macros[name] = value
            return None

        # Not a command, must be a database entry

        # Scan the entry's database key
        if left == '(':
            # The database key is anything up to a comma, white
            # space, or end-of-line (yes, the key can be empty,
            # and it can include a close paren)
            key = self._tok('[^, \t\n]*')
        else:
            # The database key is anything up to comma, white
            # space, right brace, or end-of-line
            key = self._tok('[^, \t}\n]*')

        # Scan entries (starting with comma or close after key)
        fields = []
        field_pos = {}
        while True:
            if self._try_tok(right_re):
                break
            self._tok(',', 'expected {} or ,'.format(right))
            if self._try_tok(right_re):
                # Trailing comma before the closing delimiter
                break

            # Scan field name and value
            field_off = self.__off
            field = self._scan_identifier().lower()
            self._tok('=', 'expected = after field name')
            value = self._scan_field_value()

            if field in field_pos:
                # Keep the first occurrence of a repeated field.
                # NOTE(review): warns at the entry's position rather
                # than the field's (field_off is available) — confirm
                # this matches bibtex before changing.
                pos.warn('repeated field `{}\''.format(field))
                continue

            fields.append((field, value))
            field_pos[field] = self.__pos_factory.offset_to_pos(field_off)

        if key.lower() in self.__entries:
            self._fail('repeated entry')
        self.__entries[key.lower()] = Entry(fields, typ, key, pos, field_pos)

    def _scan_field_value(self):
        # See scan_and_store_the_field_value_and_eat_white
        value = self._scan_field_piece()
        while self._try_tok('#'):
            # '#' concatenates field pieces
            value += self._scan_field_piece()
        # Compress spaces in the text.  Bibtex does this
        # (painstakingly) as it goes, but the final effect is the same
        # (see check_for_and_compress_bib_white_space).
        value = re.sub('[ \t\n]+', ' ', value)
        # Strip leading and trailing space (literally just space, see
        # @)
        return value.strip(' ')

    def _scan_field_piece(self):
        # Scan one field piece: a bare number, a {...} or "..."
        # string, or a macro name (expanded immediately; unknown
        # macros warn and expand to the empty string).
        # See scan_a_field_token_and_eat_white
        piece = self._try_tok('[0-9]+')
        if piece is not None:
            return piece
        if self._try_tok('{', skip_space=False):
            return self._scan_balanced_text('}')
        if self._try_tok('"', skip_space=False):
            return self._scan_balanced_text('"')
        opos = self.__off
        piece = self._try_tok(ID_RE)
        if piece is not None:
            if piece.lower() not in self.__macros:
                self._warn('unknown macro `{}\''.format(piece), opos)
                return ''
            return self.__macros[piece.lower()]
        self._fail('expected string, number, or macro name')

class FieldError(KeyError):
    """Raised by Entry.__getitem__ for a missing field."""

    def __init__(self, field, entry=None):
        super().__init__(field)
        self.__entry = entry

    def __str__(self):
        return '{}: missing field `{}\''.format(self.__entry, self.args[0])

MONTH_MACROS = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

class Entry(collections.OrderedDict):
    """An entry in a BibTeX database.

    This is an ordered dictionary of fields, plus some additional
    properties: typ gives the type of the entry, such as "journal",
    canonicalized to lower case.  key gives the database entry key
    (case is preserved, but should be ignored for comparisons).  pos
    is a messages.Pos instance giving the position of this entry in
    the database file.  field_pos is a simple dictionary from field
    names to message.Pos instances.

    Field values are as they would be seen by a .bst file: white space
    is cleaned up, but they retain macros, BibTeX-style accents, etc.
    Use algo.tex_to_unicode to interpret field values to user-friendly
    Unicode strings.
333 | """ 334 | 335 | def __init__(self, fields, typ=None, key=None, pos=None, field_pos=None): 336 | super().__init__(fields) 337 | self.typ, self.key, self.pos, self.field_pos = typ, key, pos, field_pos 338 | 339 | def copy(self): 340 | return self.__class__(self, self.typ, self.key, self.pos, self.field_pos) 341 | 342 | def __str__(self): 343 | return '`{}\' at {}'.format(self.key, self.pos) 344 | 345 | def __getitem__(self, field): 346 | try: 347 | return super().__getitem__(field) 348 | except KeyError: 349 | raise FieldError(field, self) from None 350 | 351 | def __eq__(self, o): 352 | """Two Entries are equal if they have the same fields, type, and key.""" 353 | return super().__eq__(o) and self.typ == o.typ and self.key == o.key 354 | 355 | def to_bib(self, *, month_to_macro=True, wrap_width=70): 356 | """Return this entry formatted as a BibTeX .bib entry. 357 | 358 | If month_to_macro is True, attempt to parse month names and 359 | replace them with their standard macro. 360 | 361 | If wrap_width is not None, word wrap the entry at this many 362 | columns (long words and hyphens are not split). 
363 | """ 364 | 365 | lines = ['@%s{%s,' % (self.typ, self.key)] 366 | for k, v in self.items(): 367 | start = ' {:12} = '.format(k) 368 | 369 | if month_to_macro and k == 'month': 370 | try: 371 | macro = MONTH_MACROS[self.month_num() - 1] 372 | except messages.InputError: 373 | pass 374 | else: 375 | lines.append(start + macro + ',') 376 | continue 377 | 378 | if v.isdigit(): 379 | lines.append(start + v + ',') 380 | elif wrap_width is None: 381 | lines.append(start + '{' + v + '},') 382 | else: 383 | lines.append(textwrap.fill( 384 | v, width=wrap_width, 385 | # Keep whitespace formatting as it is 386 | expand_tabs=False, replace_whitespace=False, 387 | # Don't break long things like URLs 388 | break_long_words=False, break_on_hyphens=False, 389 | initial_indent=start + '{', subsequent_indent=' ') + '},') 390 | lines.append('}') 391 | return '\n'.join(lines) 392 | 393 | def resolve_crossref(self, entries): 394 | """Return a new entry with crossref-ed fields incorporated. 395 | 396 | entries must be the database in which to find any crossref-ed 397 | database entries. 398 | """ 399 | if 'crossref' not in self: 400 | return self 401 | nentry = self.copy() 402 | source = entries[self['crossref'].lower()] 403 | if 'crossref' in source: 404 | self.field_pos['crossref'].warn('nested crossref') 405 | for k, v in source.items(): 406 | if k not in nentry: 407 | nentry[k] = v 408 | nentry.field_pos[k] = source.field_pos[k] 409 | del nentry['crossref'] 410 | return nentry 411 | 412 | def date_key(self): 413 | """Return a sort key appropriate for sorting by date. 414 | 415 | Returns a tuple ([year, [month]]) where year and month are 416 | numeric. Raises InputError if the entry has year and/or month 417 | fields, but they are malformed. 
418 | """ 419 | 420 | key = () 421 | year, month = self.get('year'), self.get('month') 422 | if year is not None: 423 | if not year.isdigit(): 424 | self.field_pos['year'].raise_error( 425 | 'invalid year `{}\''.format(year)) 426 | key += (int(year),) 427 | if month is not None: 428 | if year is None: 429 | self.field_pos['month'].raise_error('month without year') 430 | key += (self.month_num(),) 431 | return key 432 | 433 | def authors(self, field='author'): 434 | """Return a list of parsed author names. 435 | 436 | This is a wrapper for biblib.algo.parse_names. 437 | """ 438 | from .algo import parse_names 439 | return parse_names(self[field], self.field_pos[field]) 440 | 441 | def month_num(self, field='month'): 442 | """Convert the month of this entry into a number in [1,12]. 443 | 444 | This is a wrapper for biblib.algo.parse_month (which see). 445 | 446 | Raises KeyError if this entry does not have the specified 447 | field and InputError if the field cannot be parsed. 448 | """ 449 | from .algo import parse_month 450 | return parse_month(self[field], pos=self.field_pos[field]) 451 | 452 | def resolve_crossrefs(db, min_crossrefs=None): 453 | """Resolve cross-referenced entries in db. 454 | 455 | This returns a new database containing the same entries in the 456 | same order as db, but any entries that crossref another entry are 457 | expanded with the fields for the cross-referenced entry. 458 | 459 | If min_crossrefs is not None, then any entry that is 460 | cross-referenced by min_crossrefs or more other entries will *not* 461 | be expanded and entries that cross-reference it will retain their 462 | crossref field. If min_crossrefs is None, entries are always 463 | expanded. (This mimics BibTeX "-min-crossrefs" option.) 464 | 465 | If there are unknown crossrefs, raises a (potentially bundled) 466 | InputError. 
467 | """ 468 | if min_crossrefs is not None: 469 | counts = collections.Counter(entry['crossref'].lower() 470 | for entry in db.values() 471 | if 'crossref' in entry) 472 | else: 473 | counts = None 474 | 475 | key_idx = {k: i for i, k in enumerate(db)} 476 | recoverer = messages.InputErrorRecoverer() 477 | ndb = collections.OrderedDict() 478 | for entry_idx, (key, entry) in enumerate(db.items()): 479 | crossref = entry.get('crossref') 480 | if crossref is None: 481 | ndb[key] = entry 482 | else: 483 | with recoverer: 484 | crossref_idx = key_idx.get(crossref.lower()) 485 | if crossref_idx is None: 486 | entry.field_pos['crossref'].raise_error( 487 | 'unknown crossref `{}\''.format(crossref)) 488 | elif crossref_idx < entry_idx: 489 | entry.field_pos['crossref'].raise_error( 490 | 'crossref `{}\' must come after entry'.format(crossref)) 491 | elif counts and counts[crossref.lower()] >= min_crossrefs: 492 | ndb[key] = entry 493 | else: 494 | ndb[key] = entry.resolve_crossref(db) 495 | recoverer.reraise() 496 | return ndb 497 | --------------------------------------------------------------------------------