├── MANIFEST.in ├── biblib ├── __init__.py ├── messages.py ├── test.py ├── algo.py └── bib.py ├── .gitignore ├── setup.py ├── LICENSE ├── examples ├── bib2bib └── bibparse └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /biblib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = 'bib algo messages'.split() 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | # Files created by setup.py 4 | /build/ 5 | /dist/ 6 | MANIFEST 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='biblib', 5 | version='0.1.0', 6 | description='Simple, correct BibTeX parser and algorithms', 7 | url='https://github.com/aclements/biblib', 8 | author='Austin Clements', 9 | author_email='aclements@csail.mit.edu', 10 | packages=['biblib'], 11 | keywords=['bibtex', 'tex'], 12 | classifiers=[ 13 | 'Development Status :: 4 - Beta', 14 | 'Intended Audience :: Developers', 15 | 'Intended Audience :: Science/Research', 16 | 'License :: OSI Approved :: MIT License', 17 | 'Programming Language :: Python', 18 | 'Programming Language :: Python :: 3', 19 | 'Topic :: Database', 20 | 'Topic :: Text Processing', 21 | ], 22 | long_description=open('README.md').read(), 23 | ) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Austin Clements 2 | 3 | Permission is hereby granted, free of charge, 
to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /examples/bib2bib: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import biblib.bib 4 | import argparse 5 | import sys 6 | 7 | def main(): 8 | arg_parser = argparse.ArgumentParser( 9 | description='Flatten macros, combine, and pretty-print .bib database(s)') 10 | arg_parser.add_argument('bib', nargs='+', help='.bib file(s) to process', 11 | type=open) 12 | arg_parser.add_argument('--min-crossrefs', type=int, 13 | help='minimum number of cross-referencing entries' 14 | ' required to expand a crossref; if omitted, no' 15 | ' expansion occurs', default=None) 16 | args = arg_parser.parse_args() 17 | 18 | try: 19 | # Load databases 20 | db = biblib.bib.Parser().parse(args.bib, log_fp=sys.stderr).get_entries() 21 | 22 | # Optionally resolve cross-references 23 | if args.min_crossrefs is not None: 24 | db = biblib.bib.resolve_crossrefs( 25 | db, min_crossrefs=args.min_crossrefs) 26 | except biblib.messages.InputError: 27 | sys.exit(1) 28 | 29 | # Pretty-print entries 30 | for ent in db.values(): 31 | print(ent.to_bib()) 32 | print() 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /examples/bibparse: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import biblib.bib 4 | import biblib.messages 5 | import biblib.algo 6 | import argparse 7 | import sys 8 | import re 9 | 10 | MONTHS = 'January February March April May June July August September October November December'.split() 11 | 12 | def main(): 13 | arg_parser = argparse.ArgumentParser( 14 | description='Parse .bib database(s) and print basic fields as text') 15 | arg_parser.add_argument('bib', nargs='+', help='.bib file(s) to process', 16 | type=open) 17 | args = arg_parser.parse_args() 18 | 19 | try: 20 | # Load 
databases 21 | db = biblib.bib.Parser().parse(args.bib, log_fp=sys.stderr).get_entries() 22 | 23 | # Resolve cross-references 24 | db = biblib.bib.resolve_crossrefs(db) 25 | 26 | # Print entries 27 | recoverer = biblib.messages.InputErrorRecoverer() 28 | for ent in db.values(): 29 | with recoverer: 30 | print_entry(ent) 31 | recoverer.reraise() 32 | except biblib.messages.InputError: 33 | sys.exit(1) 34 | 35 | def print_entry(ent): 36 | print('{ent.key} ({ent.typ}):'.format(ent=ent)) 37 | if 'title' in ent: 38 | print(' ' + biblib.algo.tex_to_unicode(biblib.algo.title_case( 39 | ent['title'], pos=ent.field_pos['title']))) 40 | 41 | if 'author' in ent: 42 | authors = [ 43 | biblib.algo.tex_to_unicode(author.pretty(), 44 | pos=ent.field_pos['author']) 45 | for author in ent.authors()] 46 | if len(authors) == 0: 47 | author = None 48 | elif len(authors) == 1: 49 | author = authors[0] 50 | else: 51 | author = ', '.join(authors[:-1]) 52 | if len(authors) > 2: 53 | author += ',' 54 | if ent.authors()[-1].is_others(): 55 | author += ' et al.' 56 | else: 57 | author += ' and ' + authors[-1] 58 | if author: 59 | print(' By ' + author) 60 | 61 | if 'year' in ent: 62 | if 'month' in ent: 63 | mnum = ent.month_num() 64 | print(' {} {}'.format(MONTHS[mnum - 1], ent['year'])) 65 | else: 66 | print(' {}'.format(ent['year'])) 67 | 68 | print() 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Biblib provides a simple, standalone Python3 package for parsing 2 | BibTeX bibliographic databases, as well as algorithms for manipulating 3 | BibTeX entries in BibTeX-y ways. 4 | 5 | There are a lot of BibTeX parsers out there. Most of them are 6 | complete nonsense based on some imaginary grammar made up by the 7 | module's author that is almost, but not quite, entirely unlike 8 | BibTeX's actual grammar. 
*BibTeX has a grammar*. It's even pretty 9 | simple, though it's probably not what you think it is. The hardest 10 | part of BibTeX's grammar is that it's only written down in one place: 11 | the BibTeX source code. 12 | 13 | Biblib's parser is derived directly from the WEB source code for 14 | BibTeX and hence (barring bugs in translation) should be fully 15 | compatible with BibTeX's own parser. 16 | 17 | 18 | Features 19 | -------- 20 | 21 | * BibTeX-compatible `.bib` file parser 22 | 23 | * BibTeX-compatible name parser for fields like `author` 24 | 25 | * Crossref resolution 26 | 27 | * BibTeX-compatible title casing 28 | 29 | * Translator for common TeX markup (like accents) to Unicode (which 30 | can, in turn, be used in HTML and other formats). 31 | 32 | 33 | Installation 34 | ------------ 35 | 36 | Since biblib has no external dependencies or C modules, you can use 37 | biblib in your project by simply unpacking it under your source tree 38 | and adding 39 | 40 | sys.path.append('biblib') 41 | 42 | before importing it. 43 | 44 | Biblib can also be installed system-wide with 45 | 46 | python3 setup.py install 47 | 48 | 49 | Examples 50 | -------- 51 | 52 | There are a few simple examples of biblib's use in `examples/`. To 53 | run these dircetly from the source tree, use, for example 54 | 55 | PYTHONPATH=$PWD ./examples/bibparse test.bib 56 | 57 | 58 | Recognized grammar 59 | ------------------ 60 | 61 | For reference, the `.bib` parser implements a grammar equivalent to 62 | the following PEG. All literals are matched case-*insensitively*. 
63 | 64 | bib_db = comment (command_or_entry comment)* 65 | 66 | comment = [^@]* 67 | 68 | ws = [ \t\n]* 69 | 70 | ident = ![0-9] (![ \t"#%'(),={}] [\x20-\x7f])+ 71 | 72 | command_or_entry = '@' ws (comment / preamble / string / entry) 73 | 74 | comment = 'comment' 75 | 76 | preamble = 'preamble' ws ( '{' ws preamble_body ws '}' 77 | / '(' ws preamble_body ws ')' ) 78 | 79 | preamble_body = value 80 | 81 | string = 'string' ws ( '{' ws string_body ws '}' 82 | / '(' ws string_body ws ')' ) 83 | 84 | string_body = ident ws '=' ws value 85 | 86 | entry = ident ws ( '{' ws key ws entry_body? ws '}' 87 | / '(' ws key_paren ws entry_body? ws ')' ) 88 | 89 | key = [^, \t}\n]* 90 | 91 | key_paren = [^, \t\n]* 92 | 93 | entry_body = (',' ws ident ws '=' ws value ws)* ','? 94 | 95 | value = piece (ws '#' ws piece)* 96 | 97 | piece 98 | = [0-9]+ 99 | / '{' balanced* '}' 100 | / '"' (!'"' balanced)* '"' 101 | / ident 102 | 103 | balanced 104 | = '{' balanced* '}' 105 | / [^{}] 106 | -------------------------------------------------------------------------------- /biblib/messages.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import threading 3 | import warnings 4 | 5 | class Pos(collections.namedtuple('Pos', 'fname line col log_fp')): 6 | """A position in a file. 7 | 8 | This also optionally tracks a file-like object for logging 9 | warnings and errors associated with this file. 10 | """ 11 | 12 | def __str__(self): 13 | return '{}:{}:{}'.format(self.fname, self.line, self.col) 14 | 15 | def warn(self, msg): 16 | """Log msg to this Pos's logger. 17 | 18 | If log_fp is None, the warning is silently discarded. 
19 | """ 20 | if self.log_fp is not None: 21 | self.log_fp.write('{}: warning: {}\n'.format(self, msg)) 22 | 23 | def raise_error(self, msg): 24 | """Log and raise InputError([(self, msg)]).""" 25 | if self.log_fp is not None: 26 | self.log_fp.write('{}: error: {}\n'.format(self, msg)) 27 | raise InputError([(self, msg)]) 28 | 29 | Pos.unknown = Pos('', 1, 0, None) 30 | 31 | class PosFactory: 32 | """A factory that translates character offsets to Pos instances.""" 33 | 34 | def __init__(self, fname, string, log_fp=None): 35 | self.__fname = fname 36 | self.__string = string 37 | self.__log_fp = log_fp 38 | self.__cache = (0, 1, 0) 39 | 40 | def offset_to_pos(self, offset): 41 | last_off, last_line, last_col = self.__cache 42 | if last_off < offset: 43 | last_off, last_line, last_col = 0, 1, 0 44 | 45 | line = self.__string.count('\n', last_off, offset) + last_line 46 | lastnl = self.__string.rfind('\n', last_off, offset) 47 | if lastnl == -1: 48 | col = last_col + (offset - last_off) 49 | else: 50 | col = offset - lastnl - 1 51 | self.__cache = (offset, line, col) 52 | 53 | return Pos(self.__fname, line, col, self.__log_fp) 54 | 55 | class InputError(ValueError): 56 | """One or more errors with associated Pos instances. 57 | 58 | An InputError representing multiple errors is a "bundled" error. 59 | These can be created when recovering from errors with an 60 | InputErrorRecoverer. 61 | """ 62 | 63 | def __init__(self, list_of_pos_msg): 64 | super().__init__(list_of_pos_msg) 65 | 66 | def __str__(self): 67 | list_of_msg_pos = self.args[0] 68 | if len(list_of_msg_pos) == 1: 69 | return '1 error' 70 | return '{} errors'.format(len(list_of_msg_pos)) 71 | 72 | class InputErrorRecoverer: 73 | """A context manager for recovering from and bundling InputErrors. 74 | 75 | This context manager catches and collects InputErrors, effectively 76 | recovering from InputErrors at the end of the with block. 
An 77 | InputErrorRecoverer can be used several times, after which 78 | collected errors can be re-raised as a bundled InputError. For 79 | example, a caller that wishes to parse several files without 80 | stopping because of an error in one file can do something like 81 | this: 82 | 83 | recoverer = InputErrorRecoverer() 84 | for filename in filenames: 85 | with recoverer: 86 | parse(filename) 87 | recoverer.reraise() 88 | 89 | An InputErrorRecoverer *must* be either reraised or disposed (even 90 | if no errors occurred). Otherwise a UserWarning will be issued. 91 | """ 92 | 93 | def __init__(self): 94 | self.__errors = [] 95 | 96 | def __enter__(self): 97 | return None 98 | 99 | def __exit__(self, exc_type, exc_value, traceback): 100 | if self.__errors is None: 101 | raise ValueError('InputErrorRecoverer already disposed') 102 | if isinstance(exc_value, InputError): 103 | self.__errors.extend(exc_value.args) 104 | return True 105 | 106 | def __del__(self): 107 | if self.__errors is not None: 108 | try: 109 | warnings.warn('InputErrorRecoverer must be reraised or disposed', 110 | stacklevel=2) 111 | except TypeError as e: 112 | # If Python is exiting, warnings.warn has a habit of 113 | # raising TypeError("'NoneType' object is not 114 | # iterable",). Ignore it. 115 | pass 116 | 117 | def reraise(self): 118 | """If any errors have been collected, raise a bundled InputError.""" 119 | errors = self.__errors 120 | self.dispose() 121 | if errors: 122 | raise InputError(errors) 123 | 124 | def dispose(self): 125 | """Discard all collected errors.""" 126 | self.__errors = None 127 | -------------------------------------------------------------------------------- /biblib/test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import collections 3 | import io 4 | from .bib import * 5 | from .algo import * 6 | from .messages import * 7 | from . 
import algo 8 | 9 | def od(*args): 10 | return collections.OrderedDict(zip(args[::2], args[1::2])) 11 | 12 | def ent(typ, key, fields): 13 | return Entry(fields, typ, key) 14 | 15 | class BibParserTest(unittest.TestCase): 16 | def __test_parse(self, string, ents): 17 | # Parse string and get entries 18 | got = list(Parser().parse(string).get_entries().values()) 19 | self.assertEqual(got, ents) 20 | 21 | def test_basic(self): 22 | self.__test_parse( 23 | '@misc{x, title="title", author={author}}\n@misc{y, title=123,}', 24 | [ent('misc', 'x', od('title', 'title', 25 | 'author', 'author')), 26 | ent('misc', 'y', od('title', '123'))]) 27 | 28 | def test_balanced(self): 29 | self.__test_parse( 30 | '@misc{x, title={a{b}c}, author="x{y}z"}', 31 | [ent('misc', 'x', od('title', 'a{b}c', 32 | 'author', 'x{y}z'))]) 33 | 34 | def test_whitespace(self): 35 | self.__test_parse( 36 | ' @ misc { x , title = {a} , }', 37 | [ent('misc', 'x', od('title', 'a'))]) 38 | 39 | def test_compress(self): 40 | self.__test_parse( 41 | '@misc{x, title={ a\t b\n c }}', 42 | [ent('misc', 'x', od('title', 'a b c'))]) 43 | 44 | def test_funny_keys(self): 45 | self.__test_parse( 46 | '@misc{@"#%\'()=, title="a"}', 47 | [ent('misc', '@"#%\'()=', od('title', 'a'))]) 48 | for string in ['@misc{}', '@misc{,}', '@misc{\n}']: 49 | self.__test_parse(string, [ent('misc', '', od())]) 50 | self.__test_parse( 51 | '@misc{,title="a"}', 52 | [ent('misc', '', od('title', 'a'))]) 53 | self.__test_parse( 54 | '@misc(k)ey, title="a")\n@misc{k{ey, title="a"}', 55 | [ent('misc', 'k)ey', od('title', 'a')), 56 | ent('misc', 'k{ey', od('title', 'a'))]) 57 | 58 | def test_string(self): 59 | self.__test_parse( 60 | '@string{foo = {a}}\n@misc{x, title = foo # "b" # foo # 2}', 61 | [ent('misc', 'x', od('title', 'aba2'))]) 62 | 63 | def test_comment(self): 64 | self.__test_parse( 65 | # Braces intentionally unbalanced, everything on one line 66 | '@comment{abc@misc{x}', 67 | [ent('misc', 'x', od())]) 68 | 69 | class 
EntryTest(unittest.TestCase): 70 | def test_to_bib(self): 71 | entry = Entry([('author', 'An Author'), 72 | ('title', 'This is a ' + 'really '*10 + 'long title'), 73 | ('month', 'November'), ('year', '2013')], 74 | typ='misc', key='key', field_pos={'month': Pos.unknown}) 75 | self.assertEqual( 76 | entry.to_bib(), 77 | '''\ 78 | @misc{key, 79 | author = {An Author}, 80 | title = {This is a really really really really really really 81 | really really really really long title}, 82 | month = nov, 83 | year = 2013, 84 | }''') 85 | self.assertEqual( 86 | entry.to_bib(month_to_macro=False, wrap_width=None), 87 | '''\ 88 | @misc{key, 89 | author = {An Author}, 90 | title = {This is a really really really really really really really really really really long title}, 91 | month = {November}, 92 | year = 2013, 93 | }''') 94 | 95 | def test_month_num(self): 96 | def test(string, expect): 97 | entry = Entry([('month', string)], field_pos={'month': Pos.unknown}) 98 | self.assertEqual(entry.month_num(), expect) 99 | for i, name in enumerate(['Jan.','Feb.','Mar.','Apr.','May','June', 100 | 'July','Aug.','Sept.','Oct.','Nov.','Dec.']): 101 | test(name, i+1) 102 | for i, name in enumerate(['January','February','March','April', 103 | 'May','June','July','August', 104 | 'September','October','November','December']): 105 | test(name, i+1) 106 | self.assertRaises(InputError, test, 'Foo', None) 107 | self.assertRaises(InputError, test, 'Janruary', None) 108 | 109 | def test_date_key(self): 110 | def test(year, month, expect): 111 | fields = [] 112 | if year: fields.append(('year', year)) 113 | if month: fields.append(('month', month)) 114 | entry = Entry(fields, field_pos={'year' : Pos.unknown, 115 | 'month' : Pos.unknown}) 116 | got = entry.date_key() 117 | if expect is not None: 118 | self.assertEqual(got, expect) 119 | test(None, None, ()) 120 | test('2013', None, (2013,)) 121 | test('2013', 'jan', (2013, 1)) 122 | self.assertRaises(InputError, test, 'x', None, None) 123 | 
self.assertRaises(InputError, test, None, 'jan', None) 124 | self.assertRaises(InputError, test, '2013', 'foo', None) 125 | 126 | class CrossRefTest(unittest.TestCase): 127 | def setUp(self): 128 | self.parser = Parser().parse("""\ 129 | @misc{ent1, title={Title 1}, crossref={ent2}} 130 | @misc{ent2, title={Title 2}, booktitle={Book title 2}}""") 131 | 132 | def test_basic(self): 133 | db = resolve_crossrefs(self.parser.get_entries()) 134 | self.assertEqual( 135 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 136 | ('booktitle', 'Book title 2')])), 137 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 138 | ('booktitle', 'Book title 2')]))], 139 | list(db.items())) 140 | 141 | def test_min_crossrefs(self): 142 | db = resolve_crossrefs(self.parser.get_entries(), min_crossrefs=1) 143 | self.assertEqual( 144 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 145 | ('crossref', 'ent2')])), 146 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 147 | ('booktitle', 'Book title 2')]))], 148 | list(db.items())) 149 | 150 | db = resolve_crossrefs(self.parser.get_entries(), min_crossrefs=2) 151 | self.assertEqual( 152 | [('ent1', ent('misc', 'ent1', [('title', 'Title 1'), 153 | ('booktitle', 'Book title 2')])), 154 | ('ent2', ent('misc', 'ent2', [('title', 'Title 2'), 155 | ('booktitle', 'Book title 2')]))], 156 | list(db.items())) 157 | 158 | def test_self_crossref(self): 159 | # This is accepted, believe it or not (though BibTeX warns) 160 | log = io.StringIO() 161 | self.parser.parse("@misc{ent3, title={Title 3}, crossref={ent3}}", 162 | name='', log_fp=log) 163 | resolve_crossrefs(self.parser.get_entries()) 164 | self.assertIn(':1:', log.getvalue()) 165 | 166 | def test_bad_order(self): 167 | self.parser.parse("""\ 168 | @misc{ent3, title={Title 3}, crossref={ent2}}""") 169 | with self.assertRaises(InputError) as ar: 170 | resolve_crossrefs(self.parser.get_entries()) 171 | self.assertEqual(1, len(ar.exception.args[0])) 172 | 173 | class 
NameParserTest(unittest.TestCase): 174 | def test_first_char(self): 175 | p = algo.NameParser() 176 | for check, expect in [('abc', 'a'), ('ABC', 'A'), (' abc', 'a'), 177 | ('\\abc', 'a'), ('{a} bc', 'b'), 178 | ('{\\`a}bc', 'a'), ('{\\aa}bc', 'å'), 179 | ('{\\a}bc', 'a')]: 180 | self.assertEqual(p._first_char(check), expect) 181 | 182 | def test_and(self): 183 | p = parse_names 184 | self.assertEqual(p('A B and C D'), 185 | [Name('A', '', 'B', ''), Name('C', '', 'D', '')]) 186 | self.assertEqual(p('A B AND C D'), 187 | [Name('A', '', 'B', ''), Name('C', '', 'D', '')]) 188 | self.assertEqual(p('A B and and C D'), 189 | [Name('A', '', 'B', ''), Name('', '', '', ''), 190 | Name('C', '', 'D', '')]) 191 | self.assertEqual(p('A B and and'), 192 | [Name('A', '', 'B', ''), Name('', '', 'and', '')]) 193 | self.assertEqual(p('A B { and } C D'), 194 | [Name('A B { and } C', '', 'D', '')]) 195 | self.assertEqual(p('A B {\\ and } C D'), 196 | [Name('A B', '{\\ and }', 'C D', '')]) 197 | 198 | def __test_names(self, *tests): 199 | for test in tests: 200 | if len(test) == 4: 201 | test = test + ('',) 202 | names = parse_names(test[0]) 203 | self.assertEqual(names, [Name(*test[1:])], 204 | 'parsing {!r}'.format(test[0])) 205 | 206 | def test_first_von_last(self): 207 | # Examples mostly from Nicolas Markey's Tame the BeaST 208 | self.__test_names( 209 | ('jean de la fontaine', '', 'jean de la', 'fontaine'), 210 | ('Jean de la fontaine', 'Jean', 'de la', 'fontaine'), 211 | ('Jean De La fontaine', 'Jean De La', '', 'fontaine'), 212 | ('Jean {de} la fontaine', 'Jean {de}', 'la', 'fontaine'), 213 | ('jean {de} {la} fontaine', '', 'jean', '{de} {la} fontaine'), 214 | ('Jean {de} {la} fontaine', 'Jean {de} {la}', '', 'fontaine'), 215 | ('Jean De La Fontaine', 'Jean De La', '', 'Fontaine'), 216 | ('jean De la Fontaine', '', 'jean De la', 'Fontaine'), 217 | ('Jean de La Fontaine', 'Jean', 'de', 'La Fontaine'), 218 | ('Jean-Baptiste Poquelin', 'Jean-Baptiste', '', 'Poquelin'), 219 | 
('Jean-Baptiste-Poquelin', '', '', 'Jean-Baptiste-Poquelin'), 220 | ('Jean- Baptiste-Poquelin', '', '', 'Jean-Baptiste-Poquelin'), 221 | ('Jean Baptiste Poquelin', 'Jean Baptiste', '', 'Poquelin'), 222 | ('Jean Baptiste-Poquelin', 'Jean', '', 'Baptiste-Poquelin'), 223 | ('Jean Baptiste~Poquelin', 'Jean Baptiste', '', 'Poquelin'), 224 | ('Jean-baptiste Poquelin', 'Jean', 'baptiste', 'Poquelin')) 225 | 226 | def test_von_last_first(self): 227 | self.__test_names( 228 | ('de la fontaine, Jean', 'Jean', 'de la', 'fontaine'), 229 | ('De La Fontaine, Jean', 'Jean', '', 'De La Fontaine'), 230 | ('De la Fontaine, Jean', 'Jean', 'De la', 'Fontaine'), 231 | ('de La Fontaine, Jean', 'Jean', 'de', 'La Fontaine'), 232 | ('{D}e {L}a Cruz, Maria', 'Maria', '{D}e {L}a', 'Cruz')) 233 | 234 | def test_von_last_jr_first(self): 235 | self.__test_names( 236 | ('de la fontaine, Jean, Jr', 'Jean', 'de la', 'fontaine', 'Jr'), 237 | ('De La Fontaine, Jean, Jr', 'Jean', '', 'De La Fontaine', 'Jr'), 238 | ('De la Fontaine, Jean, Jr', 'Jean', 'De la', 'Fontaine', 'Jr'), 239 | ('de La Fontaine, Jean, Jr', 'Jean', 'de', 'La Fontaine', 'Jr')) 240 | 241 | class NamePrettyTest(unittest.TestCase): 242 | def __test(self, template, gen): 243 | for i in range(0x10): 244 | name = Name('f' if i & 1 else '', 245 | 'v' if i & 2 else '', 246 | 'l' if i & 4 else '', 247 | 'j' if i & 8 else '') 248 | got = name.pretty(template) 249 | self.assertEqual(gen(name), got) 250 | 251 | def __clean(self, string): 252 | import re 253 | string = re.sub(' +', ' ', string) 254 | string = re.sub(' *(, )+', ', ', string) 255 | return string.strip(' ,') 256 | 257 | def test_basic(self): 258 | self.__test( 259 | '{first}{von}{jr}{last}', 260 | lambda n: n.first+n.von+n.jr+n.last) 261 | self.__test( 262 | '{first} {von} {last} {jr}', 263 | lambda n: self.__clean(' '.join([n.first, n.von, n.last, n.jr]))) 264 | self.__test( 265 | '{von} {last}, {first}', 266 | lambda n: self.__clean('{0.von} {0.last}, {0.first}'.format(n))) 
267 | self.__test( 268 | '{von} {last}, {first}, {jr}', 269 | lambda n: self.__clean('{0.von} {0.last}, {0.first}, {0.jr}'.format(n))) 270 | 271 | def test_before_and_after(self): 272 | self.__test( 273 | 'a{first}{von}{jr}{last}b', 274 | lambda n: 'a'+n.first+n.von+n.jr+n.last+'b') 275 | self.__test( 276 | 'a{first} {von} {last} {jr}b', 277 | lambda n: 'a'+self.__clean(' '.join([n.first, n.von, n.last, n.jr]))+'b') 278 | 279 | class CaseTest(unittest.TestCase): 280 | def __test(self, *tests): 281 | for string, want in tests: 282 | self.assertEqual(title_case(string), want, 283 | 'title-casing {!r}'.format(string)) 284 | 285 | def test_basic(self): 286 | self.__test( 287 | ('ABC DEF', 'Abc def'), 288 | ('abc def', 'abc def'), 289 | ('ABC {DEF} GHI', 'Abc {DEF} ghi'), 290 | ('ABC D{E}F GHI', 'Abc d{E}f ghi')) 291 | 292 | def test_colons(self): 293 | self.__test( 294 | ('ABC DEF: GHI JKL', 'Abc def: Ghi jkl'), 295 | ('ABC DEF: GHI JKL', 'Abc def: Ghi jkl'), 296 | ('ABC DEF:GHI JKL', 'Abc def:ghi jkl')) 297 | 298 | def test_special(self): 299 | self.__test( 300 | # Brace groups beginning with special characters are 301 | # lower-cased throughout, even under deeper braces 302 | (r'x {\AE X {X \AE}}', r'x {\ae x {x \ae}}'), 303 | # Unknown control sequences also trigger brace group 304 | # lower-casing, but are themselves left alone 305 | (r'x {\LaTeX X {X} \AE \LaTeX}', r'x {\LaTeX x {x} \ae \LaTeX}'), 306 | # Special characters are only interpreted at level 1 307 | (r'x {{\AE}}', r'x {{\AE}}'), 308 | # If a brace group does not start with a slash, it doesn't 309 | # get touched. 
310 | (r'x {AE X \AE}', r'x {AE X \AE}'), 311 | # Special characters that start at position 0 or after a 312 | # colon are untouched 313 | (r'{\AE X {X} \AE} X', r'{\AE X {X} \AE} x'), 314 | (r'X: {\AE X {X} \AE}', r'X: {\AE X {X} \AE}'), 315 | (r'X:{\ae x {x} \ae}', r'X:{\ae x {x} \ae}')) 316 | 317 | class TeXToUnicodeTest(unittest.TestCase): 318 | def test_simple(self): 319 | self.assertEqual(tex_to_unicode(r'~\%\&\#\$'), '\u00A0%&#$') 320 | self.assertEqual(tex_to_unicode(r'x\ss y\i'), 'xßyı') 321 | 322 | def test_accents(self): 323 | self.assertEqual(tex_to_unicode(r'{\`a}\^{e}'), 'àê') 324 | self.assertEqual(tex_to_unicode(r'\`i\`\i'), 'ìì') 325 | 326 | def test_ligatures(self): 327 | self.assertEqual(tex_to_unicode(r'a--b---c-{-}d'), 'a\u2013b\u2014c--d') 328 | -------------------------------------------------------------------------------- /biblib/algo.py: -------------------------------------------------------------------------------- 1 | """Algorithms for manipulating BibTeX data. 2 | 3 | This module implements various algorithms supplied by BibTeX to style 4 | files, as well as some algorithms to make BibTeX data more accessible 5 | to Python. 6 | """ 7 | 8 | __all__ = ('Name parse_names ' + 9 | 'parse_month ' + 10 | 'title_case ' + 11 | 'TeXProcessor TeXToUnicode tex_to_unicode').split() 12 | 13 | import re 14 | import collections 15 | import unicodedata 16 | import string 17 | 18 | from . import messages 19 | 20 | # Control sequences (defined as "control_seq_ilk" in bibtex) and their 21 | # Unicode translations. This is similar to, but slightly different 22 | # from the TeX definitions (of course). 
23 | _CONTROL_SEQS = { 24 | '\\i': 'ı', '\\j': 'ȷ', '\\oe': 'œ', '\\OE': 'Œ', 25 | '\\ae': 'æ', '\\AE': 'Æ', '\\aa': 'å', '\\AA': 'Å', 26 | '\\o': 'ø', '\\O': 'Ø', '\\l': 'ł', '\\L': 'Ł', '\\ss': 'ß' 27 | } 28 | 29 | class NameParser: 30 | def __init__(self): 31 | pass 32 | 33 | def __depth(self, data): 34 | depth, depths = 0, [0] * len(data) 35 | for pos, ch in enumerate(data): 36 | depths[pos] = depth 37 | if ch == '{': 38 | depth += 1 39 | depths[pos] = depth 40 | elif ch == '}': 41 | depth -= 1 42 | return depths 43 | 44 | def __split_depth0(self, regexp, data, flags=0): 45 | regexp = re.compile(regexp, flags=flags) 46 | depths = self.__depth(data) 47 | parts, last = [], 0 48 | for m in regexp.finditer(data): 49 | if depths[m.start()] == 0: 50 | parts.append(data[last:m.start()]) 51 | last = m.end() 52 | if regexp.groups: 53 | parts.extend(m.groups()) 54 | parts.append(data[last:]) 55 | return parts 56 | 57 | def _first_char(self, data): 58 | """Return the first character of data (in bibtex's sense).""" 59 | # XXX Should this be pulled out as some generic algorithm? 
        # NOTE(review): this is the tail of a NameParser method whose
        # signature precedes this chunk (it is invoked below as
        # self._first_char).  It scans `data' for the first
        # case-bearing character of a BibTeX name token, honoring
        # special characters ({\...} groups).
        pos = 0
        depths = self.__depth(data)
        while True:
            if pos == len(data):
                # No case-bearing character found
                return ''
            elif data[pos].isalpha():
                return data[pos]
            elif data.startswith('{\\', pos):
                # Special character
                pos += 1
                m = re.compile(r'\\[a-zA-Z]+').match(data, pos)
                if m and m.group() in _CONTROL_SEQS:
                    # Known bibtex control sequence
                    return _CONTROL_SEQS[m.group()]
                # Scan for the first alphabetic character
                while pos < len(data) and depths[pos]:
                    if data[pos].isalpha():
                        return data[pos]
                    pos += 1
            elif data[pos] == '{':
                # Skip brace group
                while pos < len(data) and depths[pos]:
                    pos += 1
            else:
                pos += 1

    def __split_von_last(self, toks):
        """Split toks into a ([von tokens], [last tokens]) pair.

        toks alternates name tokens and the connectors between them
        (see the capturing separator passed to __split_depth0 in
        parse), so name tokens sit at even indexes and the scans here
        step by 2.
        """
        # See von_name_ends_and_last_name_starts_stuff
        for von_end in range(len(toks) - 1, 1, -2):
            if self._first_char(toks[von_end - 2]).islower():
                # toks[von_end - 2] is the last lower-case (von)
                # token; the connector at von_end - 1 is dropped
                return (toks[:von_end-1], toks[von_end:])
        # No lower-case token: everything is the last name
        return ([], toks)

    def parse(self, string, pos):
        """Parse a BibTeX name list.

        Returns a list of Name objects.  Raises InputError if there is
        a syntax error.
        """

        # See x_format_name

        # Split names (see name_scan_for_and)
        name_strings = [n.strip() for n in self.__split_depth0(
            '[ \t]and(?=[ \t])', string, flags=re.IGNORECASE)]

        # Process each name
        names = []
        for name_string in name_strings:
            # Remove leading and trailing white space, ~, and -, and
            # trailing commas.
            name_string = name_trailing = name_string.lstrip('-~ \t')
            name_string = name_string.rstrip('-~ \t,')
            if ',' in name_trailing[len(name_string):]:
                # BibTeX warns about this because it often indicates a
                # bigger syntax problem
                pos.warn('trailing comma after name `{}\''.format(name_string))

            # Split on depth-0 commas and further split tokens in each
            # part, keeping only the first connector between each
            # token.
            parts = [self.__split_depth0('([-~ \t])[-~ \t]*', part.strip())
                     for part in self.__split_depth0(',', name_string)]

            # Process name depending on how many commas there were.
            # (These four variables are only ever rebound below, never
            # mutated, so binding them to one shared empty list is
            # safe.)
            first = von = last = jr = []
            if len(parts) == 1:
                # "First von Last"
                toks = parts[0]
                # The von tokens start with the first lower-case token
                # (but cannot start at the last token)
                for von_start in range(0, len(toks) - 2, 2):
                    if self._first_char(toks[von_start]).islower():
                        # Found beginning; now find the end
                        first = toks[:max(0, von_start-1)]
                        von, last = self.__split_von_last(toks[von_start:])
                        break
                else:
                    # No von tokens.  Find hyphen-connected last name
                    # tokens.
                    for last_start in range(len(toks) - 1, -1, -2):
                        if last_start and toks[last_start-1] != '-':
                            break
                    first = toks[:max(0, last_start-1)]
                    last = toks[last_start:]
            elif 2 <= len(parts) <= 3:
                # "von Last, First[, Jr]"
                von, last = self.__split_von_last(parts[0])
                first = parts[1]
                if len(parts) == 3:
                    jr = parts[2]
            else:
                pos.raise_error(
                    'too many commas in name `{}\''.format(name_string))

            names.append(Name(''.join(first), ''.join(von),
                              ''.join(last), ''.join(jr)))
        return names

class Name(collections.namedtuple('Name', 'first von last jr')):
    """A parsed name.

    The name is parsed into first name, "von", last name, and the
    complement (or "jr").  Each component is in uninterpreted form
    (e.g., TeX syntax).  Missing components are set to the empty
    string.
    """

    def is_others(self):
        # True for BibTeX's "and others" sentinel name
        return self.first == '' and self.von == '' and \
            self.last == 'others' and self.jr == ''

    def pretty(self, template='{first} {von} {last} {jr}'):
        """Pretty-print author according to template.

        The template is a 'format' template with the added feature
        that literal text surrounding fields that expand to empty
        strings is prioritized, rather than concatenated.
        Specifically, of the literal text snippets between two
        non-null fields, only the first of the highest priority is
        kept, where non-white space outranks white space outranks the
        empty string.  Literal text before and after the first and
        last fields is always kept.

        Hence, if the template is '{von} {last}, {first}, {jr}' and
        the name has a last and a jr but no von or first, then the
        first comma will be kept and the space and second dropped.  If
        the name has only a von and a last, then both commas will be
        dropped.  If the name has only a last, then all separators
        will be dropped.
        """

        # XXX BibTeX's own format.name$ templates are more
        # sophisticated than this, and it's not clear these are easier
        # to use.  These do have the (dubious) benefit of having
        # access to the usual format machinery.
196 | 197 | def priority(string): 198 | if not string: 199 | return 0 200 | elif string.isspace(): 201 | return 1 202 | return 2 203 | fields = {'first': self.first, 'von': self.von, 204 | 'last': self.last, 'jr': self.jr} 205 | f = string.Formatter() 206 | pieces = [''] 207 | first_field, last_field = 0, -1 208 | leading = trailing = '' 209 | for i, (literal_text, field_name, format_spec, conv) in \ 210 | enumerate(f.parse(template)): 211 | if i == 0: 212 | # Always keep leading text 213 | leading = literal_text 214 | elif field_name is None: 215 | # Always keep trailing test 216 | trailing = literal_text 217 | elif priority(literal_text) > priority(pieces[-1]): 218 | # Overrides previous piece 219 | pieces[-1] = literal_text 220 | 221 | if field_name is not None: 222 | obj, _ = f.get_field(field_name, (), fields) 223 | if not obj: 224 | continue 225 | obj = f.convert_field(obj, conv) 226 | if first_field == 0: 227 | first_field = len(pieces) 228 | last_field = len(pieces) 229 | pieces.extend([f.format_field(obj, format_spec), '']) 230 | # Only keep the pieces between non-null fields 231 | pieces = pieces[first_field:last_field + 1] 232 | return leading + ''.join(pieces) + trailing 233 | 234 | def parse_names(string, pos=messages.Pos.unknown): 235 | """Parse a BibTeX name list (e.g., an author or editor field). 236 | 237 | Returns a list of Name objects. The parsing is equivalent to 238 | BibTeX's built-in "format.name$" function. Raises InputError if 239 | there is a syntax error. 240 | """ 241 | return NameParser().parse(string, pos) 242 | 243 | _MONTHS = 'January February March April May June July August September October November December'.lower().split() 244 | 245 | def parse_month(string, pos=messages.Pos.unknown): 246 | """Parse a BibTeX month field. 247 | 248 | This performs fairly fuzzy parsing that supports all standard 249 | month macro styles (and then some). 250 | 251 | Raises InputError if the field cannot be parsed. 
252 | """ 253 | val = string.strip().rstrip('.').lower() 254 | for i, name in enumerate(_MONTHS): 255 | if name.startswith(val) and len(val) >= 3: 256 | return i + 1 257 | pos.raise_error('invalid month `{}\''.format(string)) 258 | 259 | CS_RE = re.compile(r'\\[a-zA-Z]+') 260 | 261 | def title_case(string, pos=messages.Pos.unknown): 262 | """Convert to title case (like BibTeX's built-in "change.case$"). 263 | 264 | Raises InputError if the title string contains syntax errors. 265 | """ 266 | 267 | # See "@" 268 | out = [] 269 | level, prev_colon, pos = 0, False, 0 270 | while pos < len(string): 271 | keep = (pos == 0 or (prev_colon and string[pos-1] in ' \t\n')) 272 | 273 | if level == 0 and string.startswith('{\\', pos) and not keep: 274 | # Special character 275 | out.append(string[pos]) 276 | pos += 1 277 | level += 1 278 | 279 | while level and pos < len(string): 280 | if string[pos] == '\\': 281 | m = CS_RE.match(string, pos) 282 | if m: 283 | if m.group() in _CONTROL_SEQS: 284 | # Lower case control sequence 285 | out.append(m.group().lower()) 286 | else: 287 | # Unknown control sequence, keep case 288 | out.append(m.group()) 289 | pos = m.end() 290 | continue 291 | elif string[pos] == '{': 292 | level += 1 293 | elif string[pos] == '}': 294 | level -= 1 295 | 296 | # Lower-case non-control sequence 297 | out.append(string[pos].lower()) 298 | pos += 1 299 | 300 | prev_colon = False 301 | continue 302 | 303 | # Handle braces 304 | char = string[pos] 305 | if char == '{': 306 | level += 1 307 | elif char == '}': 308 | if level == 0: 309 | pos.raise_error('unexpected }') 310 | level -= 1 311 | 312 | # Handle colon state 313 | if char == ':': 314 | prev_colon = True 315 | elif char not in ' \t\n': 316 | prev_colon = False 317 | 318 | # Change case of a regular character 319 | if level > 0 or keep: 320 | out.append(string[pos]) 321 | else: 322 | out.append(string[pos].lower()) 323 | pos += 1 324 | 325 | return ''.join(out) 326 | 327 | # A TeX control sequence is 
#
# 1) an active character (subsequent white space is NOT ignored) or,
# 2) a \ followed by either
# 2.1) a sequence of letter-category characters (subsequent white
#      space is ignored), or
# 2.2) a single space-category character (subsequent white space is
#      ignored), or
# 2.3) a single other character (subsequent white space is NOT
#      ignored).
#
# This regexp assumes plain TeX's initial category codes.  Technically
# only ~ and \f are active characters, but we include several other
# special characters that we want to abort on.
#
# Group 1 is the whole token; group 2 matches only cases 2.1/2.2, and
# the (?(2)...) conditional eats trailing white space only for those.
tex_cs_re = re.compile(
    r'([~\f$&#^_]|(\\[a-zA-Z]+|\\[ \t\r\n])|\\.)(?(2)[ \t\r\n]*)')

class TeXProcessor:
    """Base class for simple TeX macro processors.

    This assumes the initial category codes set up by plain.tex (and,
    likewise, LaTeX).
    """

    def process(self, string, pos):
        """Expand active characters and macros in string.

        Raises InputError if it encounters an active character or
        macro it doesn't recognize.
        """

        self.__data = string
        self.__off = 0
        self.__pos = pos

        # Process macros
        while True:
            m = tex_cs_re.search(self.__data, self.__off)
            if not m:
                break
            self.__off = m.end()
            macro = m.group(1)
            # _expand may call _scan_argument, which advances __off
            # past the macro's argument
            nval = self._expand(macro)
            if nval is None:
                if macro.startswith('\\'):
                    pos.raise_error('unknown macro `{}\''.format(macro))
                pos.raise_error(
                    'unknown special character `{}\''.format(macro))
            # Splice the expansion into the buffer and resume
            # scanning just after it (expansions themselves are not
            # re-expanded)
            self.__data = self.__data[:m.start()] + nval + \
                self.__data[self.__off:]
            self.__off = m.start() + len(nval)

        return self.__data

    def _scan_argument(self):
        """Scan and return a macro argument.

        The argument is a brace group (returned without the outer
        braces), a control sequence, or a single character.
        """
        if self.__off >= len(self.__data):
            self.__pos.raise_error('macro argument expected')
        if self.__data[self.__off] == '{':
            start = self.__off
            depth = 0
            # NOTE(review): assumes the group is closed before the end
            # of the buffer; an unbalanced `{' would index past the
            # end — confirm callers guarantee balance.
            while depth or self.__off == start:
                if self.__data[self.__off] == '{':
                    depth += 1
                elif self.__data[self.__off] == '}':
                    depth -= 1
                self.__off += 1
            return self.__data[start + 1:self.__off - 1]
        elif self.__data[self.__off] == '\\':
            m = tex_cs_re.match(self.__data, self.__off)
            self.__off = m.end()
            return m.group(1)
        else:
            arg = self.__data[self.__off]
            self.__off += 1
            return arg

    def _expand(self, cs):
        """Return the expansion of an active character or control sequence.

        Returns None if the sequence is unknown.  This should be
        overridden by sub-classes.
        """
        return None

class TeXToUnicode(TeXProcessor):
    """A simple TeX-to-unicode converter.

    This interprets accents and other special tokens like '--' and
    eliminates braces.
417 | """ 418 | 419 | # Simple TeX-to-Unicode replacements 420 | _SIMPLE = { 421 | # Active characters 422 | '~': '\u00A0', 423 | # chardefs from plain.tex 424 | '\\%': '%', '\\&': '&', '\\#': '#', '\\$': '$', '\\ss': 'ß', 425 | '\\ae': 'æ', '\\oe': 'œ', '\\o': 'ø', 426 | '\\AE': 'Æ', '\\OE': 'Œ', '\\O': 'Ø', 427 | '\\i': 'ı', '\\j': 'ȷ', 428 | '\\aa': 'å', '\\AA': 'Å', '\\l': 'ł', '\\L': 'Ł', 429 | # Other defs from plain.tex 430 | '\\_': '_', '\\dag': '†', '\\ddag': '‡', '\\S': '§', '\\P': '¶', 431 | } 432 | 433 | # TeX accent control sequences to Unicode combining characters 434 | _ACCENTS = { 435 | # Accents defined in plain.tex 436 | '\\`': '\u0300', "\\'": '\u0301', '\\v': '\u030C', '\\u': '\u0306', 437 | '\\=': '\u0304', '\\^': '\u0302', '\\.': '\u0307', '\\H': '\u030B', 438 | '\\~': '\u0303', '\\"': '\u0308', 439 | '\\d': '\u0323', '\\b': '\u0331', '\\c': '\u0327', 440 | # Other accents that seem to be standard, but I can't find 441 | # their definitions 442 | '\\r': '\u030A', '\\k': '\u0328' 443 | } 444 | 445 | def process(self, string, pos): 446 | string = super().process(string, pos) 447 | 448 | # Handle ligatures that are unique to TeX. This must be done 449 | # after macro expansion, but before brace removal because 450 | # braces inhibit ligatures. 451 | string = string.replace('---', '\u2014').replace('--', '\u2013') 452 | 453 | # Remove braces 454 | return string.replace('{', '').replace('}', '') 455 | 456 | def _expand(self, cs): 457 | if cs in self._SIMPLE: 458 | return self._SIMPLE[cs] 459 | if cs in self._ACCENTS: 460 | arg = self._scan_argument() 461 | if len(arg) == 0: 462 | seq, rest = ' ' + self._ACCENTS[cs], '' 463 | elif arg.startswith('\\i') or arg.startswith('\\j'): 464 | # Unicode combining marks should be applied to the 465 | # regular i, not the dotless i. 
466 | seq, rest = arg[1] + self._ACCENTS[cs], arg[2:] 467 | else: 468 | seq, rest = arg[0] + self._ACCENTS[cs], arg[1:] 469 | return unicodedata.normalize('NFC', seq) + rest 470 | return None 471 | 472 | def tex_to_unicode(string, pos=messages.Pos.unknown): 473 | """Convert a BibTeX field value written in TeX to Unicode. 474 | 475 | This interprets accents and other special tokens like '--' and 476 | eliminates braces. Raises InputError if it encounters a macro it 477 | doesn't understand. 478 | 479 | Note that BibTeX's internal understanding of accented characters 480 | (e.g., purify$ and change.case$) is much more limited than TeX's. 481 | This implements something closer to TeX on the assumption that the 482 | goal is to display the string. 483 | """ 484 | 485 | return TeXToUnicode().process(string, pos) 486 | -------------------------------------------------------------------------------- /biblib/bib.py: -------------------------------------------------------------------------------- 1 | """Parser and representation for BibTeX .bib databases. 2 | 3 | This parser is derived directly from the WEB source code for BibTeX -- 4 | especially section "Reading the database file(s)" -- and hence 5 | (barring bugs in translation) should be fully compatible with BibTeX's 6 | own parser. 7 | """ 8 | 9 | __all__ = 'Parser Entry FieldError resolve_crossrefs'.split() 10 | 11 | import sys 12 | import re 13 | import collections 14 | import textwrap 15 | 16 | from . 
# Match sequences of legal identifier characters, except that the
# first is not allowed to be a digit (see id_class)
ID_RE = re.compile('(?![0-9])(?:(?![ \t"#%\'(),={}])[\x20-\x7f])+')
# BibTeX only considers space, tab, and newline to be white space (see
# lex_class)
SPACE_RE = re.compile('[ \t\n]*')

class ParseError(Exception):
    # NOTE(review): not raised anywhere in this chunk; errors are
    # reported through messages.Pos.raise_error instead.
    pass

class Parser:
    """A parser for .bib BibTeX database files."""

    def __init__(self, *, month_style='full'):
        """Initialize an empty database.

        This also initializes standard month macros (which are usually
        provided by the style file).  month_style may be 'full' to get
        full names, 'abbrv' to get abbrv.bst-style abbreviated names,
        or None to not initialize month macros.

        The database should be populated by calling parse one or more
        times.  The final contents of the database can be retrieved by
        calling finalize.
        """

        self.__log, self.__errors = [], False
        self.__entries = collections.OrderedDict()

        if month_style == 'full':
            self.__macros = {'jan': 'January', 'feb': 'February',
                             'mar': 'March', 'apr': 'April',
                             'may': 'May', 'jun': 'June',
                             'jul': 'July', 'aug': 'August',
                             'sep': 'September', 'oct': 'October',
                             'nov': 'November', 'dec': 'December'}
        elif month_style == 'abbrv':
            self.__macros = {'jan': 'Jan.', 'feb': 'Feb.',
                             'mar': 'Mar.', 'apr': 'Apr.',
                             'may': 'May', 'jun': 'June',
                             'jul': 'July', 'aug': 'Aug.',
                             'sep': 'Sept.', 'oct': 'Oct.',
                             'nov': 'Nov.', 'dec': 'Dec.'}
        elif month_style is None:
            # Fixed: compare to None with `is', not `=='
            self.__macros = {}
        else:
            raise ValueError('Unknown month style {}'.format(month_style))

    def string(self, name, value):
        """Declare a macro, just like an @string command."""
        self.__macros[name] = value

    def parse(self, str_or_fp_or_iter, name=None, *, log_fp=None):
        """Parse the contents of str_or_fp_or_iter and return self.

        str_or_fp_or_iter must be a string, a file-like object, or an
        iterable of string or file-like objects to parse in
        succession.  If name is not None, it is used as the file name.
        Otherwise, a name is constructed in a type-appropriate way.

        If log_fp is not None, it must be a file-like object to which
        warnings and InputErrors will be logged.  This logger will be
        attached to all Pos instances created from the file being
        parsed, so any warnings or InputErrors raised from later
        operations on derived objects (like entries or field values)
        will also be logged to log_fp.

        If there are any errors in the input, raises a (potentially
        bundled) InputError.

        Parse can be called multiple times to parse subsequent .bib
        files.  Later files will have access to, for example, strings
        defined in earlier files.
        """

        # Fixed: the collections.Iterable alias was removed in Python
        # 3.10; use the ABC from collections.abc instead.
        from collections.abc import Iterable

        recoverer = messages.InputErrorRecoverer()
        if isinstance(str_or_fp_or_iter, str):
            self.__data = str_or_fp_or_iter
            # NOTE(review): upstream presumably used an angle-bracket
            # placeholder here (e.g. '<string>'); this copy shows the
            # empty string — confirm against upstream before changing.
            fname = name or ''
        elif isinstance(str_or_fp_or_iter, Iterable) and \
                not hasattr(str_or_fp_or_iter, 'read'):
            # An iterable of inputs: parse each in turn, collecting
            # errors so one bad input doesn't stop the rest
            for obj in str_or_fp_or_iter:
                with recoverer:
                    self.parse(obj, name=name, log_fp=log_fp)
            recoverer.reraise()
            return self
        else:
            self.__data = str_or_fp_or_iter.read()
            try:
                fname = name or str_or_fp_or_iter.name
            except AttributeError:
                fname = ''
        self.__off = 0

        # Remove trailing whitespace from lines in data (see input_ln
        # in bibtex.web)
        self.__data = re.sub('[ \t]+$', '', self.__data, flags=re.MULTILINE)
        self.__pos_factory = messages.PosFactory(fname, self.__data, log_fp)

        # Parse entries
        while self.__off < len(self.__data):
            # Just continue to the next entry if there's an error
            with recoverer:
                self._scan_command_or_entry()
        recoverer.reraise()
        return self

    def get_entries(self):
        """Return the entry database.

        The database is an ordered dictionary mapping from lower-cased
        keys to Entry objects.
        """
        return self.__entries

    def _fail(self, msg, off=None):
        # Raise an InputError at offset off (default: the current
        # parse position)
        if off is None:
            off = self.__off
        self.__pos_factory.offset_to_pos(off).raise_error(msg)

    def _warn(self, msg, off=None):
        # Log a warning at offset off (default: the current parse
        # position)
        if off is None:
            off = self.__off
        self.__pos_factory.offset_to_pos(off).warn(msg)

    # Base parsers.  These are the only methods that directly
    # manipulate self.__data.

    def _try_tok(self, regexp, skip_space=True):
        """Scan regexp followed by white space.

        Returns the matched text, or None if the match failed."""
        if isinstance(regexp, str):
            regexp = re.compile(regexp)
        m = regexp.match(self.__data, self.__off)
        if m is None:
            return None
        self.__off = m.end()
        if skip_space:
            self._skip_space()
        return m.group(0)

    def _scan_balanced_text(self, term):
        """Scan brace-balanced text terminated with character term."""
        start, level = self.__off, 0
        while self.__off < len(self.__data):
            char = self.__data[self.__off]
            if level == 0 and char == term:
                # Found the terminator at depth 0; consume it but
                # return only the text before it
                text = self.__data[start:self.__off]
                self.__off += 1
                self._skip_space()
                return text
            elif char == '{':
                level += 1
            elif char == '}':
                level -= 1
                if level < 0:
                    self._fail('unexpected }')
            self.__off += 1
        self._fail('unterminated string')

    def _skip_space(self):
        # This is equivalent to eat_bib_white_space, except that we do
        # it automatically after every token, whereas bibtex carefully
        # and explicitly does it between every token.
        self.__off = SPACE_RE.match(self.__data, self.__off).end()

    # Helpers

    def _tok(self, regexp, fail=None):
        """Scan token regexp or fail with the given message."""
        res = self._try_tok(regexp)
        if res is None:
            assert fail
            self._fail(fail)
        return res

    # Productions

    def _scan_identifier(self):
        return self._tok(ID_RE, 'expected identifier')

    def _scan_command_or_entry(self):
        # See get_bib_command_or_entry_and_process

        # Skip to the next database entry or command
        self._tok('[^@]*')
        pos = self.__pos_factory.offset_to_pos(self.__off)
        if not self._try_tok('@'):
            # No more entries
            return None

        # Scan command or entry type
        typ = self._scan_identifier().lower()

        if typ == 'comment':
            # Believe it or not, BibTeX doesn't do anything with what
            # comes after an @comment, treating it like any other
            # inter-entry noise.
            return None

        left = self._tok('[{(]', 'expected { or ( after entry type')
        right, right_re = (')', '\\)') if left == '(' else ('}', '}')

        if typ == 'preamble':
            # Parse the preamble, but ignore it
            self._scan_field_value()
            self._tok(right_re, 'expected '+right)
            return None

        if typ == 'string':
            name = self._scan_identifier().lower()
            if name in self.__macros:
                self._warn('macro `{}\' redefined'.format(name))
            self._tok('=', 'expected = after string name')
            value = self._scan_field_value()
            self._tok(right_re, 'expected '+right)
            self.__macros[name] = value
            return None

        # Not a command, must be a database entry

        # Scan the entry's database key
        if left == '(':
            # The database key is anything up to a comma, white
            # space, or end-of-line (yes, the key can be empty,
            # and it can include a close paren)
            key = self._tok('[^, \t\n]*')
        else:
            # The database key is anything up to comma, white
            # space, right brace, or end-of-line
            key = self._tok('[^, \t}\n]*')

        # Scan entries (starting with comma or close after key)
        fields = []
        field_pos = {}
        while True:
            if self._try_tok(right_re):
                break
            self._tok(',', 'expected {} or ,'.format(right))
            if self._try_tok(right_re):
                # Trailing comma before the closing delimiter
                break

            # Scan field name and value
            field_off = self.__off
            field = self._scan_identifier().lower()
            self._tok('=', 'expected = after field name')
            value = self._scan_field_value()

            if field in field_pos:
                # Keep the first occurrence of a repeated field.
                # NOTE(review): warns at the entry's position rather
                # than the field's (field_off is available) — confirm
                # this matches bibtex before changing.
                pos.warn('repeated field `{}\''.format(field))
                continue

            fields.append((field, value))
            field_pos[field] = self.__pos_factory.offset_to_pos(field_off)

        if key.lower() in self.__entries:
            self._fail('repeated entry')
        self.__entries[key.lower()] = Entry(fields, typ, key, pos, field_pos)

    def _scan_field_value(self):
        # See scan_and_store_the_field_value_and_eat_white
        value = self._scan_field_piece()
        while self._try_tok('#'):
            # '#' concatenates field pieces
            value += self._scan_field_piece()
        # Compress spaces in the text.  Bibtex does this
        # (painstakingly) as it goes, but the final effect is the same
        # (see check_for_and_compress_bib_white_space).
        value = re.sub('[ \t\n]+', ' ', value)
        # Strip leading and trailing space (literally just space, see
        # @)
        return value.strip(' ')

    def _scan_field_piece(self):
        # Scan one field piece: a bare number, a {...} or "..."
        # string, or a macro name (expanded immediately; unknown
        # macros warn and expand to the empty string).
        # See scan_a_field_token_and_eat_white
        piece = self._try_tok('[0-9]+')
        if piece is not None:
            return piece
        if self._try_tok('{', skip_space=False):
            return self._scan_balanced_text('}')
        if self._try_tok('"', skip_space=False):
            return self._scan_balanced_text('"')
        opos = self.__off
        piece = self._try_tok(ID_RE)
        if piece is not None:
            if piece.lower() not in self.__macros:
                self._warn('unknown macro `{}\''.format(piece), opos)
                return ''
            return self.__macros[piece.lower()]
        self._fail('expected string, number, or macro name')

class FieldError(KeyError):
    """Raised by Entry.__getitem__ for a missing field."""

    def __init__(self, field, entry=None):
        super().__init__(field)
        self.__entry = entry

    def __str__(self):
        return '{}: missing field `{}\''.format(self.__entry, self.args[0])

MONTH_MACROS = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

class Entry(collections.OrderedDict):
    """An entry in a BibTeX database.

    This is an ordered dictionary of fields, plus some additional
    properties: typ gives the type of the entry, such as "journal",
    canonicalized to lower case.  key gives the database entry key
    (case is preserved, but should be ignored for comparisons).  pos
    is a messages.Pos instance giving the position of this entry in
    the database file.  field_pos is a simple dictionary from field
    names to message.Pos instances.

    Field values are as they would be seen by a .bst file: white space
    is cleaned up, but they retain macros, BibTeX-style accents, etc.
    Use algo.tex_to_unicode to interpret field values to user-friendly
    Unicode strings.
333 | """ 334 | 335 | def __init__(self, fields, typ=None, key=None, pos=None, field_pos=None): 336 | super().__init__(fields) 337 | self.typ, self.key, self.pos, self.field_pos = typ, key, pos, field_pos 338 | 339 | def copy(self): 340 | return self.__class__(self, self.typ, self.key, self.pos, self.field_pos) 341 | 342 | def __str__(self): 343 | return '`{}\' at {}'.format(self.key, self.pos) 344 | 345 | def __getitem__(self, field): 346 | try: 347 | return super().__getitem__(field) 348 | except KeyError: 349 | raise FieldError(field, self) from None 350 | 351 | def __eq__(self, o): 352 | """Two Entries are equal if they have the same fields, type, and key.""" 353 | return super().__eq__(o) and self.typ == o.typ and self.key == o.key 354 | 355 | def to_bib(self, *, month_to_macro=True, wrap_width=70): 356 | """Return this entry formatted as a BibTeX .bib entry. 357 | 358 | If month_to_macro is True, attempt to parse month names and 359 | replace them with their standard macro. 360 | 361 | If wrap_width is not None, word wrap the entry at this many 362 | columns (long words and hyphens are not split). 
363 | """ 364 | 365 | lines = ['@%s{%s,' % (self.typ, self.key)] 366 | for k, v in self.items(): 367 | start = ' {:12} = '.format(k) 368 | 369 | if month_to_macro and k == 'month': 370 | try: 371 | macro = MONTH_MACROS[self.month_num() - 1] 372 | except messages.InputError: 373 | pass 374 | else: 375 | lines.append(start + macro + ',') 376 | continue 377 | 378 | if v.isdigit(): 379 | lines.append(start + v + ',') 380 | elif wrap_width is None: 381 | lines.append(start + '{' + v + '},') 382 | else: 383 | lines.append(textwrap.fill( 384 | v, width=wrap_width, 385 | # Keep whitespace formatting as it is 386 | expand_tabs=False, replace_whitespace=False, 387 | # Don't break long things like URLs 388 | break_long_words=False, break_on_hyphens=False, 389 | initial_indent=start + '{', subsequent_indent=' ') + '},') 390 | lines.append('}') 391 | return '\n'.join(lines) 392 | 393 | def resolve_crossref(self, entries): 394 | """Return a new entry with crossref-ed fields incorporated. 395 | 396 | entries must be the database in which to find any crossref-ed 397 | database entries. 398 | """ 399 | if 'crossref' not in self: 400 | return self 401 | nentry = self.copy() 402 | source = entries[self['crossref'].lower()] 403 | if 'crossref' in source: 404 | self.field_pos['crossref'].warn('nested crossref') 405 | for k, v in source.items(): 406 | if k not in nentry: 407 | nentry[k] = v 408 | nentry.field_pos[k] = source.field_pos[k] 409 | del nentry['crossref'] 410 | return nentry 411 | 412 | def date_key(self): 413 | """Return a sort key appropriate for sorting by date. 414 | 415 | Returns a tuple ([year, [month]]) where year and month are 416 | numeric. Raises InputError if the entry has year and/or month 417 | fields, but they are malformed. 
418 | """ 419 | 420 | key = () 421 | year, month = self.get('year'), self.get('month') 422 | if year is not None: 423 | if not year.isdigit(): 424 | self.field_pos['year'].raise_error( 425 | 'invalid year `{}\''.format(year)) 426 | key += (int(year),) 427 | if month is not None: 428 | if year is None: 429 | self.field_pos['month'].raise_error('month without year') 430 | key += (self.month_num(),) 431 | return key 432 | 433 | def authors(self, field='author'): 434 | """Return a list of parsed author names. 435 | 436 | This is a wrapper for biblib.algo.parse_names. 437 | """ 438 | from .algo import parse_names 439 | return parse_names(self[field], self.field_pos[field]) 440 | 441 | def month_num(self, field='month'): 442 | """Convert the month of this entry into a number in [1,12]. 443 | 444 | This is a wrapper for biblib.algo.parse_month (which see). 445 | 446 | Raises KeyError if this entry does not have the specified 447 | field and InputError if the field cannot be parsed. 448 | """ 449 | from .algo import parse_month 450 | return parse_month(self[field], pos=self.field_pos[field]) 451 | 452 | def resolve_crossrefs(db, min_crossrefs=None): 453 | """Resolve cross-referenced entries in db. 454 | 455 | This returns a new database containing the same entries in the 456 | same order as db, but any entries that crossref another entry are 457 | expanded with the fields for the cross-referenced entry. 458 | 459 | If min_crossrefs is not None, then any entry that is 460 | cross-referenced by min_crossrefs or more other entries will *not* 461 | be expanded and entries that cross-reference it will retain their 462 | crossref field. If min_crossrefs is None, entries are always 463 | expanded. (This mimics BibTeX "-min-crossrefs" option.) 464 | 465 | If there are unknown crossrefs, raises a (potentially bundled) 466 | InputError. 
467 | """ 468 | if min_crossrefs is not None: 469 | counts = collections.Counter(entry['crossref'].lower() 470 | for entry in db.values() 471 | if 'crossref' in entry) 472 | else: 473 | counts = None 474 | 475 | key_idx = {k: i for i, k in enumerate(db)} 476 | recoverer = messages.InputErrorRecoverer() 477 | ndb = collections.OrderedDict() 478 | for entry_idx, (key, entry) in enumerate(db.items()): 479 | crossref = entry.get('crossref') 480 | if crossref is None: 481 | ndb[key] = entry 482 | else: 483 | with recoverer: 484 | crossref_idx = key_idx.get(crossref.lower()) 485 | if crossref_idx is None: 486 | entry.field_pos['crossref'].raise_error( 487 | 'unknown crossref `{}\''.format(crossref)) 488 | elif crossref_idx < entry_idx: 489 | entry.field_pos['crossref'].raise_error( 490 | 'crossref `{}\' must come after entry'.format(crossref)) 491 | elif counts and counts[crossref.lower()] >= min_crossrefs: 492 | ndb[key] = entry 493 | else: 494 | ndb[key] = entry.resolve_crossref(db) 495 | recoverer.reraise() 496 | return ndb 497 | --------------------------------------------------------------------------------