195 | */
private function determineCase(string $word): int
{
    // Classifies $word by its first ASCII letter found outside any {...} group:
    // returns 1 when that letter is uppercase, 0 when it is lowercase and -1
    // when the word has no such letter.
    $trimmedWord = trim($word);
    if ('' === $trimmedWord) {
        // Nothing to inspect: the word is empty or made of whitespace only
        throw new ProcessorException('Could not determine case on word: ' . $word);
    }

    $openBraces = 0;
    // Iterate over the trimmed word only, and stop at its last character
    $length = mb_strlen($trimmedWord);
    for ($i = 0; $i < $length; ++$i) {
        // \ord() looks at the first byte only, which is enough to recognise
        // ASCII braces and letters
        $ord = \ord(mb_substr($trimmedWord, $i, 1));
        if (123 === $ord) { // Open brace
            ++$openBraces;
        } elseif (125 === $ord) { // Closing brace
            --$openBraces;
        }

        if (0 !== $openBraces) {
            // Characters inside braces never decide the case
            continue;
        }
        if ($ord >= 65 && $ord <= 90) { // The first character is uppercase
            return 1;
        }
        if ($ord >= 97 && $ord <= 122) { // The first character is lowercase
            return 0;
        }
    }

    return -1;
}
231 | }
232 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PHP BibTeX Parser 2.x
2 |
3 | This is a
4 | BibTeX
5 | parser written in
6 | PHP.
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | 
18 | 
19 | 
20 |
21 | You are browsing the documentation of **BibTeX Parser 2.x**, the latest version.
22 |
23 | ## Table of contents
24 |
25 | * [Installing](#installing)
26 | * [Usage](#usage)
27 | * [Vocabulary](#vocabulary)
28 | * [Processors](#processors)
29 | * [Tag name case](#tag-name-case)
30 | * [Authors and editors](#authors-and-editors)
31 | * [Keywords](#keywords)
32 | * [Date](#date)
33 | * [Fill missing tag](#fill-missing-tag)
34 | * [Trim tags](#trim-tags)
35 | * [Determine URL from the DOI](#determine-url-from-the-doi)
36 | * [LaTeX to unicode](#latex-to-unicode)
37 | * [Custom](#custom)
38 | * [Handling errors](#handling-errors)
39 | * [Advanced usage](#advanced-usage)
40 | * [Release Policy](#release-policy)
41 | * [Dependencies Compatibility Policy](#dependencies-compatibility-policy)
42 |
43 | ## Installing
44 |
45 | ```bash
46 | composer require renanbr/bibtex-parser
47 | ```
48 |
49 | ## Usage
50 |
51 | ```php
52 | use RenanBr\BibTexParser\Listener;
53 | use RenanBr\BibTexParser\Parser;
54 | use RenanBr\BibTexParser\Processor;
55 |
56 | require 'vendor/autoload.php';
57 |
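58 | $bibtex = <<<BIBTEX
59 | @article{einstein1916relativity,
60 |   title={Relativity: The Special and General Theory},
61 |   author={Einstein, Albert},
62 |   year={1916}
63 | }
64 | BIBTEX;
65 |
66 | // Create and configure a Listener
67 | $listener = new Listener();
68 | $listener->addProcessor(new Processor\TagNameCaseProcessor(CASE_LOWER));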
69 | // $listener->addProcessor(new Processor\NamesProcessor());
70 | // $listener->addProcessor(new Processor\KeywordsProcessor());
71 | // $listener->addProcessor(new Processor\DateProcessor());
72 | // $listener->addProcessor(new Processor\FillMissingProcessor([/* ... */]));
73 | // $listener->addProcessor(new Processor\TrimProcessor());
74 | // $listener->addProcessor(new Processor\UrlFromDoiProcessor());
75 | // $listener->addProcessor(new Processor\LatexToUnicodeProcessor());
76 | // ... you can append as many Processors as you want
77 |
78 | // Create a Parser and attach the listener
79 | $parser = new Parser();
80 | $parser->addListener($listener);
81 |
82 | // Parse the content, then read processed data from the Listener
83 | $parser->parseString($bibtex); // or parseFile('/path/to/file.bib')
84 | $entries = $listener->export();
85 |
86 | print_r($entries);
87 | ```
88 |
89 | This will output:
90 |
91 | ```
92 | Array
93 | (
94 | [0] => Array
95 | (
96 | [_type] => article
97 | [citation-key] => einstein1916relativity
98 | [title] => Relativity: The Special and General Theory
99 | [author] => Einstein, Albert
100 | [year] => 1916
101 | )
102 | )
103 | ```
104 |
105 | ## Vocabulary
106 |
107 | [BibTeX] is all about "entry", "tag's name" and "tag's content".
108 |
109 | > A [BibTeX] **entry** consists of the type (the word after @), a citation-key and a number of tags which define various characteristics of the specific [BibTeX] entry.
110 | > (...) A [BibTeX] **tag** is specified by its **name** followed by an equals sign, and the **content**.
111 |
112 | Source: http://www.bibtex.org/Format/
113 |
114 | Note:
115 | This library considers "type" and "citation-key" as tags.
116 | This behavior can be changed [implementing your own Listener](#advanced-usage).
117 |
118 | ## Processors
119 |
120 | `Processor` is a [callable] that receives an entry as argument and returns a modified entry.
121 |
122 | This library contains three main parts:
123 |
124 | - `Parser` class, responsible for detecting units inside a [BibTeX] input;
125 | - `Listener` class, responsible for gathering units and transforming them into a list of entries;
126 | - `Processor` classes, responsible for manipulating entries.
127 |
128 | Although you can't configure the `Parser`, you can append as many `Processor`s as you want to the `Listener` through `Listener::addProcessor()` before exporting the contents.
129 | Be aware that `Listener` provides, by default, these features:
130 |
131 | - Found entries are reachable through `Listener::export()` method;
132 | - [Tag content concatenation](http://www.bibtex.org/Format/);
133 | - e.g. `hello # " world"` tag's content will generate `hello world` [string]
134 | - [Tag content abbreviation handling](http://www.bibtex.org/Format/);
135 | - e.g. `@string{foo="bar"} @misc{bar=foo}` will make `$entries[1]['bar']` assume `bar` as value
136 | - Publication's type exposed as `_type` tag;
137 | - Citation key exposed as `citation-key` tag;
138 | - Original entry text exposed as `_original` tag.
139 |
140 | This project ships some useful processors.
141 |
142 | ### Tag name case
143 |
144 | In [BibTeX] the tag's names aren't case-sensitive.
145 | This library exposes entries as [array], in which keys are case-sensitive.
146 | To avoid this misunderstanding, you can force the tags' name character case using `TagNameCaseProcessor`.
147 |
148 | Usage
149 |
150 | ```php
151 | use RenanBr\BibTexParser\Processor\TagNameCaseProcessor;
152 |
153 | $listener->addProcessor(new TagNameCaseProcessor(CASE_UPPER)); // or CASE_LOWER
154 | ```
155 |
156 | ```bib
157 | @article{
158 | title={BibTeX rocks}
159 | }
160 | ```
161 |
162 | ```
163 | Array
164 | (
165 | [0] => Array
166 | (
167 | [TYPE] => article
168 | [TITLE] => BibTeX rocks
169 | )
170 | )
171 | ```
172 |
173 |
174 |
175 | ### Authors and editors
176 |
177 | [BibTeX] recognizes four parts of an author's name: First Von Last Jr.
178 | If you would like to parse the `author` and `editor` tags included in your entries, you can use the `NamesProcessor` class.
179 |
180 | Usage
181 |
182 | ```php
183 | use RenanBr\BibTexParser\Processor\NamesProcessor;
184 |
185 | $listener->addProcessor(new NamesProcessor());
186 | ```
187 |
188 | ```bib
189 | @article{
190 | title={Relativity: The Special and General Theory},
191 | author={Einstein, Albert}
192 | }
193 | ```
194 |
195 | ```
196 | Array
197 | (
198 | [0] => Array
199 | (
200 | [type] => article
201 | [title] => Relativity: The Special and General Theory
202 | [author] => Array
203 | (
204 | [0] => Array
205 | (
206 | [first] => Albert
207 | [von] =>
208 | [last] => Einstein
209 | [jr] =>
210 | )
211 | )
212 | )
213 | )
214 | ```
215 |
216 |
217 |
218 | ### Keywords
219 |
220 | The `keywords` tag contains a list of expressions represented as [string], you might want to read them as an [array] instead.
221 |
222 | Usage
223 |
224 | ```php
225 | use RenanBr\BibTexParser\Processor\KeywordsProcessor;
226 |
227 | $listener->addProcessor(new KeywordsProcessor());
228 | ```
229 |
230 | ```bib
231 | @misc{
232 | title={The End of Theory: The Data Deluge Makes the Scientific Method Obsolete},
233 | keywords={big data, data deluge, scientific method}
234 | }
235 | ```
236 |
237 | ```
238 | Array
239 | (
240 | [0] => Array
241 | (
242 | [type] => misc
243 | [title] => The End of Theory: The Data Deluge Makes the Scientific Method Obsolete
244 | [keywords] => Array
245 | (
246 | [0] => big data
247 | [1] => data deluge
248 | [2] => scientific method
249 | )
250 | )
251 | )
252 | ```
253 |
254 |
255 |
256 | ### Date
257 |
258 | It adds a new tag `_date` as [DateTimeImmutable].
259 | This processor adds the new tag **if and only if** both the `month` and `year` tags are present.
260 |
261 | Usage
262 |
263 | ```php
264 | use RenanBr\BibTexParser\Processor\DateProcessor;
265 |
266 | $listener->addProcessor(new DateProcessor());
267 | ```
268 |
269 | ```bib
270 | @misc{
271 | month="1~oct",
272 | year=2000
273 | }
274 | ```
275 |
276 | ```
277 | Array
278 | (
279 | [0] => Array
280 | (
281 | [type] => misc
282 | [month] => 1~oct
283 | [year] => 2000
284 | [_date] => DateTimeImmutable Object
285 | (
286 | [date] => 2000-10-01 00:00:00.000000
287 | [timezone_type] => 3
288 | [timezone] => UTC
289 | )
290 | )
291 | )
292 | ```
293 |
294 |
295 |
296 | ### Fill missing tag
297 |
298 | It assigns a default value to each missing tag.
299 |
300 | Usage
301 |
302 | ```php
303 | use RenanBr\BibTexParser\Processor\FillMissingProcessor;
304 |
305 | $listener->addProcessor(new FillMissingProcessor([
306 | 'title' => 'This entry has no title',
307 | 'year' => 1970,
308 | ]));
309 | ```
310 |
311 | ```bib
312 | @misc{
313 | }
314 |
315 | @misc{
316 | title="I do exist"
317 | }
318 | ```
319 |
320 | ```
321 | Array
322 | (
323 | [0] => Array
324 | (
325 | [type] => misc
326 | [title] => This entry has no title
327 | [year] => 1970
328 | )
329 | [1] => Array
330 | (
331 | [type] => misc
332 | [title] => I do exist
333 | [year] => 1970
334 | )
335 | )
336 | ```
337 |
338 |
339 |
340 | ### Trim tags
341 |
342 | Apply [trim()] to all tags.
343 |
344 | Usage
345 |
346 | ```php
347 | use RenanBr\BibTexParser\Processor\TrimProcessor;
348 |
349 | $listener->addProcessor(new TrimProcessor());
350 | ```
351 |
352 | ```bib
353 | @misc{
354 | title=" too much space "
355 | }
356 | ```
357 |
358 | ```
359 | Array
360 | (
361 | [0] => Array
362 | (
363 | [type] => misc
364 | [title] => too much space
365 | )
366 |
367 | )
368 | ```
369 |
370 |
371 |
372 | ### Determine URL from the DOI
373 |
374 | Sets `url` tag with [DOI] if `doi` tag is present and `url` tag is missing.
375 |
376 | Usage
377 |
378 | ```php
379 | use RenanBr\BibTexParser\Processor\UrlFromDoiProcessor;
380 |
381 | $listener->addProcessor(new UrlFromDoiProcessor());
382 | ```
383 |
384 | ```bib
385 | @misc{
386 | doi="qwerty"
387 | }
388 |
389 | @misc{
390 | doi="azerty",
391 | url="http://example.org"
392 | }
393 | ```
394 |
395 | ```
396 | Array
397 | (
398 | [0] => Array
399 | (
400 | [type] => misc
401 | [doi] => qwerty
402 | [url] => https://doi.org/qwerty
403 | )
404 |
405 | [1] => Array
406 | (
407 | [type] => misc
408 | [doi] => azerty
409 | [url] => http://example.org
410 | )
411 | )
412 | ```
413 |
414 |
415 |
416 | ### LaTeX to unicode
417 |
418 | [BibTeX] files store [LaTeX] contents.
419 | You might want to read them as unicode instead.
420 | The `LatexToUnicodeProcessor` class solves this problem, but before adding the processor to the listener you must:
421 |
422 | - [install Pandoc](http://pandoc.org/installing.html) in your system; and
423 | - add [ryakad/pandoc-php](https://github.com/ryakad/pandoc-php) or [ueberdosis/pandoc](https://github.com/ueberdosis/pandoc) as a dependency of your project.
424 |
425 | Usage
426 |
427 | ```php
428 | use RenanBr\BibTexParser\Processor\LatexToUnicodeProcessor;
429 |
430 | $listener->addProcessor(new LatexToUnicodeProcessor());
431 | ```
432 |
433 | ```bib
434 | @article{
435 | title={Caf\\'{e}s and bars}
436 | }
437 | ```
438 |
439 | ```
440 | Array
441 | (
442 | [0] => Array
443 | (
444 | [type] => article
445 | [title] => Cafés and bars
446 | )
447 | )
448 | ```
449 |
450 |
451 |
452 | Note: Order matters, add this processor as the last.
453 |
454 | ### Custom
455 |
456 | The `Listener::addProcessor()` method expects a [callable] as argument.
457 | In the example shown below, we append the text `with laser` to the `title` tags for all entries.
458 |
459 | Usage
460 |
461 | ```php
462 | $listener->addProcessor(static function (array $entry) {
463 | $entry['title'] .= ' with laser';
464 | return $entry;
465 | });
466 | ```
467 |
468 | ```
469 | @article{
470 | title={BibTeX rocks}
471 | }
472 | ```
473 |
474 | ```
475 | Array
476 | (
477 | [0] => Array
478 | (
479 | [type] => article
480 | [title] => BibTeX rocks with laser
481 | )
482 | )
483 | ```
484 |
485 |
486 |
487 | ## Handling errors
488 |
489 | This library throws two types of exception: `ParserException` and `ProcessorException`.
490 | The first one may happen during the data extraction.
491 | When it occurs it probably means the parsed BibTeX isn't valid.
492 | The second exception may happen during the data processing.
493 | When it occurs it means the listener's processors can't properly handle the data found.
494 | Both implement `ExceptionInterface`.
495 |
496 | ```php
497 | use RenanBr\BibTexParser\Exception\ExceptionInterface;
498 | use RenanBr\BibTexParser\Exception\ParserException;
499 | use RenanBr\BibTexParser\Exception\ProcessorException;
500 |
501 | try {
502 | // ... parser and listener configuration
503 |
504 | $parser->parseFile('/path/to/file.bib');
505 | $entries = $listener->export();
506 | } catch (ParserException $exception) {
507 | // The BibTeX isn't valid
508 | } catch (ProcessorException $exception) {
509 | // Listener's processors aren't able to handle data found
510 | } catch (ExceptionInterface $exception) {
511 | // Alternatively, you can use this exception to catch all of them at once
512 | }
513 | ```
514 |
515 | ## Advanced usage
516 |
517 | The core of this library contains these main classes:
518 |
519 | - `RenanBr\BibTexParser\Parser` responsible for detecting units inside a [BibTeX] input;
520 | - `RenanBr\BibTexParser\ListenerInterface` responsible for treating units found.
521 |
522 | You can attach listeners to the parser through `Parser::addListener()`.
523 | The parser is able to detect [BibTeX] units, such as "type", "tag's name", "tag's content".
524 | As the parser finds a unit, it triggers the listeners attached to it.
525 |
526 | You can code your own listener! All you have to do is handle units.
527 |
528 | ```php
529 | namespace RenanBr\BibTexParser;
530 |
531 | interface ListenerInterface
532 | {
533 | /**
534 | * Called when a unit is found.
535 | *
536 | * @param string $text The original content of the unit found.
537 | * Escape character will not be sent.
538 | * @param string $type The type of unit found.
539 | * It can assume one of Parser's constant value.
540 | * @param array $context Contains details of the unit found.
541 | */
542 | public function bibTexUnitFound($text, $type, array $context);
543 | }
544 | ```
545 |
546 | `$type` may assume one of these values:
547 |
548 | - `Parser::TYPE`
549 | - `Parser::CITATION_KEY`
550 | - `Parser::TAG_NAME`
551 | - `Parser::RAW_TAG_CONTENT`
552 | - `Parser::BRACED_TAG_CONTENT`
553 | - `Parser::QUOTED_TAG_CONTENT`
554 | - `Parser::ENTRY`
555 |
556 | `$context` is an [array] with these keys:
557 |
558 | - `offset` contains the `$text`'s beginning position.
559 | It may be useful, for example, to [seek on a file pointer](https://php.net/fseek);
560 | - `length` contains the original `$text`'s length.
561 | It may differ from the length of the [string] sent to the listener because there may be escaped characters.
562 |
563 | ## Release Policy
564 |
565 | There is a **single** maintained branch at a time.
566 | This branch targets a minor version.
567 |
568 | A maintained version reaches its end-of-life when a new minor version is released.
569 |
570 | ### Dependencies Compatibility Policy
571 |
572 | This library is compatible with maintained versions of
573 | [PHP][php-versions].
574 |
575 | [BibTeX]: https://tug.org/bibtex/
576 | [DOI]: https://www.doi.org/
577 | [DateTimeImmutable]: https://www.php.net/manual/class.datetimeimmutable.php
578 | [LaTeX]: https://www.latex-project.org/
579 | [array]: https://php.net/manual/language.types.array.php
580 | [callable]: https://php.net/manual/en/language.types.callable.php
581 | [php-versions]: https://www.php.net/supported-versions.php
582 | [string]: https://php.net/manual/language.types.string.php
583 | [trim()]: https://www.php.net/trim
584 |
--------------------------------------------------------------------------------
/src/Parser.php:
--------------------------------------------------------------------------------
1 |
7 | *
8 | * For the full copyright and license information, please view the LICENSE
9 | * file that was distributed with this source code.
10 | */
11 |
12 | namespace RenanBr\BibTexParser;
13 |
14 | use ErrorException;
15 | use RenanBr\BibTexParser\Exception\ParserException;
16 |
17 | class Parser
18 | {
19 | public const TYPE = 'type';
20 | public const CITATION_KEY = 'citation_key';
21 | public const TAG_NAME = 'tag_name';
22 | public const RAW_TAG_CONTENT = 'raw_tag_content';
23 | public const BRACED_TAG_CONTENT = 'braced_tag_content';
24 | public const QUOTED_TAG_CONTENT = 'quoted_tag_content';
25 | public const ENTRY = 'entry';
26 |
27 | public const NONE = 'none';
28 | public const COMMENT = 'comment';
29 | public const FIRST_TAG_NAME = 'first_tag_name';
30 | public const POST_TYPE = 'post_type';
31 | public const POST_TAG_NAME = 'post_tag_name';
32 | public const PRE_TAG_CONTENT = 'pre_tag_content';
33 |
34 | private string $state;
35 |
36 | private string $buffer;
37 |
38 | private ?int $bufferOffset = null;
39 |
40 | private ?array $firstTagSnapshot = null;
41 |
42 | private ?string $originalEntryBuffer = null;
43 |
44 | private ?int $originalEntryOffset = null;
45 |
46 | private bool $skipOriginalEntryReading;
47 |
48 | private int $line;
49 |
50 | private int $column;
51 |
52 | private int $offset;
53 |
54 | private bool $isTagContentEscaped;
55 |
56 | private bool $mayConcatenateTagContent;
57 |
58 | private ?string $tagContentDelimiter = null;
59 |
60 | private int $braceLevel;
61 |
62 | /** @var ListenerInterface[] */
63 | private array $listeners = [];
64 |
/**
 * Attaches a listener; attached listeners receive every unit passed to
 * triggerListeners(), in the order they were added.
 */
public function addListener(ListenerInterface $listener): void
{
    array_push($this->listeners, $listener);
}
69 |
/**
 * Parses the contents of a file, reading it in small chunks.
 *
 * A multi-byte UTF-8 character that is cut by a chunk boundary is held back
 * and parsed together with the next chunk, so the character splitter never
 * receives half a character.
 *
 * @throws ParserException if $file given is not a valid BibTeX
 * @throws ErrorException if $file given is not readable
 */
public function parseFile(string $file): void
{
    $handle = @fopen($file, 'r');
    if (!$handle) {
        throw new ErrorException(sprintf('Unable to open %s', $file));
    }
    try {
        $this->reset();
        $pending = '';
        while (!feof($handle)) {
            $chunk = fread($handle, 128);
            if (false === $chunk) {
                throw new ErrorException(sprintf('Unable to read %s', $file));
            }

            $chunk = $pending . $chunk;
            $tailLength = $this->incompleteUtf8TailLength($chunk);
            if (0 === $tailLength) {
                $pending = '';
            } else {
                // Keep the unfinished character for the next iteration
                $pending = substr($chunk, -$tailLength);
                $chunk = substr($chunk, 0, -$tailLength);
            }

            if ('' !== $chunk) {
                $this->parse($chunk);
            }
        }
        if ('' !== $pending) {
            // The file ends in the middle of a character: hand the leftover
            // bytes over unchanged instead of silently dropping them
            $this->parse($pending);
        }
        // "\0" is the character reported when the input ends inside an entry
        $this->throwExceptionIfReadingEntry("\0");
    } finally {
        fclose($handle);
    }
}

/**
 * Returns how many bytes at the end of $bytes start a UTF-8 character whose
 * remaining bytes are not part of $bytes (0 when the end is complete).
 */
private function incompleteUtf8TailLength(string $bytes): int
{
    $length = strlen($bytes);
    // An unfinished character has at most 3 bytes (lead byte of a 4-byte
    // sequence plus two continuation bytes)
    $maxLookBehind = min(3, $length);
    for ($back = 1; $back <= $maxLookBehind; ++$back) {
        $byte = ord($bytes[$length - $back]);
        if (0x80 === ($byte & 0xC0)) {
            // Continuation byte: keep looking backwards for its lead byte
            continue;
        }

        if ($byte >= 0xF0) {
            $expectedLength = 4;
        } elseif ($byte >= 0xE0) {
            $expectedLength = 3;
        } elseif ($byte >= 0xC0) {
            $expectedLength = 2;
        } else {
            $expectedLength = 1;
        }

        return $expectedLength > $back ? $back : 0;
    }

    return 0;
}
91 |
/**
 * Parses BibTeX contents given as a string.
 *
 * @throws ParserException if $string given is not a valid BibTeX
 */
public function parseString(string $string): void
{
    // Start from a clean state, then consume the whole input at once
    $this->reset();
    $this->parse($string);

    // "\0" is the character reported when the input ends inside an entry
    $this->throwExceptionIfReadingEntry("\0");
}
101 |
/**
 * Feeds $text to the state machine one character at a time while keeping
 * the line, column and offset counters up to date.
 *
 * $text is split into UTF-8 characters. When it is not valid UTF-8 the
 * split fails, in which case the text is consumed byte by byte rather than
 * handing `false` to the loop.
 */
private function parse(string $text): void
{
    $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
    if (false === $chars) {
        // preg_split() returns false on malformed UTF-8
        $chars = str_split($text);
    }

    foreach ($chars as $char) {
        $this->read($char);
        if ("\n" === $char) {
            ++$this->line;
            $this->column = 1;
        } else {
            ++$this->column;
        }
        ++$this->offset;
    }
}
118 |
/**
 * Puts the parser back into its initial state so a new input can be parsed.
 *
 * Every piece of per-run state is cleared, including the buffer offset, so
 * nothing from a previous (possibly aborted) run leaks into the next one.
 */
private function reset(): void
{
    $this->state = self::NONE;
    $this->buffer = '';
    $this->bufferOffset = null;
    $this->firstTagSnapshot = null;
    $this->originalEntryBuffer = null;
    $this->originalEntryOffset = null;
    $this->skipOriginalEntryReading = false;
    $this->line = 1;
    $this->column = 1;
    $this->offset = 0;
    $this->mayConcatenateTagContent = false;
    $this->isTagContentEscaped = false;
    $this->tagContentDelimiter = null;
    $this->braceLevel = 0;
}
135 |
136 | // ----- Readers -----------------------------------------------------------
137 |
/**
 * Routes $char to the reader matching the current state, then lets
 * readOriginalEntry() record it together with the state the parser was in
 * before the character was read.
 */
private function read(string $char): void
{
    $stateBeforeReading = $this->state;

    if (self::NONE === $stateBeforeReading) {
        $this->readNone($char);
    } elseif (self::COMMENT === $stateBeforeReading) {
        $this->readComment($char);
    } elseif (self::TYPE === $stateBeforeReading) {
        $this->readType($char);
    } elseif (self::POST_TYPE === $stateBeforeReading) {
        $this->readPostType($char);
    } elseif (self::FIRST_TAG_NAME === $stateBeforeReading || self::TAG_NAME === $stateBeforeReading) {
        $this->readTagName($char);
    } elseif (self::POST_TAG_NAME === $stateBeforeReading) {
        $this->readPostTagName($char);
    } elseif (self::PRE_TAG_CONTENT === $stateBeforeReading) {
        $this->readPreTagContent($char);
    } elseif (self::RAW_TAG_CONTENT === $stateBeforeReading) {
        $this->readRawTagContent($char);
    } elseif (self::QUOTED_TAG_CONTENT === $stateBeforeReading || self::BRACED_TAG_CONTENT === $stateBeforeReading) {
        $this->readDelimitedTagContent($char);
    }

    $this->readOriginalEntry($char, $stateBeforeReading);
}
176 |
/**
 * Reads a character found outside any entry: "@" starts an entry type,
 * whitespace is skipped and anything else starts a comment.
 */
private function readNone(string $char): void
{
    if ('@' === $char) {
        $this->state = self::TYPE;

        return;
    }

    if ($this->isWhitespace($char)) {
        return;
    }

    $this->state = self::COMMENT;
}
185 |
/**
 * Reads a character inside a comment; the first whitespace ends the comment.
 */
private function readComment(string $char): void
{
    if (!$this->isWhitespace($char)) {
        return;
    }

    $this->state = self::NONE;
}
192 |
/**
 * Reads the entry type (the word after "@").
 *
 * Letters are buffered. The first non-letter ends the type: "@comment" is
 * discarded and handled as a comment, any other type is sent to the
 * listeners and $char is re-read as the first character after the type.
 */
private function readType(string $char): void
{
    if (1 === preg_match('/^[a-zA-Z]$/', $char)) {
        $this->appendToBuffer($char);

        return;
    }

    $this->throwExceptionIfBufferIsEmpty($char);

    if ('comment' !== mb_strtolower($this->buffer)) {
        // Must happen before the state changes: the current state is sent
        // to the listeners as the unit type
        $this->triggerListenersWithCurrentBuffer();

        // $char isn't part of the type, so it belongs to POST_TYPE
        $this->state = self::POST_TYPE;
        $this->readPostType($char);

        return;
    }

    // Skips @comment type
    $this->skipOriginalEntryReading = true;
    $this->buffer = '';
    $this->bufferOffset = null;
    $this->state = self::COMMENT;
    $this->readComment($char);
}
219 |
/**
 * Reads a character between the entry type and its opening brace.
 *
 * @throws ParserException if $char is neither "{" nor whitespace
 */
private function readPostType(string $char): void
{
    if ('{' === $char) {
        $this->state = self::FIRST_TAG_NAME;

        return;
    }

    if ($this->isWhitespace($char)) {
        return;
    }

    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
228 |
/**
 * Reads a tag name (or the citation key, when it is the first name of the entry).
 *
 * The buffer is compared against the empty string explicitly: empty() also
 * treats the string "0" as empty, which made a name consisting of "0" be
 * handled as if nothing had been read.
 *
 * @throws ParserException if $char can't start a tag name
 */
private function readTagName(string $char): void
{
    if (preg_match('/^[a-zA-Z0-9_\+:\-\.\/\x{00C0}-\x{01FF}]$/u', $char)) {
        $this->appendToBuffer($char);

        return;
    }

    if ('' === $this->buffer) {
        if ($this->isWhitespace($char)) {
            // Skips because we didn't start reading
            return;
        }
        if ('}' === $char) {
            // No tag name found, $char is just closing current entry
            $this->state = self::NONE;

            return;
        }

        // Any other character can't appear where a tag name is expected
        throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
    }

    if (self::FIRST_TAG_NAME === $this->state) {
        // Takes a snapshot of current state to be triggered later as
        // tag name or citation key, see readPostTagName()
        $this->firstTagSnapshot = $this->takeBufferSnapshot();
    } else {
        // Current buffer is a simple tag name
        $this->triggerListenersWithCurrentBuffer();
    }

    // Once $char isn't a valid tag name character, it must be
    // interpreted as post tag name
    $this->state = self::POST_TAG_NAME;
    $this->readPostTagName($char);
}
256 |
/**
 * Reads a character right after a tag name and decides what the first name
 * of the entry was: a tag name when "=" follows, a citation key when "," or
 * "}" follows.
 *
 * @throws ParserException if $char is none of "=", ",", "}" or whitespace
 */
private function readPostTagName(string $char): void
{
    switch ($char) {
        case '=':
            // First tag name isn't a citation key, because it has content
            $this->triggerListenersWithFirstTagSnapshotAs(self::TAG_NAME);
            $this->state = self::PRE_TAG_CONTENT;
            break;
        case '}':
        case ',':
            // First tag name is a citation key, because $char leaves it
            // without value: "}" closes the entry, "," moves to the next tag
            $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
            $this->state = '}' === $char ? self::NONE : self::TAG_NAME;
            break;
        default:
            if (!$this->isWhitespace($char)) {
                throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
            }
    }
}
277 |
/**
 * Reads a character where a tag content may start, continue or end.
 *
 * Characters opening a value (alphanumeric, `"` or `{`) are rejected when a
 * value has just been read and no "#" followed it; "#", "," and "}" are
 * rejected when no value precedes them.
 *
 * @throws ParserException if $char is not expected at this position
 */
private function readPreTagContent(string $char): void
{
    $startsRawContent = 1 === preg_match('/^[a-zA-Z0-9]$/', $char);

    if ($startsRawContent || '"' === $char || '{' === $char) {
        // When concatenation is available it means there is already a
        // defined value, and parser expect a concatenator, a tag separator
        // or an entry closing char as next $char
        $this->throwExceptionAccordingToConcatenationAvailability($char, true);

        if ($startsRawContent) {
            $this->state = self::RAW_TAG_CONTENT;
            $this->readRawTagContent($char);
        } elseif ('"' === $char) {
            $this->tagContentDelimiter = '"';
            $this->state = self::QUOTED_TAG_CONTENT;
        } else {
            $this->tagContentDelimiter = '}';
            $this->state = self::BRACED_TAG_CONTENT;
        }

        return;
    }

    if ('#' === $char || ',' === $char || '}' === $char) {
        $this->throwExceptionAccordingToConcatenationAvailability($char, false);
        $this->mayConcatenateTagContent = false;

        if (',' === $char) {
            $this->state = self::TAG_NAME;
        } elseif ('}' === $char) {
            $this->state = self::NONE;
        }

        return;
    }

    if (!$this->isWhitespace($char)) {
        throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
    }
}
312 |
/**
 * Reads an undelimited tag content.
 *
 * Valid characters are buffered; the first other character ends the content,
 * which is sent to the listeners, and is then re-read as PRE_TAG_CONTENT.
 */
private function readRawTagContent(string $char): void
{
    if (1 === preg_match('/^[a-zA-Z0-9_\+:\-\.\/]$/', $char)) {
        $this->appendToBuffer($char);

        return;
    }

    $this->throwExceptionIfBufferIsEmpty($char);
    $this->triggerListenersWithCurrentBuffer();

    // A value has just been read, so a concatenation may follow; $char
    // itself is handled as what comes after a tag content
    $this->mayConcatenateTagContent = true;
    $this->state = self::PRE_TAG_CONTENT;
    $this->readPreTagContent($char);
}
328 |
/**
 * Reads a character of a quoted or braced tag content.
 *
 * Handles backslash escaping, nested braces inside a braced content and the
 * closing delimiter, which sends the buffered content to the listeners.
 */
private function readDelimitedTagContent(string $char): void
{
    if ($this->isTagContentEscaped) {
        $this->isTagContentEscaped = false;
        // The backslash is dropped only in front of the delimiter, another
        // backslash or "%"; otherwise it is kept as part of the content
        $dropsBackslash = \in_array($char, [$this->tagContentDelimiter, '\\', '%'], true);
        if (!$dropsBackslash) {
            $this->appendToBuffer('\\');
        }
        $this->appendToBuffer($char);

        return;
    }

    if ('{' === $char && '}' === $this->tagContentDelimiter) {
        // Nested opening brace inside a braced content
        ++$this->braceLevel;
        $this->appendToBuffer($char);

        return;
    }

    if ($char === $this->tagContentDelimiter) {
        if (0 !== $this->braceLevel) {
            // Closes a nested brace, the content goes on
            --$this->braceLevel;
            $this->appendToBuffer($char);

            return;
        }

        $this->triggerListenersWithCurrentBuffer();
        $this->mayConcatenateTagContent = true;
        $this->state = self::PRE_TAG_CONTENT;

        return;
    }

    if ('\\' === $char) {
        $this->isTagContentEscaped = true;

        return;
    }

    $this->appendToBuffer($char);
}
355 |
/**
 * Accumulates the original text of the entry being read and sends it to the
 * listeners as an ENTRY unit once $char closes the entry.
 *
 * @param string $previousState state the parser was in before reading $char
 */
private function readOriginalEntry(string $char, string $previousState): void
{
    if ($this->skipOriginalEntryReading) {
        // Discards what was collected so far and resumes on the next character
        $this->originalEntryBuffer = '';
        $this->originalEntryOffset = null;
        $this->skipOriginalEntryReading = false;

        return;
    }

    // $char belongs to an entry when the parser was, or now is, in an entry state
    $wasInEntry = $this->isEntryState($previousState);
    $isInEntry = $this->isEntryState($this->state);
    if (!$wasInEntry && !$isInEntry) {
        return;
    }

    // First character of the entry: remember where it starts
    if (empty($this->originalEntryBuffer)) {
        $this->originalEntryOffset = $this->offset;
    }
    $this->originalEntryBuffer .= $char;

    // Leaving the entry states means $char closed the entry
    if ($wasInEntry && !$isInEntry) {
        $context = [
            'offset' => $this->originalEntryOffset,
            'length' => $this->offset - $this->originalEntryOffset + 1,
        ];
        $this->triggerListeners($this->originalEntryBuffer, self::ENTRY, $context);
        $this->originalEntryBuffer = '';
        $this->originalEntryOffset = null;
    }
}
391 |
392 | // ----- Listener triggers -------------------------------------------------
393 |
/**
 * Notifies every attached listener about a unit.
 *
 * @param string $text    text of the unit
 * @param string $type    type of the unit
 * @param array  $context details of the unit, passed to the listeners as is
 */
private function triggerListeners(string $text, string $type, array $context): void
{
    foreach ($this->listeners as $attachedListener) {
        $attachedListener->bibTexUnitFound($text, $type, $context);
    }
}
400 |
/**
 * Sends the buffered text to the listeners, using the current state as the
 * unit type. Taking the snapshot empties the buffer.
 */
private function triggerListenersWithCurrentBuffer(): void
{
    ['text' => $text, 'context' => $context] = $this->takeBufferSnapshot();

    $this->triggerListeners($text, $this->state, $context);
}
408 |
/**
 * Sends the pending first-tag snapshot, if any, to the listeners as a unit
 * of the given type, and forgets the snapshot.
 */
private function triggerListenersWithFirstTagSnapshotAs(string $type): void
{
    $snapshot = $this->firstTagSnapshot;
    if (empty($snapshot)) {
        return;
    }

    // Cleared before notifying, so the snapshot is sent only once
    $this->firstTagSnapshot = null;
    $this->triggerListeners($snapshot['text'], $type, $snapshot['context']);
}
419 |
420 | // ----- Buffer tools ------------------------------------------------------
421 |
/**
 * Appends $char to the buffer, recording the current offset when the buffer
 * is started.
 *
 * The buffer is compared against the empty string explicitly: empty() also
 * treats the string "0" as empty, which overwrote the recorded offset on the
 * second character of a text starting with "0".
 */
private function appendToBuffer(string $char): void
{
    if ('' === $this->buffer) {
        $this->bufferOffset = $this->offset;
    }
    $this->buffer .= $char;
}
429 |
/**
 * Returns the buffered text with its context and empties the buffer.
 *
 * @return array array with the keys "text" and "context"; the context holds
 *               the "offset" where the buffer started and the "length" up
 *               to the current offset
 */
private function takeBufferSnapshot(): array
{
    $text = $this->buffer;
    $startOffset = $this->bufferOffset;
    $length = $this->offset - $startOffset;

    $this->bufferOffset = null;
    $this->buffer = '';

    return [
        'text' => $text,
        'context' => [
            'offset' => $startOffset,
            'length' => $length,
        ],
    ];
}
444 |
445 | // ----- Exception throwers ------------------------------------------------
446 |
/**
 * Rejects $char when the concatenation availability equals $availability.
 *
 * @throws ParserException if $availability matches the current availability
 */
private function throwExceptionAccordingToConcatenationAvailability(string $char, bool $availability): void
{
    if ($availability !== $this->mayConcatenateTagContent) {
        return;
    }

    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
453 |
/**
 * Rejects $char when nothing has been buffered yet.
 *
 * The buffer is compared against the empty string explicitly: empty() also
 * treats the string "0" as empty, which made a buffered "0" (for instance
 * the raw content of `volume = 0`) be reported as an unexpected character.
 *
 * @throws ParserException if the buffer is empty
 */
private function throwExceptionIfBufferIsEmpty(string $char): void
{
    if ('' === $this->buffer) {
        throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
    }
}
460 |
/**
 * Rejects $char when the parser is still in the middle of an entry.
 *
 * @throws ParserException if the current state is an entry state
 */
private function throwExceptionIfReadingEntry(string $char): void
{
    if (!$this->isEntryState($this->state)) {
        return;
    }

    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
467 |
468 | // ----- Auxiliaries -------------------------------------------------------
469 |
/**
 * Tells whether $state is part of an entry, i.e. any state other than NONE
 * and COMMENT.
 */
private function isEntryState(string $state): bool
{
    return !\in_array($state, [self::NONE, self::COMMENT], true);
}
474 |
/**
 * Tells whether $char is a space, a tab, a line feed or a carriage return.
 */
private function isWhitespace(string $char): bool
{
    return \in_array($char, [' ', "\t", "\n", "\r"], true);
}
479 | }
480 |
--------------------------------------------------------------------------------