├── src
    ├── Exception
    │   ├── ExceptionInterface.php
    │   ├── ProcessorException.php
    │   └── ParserException.php
    ├── Processor
    │   ├── TagNameCaseProcessor.php
    │   ├── TagSearchTrait.php
    │   ├── FillMissingProcessor.php
    │   ├── KeywordsProcessor.php
    │   ├── UrlFromDoiProcessor.php
    │   ├── TrimProcessor.php
    │   ├── DateProcessor.php
    │   ├── TagCoverageTrait.php
    │   ├── LatexToUnicodeProcessor.php
    │   └── NamesProcessor.php
    ├── ListenerInterface.php
    ├── Listener.php
    └── Parser.php
├── phpunit.dist.xml
├── LICENSE
├── composer.json
└── README.md


/src/Exception/ExceptionInterface.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Exception;
13 | 
/**
 * Common marker interface for the exception classes declared by this package.
 *
 * It declares no methods; it only gives callers a single type to catch.
 */
interface ExceptionInterface
{
}
18 | 


--------------------------------------------------------------------------------
/src/Exception/ProcessorException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Exception;
13 | 
14 | use Exception;
15 | 
/**
 * Exception type used to report a failure that happened inside a processor.
 */
class ProcessorException extends Exception implements ExceptionInterface
{
}
17 | 


--------------------------------------------------------------------------------
/src/Processor/TagNameCaseProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
/**
 * Processor that rewrites every tag name of an entry in one letter case.
 */
class TagNameCaseProcessor
{
    /** Case flag handed over to array_change_key_case() (CASE_LOWER or CASE_UPPER). */
    private readonly int $case;

    /**
     * @param int $case case flag understood by array_change_key_case()
     */
    public function __construct(int $case)
    {
        $this->case = $case;
    }

    /**
     * @param array $entry entry whose keys are the tag names
     *
     * @return array the entry with its keys converted by array_change_key_case()
     */
    public function __invoke(array $entry): array
    {
        $targetCase = $this->case;

        return array_change_key_case($entry, $targetCase);
    }
}
28 | 


--------------------------------------------------------------------------------
/src/Processor/TagSearchTrait.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
trait TagSearchTrait
{
    /**
     * Looks up how a tag is actually spelled inside a list of tag names.
     *
     * Names are compared with strcasecmp(), so letter case is ignored.
     *
     * @param string $needle   tag name to look for
     * @param array  $haystack candidate tag names
     *
     * @return string|null the first candidate equal to $needle ignoring case,
     *                     or null when none matches
     */
    protected function tagSearch(string $needle, array $haystack): string|null
    {
        foreach ($haystack as $candidate) {
            if (0 !== strcasecmp($needle, (string) $candidate)) {
                continue;
            }

            return $candidate;
        }

        return null;
    }
}
32 | 


--------------------------------------------------------------------------------
/src/ListenerInterface.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser;
13 | 
interface ListenerInterface
{
    /**
     * Notification sent each time a unit is found.
     *
     * @param string $text    Original content of the unit;
     *                        escape characters are not included.
     * @param string $type    Kind of unit found, given as the value of
     *                        one of the Parser constants.
     * @param array  $context Additional details about the unit found.
     */
    public function bibTexUnitFound($text, $type, array $context);
}
27 | 


--------------------------------------------------------------------------------
/src/Processor/FillMissingProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
/**
 * Adds default values for the tags an entry does not define.
 */
class FillMissingProcessor
{
    use TagSearchTrait;

    /**
     * @param array $missingFields default values indexed by tag name
     */
    public function __construct(
        private readonly array $missingFields,
    ) {}

    /**
     * Copies each configured default into the entry unless a tag with the
     * same name (compared case-insensitively) is already present.
     *
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry, completed with the missing defaults
     */
    public function __invoke(array $entry): array
    {
        $tags = array_keys($entry);

        foreach ($this->missingFields as $tag => $value) {
            // tagSearch() returns the matching tag name or null. Compare with
            // null explicitly: a truthiness test would treat an existing tag
            // whose name is falsy (e.g. "0") as missing and overwrite it.
            if (null === $this->tagSearch($tag, $tags)) {
                $entry[$tag] = $value;
            }
        }

        return $entry;
    }
}
35 | 


--------------------------------------------------------------------------------
/src/Processor/KeywordsProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
/**
 * Turns the content of the covered tags into a list of values.
 *
 * Only the "keywords" tag is covered unless the coverage is changed.
 */
class KeywordsProcessor
{
    use TagCoverageTrait;

    public function __construct()
    {
        $this->setTagCoverage(['keywords']);
    }

    /**
     * Splits every covered tag on ", " or "; ".
     *
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry with covered tags replaced by the preg_split() result
     */
    public function __invoke(array $entry): array
    {
        foreach ($this->getCoveredTags(array_keys($entry)) as $name) {
            $raw = (string) $entry[$name];
            $entry[$name] = preg_split('/, |; /', $raw);
        }

        return $entry;
    }
}
36 | 


--------------------------------------------------------------------------------
/phpunit.dist.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 |          xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/12.2/phpunit.xsd"
 4 |          bootstrap="vendor/autoload.php"
 5 |          cacheDirectory=".phpunit.cache"
 6 |          executionOrder="depends,defects"
 7 |          requireCoverageMetadata="false"
 8 |          beStrictAboutOutputDuringTests="true"
 9 |          displayDetailsOnPhpunitDeprecations="true"
10 |          failOnPhpunitDeprecation="true"
11 |          failOnRisky="true"
12 |          failOnWarning="true"
13 |          colors="true">
14 |   <testsuites>
15 |     <testsuite name="default">
16 |       <directory>tests</directory>
17 |     </testsuite>
18 |   </testsuites>
19 |   <source restrictNotices="true" restrictWarnings="true">
20 |     <include>
21 |       <directory>src</directory>
22 |     </include>
23 |   </source>
24 | </phpunit>
25 | 


--------------------------------------------------------------------------------
/src/Exception/ParserException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Exception;
13 | 
14 | use Exception;
15 | 
class ParserException extends Exception implements ExceptionInterface
{
    /**
     * Builds the exception describing a character found where it is not allowed.
     *
     * @param string $character the offending character
     * @param int    $line      line number reported in the message
     * @param int    $column    column number reported in the message
     */
    public static function unexpectedCharacter(string $character, int $line, int $column): self
    {
        if ("\0" === $character) {
            // var_export() renders \0 in an odd way, so spell it out by hand
            $printable = "'\\0'";
        } else {
            $printable = var_export($character, true);
        }

        $message = sprintf('Unexpected character %s at line %d column %d', $printable, $line, $column);

        return new self($message);
    }
}
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017 Renan de Lima Barbosa
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so,
 8 | subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/src/Processor/UrlFromDoiProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
/**
 * Derives a "url" tag from the "doi" tag when the entry has no URL.
 */
class UrlFromDoiProcessor
{
    use TagSearchTrait;

    /** Default sprintf() template; %s receives the DOI value. */
    public const FORMAT = 'https://doi.org/%s';

    /**
     * @param string $urlFormat sprintf() template used to build the URL
     */
    public function __construct(
        private readonly string $urlFormat = self::FORMAT,
    ) {}

    /**
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry, with "url" added when it could be derived
     */
    public function __invoke(array $entry): array
    {
        $tagNames = array_keys($entry);

        // An existing URL tag, in any letter case, is never replaced
        if (null !== $this->tagSearch('url', $tagNames)) {
            return $entry;
        }

        $doiTag = $this->tagSearch('doi', $tagNames);
        if (null === $doiTag) {
            return $entry;
        }

        $doi = $entry[$doiTag];
        if (!\is_string($doi) || '' === $doi) {
            return $entry;
        }

        $entry['url'] = sprintf($this->urlFormat, $doi);

        return $entry;
    }
}
38 | 


--------------------------------------------------------------------------------
/src/Processor/TrimProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
/**
 * Strips surrounding whitespace from the values of the covered tags.
 *
 * @author Florent DESPIERRES <florent@despierres.pro>
 */
class TrimProcessor
{
    use TagCoverageTrait;

    /**
     * @param array $fields tags to cover; when empty the trait's default coverage is kept
     */
    public function __construct(array $fields = [])
    {
        if ([] !== $fields) {
            $this->setTagCoverage($fields);
        }
    }

    /**
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry with every covered value trimmed
     */
    public function __invoke(array $entry): array
    {
        foreach ($this->getCoveredTags(array_keys($entry)) as $name) {
            $entry[$name] = $this->trim($entry[$name]);
        }

        return $entry;
    }

    /**
     * Trims a string, walks into arrays (keys are kept), and hands back any
     * other kind of value untouched.
     */
    private function trim(mixed $value): mixed
    {
        return match (true) {
            \is_array($value) => array_map($this->trim(...), $value),
            \is_string($value) => trim($value),
            default => $value,
        };
    }
}
56 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "renanbr/bibtex-parser",
 3 |     "type": "library",
 4 |     "description": "BibTex Parser provides an API to read .bib files programmatically",
 5 |     "keywords": [
 6 |         "bib",
 7 |         "bibtex",
 8 |         "latex",
 9 |         "parser",
10 |         "bibliography",
11 |         "citation",
12 |         "cite"
13 |     ],
14 |     "license": "MIT",
15 |     "authors": [
16 |         {
17 |             "name": "Renan de Lima Barbosa",
18 |             "email": "renandelima@gmail.com"
19 |         }
20 |     ],
21 |     "require": {
22 |         "php": "^8.1 | ^8.2 | ^8.3 | ^8.4"
23 |     },
24 |     "require-dev": {
25 |         "friendsofphp/php-cs-fixer": "^3.80",
26 |         "phpstan/phpstan": "^2.1.17",
27 |         "phpunit/phpunit": "^10.5.47 | ^11.5.26 | ^12.2.6",
28 |         "ueberdosis/pandoc": "^0.9"
29 |     },
30 |     "suggest": {
31 |         "ueberdosis/pandoc": "Needed to support LaTeX decoder in class RenanBr\\BibTexParser\\Processor\\LatexToUnicodeProcessor"
32 |     },
33 |     "config": {
34 |         "sort-packages": true,
35 |         "bump-after-update": true
36 |     },
37 |     "extra": {
38 |         "branch-alias": {
39 |             "dev-master": "2.x-dev"
40 |         }
41 |     },
42 |     "autoload": {
43 |         "psr-4": {
44 |             "RenanBr\\BibTexParser\\": "src/"
45 |         }
46 |     },
47 |     "autoload-dev": {
48 |         "psr-4": {
49 |             "RenanBr\\BibTexParser\\Test\\": "tests/"
50 |         }
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/Processor/DateProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
14 | use DateTimeImmutable;
15 | use DateTimeZone;
16 | 
/**
 * Combines the "year" and "month" tags of an entry into a date object.
 */
class DateProcessor
{
    use TagSearchTrait;

    /** Name of the tag that receives the date when none is given. */
    public const TAG_NAME = '_date';

    /**
     * @param string $tagName tag under which the date object is stored
     */
    public function __construct(
        private readonly string $tagName = self::TAG_NAME,
    ) {}

    /**
     * Stores a DateTimeImmutable (UTC) when the month tag has the shape
     * "<day>~<month>" and, together with the year tag, forms a valid date.
     *
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry, with the date tag added when it could be built
     */
    public function __invoke(array $entry): array
    {
        $tagNames = array_keys($entry);
        $yearTag = $this->tagSearch('year', $tagNames);
        $monthTag = $this->tagSearch('month', $tagNames);
        if (null === $yearTag || null === $monthTag) {
            return $entry;
        }

        $pieces = explode('~', (string) $entry[$monthTag]);
        if (2 !== \count($pieces)) {
            return $entry;
        }

        $year = (int) $entry[$yearTag];
        $day = (int) $pieces[0];
        // date_parse() reports false for the month when it cannot read one
        $parsed = date_parse($pieces[1]);
        $month = $parsed['month'] ?: 0;
        if (!checkdate($month, $day, $year)) {
            return $entry;
        }

        // NOTE(review): per the PHP manual, mktime() maps years 0-100 to
        // 1970-2069, so such years are not kept literally - confirm intended.
        $midnight = mktime(0, 0, 0, $month, $day, $year);
        $entry[$this->tagName] = new DateTimeImmutable(date('Y-m-d', $midnight), new DateTimeZone('UTC'));

        return $entry;
    }
}
49 | 


--------------------------------------------------------------------------------
/src/Processor/TagCoverageTrait.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
trait TagCoverageTrait
{
    use TagSearchTrait;

    /** Tag names the strategy applies to; initially the two internal tags. */
    private array $tagCoverageList = [
        '_original',
        '_type',
    ];

    /** Either "whitelist" or "blacklist"; a fresh instance starts as "blacklist". */
    private string $tagCoverageStrategy = 'blacklist';

    /**
     * Replaces the coverage configuration.
     *
     * @param array   $tags     List of tags the strategy applies to
     * @param ?string $strategy "whitelist" or "blacklist"; "whitelist" is used when omitted or empty
     */
    public function setTagCoverage(array $tags, ?string $strategy = null): void
    {
        if (!$strategy) {
            $strategy = 'whitelist';
        }

        $this->tagCoverageStrategy = $strategy;
        $this->tagCoverageList = $tags;
    }

    /**
     * Works out which of the given tags must be processed.
     *
     * Configured names are resolved to their actual spelling with a
     * case-insensitive search.
     *
     * @param array $tags tag names present in the entry
     *
     * @return array with "whitelist", the configured tags that were found;
     *               otherwise every given tag except those
     */
    protected function getCoveredTags(array $tags): array
    {
        // Resolve each configured name to the spelling used in $tags
        $found = [];
        foreach ($this->tagCoverageList as $wanted) {
            $real = $this->tagSearch($wanted, $tags);
            if (null === $real) {
                continue;
            }
            $found[] = $real;
        }

        return 'whitelist' === $this->tagCoverageStrategy
            ? $found
            : array_values(array_diff($tags, $found));
    }
}
60 | 


--------------------------------------------------------------------------------
/src/Processor/LatexToUnicodeProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /*
 4 |  * This file is part of the BibTex Parser.
 5 |  *
 6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
 7 |  *
 8 |  * For the full copyright and license information, please view the LICENSE
 9 |  * file that was distributed with this source code.
10 |  */
11 | 
12 | namespace RenanBr\BibTexParser\Processor;
13 | 
14 | use Composer\InstalledVersions;
15 | use Exception;
16 | use Pandoc\Pandoc;
17 | use RenanBr\BibTexParser\Exception\ProcessorException;
18 | use RuntimeException;
19 | 
/**
 * Converts LaTeX markup found in tag values to unicode text.
 */
class LatexToUnicodeProcessor
{
    use TagCoverageTrait;

    /** @var (callable(string): string)|null converter, created on first use */
    private $converter;

    /**
     * Decodes every string held by the covered tags, including strings nested
     * in arrays; values of any other type are left as they are.
     *
     * @param array $entry entry indexed by tag name
     *
     * @return array the entry with decoded values
     */
    public function __invoke(array $entry): array
    {
        foreach ($this->getCoveredTags(array_keys($entry)) as $name) {
            if (\is_array($entry[$name])) {
                // Nested values: decode each string leaf in place
                array_walk_recursive($entry[$name], function (&$leaf): void {
                    if (!\is_string($leaf)) {
                        return;
                    }
                    $leaf = $this->decode($leaf);
                });
            } elseif (\is_string($entry[$name])) {
                $entry[$name] = $this->decode($entry[$name]);
            }
        }

        return $entry;
    }

    /**
     * Runs the converter, wrapping any Exception into a ProcessorException.
     */
    private function decode($text): string
    {
        try {
            $convert = $this->getConverter();

            return $convert($text);
        } catch (Exception $failure) {
            $message = sprintf('Error while processing LaTeX to Unicode: %s', $failure->getMessage());

            throw new ProcessorException($message, 0, $failure);
        }
    }

    /**
     * Returns the converter, building and caching it on the first call.
     *
     * @return (callable(string): string)
     */
    private function getConverter(): callable
    {
        if (!$this->converter) {
            if (!InstalledVersions::isInstalled('ueberdosis/pandoc')) {
                throw new RuntimeException('Pandoc wrapper not installed. Try running "composer require ueberdosis/pandoc"');
            }

            $this->converter = static function ($text) {
                $plain = (string) (new Pandoc())->input($text)->execute([
                    '--from', 'latex',
                    '--to', 'plain',
                    '--wrap', 'none',
                ]);

                // Drop the final character of the output
                return mb_substr($plain, 0, -1);
            };
        }

        return $this->converter;
    }
}
82 | 


--------------------------------------------------------------------------------
/src/Listener.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /*
  4 |  * This file is part of the BibTex Parser.
  5 |  *
  6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
  7 |  *
  8 |  * For the full copyright and license information, please view the LICENSE
  9 |  * file that was distributed with this source code.
 10 |  */
 11 | 
 12 | namespace RenanBr\BibTexParser;
 13 | 
 14 | class Listener implements ListenerInterface
 15 | {
 16 |     private array $entries = [];
 17 | 
 18 |     /**
 19 |      * Current tag name.
 20 |      *
 21 |      * Indicates where to save contents when triggered by the parser.
 22 |      */
 23 |     private string $currentTagName;
 24 | 
 25 |     private array $processors = [];
 26 | 
 27 |     private array $processed = [];
 28 | 
 29 |     /**
 30 |      * @return array all entries found during parsing process
 31 |      */
 32 |     public function export(): array
 33 |     {
 34 |         $offset = \count($this->processed);
 35 |         $missing = \array_slice($this->entries, $offset);
 36 |         foreach ($this->processors as $processor) {
 37 |             $missing = array_filter(array_map($processor, $missing));
 38 |         }
 39 |         $this->processed = array_merge($this->processed, $missing);
 40 | 
 41 |         return $this->processed;
 42 |     }
 43 | 
 44 |     /**
 45 |      * @param (callable(array): array) $processor Function to be applied to every BibTeX entry.
 46 |      *                                            The processor given must return the modified entry.
 47 |      *                                            Processors will be applied in the same order in which they were added.
 48 |      */
 49 |     public function addProcessor(callable $processor): void
 50 |     {
 51 |         $this->processors[] = $processor;
 52 |     }
 53 | 
 54 |     public function bibTexUnitFound($text, $type, array $context): void
 55 |     {
 56 |         switch ($type) {
 57 |             case Parser::TYPE:
 58 |                 // Starts a new entry
 59 |                 $this->entries[] = [
 60 |                     '_type' => $text,
 61 |                     'type' => $text, // compatibility
 62 |                 ];
 63 |                 break;
 64 | 
 65 |             case Parser::CITATION_KEY:
 66 |                 $index = \count($this->entries) - 1;
 67 |                 $this->entries[$index]['citation-key'] = $text;
 68 |                 break;
 69 | 
 70 |             case Parser::TAG_NAME:
 71 |                 // Saves tag into the current entry
 72 |                 $index = \count($this->entries) - 1;
 73 |                 $this->currentTagName = $text;
 74 |                 $this->entries[$index][$this->currentTagName] = null;
 75 |                 break;
 76 | 
 77 |             case Parser::RAW_TAG_CONTENT:
 78 |                 // Searches for an abbreviation
 79 |                 foreach ($this->entries as $entry) {
 80 |                     if ('string' === $entry['type'] && \array_key_exists($text, $entry)) {
 81 |                         $text = $entry[$text];
 82 |                         break;
 83 |                     }
 84 |                 }
 85 |                 // no break
 86 | 
 87 |             case Parser::BRACED_TAG_CONTENT:
 88 |             case Parser::QUOTED_TAG_CONTENT:
 89 |                 // Appends content into the current tag
 90 |                 if (null !== $text) {
 91 |                     $index = \count($this->entries) - 1;
 92 |                     $this->entries[$index][$this->currentTagName] .= $text;
 93 |                 }
 94 |                 break;
 95 | 
 96 |             case Parser::ENTRY:
 97 |                 $index = \count($this->entries) - 1;
 98 |                 $this->entries[$index]['_original'] = $text;
 99 |                 break;
100 |         }
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/src/Processor/NamesProcessor.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /*
  4 |  * This file is part of the BibTex Parser.
  5 |  *
  6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
  7 |  *
  8 |  * For the full copyright and license information, please view the LICENSE
  9 |  * file that was distributed with this source code.
 10 |  */
 11 | 
 12 | namespace RenanBr\BibTexParser\Processor;
 13 | 
 14 | use RenanBr\BibTexParser\Exception\ProcessorException;
 15 | 
 16 | /**
 17 |  * Splits names in four parts: First Von Last Jr.
 18 |  *
 19 |  * This class includes source code adapted from the Structures_BibTex package,
 20 |  * (c) Elmar Pitschke <elmar.pitschke@gmx.de>, included here under PHP license:
 21 |  * http://www.php.net/license/3_0.txt
 22 |  *
 23 |  * @author Andre Chalom <andrechalom@gmail.com>
 24 |  *
 25 |  * @see https://github.com/pear/Structures_BibTex
 26 |  */
 27 | class NamesProcessor
 28 | {
 29 |     use TagCoverageTrait;
 30 | 
 31 |     public function __construct()
 32 |     {
 33 |         $this->setTagCoverage(['author', 'editor']);
 34 |     }
 35 | 
 36 |     public function __invoke(array $entry): array
 37 |     {
 38 |         $covered = $this->getCoveredTags(array_keys($entry));
 39 |         foreach ($covered as $tag) {
 40 |             $entry[$tag] = $this->extractAuthors($entry[$tag]);
 41 |         }
 42 | 
 43 |         return $entry;
 44 |     }
 45 | 
 46 |     /**
 47 |      * Extracting the authors.
 48 |      *
 49 |      * @author Elmar Pitschke <elmar.pitschke@gmx.de>
 50 |      */
 51 |     private function extractAuthors(string $entry): array
 52 |     {
 53 |         // Sanitizes the entry to remove unwanted whitespace
 54 |         $entry = trim((string) preg_replace('/\s+/', ' ', $entry));
 55 | 
 56 |         $authorarray = explode(' and ', $entry);
 57 |         for ($i = 0; $i < \count($authorarray); ++$i) {
 58 |             $author = trim($authorarray[$i]);
 59 |             /*The first version of how an author could be written (First von Last)
 60 |             has no commas in it*/
 61 |             $first = '';
 62 |             $von = '';
 63 |             $last = '';
 64 |             $jr = '';
 65 |             if (false === mb_strpos($author, ',')) {
 66 |                 $tmparray = [];
 67 |                 $tmparray = preg_split('/[\s\~]/', $author);
 68 |                 $size = \count($tmparray);
 69 |                 if (1 === $size) { // There is only a last
 70 |                     $last = $tmparray[0];
 71 |                 } elseif (2 === $size) { // There is a first and a last
 72 |                     $first = $tmparray[0];
 73 |                     $last = $tmparray[1];
 74 |                 } else {
 75 |                     $invon = false;
 76 |                     $inlast = false;
 77 |                     for ($j = 0; $j < ($size - 1); ++$j) {
 78 |                         if ($inlast) {
 79 |                             $last .= ' ' . $tmparray[$j];
 80 |                         } elseif ($invon) {
 81 |                             try {
 82 |                                 $case = $this->determineCase($tmparray[$j]);
 83 | 
 84 |                                 if ((0 === $case) || (-1 === $case)) { // Change from von to last
 85 |                                     // You only change when there is no more lower case there
 86 |                                     $islast = true;
 87 |                                     for ($k = ($j + 1); $k < ($size - 1); ++$k) {
 88 |                                         try {
 89 |                                             $futurecase = $this->determineCase($tmparray[$k]);
 90 |                                             if (0 === $futurecase) {
 91 |                                                 $islast = false;
 92 |                                             }
 93 |                                         } catch (ProcessorException) {
 94 |                                             // Ignore
 95 |                                         }
 96 |                                     }
 97 |                                     if ($islast) {
 98 |                                         $inlast = true;
 99 |                                         if (-1 === $case) { // Caseless belongs to the last
100 |                                             $last .= ' ' . $tmparray[$j];
101 |                                         } else {
102 |                                             $von .= ' ' . $tmparray[$j];
103 |                                         }
104 |                                     } else {
105 |                                         $von .= ' ' . $tmparray[$j];
106 |                                     }
107 |                                 } else {
108 |                                     $von .= ' ' . $tmparray[$j];
109 |                                 }
110 |                             } catch (ProcessorException) {
111 |                                 // Ignore
112 |                             }
113 |                         } else {
114 |                             try {
115 |                                 $case = $this->determineCase($tmparray[$j]);
116 |                                 if (0 === $case) { // Change from first to von
117 |                                     $invon = true;
118 |                                     $von .= ' ' . $tmparray[$j];
119 |                                 } else {
120 |                                     $first .= ' ' . $tmparray[$j];
121 |                                 }
122 |                             } catch (ProcessorException) {
123 |                                 // Ignore
124 |                             }
125 |                         }
126 |                     }
127 |                     // The last entry is always the last!
128 |                     $last .= ' ' . $tmparray[$size - 1];
129 |                 }
130 |             } else { // Version 2 and 3
131 |                 $tmparray = [];
132 |                 $tmparray = explode(',', $author);
133 |                 // The first entry must contain von and last
134 |                 $vonlastarray = [];
135 |                 $vonlastarray = explode(' ', $tmparray[0]);
136 |                 $size = \count($vonlastarray);
137 |                 if (1 === $size) { // Only one entry->got to be the last
138 |                     $last = $vonlastarray[0];
139 |                 } else {
140 |                     $inlast = false;
141 |                     for ($j = 0; $j < ($size - 1); ++$j) {
142 |                         if ($inlast) {
143 |                             $last .= ' ' . $vonlastarray[$j];
144 |                         } else {
145 |                             if (0 !== $this->determineCase($vonlastarray[$j])) { // Change from von to last
146 |                                 $islast = true;
147 |                                 for ($k = ($j + 1); $k < ($size - 1); ++$k) {
148 |                                     try {
149 |                                         $case = $this->determineCase($vonlastarray[$k]);
150 |                                         if (0 === $case) {
151 |                                             $islast = false;
152 |                                         }
153 |                                     } catch (ProcessorException) {
154 |                                         // Ignore
155 |                                     }
156 |                                 }
157 |                                 if ($islast) {
158 |                                     $inlast = true;
159 |                                     $last .= ' ' . $vonlastarray[$j];
160 |                                 } else {
161 |                                     $von .= ' ' . $vonlastarray[$j];
162 |                                 }
163 |                             } else {
164 |                                 $von .= ' ' . $vonlastarray[$j];
165 |                             }
166 |                         }
167 |                     }
168 |                     $last .= ' ' . $vonlastarray[$size - 1];
169 |                 }
170 |                 // Now we check if it is version three (three entries in the array (two commas)
171 |                 if (3 === \count($tmparray)) {
172 |                     $jr = $tmparray[1];
173 |                 }
174 |                 // Everything in the last entry is first
175 |                 $first = $tmparray[\count($tmparray) - 1];
176 |             }
177 |             $authorarray[$i] = ['first' => trim($first), 'von' => trim($von), 'last' => trim($last), 'jr' => trim($jr)];
178 |         }
179 | 
180 |         return $authorarray;
181 |     }
182 | 
183 |     /**
184 |      * Determines the case of a word according to the needs of BibTeX.
185 |      *
186 |      * The word is trimmed and scanned from left to right; the first ASCII
187 |      * letter found outside of braces decides the result. Other characters
188 |      * (digits, punctuation, non-ASCII letters) are skipped. Possible values:
189 |      * - Upper Case (return value 1)
190 |      * - Lower Case (return value 0)
191 |      * - Caseless   (return value -1), when no such letter exists
192 |      *
193 |      * @throws ProcessorException if $word is empty once trimmed
194 |      *
195 |      * @author Elmar Pitschke <elmar.pitschke@gmx.de>
196 |      */
197 |     private function determineCase(string $word): int
198 |     {
199 |         $trimmedword = trim($word);
200 |         $length = mb_strlen($trimmedword);
201 |         if (0 === $length) {
202 |             throw new ProcessorException('Could not determine case on word: ' . $word);
203 |         }
204 |
205 |         $openbrace = 0;
206 |         // Walk the trimmed word only: bounding the scan by the untrimmed
207 |         // length (inclusive) would read past the end of the trimmed string.
208 |         for ($i = 0; $i < $length; ++$i) {
209 |             $letter = mb_substr($trimmedword, $i, 1);
210 |             if ('{' === $letter) {
211 |                 ++$openbrace;
212 |             } elseif ('}' === $letter) {
213 |                 --$openbrace;
214 |             }
215 |             if (0 !== $openbrace) {
216 |                 // Characters inside (or unbalanced by) braces never decide
217 |                 continue;
218 |             }
219 |             // Match the character itself rather than ord(), which would only
220 |             // look at the first byte of a multibyte character.
221 |             if (1 === preg_match('/^[A-Z]$/', $letter)) {
222 |                 return 1;
223 |             }
224 |             if (1 === preg_match('/^[a-z]$/', $letter)) {
225 |                 return 0;
226 |             }
227 |         }
228 |
229 |         return -1;
230 |     }
231 | }
232 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">PHP BibTeX Parser 2.x</h1>
  2 | <p align="center">
  3 |     This is a
  4 |     <a href="https://tug.org/bibtex/">BibTeX</a>
  5 |     parser written in
  6 |     <a href="https://php.net">PHP</a>.
  7 | </p>
  8 | <p align="center">
  9 |     <a href="https://tug.org/bibtex/">
 10 |         <img src="https://upload.wikimedia.org/wikipedia/commons/3/30/BibTeX_logo.svg" height="83" alt="BibTeX logo">
 11 |     </a>
 12 |     <a href="https://php.net">
 13 |         <img src="https://upload.wikimedia.org/wikipedia/commons/2/27/PHP-logo.svg" height="83" alt="PHP logo">
 14 |     </a>
 15 | </p>
 16 | 
 17 | ![Tests](https://github.com/renanbr/bibtex-parser/workflows/Tests/badge.svg)
 18 | ![Static Analysis](https://github.com/renanbr/bibtex-parser/workflows/Static%20Analysis/badge.svg)
 19 | ![Coding Standards](https://github.com/renanbr/bibtex-parser/workflows/Coding%20Standards/badge.svg)
 20 | 
 21 | You are browsing the documentation of **BibTeX Parser 2.x**, the latest version.
 22 | 
 23 | ## Table of contents
 24 | 
 25 | * [Installing](#installing)
 26 | * [Usage](#usage)
 27 | * [Vocabulary](#vocabulary)
 28 | * [Processors](#processors)
 29 |   * [Tag name case](#tag-name-case)
 30 |   * [Authors and editors](#authors-and-editors)
 31 |   * [Keywords](#keywords)
 32 |   * [Date](#date)
 33 |   * [Fill missing tag](#fill-missing-tag)
 34 |   * [Trim tags](#trim-tags)
 35 |   * [Determine URL from the DOI](#determine-url-from-the-doi)
 36 |   * [LaTeX to unicode](#latex-to-unicode)
 37 |   * [Custom](#custom)
 38 | * [Handling errors](#handling-errors)
 39 | * [Advanced usage](#advanced-usage)
 40 | * [Release Policy](#release-policy)
 41 |   * [Dependencies Compatibility Policy](#dependencies-compatibility-policy)
 42 | 
 43 | ## Installing
 44 | 
 45 | ```bash
 46 | composer require renanbr/bibtex-parser
 47 | ```
 48 | 
 49 | ## Usage
 50 | 
 51 | ```php
 52 | use RenanBr\BibTexParser\Listener;
 53 | use RenanBr\BibTexParser\Parser;
 54 | use RenanBr\BibTexParser\Processor;
 55 | 
 56 | require 'vendor/autoload.php';
 57 | 
 58 | $bibtex = <<<BIBTEX
 59 | @article{einstein1916relativity,
 60 |   title={Relativity: The Special and General Theory},
 61 |   author={Einstein, Albert},
 62 |   year={1916}
 63 | }
 64 | BIBTEX;
 65 | 
 66 | // Create and configure a Listener
 67 | $listener = new Listener();
 68 | $listener->addProcessor(new Processor\TagNameCaseProcessor(CASE_LOWER));
 69 | // $listener->addProcessor(new Processor\NamesProcessor());
 70 | // $listener->addProcessor(new Processor\KeywordsProcessor());
 71 | // $listener->addProcessor(new Processor\DateProcessor());
 72 | // $listener->addProcessor(new Processor\FillMissingProcessor([/* ... */]));
 73 | // $listener->addProcessor(new Processor\TrimProcessor());
 74 | // $listener->addProcessor(new Processor\UrlFromDoiProcessor());
 75 | // $listener->addProcessor(new Processor\LatexToUnicodeProcessor());
 76 | // ... you can append as many Processors as you want
 77 | 
 78 | // Create a Parser and attach the listener
 79 | $parser = new Parser();
 80 | $parser->addListener($listener);
 81 | 
 82 | // Parse the content, then read processed data from the Listener
 83 | $parser->parseString($bibtex); // or parseFile('/path/to/file.bib')
 84 | $entries = $listener->export();
 85 | 
 86 | print_r($entries);
 87 | ```
 88 | 
 89 | This will output:
 90 | 
 91 | ```
 92 | Array
 93 | (
 94 |     [0] => Array
 95 |         (
 96 |             [_type] => article
 97 |             [citation-key] => einstein1916relativity
 98 |             [title] => Relativity: The Special and General Theory
 99 |             [author] => Einstein, Albert
100 |             [year] => 1916
101 |         )
102 | )
103 | ```
104 | 
105 | ## Vocabulary
106 | 
107 | [BibTeX] is all about "entry", "tag's name" and "tag's content".
108 | 
109 | > A [BibTeX] **entry** consists of the type (the word after @), a citation-key and a number of tags which define various characteristics of the specific [BibTeX] entry.
110 | > (...) A [BibTeX] **tag** is specified by its **name** followed by an equals sign, and the **content**.
111 | 
112 | Source: http://www.bibtex.org/Format/
113 | 
114 | Note:
115 | This library considers "type" and "citation-key" as tags.
116 | This behavior can be changed [implementing your own Listener](#advanced-usage).
117 | 
118 | ## Processors
119 | 
120 | `Processor` is a [callable] that receives an entry as argument and returns a modified entry.
121 | 
122 | This library contains three main parts:
123 | 
124 | - `Parser` class, responsible for detecting units inside a [BibTeX] input;
125 | - `Listener` class, responsible for gathering units and transforming them into a list of entries;
126 | - `Processor` classes, responsible for manipulating entries.
127 | 
128 | Although you can't configure the `Parser`, you can append as many `Processor`s as you want to the `Listener` through `Listener::addProcessor()` before exporting the contents.
129 | Be aware that `Listener` provides, by default, these features:
130 | 
131 | - Found entries are reachable through `Listener::export()` method;
132 | - [Tag content concatenation](http://www.bibtex.org/Format/);
133 |     - e.g. `hello # " world"` tag's content will generate `hello world` [string]
134 | - [Tag content abbreviation handling](http://www.bibtex.org/Format/);
135 |     - e.g. `@string{foo="bar"} @misc{bar=foo}` will make `$entries[1]['bar']` assume `bar` as value
136 | - Publication's type exposed as `_type` tag;
137 | - Citation key exposed as `citation-key` tag;
138 | - Original entry text exposed as `_original` tag.
139 | 
140 | This project ships some useful processors.
141 | 
142 | ### Tag name case
143 | 
144 | In [BibTeX] the tag's names aren't case-sensitive.
145 | This library exposes entries as [array], in which keys are case-sensitive.
146 | To avoid this misunderstanding, you can force the tags' name character case using `TagNameCaseProcessor`.
147 | 
148 | <details><summary>Usage</summary>
149 | 
150 | ```php
151 | use RenanBr\BibTexParser\Processor\TagNameCaseProcessor;
152 | 
153 | $listener->addProcessor(new TagNameCaseProcessor(CASE_UPPER)); // or CASE_LOWER
154 | ```
155 | 
156 | ```bib
157 | @article{
158 |   title={BibTeX rocks}
159 | }
160 | ```
161 | 
162 | ```
163 | Array
164 | (
165 |     [0] => Array
166 |         (
167 |             [TYPE] => article
168 |             [TITLE] => BibTeX rocks
169 |         )
170 | )
171 | ```
172 | 
173 | </details>
174 | 
175 | ### Authors and editors
176 | 
177 | [BibTeX] recognizes four parts of an author's name: First Von Last Jr.
178 | If you would like to parse the `author` and `editor` tags included in your entries, you can use the `NamesProcessor` class.
179 | 
180 | <details><summary>Usage</summary>
181 | 
182 | ```php
183 | use RenanBr\BibTexParser\Processor\NamesProcessor;
184 | 
185 | $listener->addProcessor(new NamesProcessor());
186 | ```
187 | 
188 | ```bib
189 | @article{
190 |   title={Relativity: The Special and General Theory},
191 |   author={Einstein, Albert}
192 | }
193 | ```
194 | 
195 | ```
196 | Array
197 | (
198 |     [0] => Array
199 |         (
200 |             [type] => article
201 |             [title] => Relativity: The Special and General Theory
202 |             [author] => Array
203 |                 (
204 |                     [0] => Array
205 |                         (
206 |                             [first] => Albert
207 |                             [von] =>
208 |                             [last] => Einstein
209 |                             [jr] =>
210 |                         )
211 |                 )
212 |         )
213 | )
214 | ```
215 | 
216 | </details>
217 | 
218 | ### Keywords
219 | 
220 | The `keywords` tag contains a list of expressions represented as [string], you might want to read them as an [array] instead.
221 | 
222 | <details><summary>Usage</summary>
223 | 
224 | ```php
225 | use RenanBr\BibTexParser\Processor\KeywordsProcessor;
226 | 
227 | $listener->addProcessor(new KeywordsProcessor());
228 | ```
229 | 
230 | ```bib
231 | @misc{
232 |   title={The End of Theory: The Data Deluge Makes the Scientific Method Obsolete},
233 |   keywords={big data, data deluge, scientific method}
234 | }
235 | ```
236 | 
237 | ```
238 | Array
239 | (
240 |     [0] => Array
241 |         (
242 |             [type] => misc
243 |             [title] => The End of Theory: The Data Deluge Makes the Scientific Method Obsolete
244 |             [keywords] => Array
245 |                 (
246 |                     [0] => big data
247 |                     [1] => data deluge
248 |                     [2] => scientific method
249 |                 )
250 |         )
251 | )
252 | ```
253 | 
254 | </details>
255 | 
256 | ### Date
257 | 
258 | It adds a new tag `_date` as [DateTimeImmutable].
259 | This processor adds the new tag **if and only if** both the `month` and `year` tags are filled in.
260 | 
261 | <details><summary>Usage</summary>
262 | 
263 | ```php
264 | use RenanBr\BibTexParser\Processor\DateProcessor;
265 | 
266 | $listener->addProcessor(new DateProcessor());
267 | ```
268 | 
269 | ```bib
270 | @misc{
271 |   month="1~oct",
272 |   year=2000
273 | }
274 | ```
275 | 
276 | ```
277 | Array
278 | (
279 |     [0] => Array
280 |         (
281 |             [type] => misc
282 |             [month] => 1~oct
283 |             [year] => 2000
284 |             [_date] => DateTimeImmutable Object
285 |                 (
286 |                     [date] => 2000-10-01 00:00:00.000000
287 |                     [timezone_type] => 3
288 |                     [timezone] => UTC
289 |                 )
290 |         )
291 | )
292 | ```
293 | 
294 | </details>
295 | 
296 | ### Fill missing tag
297 | 
298 | It assigns a default value to each configured tag that is missing from an entry.
299 | 
300 | <details><summary>Usage</summary>
301 | 
302 | ```php
303 | use RenanBr\BibTexParser\Processor\FillMissingProcessor;
304 | 
305 | $listener->addProcessor(new FillMissingProcessor([
306 |     'title' => 'This entry has no title',
307 |     'year' => 1970,
308 | ]));
309 | ```
310 | 
311 | ```bib
312 | @misc{
313 | }
314 | 
315 | @misc{
316 |     title="I do exist"
317 | }
318 | ```
319 | 
320 | ```
321 | Array
322 | (
323 |     [0] => Array
324 |         (
325 |             [type] => misc
326 |             [title] => This entry has no title
327 |             [year] => 1970
328 |         )
329 |     [1] => Array
330 |         (
331 |             [type] => misc
332 |             [title] => I do exist
333 |             [year] => 1970
334 |         )
335 | )
336 | ```
337 | 
338 | </details>
339 | 
340 | ### Trim tags
341 | 
342 | Apply [trim()] to all tags.
343 | 
344 | <details><summary>Usage</summary>
345 | 
346 | ```php
347 | use RenanBr\BibTexParser\Processor\TrimProcessor;
348 | 
349 | $listener->addProcessor(new TrimProcessor());
350 | ```
351 | 
352 | ```bib
353 | @misc{
354 |   title=" too much space  "
355 | }
356 | ```
357 | 
358 | ```
359 | Array
360 | (
361 |     [0] => Array
362 |         (
363 |             [type] => misc
364 |             [title] => too much space
365 |         )
366 | 
367 | )
368 | ```
369 | 
370 | </details>
371 | 
372 | ### Determine URL from the DOI
373 | 
374 | Sets `url` tag with [DOI] if `doi` tag is present and `url` tag is missing.
375 | 
376 | <details><summary>Usage</summary>
377 | 
378 | ```php
379 | use RenanBr\BibTexParser\Processor\UrlFromDoiProcessor;
380 | 
381 | $listener->addProcessor(new UrlFromDoiProcessor());
382 | ```
383 | 
384 | ```bib
385 | @misc{
386 |   doi="qwerty"
387 | }
388 | 
389 | @misc{
390 |   doi="azerty",
391 |   url="http://example.org"
392 | }
393 | ```
394 | 
395 | ```
396 | Array
397 | (
398 |     [0] => Array
399 |         (
400 |             [type] => misc
401 |             [doi] => qwerty
402 |             [url] => https://doi.org/qwerty
403 |         )
404 | 
405 |     [1] => Array
406 |         (
407 |             [type] => misc
408 |             [doi] => azerty
409 |             [url] => http://example.org
410 |         )
411 | )
412 | ```
413 | 
414 | </details>
415 | 
416 | ### LaTeX to unicode
417 | 
418 | [BibTeX] files store [LaTeX] contents.
419 | You might want to read them as unicode instead.
420 | The `LatexToUnicodeProcessor` class solves this problem, but before adding the processor to the listener you must:
421 | 
422 | - [install Pandoc](http://pandoc.org/installing.html) in your system; and
423 | - add [ryakad/pandoc-php](https://github.com/ryakad/pandoc-php) or [ueberdosis/pandoc](https://github.com/ueberdosis/pandoc) as a dependency of your project.
424 | 
425 | <details><summary>Usage</summary>
426 | 
427 | ```php
428 | use RenanBr\BibTexParser\Processor\LatexToUnicodeProcessor;
429 | 
430 | $listener->addProcessor(new LatexToUnicodeProcessor());
431 | ```
432 | 
433 | ```bib
434 | @article{
435 |   title={Caf\\'{e}s and bars}
436 | }
437 | ```
438 | 
439 | ```
440 | Array
441 | (
442 |     [0] => Array
443 |         (
444 |             [type] => article
445 |             [title] => Cafés and bars
446 |         )
447 | )
448 | ```
449 | 
450 | </details>
451 | 
452 | Note: Order matters; add this processor last.
453 | 
454 | ### Custom
455 | 
456 | The `Listener::addProcessor()` method expects a [callable] as argument.
457 | In the example shown below, we append the text `with laser` to the `title` tags for all entries.
458 | 
459 | <details><summary>Usage</summary>
460 | 
461 | ```php
462 | $listener->addProcessor(static function (array $entry) {
463 |     $entry['title'] .= ' with laser';
464 |     return $entry;
465 | });
466 | ```
467 | 
468 | ```
469 | @article{
470 |   title={BibTeX rocks}
471 | }
472 | ```
473 | 
474 | ```
475 | Array
476 | (
477 |     [0] => Array
478 |         (
479 |             [type] => article
480 |             [title] => BibTeX rocks with laser
481 |         )
482 | )
483 | ```
484 | 
485 | </details>
486 | 
487 | ## Handling errors
488 | 
489 | This library throws two types of exception: `ParserException` and `ProcessorException`.
490 | The first one may happen during the data extraction.
491 | When it occurs it probably means the parsed BibTeX isn't valid.
492 | The second exception may happen during the data processing.
493 | When it occurs it means the listener's processors can't properly handle the data found.
494 | Both implement `ExceptionInterface`.
495 | 
496 | ```php
497 | use RenanBr\BibTexParser\Exception\ExceptionInterface;
498 | use RenanBr\BibTexParser\Exception\ParserException;
499 | use RenanBr\BibTexParser\Exception\ProcessorException;
500 | 
501 | try {
502 |     // ... parser and listener configuration
503 | 
504 |     $parser->parseFile('/path/to/file.bib');
505 |     $entries = $listener->export();
506 | } catch (ParserException $exception) {
507 |     // The BibTeX isn't valid
508 | } catch (ProcessorException $exception) {
509 |     // Listener's processors aren't able to handle data found
510 | } catch (ExceptionInterface $exception) {
511 |     // Alternatively, you can use this exception to catch all of them at once
512 | }
513 | ```
514 | 
515 | ## Advanced usage
516 | 
517 | The core of this library contains these main classes:
518 | 
519 | - `RenanBr\BibTexParser\Parser` responsible for detecting units inside a [BibTeX] input;
520 | - `RenanBr\BibTexParser\ListenerInterface` responsible for treating units found.
521 | 
522 | You can attach listeners to the parser through `Parser::addListener()`.
523 | The parser is able to detect [BibTeX] units, such as "type", "tag's name", "tag's content".
524 | As the parser finds a unit, it triggers the listeners attached to it.
525 | 
526 | You can code your own listener! All you have to do is handle units.
527 | 
528 | ```php
529 | namespace RenanBr\BibTexParser;
530 | 
531 | interface ListenerInterface
532 | {
533 |     /**
534 |      * Called when an unit is found.
535 |      *
536 |      * @param string $text    The original content of the unit found.
537 |      *                        Escape character will not be sent.
538 |      * @param string $type    The type of unit found.
539 |      *                        It can assume one of Parser's constant value.
540 |      * @param array  $context Contains details of the unit found.
541 |      */
542 |     public function bibTexUnitFound($text, $type, array $context);
543 | }
544 | ```
545 | 
546 | `$type` may assume one of these values:
547 | 
548 | - `Parser::TYPE`
549 | - `Parser::CITATION_KEY`
550 | - `Parser::TAG_NAME`
551 | - `Parser::RAW_TAG_CONTENT`
552 | - `Parser::BRACED_TAG_CONTENT`
553 | - `Parser::QUOTED_TAG_CONTENT`
554 | - `Parser::ENTRY`
555 | 
556 | `$context` is an [array] with these keys:
557 | 
558 | - `offset` contains the `$text`'s beginning position.
559 |   It may be useful, for example, to [seek on a file pointer](https://php.net/fseek);
560 | - `length` contains the original `$text`'s length.
561 |   It may differ from the length of the [string] sent to the listener because there may be escaped characters.
562 | 
563 | ## Release Policy
564 | 
565 | There is a **single** maintained branch at a time.
566 | This branch targets a minor version.
567 | 
568 | A maintained version reaches its end-of-life when a new minor version is released.
569 | 
570 | ### Dependencies Compatibility Policy
571 | 
572 | This library is compatible with maintained versions of
573 | [PHP][php-versions].
574 | 
575 | [BibTeX]: https://tug.org/bibtex/
576 | [DOI]: https://www.doi.org/
577 | [DateTimeImmutable]: https://www.php.net/manual/class.datetimeimmutable.php
578 | [LaTeX]: https://www.latex-project.org/
579 | [array]: https://php.net/manual/language.types.array.php
580 | [callable]: https://php.net/manual/en/language.types.callable.php
581 | [php-versions]: https://www.php.net/supported-versions.php
582 | [string]: https://php.net/manual/language.types.string.php
583 | [trim()]: https://www.php.net/trim
584 | 


--------------------------------------------------------------------------------
/src/Parser.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /*
  4 |  * This file is part of the BibTex Parser.
  5 |  *
  6 |  * (c) Renan de Lima Barbosa <renandelima@gmail.com>
  7 |  *
  8 |  * For the full copyright and license information, please view the LICENSE
  9 |  * file that was distributed with this source code.
 10 |  */
 11 | 
 12 | namespace RenanBr\BibTexParser;
 13 | 
 14 | use ErrorException;
 15 | use RenanBr\BibTexParser\Exception\ParserException;
 16 | 
 17 | class Parser
 18 | {
 19 |     public const TYPE = 'type';
 20 |     public const CITATION_KEY = 'citation_key';
 21 |     public const TAG_NAME = 'tag_name';
 22 |     public const RAW_TAG_CONTENT = 'raw_tag_content';
 23 |     public const BRACED_TAG_CONTENT = 'braced_tag_content';
 24 |     public const QUOTED_TAG_CONTENT = 'quoted_tag_content';
 25 |     public const ENTRY = 'entry';
 26 | 
 27 |     public const NONE = 'none';
 28 |     public const COMMENT = 'comment';
 29 |     public const FIRST_TAG_NAME = 'first_tag_name';
 30 |     public const POST_TYPE = 'post_type';
 31 |     public const POST_TAG_NAME = 'post_tag_name';
 32 |     public const PRE_TAG_CONTENT = 'pre_tag_content';
 33 | 
 34 |     private string $state;
 35 | 
 36 |     private string $buffer;
 37 | 
 38 |     private ?int $bufferOffset = null;
 39 | 
 40 |     private ?array $firstTagSnapshot = null;
 41 | 
 42 |     private ?string $originalEntryBuffer = null;
 43 | 
 44 |     private ?int $originalEntryOffset = null;
 45 | 
 46 |     private bool $skipOriginalEntryReading;
 47 | 
 48 |     private int $line;
 49 | 
 50 |     private int $column;
 51 | 
 52 |     private int $offset;
 53 | 
 54 |     private bool $isTagContentEscaped;
 55 | 
 56 |     private bool $mayConcatenateTagContent;
 57 | 
 58 |     private ?string $tagContentDelimiter = null;
 59 | 
 60 |     private int $braceLevel;
 61 | 
 62 |     /** @var array<ListenerInterface> */
 63 |     private array $listeners = [];
 64 | 
 65 |     /**
 66 |      * Registers a listener by appending it to the parser's listener list.
 67 |      */
 68 |     public function addListener(ListenerInterface $listener): void
 69 |     {
 70 |         array_push($this->listeners, $listener);
 71 |     }
 69 | 
 70 |     /**
 71 |      * Parses the BibTeX file located at $file.
 72 |      *
 73 |      * The parser state is reset first, then the file is consumed line by
 74 |      * line. Reading whole lines instead of fixed-size byte chunks ensures a
 75 |      * multibyte UTF-8 character is never split across two parse() calls,
 76 |      * which would make the UTF-8 aware split in parse() fail.
 77 |      *
 78 |      * @throws ParserException if $file given is not a valid BibTeX
 79 |      * @throws ErrorException  if $file given is not readable
 80 |      */
 81 |     public function parseFile(string $file): void
 82 |     {
 83 |         $handle = @fopen($file, 'r');
 84 |         if (!$handle) {
 85 |             throw new ErrorException(sprintf('Unable to open %s', $file));
 86 |         }
 87 |         try {
 88 |             $this->reset();
 89 |             // A line ends on "\n", a single byte that never occurs inside a
 90 |             // multibyte UTF-8 sequence, so no character straddles two chunks.
 91 |             while (false !== ($line = fgets($handle))) {
 92 |                 $this->parse($line);
 93 |             }
 94 |             $this->throwExceptionIfReadingEntry("\0");
 95 |         } finally {
 96 |             // Release the handle even when parsing throws
 97 |             fclose($handle);
 98 |         }
 99 |     }
 91 | 
 92 |     /**
 93 |      * Parses BibTeX content held in memory.
 94 |      *
 95 |      * Starts from a freshly reset state, consumes the whole string, then runs
 96 |      * the end-of-input check done by throwExceptionIfReadingEntry().
 97 |      *
 98 |      * @throws ParserException if $string given is not a valid BibTeX
 99 |      */
100 |     public function parseString(string $string): void
101 |     {
102 |         $this->reset();
103 |         $this->parse($string);
104 |         $this->throwExceptionIfReadingEntry("\0");
105 |     }
101 | 
102 |     /**
103 |      * Feeds $text to read() one UTF-8 character at a time while keeping the
104 |      * line, column and offset counters up to date.
105 |      */
106 |     private function parse(string $text): void
107 |     {
108 |         $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
109 |         $total = count($chars);
110 |         $index = 0;
111 |         while ($index < $total) {
112 |             $current = $chars[$index++];
113 |             $this->read($current);
114 |             ++$this->offset;
115 |             if ("\n" !== $current) {
116 |                 ++$this->column;
117 |                 continue;
118 |             }
119 |             // A line feed moves the cursor to the start of the next line
120 |             ++$this->line;
121 |             $this->column = 1;
122 |         }
123 |     }
118 | 
119 |     /**
120 |      * Puts the parser back into its initial state before reading new input.
121 |      *
122 |      * Every per-run property is cleared, including the buffer offset, so
123 |      * nothing recorded by a previous (possibly aborted) run leaks into the
124 |      * next one.
125 |      */
126 |     private function reset(): void
127 |     {
128 |         $this->state = self::NONE;
129 |         $this->buffer = '';
130 |         // Cleared together with the buffer, as readType() does
131 |         $this->bufferOffset = null;
132 |         $this->firstTagSnapshot = null;
133 |         $this->originalEntryBuffer = null;
134 |         $this->originalEntryOffset = null;
135 |         $this->skipOriginalEntryReading = false;
136 |         $this->line = 1;
137 |         $this->column = 1;
138 |         $this->offset = 0;
139 |         $this->mayConcatenateTagContent = false;
140 |         $this->isTagContentEscaped = false;
141 |         $this->tagContentDelimiter = null;
142 |         $this->braceLevel = 0;
143 |     }
135 | 
136 |     // ----- Readers -----------------------------------------------------------
137 | 
138 |     /**
139 |      * Dispatches $char to the reader matching the current state, then hands
140 |      * it to readOriginalEntry() along with the state held before dispatch.
141 |      */
142 |     private function read(string $char): void
143 |     {
144 |         $stateBefore = $this->state;
145 |
146 |         match ($stateBefore) {
147 |             self::NONE => $this->readNone($char),
148 |             self::COMMENT => $this->readComment($char),
149 |             self::TYPE => $this->readType($char),
150 |             self::POST_TYPE => $this->readPostType($char),
151 |             self::FIRST_TAG_NAME, self::TAG_NAME => $this->readTagName($char),
152 |             self::POST_TAG_NAME => $this->readPostTagName($char),
153 |             self::PRE_TAG_CONTENT => $this->readPreTagContent($char),
154 |             self::RAW_TAG_CONTENT => $this->readRawTagContent($char),
155 |             self::QUOTED_TAG_CONTENT, self::BRACED_TAG_CONTENT => $this->readDelimitedTagContent($char),
156 |             // Any other state has no dedicated reader
157 |             default => null,
158 |         };
159 |
160 |         $this->readOriginalEntry($char, $stateBefore);
161 |     }
176 | 
177 |     /**
178 |      * Handles a character read outside of any entry or comment: "@" starts
179 |      * a type, whitespace is ignored, anything else starts a comment.
180 |      */
181 |     private function readNone(string $char): void
182 |     {
183 |         if ('@' === $char) {
184 |             $this->state = self::TYPE;
185 |
186 |             return;
187 |         }
188 |         if ($this->isWhitespace($char)) {
189 |             return;
190 |         }
191 |         $this->state = self::COMMENT;
192 |     }
185 | 
186 |     /**
187 |      * Handles a character read inside a comment: the first whitespace
188 |      * character ends it, everything else is discarded.
189 |      */
190 |     private function readComment(string $char): void
191 |     {
192 |         if (!$this->isWhitespace($char)) {
193 |             return;
194 |         }
195 |         $this->state = self::NONE;
196 |     }
192 | 
193 |     /**
194 |      * Handles a character read while the entry type (the word following
195 |      * "@") is being collected.
196 |      *
197 |      * ASCII letters are buffered. Any other character ends the type: a
198 |      * "comment" type is dropped and read on as a comment, every other type
199 |      * is sent to the listeners before $char is re-read as POST_TYPE.
200 |      */
201 |     private function readType(string $char): void
202 |     {
203 |         if (1 === preg_match('/^[a-zA-Z]$/', $char)) {
204 |             $this->appendToBuffer($char);
205 |
206 |             return;
207 |         }
208 |
209 |         $this->throwExceptionIfBufferIsEmpty($char);
210 |
211 |         $isCommentType = 'comment' === mb_strtolower($this->buffer);
212 |         if (!$isCommentType) {
213 |             $this->triggerListenersWithCurrentBuffer();
214 |
215 |             // $char does not belong to the type, so it is read again as the
216 |             // first POST_TYPE character
217 |             $this->state = self::POST_TYPE;
218 |             $this->readPostType($char);
219 |
220 |             return;
221 |         }
222 |
223 |         // "@comment": forget what was buffered and go on as a comment
224 |         $this->skipOriginalEntryReading = true;
225 |         $this->buffer = '';
226 |         $this->bufferOffset = null;
227 |         $this->state = self::COMMENT;
228 |         $this->readComment($char);
229 |     }
219 | 
220 |     /**
221 |      * Handles a character read between the entry type and its opening
222 |      * brace; only whitespace is tolerated there.
223 |      *
224 |      * @throws ParserException on any character other than "{" or whitespace
225 |      */
226 |     private function readPostType(string $char): void
227 |     {
228 |         if ('{' === $char) {
229 |             $this->state = self::FIRST_TAG_NAME;
230 |
231 |             return;
232 |         }
233 |         if ($this->isWhitespace($char)) {
234 |             return;
235 |         }
236 |
237 |         throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
238 |     }
228 | 
229 |     /**
230 |      * Handles a character read while a tag name (or, for the first one, a
231 |      * possible citation key) is being collected.
232 |      *
233 |      * Valid name characters are buffered. Before anything is buffered,
234 |      * whitespace is skipped and "}" simply closes the entry. Any other
235 |      * character ends the name and is re-read as POST_TAG_NAME.
236 |      *
237 |      * The buffer is compared against '' instead of using empty(), because
238 |      * empty() also reports the string "0" as empty, which would discard a
239 |      * name or citation key made of that single character.
240 |      */
241 |     private function readTagName(string $char): void
242 |     {
243 |         if (preg_match('/^[a-zA-Z0-9_\+:\-\.\/\x{00C0}-\x{01FF}]$/u', $char)) {
244 |             $this->appendToBuffer($char);
245 |
246 |             return;
247 |         }
248 |
249 |         $nothingBuffered = '' === $this->buffer;
250 |
251 |         if ($this->isWhitespace($char) && $nothingBuffered) {
252 |             // Skips because we didn't start reading
253 |             return;
254 |         }
255 |
256 |         if ('}' === $char && $nothingBuffered) {
257 |             // No tag name found, $char is just closing current entry
258 |             $this->state = self::NONE;
259 |
260 |             return;
261 |         }
262 |
263 |         if ($nothingBuffered) {
264 |             // Only delegate when the buffer is truly empty, so a buffered
265 |             // "0" is never rejected here
266 |             $this->throwExceptionIfBufferIsEmpty($char);
267 |         }
268 |
269 |         if (self::FIRST_TAG_NAME === $this->state) {
270 |             // Takes a snapshot of current state to be triggered later as
271 |             // tag name or citation key, see readPostTagName()
272 |             $this->firstTagSnapshot = $this->takeBufferSnapshot();
273 |         } else {
274 |             // Current buffer is a simple tag name
275 |             $this->triggerListenersWithCurrentBuffer();
276 |         }
277 |
278 |         // Once $char isn't a valid tag name character, it must be
279 |         // interpreted as post tag name
280 |         $this->state = self::POST_TAG_NAME;
281 |         $this->readPostTagName($char);
282 |     }
256 | 
257 |     private function readPostTagName(string $char): void
258 |     {
259 |         if ('=' === $char) {
260 |             // First tag name isn't a citation key, because it has content
261 |             $this->triggerListenersWithFirstTagSnapshotAs(self::TAG_NAME);
262 |             $this->state = self::PRE_TAG_CONTENT;
263 |         } elseif ('}' === $char) {
264 |             // First tag name is a citation key, because $char closes entry and
265 |             // lets first tag without value
266 |             $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
267 |             $this->state = self::NONE;
268 |         } elseif (',' === $char) {
269 |             // First tag name is a citation key, because $char moves to the next
270 |             // tag and lets first tag without value
271 |             $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
272 |             $this->state = self::TAG_NAME;
273 |         } elseif (!$this->isWhitespace($char)) {
274 |             throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
275 |         }
276 |     }
277 | 
278 |     private function readPreTagContent(string $char): void
279 |     {
280 |         if (preg_match('/^[a-zA-Z0-9]$/', $char)) {
281 |             // When concatenation is available it means there is already a
282 |             // defined value, and parser expect a concatenator, a tag separator
283 |             // or an entry closing char as next $char
284 |             $this->throwExceptionAccordingToConcatenationAvailability($char, true);
285 |             $this->state = self::RAW_TAG_CONTENT;
286 |             $this->readRawTagContent($char);
287 |         } elseif ('"' === $char) {
288 |             // The exception is here for the same reason of the first case
289 |             $this->throwExceptionAccordingToConcatenationAvailability($char, true);
290 |             $this->tagContentDelimiter = '"';
291 |             $this->state = self::QUOTED_TAG_CONTENT;
292 |         } elseif ('{' === $char) {
293 |             // The exception is here for the same reason of the first case
294 |             $this->throwExceptionAccordingToConcatenationAvailability($char, true);
295 |             $this->tagContentDelimiter = '}';
296 |             $this->state = self::BRACED_TAG_CONTENT;
297 |         } elseif ('#' === $char) {
298 |             $this->throwExceptionAccordingToConcatenationAvailability($char, false);
299 |             $this->mayConcatenateTagContent = false;
300 |         } elseif (',' === $char) {
301 |             $this->throwExceptionAccordingToConcatenationAvailability($char, false);
302 |             $this->mayConcatenateTagContent = false;
303 |             $this->state = self::TAG_NAME;
304 |         } elseif ('}' === $char) {
305 |             $this->throwExceptionAccordingToConcatenationAvailability($char, false);
306 |             $this->mayConcatenateTagContent = false;
307 |             $this->state = self::NONE;
308 |         } elseif (!$this->isWhitespace($char)) {
309 |             throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
310 |         }
311 |     }
312 | 
313 |     private function readRawTagContent(string $char): void
314 |     {
315 |         if (preg_match('/^[a-zA-Z0-9_\+:\-\.\/]$/', $char)) {
316 |             $this->appendToBuffer($char);
317 |         } else {
318 |             $this->throwExceptionIfBufferIsEmpty($char);
319 |             $this->triggerListenersWithCurrentBuffer();
320 | 
321 |             // once $char isn't a valid character
322 |             // it must be interpreted as TAG_CONTENT
323 |             $this->mayConcatenateTagContent = true;
324 |             $this->state = self::PRE_TAG_CONTENT;
325 |             $this->readPreTagContent($char);
326 |         }
327 |     }
328 | 
329 |     private function readDelimitedTagContent(string $char): void
330 |     {
331 |         if ($this->isTagContentEscaped) {
332 |             $this->isTagContentEscaped = false;
333 |             if ($this->tagContentDelimiter !== $char && '\\' !== $char && '%' !== $char) {
334 |                 $this->appendToBuffer('\\');
335 |             }
336 |             $this->appendToBuffer($char);
337 |         } elseif ('}' === $this->tagContentDelimiter && '{' === $char) {
338 |             ++$this->braceLevel;
339 |             $this->appendToBuffer($char);
340 |         } elseif ($this->tagContentDelimiter === $char) {
341 |             if (0 === $this->braceLevel) {
342 |                 $this->triggerListenersWithCurrentBuffer();
343 |                 $this->mayConcatenateTagContent = true;
344 |                 $this->state = self::PRE_TAG_CONTENT;
345 |             } else {
346 |                 --$this->braceLevel;
347 |                 $this->appendToBuffer($char);
348 |             }
349 |         } elseif ('\\' === $char) {
350 |             $this->isTagContentEscaped = true;
351 |         } else {
352 |             $this->appendToBuffer($char);
353 |         }
354 |     }
355 | 
356 |     private function readOriginalEntry(string $char, string $previousState): void
357 |     {
358 |         if ($this->skipOriginalEntryReading) {
359 |             $this->originalEntryBuffer = '';
360 |             $this->originalEntryOffset = null;
361 |             $this->skipOriginalEntryReading = false;
362 | 
363 |             return;
364 |         }
365 | 
366 |         // Checks whether we are reading an entry character or not
367 |         $isPreviousStateEntry = $this->isEntryState($previousState);
368 |         $isCurrentStateEntry = $this->isEntryState($this->state);
369 |         $isEntry = $isPreviousStateEntry || $isCurrentStateEntry;
370 |         if (!$isEntry) {
371 |             return;
372 |         }
373 | 
374 |         // Appends $char to the original entry buffer
375 |         if (empty($this->originalEntryBuffer)) {
376 |             $this->originalEntryOffset = $this->offset;
377 |         }
378 |         $this->originalEntryBuffer .= $char;
379 | 
380 |         // Sends original entry to the listeners when $char closes an entry
381 |         $isClosingEntry = $isPreviousStateEntry && !$isCurrentStateEntry;
382 |         if ($isClosingEntry) {
383 |             $this->triggerListeners($this->originalEntryBuffer, self::ENTRY, [
384 |                 'offset' => $this->originalEntryOffset,
385 |                 'length' => $this->offset - $this->originalEntryOffset + 1,
386 |             ]);
387 |             $this->originalEntryBuffer = '';
388 |             $this->originalEntryOffset = null;
389 |         }
390 |     }
391 | 
392 |     // ----- Listener triggers -------------------------------------------------
393 | 
394 |     private function triggerListeners(string $text, string $type, array $context): void
395 |     {
396 |         foreach ($this->listeners as $listener) {
397 |             $listener->bibTexUnitFound($text, $type, $context);
398 |         }
399 |     }
400 | 
401 |     private function triggerListenersWithCurrentBuffer(): void
402 |     {
403 |         $snapshot = $this->takeBufferSnapshot();
404 |         $text = $snapshot['text'];
405 |         $context = $snapshot['context'];
406 |         $this->triggerListeners($text, $this->state, $context);
407 |     }
408 | 
409 |     private function triggerListenersWithFirstTagSnapshotAs(string $type): void
410 |     {
411 |         if (empty($this->firstTagSnapshot)) {
412 |             return;
413 |         }
414 |         $text = $this->firstTagSnapshot['text'];
415 |         $context = $this->firstTagSnapshot['context'];
416 |         $this->firstTagSnapshot = null;
417 |         $this->triggerListeners($text, $type, $context);
418 |     }
419 | 
420 |     // ----- Buffer tools ------------------------------------------------------
421 | 
422 |     private function appendToBuffer(string $char): void
423 |     {
424 |         if (empty($this->buffer)) {
425 |             $this->bufferOffset = $this->offset;
426 |         }
427 |         $this->buffer .= $char;
428 |     }
429 | 
430 |     private function takeBufferSnapshot(): array
431 |     {
432 |         $snapshot = [
433 |             'text' => $this->buffer,
434 |             'context' => [
435 |                 'offset' => $this->bufferOffset,
436 |                 'length' => $this->offset - $this->bufferOffset,
437 |             ],
438 |         ];
439 |         $this->bufferOffset = null;
440 |         $this->buffer = '';
441 | 
442 |         return $snapshot;
443 |     }
444 | 
445 |     // ----- Exception throwers ------------------------------------------------
446 | 
447 |     private function throwExceptionAccordingToConcatenationAvailability(string $char, bool $availability): void
448 |     {
449 |         if ($availability === $this->mayConcatenateTagContent) {
450 |             throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
451 |         }
452 |     }
453 | 
454 |     private function throwExceptionIfBufferIsEmpty(string $char): void
455 |     {
456 |         if (empty($this->buffer)) {
457 |             throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
458 |         }
459 |     }
460 | 
461 |     private function throwExceptionIfReadingEntry(string $char): void
462 |     {
463 |         if ($this->isEntryState($this->state)) {
464 |             throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
465 |         }
466 |     }
467 | 
468 |     // ----- Auxiliaries -------------------------------------------------------
469 | 
470 |     private function isEntryState(string $state): bool
471 |     {
472 |         return self::NONE !== $state && self::COMMENT !== $state;
473 |     }
474 | 
475 |     private function isWhitespace(string $char): bool
476 |     {
477 |         return ' ' === $char || "\t" === $char || "\n" === $char || "\r" === $char;
478 |     }
479 | }
480 | 


--------------------------------------------------------------------------------