├── src ├── Exception │ ├── ExceptionInterface.php │ ├── ProcessorException.php │ └── ParserException.php ├── Processor │ ├── TagNameCaseProcessor.php │ ├── TagSearchTrait.php │ ├── FillMissingProcessor.php │ ├── KeywordsProcessor.php │ ├── UrlFromDoiProcessor.php │ ├── TrimProcessor.php │ ├── DateProcessor.php │ ├── TagCoverageTrait.php │ ├── LatexToUnicodeProcessor.php │ └── NamesProcessor.php ├── ListenerInterface.php ├── Listener.php └── Parser.php ├── phpunit.dist.xml ├── LICENSE ├── composer.json └── README.md /src/Exception/ExceptionInterface.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Exception; 13 | 14 | /** 15 | * Interface for package exceptions. 16 | */ 17 | interface ExceptionInterface {} 18 | -------------------------------------------------------------------------------- /src/Exception/ProcessorException.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Exception; 13 | 14 | use Exception; 15 | 16 | class ProcessorException extends Exception implements ExceptionInterface {} 17 | -------------------------------------------------------------------------------- /src/Processor/TagNameCaseProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | /** 15 | * Change the case of all tag names. 
16 | */ 17 | class TagNameCaseProcessor 18 | { 19 | public function __construct( 20 | private readonly int $case, 21 | ) {} 22 | 23 | public function __invoke(array $entry): array 24 | { 25 | return array_change_key_case($entry, $this->case); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/Processor/TagSearchTrait.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | trait TagSearchTrait 15 | { 16 | /** 17 | * Searches for the actual name of a tag. 18 | * 19 | * The search performed is case-insensitive. 20 | */ 21 | protected function tagSearch(string $needle, array $haystack): string|null 22 | { 23 | foreach ($haystack as $actual) { 24 | if (0 === strcasecmp($needle, (string) $actual)) { 25 | return $actual; 26 | } 27 | } 28 | 29 | return null; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/ListenerInterface.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser; 13 | 14 | interface ListenerInterface 15 | { 16 | /** 17 | * Called when a unit is found. 18 | * 19 | * @param string $text The original content of the unit found. 20 | * Escape character will not be sent. 21 | * @param string $type The type of unit found. 22 | * It can assume one of Parser's constant value. 
23 | * @param array $context contains details of the unit found 24 | */ 25 | public function bibTexUnitFound($text, $type, array $context); 26 | } 27 | -------------------------------------------------------------------------------- /src/Processor/FillMissingProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | class FillMissingProcessor 15 | { 16 | use TagSearchTrait; 17 | 18 | public function __construct( 19 | private readonly array $missingFields, 20 | ) {} 21 | 22 | public function __invoke(array $entry): array 23 | { 24 | $tags = array_keys($entry); 25 | 26 | foreach ($this->missingFields as $tag => $value) { 27 | if (!$this->tagSearch($tag, $tags)) { 28 | $entry[$tag] = $value; 29 | } 30 | } 31 | 32 | return $entry; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/Processor/KeywordsProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | /** 15 | * Splits tags contents as array. 
16 | */ 17 | class KeywordsProcessor 18 | { 19 | use TagCoverageTrait; 20 | 21 | public function __construct() 22 | { 23 | $this->setTagCoverage(['keywords']); 24 | } 25 | 26 | public function __invoke(array $entry): array 27 | { 28 | $covered = $this->getCoveredTags(array_keys($entry)); 29 | foreach ($covered as $tag) { 30 | $entry[$tag] = preg_split('/, |; /', (string) $entry[$tag]); 31 | } 32 | 33 | return $entry; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /phpunit.dist.xml: -------------------------------------------------------------------------------- 1 | 2 | 14 | 15 | 16 | tests 17 | 18 | 19 | 20 | 21 | src 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/Exception/ParserException.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Exception; 13 | 14 | use Exception; 15 | 16 | class ParserException extends Exception implements ExceptionInterface 17 | { 18 | public static function unexpectedCharacter(string $character, int $line, int $column): self 19 | { 20 | // Avoid var_export() weird treatment for \0 21 | $character = "\0" === $character ? 
"'\\0'" : var_export($character, true); 22 | 23 | return new self(sprintf( 24 | 'Unexpected character %s at line %d column %d', 25 | $character, 26 | $line, 27 | $column, 28 | )); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Renan de Lima Barbosa 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /src/Processor/UrlFromDoiProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | class UrlFromDoiProcessor 15 | { 16 | use TagSearchTrait; 17 | 18 | public const FORMAT = 'https://doi.org/%s'; 19 | 20 | public function __construct( 21 | private readonly string $urlFormat = self::FORMAT, 22 | ) {} 23 | 24 | public function __invoke(array $entry): array 25 | { 26 | $doiTag = $this->tagSearch('doi', array_keys($entry)); 27 | $urlTag = $this->tagSearch('url', array_keys($entry)); 28 | if (null === $urlTag && null !== $doiTag) { 29 | $doiValue = $entry[$doiTag]; 30 | if (\is_string($doiValue) && '' !== $doiValue) { 31 | $entry['url'] = sprintf($this->urlFormat, $doiValue); 32 | } 33 | } 34 | 35 | return $entry; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/Processor/TrimProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | /** 15 | * @author Florent DESPIERRES 16 | */ 17 | class TrimProcessor 18 | { 19 | use TagCoverageTrait; 20 | 21 | public function __construct(array $fields = []) 22 | { 23 | if ($fields) { 24 | $this->setTagCoverage($fields); 25 | } 26 | } 27 | 28 | public function __invoke(array $entry): array 29 | { 30 | $covered = $this->getCoveredTags(array_keys($entry)); 31 | foreach ($covered as $tag) { 32 | $entry[$tag] = $this->trim($entry[$tag]); 33 | } 34 | 35 | return $entry; 36 | } 37 | 38 | private function trim(mixed $value): mixed 39 | { 40 | if (\is_array($value)) { 41 | $trimmed = []; 42 | foreach ($value as $key => $subValue) { 43 | $trimmed[$key] = $this->trim($subValue); 44 | } 45 | 46 | return $trimmed; 47 | } 48 | 49 | if (\is_string($value)) { 50 | return trim($value); 51 | } 52 | 53 | return $value; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "renanbr/bibtex-parser", 3 | "type": "library", 4 | "description": "BibTex Parser provides an API to read .bib files programmatically", 5 | "keywords": [ 6 | "bib", 7 | "bibtex", 8 | "latex", 9 | "parser", 10 | "bibliography", 11 | "citation", 12 | "cite" 13 | ], 14 | "license": "MIT", 15 | "authors": [ 16 | { 17 | "name": "Renan de Lima Barbosa", 18 | "email": "renandelima@gmail.com" 19 | } 20 | ], 21 | "require": { 22 | "php": "^8.1 | ^8.2 | ^8.3 | ^8.4" 23 | }, 24 | "require-dev": { 25 | "friendsofphp/php-cs-fixer": "^3.80", 26 | "phpstan/phpstan": "^2.1.17", 27 | "phpunit/phpunit": "^10.5.47 | ^11.5.26 | ^12.2.6", 28 | "ueberdosis/pandoc": "^0.9" 29 | }, 30 | "suggest": { 31 | "ueberdosis/pandoc": "Needed to support LaTeX decoder in class RenanBr\\BibTexParser\\Processor\\LatexToUnicodeProcessor" 32 | }, 33 | "config": { 34 | "sort-packages": true, 35 | "bump-after-update": true 
36 | }, 37 | "extra": { 38 | "branch-alias": { 39 | "dev-master": "2.x-dev" 40 | } 41 | }, 42 | "autoload": { 43 | "psr-4": { 44 | "RenanBr\\BibTexParser\\": "src/" 45 | } 46 | }, 47 | "autoload-dev": { 48 | "psr-4": { 49 | "RenanBr\\BibTexParser\\Test\\": "tests/" 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/Processor/DateProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | use DateTimeImmutable; 15 | use DateTimeZone; 16 | 17 | class DateProcessor 18 | { 19 | use TagSearchTrait; 20 | 21 | public const TAG_NAME = '_date'; 22 | 23 | public function __construct( 24 | private readonly string $tagName = self::TAG_NAME, 25 | ) {} 26 | 27 | public function __invoke(array $entry): array 28 | { 29 | $yearTag = $this->tagSearch('year', array_keys($entry)); 30 | $monthTag = $this->tagSearch('month', array_keys($entry)); 31 | if (null !== $yearTag && null !== $monthTag) { 32 | $year = (int) $entry[$yearTag]; 33 | $monthArray = explode('~', (string) $entry[$monthTag]); 34 | if (2 === \count($monthArray)) { 35 | [$day, $month] = $monthArray; 36 | $day = (int) $day; 37 | $dateMonthNumber = date_parse($month); 38 | $month = $dateMonthNumber['month'] ?: 0; 39 | if (checkdate($month, $day, $year)) { 40 | $timestamp = mktime(0, 0, 0, $month, $day, $year); 41 | $entry[$this->tagName] = new DateTimeImmutable(date('Y-m-d', $timestamp), new DateTimeZone('UTC')); 42 | } 43 | } 44 | } 45 | 46 | return $entry; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/Processor/TagCoverageTrait.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and 
license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace RenanBr\BibTexParser\Processor;

/**
 * Lets a processor restrict itself to a subset of an entry's tags.
 *
 * Until setTagCoverage() is called, every tag except "_original" and "_type" is covered.
 */
trait TagCoverageTrait
{
    use TagSearchTrait;

    private array $tagCoverageList = [
        '_original',
        '_type',
    ];

    private string $tagCoverageStrategy = 'blacklist';

    /**
     * @param array   $tags     List of tags to be covered
     * @param ?string $strategy Can assume "whitelist" (default) or "blacklist"
     */
    public function setTagCoverage(array $tags, ?string $strategy = null): void
    {
        if (!$strategy) {
            $strategy = 'whitelist';
        }

        $this->tagCoverageList = $tags;
        $this->tagCoverageStrategy = $strategy;
    }

    /**
     * Calculates which tags are covered.
     *
     * The search performed internally is case-insensitive.
     *
     * @param array $tags Tag names actually present in the entry
     *
     * @return array The listed tags found in $tags (whitelist), or $tags minus those (any other strategy)
     */
    protected function getCoveredTags(array $tags): array
    {
        // Resolve each configured name to the spelling used by the entry
        $matched = [];
        foreach ($this->tagCoverageList as $listed) {
            $found = $this->tagSearch($listed, $tags);
            if (null === $found) {
                continue;
            }
            $matched[] = $found;
        }

        return 'whitelist' === $this->tagCoverageStrategy
            ? $matched
            : array_values(array_diff($tags, $matched));
    }
}

--------------------------------------------------------------------------------
/src/Processor/LatexToUnicodeProcessor.php:
--------------------------------------------------------------------------------
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace RenanBr\BibTexParser\Processor;

use Composer\InstalledVersions;
use Exception;
use Pandoc\Pandoc;
use RenanBr\BibTexParser\Exception\ProcessorException;
use RuntimeException;

/**
 * Translates LaTeX texts to unicode.
22 | */ 23 | class LatexToUnicodeProcessor 24 | { 25 | use TagCoverageTrait; 26 | 27 | /** @var (callable(string): string)|null */ 28 | private $converter; 29 | 30 | public function __invoke(array $entry): array 31 | { 32 | $covered = $this->getCoveredTags(array_keys($entry)); 33 | foreach ($covered as $tag) { 34 | // Translate string 35 | if (\is_string($entry[$tag])) { 36 | $entry[$tag] = $this->decode($entry[$tag]); 37 | continue; 38 | } 39 | 40 | // Translate array 41 | if (\is_array($entry[$tag])) { 42 | array_walk_recursive($entry[$tag], function (&$text): void { 43 | if (\is_string($text)) { 44 | $text = $this->decode($text); 45 | } 46 | }); 47 | } 48 | } 49 | 50 | return $entry; 51 | } 52 | 53 | private function decode($text): string 54 | { 55 | try { 56 | return \call_user_func($this->getConverter(), $text); 57 | } catch (Exception $exception) { 58 | throw new ProcessorException(sprintf('Error while processing LaTeX to Unicode: %s', $exception->getMessage()), 0, $exception); 59 | } 60 | } 61 | 62 | /** 63 | * @return (callable(string): string) 64 | */ 65 | private function getConverter(): callable 66 | { 67 | if ($this->converter) { 68 | return $this->converter; 69 | } 70 | 71 | if (InstalledVersions::isInstalled('ueberdosis/pandoc')) { 72 | return $this->converter = (static fn($text) => mb_substr((string) (new Pandoc())->input($text)->execute([ 73 | '--from', 'latex', 74 | '--to', 'plain', 75 | '--wrap', 'none', 76 | ]), 0, -1)); 77 | } 78 | 79 | throw new RuntimeException('Pandoc wrapper not installed. Try running "composer require ueberdosis/pandoc"'); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/Listener.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace RenanBr\BibTexParser; 13 | 14 | class Listener implements ListenerInterface 15 | { 16 | private array $entries = []; 17 | 18 | /** 19 | * Current tag name. 20 | * 21 | * Indicates where to save contents when triggered by the parser. 22 | */ 23 | private string $currentTagName; 24 | 25 | private array $processors = []; 26 | 27 | private array $processed = []; 28 | 29 | /** 30 | * @return array all entries found during parsing process 31 | */ 32 | public function export(): array 33 | { 34 | $offset = \count($this->processed); 35 | $missing = \array_slice($this->entries, $offset); 36 | foreach ($this->processors as $processor) { 37 | $missing = array_filter(array_map($processor, $missing)); 38 | } 39 | $this->processed = array_merge($this->processed, $missing); 40 | 41 | return $this->processed; 42 | } 43 | 44 | /** 45 | * @param (callable(array): array) $processor Function to be applied to every BibTeX entry. 46 | * The processor given must return the modified entry. 47 | * Processors will be applied in the same order in which they were added. 
48 | */ 49 | public function addProcessor(callable $processor): void 50 | { 51 | $this->processors[] = $processor; 52 | } 53 | 54 | public function bibTexUnitFound($text, $type, array $context): void 55 | { 56 | switch ($type) { 57 | case Parser::TYPE: 58 | // Starts a new entry 59 | $this->entries[] = [ 60 | '_type' => $text, 61 | 'type' => $text, // compatibility 62 | ]; 63 | break; 64 | 65 | case Parser::CITATION_KEY: 66 | $index = \count($this->entries) - 1; 67 | $this->entries[$index]['citation-key'] = $text; 68 | break; 69 | 70 | case Parser::TAG_NAME: 71 | // Saves tag into the current entry 72 | $index = \count($this->entries) - 1; 73 | $this->currentTagName = $text; 74 | $this->entries[$index][$this->currentTagName] = null; 75 | break; 76 | 77 | case Parser::RAW_TAG_CONTENT: 78 | // Searches for an abbreviation 79 | foreach ($this->entries as $entry) { 80 | if ('string' === $entry['type'] && \array_key_exists($text, $entry)) { 81 | $text = $entry[$text]; 82 | break; 83 | } 84 | } 85 | // no break 86 | 87 | case Parser::BRACED_TAG_CONTENT: 88 | case Parser::QUOTED_TAG_CONTENT: 89 | // Appends content into the current tag 90 | if (null !== $text) { 91 | $index = \count($this->entries) - 1; 92 | $this->entries[$index][$this->currentTagName] .= $text; 93 | } 94 | break; 95 | 96 | case Parser::ENTRY: 97 | $index = \count($this->entries) - 1; 98 | $this->entries[$index]['_original'] = $text; 99 | break; 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/Processor/NamesProcessor.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace RenanBr\BibTexParser\Processor; 13 | 14 | use RenanBr\BibTexParser\Exception\ProcessorException; 15 | 16 | /** 17 | * Splits names in four parts: First Von Last Jr. 
18 | * 19 | * This class includes source code adapted from the Structures_BibTex package, 20 | * (c) Elmar Pitschke , included here under PHP license: 21 | * http://www.php.net/license/3_0.txt 22 | * 23 | * @author Andre Chalom 24 | * 25 | * @see https://github.com/pear/Structures_BibTex 26 | */ 27 | class NamesProcessor 28 | { 29 | use TagCoverageTrait; 30 | 31 | public function __construct() 32 | { 33 | $this->setTagCoverage(['author', 'editor']); 34 | } 35 | 36 | public function __invoke(array $entry): array 37 | { 38 | $covered = $this->getCoveredTags(array_keys($entry)); 39 | foreach ($covered as $tag) { 40 | $entry[$tag] = $this->extractAuthors($entry[$tag]); 41 | } 42 | 43 | return $entry; 44 | } 45 | 46 | /** 47 | * Extracting the authors. 48 | * 49 | * @author Elmar Pitschke 50 | */ 51 | private function extractAuthors(string $entry): array 52 | { 53 | // Sanitizes the entry to remove unwanted whitespace 54 | $entry = trim((string) preg_replace('/\s+/', ' ', $entry)); 55 | 56 | $authorarray = explode(' and ', $entry); 57 | for ($i = 0; $i < \count($authorarray); ++$i) { 58 | $author = trim($authorarray[$i]); 59 | /*The first version of how an author could be written (First von Last) 60 | has no commas in it*/ 61 | $first = ''; 62 | $von = ''; 63 | $last = ''; 64 | $jr = ''; 65 | if (false === mb_strpos($author, ',')) { 66 | $tmparray = []; 67 | $tmparray = preg_split('/[\s\~]/', $author); 68 | $size = \count($tmparray); 69 | if (1 === $size) { // There is only a last 70 | $last = $tmparray[0]; 71 | } elseif (2 === $size) { // There is a first and a last 72 | $first = $tmparray[0]; 73 | $last = $tmparray[1]; 74 | } else { 75 | $invon = false; 76 | $inlast = false; 77 | for ($j = 0; $j < ($size - 1); ++$j) { 78 | if ($inlast) { 79 | $last .= ' ' . 
$tmparray[$j]; 80 | } elseif ($invon) { 81 | try { 82 | $case = $this->determineCase($tmparray[$j]); 83 | 84 | if ((0 === $case) || (-1 === $case)) { // Change from von to last 85 | // You only change when there is no more lower case there 86 | $islast = true; 87 | for ($k = ($j + 1); $k < ($size - 1); ++$k) { 88 | try { 89 | $futurecase = $this->determineCase($tmparray[$k]); 90 | if (0 === $futurecase) { 91 | $islast = false; 92 | } 93 | } catch (ProcessorException) { 94 | // Ignore 95 | } 96 | } 97 | if ($islast) { 98 | $inlast = true; 99 | if (-1 === $case) { // Caseless belongs to the last 100 | $last .= ' ' . $tmparray[$j]; 101 | } else { 102 | $von .= ' ' . $tmparray[$j]; 103 | } 104 | } else { 105 | $von .= ' ' . $tmparray[$j]; 106 | } 107 | } else { 108 | $von .= ' ' . $tmparray[$j]; 109 | } 110 | } catch (ProcessorException) { 111 | // Ignore 112 | } 113 | } else { 114 | try { 115 | $case = $this->determineCase($tmparray[$j]); 116 | if (0 === $case) { // Change from first to von 117 | $invon = true; 118 | $von .= ' ' . $tmparray[$j]; 119 | } else { 120 | $first .= ' ' . $tmparray[$j]; 121 | } 122 | } catch (ProcessorException) { 123 | // Ignore 124 | } 125 | } 126 | } 127 | // The last entry is always the last! 128 | $last .= ' ' . $tmparray[$size - 1]; 129 | } 130 | } else { // Version 2 and 3 131 | $tmparray = []; 132 | $tmparray = explode(',', $author); 133 | // The first entry must contain von and last 134 | $vonlastarray = []; 135 | $vonlastarray = explode(' ', $tmparray[0]); 136 | $size = \count($vonlastarray); 137 | if (1 === $size) { // Only one entry->got to be the last 138 | $last = $vonlastarray[0]; 139 | } else { 140 | $inlast = false; 141 | for ($j = 0; $j < ($size - 1); ++$j) { 142 | if ($inlast) { 143 | $last .= ' ' . 
$vonlastarray[$j]; 144 | } else { 145 | if (0 !== $this->determineCase($vonlastarray[$j])) { // Change from von to last 146 | $islast = true; 147 | for ($k = ($j + 1); $k < ($size - 1); ++$k) { 148 | try { 149 | $case = $this->determineCase($vonlastarray[$k]); 150 | if (0 === $case) { 151 | $islast = false; 152 | } 153 | } catch (ProcessorException) { 154 | // Ignore 155 | } 156 | } 157 | if ($islast) { 158 | $inlast = true; 159 | $last .= ' ' . $vonlastarray[$j]; 160 | } else { 161 | $von .= ' ' . $vonlastarray[$j]; 162 | } 163 | } else { 164 | $von .= ' ' . $vonlastarray[$j]; 165 | } 166 | } 167 | } 168 | $last .= ' ' . $vonlastarray[$size - 1]; 169 | } 170 | // Now we check if it is version three (three entries in the array (two commas) 171 | if (3 === \count($tmparray)) { 172 | $jr = $tmparray[1]; 173 | } 174 | // Everything in the last entry is first 175 | $first = $tmparray[\count($tmparray) - 1]; 176 | } 177 | $authorarray[$i] = ['first' => trim($first), 'von' => trim($von), 'last' => trim($last), 'jr' => trim($jr)]; 178 | } 179 | 180 | return $authorarray; 181 | } 182 | 183 | /** 184 | * Case Determination according to the needs of BibTex. 185 | * 186 | * To parse the Author(s) correctly a determination is needed 187 | * to get the Case of a word. There are three possible values: 188 | * - Upper Case (return value 1) 189 | * - Lower Case (return value 0) 190 | * - Caseless (return value -1) 191 | * 192 | * @throws ProcessorException 193 | * 194 | * @author Elmar Pitschke 195 | */ 196 | private function determineCase(string $word): int 197 | { 198 | $ret = -1; 199 | $trimmedword = trim($word); 200 | /*We need this variable. 
Without the next of would not work 201 | (trim changes the variable automatically to a string!)*/ 202 | if (mb_strlen($trimmedword) > 0) { 203 | $i = 0; 204 | $found = false; 205 | $openbrace = 0; 206 | while (!$found && ($i <= mb_strlen($word))) { 207 | $letter = mb_substr($trimmedword, $i, 1); 208 | $ord = \ord($letter); 209 | if (123 === $ord) { // Open brace 210 | ++$openbrace; 211 | } 212 | if (125 === $ord) { // Closing brace 213 | --$openbrace; 214 | } 215 | if (($ord >= 65) && ($ord <= 90) && (0 === $openbrace)) { // The first character is uppercase 216 | $ret = 1; 217 | $found = true; 218 | } elseif (($ord >= 97) && ($ord <= 122) && (0 === $openbrace)) { // The first character is lowercase 219 | $ret = 0; 220 | $found = true; 221 | } else { // Not yet found 222 | ++$i; 223 | } 224 | } 225 | } else { 226 | throw new ProcessorException('Could not determine case on word: ' . $word); 227 | } 228 | 229 | return $ret; 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

PHP BibTeX Parser 2.x

2 |

3 | This is a 4 | BibTeX 5 | parser written in 6 | PHP. 7 |

8 |

9 | 10 | BibTeX logo 11 | 12 | 13 | PHP logo 14 | 15 |

16 | 17 | ![Tests](https://github.com/renanbr/bibtex-parser/workflows/Tests/badge.svg) 18 | ![Static Analysis](https://github.com/renanbr/bibtex-parser/workflows/Static%20Analysis/badge.svg) 19 | ![Coding Standards](https://github.com/renanbr/bibtex-parser/workflows/Coding%20Standards/badge.svg) 20 | 21 | You are browsing the documentation of **BibTeX Parser 2.x**, the latest version. 22 | 23 | ## Table of contents 24 | 25 | * [Installing](#installing) 26 | * [Usage](#usage) 27 | * [Vocabulary](#vocabulary) 28 | * [Processors](#processors) 29 | * [Tag name case](#tag-name-case) 30 | * [Authors and editors](#authors-and-editors) 31 | * [Keywords](#keywords) 32 | * [Date](#date) 33 | * [Fill missing tag](#fill-missing-tag) 34 | * [Trim tags](#trim-tags) 35 | * [Determine URL from the DOI](#determine-url-from-the-doi) 36 | * [LaTeX to unicode](#latex-to-unicode) 37 | * [Custom](#custom) 38 | * [Handling errors](#handling-errors) 39 | * [Advanced usage](#advanced-usage) 40 | * [Release Policy](#release-policy) 41 | * [Dependencies Compatibility Policy](#dependencies-compatibility-policy) 42 | 43 | ## Installing 44 | 45 | ```bash 46 | composer require renanbr/bibtex-parser 47 | ``` 48 | 49 | ## Usage 50 | 51 | ```php 52 | use RenanBr\BibTexParser\Listener; 53 | use RenanBr\BibTexParser\Parser; 54 | use RenanBr\BibTexParser\Processor; 55 | 56 | require 'vendor/autoload.php'; 57 | 58 | $bibtex = <<addProcessor(new Processor\TagNameCaseProcessor(CASE_LOWER)); 69 | // $listener->addProcessor(new Processor\NamesProcessor()); 70 | // $listener->addProcessor(new Processor\KeywordsProcessor()); 71 | // $listener->addProcessor(new Processor\DateProcessor()); 72 | // $listener->addProcessor(new Processor\FillMissingProcessor([/* ... */])); 73 | // $listener->addProcessor(new Processor\TrimProcessor()); 74 | // $listener->addProcessor(new Processor\UrlFromDoiProcessor()); 75 | // $listener->addProcessor(new Processor\LatexToUnicodeProcessor()); 76 | // ... 
you can append as many Processors as you want 77 | 78 | // Create a Parser and attach the listener 79 | $parser = new Parser(); 80 | $parser->addListener($listener); 81 | 82 | // Parse the content, then read processed data from the Listener 83 | $parser->parseString($bibtex); // or parseFile('/path/to/file.bib') 84 | $entries = $listener->export(); 85 | 86 | print_r($entries); 87 | ``` 88 | 89 | This will output: 90 | 91 | ``` 92 | Array 93 | ( 94 | [0] => Array 95 | ( 96 | [_type] => article 97 | [citation-key] => einstein1916relativity 98 | [title] => Relativity: The Special and General Theory 99 | [author] => Einstein, Albert 100 | [year] => 1916 101 | ) 102 | ) 103 | ``` 104 | 105 | ## Vocabulary 106 | 107 | [BibTeX] is all about "entry", "tag's name" and "tag's content". 108 | 109 | > A [BibTeX] **entry** consists of the type (the word after @), a citation-key and a number of tags which define various characteristics of the specific [BibTeX] entry. 110 | > (...) A [BibTeX] **tag** is specified by its **name** followed by an equals sign, and the **content**. 111 | 112 | Source: http://www.bibtex.org/Format/ 113 | 114 | Note: 115 | This library considers "type" and "citation-key" as tags. 116 | This behavior can be changed by [implementing your own Listener](#advanced-usage). 117 | 118 | ## Processors 119 | 120 | A `Processor` is a [callable] that receives an entry as argument and returns a modified entry. 121 | 122 | This library contains three main parts: 123 | 124 | - `Parser` class, responsible for detecting units inside a [BibTeX] input; 125 | - `Listener` class, responsible for gathering units and transforming them into a list of entries; 126 | - `Processor` classes, responsible for manipulating entries. 127 | 128 | Although you can't configure the `Parser`, you can append as many `Processor`s as you want to the `Listener` through `Listener::addProcessor()` before exporting the contents. 
129 | Be aware that `Listener` provides, by default, these features: 130 | 131 | - Found entries are reachable through `Listener::export()` method; 132 | - [Tag content concatenation](http://www.bibtex.org/Format/); 133 | - e.g. `hello # " world"` tag's content will generate `hello world` [string] 134 | - [Tag content abbreviation handling](http://www.bibtex.org/Format/); 135 | - e.g. `@string{foo="bar"} @misc{bar=foo}` will make `$entries[1]['bar']` assume `bar` as value 136 | - Publication's type exposed as `_type` tag; 137 | - Citation key exposed as `citation-key` tag; 138 | - Original entry text exposed as `_original` tag. 139 | 140 | This project ships some useful processors. 141 | 142 | ### Tag name case 143 | 144 | In [BibTeX] the tag's names aren't case-sensitive. 145 | This library exposes entries as [array], in which keys are case-sensitive. 146 | To avoid this misunderstanding, you can force the tags' name character case using `TagNameCaseProcessor`. 147 | 148 |
Usage 149 | 150 | ```php 151 | use RenanBr\BibTexParser\Processor\TagNameCaseProcessor; 152 | 153 | $listener->addProcessor(new TagNameCaseProcessor(CASE_UPPER)); // or CASE_LOWER 154 | ``` 155 | 156 | ```bib 157 | @article{ 158 | title={BibTeX rocks} 159 | } 160 | ``` 161 | 162 | ``` 163 | Array 164 | ( 165 | [0] => Array 166 | ( 167 | [TYPE] => article 168 | [TITLE] => BibTeX rocks 169 | ) 170 | ) 171 | ``` 172 | 173 |
174 | 175 | ### Authors and editors 176 | 177 | [BibTeX] recognizes four parts of an author's name: First Von Last Jr. 178 | If you would like to parse the `author` and `editor` tags included in your entries, you can use the `NamesProcessor` class. 179 | 180 |
Usage 181 | 182 | ```php 183 | use RenanBr\BibTexParser\Processor\NamesProcessor; 184 | 185 | $listener->addProcessor(new NamesProcessor()); 186 | ``` 187 | 188 | ```bib 189 | @article{ 190 | title={Relativity: The Special and General Theory}, 191 | author={Einstein, Albert} 192 | } 193 | ``` 194 | 195 | ``` 196 | Array 197 | ( 198 | [0] => Array 199 | ( 200 | [type] => article 201 | [title] => Relativity: The Special and General Theory 202 | [author] => Array 203 | ( 204 | [0] => Array 205 | ( 206 | [first] => Albert 207 | [von] => 208 | [last] => Einstein 209 | [jr] => 210 | ) 211 | ) 212 | ) 213 | ) 214 | ``` 215 | 216 |
217 | 218 | ### Keywords 219 | 220 | The `keywords` tag contains a list of expressions represented as [string], you might want to read them as an [array] instead. 221 | 222 |
Usage 223 | 224 | ```php 225 | use RenanBr\BibTexParser\Processor\KeywordsProcessor; 226 | 227 | $listener->addProcessor(new KeywordsProcessor()); 228 | ``` 229 | 230 | ```bib 231 | @misc{ 232 | title={The End of Theory: The Data Deluge Makes the Scientific Method Obsolete}, 233 | keywords={big data, data deluge, scientific method} 234 | } 235 | ``` 236 | 237 | ``` 238 | Array 239 | ( 240 | [0] => Array 241 | ( 242 | [type] => misc 243 | [title] => The End of Theory: The Data Deluge Makes the Scientific Method Obsolete 244 | [keywords] => Array 245 | ( 246 | [0] => big data 247 | [1] => data deluge 248 | [2] => scientific method 249 | ) 250 | ) 251 | ) 252 | ``` 253 | 254 |
255 | 256 | ### Date 257 | 258 | It adds a new tag `_date` as [DateTimeImmutable]. 259 | This processor adds the new tag **if and only if** both the `month` and `year` tags are set. 260 | 261 |
Usage 262 | 263 | ```php 264 | use RenanBr\BibTexParser\Processor\DateProcessor; 265 | 266 | $listener->addProcessor(new DateProcessor()); 267 | ``` 268 | 269 | ```bib 270 | @misc{ 271 | month="1~oct", 272 | year=2000 273 | } 274 | ``` 275 | 276 | ``` 277 | Array 278 | ( 279 | [0] => Array 280 | ( 281 | [type] => misc 282 | [month] => 1~oct 283 | [year] => 2000 284 | [_date] => DateTimeImmutable Object 285 | ( 286 | [date] => 2000-10-01 00:00:00.000000 287 | [timezone_type] => 3 288 | [timezone] => UTC 289 | ) 290 | ) 291 | ) 292 | ``` 293 | 294 |
295 | 296 | ### Fill missing tag 297 | 298 | It assigns a default value to each configured tag that is missing from an entry. 299 | 300 |
Usage 301 | 302 | ```php 303 | use RenanBr\BibTexParser\Processor\FillMissingProcessor; 304 | 305 | $listener->addProcessor(new FillMissingProcessor([ 306 | 'title' => 'This entry has no title', 307 | 'year' => 1970, 308 | ])); 309 | ``` 310 | 311 | ```bib 312 | @misc{ 313 | } 314 | 315 | @misc{ 316 | title="I do exist" 317 | } 318 | ``` 319 | 320 | ``` 321 | Array 322 | ( 323 | [0] => Array 324 | ( 325 | [type] => misc 326 | [title] => This entry has no title 327 | [year] => 1970 328 | ) 329 | [1] => Array 330 | ( 331 | [type] => misc 332 | [title] => I do exist 333 | [year] => 1970 334 | ) 335 | ) 336 | ``` 337 | 338 |
339 | 340 | ### Trim tags 341 | 342 | Apply [trim()] to all tags. 343 | 344 |
Usage 345 | 346 | ```php 347 | use RenanBr\BibTexParser\Processor\TrimProcessor; 348 | 349 | $listener->addProcessor(new TrimProcessor()); 350 | ``` 351 | 352 | ```bib 353 | @misc{ 354 | title=" too much space " 355 | } 356 | ``` 357 | 358 | ``` 359 | Array 360 | ( 361 | [0] => Array 362 | ( 363 | [type] => misc 364 | [title] => too much space 365 | ) 366 | 367 | ) 368 | ``` 369 | 370 |
371 | 372 | ### Determine URL from the DOI 373 | 374 | Sets `url` tag with [DOI] if `doi` tag is present and `url` tag is missing. 375 | 376 |
Usage 377 | 378 | ```php 379 | use RenanBr\BibTexParser\Processor\UrlFromDoiProcessor; 380 | 381 | $listener->addProcessor(new UrlFromDoiProcessor()); 382 | ``` 383 | 384 | ```bib 385 | @misc{ 386 | doi="qwerty" 387 | } 388 | 389 | @misc{ 390 | doi="azerty", 391 | url="http://example.org" 392 | } 393 | ``` 394 | 395 | ``` 396 | Array 397 | ( 398 | [0] => Array 399 | ( 400 | [type] => misc 401 | [doi] => qwerty 402 | [url] => https://doi.org/qwerty 403 | ) 404 | 405 | [1] => Array 406 | ( 407 | [type] => misc 408 | [doi] => azerty 409 | [url] => http://example.org 410 | ) 411 | ) 412 | ``` 413 | 414 |
415 | 416 | ### LaTeX to unicode 417 | 418 | [BibTeX] files store [LaTeX] contents. 419 | You might want to read them as unicode instead. 420 | The `LatexToUnicodeProcessor` class solves this problem, but before adding the processor to the listener you must: 421 | 422 | - [install Pandoc](http://pandoc.org/installing.html) in your system; and 423 | - add [ryakad/pandoc-php](https://github.com/ryakad/pandoc-php) or [ueberdosis/pandoc](https://github.com/ueberdosis/pandoc) as a dependency of your project. 424 | 425 |
Usage 426 | 427 | ```php 428 | use RenanBr\BibTexParser\Processor\LatexToUnicodeProcessor; 429 | 430 | $listener->addProcessor(new LatexToUnicodeProcessor()); 431 | ``` 432 | 433 | ```bib 434 | @article{ 435 | title={Caf\\'{e}s and bars} 436 | } 437 | ``` 438 | 439 | ``` 440 | Array 441 | ( 442 | [0] => Array 443 | ( 444 | [type] => article 445 | [title] => Cafés and bars 446 | ) 447 | ) 448 | ``` 449 | 450 |
451 | 452 | Note: Order matters; add this processor last. 453 | 454 | ### Custom 455 | 456 | The `Listener::addProcessor()` method expects a [callable] as argument. 457 | In the example shown below, we append the text `with laser` to the `title` tag of every entry. 458 | 459 |
Usage 460 | 461 | ```php 462 | $listener->addProcessor(static function (array $entry) { 463 | $entry['title'] .= ' with laser'; 464 | return $entry; 465 | }); 466 | ``` 467 | 468 | ``` 469 | @article{ 470 | title={BibTeX rocks} 471 | } 472 | ``` 473 | 474 | ``` 475 | Array 476 | ( 477 | [0] => Array 478 | ( 479 | [type] => article 480 | [title] => BibTeX rocks with laser 481 | ) 482 | ) 483 | ``` 484 | 485 |
486 | 487 | ## Handling errors 488 | 489 | This library throws two types of exception: `ParserException` and `ProcessorException`. 490 | The first one may happen during the data extraction. 491 | When it occurs it probably means the parsed BibTeX isn't valid. 492 | The second exception may happen during the data processing. 493 | When it occurs it means the listener's processors can't properly handle the data found. 494 | Both implement `ExceptionInterface`. 495 | 496 | ```php 497 | use RenanBr\BibTexParser\Exception\ExceptionInterface; 498 | use RenanBr\BibTexParser\Exception\ParserException; 499 | use RenanBr\BibTexParser\Exception\ProcessorException; 500 | 501 | try { 502 | // ... parser and listener configuration 503 | 504 | $parser->parseFile('/path/to/file.bib'); 505 | $entries = $listener->export(); 506 | } catch (ParserException $exception) { 507 | // The BibTeX isn't valid 508 | } catch (ProcessorException $exception) { 509 | // Listener's processors aren't able to handle data found 510 | } catch (ExceptionInterface $exception) { 511 | // Alternatively, you can use this exception to catch all of them at once 512 | } 513 | ``` 514 | 515 | ## Advanced usage 516 | 517 | The core of this library contains these main classes: 518 | 519 | - `RenanBr\BibTexParser\Parser` responsible for detecting units inside a [BibTeX] input; 520 | - `RenanBr\BibTexParser\ListenerInterface` responsible for treating units found. 521 | 522 | You can attach listeners to the parser through `Parser::addListener()`. 523 | The parser is able to detect [BibTeX] units, such as "type", "tag's name", "tag's content". 524 | As the parser finds a unit, it triggers the listeners attached to it. 525 | 526 | You can code your own listener! All you have to do is handle units. 527 | 528 | ```php 529 | namespace RenanBr\BibTexParser; 530 | 531 | interface ListenerInterface 532 | { 533 | /** 534 | * Called when a unit is found.
535 | * 536 | * @param string $text The original content of the unit found. 537 | * Escape character will not be sent. 538 | * @param string $type The type of unit found. 539 | * It can assume one of Parser's constant values. 540 | * @param array $context Contains details of the unit found. 541 | */ 542 | public function bibTexUnitFound($text, $type, array $context); 543 | } 544 | ``` 545 | 546 | `$type` may assume one of these values: 547 | 548 | - `Parser::TYPE` 549 | - `Parser::CITATION_KEY` 550 | - `Parser::TAG_NAME` 551 | - `Parser::RAW_TAG_CONTENT` 552 | - `Parser::BRACED_TAG_CONTENT` 553 | - `Parser::QUOTED_TAG_CONTENT` 554 | - `Parser::ENTRY` 555 | 556 | `$context` is an [array] with these keys: 557 | 558 | - `offset` contains the `$text`'s beginning position. 559 | It may be useful, for example, to [seek on a file pointer](https://php.net/fseek); 560 | - `length` contains the original `$text`'s length. 561 | It may differ from the length of the [string] sent to the listener because there may be escaped characters. 562 | 563 | ## Release Policy 564 | 565 | There is a **single** maintained branch at a time. 566 | This branch targets a minor version. 567 | 568 | A maintained version reaches its end-of-life when a new minor version is released. 569 | 570 | ### Dependencies Compatibility Policy 571 | 572 | This library is compatible with maintained versions of 573 | [PHP][php-versions].
/*
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace RenanBr\BibTexParser;

use ErrorException;
use RenanBr\BibTexParser\Exception\ParserException;

/**
 * Character-driven state machine that detects BibTeX units.
 *
 * The input is consumed one character at a time. Every time a unit (type,
 * citation key, tag name, tag content or whole entry) is completed, all
 * attached listeners are notified through
 * ListenerInterface::bibTexUnitFound().
 */
class Parser
{
    // Unit types: these values are sent to the listeners as $type
    public const TYPE = 'type';
    public const CITATION_KEY = 'citation_key';
    public const TAG_NAME = 'tag_name';
    public const RAW_TAG_CONTENT = 'raw_tag_content';
    public const BRACED_TAG_CONTENT = 'braced_tag_content';
    public const QUOTED_TAG_CONTENT = 'quoted_tag_content';
    public const ENTRY = 'entry';

    // Internal states: used only to drive the state machine
    public const NONE = 'none';
    public const COMMENT = 'comment';
    public const FIRST_TAG_NAME = 'first_tag_name';
    public const POST_TYPE = 'post_type';
    public const POST_TAG_NAME = 'post_tag_name';
    public const PRE_TAG_CONTENT = 'pre_tag_content';

    /** Current state, one of the constants above. */
    private string $state;

    /** Text of the unit currently being read. */
    private string $buffer;

    /** Offset of the first character stored in $buffer. */
    private ?int $bufferOffset = null;

    /** Pending first tag, not yet known to be a citation key or a tag name. */
    private ?array $firstTagSnapshot = null;

    /** Raw text of the entry currently being read. */
    private ?string $originalEntryBuffer = null;

    /** Offset of the first character stored in $originalEntryBuffer. */
    private ?int $originalEntryOffset = null;

    /** When true, the next character discards the original entry buffer. */
    private bool $skipOriginalEntryReading;

    /** Line of the character being read, starting at 1. */
    private int $line;

    /** Column of the character being read, starting at 1. */
    private int $column;

    /** Number of characters read before the current one. */
    private int $offset;

    /** Whether the previous character of a delimited content was a backslash. */
    private bool $isTagContentEscaped;

    /** Whether a content part was just read, so `#`, `,` or `}` is expected. */
    private bool $mayConcatenateTagContent;

    /** Character closing the delimited content being read: `"` or `}`. */
    private ?string $tagContentDelimiter = null;

    /** Depth of nested braces inside the delimited content being read. */
    private int $braceLevel;

    /** @var list<ListenerInterface> */
    private array $listeners = [];

    /**
     * Attaches a listener that will be notified of every unit found.
     */
    public function addListener(ListenerInterface $listener): void
    {
        $this->listeners[] = $listener;
    }

    /**
     * Parses the content of a file.
     *
     * @throws ParserException if $file given is not a valid BibTeX
     * @throws ErrorException  if $file given is not readable
     */
    public function parseFile(string $file): void
    {
        // The warning is silenced on purpose: the failure is reported through
        // the exception thrown right below
        $handle = @fopen($file, 'r');
        if (!$handle) {
            throw new ErrorException(sprintf('Unable to open %s', $file));
        }
        try {
            $this->reset();
            // Reads line by line instead of fixed-size chunks: fgets() only
            // cuts after a line break, so a multi-byte UTF-8 character is
            // never split between two parse() calls
            while (false !== ($line = fgets($handle))) {
                $this->parse($line);
            }
            $this->throwExceptionIfReadingEntry("\0");
        } finally {
            fclose($handle);
        }
    }

    /**
     * Parses a BibTeX string.
     *
     * @throws ParserException if $string given is not a valid BibTeX
     */
    public function parseString(string $string): void
    {
        $this->reset();
        $this->parse($string);
        $this->throwExceptionIfReadingEntry("\0");
    }

    /**
     * Feeds the state machine with every character of $text, keeping line,
     * column and offset counters up to date.
     */
    private function parse(string $text): void
    {
        $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if (false === $chars) {
            // $text is not valid UTF-8 (e.g. a Latin-1 file). Reads it byte
            // by byte instead of failing, so the bytes reach the listeners
            // untouched; positions are counted in bytes for this text
            $chars = str_split($text);
        }

        foreach ($chars as $char) {
            $this->read($char);
            if ("\n" === $char) {
                ++$this->line;
                $this->column = 1;
            } else {
                ++$this->column;
            }
            ++$this->offset;
        }
    }

    /**
     * Puts the parser back into its initial state.
     */
    private function reset(): void
    {
        $this->state = self::NONE;
        $this->buffer = '';
        $this->bufferOffset = null;
        $this->firstTagSnapshot = null;
        $this->originalEntryBuffer = null;
        $this->originalEntryOffset = null;
        $this->skipOriginalEntryReading = false;
        $this->line = 1;
        $this->column = 1;
        $this->offset = 0;
        $this->mayConcatenateTagContent = false;
        $this->isTagContentEscaped = false;
        $this->tagContentDelimiter = null;
        $this->braceLevel = 0;
    }

    // ----- Readers -----------------------------------------------------------

    /**
     * Dispatches $char to the reader of the current state, then records it as
     * part of the original entry text when applicable.
     */
    private function read(string $char): void
    {
        $previousState = $this->state;

        match ($this->state) {
            self::NONE => $this->readNone($char),
            self::COMMENT => $this->readComment($char),
            self::TYPE => $this->readType($char),
            self::POST_TYPE => $this->readPostType($char),
            self::FIRST_TAG_NAME, self::TAG_NAME => $this->readTagName($char),
            self::POST_TAG_NAME => $this->readPostTagName($char),
            self::PRE_TAG_CONTENT => $this->readPreTagContent($char),
            self::RAW_TAG_CONTENT => $this->readRawTagContent($char),
            self::QUOTED_TAG_CONTENT, self::BRACED_TAG_CONTENT => $this->readDelimitedTagContent($char),
            default => null,
        };

        $this->readOriginalEntry($char, $previousState);
    }

    /**
     * Outside any entry: `@` opens an entry, any other non-blank character
     * starts a free comment.
     */
    private function readNone(string $char): void
    {
        if ('@' === $char) {
            $this->state = self::TYPE;
        } elseif (!$this->isWhitespace($char)) {
            $this->state = self::COMMENT;
        }
    }

    /**
     * Inside a free comment: it ends at the first whitespace.
     */
    private function readComment(string $char): void
    {
        if ($this->isWhitespace($char)) {
            $this->state = self::NONE;
        }
    }

    /**
     * Reads the entry type, i.e. the letters right after `@`.
     */
    private function readType(string $char): void
    {
        if (preg_match('/^[a-zA-Z]$/', $char)) {
            $this->appendToBuffer($char);

            return;
        }

        $this->throwExceptionIfBufferIsEmpty($char);

        // Skips @comment type
        if ('comment' === mb_strtolower($this->buffer)) {
            $this->skipOriginalEntryReading = true;
            $this->buffer = '';
            $this->bufferOffset = null;
            $this->state = self::COMMENT;
            $this->readComment($char);

            return;
        }

        $this->triggerListenersWithCurrentBuffer();

        // once $char isn't a valid character
        // it must be interpreted as POST_TYPE
        $this->state = self::POST_TYPE;
        $this->readPostType($char);
    }

    /**
     * After the type: only whitespace is accepted before the opening brace.
     */
    private function readPostType(string $char): void
    {
        if ('{' === $char) {
            $this->state = self::FIRST_TAG_NAME;
        } elseif (!$this->isWhitespace($char)) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    /**
     * Reads a tag name. The first name of an entry may turn out to be the
     * citation key, so it is kept aside until readPostTagName() decides.
     */
    private function readTagName(string $char): void
    {
        if (preg_match('/^[a-zA-Z0-9_\+:\-\.\/\x{00C0}-\x{01FF}]$/u', $char)) {
            $this->appendToBuffer($char);
        } elseif ($this->isWhitespace($char) && '' === $this->buffer) {
            // Skips because we didn't start reading
        } elseif ('}' === $char && '' === $this->buffer) {
            // No tag name found, $char is just closing current entry
            $this->state = self::NONE;
        } else {
            $this->throwExceptionIfBufferIsEmpty($char);

            if (self::FIRST_TAG_NAME === $this->state) {
                // Takes a snapshot of current state to be triggered later as
                // tag name or citation key, see readPostTagName()
                $this->firstTagSnapshot = $this->takeBufferSnapshot();
            } else {
                // Current buffer is a simple tag name
                $this->triggerListenersWithCurrentBuffer();
            }

            // Once $char isn't a valid tag name character, it must be
            // interpreted as post tag name
            $this->state = self::POST_TAG_NAME;
            $this->readPostTagName($char);
        }
    }

    /**
     * After a tag name: `=` introduces its content, while `,` or `}` mean the
     * name had no content and therefore was the citation key.
     */
    private function readPostTagName(string $char): void
    {
        if ('=' === $char) {
            // First tag name isn't a citation key, because it has content
            $this->triggerListenersWithFirstTagSnapshotAs(self::TAG_NAME);
            $this->state = self::PRE_TAG_CONTENT;
        } elseif ('}' === $char) {
            // First tag name is a citation key, because $char closes entry and
            // lets first tag without value
            $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
            $this->state = self::NONE;
        } elseif (',' === $char) {
            // First tag name is a citation key, because $char moves to the next
            // tag and lets first tag without value
            $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
            $this->state = self::TAG_NAME;
        } elseif (!$this->isWhitespace($char)) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    /**
     * Between content parts: decides which kind of content starts, or handles
     * the concatenator (`#`), the tag separator (`,`) and the entry end (`}`).
     */
    private function readPreTagContent(string $char): void
    {
        if (preg_match('/^[a-zA-Z0-9]$/', $char)) {
            // When concatenation is available it means there is already a
            // defined value, and parser expect a concatenator, a tag separator
            // or an entry closing char as next $char
            $this->throwExceptionAccordingToConcatenationAvailability($char, true);
            $this->state = self::RAW_TAG_CONTENT;
            $this->readRawTagContent($char);
        } elseif ('"' === $char) {
            // The exception is here for the same reason of the first case
            $this->throwExceptionAccordingToConcatenationAvailability($char, true);
            $this->tagContentDelimiter = '"';
            $this->state = self::QUOTED_TAG_CONTENT;
        } elseif ('{' === $char) {
            // The exception is here for the same reason of the first case
            $this->throwExceptionAccordingToConcatenationAvailability($char, true);
            $this->tagContentDelimiter = '}';
            $this->state = self::BRACED_TAG_CONTENT;
        } elseif ('#' === $char) {
            $this->throwExceptionAccordingToConcatenationAvailability($char, false);
            $this->mayConcatenateTagContent = false;
        } elseif (',' === $char) {
            $this->throwExceptionAccordingToConcatenationAvailability($char, false);
            $this->mayConcatenateTagContent = false;
            $this->state = self::TAG_NAME;
        } elseif ('}' === $char) {
            $this->throwExceptionAccordingToConcatenationAvailability($char, false);
            $this->mayConcatenateTagContent = false;
            $this->state = self::NONE;
        } elseif (!$this->isWhitespace($char)) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    /**
     * Reads an undelimited content, such as a number or an abbreviation.
     */
    private function readRawTagContent(string $char): void
    {
        if (preg_match('/^[a-zA-Z0-9_\+:\-\.\/]$/', $char)) {
            $this->appendToBuffer($char);

            return;
        }

        $this->throwExceptionIfBufferIsEmpty($char);
        $this->triggerListenersWithCurrentBuffer();

        // once $char isn't a valid character
        // it must be interpreted as TAG_CONTENT
        $this->mayConcatenateTagContent = true;
        $this->state = self::PRE_TAG_CONTENT;
        $this->readPreTagContent($char);
    }

    /**
     * Reads a content wrapped in quotes or braces, handling backslash escapes
     * and nested braces.
     */
    private function readDelimitedTagContent(string $char): void
    {
        if ($this->isTagContentEscaped) {
            $this->isTagContentEscaped = false;
            // The backslash is dropped only when it escapes the delimiter,
            // another backslash or a percent sign
            if ($this->tagContentDelimiter !== $char && '\\' !== $char && '%' !== $char) {
                $this->appendToBuffer('\\');
            }
            $this->appendToBuffer($char);
        } elseif ('}' === $this->tagContentDelimiter && '{' === $char) {
            ++$this->braceLevel;
            $this->appendToBuffer($char);
        } elseif ($this->tagContentDelimiter === $char) {
            if (0 === $this->braceLevel) {
                $this->triggerListenersWithCurrentBuffer();
                $this->mayConcatenateTagContent = true;
                $this->state = self::PRE_TAG_CONTENT;
            } else {
                --$this->braceLevel;
                $this->appendToBuffer($char);
            }
        } elseif ('\\' === $char) {
            $this->isTagContentEscaped = true;
        } else {
            $this->appendToBuffer($char);
        }
    }

    /**
     * Accumulates the raw text of the entry being read and sends it to the
     * listeners as an ENTRY unit once $char closes the entry.
     */
    private function readOriginalEntry(string $char, string $previousState): void
    {
        if ($this->skipOriginalEntryReading) {
            $this->originalEntryBuffer = '';
            $this->originalEntryOffset = null;
            $this->skipOriginalEntryReading = false;

            return;
        }

        // Checks whether we are reading an entry character or not
        $isPreviousStateEntry = $this->isEntryState($previousState);
        $isCurrentStateEntry = $this->isEntryState($this->state);
        if (!$isPreviousStateEntry && !$isCurrentStateEntry) {
            return;
        }

        // Appends $char to the original entry buffer
        if (null === $this->originalEntryBuffer || '' === $this->originalEntryBuffer) {
            $this->originalEntryOffset = $this->offset;
        }
        $this->originalEntryBuffer .= $char;

        // Sends original entry to the listeners when $char closes an entry
        if ($isPreviousStateEntry && !$isCurrentStateEntry) {
            $this->triggerListeners($this->originalEntryBuffer, self::ENTRY, [
                'offset' => $this->originalEntryOffset,
                'length' => $this->offset - $this->originalEntryOffset + 1,
            ]);
            $this->originalEntryBuffer = '';
            $this->originalEntryOffset = null;
        }
    }

    // ----- Listener triggers -------------------------------------------------

    /**
     * Notifies every attached listener about a unit.
     */
    private function triggerListeners(string $text, string $type, array $context): void
    {
        foreach ($this->listeners as $listener) {
            $listener->bibTexUnitFound($text, $type, $context);
        }
    }

    /**
     * Sends the current buffer to the listeners, typed as the current state,
     * and clears it.
     */
    private function triggerListenersWithCurrentBuffer(): void
    {
        ['text' => $text, 'context' => $context] = $this->takeBufferSnapshot();
        $this->triggerListeners($text, $this->state, $context);
    }

    /**
     * Sends the pending first tag, if any, to the listeners as $type.
     */
    private function triggerListenersWithFirstTagSnapshotAs(string $type): void
    {
        if (null === $this->firstTagSnapshot) {
            return;
        }

        ['text' => $text, 'context' => $context] = $this->firstTagSnapshot;
        $this->firstTagSnapshot = null;
        $this->triggerListeners($text, $type, $context);
    }

    // ----- Buffer tools ------------------------------------------------------

    /**
     * Appends $char to the buffer, remembering where the buffer started.
     */
    private function appendToBuffer(string $char): void
    {
        // Strict comparison on purpose: empty() considers '0' as empty, which
        // would move the offset forward when the buffer starts with a zero
        if ('' === $this->buffer) {
            $this->bufferOffset = $this->offset;
        }
        $this->buffer .= $char;
    }

    /**
     * Returns the buffer text along with its position, then clears the buffer.
     *
     * @return array{text: string, context: array{offset: int|null, length: int}}
     */
    private function takeBufferSnapshot(): array
    {
        $snapshot = [
            'text' => $this->buffer,
            'context' => [
                'offset' => $this->bufferOffset,
                'length' => $this->offset - $this->bufferOffset,
            ],
        ];
        $this->bufferOffset = null;
        $this->buffer = '';

        return $snapshot;
    }

    // ----- Exception throwers ------------------------------------------------

    /**
     * @throws ParserException if concatenation availability equals $availability
     */
    private function throwExceptionAccordingToConcatenationAvailability(string $char, bool $availability): void
    {
        if ($availability === $this->mayConcatenateTagContent) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    /**
     * @throws ParserException if nothing was buffered before $char
     */
    private function throwExceptionIfBufferIsEmpty(string $char): void
    {
        // Strict comparison on purpose: a buffer holding '0' is not empty
        if ('' === $this->buffer) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    /**
     * @throws ParserException if the input ended in the middle of an entry
     */
    private function throwExceptionIfReadingEntry(string $char): void
    {
        if ($this->isEntryState($this->state)) {
            throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
        }
    }

    // ----- Auxiliaries -------------------------------------------------------

    /**
     * Tells whether $state belongs to an entry, i.e. it is neither NONE nor
     * COMMENT.
     */
    private function isEntryState(string $state): bool
    {
        return self::NONE !== $state && self::COMMENT !== $state;
    }

    /**
     * Tells whether $char is a space, a tab or a line break.
     */
    private function isWhitespace(string $char): bool
    {
        return ' ' === $char || "\t" === $char || "\n" === $char || "\r" === $char;
    }
}