├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── TexSoup ├── __init__.py ├── category.py ├── data.py ├── reader.py ├── tex.py ├── tokens.py └── utils.py ├── docs ├── .gitignore ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── css │ │ └── theme-mod.css │ ├── images │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── arrow-down-orange.svg │ │ ├── arrow-right-with-tail.svg │ │ ├── browserconfig.xml │ │ ├── chevron-right-orange.svg │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ ├── logo-dark.svg │ │ ├── logo-icon.svg │ │ ├── logo.svg │ │ ├── mstile-150x150.png │ │ ├── pytorch-x.svg │ │ ├── safari-pinned-tab.svg │ │ ├── search-icon.svg │ │ ├── site.webmanifest │ │ └── view-page-source-icon.svg │ └── texsoup.ai │ ├── _templates │ ├── cookie_banner.html │ ├── footer.html │ ├── landing.html │ ├── layout.html │ └── theme_variables.jinja │ ├── categorizer.rst │ ├── conf.py │ ├── data.rst │ ├── index.rst │ ├── main.rst │ ├── modification.rst │ ├── navigation.rst │ ├── parser.rst │ ├── quickstart.rst │ ├── searching.rst │ ├── soup.rst │ ├── tokenizer.rst │ └── utils.rst ├── examples ├── README.md ├── count_references.py ├── list_everything.py ├── resolve_imports.py ├── simple_conversion.py ├── solution_length.py └── structure_diagram.py ├── pytest.ini ├── setup.py └── tests ├── __init__.py ├── config.py ├── samples ├── chikin.pdf ├── chikin.tex └── pancake.tex ├── test_api.py ├── test_load_edit_save.py ├── test_parser.py └── test_search.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | include = */TexSoup/* 3 | omit = tests/* 4 | 5 | [report] 6 | # Regexes for lines to exclude from consideration 7 | exclude_lines = 8 | # Have to re-enable the standard pragma 9 | pragma: no cover 10 | 11 | # Don't complain about missing debug-only code: 12 | def __repr__ 13 | if self\.debug 14 | 15 | # Don't complain if tests don't hit defensive assertion code: 16 | raise AssertionError 17 | raise NotImplementedError 18 | 19 | # Don't complain if non-runnable code isn't run: 20 | if 0: 21 | if __name__ == .__main__.: 22 | 23 | ignore_errors = True 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coveralls.yml 2 | .idea 3 | 4 | ### macOS ### 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two \r 11 | Icon 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | ### Python ### 33 | # Byte-compiled / optimized / DLL files 34 | __pycache__/ 35 | *.py[cod] 36 | *$py.class 37 | 38 | # C extensions 39 | *.so 40 | 41 | # Distribution / packaging 42 | .Python 43 | develop-eggs/ 44 | dist/ 45 | downloads/ 46 | eggs/ 47 | .eggs/ 48 | lib/ 49 | lib64/ 50 | parts/ 51 | sdist/ 52 | var/ 53 | wheels/ 54 | pip-wheel-metadata/ 55 | share/python-wheels/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | MANIFEST 60 | 61 | # PyInstaller 62 | # Usually these files are written by a python 
script from a template 63 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 64 | *.manifest 65 | *.spec 66 | 67 | # Installer logs 68 | pip-log.txt 69 | pip-delete-this-directory.txt 70 | 71 | # Unit test / coverage reports 72 | htmlcov/ 73 | .tox/ 74 | .nox/ 75 | .coverage 76 | .coverage.* 77 | .cache 78 | nosetests.xml 79 | coverage.xml 80 | *.cover 81 | .hypothesis/ 82 | .pytest_cache/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | local_settings.py 91 | db.sqlite3 92 | 93 | # Flask stuff: 94 | instance/ 95 | .webassets-cache 96 | 97 | # Scrapy stuff: 98 | .scrapy 99 | 100 | # Sphinx documentation 101 | docs/_build/ 102 | 103 | # PyBuilder 104 | target/ 105 | 106 | # Jupyter Notebook 107 | .ipynb_checkpoints 108 | 109 | # IPython 110 | profile_default/ 111 | ipython_config.py 112 | 113 | # pyenv 114 | .python-version 115 | 116 | # pipenv 117 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 118 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 119 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 120 | # install all needed dependencies. 121 | #Pipfile.lock 122 | 123 | # celery beat schedule file 124 | celerybeat-schedule 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | sudo: false 4 | 5 | python: 6 | - "3.4" 7 | - "3.5" 8 | - "3.6" 9 | - "3.7" 10 | - "3.8" 11 | 12 | install: 13 | - python setup.py install 14 | - python setup.py easy_install $(python3 -c 'import distutils.core; print(" ".join(distutils.core.run_setup("setup.py").tests_require))') 15 | 16 | cache: 17 | directories: 18 | - "$HOME/.cache/pip" 19 | - lib/python3.4/site-packages 20 | - lib/python3.5/site-packages 21 | - lib/python3.6/site-packages 22 | - lib/python3.7/site-packages 23 | - lib/python3.8/site-packages 24 | 25 | script: 26 | - py.test --cov 27 | 28 | after_success: 29 | - CI=true TRAVIS=true coveralls 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Alvin Wan 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # [TexSoup](https://texsoup.alvinwan.com) 4 | 5 | [![PyPi Downloads per Day](https://img.shields.io/pypi/dm/texsoup.svg)](https://pypi.python.org/pypi/TexSoup/) 6 | [![Build Status](https://travis-ci.org/alvinwan/TexSoup.svg?branch=master)](https://travis-ci.org/alvinwan/TexSoup) 7 | [![Coverage Status](https://coveralls.io/repos/github/alvinwan/TexSoup/badge.svg?branch=master)](https://coveralls.io/github/alvinwan/TexSoup?branch=master) 8 | 9 | TexSoup is a fault-tolerant, Python3 package for searching, navigating, and modifying LaTeX documents. You can skip installation and try TexSoup directly, using the [pytwiddle demo →](https://pytwiddle.com/?id=example:latex.py) 10 | 11 | - [Getting Started](https://github.com/alvinwan/TexSoup#Getting-Started) 12 | - [Installation](https://github.com/alvinwan/TexSoup#Installation) 13 | - [API Reference](http://texsoup.alvinwan.com/docs/data.html) 14 | 15 | Created by [Alvin Wan](http://alvinwan.com) + [contributors](https://github.com/alvinwan/TexSoup/graphs/contributors). 16 | 17 | # Getting Started 18 | 19 | To parse a $LaTeX$ document, pass an open filehandle or a string into the 20 | `TexSoup` constructor. 21 | 22 | ``` python 23 | from TexSoup import TexSoup 24 | soup = TexSoup(""" 25 | \begin{document} 26 | 27 | \section{Hello \textit{world}.} 28 | 29 | \subsection{Watermelon} 30 | 31 | (n.) A sacred fruit. Also known as: 32 | 33 | \begin{itemize} 34 | \item red lemon 35 | \item life 36 | \end{itemize} 37 | 38 | Here is the prevalence of each synonym. 39 | 40 | \begin{tabular}{c c} 41 | red lemon & uncommon \\ 42 | life & common 43 | \end{tabular} 44 | 45 | \end{document} 46 | """) 47 | ``` 48 | 49 | With the soupified $\LaTeX$, you can now search and traverse the document tree. 50 | The code below demonstrates the basic functions that TexSoup provides. 51 | 52 | ```python 53 | >>> soup.section # grabs the first `section` 54 | \section{Hello \textit{world}.} 55 | >>> soup.section.name 56 | 'section' 57 | >>> soup.section.string 58 | 'Hello \\textit{world}.' 
59 | >>> soup.section.parent.name 60 | 'document' 61 | >>> soup.tabular 62 | \begin{tabular}{c c} 63 | red lemon & uncommon \\ 64 | life & common 65 | \end{tabular} 66 | >>> soup.tabular.args[0] 67 | 'c c' 68 | >>> soup.item 69 | \item red lemon 70 | >>> list(soup.find_all('item')) 71 | [\item red lemon, \item life] 72 | ``` 73 | 74 | For more use cases, see [the Quickstart Guide](https://texsoup.alvinwan.com/docs/quickstart.html). Or, try TexSoup [online, via pytwiddle →](https://pytwiddle.com/?id=example:latex.py) 75 | 76 | Links: 77 | 78 | - [Quickstart Guide: how and when to use TexSoup](http://texsoup.alvinwan.com/docs/quickstart.html) 79 | - [Example Use Cases: counting references, resolving imports, and more](https://github.com/alvinwan/TexSoup/tree/master/examples) 80 | 81 | # Installation 82 | 83 | ## Pip 84 | 85 | TexSoup is published via PyPi, so you can install it via `pip`. The package 86 | name is `TexSoup`: 87 | 88 | ```bash 89 | $ pip install texsoup 90 | ``` 91 | 92 | ## From source 93 | 94 | Alternatively, you can install the package from source: 95 | 96 | ```bash 97 | $ git clone https://github.com/alvinwan/TexSoup.git 98 | $ cd TexSoup 99 | $ pip install . 100 | ``` 101 | -------------------------------------------------------------------------------- /TexSoup/__init__.py: -------------------------------------------------------------------------------- 1 | """TexSoup's main utility is the ``TexSoup`` function. 2 | 3 | Invoke this function on a LaTeX string or file handler to obtain a parse 4 | tree with navigation, search, and modification utilities. 5 | """ 6 | 7 | from TexSoup.tex import read 8 | from TexSoup.data import TexNode 9 | 10 | __version__ = '0.3.1' 11 | 12 | 13 | # noinspection PyPep8Naming 14 | def TexSoup(tex_code, skip_envs=(), tolerance=0): 15 | r""" 16 | At a high-level, parses provided Tex into a navigable, searchable 17 | structure. This is accomplished in two steps: 18 | 19 | 1. Tex is parsed, cleaned, and packaged. 20 | 2. Structure fed to TexNodes for a searchable, coder-friendly interface. 21 | 22 | :param Union[str,iterable] tex_code: the Tex source 23 | :param Union[str] skip_envs: names of environments to skip parsing 24 | :param int tolerance: error tolerance level (only supports 0 or 1) 25 | :return: :class:`TexSoup.data.TexNode` object representing tex document 26 | 27 | >>> from TexSoup import TexSoup 28 | >>> soup = TexSoup(r''' 29 | ... \begin{document} 30 | ... 31 | ... \section{Hello \textit{world}.} 32 | ... 33 | ... \subsection{Watermelon} 34 | ... 35 | ... (n.) A sacred fruit. Also known as: 36 | ... 37 | ... \begin{itemize} 38 | ... \item red lemon 39 | ... \item life 40 | ... \end{itemize} 41 | ... 42 | ... Here is the prevalence of each synonym. 43 | ... 44 | ... \begin{tabular}{c c} 45 | ... red lemon & uncommon \\ \n 46 | ... life & common 47 | ... \end{tabular} 48 | ... 49 | ... \end{document} 50 | ... ''') 51 | >>> soup.section 52 | \section{Hello \textit{world}.} 53 | >>> soup.section.name 54 | 'section' 55 | >>> soup.section.string 56 | 'Hello \\textit{world}.' 
57 | >>> soup.section.parent.name 58 | 'document' 59 | >>> soup.tabular 60 | \begin{tabular}{c c} 61 | red lemon & uncommon \\ \n 62 | life & common 63 | \end{tabular} 64 | >>> soup.tabular.args[0].string 65 | 'c c' 66 | >>> soup.itemize 67 | \begin{itemize} 68 | \item red lemon 69 | \item life 70 | \end{itemize} 71 | >>> soup.item 72 | \item red lemon 73 | 74 | >>> list(soup.find_all('item')) 75 | [\item red lemon 76 | , \item life 77 | ] 78 | >>> soup = TexSoup(r'''\textbf{'Hello'}\textit{'Y'}O\textit{'U'}''') 79 | >>> soup.textbf.delete() 80 | >>> 'Hello' not in repr(soup) 81 | True 82 | >>> soup.textit.replace_with('S') 83 | >>> soup.textit.replace_with('U', 'P') 84 | >>> soup 85 | SOUP 86 | """ 87 | parsed, src = read(tex_code, skip_envs=skip_envs, tolerance=tolerance) 88 | return TexNode(parsed, src=src) 89 | -------------------------------------------------------------------------------- /TexSoup/category.py: -------------------------------------------------------------------------------- 1 | """Categorize all characters into one of category codes.""" 2 | 3 | from TexSoup.utils import CC, Token, to_buffer 4 | import string 5 | 6 | 7 | # Core category codes 8 | # https://www.overleaf.com/learn/latex/Table_of_TeX_category_codes 9 | others = set(string.printable) - set(string.ascii_letters) - \ 10 | set('{}\\$&\n\r#^_~%\x00\x7d \t[]()') 11 | CATEGORY_CODES = { 12 | CC.Escape: '\\', 13 | CC.GroupBegin: '{', 14 | CC.GroupEnd: '}', 15 | CC.MathSwitch: '$', 16 | CC.Alignment: '&', # not used 17 | CC.EndOfLine: ('\n', '\r'), 18 | CC.Macro: '#', # not used 19 | CC.Superscript: '^', # not used 20 | CC.Subscript: '_', # not used 21 | CC.Ignored: chr(0), 22 | CC.Spacer: (chr(32), chr(9)), 23 | CC.Letter: tuple(string.ascii_letters), # + lots of unicode 24 | CC.Other: tuple(others), 25 | CC.Active: '~', # not used 26 | CC.Comment: '%', 27 | CC.Invalid: chr(127), 28 | 29 | # custom 30 | CC.BracketBegin: '[', 31 | CC.BracketEnd: ']', 32 | CC.ParenBegin: '(', 33 | CC.ParenEnd: ')' 34 | } 35 | 36 | 37 | @to_buffer() 38 | def categorize(text): 39 | r"""Generator for category code tokens on text, ignoring comments. 40 | 41 | :param Union[str,iterator,Buffer] text: LaTeX to process 42 | 43 | >>> chars = list(categorize(r'\bf{}%[ello+😂')) 44 | >>> chars[0].category 45 | 46 | >>> chars[1].category 47 | 48 | >>> chars[3].category 49 | 50 | >>> chars[4].category 51 | 52 | >>> chars[5].category 53 | 54 | >>> chars[6].category 55 | 56 | >>> chars[-2].category 57 | 58 | >>> chars[-1].category 59 | 60 | >>> print(*chars) 61 | \ b f { } % [ e l l o + 😂 62 | >>> next(categorize(r''' 63 | ... 
''')).category 64 | 65 | """ 66 | for position, char in enumerate(text): 67 | 68 | value = None 69 | for cc, values in CATEGORY_CODES.items(): 70 | if char in values: 71 | value = char 72 | break 73 | 74 | if value is None: 75 | yield Token(char, position, CC.Other) 76 | else: 77 | yield Token(char, position, cc) 78 | -------------------------------------------------------------------------------- /TexSoup/reader.py: -------------------------------------------------------------------------------- 1 | """Parsing mechanisms should not be directly invoked publicly, as they are 2 | subject to change.""" 3 | 4 | from TexSoup.utils import Token, Buffer, MixedBuffer, CharToLineOffset 5 | from TexSoup.data import * 6 | from TexSoup.data import arg_type 7 | from TexSoup.tokens import ( 8 | TC, 9 | tokenize, 10 | SKIP_ENV_NAMES, 11 | MATH_ENV_NAMES, 12 | SPECIAL_COMMANDS, 13 | ) 14 | import functools 15 | import string 16 | import sys 17 | 18 | 19 | MODE_MATH = 'mode:math' 20 | MODE_NON_MATH = 'mode:non-math' 21 | MODE_SPECIAL = 'mode:special' 22 | MATH_SIMPLE_ENVS = ( 23 | TexDisplayMathModeEnv, 24 | TexMathModeEnv, 25 | TexDisplayMathEnv, 26 | TexMathEnv 27 | ) 28 | MATH_TOKEN_TO_ENV = {env.token_begin: env for env in MATH_SIMPLE_ENVS} 29 | ARG_BEGIN_TO_ENV = {arg.token_begin: arg for arg in arg_type} 30 | 31 | SIGNATURES = { 32 | 'def': (2, 0), 33 | 'textbf': (1, 0), 34 | 'section': (1, 1), 35 | 'label': (1, 0), 36 | 'cap': (0, 0), 37 | 'cup': (0, 0), 38 | 'in': (0, 0), 39 | 'notin': (0, 0), 40 | 'infty': (0, 0), 41 | 'noindent': (0, 0), 42 | } 43 | 44 | 45 | __all__ = ['read_expr', 'read_tex'] 46 | 47 | 48 | def read_tex(buf, skip_envs=(), tolerance=0): 49 | r"""Parse all expressions in buffer 50 | 51 | :param Buffer buf: a buffer of tokens 52 | :param Tuple[str] skip_envs: environments to skip parsing 53 | :param int tolerance: error tolerance level (only supports 0 or 1) 54 | :return: iterable over parsed expressions 55 | :rtype: Iterable[TexExpr] 56 | """ 57 | while buf.hasNext(): 58 | yield read_expr(buf, 59 | skip_envs=SKIP_ENV_NAMES + skip_envs, 60 | tolerance=tolerance) 61 | 62 | 63 | def make_read_peek(f): 64 | r"""Make any reader into a peek function. 65 | 66 | The wrapped function still parses the next sequence of tokens in the 67 | buffer but rolls back the buffer position afterwards. 68 | 69 | >>> from TexSoup.category import categorize 70 | >>> from TexSoup.tokens import tokenize 71 | >>> def read(buf): 72 | ... 
buf.forward(3) 73 | >>> buf = Buffer(tokenize(categorize(r'\item testing \textbf{hah}'))) 74 | >>> buf.position 75 | 0 76 | >>> make_read_peek(read)(buf) 77 | >>> buf.position 78 | 0 79 | """ 80 | @functools.wraps(f) 81 | def wrapper(buf, *args, **kwargs): 82 | start = buf.position 83 | ret = f(buf, *args, **kwargs) 84 | buf.backward(buf.position - start) 85 | return ret 86 | return wrapper 87 | 88 | 89 | def read_expr(src, skip_envs=(), tolerance=0, mode=MODE_NON_MATH): 90 | r"""Read next expression from buffer 91 | 92 | :param Buffer src: a buffer of tokens 93 | :param Tuple[str] skip_envs: environments to skip parsing 94 | :param int tolerance: error tolerance level (only supports 0 or 1) 95 | :param str mode: math or not math mode 96 | :return: parsed expression 97 | :rtype: [TexExpr, Token] 98 | """ 99 | c = next(src) 100 | if c.category in MATH_TOKEN_TO_ENV.keys(): 101 | expr = MATH_TOKEN_TO_ENV[c.category]([], position=c.position) 102 | return read_math_env(src, expr, tolerance=tolerance) 103 | elif c.category == TC.Escape: 104 | name, args = read_command(src, tolerance=tolerance, mode=mode) 105 | if name == 'item': 106 | assert mode != MODE_MATH, r'Command \item invalid in math mode.' 107 | contents = read_item(src) 108 | expr = TexCmd(name, contents, args, position=c.position) 109 | # if we are in "special" mode, we do not attempt to match the `\begin` 110 | # and `\end` 111 | elif name == 'begin' and mode != MODE_SPECIAL: 112 | assert args, 'Begin command must be followed by an env name.' 113 | expr = TexNamedEnv( 114 | args[0].string, args=args[1:], position=c.position) 115 | if expr.name in MATH_ENV_NAMES: 116 | mode = MODE_MATH 117 | if expr.name in skip_envs: 118 | read_skip_env(src, expr) 119 | else: 120 | read_env(src, expr, skip_envs=skip_envs,tolerance=tolerance, mode=mode) 121 | else: 122 | expr = TexCmd(name, args=args, position=c.position) 123 | return expr 124 | if c.category == TC.GroupBegin: 125 | return read_arg(src, c, tolerance=tolerance) 126 | 127 | assert isinstance(c, Token) 128 | return TexText(c) 129 | 130 | 131 | ################ 132 | # ENVIRONMENTS # 133 | ################ 134 | 135 | 136 | def read_item(src, tolerance=0): 137 | r"""Read the item content. Assumes escape has just been parsed. 138 | 139 | There can be any number of whitespace characters between \item and the 140 | first non-whitespace character. Any amount of whitespace between subsequent 141 | characters is also allowed. 142 | 143 | \item can also take an argument. 144 | 145 | :param Buffer src: a buffer of tokens 146 | :param int tolerance: error tolerance level (only supports 0 or 1) 147 | :return: contents of the item and any item arguments 148 | 149 | >>> from TexSoup.category import categorize 150 | >>> from TexSoup.tokens import tokenize 151 | >>> def read_item_from(string, skip=2): 152 | ... buf = tokenize(categorize(string)) 153 | ... _ = buf.forward(skip) 154 | ... return read_item(buf) 155 | >>> read_item_from(r'\item aaa {bbb} ccc\end{itemize}') 156 | [' aaa ', BraceGroup('bbb'), ' ccc'] 157 | >>> read_item_from(r'\item aaa \textbf{itemize}\item no') 158 | [' aaa ', TexCmd('textbf', [BraceGroup('itemize')])] 159 | >>> read_item_from(r'\item WITCH [nuuu] DOCTORRRR 👩🏻‍⚕️') 160 | [' WITCH ', '[', 'nuuu', ']', ' DOCTORRRR 👩🏻‍⚕️'] 161 | >>> read_item_from(r'''\begin{itemize} 162 | ... \item 163 | ... \item first item 164 | ... 
\end{itemize}''', skip=8) 165 | ['\n'] 166 | >>> read_item_from(r'''\def\itemeqn{\item}''', skip=7) 167 | [] 168 | """ 169 | extras = [] 170 | 171 | while src.hasNext(): 172 | if src.peek().category == TC.Escape: 173 | cmd_name, _ = make_read_peek(read_command)( 174 | src, 1, skip=1, tolerance=tolerance) 175 | if cmd_name in ('end', 'item'): 176 | return extras 177 | elif src.peek().category == TC.GroupEnd: 178 | break 179 | extras.append(read_expr(src, tolerance=tolerance)) 180 | return extras 181 | 182 | 183 | def unclosed_env_handler(src, expr, end): 184 | """Handle unclosed environments. 185 | 186 | Currently raises an end-of-file error. In the future, this can be the hub 187 | for unclosed-environment fault tolerance. 188 | 189 | :param Buffer src: a buffer of tokens 190 | :param TexExpr expr: expression for the environment 191 | :param int tolerance: error tolerance level (only supports 0 or 1) 192 | :param end str: Actual end token (as opposed to expected) 193 | """ 194 | clo = CharToLineOffset(str(src)) 195 | explanation = 'Instead got %s' % end if end else 'Reached end of file.' 196 | line, offset = clo(src.position) 197 | raise EOFError('[Line: %d, Offset: %d] "%s" env expecting %s. %s' % ( 198 | line, offset, expr.name, expr.end, explanation)) 199 | 200 | 201 | def read_math_env(src, expr, tolerance=0): 202 | r"""Read the environment from buffer. 203 | 204 | Advances the buffer until right after the end of the environment. Adds 205 | parsed content to the expression automatically. 206 | 207 | :param Buffer src: a buffer of tokens 208 | :param TexExpr expr: expression for the environment 209 | :rtype: TexExpr 210 | 211 | >>> from TexSoup.category import categorize 212 | >>> from TexSoup.tokens import tokenize 213 | >>> buf = tokenize(categorize(r'\min_x \|Xw-y\|_2^2')) 214 | >>> read_math_env(buf, TexMathModeEnv()) 215 | Traceback (most recent call last): 216 | ... 217 | EOFError: [Line: 0, Offset: 7] "$" env expecting $. Reached end of file. 218 | """ 219 | contents = [] 220 | while src.hasNext() and src.peek().category != expr.token_end: 221 | contents.append(read_expr(src, tolerance=tolerance, mode=MODE_MATH)) 222 | if not src.hasNext() or src.peek().category != expr.token_end: 223 | unclosed_env_handler(src, expr, src.peek()) 224 | next(src) 225 | expr.append(*contents) 226 | return expr 227 | 228 | 229 | def read_skip_env(src, expr): 230 | r"""Read the environment from buffer, WITHOUT parsing contents 231 | 232 | Advances the buffer until right after the end of the environment. Adds 233 | UNparsed content to the expression automatically. 234 | 235 | :param Buffer src: a buffer of tokens 236 | :param TexExpr expr: expression for the environment 237 | :rtype: TexExpr 238 | 239 | >>> from TexSoup.category import categorize 240 | >>> from TexSoup.tokens import tokenize 241 | >>> buf = tokenize(categorize(r' \textbf{aa \end{foobar}ha')) 242 | >>> read_skip_env(buf, TexNamedEnv('foobar')) 243 | TexNamedEnv('foobar', [' \\textbf{aa '], []) 244 | >>> buf = tokenize(categorize(r' \textbf{aa ha')) 245 | >>> read_skip_env(buf, TexNamedEnv('foobar')) #doctest:+ELLIPSIS 246 | Traceback (most recent call last): 247 | ... 248 | EOFError: ... 
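    Environments listed in ``SKIP_ENV_NAMES`` (``verbatim``, ``lstlisting`` and
    friends) are read this way so that their bodies are kept as raw, unparsed
    text. A rough sketch through the public API (illustrative only; it simply
    exercises the behavior documented above)::

        from TexSoup import TexSoup
        soup = TexSoup(r'\begin{verbatim}\notreal{x}\end{verbatim}')
        # the body '\notreal{x}' is stored as plain text on the environment,
        # not parsed into a TexCmd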
249 | """ 250 | def condition(s): return s.startswith('\\end{%s}' % expr.name) 251 | contents = [src.forward_until(condition, peek=False)] 252 | if not src.startswith('\\end{%s}' % expr.name): 253 | unclosed_env_handler(src, expr, src.peek((0, 6))) 254 | src.forward(5) 255 | expr.append(*contents) 256 | return expr 257 | 258 | 259 | def read_env(src, expr, skip_envs=(), tolerance=0, mode=MODE_NON_MATH): 260 | r"""Read the environment from buffer. 261 | 262 | Advances the buffer until right after the end of the environment. Adds 263 | parsed content to the expression automatically. 264 | 265 | :param Buffer src: a buffer of tokens 266 | :param TexExpr expr: expression for the environment 267 | :param int tolerance: error tolerance level (only supports 0 or 1) 268 | :param str mode: math or not math mode 269 | :rtype: TexExpr 270 | 271 | >>> from TexSoup.category import categorize 272 | >>> from TexSoup.tokens import tokenize 273 | >>> buf = tokenize(categorize(' tingtang \\end\n{foobar}walla')) 274 | >>> read_env(buf, TexNamedEnv('foobar')) 275 | TexNamedEnv('foobar', [' tingtang '], []) 276 | >>> buf = tokenize(categorize(' tingtang \\end\n\n{foobar}walla')) 277 | >>> read_env(buf, TexNamedEnv('foobar')) #doctest: +ELLIPSIS 278 | Traceback (most recent call last): 279 | ... 280 | EOFError: [Line: 0, Offset: 1] ... 281 | >>> buf = tokenize(categorize(' tingtang \\end\n\n{nope}walla')) 282 | >>> read_env(buf, TexNamedEnv('foobar'), tolerance=1) # error tolerance 283 | TexNamedEnv('foobar', [' tingtang '], []) 284 | """ 285 | contents = [] 286 | while src.hasNext(): 287 | if src.peek().category == TC.Escape: 288 | name, args = make_read_peek(read_command)( 289 | src, skip=1, tolerance=tolerance, mode=mode) 290 | if name == 'end': 291 | break 292 | contents.append(read_expr(src, skip_envs=skip_envs, tolerance=tolerance, mode=mode)) 293 | error = not src.hasNext() or not args or args[0].string != expr.name 294 | if error and tolerance == 0: 295 | unclosed_env_handler(src, expr, src.peek((0, 6))) 296 | elif not error: 297 | src.forward(5) 298 | expr.append(*contents) 299 | return expr 300 | 301 | 302 | ############ 303 | # COMMANDS # 304 | ############ 305 | 306 | 307 | # TODO: handle macro-weirdness e.g., \def\blah[#1][[[[[[[[#2{"#1 . #2"} 308 | # TODO: add newcommand macro 309 | def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0, 310 | mode=MODE_NON_MATH): 311 | r"""Read all arguments from buffer. 312 | 313 | This function assumes that the command name has already been parsed. By 314 | default, LaTeX allows only up to 9 arguments of both types, optional 315 | and required. If `n_optional` is not set, all valid bracket groups are 316 | captured. If `n_required` is not set, all valid brace groups are 317 | captured. 318 | 319 | :param Buffer src: a buffer of tokens 320 | :param TexArgs args: existing arguments to extend 321 | :param int n_required: Number of required arguments. If < 0, all valid 322 | brace groups will be captured. 323 | :param int n_optional: Number of optional arguments. If < 0, all valid 324 | bracket groups will be captured. 
325 | :param int tolerance: error tolerance level (only supports 0 or 1) 326 | :param str mode: math or not math mode 327 | :return: parsed arguments 328 | :rtype: TexArgs 329 | 330 | >>> from TexSoup.category import categorize 331 | >>> from TexSoup.tokens import tokenize 332 | >>> test = lambda s, *a, **k: read_args(tokenize(categorize(s)), *a, **k) 333 | >>> test('[walla]{walla}{ba]ng}') # 'regular' arg parse 334 | [BracketGroup('walla'), BraceGroup('walla'), BraceGroup('ba', ']', 'ng')] 335 | >>> test('\t[wa]\n{lla}\n\n{b[ing}') # interspersed spacers + 2 newlines 336 | [BracketGroup('wa'), BraceGroup('lla')] 337 | >>> test('\t[\t{a]}bs', 2, 0) # use char as arg, since no opt args 338 | [BraceGroup('['), BraceGroup('a', ']')] 339 | >>> test('\n[hue]\t[\t{a]}', 2, 1) # check stop opt arg capture 340 | [BracketGroup('hue'), BraceGroup('['), BraceGroup('a', ']')] 341 | >>> test('\t\\item') 342 | [] 343 | >>> test(' \t \n\t \n{bingbang}') 344 | [] 345 | >>> test('[tempt]{ing}[WITCH]{doctorrrr}', 0, 0) 346 | [] 347 | """ 348 | args = args or TexArgs() 349 | if n_required == 0 and n_optional == 0: 350 | return args 351 | 352 | n_optional = read_arg_optional(src, args, n_optional, tolerance, mode) 353 | n_required = read_arg_required(src, args, n_required, tolerance, mode) 354 | 355 | if src.hasNext() and src.peek().category == TC.BracketBegin: 356 | n_optional = read_arg_optional(src, args, n_optional, tolerance, mode) 357 | if src.hasNext() and src.peek().category == TC.GroupBegin: 358 | n_required = read_arg_required(src, args, n_required, tolerance, mode) 359 | return args 360 | 361 | 362 | def read_arg_optional( 363 | src, args, n_optional=-1, tolerance=0, mode=MODE_NON_MATH): 364 | """Read next optional argument from buffer. 365 | 366 | If the command has remaining optional arguments, look for: 367 | 368 | a. A spacer. Skip the spacer if it exists. 369 | b. A bracket delimiter. If the optional argument is bracket-delimited, 370 | the contents of the bracket group are used as the argument. 371 | 372 | :param Buffer src: a buffer of tokens 373 | :param TexArgs args: existing arguments to extend 374 | :param int n_optional: Number of optional arguments. If < 0, all valid 375 | bracket groups will be captured. 376 | :param int tolerance: error tolerance level (only supports 0 or 1) 377 | :param str mode: math or not math mode 378 | :return: number of remaining optional arguments 379 | :rtype: int 380 | """ 381 | while n_optional != 0: 382 | spacer = read_spacer(src) 383 | if not (src.hasNext() and src.peek().category == TC.BracketBegin): 384 | if spacer: 385 | src.backward(1) 386 | break 387 | args.append(read_arg(src, next(src), tolerance=tolerance, mode=mode)) 388 | n_optional -= 1 389 | return n_optional 390 | 391 | 392 | def read_arg_required( 393 | src, args, n_required=-1, tolerance=0, mode=MODE_NON_MATH): 394 | r"""Read next required argument from buffer. 395 | 396 | If the command has remaining required arguments, look for: 397 | 398 | a. A spacer. Skip the spacer if it exists. 399 | b. A curly-brace delimiter. If the required argument is brace-delimited, 400 | the contents of the brace group are used as the argument. 401 | c. Spacer or not, if a brace group is not found, simply use the next 402 | character, unless it is a backslash, in which case use the full command name 403 | 404 | :param Buffer src: a buffer of tokens 405 | :param TexArgs args: existing arguments to extend 406 | :param int n_required: Number of required arguments. 
If < 0, all valid 407 | brace groups will be captured. 408 | :param int tolerance: error tolerance level (only supports 0 or 1) 409 | :param str mode: math or not math mode 410 | :return: number of remaining optional arguments 411 | :rtype: int 412 | 413 | >>> from TexSoup.category import categorize 414 | >>> from TexSoup.tokens import tokenize 415 | >>> buf = tokenize(categorize('{wal]la}\n{ba ng}\n')) 416 | >>> args = TexArgs() 417 | >>> read_arg_required(buf, args) # 'regular' arg parse 418 | -3 419 | >>> args 420 | [BraceGroup('wal', ']', 'la'), BraceGroup('ba ng')] 421 | >>> buf.hasNext() and buf.peek().category == TC.MergedSpacer 422 | True 423 | """ 424 | while n_required != 0 and src.hasNext(): 425 | spacer = read_spacer(src) 426 | 427 | if src.hasNext() and src.peek().category == TC.GroupBegin: 428 | args.append(read_arg( 429 | src, next(src), tolerance=tolerance, mode=mode)) 430 | n_required -= 1 431 | continue 432 | elif src.hasNext() and n_required > 0: 433 | next_token = next(src) 434 | if next_token.category == TC.Escape: 435 | name, _ = read_command(src, 0, 0, tolerance=tolerance, mode=mode) 436 | args.append(TexCmd(name, position=next_token.position)) 437 | else: 438 | args.append('{%s}' % next_token) 439 | n_required -= 1 440 | continue 441 | 442 | if spacer: 443 | src.backward(1) 444 | break 445 | return n_required 446 | 447 | 448 | def read_arg(src, c, tolerance=0, mode=MODE_NON_MATH): 449 | r"""Read the argument from buffer. 450 | 451 | Advances buffer until right before the end of the argument. 452 | 453 | :param Buffer src: a buffer of tokens 454 | :param str c: argument token (starting token) 455 | :param int tolerance: error tolerance level (only supports 0 or 1) 456 | :param str mode: math or not math mode 457 | :return: the parsed argument 458 | :rtype: TexGroup 459 | 460 | >>> from TexSoup.category import categorize 461 | >>> from TexSoup.tokens import tokenize 462 | >>> s = r'''{\item\abovedisplayskip=2pt\abovedisplayshortskip=0pt~\vspace*{-\baselineskip}}''' 463 | >>> buf = tokenize(categorize(s)) 464 | >>> read_arg(buf, next(buf)) 465 | BraceGroup(TexCmd('item')) 466 | >>> buf = tokenize(categorize(r'{\incomplete! [complete]')) 467 | >>> read_arg(buf, next(buf), tolerance=1) 468 | BraceGroup(TexCmd('incomplete'), '! ', '[', 'complete', ']') 469 | """ 470 | content = [c] 471 | arg = ARG_BEGIN_TO_ENV[c.category] 472 | while src.hasNext(): 473 | if src.peek().category == arg.token_end: 474 | src.forward() 475 | return arg(*content[1:], position=c.position) 476 | else: 477 | content.append(read_expr(src, tolerance=tolerance, mode=mode)) 478 | 479 | if tolerance == 0: 480 | clo = CharToLineOffset(str(src)) 481 | line, offset = clo(c.position) 482 | raise TypeError( 483 | '[Line: %d, Offset %d] Malformed argument. First and last elements ' 484 | 'must match a valid argument format. In this case, TexSoup' 485 | ' could not find matching punctuation for: %s.\n' 486 | 'Just finished parsing: %s' % 487 | (line, offset, c, content)) 488 | return arg(*content[1:], position=c.position) 489 | 490 | 491 | def read_spacer(buf): 492 | r"""Extracts the next spacer, if there is one, before non-whitespace 493 | 494 | Define a spacer to be a contiguous string of only whitespace, with at most 495 | one line break. 
496 | 497 | >>> from TexSoup.category import categorize 498 | >>> from TexSoup.tokens import tokenize 499 | >>> read_spacer(Buffer(tokenize(categorize(' \t \n')))) 500 | ' \t \n' 501 | >>> read_spacer(Buffer(tokenize(categorize(' \t \n\t \n \t\n')))) 502 | ' \t \n\t ' 503 | >>> read_spacer(Buffer(tokenize(categorize('{')))) 504 | '' 505 | >>> read_spacer(Buffer(tokenize(categorize(' \t \na')))) 506 | '' 507 | >>> read_spacer(Buffer(tokenize(categorize(' \t \n\t \n \t\na')))) 508 | ' \t \n\t ' 509 | """ 510 | if buf.hasNext() and buf.peek().category == TC.MergedSpacer: 511 | return next(buf) 512 | return '' 513 | 514 | 515 | def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0, 516 | tolerance=0, mode=MODE_NON_MATH): 517 | r"""Parses command and all arguments. Assumes escape has just been parsed. 518 | 519 | No whitespace is allowed between escape and command name. e.g., 520 | :code:`\ textbf` is a backslash command, then text :code:`textbf`. Only 521 | :code:`\textbf` is the bold command. 522 | 523 | >>> from TexSoup.category import categorize 524 | >>> from TexSoup.tokens import tokenize 525 | >>> buf = Buffer(tokenize(categorize('\\sect \t \n\t{wallawalla}'))) 526 | >>> next(buf) 527 | '\\' 528 | >>> read_command(buf) 529 | ('sect', [BraceGroup('wallawalla')]) 530 | >>> buf = Buffer(tokenize(categorize('\\sect \t \n\t \n{bingbang}'))) 531 | >>> _ = next(buf) 532 | >>> read_command(buf) 533 | ('sect', []) 534 | >>> buf = Buffer(tokenize(categorize('\\sect{ooheeeee}'))) 535 | >>> _ = next(buf) 536 | >>> read_command(buf) 537 | ('sect', [BraceGroup('ooheeeee')]) 538 | >>> buf = Buffer(tokenize(categorize(r'\item aaa {bbb} ccc\end{itemize}'))) 539 | >>> read_command(buf, skip=1) 540 | ('item', []) 541 | >>> buf.peek() 542 | ' aaa ' 543 | 544 | # >>> buf = Buffer(tokenize(categorize('\\sect abcd'))) 545 | # >>> _ = next(buf) 546 | # >>> read_command(buf) 547 | # ('sect', ('a',)) 548 | """ 549 | for _ in range(skip): 550 | next(buf) 551 | 552 | name = next(buf) 553 | # if the command is a special one (like `newcommand`), enter "special" 554 | # mode, in which a single `\begin` or `\end` are allowed 555 | if name.text in SPECIAL_COMMANDS: 556 | mode = MODE_SPECIAL 557 | token = Token('', buf.position) 558 | if n_required_args < 0 and n_optional_args < 0: 559 | n_required_args, n_optional_args = SIGNATURES.get(name, (-1, -1)) 560 | args = read_args(buf, n_required_args, n_optional_args, 561 | tolerance=tolerance, mode=mode) 562 | # after parsing the command, go back to normal mode 563 | if name.text in SPECIAL_COMMANDS: 564 | mode = MODE_NON_MATH 565 | return name, args 566 | -------------------------------------------------------------------------------- /TexSoup/tex.py: -------------------------------------------------------------------------------- 1 | from TexSoup.reader import read_expr, read_tex 2 | from TexSoup.data import * 3 | from TexSoup.utils import * 4 | from TexSoup.tokens import tokenize 5 | from TexSoup.category import categorize 6 | import itertools 7 | 8 | 9 | def read(tex, skip_envs=(), tolerance=0): 10 | """Read and parse all LaTeX source. 
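    A minimal sketch of a typical call (illustrative; the return values are
    described under the parameters below)::

        parsed, src = read(r'\section{Hi}')
        # `parsed` is the global '[tex]' TexEnv wrapping the parse tree;
        # `src` is the original source string, returned unchanged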
11 | 12 | :param Union[str,iterable] tex: LaTeX source 13 | :param Union[str] skip_envs: names of environments to skip parsing 14 | :param int tolerance: error tolerance level (only supports 0 or 1) 15 | :return TexEnv: the global environment 16 | """ 17 | if not isinstance(tex, str): 18 | tex = ''.join(itertools.chain(*tex)) 19 | buf = categorize(tex) 20 | buf = tokenize(buf) 21 | buf = read_tex(buf, skip_envs=skip_envs, tolerance=tolerance) 22 | return TexEnv('[tex]', begin='', end='', contents=buf), tex 23 | -------------------------------------------------------------------------------- /TexSoup/tokens.py: -------------------------------------------------------------------------------- 1 | """Tokenization for all input. 2 | 3 | Translates string into iterable `TexSoup.utils.Buffer`, yielding one 4 | token at a time. 5 | """ 6 | 7 | from TexSoup.utils import to_buffer, Buffer, Token, CC 8 | from TexSoup.data import arg_type 9 | from TexSoup.category import categorize # used for tests 10 | from TexSoup.utils import IntEnum, TC 11 | import itertools 12 | import string 13 | 14 | # Custom higher-level combinations of primitives 15 | SKIP_ENV_NAMES = ('lstlisting', 'verbatim', 'verbatimtab', 'Verbatim', 'listing') 16 | MATH_ENV_NAMES = ( 17 | 'align', 'align*', 'alignat', 'array', 'displaymath', 'eqnarray', 18 | 'eqnarray*', 'equation', 'equation*', 'flalign', 'flalign*', 'gather', 19 | 'gather*', 'math', 'multline', 'multline*', 'split' 20 | ) 21 | SPECIAL_COMMANDS = {'newcommand', 'renewcommand', 'providecommand'} 22 | BRACKETS_DELIMITERS = { 23 | '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.' '|', r'\langle', 24 | r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil', r'\ulcorner', 25 | r'\urcorner', r'\lbrack', r'\rbrack' 26 | } 27 | # TODO: looks like left-right do have to match 28 | SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg') 29 | PUNCTUATION_COMMANDS = {command + bracket 30 | for command in SIZE_PREFIX 31 | for bracket in BRACKETS_DELIMITERS.union({'|', '.'})} 32 | 33 | __all__ = ['tokenize'] 34 | 35 | 36 | def next_token(text, prev=None): 37 | r"""Returns the next possible token, advancing the iterator to the next 38 | position to start processing from. 39 | 40 | :param Union[str,iterator,Buffer] text: LaTeX to process 41 | :return str: the token 42 | 43 | >>> b = categorize(r'\textbf{Do play\textit{nice}.} $$\min_w \|w\|_2^2$$') 44 | >>> print(next_token(b), next_token(b), next_token(b), next_token(b)) 45 | \ textbf { Do play 46 | >>> print(next_token(b), next_token(b), next_token(b), next_token(b)) 47 | \ textit { nice 48 | >>> print(next_token(b)) 49 | } 50 | >>> print(next_token(categorize('.}'))) 51 | . 52 | >>> next_token(b) 53 | '.' 54 | >>> next_token(b) 55 | '}' 56 | """ 57 | while text.hasNext(): 58 | for name, f in tokenizers: 59 | current_token = f(text, prev=prev) 60 | if current_token is not None: 61 | return current_token 62 | 63 | 64 | @to_buffer() 65 | def tokenize(text): 66 | r"""Generator for LaTeX tokens on text, ignoring comments. 67 | 68 | :param Union[str,iterator,Buffer] text: LaTeX to process 69 | 70 | >>> print(*tokenize(categorize(r'\\%}'))) 71 | \\ %} 72 | >>> print(*tokenize(categorize(r'\textbf{hello \\%}'))) 73 | \ textbf { hello \\ %} 74 | >>> print(*tokenize(categorize(r'\textbf{Do play \textit{nice}.}'))) 75 | \ textbf { Do play \ textit { nice } . 
} 76 | >>> print(*tokenize(categorize(r'\begin{tabular} 0 & 1 \\ 2 & 0 \end{tabular}'))) 77 | \ begin { tabular } 0 & 1 \\ 2 & 0 \ end { tabular } 78 | """ 79 | current_token = next_token(text) 80 | while current_token is not None: 81 | assert current_token.category in TC 82 | yield current_token 83 | current_token = next_token(text, prev=current_token) 84 | 85 | 86 | ############## 87 | # Tokenizers # 88 | ############## 89 | 90 | tokenizers = [] 91 | 92 | 93 | def token(name): 94 | """Marker for a token. 95 | 96 | :param str name: Name of tokenizer 97 | """ 98 | 99 | def wrap(f): 100 | tokenizers.append((name, f)) 101 | return f 102 | 103 | return wrap 104 | 105 | 106 | @token('escaped_symbols') 107 | def tokenize_escaped_symbols(text, prev=None): 108 | r"""Process an escaped symbol or a known punctuation command. 109 | 110 | :param Buffer text: iterator over line, with current position 111 | 112 | >>> tokenize_escaped_symbols(categorize(r'\\')) 113 | '\\\\' 114 | >>> tokenize_escaped_symbols(categorize(r'\\%')) 115 | '\\\\' 116 | >>> tokenize_escaped_symbols(categorize(r'\}')) 117 | '\\}' 118 | >>> tokenize_escaped_symbols(categorize(r'\%')) 119 | '\\%' 120 | >>> tokenize_escaped_symbols(categorize(r'\ ')) 121 | '\\ ' 122 | """ 123 | if text.peek().category == CC.Escape \ 124 | and text.peek(1) \ 125 | and text.peek(1).category in ( 126 | CC.Escape, CC.GroupBegin, CC.GroupEnd, CC.MathSwitch, 127 | CC.Alignment, CC.EndOfLine, CC.Macro, CC.Superscript, 128 | CC.Subscript, CC.Spacer, CC.Active, CC.Comment, CC.Other): 129 | result = text.forward(2) 130 | result.category = TC.EscapedComment 131 | return result 132 | 133 | 134 | @token('comment') 135 | def tokenize_line_comment(text, prev=None): 136 | r"""Process a line comment 137 | 138 | :param Buffer text: iterator over line, with current position 139 | 140 | >>> tokenize_line_comment(categorize('%hello world\\')) 141 | '%hello world\\' 142 | >>> tokenize_line_comment(categorize('hello %world')) 143 | >>> tokenize_line_comment(categorize('%}hello world')) 144 | '%}hello world' 145 | >>> tokenize_line_comment(categorize('%} ')) 146 | '%} ' 147 | >>> tokenize_line_comment(categorize('%hello\n world')) 148 | '%hello' 149 | >>> b = categorize(r'\\%') 150 | >>> _ = next(b), next(b) 151 | >>> tokenize_line_comment(b) 152 | '%' 153 | >>> tokenize_line_comment(categorize(r'\%')) 154 | """ 155 | result = Token('', text.position) 156 | if text.peek().category == CC.Comment and ( 157 | prev is None or prev.category != CC.Comment): 158 | result += text.forward(1) 159 | while text.hasNext() and text.peek().category != CC.EndOfLine: 160 | result += text.forward(1) 161 | result.category = TC.Comment 162 | return result 163 | 164 | 165 | @token('math_sym_switch') 166 | def tokenize_math_sym_switch(text, prev=None): 167 | r"""Group characters in math switches. 
168 | 169 | :param Buffer text: iterator over line, with current position 170 | 171 | >>> tokenize_math_sym_switch(categorize(r'$\min_x$ \command')) 172 | '$' 173 | >>> tokenize_math_sym_switch(categorize(r'$$\min_x$$ \command')) 174 | '$$' 175 | """ 176 | if text.peek().category == CC.MathSwitch: 177 | if text.peek(1) and text.peek(1).category == CC.MathSwitch: 178 | result = Token(text.forward(2), text.position) 179 | result.category = TC.DisplayMathSwitch 180 | else: 181 | result = Token(text.forward(1), text.position) 182 | result.category = TC.MathSwitch 183 | return result 184 | 185 | 186 | @token('math_asym_switch') 187 | def tokenize_math_asym_switch(text, prev=None): 188 | r"""Group characters in begin-end-style math switches 189 | 190 | :param Buffer text: iterator over line, with current position 191 | 192 | >>> tokenize_math_asym_switch(categorize(r'\[asf')) 193 | '\\[' 194 | >>> tokenize_math_asym_switch(categorize(r'\] sdf')) 195 | '\\]' 196 | >>> tokenize_math_asym_switch(categorize(r'[]')) 197 | """ 198 | mapping = { 199 | (CC.Escape, CC.BracketBegin): TC.DisplayMathGroupBegin, 200 | (CC.Escape, CC.BracketEnd): TC.DisplayMathGroupEnd, 201 | (CC.Escape, CC.ParenBegin): TC.MathGroupBegin, 202 | (CC.Escape, CC.ParenEnd): TC.MathGroupEnd 203 | } 204 | if not text.hasNext(2): 205 | return 206 | key = (text.peek().category, text.peek(1).category) 207 | if key in mapping: 208 | result = text.forward(2) 209 | result.category = mapping[key] 210 | return result 211 | 212 | 213 | @token('line_break') 214 | def tokenize_line_break(text, prev=None): 215 | r"""Extract LaTeX line breaks. 216 | 217 | >>> tokenize_line_break(categorize(r'\\aaa')) 218 | '\\\\' 219 | >>> tokenize_line_break(categorize(r'\aaa')) 220 | """ 221 | if text.peek().category == CC.Escape and text.peek(1) \ 222 | and text.peek(1).category == CC.Escape: 223 | result = text.forward(2) 224 | result.category = TC.LineBreak 225 | return result 226 | 227 | 228 | @token('ignore') 229 | def tokenize_ignore(text, prev=None): 230 | r"""Filter out ignored or invalid characters 231 | 232 | >>> print(*tokenize(categorize('\x00hello'))) 233 | hello 234 | """ 235 | while text.peek().category in (CC.Ignored, CC.Invalid): 236 | text.forward(1) 237 | 238 | 239 | @token('spacers') 240 | def tokenize_spacers(text, prev=None): 241 | r"""Combine spacers [ + line break [ + spacer]] 242 | 243 | >>> tokenize_spacers(categorize('\t\n{there')) 244 | '\t\n' 245 | >>> tokenize_spacers(categorize('\t\nthere')) 246 | >>> tokenize_spacers(categorize(' \t ')) 247 | ' \t ' 248 | >>> tokenize_spacers(categorize(r' ccc')) 249 | """ 250 | result = Token('', text.position) 251 | while text.hasNext() and text.peek().category == CC.Spacer: 252 | result += text.forward(1) 253 | if text.hasNext() and text.peek().category == CC.EndOfLine: 254 | result += text.forward(1) 255 | while text.hasNext() and text.peek().category == CC.Spacer: 256 | result += text.forward(1) 257 | result.category = TC.MergedSpacer 258 | 259 | if text.hasNext() and text.peek().category in (CC.Letter, CC.Other): 260 | text.backward(text.position - result.position) 261 | return 262 | 263 | if result: 264 | return result 265 | 266 | 267 | @token('symbols') 268 | def tokenize_symbols(text, prev=None): 269 | r"""Process singletone symbols as standalone tokens. 270 | 271 | :param Buffer text: iterator over line, with current position. 
Escape is 272 | isolated if not part of escaped char 273 | 274 | >>> next(tokenize(categorize(r'\begin turing'))) 275 | '\\' 276 | >>> next(tokenize(categorize(r'\bf {turing}'))) 277 | '\\' 278 | >>> next(tokenize(categorize(r'{]}'))).category 279 | 280 | """ 281 | mapping = { 282 | CC.Escape: TC.Escape, 283 | CC.GroupBegin: TC.GroupBegin, 284 | CC.GroupEnd: TC.GroupEnd, 285 | CC.BracketBegin: TC.BracketBegin, 286 | CC.BracketEnd: TC.BracketEnd 287 | } 288 | if text.peek().category in mapping.keys(): 289 | result = text.forward(1) 290 | result.category = mapping[result.category] 291 | return result 292 | 293 | 294 | # TODO: move me to parser (should parse punctuation as arg + 295 | # store punctuation commads as macro) 296 | @token('punctuation_command_name') 297 | def tokenize_punctuation_command_name(text, prev=None): 298 | """Process command that augments or modifies punctuation. 299 | 300 | This is important to the tokenization of a string, as opening or closing 301 | punctuation is not supposed to match. 302 | 303 | :param Buffer text: iterator over text, with current position 304 | """ 305 | if text.peek(-1) and text.peek(-1).category == CC.Escape: 306 | for point in PUNCTUATION_COMMANDS: 307 | if text.peek((0, len(point))) == point: 308 | result = text.forward(len(point)) 309 | result.category = TC.PunctuationCommandName 310 | return result 311 | 312 | 313 | @token('command_name') 314 | def tokenize_command_name(text, prev=None): 315 | r"""Extract most restrictive subset possibility for command name. 316 | 317 | Parser can later join allowed spacers and macros to assemble the final 318 | command name and arguments. 319 | 320 | >>> b = categorize(r'\bf{') 321 | >>> _ = next(b) 322 | >>> tokenize_command_name(b) 323 | 'bf' 324 | >>> b = categorize(r'\bf,') 325 | >>> _ = next(b) 326 | >>> tokenize_command_name(b) 327 | 'bf' 328 | >>> b = categorize(r'\bf*{') 329 | >>> _ = next(b) 330 | >>> tokenize_command_name(b) 331 | 'bf*' 332 | """ 333 | if text.peek(-1) and text.peek(-1).category == CC.Escape \ 334 | and text.peek().category == CC.Letter: 335 | c = text.forward(1) 336 | while text.hasNext() and text.peek().category == CC.Letter \ 337 | or text.peek() == '*': # TODO: what do about asterisk? 
338 | # TODO: excluded other, macro, super, sub, acttive, alignment 339 | # although macros can make these a part of the command name 340 | c += text.forward(1) 341 | c.category = TC.CommandName 342 | return c 343 | 344 | 345 | @token('string') 346 | def tokenize_string(text, prev=None): 347 | r"""Process a string of text 348 | 349 | :param Buffer text: iterator over line, with current position 350 | :param Union[None,iterable,str] delimiters: defines the delimiters 351 | 352 | >>> tokenize_string(categorize('hello')) 353 | 'hello' 354 | >>> b = categorize(r'hello again\command') 355 | >>> tokenize_string(b) 356 | 'hello again' 357 | >>> print(b.peek()) 358 | \ 359 | >>> print(tokenize_string(categorize(r'0 & 1\\\command'))) 360 | 0 & 1 361 | """ 362 | result = Token('', text.position, category=TC.Text) 363 | while text.hasNext() and text.peek().category not in ( 364 | CC.Escape, 365 | CC.GroupBegin, 366 | CC.GroupEnd, 367 | CC.MathSwitch, 368 | CC.BracketBegin, 369 | CC.BracketEnd, 370 | CC.Comment): 371 | result += next(text) 372 | return result 373 | -------------------------------------------------------------------------------- /TexSoup/utils.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import functools 3 | 4 | from enum import IntEnum as IntEnumBase 5 | 6 | 7 | ########## 8 | # Tokens # 9 | ########## 10 | 11 | 12 | def IntEnum(name, keys, start=1): 13 | """Explicitly define key-value pairs. For Python3.4 compatibility""" 14 | return IntEnumBase(name, 15 | [(key, index) for index, key in enumerate(keys, start=start)]) 16 | 17 | 18 | CC = IntEnum('CategoryCodes', ( 19 | 'Escape', 20 | 'GroupBegin', 21 | 'GroupEnd', 22 | 'MathSwitch', 23 | 'Alignment', 24 | 'EndOfLine', 25 | 'Macro', 26 | 'Superscript', 27 | 'Subscript', 28 | 'Ignored', 29 | 'Spacer', 30 | 'Letter', 31 | 'Other', 32 | 'Active', 33 | 'Comment', 34 | 'Invalid', 35 | 36 | # custom 37 | 'MathGroupBegin', 38 | 'MathGroupEnd', 39 | 'BracketBegin', 40 | 'BracketEnd', 41 | 'ParenBegin', 42 | 'ParenEnd' 43 | )) 44 | 45 | 46 | # Only includes items that cannot cause failures 47 | TC = IntEnum('TokenCode', ( 48 | 'Escape', 49 | 'GroupBegin', 50 | 'GroupEnd', 51 | 'Comment', 52 | 'MergedSpacer', # whitespace allowed between and arguments 53 | 'EscapedComment', 54 | 'MathSwitch', 55 | 'DisplayMathSwitch', 56 | 'MathGroupBegin', 57 | 'MathGroupEnd', 58 | 'DisplayMathGroupBegin', 59 | 'DisplayMathGroupEnd', 60 | 'LineBreak', 61 | 'CommandName', 62 | 'Text', 63 | 'BracketBegin', 64 | 'BracketEnd', 65 | 'ParenBegin', 66 | 'ParenEnd', 67 | 68 | # temporary (Replace with macros support) 69 | 'PunctuationCommandName', 70 | 'SizeCommand', 71 | 'Spacer' 72 | ), start=max(CC)) 73 | 74 | 75 | class Token(str): 76 | """Enhanced string object with knowledge of global position.""" 77 | 78 | # noinspection PyArgumentList 79 | def __new__(cls, text='', position=None, category=None): 80 | """Initializer for pseudo-string object. 
81 | 82 | :param text: The original string 83 | :param position: Position in the original buffer 84 | :param category: Category of token 85 | """ 86 | self = str.__new__(cls, text) 87 | if isinstance(text, Token): 88 | self.text = text.text 89 | self.position = text.position 90 | self.category = category or text.category 91 | else: 92 | self.text = text 93 | self.position = position 94 | self.category = category 95 | return self 96 | 97 | def __repr__(self): 98 | return repr(self.text) 99 | 100 | def __str__(self): 101 | return str(self.text) 102 | 103 | def __getattr__(self, name): 104 | return getattr(self.text, name) 105 | 106 | def __eq__(self, other): 107 | """ 108 | >>> Token('asdf', 0) == Token('asdf', 2) 109 | True 110 | >>> Token('asdf', 0) == Token('asd', 0) 111 | False 112 | """ 113 | if isinstance(other, Token): 114 | return self.text == other.text 115 | else: 116 | return self.text == other 117 | 118 | def __hash__(self): 119 | """ 120 | >>> hash(Token('asf')) == hash('asf') 121 | True 122 | """ 123 | return hash(self.text) 124 | 125 | def __add__(self, other): 126 | """Implements addition in the form of TextWithPosition(...) + (obj). 127 | 128 | >>> t1 = Token('as', 0) + Token('df', 1) 129 | >>> str(t1) 130 | 'asdf' 131 | >>> t1.position 132 | 0 133 | >>> t2 = Token('as', 1) + 'df' 134 | >>> str(t2) 135 | 'asdf' 136 | >>> t3 = Token(t2) 137 | >>> t3.position 138 | 1 139 | """ 140 | if isinstance(other, Token): 141 | return Token(self.text + other.text, self.position, self.category) 142 | else: 143 | return Token(self.text + other, self.position, self.category) 144 | 145 | def __radd__(self, other): 146 | """Implements addition in the form of (obj) + TextWithPosition(...). 147 | 148 | Note that if the first element is Token, 149 | Token(...).__add__(...) will be used. As a result, we 150 | can assume WLOG that `other` is a type other than Token. 151 | 152 | >>> t1 = Token('as', 2) + Token('dfg', 2) 153 | >>> str(t1) 154 | 'asdfg' 155 | >>> t1.position 156 | 2 157 | >>> t2 = 'as' + Token('dfg', 2) 158 | >>> str(t2) 159 | 'asdfg' 160 | >>> t2.position 161 | 0 162 | """ 163 | return Token( 164 | other + self.text, self.position - len(other), self.category) 165 | 166 | def __iadd__(self, other): 167 | """Implements addition in the form of TextWithPosition(...) += ... 
168 | 169 | >>> t1 = Token('as', 0) 170 | >>> t1 += 'df' 171 | >>> str(t1) 172 | 'asdf' 173 | >>> t1.position 174 | 0 175 | """ 176 | if isinstance(other, Token): 177 | new = Token(self.text + other.text, self.position, self.category) 178 | else: 179 | new = Token(self.text + other, self.position, self.category) 180 | return new 181 | 182 | @classmethod 183 | def join(cls, tokens, glue=''): 184 | if len(tokens) > 0: 185 | return Token( 186 | glue.join(t.text for t in tokens), 187 | tokens[0].position, 188 | tokens[0].category) 189 | else: 190 | return Token.Empty 191 | 192 | def __bool__(self): 193 | return bool(self.text) 194 | 195 | def __contains__(self, item): 196 | """ 197 | >>> 'rg' in Token('corgi', 0) 198 | True 199 | >>> 'reg' in Token('corgi', 0) 200 | False 201 | >>> Token('rg', 0) in Token('corgi', 0) 202 | True 203 | """ 204 | if isinstance(item, Token): 205 | return item.text in self.text 206 | return item in self.text 207 | 208 | def __iter__(self): 209 | """ 210 | >>> list(Token('asdf', 0)) 211 | ['a', 's', 'd', 'f'] 212 | """ 213 | return iter(self.__iter()) 214 | 215 | def __iter(self): 216 | for i, c in enumerate(self.text): 217 | yield Token(c, self.position + i, self.category) 218 | 219 | def __getitem__(self, i): 220 | """Access characters in object just as with strings. 221 | 222 | >>> t1 = Token('asdf', 2) 223 | >>> t1[0] 224 | 'a' 225 | >>> t1[-1] 226 | 'f' 227 | >>> t1[:] 228 | 'asdf' 229 | """ 230 | if isinstance(i, int): 231 | start = i 232 | else: 233 | start = i.start 234 | if start is None: 235 | start = 0 236 | if start < 0: 237 | start = len(self.text) + start 238 | return Token(self.text[i], self.position + start, self.category) 239 | 240 | def strip(self, *args, **kwargs): 241 | stripped = self.text.strip(*args, **kwargs) 242 | offset = self.text.find(stripped) 243 | return Token(stripped, self.position + offset, self.category) 244 | 245 | def lstrip(self, *args, **kwargs): 246 | """Strip leading whitespace for text. 247 | 248 | >>> t = Token(' asdf ', 2) 249 | >>> t.lstrip() 250 | 'asdf ' 251 | """ 252 | stripped = self.text.lstrip(*args, **kwargs) 253 | offset = self.text.find(stripped) 254 | return Token(stripped, self.position + offset, self.category) 255 | 256 | def rstrip(self, *args, **kwargs): 257 | """Strip trailing whitespace for text. 258 | 259 | >>> t = Token(' asdf ', 2) 260 | >>> t.rstrip() 261 | ' asdf' 262 | """ 263 | stripped = self.text.rstrip(*args, **kwargs) 264 | offset = self.text.find(stripped) 265 | return Token(stripped, self.position + offset, self.category) 266 | 267 | 268 | Token.Empty = Token('', position=0) 269 | 270 | 271 | # TODO: Rename to Buffer (formerly MixedBuffer) and StringBuffer 272 | # but needs test refactoring to change defaults 273 | class Buffer: 274 | """Converts string or iterable into a navigable iterator of strings. 275 | 276 | >>> b1 = Buffer("012345") 277 | >>> next(b1) 278 | '0' 279 | >>> b1.forward() 280 | '1' 281 | >>> b1.endswith('1') 282 | True 283 | >>> b1.backward(2) 284 | '01' 285 | >>> b1.peek() 286 | '0' 287 | >>> b1.peek(2) 288 | '2' 289 | >>> b1.peek((0, 2)) 290 | '01' 291 | >>> b1.startswith('01') 292 | True 293 | >>> b1[2:4] 294 | '23' 295 | >>> Buffer('asdf')[:10] 296 | 'asdf' 297 | >>> def gen(): 298 | ... for i in range(10): 299 | ... 
yield i 300 |     >>> list(gen()) 301 |     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 302 |     >>> list(Buffer(gen())) 303 |     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 304 |     """ 305 | 306 |     def __init__(self, iterator, join=Token.join, empty=lambda: '', 307 |                  init=lambda content, index: Token(content, index)): 308 |         """Initialization for Buffer. 309 | 310 |         :param iterator: iterator or iterable 311 |         :param func join: function to join multiple buffer elements 312 |         """ 313 |         assert hasattr(iterator, '__iter__'), 'Must be an iterable.' 314 |         self.__iterator = iter(iterator) 315 |         self.__queue = [] 316 |         self.__i = 0 317 |         self.__join = join 318 |         self.__init = init 319 |         self.__empty = empty 320 | 321 |     # noinspection PyPep8Naming 322 |     def hasNext(self, n=1): 323 |         """Returns whether or not there is another element.""" 324 |         return bool(self.peek(n - 1)) 325 | 326 |     def startswith(self, s): 327 |         """Check if iterator starts with s, beginning from the current 328 |         position.""" 329 |         return self.peek((0, len(s))).startswith(s) 330 | 331 |     def endswith(self, s): 332 |         """Check if iterator ends with s, ending at current position.""" 333 |         return self.peek((-len(s), 0)).endswith(s) 334 | 335 |     def forward(self, j=1): 336 |         """Move forward by j steps. 337 | 338 |         >>> b = Buffer('abcdef') 339 |         >>> b.forward(3) 340 |         'abc' 341 |         >>> b.forward(-2) 342 |         'bc' 343 |         """ 344 |         if j < 0: 345 |             return self.backward(-j) 346 |         self.__i += j 347 |         return self[self.__i - j:self.__i] 348 | 349 |     def num_forward_until(self, condition): 350 |         """Count the number of elements until the condition is met, without advancing the buffer. 351 | 352 |         :param Callable condition: lambda condition for the token to stop at 353 |         """ 354 |         i, c = 0, '' 355 |         while self.hasNext() and not condition(self.peek()): 356 |             c += self.forward(1) 357 |             i += 1 358 |         assert self.backward(i) == c 359 |         return i 360 | 361 |     def forward_until(self, condition, peek=True): 362 |         """Forward until the provided condition is met. 363 | 364 |         The returned string contains all characters found before the condition 365 |         was met. In other words, the condition will be true for the remainder 366 |         of the buffer. 367 | 368 |         :param Callable condition: lambda condition for the token to stop at 369 | 370 |         >>> buf = Buffer(map(str, range(9))) 371 |         >>> _ = buf.forward_until(lambda x: int(x) > 3) 372 |         >>> c = buf.forward_until(lambda x: int(x) > 6) 373 |         >>> c 374 |         '456' 375 |         >>> c.position 376 |         4 377 |         """ 378 |         c = self.__init(self.__empty(), self.peek().position) 379 |         while self.hasNext() and not condition(self.peek() if peek else self): 380 |             c += self.forward(1) 381 |         return c 382 | 383 |     def backward(self, j=1): 384 |         """Move backward by j steps. 385 | 386 |         >>> b = Buffer('abcdef') 387 |         >>> b.backward(-3) 388 |         'abc' 389 |         >>> b.backward(2) 390 |         'bc' 391 |         """ 392 |         if j < 0: 393 |             return self.forward(-j) 394 |         assert self.__i - j >= 0, 'Cannot move more than %d back' % self.__i 395 |         self.__i -= j 396 |         return self[self.__i:self.__i + j] 397 | 398 |     def peek(self, j=0): 399 |         """Peek at the next value(s), without advancing the Buffer. 400 | 401 |         Return None if index is out of range.
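        A minimal illustrative doctest (a sketch added for clarity; it only exercises the peek behavior already shown in the class-level examples above):

        >>> b = Buffer('abc')   # illustrative buffer, not from the original test suite
        >>> b.peek()
        'a'
        >>> b.peek((0, 2))
        'ab'
        >>> b.peek(10) is None
        True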
402 | """ 403 | try: 404 | if isinstance(j, int): 405 | return self[self.__i + j] 406 | return self[self.__i + j[0]:self.__i + j[1]] 407 | except IndexError: 408 | return None 409 | 410 | def __next__(self): 411 | """Implements next.""" 412 | while self.__i >= len(self.__queue): 413 | self.__queue.append(self.__init( 414 | next(self.__iterator), self.__i)) 415 | self.__i += 1 416 | return self.__queue[self.__i - 1] 417 | 418 | def __getitem__(self, i): 419 | """Supports indexing list. 420 | 421 | >>> b = Buffer('asdf') 422 | >>> b[5] 423 | Traceback (most recent call last): 424 | ... 425 | IndexError: list index out of range 426 | >>> b[0] 427 | 'a' 428 | >>> b[1:3] 429 | 'sd' 430 | >>> b[1:] 431 | 'sdf' 432 | >>> b[:3] 433 | 'asd' 434 | >>> b[:] 435 | 'asdf' 436 | """ 437 | if isinstance(i, int): 438 | old, j = self.__i, i 439 | else: 440 | old, j = self.__i, i.stop 441 | 442 | while j is None or self.__i <= j: 443 | try: 444 | next(self) 445 | except StopIteration: 446 | break 447 | self.__i = old 448 | if isinstance(i, int): 449 | return self.__queue[i] 450 | return self.__join(self.__queue[i]) 451 | 452 | def __iter__(self): 453 | return self 454 | 455 | @property 456 | def position(self): 457 | return self.__i 458 | 459 | 460 | class CharToLineOffset(object): 461 | """Utility to convert absolute position in the source file to 462 | line_no:char_no_in_line. This can be very useful if we want to parse LaTeX 463 | and navigate to some elements in the generated DVI/PDF via SyncTeX. 464 | 465 | >>> clo = CharToLineOffset('''hello 466 | ... world 467 | ... I scream for ice cream!''') 468 | >>> clo(3) 469 | (0, 3) 470 | >>> clo(6) 471 | (1, 0) 472 | >>> clo(12) 473 | (2, 0) 474 | """ 475 | 476 | def __init__(self, src): 477 | self.line_break_positions = [i for i, c in enumerate(src) if c == '\n'] 478 | self.src_len = len(src) 479 | 480 | def __call__(self, char_pos): 481 | line_no = bisect.bisect(self.line_break_positions, char_pos) 482 | if line_no == 0: 483 | char_no = char_pos 484 | elif line_no == len(self.line_break_positions): 485 | line_start = self.line_break_positions[-1] 486 | char_no = min(char_pos - line_start - 1, self.src_len - line_start) 487 | else: 488 | char_no = char_pos - self.line_break_positions[line_no - 1] - 1 489 | return line_no, char_no 490 | 491 | 492 | class MixedBuffer(Buffer): 493 | 494 | def __init__(self, iterator): 495 | """Initialization for Buffer, accepting types beyond strings. 496 | 497 | :param iterator: iterator or iterable 498 | :param func join: function to join multiple buffer elements 499 | 500 | >>> buf = MixedBuffer([324, 'adsf', lambda x: x]) 501 | >>> buf.peek() 502 | 324 503 | """ 504 | super().__init__(iterator, 505 | join=lambda x: x, empty=lambda x: [], 506 | init=lambda content, index: content) 507 | 508 | 509 | ############## 510 | # Decorators # 511 | ############## 512 | 513 | 514 | def to_buffer(convert_in=True, convert_out=True, Buffer=Buffer): 515 | """Decorator converting all strings and iterators/iterables into 516 | Buffers. 
517 | 518 | :param bool convert_in: Convert inputs where applicable to Buffers 519 | :param bool convert_out: Convert output to a Buffer 520 | :param type Buffer: Type of Buffer to convert into 521 | """ 522 | def decorator(f): 523 | @functools.wraps(f) 524 | def wrap(*args, **kwargs): 525 | iterator = args[0] 526 | if convert_in: 527 | iterator = kwargs.get('iterator', iterator) 528 | if not isinstance(iterator, Buffer): 529 | iterator = Buffer(iterator) 530 | output = f(iterator, *args[1:], **kwargs) 531 | if convert_out: 532 | return Buffer(output) 533 | return output 534 | return wrap 535 | return decorator 536 | 537 | 538 | def to_list(f): 539 | """Converts generator or iterable output to list 540 | 541 | >>> class A: 542 | ... @property 543 | ... @to_list 544 | ... def a(self): 545 | ... for i in range(3): 546 | ... yield i 547 | >>> A().a 548 | [0, 1, 2] 549 | """ 550 | @functools.wraps(f) 551 | def wrapper(*args, **kwargs): 552 | return list(f(*args, **kwargs)) 553 | return wrapper 554 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | src 2 | build 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = TexSoup 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=TexSoup 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 3 | sphinxcontrib.katex 4 | -------------------------------------------------------------------------------- /docs/source/_static/css/theme-mod.css: -------------------------------------------------------------------------------- 1 | /******** 2 | * LOGO * 3 | ********/ 4 | 5 | @media screen and (min-width: 768px) { 6 | .site-footer .footer-logo { 7 | background-size: 51px 80px; 8 | height: 51px; 9 | margin-bottom: 0; 10 | margin-bottom: 0; 11 | width: 80px; 12 | } 13 | } 14 | 15 | /********* 16 | * COLOR * 17 | *********/ 18 | 19 | html p a, html p a:link, html p a:visited { 20 | text-decoration-line: underline; 21 | } 22 | 23 | p a, p a:link, p a:visited { 24 | color:#999; 25 | } 26 | 27 | .pytorch-left-menu li.toctree-l1.current>a, .pytorch-right-menu li.toctree-l1.current>a, 28 | .header-holder .main-menu ul li.active:after, .header-holder .main-menu ul li.active a, 29 | ul.pytorch-breadcrumbs a, p a:hover, li a:hover, .btn:hover, 30 | article.pytorch-article .class em.property,.anchorjs-link:hover { 31 | color:transparent; 32 | -webkit-background-clip: text !important; 33 | -webkit-text-fill-color: transparent; 34 | background: rgb(241,90,36); 35 | background: linear-gradient(135deg, rgba(241,90,36,1) 0%, rgba(251,176,59,1) 100%); 36 | } 37 | 38 | .pytorch-left-menu li.toctree-l1.current > a::before, .pytorch-right-menu li.toctree-l1.current > a::before { 39 | color: transparent; 40 | height: 8px; 41 | width: 8px; 42 | border-radius: 50%; 43 | background: rgb(241,90,36); 44 | background: linear-gradient(135deg, rgba(241,90,36,1) 0%, rgba(251,176,59,1) 100%); 45 | } 46 | 47 | article.pytorch-article .class dt { 48 | background-clip: padding-box; 49 | -webkit-background-clip: padding-box; 50 | border: solid 3px transparent; 51 | } 52 | 53 | article.pytorch-article .class dt::before { 54 | content: ''; 55 | position: absolute; 56 | top: 0; 57 | right: 0; 58 | bottom: 0; 59 | left: 0; 60 | z-index: -1; 61 | 62 | margin-top:-3px; 63 | margin-left:1px; 64 | background: rgb(241,90,36); 65 | background: linear-gradient(135deg, rgba(241,90,36,1) 0%, rgba(251,176,59,1) 100%); 66 | } 67 | 68 | .pytorch-content-wrap { 69 | background-color:transparent; /* allows gradients to show up as borders */ 70 | } 71 | 72 | 73 | article.pytorch-article .attribute dt, 74 | article.pytorch-article .function dt, 75 | article.pytorch-article .class .attribute dt, 76 | article.pytorch-article .class .classmethod dt, 77 | article.pytorch-article .class .method dt, 78 | article.pytorch-article .class .staticmethod dt { 79 | border-left:3px solid transparent; 80 | } 81 | 82 | article.pytorch-article .attribute dt::before, 83 | article.pytorch-article .function dt::before, 84 | article.pytorch-article .class .attribute dt::before, 85 | article.pytorch-article .class .classmethod dt::before, 86 | article.pytorch-article .class .method dt::before, 87 | article.pytorch-article .class .staticmethod dt::before { 88 | margin-top:0; 89 | 
margin-left:-3px; 90 | background: linear-gradient(0deg, rgba(241,90,36,1) 0%, rgba(251,176,59,1) 100%); 91 | } 92 | 93 | /* HACK: why is this needed? */ 94 | article.pytorch-article .class dl.attribute dt::before { 95 | margin-left:-6px; 96 | margin-bottom: -3px; 97 | } 98 | 99 | /********* 100 | * FIXES * 101 | *********/ 102 | 103 | .pytorch-left-menu li.current ul, .pytorch-right-menu li.current ul { 104 | padding-left: 1em; 105 | } 106 | 107 | #docs-tutorials-resources { 108 | padding: 0; 109 | height: 0; 110 | margin: 0; 111 | } 112 | 113 | .built-with { 114 | margin-bottom:1em; 115 | } 116 | 117 | .pytorch-menu-vertical { 118 | padding-bottom: 1.625rem; 119 | } 120 | 121 | .pytorch-breadcrumbs-aside img { 122 | height: 20px; 123 | } 124 | 125 | @media screen and (min-width: 1101px) { 126 | .pytorch-left-menu li.toctree-l1.current>a:before, .pytorch-right-menu li.toctree-l1.current>a:before { 127 | top:5px; 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /docs/source/_static/images/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/android-chrome-192x192.png -------------------------------------------------------------------------------- /docs/source/_static/images/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/android-chrome-512x512.png -------------------------------------------------------------------------------- /docs/source/_static/images/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/apple-touch-icon.png -------------------------------------------------------------------------------- /docs/source/_static/images/arrow-down-orange.svg: -------------------------------------------------------------------------------- 1 | arrow-down-orange -------------------------------------------------------------------------------- /docs/source/_static/images/arrow-right-with-tail.svg: -------------------------------------------------------------------------------- 1 | arrow-right-with-tail -------------------------------------------------------------------------------- /docs/source/_static/images/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #ffffff 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/source/_static/images/chevron-right-orange.svg: -------------------------------------------------------------------------------- 1 | chevron-right-orange -------------------------------------------------------------------------------- /docs/source/_static/images/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/favicon-16x16.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon-32x32.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/favicon-32x32.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/images/logo-dark.svg: -------------------------------------------------------------------------------- 1 | logo-dark -------------------------------------------------------------------------------- /docs/source/_static/images/logo-icon.svg: -------------------------------------------------------------------------------- 1 | logo-icon -------------------------------------------------------------------------------- /docs/source/_static/images/logo.svg: -------------------------------------------------------------------------------- 1 | logo -------------------------------------------------------------------------------- /docs/source/_static/images/mstile-150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/images/mstile-150x150.png -------------------------------------------------------------------------------- /docs/source/_static/images/pytorch-x.svg: -------------------------------------------------------------------------------- 1 | pytorch-x -------------------------------------------------------------------------------- /docs/source/_static/images/safari-pinned-tab.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | Created by potrace 1.11, written by Peter Selinger 2001-2013 9 | 10 | 12 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/source/_static/images/search-icon.svg: -------------------------------------------------------------------------------- 1 | search-icon -------------------------------------------------------------------------------- /docs/source/_static/images/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "short_name": "", 4 | "icons": [ 5 | { 6 | "src": "/android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | }, 10 | { 11 | "src": "/android-chrome-512x512.png", 12 | "sizes": "512x512", 13 | "type": "image/png" 14 | } 15 | ], 16 | "theme_color": "#ffffff", 17 | "background_color": "#ffffff", 18 | "display": "standalone" 19 | } 20 | -------------------------------------------------------------------------------- /docs/source/_static/images/view-page-source-icon.svg: -------------------------------------------------------------------------------- 1 | view-page-source-icon -------------------------------------------------------------------------------- /docs/source/_static/texsoup.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvinwan/TexSoup/7dabf76fdde1f0d7f0d30ede841b3b45bbb609f6/docs/source/_static/texsoup.ai -------------------------------------------------------------------------------- 
/docs/source/_templates/cookie_banner.html: -------------------------------------------------------------------------------- 1 | 7 | -------------------------------------------------------------------------------- /docs/source/_templates/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | {% if (theme_prev_next_buttons_location == 'bottom' or theme_prev_next_buttons_location == 'both') and (next or prev) %} 5 | 13 | {% endif %} 14 | 15 | {% if theme_pytorch_project == 'tutorials' %} 16 | 17 |
18 |
19 |
Was this helpful?
20 | 21 | 22 |
Thank you
23 |
24 |
25 | 26 | {% else %} 27 | 28 |
29 | 30 | {% endif %} 31 | 32 |
33 |

34 | {%- if show_copyright %} 35 | {%- if hasdoc('copyright') %} 36 | {% trans path=pathto('copyright'), copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} 37 | {%- else %} 38 | {% trans copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} 39 | {%- endif %} 40 | {%- endif %} 41 | 42 | {%- if build_id and build_url %} 43 | {% trans build_url=build_url, build_id=build_id %} 44 | 45 | Build 46 | {{ build_id }}. 47 | 48 | {% endtrans %} 49 | {%- elif commit %} 50 | {% trans commit=commit %} 51 | 52 | Revision {{ commit }}. 53 | 54 | {% endtrans %} 55 | {%- elif last_updated %} 56 | {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} 57 | {%- endif %} 58 | 59 |

60 |
61 | 62 | {%- if show_sphinx %} 63 | {% trans %} 64 |
65 | Built with Sphinx using a theme provided by PyTorch. 66 |
67 | {% endtrans %} 68 | {%- endif %} 69 | 70 | {%- block extrafooter %} {% endblock %} 71 | 72 |
73 | -------------------------------------------------------------------------------- /docs/source/_templates/landing.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block mainbody %} 4 | 49 | 50 |
51 |

Navigate, Search, and Modify LaTeX Documents in Python

52 |

Easy and reliable: No C extensions, no installation dependencies, and 100% test coverage

53 |

Get started View on Github

54 |
55 | 56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {# TEMPLATE VAR SETTINGS #} 2 | 3 | {%- set extra_css_files = ["_static/css/theme-mod.css"] -%} 4 | {%- set favicon=1 -%} 5 | {%- set url_root = pathto('', 1) %} 6 | {%- if url_root == '#' %}{% set url_root = '' %}{% endif %} 7 | {%- if not embedded and docstitle %} 8 | {%- set titlesuffix = " — "|safe + docstitle|e %} 9 | {%- else %} 10 | {%- set titlesuffix = "" %} 11 | {%- endif %} 12 | {%- set lang_attr = 'en' if language == None else (language | replace('_', '-')) %} 13 | {% import 'theme_variables.jinja' as theme_variables %} 14 | {%- set theme_pytorch_project = 'docs' -%} 15 | 16 | 17 | 18 | 19 | 20 | 21 | {{ metatags }} 22 | 23 | {% block htmltitle %} 24 | {{ title|striptags|e }}{{ titlesuffix }} 25 | {% endblock %} 26 | 27 | {# FAVICON #} 28 | {% if favicon %} 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | {% endif %} 37 | {# CANONICAL URL #} 38 | {% if theme_canonical_url %} 39 | 40 | {% endif %} 41 | 42 | {# CSS #} 43 | 44 | {# OPENSEARCH #} 45 | {% if not embedded %} 46 | {% if use_opensearch %} 47 | 50 | {% endif %} 51 | 52 | {% endif %} 53 | 54 | 55 | 56 | {%- for css in css_files %} 57 | {%- if css|attr("rel") %} 58 | 59 | {%- else %} 60 | 61 | {%- endif %} 62 | {%- endfor %} 63 | {%- for cssfile in extra_css_files %} 64 | 65 | {%- endfor %} 66 | 67 | {%- block linktags %} 68 | {%- if hasdoc('about') %} 69 | 70 | {%- endif %} 71 | {%- if hasdoc('genindex') %} 72 | 73 | {%- endif %} 74 | {%- if hasdoc('search') %} 75 | 76 | {%- endif %} 77 | {%- if hasdoc('copyright') %} 78 | 79 | {%- endif %} 80 | {%- if next %} 81 | 82 | {%- endif %} 83 | {%- if prev %} 84 | 85 | {%- endif %} 86 | {%- endblock %} 87 | {%- block extrahead %} {% endblock %} 88 | 89 | {# Keep modernizr in head - http://modernizr.com/docs/#installing #} 90 | 91 | 92 |
93 |
94 |
95 | 96 | 97 | 106 | 107 | 108 |
109 | 110 |
111 |
112 | 113 | 114 | 115 | 116 | {% block extrabody %} {% endblock %} 117 | 118 | {% block mainbody %} 119 | {# SIDE NAV, TOGGLES ON MOBILE #} 120 | 121 | 125 | 126 | 170 | 171 |
172 |
173 |
174 | {% include "breadcrumbs.html" %} 175 |
176 | 177 |
178 | Shortcuts 179 |
180 |
181 | 182 |
183 |
184 | 185 | {% if theme_pytorch_project == 'tutorials' %} 186 | 187 | 206 | 207 | {% endif %} 208 | 209 | {%- block content %} 210 | {% if theme_style_external_links|tobool %} 211 | 231 | 232 |
233 |
234 |
235 | {{ toc }} 236 |
237 |
238 |
239 |
240 |
241 | 242 | {% include "versions.html" %} 243 | 244 | {% if not embedded %} 245 | 246 | {% if sphinx_version >= "1.8.0" %} 247 | 248 | {%- for scriptfile in script_files %} 249 | {{ js_tag(scriptfile) }} 250 | {%- endfor %} 251 | {% else %} 252 | 263 | {%- for scriptfile in script_files %} 264 | 265 | {%- endfor %} 266 | {% endif %} 267 | 268 | {% endif %} 269 | 270 | 271 | 272 | 273 | 274 | 279 | 280 | {% endblock %} 281 | 282 | {%- block footer %} {% endblock %} 283 | 284 | 285 |
286 |
287 | 307 | 308 | {% include "cookie_banner.html" %} 309 | 310 | 311 | 312 | 313 | 314 |
315 |
316 |
317 |
318 | 319 | 320 |
321 |
322 |
323 | 324 | 335 |
336 | 337 | 338 | 339 | 340 | 341 | 356 | 357 | 358 | 375 | 376 | 377 | -------------------------------------------------------------------------------- /docs/source/_templates/theme_variables.jinja: -------------------------------------------------------------------------------- 1 | {%- set urls = { 2 | 'github': 'https://github.com/alvinwan/texsoup', 3 | 'github_issues': 'https://github.com/alvinwan/texsoup/issues', 4 | 'getting_started': 'https://texsoup.alvinwan.com/docs/quickstart.html', 5 | 'docs': 'https://texsoup.alvinwan.com/docs/', 6 | 'home': 'https://texsoup.alvinwan.com/', 7 | 'md2py': 'https://github.com/alvinwan/md2py', 8 | 'tex2py': 'https://github.com/alvinwan/tex2py' 9 | } 10 | -%} 11 | {%- set main_menu_links = [ 12 | {'name': 'Home', 'href': urls['home']}, 13 | {'name': 'Docs', 'href': urls['docs']}, 14 | {'name': 'Github', 'href': urls['github'],} 15 | ] 16 | -%} 17 | {%- set footer_columns = [ 18 | {'name': 'TexSoup', 'href': urls['home'], 'links': [ 19 | {'name': 'Getting Started', 'href': urls['getting_started']} 20 | ]}, 21 | {'name': 'Support', 'links': [ 22 | {'name': 'Docs', 'href': urls['docs']}, 23 | {'name': 'Github Issues', 'href': urls['github_issues']} 24 | ]}, 25 | {'name': 'Related', 'links': [ 26 | {'name': 'Markdown2Python', 'href': urls['md2py']}, 27 | {'name': 'LaTeX2Python', 'href': urls['tex2py']} 28 | ]} 29 | ] 30 | -%} 31 | -------------------------------------------------------------------------------- /docs/source/categorizer.rst: -------------------------------------------------------------------------------- 1 | Categorizing Mechanics 2 | =================================== 3 | 4 | .. automodule:: TexSoup.category 5 | 6 | Categorizer 7 | ----------------------------------- 8 | 9 | .. autofunction:: categorize 10 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # TexSoup documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Dec 23 13:31:47 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | # import sys 22 | 23 | # source code directory, relative to this file, for sphinx-autobuild 24 | # sys.path.insert(0, os.path.abspath('../..')) 25 | 26 | import TexSoup 27 | 28 | RELEASE = os.environ.get('RELEASE', False) 29 | 30 | import pytorch_sphinx_theme 31 | 32 | # -- General configuration ------------------------------------------------ 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | needs_sphinx = '1.6' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 
41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.autosummary', 44 | 'sphinx.ext.doctest', 45 | 'sphinx.ext.intersphinx', 46 | 'sphinx.ext.todo', 47 | 'sphinx.ext.coverage', 48 | 'sphinx.ext.napoleon', 49 | 'sphinx.ext.viewcode', 50 | 'sphinxcontrib.katex', 51 | ] 52 | 53 | # katex options 54 | # 55 | # 56 | 57 | katex_options = r''' 58 | delimiters : [ 59 | {left: "$$", right: "$$", display: true}, 60 | {left: "\\(", right: "\\)", display: false}, 61 | {left: "\\[", right: "\\]", display: true} 62 | ] 63 | ''' 64 | 65 | napoleon_use_ivar = True 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ['_templates'] 69 | 70 | # The suffix(es) of source filenames. 71 | # You can specify multiple suffix as a list of string: 72 | # 73 | # source_suffix = ['.rst', '.md'] 74 | source_suffix = '.rst' 75 | 76 | # The master toctree document. 77 | master_doc = 'index' 78 | 79 | # General information about the project. 80 | project = 'TexSoup' 81 | copyright = '2020, Alvin Wan' 82 | author = 'Alvin Wan' 83 | 84 | # The version info for the project you're documenting, acts as replacement for 85 | # |version| and |release|, also used in various other places throughout the 86 | # built documents. 87 | # 88 | # The short X.Y version. 89 | # TODO: change to [:2] at v1.0 90 | version = 'master (' + TexSoup.__version__ + ' )' 91 | # The full version, including alpha/beta/rc tags. 92 | # TODO: verify this works as expected 93 | release = 'master' 94 | 95 | # The language for content autogenerated by Sphinx. Refer to documentation 96 | # for a list of supported languages. 97 | # 98 | # This is also used if you do content translation via gettext catalogs. 99 | # Usually you set "language" from the command line for these cases. 100 | language = None 101 | 102 | # List of patterns, relative to source directory, that match files and 103 | # directories to ignore when looking for source files. 104 | # This patterns also effect to html_static_path and html_extra_path 105 | exclude_patterns = [] 106 | 107 | # The name of the Pygments (syntax highlighting) style to use. 108 | pygments_style = 'sphinx' 109 | 110 | # If true, `todo` and `todoList` produce output, else they produce nothing. 111 | todo_include_todos = True 112 | 113 | # Disable docstring inheritance 114 | autodoc_inherit_docstrings = False 115 | 116 | 117 | # -- katex javascript in header 118 | # 119 | # def setup(app): 120 | # app.add_javascript("https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.js") 121 | 122 | 123 | # -- Options for HTML output ---------------------------------------------- 124 | # 125 | # The theme to use for HTML and HTML Help pages. See the documentation for 126 | # a list of builtin themes. 127 | # 128 | # 129 | # 130 | 131 | html_theme = 'pytorch_sphinx_theme' 132 | html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] 133 | 134 | # Theme options are theme-specific and customize the look and feel of a theme 135 | # further. For a list of options available for each theme, see the 136 | # documentation. 137 | 138 | html_theme_options = { 139 | 'pytorch_project': 'docs', 140 | 'canonical_url': 'https://texsoup.alvinwan.com', 141 | 'collapse_navigation': False, 142 | 'display_version': True, 143 | 'logo_only': True, 144 | } 145 | 146 | html_logo = '_static/images/logo-dark.svg' 147 | 148 | 149 | # Add any paths that contain custom static files (such as style sheets) here, 150 | # relative to this directory. 
They are copied after the builtin static files, 151 | # so a file named "default.css" will overwrite the builtin "default.css". 152 | html_static_path = ['_static'] # , '_images'] 153 | 154 | 155 | # Called automatically by Sphinx, making this `conf.py` an "extension". 156 | def setup(app): 157 | # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value 158 | # and can be moved outside of this function (and the setup(app) function 159 | # can be deleted). 160 | html_css_files = [ 161 | 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css' 162 | ] 163 | 164 | # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is 165 | # `add_stylesheet` (deprecated in 1.8). 166 | add_css = getattr(app, 'add_css_file', app.add_stylesheet) 167 | for css_file in html_css_files: 168 | add_css(css_file) 169 | 170 | 171 | # -- Options for HTMLHelp output ------------------------------------------ 172 | 173 | # Output file base name for HTML help builder. 174 | htmlhelp_basename = 'TexSoupdoc' 175 | 176 | 177 | # -- Options for LaTeX output --------------------------------------------- 178 | 179 | latex_elements = { 180 | # The paper size ('letterpaper' or 'a4paper'). 181 | # 182 | # 'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | # 186 | # 'pointsize': '10pt', 187 | 188 | # Additional stuff for the LaTeX preamble. 189 | # 190 | # 'preamble': '', 191 | 192 | # Latex figure (float) alignment 193 | # 194 | # 'figure_align': 'htbp', 195 | } 196 | 197 | # Grouping the document tree into LaTeX files. List of tuples 198 | # (source start file, target name, title, 199 | # author, documentclass [howto, manual, or own class]). 200 | latex_documents = [ 201 | (master_doc, 'pytorch.tex', 'TexSoup Documentation', 202 | 'Alvin Wan', 'manual'), 203 | ] 204 | 205 | 206 | # -- Options for manual page output --------------------------------------- 207 | 208 | # One entry per manual page. List of tuples 209 | # (source start file, name, description, authors, manual section). 210 | man_pages = [ 211 | (master_doc, 'TexSoup', 'TexSoup Documentation', 212 | [author], 1) 213 | ] 214 | 215 | 216 | # -- Options for Texinfo output ------------------------------------------- 217 | 218 | # Grouping the document tree into Texinfo files. List of tuples 219 | # (source start file, target name, title, author, 220 | # dir menu entry, description, category) 221 | texinfo_documents = [ 222 | (master_doc, 'TexSoup', 'TexSoup Documentation', 223 | author, 'TexSoup', 'One line description of project.', 224 | 'Miscellaneous'), 225 | ] 226 | 227 | 228 | # Example configuration for intersphinx: refer to the Python standard library. 229 | intersphinx_mapping = { 230 | 'python': ('https://docs.python.org/', None), 231 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 232 | } 233 | 234 | # -- A patch that prevents Sphinx from cross-referencing ivar tags ------- 235 | # See http://stackoverflow.com/a/41184353/3343043 236 | 237 | from docutils import nodes 238 | from sphinx.util.docfields import TypedField 239 | from sphinx import addnodes 240 | 241 | 242 | def patched_make_field(self, types, domain, items, **kw): 243 | # `kw` catches `env=None` needed for newer sphinx while maintaining 244 | # backwards compatibility when passed along further down! 
245 | 246 | # type: (List, unicode, Tuple) -> nodes.field 247 | def handle_item(fieldarg, content): 248 | par = nodes.paragraph() 249 | par += addnodes.literal_strong('', fieldarg) # Patch: this line added 250 | # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, 251 | # addnodes.literal_strong)) 252 | if fieldarg in types: 253 | par += nodes.Text(' (') 254 | # NOTE: using .pop() here to prevent a single type node to be 255 | # inserted twice into the doctree, which leads to 256 | # inconsistencies later when references are resolved 257 | fieldtype = types.pop(fieldarg) 258 | if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): 259 | typename = u''.join(n.astext() for n in fieldtype) 260 | typename = typename.replace('int', 'python:int') 261 | typename = typename.replace('long', 'python:long') 262 | typename = typename.replace('float', 'python:float') 263 | typename = typename.replace('type', 'python:type') 264 | par.extend(self.make_xrefs(self.typerolename, domain, typename, 265 | addnodes.literal_emphasis, **kw)) 266 | else: 267 | par += fieldtype 268 | par += nodes.Text(')') 269 | par += nodes.Text(' -- ') 270 | par += content 271 | return par 272 | 273 | fieldname = nodes.field_name('', self.label) 274 | if len(items) == 1 and self.can_collapse: 275 | fieldarg, content = items[0] 276 | bodynode = handle_item(fieldarg, content) 277 | else: 278 | bodynode = self.list_type() 279 | for fieldarg, content in items: 280 | bodynode += nodes.list_item('', handle_item(fieldarg, content)) 281 | fieldbody = nodes.field_body('', bodynode) 282 | return nodes.field('', fieldname, fieldbody) 283 | 284 | TypedField.make_field = patched_make_field 285 | -------------------------------------------------------------------------------- /docs/source/data.rst: -------------------------------------------------------------------------------- 1 | Data Structures 2 | =================================== 3 | 4 | .. automodule:: TexSoup.data 5 | 6 | Node 7 | ----------------------------------- 8 | 9 | .. autoclass:: TexNode() 10 | :members: 11 | 12 | Expressions 13 | ----------------------------------- 14 | 15 | .. autoclass:: TexExpr() 16 | :members: 17 | 18 | .. autoclass:: TexEnv() 19 | :members: 20 | 21 | .. autoclass:: TexCmd() 22 | :members: 23 | 24 | Groups 25 | ----------------------------------- 26 | 27 | .. autoclass:: TexGroup() 28 | :members: 29 | 30 | .. autoclass:: BracketGroup() 31 | :members: 32 | 33 | .. autoclass:: BraceGroup() 34 | :members: 35 | 36 | .. autoclass:: TexArgs() 37 | :members: 38 | 39 | Environments 40 | ----------------------------------- 41 | 42 | .. autoclass:: TexNamedEnv() 43 | :members: 44 | 45 | .. autoclass:: TexUnNamedEnv() 46 | :members: 47 | 48 | .. autoclass:: TexMathEnv() 49 | :members: 50 | 51 | .. autoclass:: TexDisplayMathEnv() 52 | :members: 53 | 54 | .. autoclass:: TexMathModeEnv() 55 | :members: 56 | 57 | .. autoclass:: TexDisplayMathModeEnv() 58 | :members: 59 | 60 | Text 61 | ---------------------------------- 62 | 63 | .. autoclass:: TexText() 64 | :members: 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. TexSoup documentation master file, created by 2 | sphinx-quickstart on Sat Apr 6 22:08:46 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | TexSoup documentation 7 | =================================== 8 | 9 | TexSoup is a Python3 library for pulling data from :math:`\LaTeX` files. It 10 | turns even invalid sources into a BeautifulSoup-esque structure that you can 11 | navigate, search, and modify. 12 | 13 | .. toctree:: 14 | :maxdepth: 1 15 | :caption: Guides 16 | 17 | quickstart 18 | soup 19 | navigation 20 | searching 21 | modification 22 | 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: How it Works 27 | 28 | categorizer 29 | tokenizer 30 | parser 31 | 32 | .. toctree:: 33 | :maxdepth: 1 34 | :caption: Package Reference 35 | 36 | main 37 | data 38 | utils 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | -------------------------------------------------------------------------------- /docs/source/main.rst: -------------------------------------------------------------------------------- 1 | Main Interface 2 | =================================== 3 | 4 | .. automodule:: TexSoup 5 | 6 | .. autofunction:: TexSoup 7 | -------------------------------------------------------------------------------- /docs/source/modification.rst: -------------------------------------------------------------------------------- 1 | Modification 2 | =================================== 3 | 4 | You can also modify the document using the TexSoup tree, then export the changes 5 | back to a :math:`\LaTeX` file. 6 | 7 | Commands 8 | ----------------------------------- 9 | 10 | As mentioned in :ref:`page-soup`, you can change commands and their arguments. 11 | 12 | >>> soup = TexSoup(r'I am \textbf{\large Large and bold}') 13 | >>> cmd = soup.textbf 14 | >>> cmd.name = 'textit' 15 | >>> cmd 16 | \textit{\large Large and bold} 17 | 18 | You can set :code:`.string` for any single-argument command (e.g., :code:`\section`). 19 | 20 | >>> cmd.string = 'corgis are the best' 21 | >>> cmd 22 | \textit{corgis are the best} 23 | 24 | You can do the same for any command in math mode. 25 | 26 | >>> soup2 = TexSoup(r'$$\textrm{math}\sum$$') 27 | >>> soup2.textrm.string = 'not math' 28 | >>> soup2 29 | $$\textrm{not math}\sum$$ 30 | 31 | You can also remove any command in-place, by calling :code:`.delete` on it. 32 | 33 | >>> soup2.textrm.delete() 34 | >>> soup2 35 | $$\sum$$ 36 | 37 | Arguments 38 | ----------------------------------- 39 | 40 | You can modify arguments just as you would a list. 41 | 42 | >>> cmd.args.append('{moar}') 43 | >>> cmd 44 | \textit{corgis are the best}{moar} 45 | >>> cmd.args.remove('{moar}') 46 | >>> cmd 47 | \textit{corgis are the best} 48 | >>> cmd.args.extend(['[moar]', '{crazy}']) 49 | \textit{corgis are the best}[moar]{crazy} 50 | >>> cmd.args = cmd.args[:2] 51 | >>> cmd 52 | \textit{corgis are the best}[moar] 53 | 54 | Use the argument's :code:`.string` attribute to modify the argument's contents. 55 | 56 | >>> cmd.args[0].string = 'no' 57 | >>> cmd 58 | \textit{no}[moar] 59 | 60 | Environments 61 | ----------------------------------- 62 | 63 | Use the :code:`.string` attribute to modify any environment with only text content 64 | (i.e., a verbatim or math environment). 65 | 66 | >>> soup = TexSoup(r'\begin{verbatim}Huehue\end{verbatim}') 67 | >>> soup.verbatim.string = 'HUEHUE' 68 | >>> soup 69 | \begin{verbatim}HUEHUE\end{verbatim} 70 | >>> soup = TexSoup(r'$$\text{math}$$') 71 | >>> soup.text.string = '' 72 | 73 | You can add to an environment's contents using list-like operations, like 74 | :code:`.append`, :code:`.remove`, :code:`.insert`, and :code:`.extend`. 
75 | 76 | >>> from TexSoup import TexSoup 77 | >>> soup = TexSoup(r''' 78 | ... \begin{itemize} 79 | ... \item Hello 80 | ... \item Bye 81 | ... \end{itemize}''') 82 | >>> tmp = soup.item 83 | >>> soup.itemize.remove(soup.item) 84 | >>> soup.itemize 85 | \begin{itemize} 86 | \item Bye 87 | \end{itemize} 88 | >>> soup.insert(1, tmp) 89 | >>> soup 90 | \begin{itemize} 91 | \item Hello 92 | \item Bye 93 | \end{itemize} 94 | 95 | See :class:`TexSoup.data.TexNode` for more utilities. 96 | -------------------------------------------------------------------------------- /docs/source/navigation.rst: -------------------------------------------------------------------------------- 1 | Navigation 2 | =================================== 3 | 4 | Here's the :math:`\LaTeX` document from the quickstart guide:: 5 | 6 | >>> tex_doc = """ 7 | ... \begin{document} 8 | ... \section{Hello \textit{world}.} 9 | ... \subsection{Watermelon} 10 | ... (n.) A sacred fruit. Also known as: 11 | ... \begin{itemize} 12 | ... \item red lemon 13 | ... \item life 14 | ... \end{itemize} 15 | ... Here is the prevalence of each synonym, in Table \ref{table:synonyms}. 16 | ... \begin{tabular}{c c}\label{table:synonyms} 17 | ... red lemon & uncommon \\ \n 18 | ... life & common 19 | ... \end{tabular} 20 | ... \end{document} 21 | ... """ 22 | >>> from TexSoup import TexSoup 23 | >>> soup = TexSoup(tex_doc) 24 | 25 | Going Down 26 | ----------------------------------- 27 | 28 | Some expressions contain content. For example, environments may contain items. 29 | TexSoup provides attributes for navigating an environment's children. 30 | 31 | Navigate by naming the expression you want. For example, to access italicized 32 | text, use :code:`soup.textit`:: 33 | 34 | >>> soup.textit 35 | \textit{world} 36 | 37 | You can use this to select expressions from a specific part of the document. 38 | For example, this retrieves an item from an itemize environment:: 39 | 40 | >>> soup.itemize.item 41 | \item red lemon 42 | 43 | 44 | 45 | Note that accessing by name only returns the first result. 46 | 47 | >>> soup.item 48 | \item red lemon 49 | 50 | 51 | To access *all* items, use one of the utilities from :ref:`page-search`, such 52 | as :code:`find_all`:: 53 | 54 | >>> soup.find_all('item') 55 | [\item red lemon 56 | , \item life 57 | ] 58 | 59 | An environment's contents are accessible via a list called :code:`contents`. 60 | Note that changing this list in-place will not affect the environment:: 61 | 62 | >>> soup.itemize.contents 63 | [\item red lemon 64 | , \item life 65 | ] 66 | 67 | There are several views into an environment's content: 68 | 69 | - :code:`.children`: Nested Tex expressions. Does not include floating text. 70 | - :code:`.contents`: Nested Tex expressions and text. Does not contain whitespace-only text. 71 | - :code:`.expr.all`: Nested Tex expressions and text, including whitespace-only text. All information needed to reconstruct the original source. 72 | - :code:`.descendants`: Tex expressions nested inside other Tex expressions. 73 | - :code:`.text`: Used to "detex" a source file. Returns text from all descendants, without Tex expressions. 74 | 75 | If a command has only one required argument, or an environment has only one 76 | child, these values are made available as a :code:`.string`.
77 | 78 | >>> soup.textit.string 79 | 'world' 80 | 81 | Going Up 82 | ----------------------------------- 83 | 84 | You can access an expression's parent with the :code:`.parent` attribute:: 85 | 86 | >>> soup.textit.parent 87 | \section{Hello \textit{world}.} 88 | -------------------------------------------------------------------------------- /docs/source/parser.rst: -------------------------------------------------------------------------------- 1 | Parsing Mechanics 2 | =================================== 3 | 4 | .. automodule:: TexSoup.reader 5 | 6 | Parser 7 | ----------------------------------- 8 | 9 | .. autofunction:: read_tex 10 | .. autofunction:: read_expr 11 | .. autofunction:: read_spacer 12 | .. autofunction:: make_read_peek 13 | 14 | Environment Parser 15 | ----------------------------------- 16 | 17 | .. autofunction:: read_item 18 | .. autofunction:: unclosed_env_handler 19 | .. autofunction:: read_math_env 20 | .. autofunction:: read_skip_env 21 | .. autofunction:: read_env 22 | 23 | Argument Parser 24 | ----------------------------------- 25 | 26 | .. autofunction:: read_args 27 | .. autofunction:: read_arg_optional 28 | .. autofunction:: read_arg_required 29 | .. autofunction:: read_arg 30 | 31 | Command Parser 32 | ----------------------------------- 33 | 34 | .. autofunction:: read_command 35 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =================================== 3 | 4 | The following illustrates some basic TexSoup functions. 5 | 6 | How to Use 7 | ----------------------------------- 8 | 9 | Here is a :math:`\LaTeX` document:: 10 | 11 | >>> tex_doc = """ 12 | ... \begin{document} 13 | ... \section{Hello \textit{world}.} 14 | ... \subsection{Watermelon} 15 | ... (n.) A sacred fruit. Also known as: 16 | ... \begin{itemize} 17 | ... \item red lemon 18 | ... \item life 19 | ... \end{itemize} 20 | ... Here is the prevalence of each synonym, in Table \ref{table:synonyms}. 21 | ... \begin{tabular}{c c}\label{table:synonyms} 22 | ... red lemon & uncommon \\ \n 23 | ... life & common 24 | ... \end{tabular} 25 | ... \end{document} 26 | ... """ 27 | 28 | Call :code:`TexSoup` on this string to re-represent this document as a 29 | nested data structure:: 30 | 31 | >>> from TexSoup import TexSoup 32 | >>> soup = TexSoup(tex_doc) 33 | >>> soup 34 | \begin{document} 35 | \section{Hello \textit{world}.} 36 | \subsection{Watermelon} 37 | (n.) A sacred fruit. Also known as: 38 | \begin{itemize} 39 | \item red lemon 40 | \item life 41 | \end{itemize} 42 | Here is the prevalence of each synonym, in Table \ref{table:synonyms}. 43 | \begin{tabular}{c c}\label{table:synonyms} 44 | red lemon & uncommon \\ \n 45 | life & common 46 | \end{tabular} 47 | \end{document} 48 | 49 | Here are a few ways to navigate the TexSoup data structure:: 50 | 51 | >>> soup.section 52 | \section{Hello \textit{world}.} 53 | >>> soup.section.name 54 | 'section' 55 | >>> soup.section.string 56 | 'Hello \\textit{world}.' 57 | >>> soup.section.parent.name 58 | 'document' 59 | >>> soup.tabular 60 | \begin{tabular}{c c}\label{table:synonyms} 61 | red lemon & uncommon \\ \n 62 | life & common 63 | \end{tabular} 64 | >>> soup.tabular.args[0] 65 | 'c c' 66 | >>> soup.item 67 | \item red lemon 68 | 69 | >>> list(soup.find_all('item')) 70 | [\item red lemon 71 | , \item life 72 | ] 73 | 74 | One task may be to find all references.
To do this, simply search for 75 | ``\ref{