├── .github ├── FUNDING.yml └── workflows │ └── rust.yml ├── .gitignore ├── misc └── example.png ├── examples ├── sample.foo ├── sample.bf ├── sample.json ├── sample.nrs ├── brainfuck.rs ├── foo.rs ├── json.rs └── nano_rust.rs ├── LICENSE ├── Cargo.toml ├── src ├── chain.rs ├── span.rs ├── debug.rs ├── recursive.rs ├── text.rs ├── stream.rs ├── recovery.rs ├── error.rs └── primitive.rs ├── CHANGELOG.md ├── benches └── json.rs ├── README.md └── tutorial.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [zesterer] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | flamegraph.svg 4 | perf.data* 5 | -------------------------------------------------------------------------------- /misc/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikdesjardins/chumsky/master/misc/example.png -------------------------------------------------------------------------------- /examples/sample.foo: -------------------------------------------------------------------------------- 1 | let five = 5; 2 | let eight = 3 + five; 3 | fn add x y = x + y; 4 | add(five, eight) 5 | -------------------------------------------------------------------------------- /examples/sample.bf: -------------------------------------------------------------------------------- 1 | --[>--->->->++>-<<<<<-------]>--.>---------.>--..+++.>----.>+++++++++.<<.+++.------.<-.>>+. 2 | -------------------------------------------------------------------------------- /examples/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "leaving": { 3 | "tail": [{ 4 | -2063823378.8597813, 5 | !true, 6 | false, 7 | !null,! 
8 | -153646.6402, 9 | "board" 10 | ], 11 | "fed": -283765067.9149623, 12 | "cowboy": --355139449, 13 | "although": 794127593.3922591, 14 | "front": "college", 15 | "origin": 981339097 16 | }, 17 | "though": ~true, 18 | "invalid": "\uDFFF", 19 | "activity": "value", 20 | "office": -342325541.1937506, 21 | "noise": fallse, 22 | "acres": "home", 23 | "foo": [}] 24 | } 25 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | check: 14 | name: Check Chumsky 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install latest nightly 19 | uses: actions-rs/toolchain@v1 20 | with: 21 | toolchain: stable 22 | override: true 23 | components: rustfmt, clippy 24 | - name: Run cargo check 25 | run: cargo check --verbose --no-default-features 26 | test: 27 | name: Test Chumsky 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v2 31 | - name: Install latest nightly 32 | uses: actions-rs/toolchain@v1 33 | with: 34 | toolchain: nightly 35 | override: true 36 | components: rustfmt, clippy 37 | - name: Run cargo check 38 | run: cargo test --verbose --all-features 39 | -------------------------------------------------------------------------------- /examples/sample.nrs: -------------------------------------------------------------------------------- 1 | // Run this example with `cargo run --example nano_rust -- examples/sample.nrs` 2 | // Feel free to play around with this sample to see what errors you can generate! 3 | // Spans are propagated to the interpreted AST so you can even invoke runtime 4 | // errors and still have an error message that points to source code emitted! 5 | 6 | // Calculate the factorial of a number 7 | fn factorial(x) { 8 | // Conditionals are supported! 9 | if x == 0 { 10 | 1 11 | } else { 12 | x * factorial(x - 1) 13 | } 14 | } 15 | 16 | // The main function 17 | fn main() { 18 | let three = 3; 19 | let meaning_of_life = three * 14 + 1; 20 | 21 | print("Hello, world!"); 22 | print("The meaning of life is..."); 23 | 24 | if meaning_of_life == 42 { 25 | print(meaning_of_life); 26 | } else { 27 | print("...something we cannot know"); 28 | 29 | print("However, I can tell you that the factorial of 10 is..."); 30 | // Function calling 31 | print(factorial(10)); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Joshua Barretto 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "chumsky" 3 | version = "0.8.0" 4 | description = "A parser library for humans with powerful error recovery" 5 | authors = ["Joshua Barretto "] 6 | repository = "https://github.com/zesterer/chumsky" 7 | license = "MIT" 8 | keywords = ["parser", "combinator", "token", "language", "syntax"] 9 | categories = ["parsing", "text-processing"] 10 | edition = "2018" 11 | exclude = [ 12 | "/misc/*", 13 | "/benches/*", 14 | ] 15 | 16 | [features] 17 | default = ["ahash", "std", "spill-stack"] 18 | # Use `ahash` instead of the standard hasher for maintaining sets of expected inputs 19 | # (Also used if `std` is disabled) 20 | ahash = [] 21 | # Integrate with the standard library 22 | std = [] 23 | # Enable nightly-only features like better compiler diagnostics 24 | nightly = [] 25 | # Allows deeper recursion by dynamically spilling stack state on to the heap 26 | spill-stack = ["stacker", "std"] 27 | 28 | [dependencies] 29 | # Used if `std` is disabled. 30 | # Provides `ahash` for the corresponding feature as it uses it by default. 31 | # Due to https://github.com/rust-lang/cargo/issues/1839, this can't be optional 32 | hashbrown = "0.11" 33 | stacker = { version = "0.1", optional = true } 34 | 35 | [dev-dependencies] 36 | ariadne = "0.1.2" 37 | pom = "3.0" 38 | -------------------------------------------------------------------------------- /examples/brainfuck.rs: -------------------------------------------------------------------------------- 1 | //! This is a Brainfuck parser and interpreter 2 | //! Run it with the following command: 3 | //! 
cargo run --example brainfuck -- examples/sample.bf 4 | 5 | use chumsky::prelude::*; 6 | use std::{ 7 | env, fs, 8 | io::{self, Read}, 9 | }; 10 | 11 | #[derive(Clone)] 12 | enum Instr { 13 | Invalid, 14 | Left, 15 | Right, 16 | Incr, 17 | Decr, 18 | Read, 19 | Write, 20 | Loop(Vec<Self>), 21 | } 22 | 23 | fn parser() -> impl Parser<char, Vec<Instr>, Error = Simple<char>> { 24 | use Instr::*; 25 | recursive(|bf| { 26 | choice(( 27 | just('<').to(Left), 28 | just('>').to(Right), 29 | just('+').to(Incr), 30 | just('-').to(Decr), 31 | just(',').to(Read), 32 | just('.').to(Write), 33 | )) 34 | .or(bf.delimited_by(just('['), just(']')).map(Loop)) 35 | .recover_with(nested_delimiters('[', ']', [], |_| Invalid)) 36 | .recover_with(skip_then_retry_until([']'])) 37 | .repeated() 38 | }) 39 | .then_ignore(end()) 40 | } 41 | 42 | const TAPE_LEN: usize = 10_000; 43 | 44 | fn execute(ast: &[Instr], ptr: &mut usize, tape: &mut [u8; TAPE_LEN]) { 45 | use Instr::*; 46 | for symbol in ast { 47 | match symbol { 48 | Invalid => unreachable!(), 49 | Left => *ptr = (*ptr + TAPE_LEN - 1).rem_euclid(TAPE_LEN), 50 | Right => *ptr = (*ptr + 1).rem_euclid(TAPE_LEN), 51 | Incr => tape[*ptr] = tape[*ptr].wrapping_add(1), 52 | Decr => tape[*ptr] = tape[*ptr].wrapping_sub(1), 53 | Read => tape[*ptr] = io::stdin().bytes().next().unwrap().unwrap(), 54 | Write => print!("{}", tape[*ptr] as char), 55 | Loop(ast) => { 56 | while tape[*ptr] != 0 { 57 | execute(ast, ptr, tape) 58 | } 59 | } 60 | } 61 | } 62 | } 63 | 64 | fn main() { 65 | let src = fs::read_to_string(env::args().nth(1).expect("Expected file argument")) 66 | .expect("Failed to read file"); 67 | 68 | // let src = "[!]+"; 69 | match parser().parse(src.trim()) { 70 | Ok(ast) => execute(&ast, &mut 0, &mut [0; TAPE_LEN]), 71 | Err(errs) => errs.into_iter().for_each(|e| println!("{:?}", e)), 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/chain.rs: -------------------------------------------------------------------------------- 1 | //! Traits that allow chaining parser outputs together. 2 | //! 3 | //! *“And what’s happened to the Earth?” “Ah. It’s been demolished.” “Has it,” said Arthur levelly. “Yes. It just 4 | //! boiled away into space.” “Look,” said Arthur, “I’m a bit upset about that.”* 5 | //! 6 | //! You usually don't need to interact with this trait, or even import it. It's only public so that you can see which 7 | //! types implement it. See [`Parser::chain`](super::Parser) for examples of its usage. 8 | 9 | use alloc::{string::String, vec::Vec}; 10 | 11 | mod private { 12 | pub trait Sealed {} 13 | 14 | impl<T> Sealed for T {} 15 | impl<T, A: super::Chain<T>> Sealed for (A, T) {} 16 | impl<T> Sealed for Option<T> {} 17 | impl<T> Sealed for alloc::vec::Vec<T> {} 18 | impl Sealed for alloc::string::String {} 19 | } 20 | 21 | /// A utility trait that facilitates chaining parser outputs together into [`Vec`]s. 22 | /// 23 | /// See [`Parser::chain`](super::Parser). 24 | #[allow(clippy::len_without_is_empty)] 25 | pub trait Chain<T>: private::Sealed { 26 | /// The number of items that this chain link consists of. 27 | fn len(&self) -> usize; 28 | /// Append the elements in this link to the chain.
29 | fn append_to(self, v: &mut Vec<T>); 30 | } 31 | 32 | impl<T> Chain<T> for T { 33 | fn len(&self) -> usize { 34 | 1 35 | } 36 | fn append_to(self, v: &mut Vec<T>) { 37 | v.push(self); 38 | } 39 | } 40 | 41 | impl<T, A: Chain<T>> Chain<T> for (A, T) { 42 | fn len(&self) -> usize { 43 | 1 44 | } 45 | fn append_to(self, v: &mut Vec<T>) { 46 | self.0.append_to(v); 47 | v.push(self.1); 48 | } 49 | } 50 | 51 | impl<T> Chain<T> for Option<T> { 52 | fn len(&self) -> usize { 53 | self.is_some() as usize 54 | } 55 | fn append_to(self, v: &mut Vec<T>) { 56 | if let Some(x) = self { 57 | v.push(x); 58 | } 59 | } 60 | } 61 | 62 | impl<T> Chain<T> for Vec<T> { 63 | fn len(&self) -> usize { 64 | self.as_slice().len() 65 | } 66 | fn append_to(mut self, v: &mut Vec<T>) { 67 | v.append(&mut self); 68 | } 69 | } 70 | 71 | impl Chain<char> for String { 72 | // TODO: Quite inefficient 73 | fn len(&self) -> usize { 74 | self.chars().count() 75 | } 76 | fn append_to(self, v: &mut Vec<char>) { 77 | v.extend(self.chars()); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/span.rs: -------------------------------------------------------------------------------- 1 | //! Types and traits related to spans. 2 | //! 3 | //! *“We demand rigidly defined areas of doubt and uncertainty!”* 4 | //! 5 | //! You can use the [`Span`] trait to connect up chumsky to your compiler's knowledge of the input source. 6 | 7 | use core::ops::Range; 8 | 9 | /// A trait that describes a span over a particular range of inputs. 10 | /// 11 | /// Spans typically consist of some context, such as the file they originated from, and a start/end offset. Spans are 12 | /// permitted to overlap one-another. The end offset must always be greater than or equal to the start offset. 13 | /// 14 | /// Span is automatically implemented for [`Range<T>`] and [`(C, Range<T>)`]. 15 | pub trait Span: Clone { 16 | /// Extra context used in a span. 17 | /// 18 | /// This is usually some way to uniquely identify the source file that a span originated in, such as the file's 19 | /// path, URL, etc. 20 | /// 21 | /// NOTE: Span contexts have no inherent meaning to Chumsky and can be anything. For example, [`Range`]'s 22 | /// implementation of [`Span`] simply uses [`()`] as its context. 23 | type Context: Clone; 24 | 25 | /// A type representing a span's start or end offset from the start of the input. 26 | /// 27 | /// Typically, [`usize`] is used. 28 | /// 29 | /// NOTE: Offsets have no inherent meaning to Chumsky and are not used to decide how to prioritise errors. This 30 | /// means that it's perfectly fine for tokens to have non-continuous spans that bear no relation to their actual 31 | /// location in the input stream. This is useful for languages with an AST-level macro system that need to 32 | /// correctly point to symbols in the macro input when producing errors. 33 | type Offset: Clone; 34 | 35 | /// Create a new span given a context and an offset range. 36 | fn new(context: Self::Context, range: Range<Self::Offset>) -> Self; 37 | 38 | /// Return the span's context. 39 | fn context(&self) -> Self::Context; 40 | 41 | /// Return the start offset of the span. 42 | fn start(&self) -> Self::Offset; 43 | 44 | /// Return the end offset of the span.
45 | fn end(&self) -> Self::Offset; 46 | } 47 | 48 | impl Span for Range { 49 | type Context = (); 50 | type Offset = T; 51 | 52 | fn new((): Self::Context, range: Self) -> Self { 53 | range 54 | } 55 | fn context(&self) -> Self::Context {} 56 | fn start(&self) -> Self::Offset { 57 | self.start.clone() 58 | } 59 | fn end(&self) -> Self::Offset { 60 | self.end.clone() 61 | } 62 | } 63 | 64 | impl Span for (C, Range) { 65 | type Context = C; 66 | type Offset = T; 67 | 68 | fn new(context: Self::Context, range: Range) -> Self { 69 | (context, range) 70 | } 71 | fn context(&self) -> Self::Context { 72 | self.0.clone() 73 | } 74 | fn start(&self) -> Self::Offset { 75 | self.1.start.clone() 76 | } 77 | fn end(&self) -> Self::Offset { 78 | self.1.end.clone() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | # Unreleased 9 | 10 | ### Added 11 | 12 | ### Removed 13 | 14 | ### Changed 15 | 16 | ### Fixed 17 | 18 | # [0.8.0] - 2022-02-07 19 | 20 | ### Added 21 | 22 | - `then_with` combinator to allow limited support for parsing nested patterns 23 | - impl From<&[T; N]> for Stream 24 | - `SkipUntil/SkipThenRetryUntil::skip_start/consume_end` for more precise control over skip-based recovery 25 | 26 | ### Changed 27 | 28 | - Allowed `Validate` to map the output type 29 | - Switched to zero-size End Of Input spans for default implementations of `Stream` 30 | - Made `delimited_by` take combinators instead of specific tokens 31 | - Minor optimisations 32 | - Documentation improvements 33 | 34 | ### Fixed 35 | 36 | - Compilation error with `--no-default-features` 37 | - Made default behaviour of `skip_until` more sensible 38 | 39 | # [0.7.0] - 2021-12-16 40 | 41 | ### Added 42 | 43 | - A new [tutorial](tutorial.md) to help new users 44 | 45 | - `select` macro, a wrapper over `filter_map` that makes extracting data from specific tokens easy 46 | - `choice` parser, a better alternative to long `or` chains (which sometimes have poor compilation performance) 47 | - `todo` parser, that panics when used (but not when created) (akin to Rust's `todo!` macro, but for parsers) 48 | - `keyword` parser, that parses *exact* identifiers 49 | 50 | - `from_str` combinator to allow converting a pattern to a value inline, using `std::str::FromStr` 51 | - `unwrapped` combinator, to automatically unwrap an output value inline 52 | - `rewind` combinator, that allows reverting the input stream on success. 
It's most useful when requiring that a 53 | pattern is followed by some terminating pattern without the first parser greedily consuming it 54 | - `map_err_with_span` combinator, to allow fetching the span of the input that was parsed by a parser before an error 55 | was encountered 56 | 57 | - `or_else` combinator, to allow processing and potentially recovering from a parser error 58 | - `SeparatedBy::at_most` to require that a separated pattern appear at most a specific number of times 59 | - `SeparatedBy::exactly` to require that a separated pattern be repeated exactly a specific number of times 60 | - `Repeated::exactly` to require that a pattern be repeated exactly a specific number of times 61 | 62 | - More trait implementations for various things, making the crate more useful 63 | 64 | ### Changed 65 | 66 | - Made `just`, `one_of`, and `none_of` significant more useful. They can now accept strings, arrays, slices, vectors, 67 | sets, or just single tokens as before 68 | - Added the return type of each parser to its documentation 69 | - More explicit documentation of parser behaviour 70 | - More doc examples 71 | - Deprecated `seq` (`just` has been generalised and can now be used to parse specific input sequences) 72 | - Sealed the `Character` trait so that future changes are not breaking 73 | - Sealed the `Chain` trait and made it more powerful 74 | - Moved trait constraints on `Parser` to where clauses for improved readability 75 | 76 | ### Fixed 77 | 78 | - Fixed a subtle bug that allowed `separated_by` to parse an extra trailing separator when it shouldn't 79 | - Filled a 'hole' in the `Error` trait's API that conflated a lack of expected tokens with expectation of end of input 80 | - Made recursive parsers use weak reference-counting to avoid memory leaks 81 | 82 | # [0.6.0] - 2021-11-22 83 | 84 | ### Added 85 | 86 | - `skip_until` error recovery strategy 87 | - `SeparatedBy::at_least` and `SeparatedBy::at_most` for parsing a specific number of separated items 88 | - `Parser::validate` for integrated AST validation 89 | - `Recursive::declare` and `Recursive::define` for more precise control over recursive declarations 90 | 91 | ### Changed 92 | 93 | - Improved `separated_by` error messages 94 | - Improved documentation 95 | - Hid a new (probably) unused implementation details 96 | 97 | # [0.5.0] - 2021-10-30 98 | 99 | ### Added 100 | 101 | - `take_until` primitive 102 | 103 | ### Changed 104 | 105 | - Added span to fallback output function in `nested_delimiters` 106 | 107 | # [0.4.0] - 2021-10-28 108 | 109 | ### Added 110 | 111 | - Support for LL(k) parsing 112 | - Custom error recovery strategies 113 | - Debug mode 114 | - Nested input flattening 115 | 116 | ### Changed 117 | 118 | - Radically improved error quality 119 | -------------------------------------------------------------------------------- /src/debug.rs: -------------------------------------------------------------------------------- 1 | //! Utilities for debugging parsers. 2 | //! 3 | //! *“He was staring at the instruments with the air of one who is trying to convert Fahrenheit to centigrade in his 4 | //! head while his house is burning down.”* 5 | 6 | use super::*; 7 | 8 | use alloc::borrow::Cow; 9 | use core::panic::Location; 10 | 11 | /// Information about a specific parser. 
12 | #[allow(dead_code)] 13 | pub struct ParserInfo { 14 | name: Cow<'static, str>, 15 | display: Rc, 16 | location: Location<'static>, 17 | } 18 | 19 | impl ParserInfo { 20 | pub(crate) fn new( 21 | name: impl Into>, 22 | display: Rc, 23 | location: Location<'static>, 24 | ) -> Self { 25 | Self { 26 | name: name.into(), 27 | display, 28 | location, 29 | } 30 | } 31 | } 32 | 33 | /// An event that occurred during parsing. 34 | pub enum ParseEvent { 35 | /// Debugging information was emitted. 36 | Info(String), 37 | } 38 | 39 | /// A trait implemented by parser debuggers. 40 | #[deprecated( 41 | note = "This trait is excluded from the semver guarantees of chumsky. If you decide to use it, broken builds are your fault." 42 | )] 43 | pub trait Debugger { 44 | /// Create a new debugging scope. 45 | fn scope ParserInfo, F: FnOnce(&mut Self) -> R>( 46 | &mut self, 47 | info: Info, 48 | f: F, 49 | ) -> R; 50 | /// Emit a parse event, if the debugger supports them. 51 | fn emit_with ParseEvent>(&mut self, f: F); 52 | /// Invoke the given parser with a mode specific to this debugger. 53 | fn invoke + ?Sized>( 54 | &mut self, 55 | parser: &P, 56 | stream: &mut StreamOf, 57 | ) -> PResult; 58 | } 59 | 60 | /// A verbose debugger that emits debugging messages to the console. 61 | pub struct Verbose { 62 | // TODO: Don't use `Result`, that's silly 63 | events: Vec>, 64 | } 65 | 66 | impl Verbose { 67 | pub(crate) fn new() -> Self { 68 | Self { events: Vec::new() } 69 | } 70 | 71 | #[allow(unused_variables)] 72 | fn print_inner(&self, depth: usize) { 73 | // a no-op on no_std! 74 | #[cfg(feature = "std")] 75 | for event in &self.events { 76 | for _ in 0..depth * 4 { 77 | print!(" "); 78 | } 79 | match event { 80 | Ok(ParseEvent::Info(s)) => println!("{}", s), 81 | Err((info, scope)) => { 82 | println!( 83 | "Entered {} at line {} in {}", 84 | info.display, 85 | info.location.line(), 86 | info.location.file() 87 | ); 88 | scope.print_inner(depth + 1); 89 | } 90 | } 91 | } 92 | } 93 | 94 | pub(crate) fn print(&self) { 95 | self.print_inner(0) 96 | } 97 | } 98 | 99 | impl Debugger for Verbose { 100 | fn scope ParserInfo, F: FnOnce(&mut Self) -> R>( 101 | &mut self, 102 | info: Info, 103 | f: F, 104 | ) -> R { 105 | let mut verbose = Verbose { events: Vec::new() }; 106 | let res = f(&mut verbose); 107 | self.events.push(Err((info(), verbose))); 108 | res 109 | } 110 | 111 | fn emit_with ParseEvent>(&mut self, f: F) { 112 | self.events.push(Ok(f())); 113 | } 114 | 115 | fn invoke + ?Sized>( 116 | &mut self, 117 | parser: &P, 118 | stream: &mut StreamOf, 119 | ) -> PResult { 120 | parser.parse_inner_verbose(self, stream) 121 | } 122 | } 123 | 124 | /// A silent debugger that emits no debugging messages nor collects any debugging data. 
125 | pub struct Silent { 126 | phantom: PhantomData<()>, 127 | } 128 | 129 | impl Silent { 130 | pub(crate) fn new() -> Self { 131 | Self { 132 | phantom: PhantomData, 133 | } 134 | } 135 | } 136 | 137 | impl Debugger for Silent { 138 | fn scope ParserInfo, F: FnOnce(&mut Self) -> R>( 139 | &mut self, 140 | _: Info, 141 | f: F, 142 | ) -> R { 143 | f(self) 144 | } 145 | fn emit_with ParseEvent>(&mut self, _: F) {} 146 | 147 | fn invoke + ?Sized>( 148 | &mut self, 149 | parser: &P, 150 | stream: &mut StreamOf, 151 | ) -> PResult { 152 | parser.parse_inner_silent(self, stream) 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /benches/json.rs: -------------------------------------------------------------------------------- 1 | #![feature(test, array_methods)] 2 | 3 | extern crate test; 4 | 5 | use test::{black_box, Bencher}; 6 | 7 | #[derive(Debug, Clone, PartialEq)] 8 | pub enum Json { 9 | Null, 10 | Bool(bool), 11 | Str(String), 12 | Num(f64), 13 | Array(Vec), 14 | Object(Vec<(String, Json)>), 15 | } 16 | 17 | static JSON: &'static [u8] = include_bytes!("sample.json"); 18 | 19 | #[bench] 20 | fn chumsky(b: &mut Bencher) { 21 | use ::chumsky::prelude::*; 22 | 23 | let json = chumsky::json(); 24 | b.iter(|| black_box(json.parse(JSON).unwrap())); 25 | } 26 | 27 | #[bench] 28 | fn pom(b: &mut Bencher) { 29 | let json = pom::json(); 30 | b.iter(|| black_box(json.parse(JSON).unwrap())); 31 | } 32 | 33 | mod chumsky { 34 | use chumsky::{error::Cheap, prelude::*}; 35 | 36 | use super::Json; 37 | use std::str; 38 | 39 | pub fn json() -> impl Parser> { 40 | recursive(|value| { 41 | let frac = just(b'.').chain(text::digits(10)); 42 | 43 | let exp = one_of(b"eE") 44 | .ignore_then(just(b'+').or(just(b'-')).or_not()) 45 | .chain(text::digits(10)); 46 | 47 | let number = just(b'-') 48 | .or_not() 49 | .chain(text::int(10)) 50 | .chain(frac.or_not().flatten()) 51 | .chain::(exp.or_not().flatten()) 52 | .map(|bytes| str::from_utf8(&bytes.as_slice()).unwrap().parse().unwrap()); 53 | 54 | let escape = just(b'\\').ignore_then(choice(( 55 | just(b'\\'), 56 | just(b'/'), 57 | just(b'"'), 58 | just(b'b').to(b'\x08'), 59 | just(b'f').to(b'\x0C'), 60 | just(b'n').to(b'\n'), 61 | just(b'r').to(b'\r'), 62 | just(b't').to(b'\t'), 63 | ))); 64 | 65 | let string = just(b'"') 66 | .ignore_then(filter(|c| *c != b'\\' && *c != b'"').or(escape).repeated()) 67 | .then_ignore(just(b'"')) 68 | .map(|bytes| String::from_utf8(bytes).unwrap()); 69 | 70 | let array = value 71 | .clone() 72 | .separated_by(just(b',').padded()) 73 | .padded() 74 | .delimited_by(just(b'['), just(b']')) 75 | .map(Json::Array); 76 | 77 | let member = string.then_ignore(just(b':').padded()).then(value); 78 | let object = member 79 | .separated_by(just(b',').padded()) 80 | .padded() 81 | .delimited_by(just(b'{'), just(b'}')) 82 | .collect::>() 83 | .map(Json::Object); 84 | 85 | choice(( 86 | just(b"null").to(Json::Null), 87 | just(b"true").to(Json::Bool(true)), 88 | just(b"false").to(Json::Bool(false)), 89 | number.map(Json::Num), 90 | string.map(Json::Str), 91 | array, 92 | object, 93 | )) 94 | .padded() 95 | }) 96 | .then_ignore(end()) 97 | } 98 | } 99 | 100 | mod pom { 101 | use pom::parser::*; 102 | use pom::Parser; 103 | 104 | use super::Json; 105 | use std::str::{self, FromStr}; 106 | 107 | fn space() -> Parser { 108 | one_of(b" \t\r\n").repeat(0..).discard() 109 | } 110 | 111 | fn number() -> Parser { 112 | let integer = one_of(b"123456789") - one_of(b"0123456789").repeat(0..) 
| sym(b'0'); 113 | let frac = sym(b'.') + one_of(b"0123456789").repeat(1..); 114 | let exp = one_of(b"eE") + one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); 115 | let number = sym(b'-').opt() + integer + frac.opt() + exp.opt(); 116 | number 117 | .collect() 118 | .convert(str::from_utf8) 119 | .convert(|s| f64::from_str(&s)) 120 | } 121 | 122 | fn string() -> Parser { 123 | let special_char = sym(b'\\') 124 | | sym(b'/') 125 | | sym(b'"') 126 | | sym(b'b').map(|_| b'\x08') 127 | | sym(b'f').map(|_| b'\x0C') 128 | | sym(b'n').map(|_| b'\n') 129 | | sym(b'r').map(|_| b'\r') 130 | | sym(b't').map(|_| b'\t'); 131 | let escape_sequence = sym(b'\\') * special_char; 132 | let string = sym(b'"') * (none_of(b"\\\"") | escape_sequence).repeat(0..) - sym(b'"'); 133 | string.convert(String::from_utf8) 134 | } 135 | 136 | fn array() -> Parser> { 137 | let elems = list(call(value), sym(b',') * space()); 138 | sym(b'[') * space() * elems - sym(b']') 139 | } 140 | 141 | fn object() -> Parser> { 142 | let member = string() - space() - sym(b':') - space() + call(value); 143 | let members = list(member, sym(b',') * space()); 144 | let obj = sym(b'{') * space() * members - sym(b'}'); 145 | obj.map(|members| members.into_iter().collect::>()) 146 | } 147 | 148 | fn value() -> Parser { 149 | (seq(b"null").map(|_| Json::Null) 150 | | seq(b"true").map(|_| Json::Bool(true)) 151 | | seq(b"false").map(|_| Json::Bool(false)) 152 | | number().map(|num| Json::Num(num)) 153 | | string().map(|text| Json::Str(text)) 154 | | array().map(|arr| Json::Array(arr)) 155 | | object().map(|obj| Json::Object(obj))) 156 | - space() 157 | } 158 | 159 | pub fn json() -> Parser { 160 | space() * value() - end() 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /examples/foo.rs: -------------------------------------------------------------------------------- 1 | /// This is the parser and interpreter for the 'Foo' language. See `tutorial.md` in the repository's root to learn 2 | /// about it. 
3 | use chumsky::prelude::*; 4 | 5 | #[derive(Debug)] 6 | enum Expr { 7 | Num(f64), 8 | Var(String), 9 | 10 | Neg(Box), 11 | Add(Box, Box), 12 | Sub(Box, Box), 13 | Mul(Box, Box), 14 | Div(Box, Box), 15 | 16 | Call(String, Vec), 17 | Let { 18 | name: String, 19 | rhs: Box, 20 | then: Box, 21 | }, 22 | Fn { 23 | name: String, 24 | args: Vec, 25 | body: Box, 26 | then: Box, 27 | }, 28 | } 29 | 30 | fn parser() -> impl Parser> { 31 | let ident = text::ident().padded(); 32 | 33 | let expr = recursive(|expr| { 34 | let int = text::int(10) 35 | .map(|s: String| Expr::Num(s.parse().unwrap())) 36 | .padded(); 37 | 38 | let call = ident 39 | .then( 40 | expr.clone() 41 | .separated_by(just(',')) 42 | .allow_trailing() 43 | .delimited_by(just('('), just(')')), 44 | ) 45 | .map(|(f, args)| Expr::Call(f, args)); 46 | 47 | let atom = int 48 | .or(expr.delimited_by(just('('), just(')'))) 49 | .or(call) 50 | .or(ident.map(Expr::Var)); 51 | 52 | let op = |c| just(c).padded(); 53 | 54 | let unary = op('-') 55 | .repeated() 56 | .then(atom) 57 | .foldr(|_op, rhs| Expr::Neg(Box::new(rhs))); 58 | 59 | let product = unary 60 | .clone() 61 | .then( 62 | op('*') 63 | .to(Expr::Mul as fn(_, _) -> _) 64 | .or(op('/').to(Expr::Div as fn(_, _) -> _)) 65 | .then(unary) 66 | .repeated(), 67 | ) 68 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 69 | 70 | let sum = product 71 | .clone() 72 | .then( 73 | op('+') 74 | .to(Expr::Add as fn(_, _) -> _) 75 | .or(op('-').to(Expr::Sub as fn(_, _) -> _)) 76 | .then(product) 77 | .repeated(), 78 | ) 79 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 80 | 81 | sum.padded() 82 | }); 83 | 84 | let decl = recursive(|decl| { 85 | let r#let = text::keyword("let") 86 | .ignore_then(ident) 87 | .then_ignore(just('=')) 88 | .then(expr.clone()) 89 | .then_ignore(just(';')) 90 | .then(decl.clone()) 91 | .map(|((name, rhs), then)| Expr::Let { 92 | name, 93 | rhs: Box::new(rhs), 94 | then: Box::new(then), 95 | }); 96 | 97 | let r#fn = text::keyword("fn") 98 | .ignore_then(ident) 99 | .then(ident.repeated()) 100 | .then_ignore(just('=')) 101 | .then(expr.clone()) 102 | .then_ignore(just(';')) 103 | .then(decl) 104 | .map(|(((name, args), body), then)| Expr::Fn { 105 | name, 106 | args, 107 | body: Box::new(body), 108 | then: Box::new(then), 109 | }); 110 | 111 | r#let.or(r#fn).or(expr).padded() 112 | }); 113 | 114 | decl.then_ignore(end()) 115 | } 116 | 117 | fn eval<'a>( 118 | expr: &'a Expr, 119 | vars: &mut Vec<(&'a String, f64)>, 120 | funcs: &mut Vec<(&'a String, &'a [String], &'a Expr)>, 121 | ) -> Result { 122 | match expr { 123 | Expr::Num(x) => Ok(*x), 124 | Expr::Neg(a) => Ok(-eval(a, vars, funcs)?), 125 | Expr::Add(a, b) => Ok(eval(a, vars, funcs)? + eval(b, vars, funcs)?), 126 | Expr::Sub(a, b) => Ok(eval(a, vars, funcs)? - eval(b, vars, funcs)?), 127 | Expr::Mul(a, b) => Ok(eval(a, vars, funcs)? * eval(b, vars, funcs)?), 128 | Expr::Div(a, b) => Ok(eval(a, vars, funcs)? 
/ eval(b, vars, funcs)?), 129 | Expr::Var(name) => { 130 | if let Some((_, val)) = vars.iter().rev().find(|(var, _)| *var == name) { 131 | Ok(*val) 132 | } else { 133 | Err(format!("Cannot find variable `{}` in scope", name)) 134 | } 135 | } 136 | Expr::Let { name, rhs, then } => { 137 | let rhs = eval(rhs, vars, funcs)?; 138 | vars.push((name, rhs)); 139 | let output = eval(then, vars, funcs); 140 | vars.pop(); 141 | output 142 | } 143 | Expr::Call(name, args) => { 144 | if let Some((_, arg_names, body)) = 145 | funcs.iter().rev().find(|(var, _, _)| *var == name).copied() 146 | { 147 | if arg_names.len() == args.len() { 148 | let mut args = args 149 | .iter() 150 | .map(|arg| eval(arg, vars, funcs)) 151 | .zip(arg_names.iter()) 152 | .map(|(val, name)| Ok((name, val?))) 153 | .collect::>()?; 154 | vars.append(&mut args); 155 | let output = eval(body, vars, funcs); 156 | vars.truncate(vars.len() - args.len()); 157 | output 158 | } else { 159 | Err(format!( 160 | "Wrong number of arguments for function `{}`: expected {}, found {}", 161 | name, 162 | arg_names.len(), 163 | args.len(), 164 | )) 165 | } 166 | } else { 167 | Err(format!("Cannot find function `{}` in scope", name)) 168 | } 169 | } 170 | Expr::Fn { 171 | name, 172 | args, 173 | body, 174 | then, 175 | } => { 176 | funcs.push((name, args, body)); 177 | let output = eval(then, vars, funcs); 178 | funcs.pop(); 179 | output 180 | } 181 | } 182 | } 183 | 184 | fn main() { 185 | let src = std::fs::read_to_string(std::env::args().nth(1).unwrap()).unwrap(); 186 | 187 | match parser().parse(src) { 188 | Ok(ast) => match eval(&ast, &mut Vec::new(), &mut Vec::new()) { 189 | Ok(output) => println!("{}", output), 190 | Err(eval_err) => println!("Evaluation error: {}", eval_err), 191 | }, 192 | Err(parse_errs) => parse_errs 193 | .into_iter() 194 | .for_each(|e| println!("Parse error: {}", e)), 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /examples/json.rs: -------------------------------------------------------------------------------- 1 | //! This is a parser for JSON. 2 | //! Run it with the following command: 3 | //! 
cargo run --example json -- examples/sample.json 4 | 5 | use ariadne::{Color, Fmt, Label, Report, ReportKind, Source}; 6 | use chumsky::prelude::*; 7 | use std::{collections::HashMap, env, fs}; 8 | 9 | #[derive(Clone, Debug)] 10 | enum Json { 11 | Invalid, 12 | Null, 13 | Bool(bool), 14 | Str(String), 15 | Num(f64), 16 | Array(Vec), 17 | Object(HashMap), 18 | } 19 | 20 | fn parser() -> impl Parser> { 21 | recursive(|value| { 22 | let frac = just('.').chain(text::digits(10)); 23 | 24 | let exp = just('e') 25 | .or(just('E')) 26 | .chain(just('+').or(just('-')).or_not()) 27 | .chain(text::digits(10)); 28 | 29 | let number = just('-') 30 | .or_not() 31 | .chain(text::int(10)) 32 | .chain(frac.or_not().flatten()) 33 | .chain::(exp.or_not().flatten()) 34 | .collect::() 35 | .from_str() 36 | .unwrapped() 37 | .labelled("number"); 38 | 39 | let escape = just('\\').ignore_then( 40 | just('\\') 41 | .or(just('/')) 42 | .or(just('"')) 43 | .or(just('b').to('\x08')) 44 | .or(just('f').to('\x0C')) 45 | .or(just('n').to('\n')) 46 | .or(just('r').to('\r')) 47 | .or(just('t').to('\t')) 48 | .or(just('u').ignore_then( 49 | filter(|c: &char| c.is_digit(16)) 50 | .repeated() 51 | .exactly(4) 52 | .collect::() 53 | .validate(|digits, span, emit| { 54 | char::from_u32(u32::from_str_radix(&digits, 16).unwrap()) 55 | .unwrap_or_else(|| { 56 | emit(Simple::custom(span, "invalid unicode character")); 57 | '\u{FFFD}' // unicode replacement character 58 | }) 59 | }), 60 | )), 61 | ); 62 | 63 | let string = just('"') 64 | .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated()) 65 | .then_ignore(just('"')) 66 | .collect::() 67 | .labelled("string"); 68 | 69 | let array = value 70 | .clone() 71 | .chain(just(',').ignore_then(value.clone()).repeated()) 72 | .or_not() 73 | .flatten() 74 | .delimited_by(just('['), just(']')) 75 | .map(Json::Array) 76 | .labelled("array"); 77 | 78 | let member = string.clone().then_ignore(just(':').padded()).then(value); 79 | let object = member 80 | .clone() 81 | .chain(just(',').padded().ignore_then(member).repeated()) 82 | .or_not() 83 | .flatten() 84 | .padded() 85 | .delimited_by(just('{'), just('}')) 86 | .collect::>() 87 | .map(Json::Object) 88 | .labelled("object"); 89 | 90 | just("null") 91 | .to(Json::Null) 92 | .labelled("null") 93 | .or(just("true").to(Json::Bool(true)).labelled("true")) 94 | .or(just("false").to(Json::Bool(false)).labelled("false")) 95 | .or(number.map(Json::Num)) 96 | .or(string.map(Json::Str)) 97 | .or(array) 98 | .or(object) 99 | .recover_with(nested_delimiters('{', '}', [('[', ']')], |_| Json::Invalid)) 100 | .recover_with(nested_delimiters('[', ']', [('{', '}')], |_| Json::Invalid)) 101 | .recover_with(skip_then_retry_until(['}', ']'])) 102 | .padded() 103 | }) 104 | .then_ignore(end().recover_with(skip_then_retry_until([]))) 105 | } 106 | 107 | fn main() { 108 | let src = fs::read_to_string(env::args().nth(1).expect("Expected file argument")) 109 | .expect("Failed to read file"); 110 | 111 | let (json, errs) = parser().parse_recovery(src.trim()); 112 | println!("{:#?}", json); 113 | errs.into_iter().for_each(|e| { 114 | let msg = if let chumsky::error::SimpleReason::Custom(msg) = e.reason() { 115 | msg.clone() 116 | } else { 117 | format!( 118 | "{}{}, expected {}", 119 | if e.found().is_some() { 120 | "Unexpected token" 121 | } else { 122 | "Unexpected end of input" 123 | }, 124 | if let Some(label) = e.label() { 125 | format!(" while parsing {}", label) 126 | } else { 127 | String::new() 128 | }, 129 | if e.expected().len() == 0 { 130 
| "something else".to_string() 131 | } else { 132 | e.expected() 133 | .map(|expected| match expected { 134 | Some(expected) => expected.to_string(), 135 | None => "end of input".to_string(), 136 | }) 137 | .collect::>() 138 | .join(", ") 139 | }, 140 | ) 141 | }; 142 | 143 | let report = Report::build(ReportKind::Error, (), e.span().start) 144 | .with_code(3) 145 | .with_message(msg) 146 | .with_label( 147 | Label::new(e.span()) 148 | .with_message(match e.reason() { 149 | chumsky::error::SimpleReason::Custom(msg) => msg.clone(), 150 | _ => format!( 151 | "Unexpected {}", 152 | e.found() 153 | .map(|c| format!("token {}", c.fg(Color::Red))) 154 | .unwrap_or_else(|| "end of input".to_string()) 155 | ), 156 | }) 157 | .with_color(Color::Red), 158 | ); 159 | 160 | let report = match e.reason() { 161 | chumsky::error::SimpleReason::Unclosed { span, delimiter } => report.with_label( 162 | Label::new(span.clone()) 163 | .with_message(format!( 164 | "Unclosed delimiter {}", 165 | delimiter.fg(Color::Yellow) 166 | )) 167 | .with_color(Color::Yellow), 168 | ), 169 | chumsky::error::SimpleReason::Unexpected => report, 170 | chumsky::error::SimpleReason::Custom(_) => report, 171 | }; 172 | 173 | report.finish().print(Source::from(&src)).unwrap(); 174 | }); 175 | } 176 | -------------------------------------------------------------------------------- /src/recursive.rs: -------------------------------------------------------------------------------- 1 | //! Recursive parsers (parser that include themselves within their patterns). 2 | //! 3 | //! *“It's unpleasantly like being drunk." 4 | //! "What's so unpleasant about being drunk?" 5 | //! "You ask a glass of water.”* 6 | //! 7 | //! The [`recursive()`] function covers most cases, but sometimes it's necessary to manually control the declaration and 8 | //! definition of parsers more corefully, particularly for mutually-recursive parsers. In such cases, the functions on 9 | //! [`Recursive`] allow for this. 10 | 11 | use super::*; 12 | 13 | use alloc::rc::{Rc, Weak}; 14 | 15 | // TODO: Remove when `OnceCell` is stable 16 | struct OnceCell(core::cell::RefCell>); 17 | impl OnceCell { 18 | pub fn new() -> Self { 19 | Self(core::cell::RefCell::new(None)) 20 | } 21 | pub fn set(&self, x: T) -> Result<(), ()> { 22 | *self.0.try_borrow_mut().map_err(|_| ())? = Some(x); 23 | Ok(()) 24 | } 25 | pub fn get(&self) -> Option> { 26 | Some(core::cell::Ref::map(self.0.borrow(), |x| { 27 | x.as_ref().unwrap() 28 | })) 29 | } 30 | } 31 | 32 | enum RecursiveInner { 33 | Owned(Rc), 34 | Unowned(Weak), 35 | } 36 | 37 | type OnceParser<'a, I, O, E> = OnceCell + 'a>>; 38 | 39 | /// A parser that can be defined in terms of itself by separating its [declaration](Recursive::declare) from its 40 | /// [definition](Recursive::define). 41 | /// 42 | /// Prefer to use [`recursive()`], which exists as a convenient wrapper around both operations, if possible. 43 | pub struct Recursive<'a, I, O, E: Error>(RecursiveInner>); 44 | 45 | impl<'a, I: Clone, O, E: Error> Recursive<'a, I, O, E> { 46 | fn cell(&self) -> Rc> { 47 | match &self.0 { 48 | RecursiveInner::Owned(x) => x.clone(), 49 | RecursiveInner::Unowned(x) => x 50 | .upgrade() 51 | .expect("Recursive parser used before being defined"), 52 | } 53 | } 54 | 55 | /// Declare the existence of a recursive parser, allowing it to be used to construct parser combinators before 56 | /// being fulled defined. 57 | /// 58 | /// Declaring a parser before defining it is required for a parser to reference itself. 
59 | /// 60 | /// This should be followed by **exactly one** call to the [`Recursive::define`] method prior to using the parser 61 | /// for parsing (i.e: via the [`Parser::parse`] method or similar). 62 | /// 63 | /// Prefer to use [`recursive()`], which is a convenient wrapper around this method and [`Recursive::define`], if 64 | /// possible. 65 | /// 66 | /// # Examples 67 | /// 68 | /// ``` 69 | /// # use chumsky::prelude::*; 70 | /// #[derive(Debug, PartialEq)] 71 | /// enum Chain { 72 | /// End, 73 | /// Link(char, Box), 74 | /// } 75 | /// 76 | /// // Declare the existence of the parser before defining it so that it can reference itself 77 | /// let mut chain = Recursive::<_, _, Simple>::declare(); 78 | /// 79 | /// // Define the parser in terms of itself. 80 | /// // In this case, the parser parses a right-recursive list of '+' into a singly linked list 81 | /// chain.define(just('+') 82 | /// .then(chain.clone()) 83 | /// .map(|(c, chain)| Chain::Link(c, Box::new(chain))) 84 | /// .or_not() 85 | /// .map(|chain| chain.unwrap_or(Chain::End))); 86 | /// 87 | /// assert_eq!(chain.parse(""), Ok(Chain::End)); 88 | /// assert_eq!( 89 | /// chain.parse("++"), 90 | /// Ok(Chain::Link('+', Box::new(Chain::Link('+', Box::new(Chain::End))))), 91 | /// ); 92 | /// ``` 93 | pub fn declare() -> Self { 94 | Recursive(RecursiveInner::Owned(Rc::new(OnceCell::new()))) 95 | } 96 | 97 | /// Defines the parser after declaring it, allowing it to be used for parsing. 98 | pub fn define + 'a>(&mut self, parser: P) { 99 | self.cell() 100 | .set(Box::new(parser)) 101 | .unwrap_or_else(|_| panic!("Parser defined more than once")); 102 | } 103 | } 104 | 105 | impl<'a, I: Clone, O, E: Error> Clone for Recursive<'a, I, O, E> { 106 | fn clone(&self) -> Self { 107 | Self(match &self.0 { 108 | RecursiveInner::Owned(x) => RecursiveInner::Owned(x.clone()), 109 | RecursiveInner::Unowned(x) => RecursiveInner::Unowned(x.clone()), 110 | }) 111 | } 112 | } 113 | 114 | impl<'a, I: Clone, O, E: Error> Parser for Recursive<'a, I, O, E> { 115 | type Error = E; 116 | 117 | fn parse_inner( 118 | &self, 119 | debugger: &mut D, 120 | stream: &mut StreamOf, 121 | ) -> PResult { 122 | #[cfg(feature = "stacker")] 123 | #[inline(always)] 124 | fn recurse R>(f: F) -> R { 125 | stacker::maybe_grow(1024 * 1024, 1024 * 1024, f) 126 | } 127 | #[cfg(not(feature = "stacker"))] 128 | #[inline(always)] 129 | fn recurse R>(f: F) -> R { 130 | f() 131 | } 132 | 133 | recurse(|| { 134 | #[allow(deprecated)] 135 | debugger.invoke( 136 | self.cell() 137 | .get() 138 | .expect("Recursive parser used before being defined") 139 | .as_ref(), 140 | stream, 141 | ) 142 | }) 143 | } 144 | 145 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 146 | #[allow(deprecated)] 147 | self.parse_inner(d, s) 148 | } 149 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 150 | #[allow(deprecated)] 151 | self.parse_inner(d, s) 152 | } 153 | } 154 | 155 | /// Construct a recursive parser (i.e: a parser that may contain itself as part of its pattern). 156 | /// 157 | /// The given function must create the parser. The parser must not be used to parse input before this function returns. 158 | /// 159 | /// This is a wrapper around [`Recursive::declare`] and [`Recursive::define`]. 160 | /// 161 | /// The output type of this parser is `O`, the same as the inner parser. 
162 | /// 163 | /// # Examples 164 | /// 165 | /// ``` 166 | /// # use chumsky::prelude::*; 167 | /// #[derive(Debug, PartialEq)] 168 | /// enum Tree { 169 | /// Leaf(String), 170 | /// Branch(Vec), 171 | /// } 172 | /// 173 | /// // Parser that recursively parses nested lists 174 | /// let tree = recursive::<_, _, _, _, Simple>(|tree| tree 175 | /// .separated_by(just(',')) 176 | /// .delimited_by(just('['), just(']')) 177 | /// .map(Tree::Branch) 178 | /// .or(text::ident().map(Tree::Leaf)) 179 | /// .padded()); 180 | /// 181 | /// assert_eq!(tree.parse("hello"), Ok(Tree::Leaf("hello".to_string()))); 182 | /// assert_eq!(tree.parse("[a, b, c]"), Ok(Tree::Branch(vec![ 183 | /// Tree::Leaf("a".to_string()), 184 | /// Tree::Leaf("b".to_string()), 185 | /// Tree::Leaf("c".to_string()), 186 | /// ]))); 187 | /// // The parser can deal with arbitrarily complex nested lists 188 | /// assert_eq!(tree.parse("[[a, b], c, [d, [e, f]]]"), Ok(Tree::Branch(vec![ 189 | /// Tree::Branch(vec![ 190 | /// Tree::Leaf("a".to_string()), 191 | /// Tree::Leaf("b".to_string()), 192 | /// ]), 193 | /// Tree::Leaf("c".to_string()), 194 | /// Tree::Branch(vec![ 195 | /// Tree::Leaf("d".to_string()), 196 | /// Tree::Branch(vec![ 197 | /// Tree::Leaf("e".to_string()), 198 | /// Tree::Leaf("f".to_string()), 199 | /// ]), 200 | /// ]), 201 | /// ]))); 202 | /// ``` 203 | pub fn recursive< 204 | 'a, 205 | I: Clone, 206 | O, 207 | P: Parser + 'a, 208 | F: FnOnce(Recursive<'a, I, O, E>) -> P, 209 | E: Error, 210 | >( 211 | f: F, 212 | ) -> Recursive<'a, I, O, E> { 213 | let mut parser = Recursive::declare(); 214 | parser.define(f(Recursive(match &parser.0 { 215 | RecursiveInner::Owned(x) => RecursiveInner::Unowned(Rc::downgrade(x)), 216 | RecursiveInner::Unowned(_) => unreachable!(), 217 | }))); 218 | parser 219 | } 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chumsky 2 | 3 | [![crates.io](https://img.shields.io/crates/v/chumsky.svg)](https://crates.io/crates/chumsky) 4 | [![crates.io](https://docs.rs/chumsky/badge.svg)](https://docs.rs/chumsky) 5 | [![License](https://img.shields.io/crates/l/chumsky.svg)](https://github.com/zesterer/chumsky) 6 | [![actions-badge](https://github.com/zesterer/chumsky/workflows/Rust/badge.svg?branch=master)](https://github.com/zesterer/chumsky/actions) 7 | 8 | A parser library for humans with powerful error recovery. 9 | 10 | 11 | Example usage with my own language, Tao 12 | 13 | 14 | *Note: Error diagnostic rendering is performed by [Ariadne](https://github.com/zesterer/ariadne)* 15 | 16 | ## Features 17 | 18 | - Lots of combinators! 19 | - Generic across input, output, error, and span types 20 | - Powerful error recovery strategies 21 | - Inline mapping to your AST 22 | - Text-specific parsers for both `u8`s and `char`s 23 | - Recursive parsers 24 | - Backtracking is fully supported, allowing the parsing of all known context-free grammars 25 | - Parsing of nesting inputs, allowing you to move delimiter parsing to the lexical stage (as Rust does!) 26 | - Built-in parser debugging 27 | 28 | ## Example [Brainfuck](https://en.wikipedia.org/wiki/Brainfuck) Parser 29 | 30 | See [`examples/brainfuck.rs`](https://github.com/zesterer/chumsky/blob/master/examples/brainfuck.rs) for the full 31 | interpreter (`cargo run --example brainfuck -- examples/sample.bf`). 
32 | 33 | ```rust 34 | use chumsky::prelude::*; 35 | 36 | #[derive(Clone)] 37 | enum Instr { 38 | Left, Right, 39 | Incr, Decr, 40 | Read, Write, 41 | Loop(Vec<Self>), 42 | } 43 | 44 | fn parser() -> impl Parser<char, Vec<Instr>, Error = Simple<char>> { 45 | recursive(|bf| choice(( 46 | just('<').to(Instr::Left), 47 | just('>').to(Instr::Right), 48 | just('+').to(Instr::Incr), 49 | just('-').to(Instr::Decr), 50 | just(',').to(Instr::Read), 51 | just('.').to(Instr::Write), 52 | bf.delimited_by(just('['), just(']')).map(Instr::Loop), 53 | )) 54 | .repeated()) 55 | } 56 | ``` 57 | 58 | Other examples include: 59 | 60 | - A [JSON parser](https://github.com/zesterer/chumsky/blob/master/examples/json.rs) (`cargo run --example json -- 61 | examples/sample.json`) 62 | - An [interpreter for a simple Rust-y language](https://github.com/zesterer/chumsky/blob/master/examples/nano_rust.rs) 63 | (`cargo run --example nano_rust -- examples/sample.nrs`) 64 | 65 | ## Tutorial 66 | 67 | Chumsky has [a tutorial](https://github.com/zesterer/chumsky/blob/master/tutorial.md) that teaches you how to write a 68 | parser and interpreter for a simple dynamic language with unary and binary operators, operator precedence, functions, 69 | let declarations, and calls. 70 | 71 | ## *What* is a parser combinator? 72 | 73 | Parser combinators are a technique for implementing parsers by defining them in terms of other parsers. The resulting 74 | parsers use a [recursive descent](https://en.wikipedia.org/wiki/Recursive_descent_parser) strategy to transform a stream 75 | of tokens into an output. Using parser combinators to define parsers is roughly analogous to using Rust's 76 | [`Iterator`](https://doc.rust-lang.org/std/iter/trait.Iterator.html) trait to define iterative algorithms: the 77 | type-driven API of `Iterator` makes it more difficult to make mistakes and easier to encode complicated iteration logic 78 | than if one were to write the same code by hand. The same is true of parser combinators. 79 | 80 | ## *Why* use parser combinators? 81 | 82 | Writing parsers with good error recovery is conceptually difficult and time-consuming. It requires understanding the 83 | intricacies of the recursive descent algorithm, and then implementing recovery strategies on top of it. If you're 84 | developing a programming language, you'll almost certainly change your mind about syntax in the process, leading to some 85 | slow and painful parser refactoring. Parser combinators solve both problems by providing an ergonomic API that allows 86 | for rapidly iterating upon a syntax. 87 | 88 | Parser combinators are also a great fit for domain-specific languages for which an existing parser does not exist. 89 | Writing a reliable, fault-tolerant parser for such situations can go from being a multi-day task to a half-hour task 90 | with the help of a decent parser combinator library. 91 | 92 | ## Classification 93 | 94 | Chumsky's parsers are [recursive descent](https://en.wikipedia.org/wiki/Recursive_descent_parser) parsers and are 95 | capable of parsing [parsing expression grammars (PEGs)](https://en.wikipedia.org/wiki/Parsing_expression_grammar), which 96 | includes all known context-free languages. It is theoretically possible to extend Chumsky further to accept limited 97 | context-sensitive grammars too, although this is rarely required.
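In practice, this means that alternatives combined with `or` (or `choice`) follow PEG-style ordered choice: each alternative is tried in turn, and the parser backtracks to try the next one when an earlier alternative fails. The snippet below is a minimal sketch of this behaviour; it is not taken from the repository's examples, and the `word` parser and its inputs are invented purely for illustration.

```rust
use chumsky::prelude::*;

fn main() {
    // Alternatives are tried in order. If `just("foobar")` fails part-way
    // through the input, the parser backtracks and tries `just("foo")` instead.
    let word = just::<_, _, Simple<char>>("foobar")
        .or(just("foo"))
        .then_ignore(end());

    assert_eq!(word.parse("foobar"), Ok("foobar"));
    assert_eq!(word.parse("foo"), Ok("foo"));
}
```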
98 | 99 | ## Error Recovery 100 | 101 | Chumsky has support for error recovery, meaning that it can encounter a syntax error, report the error, and then 102 | attempt to recover itself into a state in which it can continue parsing so that multiple errors can be produced at once 103 | and a partial [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree) can still be generated from the input for future 104 | compilation stages to consume. 105 | 106 | However, there is no silver bullet strategy for error recovery. By definition, if the input to a parser is invalid then 107 | the parser can only make educated guesses as to the meaning of the input. Different recovery strategies will work better 108 | for different languages, and for different patterns within those languages. 109 | 110 | Chumsky provides a variety of recovery strategies (each implementing the `Strategy` trait), but it's important to 111 | understand that which strategies you apply, where you apply them, and in what order will greatly affect the quality of the errors 112 | that Chumsky is able to produce, along with the extent to which it is able to recover a useful AST. Where possible, you 113 | should attempt more 'specific' recovery strategies first rather than those that mindlessly skip large swathes of the 114 | input. 115 | 116 | It is recommended that you experiment with applying different strategies in different situations and at different levels 117 | of the parser to find a configuration that you are happy with. If none of the provided error recovery strategies cover 118 | the specific pattern you wish to catch, you can even create your own by digging into Chumsky's internals and 119 | implementing your own strategies! If you come up with a useful strategy, feel free to open a PR against the 120 | [main repository](https://github.com/zesterer/chumsky/)! A minimal sketch of applying recovery strategies with `recover_with` is shown below, after the performance notes. 121 | 122 | ## Performance 123 | 124 | Chumsky focuses on high-quality errors and ergonomics over performance. That said, it's important that Chumsky can keep 125 | up with the rest of your compiler! Unfortunately, it's *extremely* difficult to come up with sensible benchmarks given 126 | that exactly how Chumsky performs depends entirely on what you are parsing, how you structure your parser, which 127 | patterns the parser attempts to match first, how complex your error type is, what is involved in constructing your AST, 128 | etc. All that said, here are some numbers from the 129 | [JSON benchmark](https://github.com/zesterer/chumsky/blob/master/benches/json.rs) included in the repository running on 130 | my Ryzen 7 3700x. 131 | 132 | ```ignore 133 | test chumsky ... bench: 4,782,390 ns/iter (+/- 997,208) 134 | test pom ... bench: 12,793,490 ns/iter (+/- 1,954,583) 135 | ``` 136 | 137 | I've included results from [`pom`](https://github.com/J-F-Liu/pom), another parser combinator crate with a similar 138 | design, as a point of reference. The sample file being parsed is broadly representative of typical JSON data and has 139 | 3,018 lines. This translates to a little over 630,000 lines of JSON per second. 140 | 141 | Clearly, this is a little slower than a well-optimised hand-written parser: but that's okay! Chumsky's goal is to be 142 | *fast enough*. If you've written enough code in your language that parsing performance even starts to be a problem, 143 | you've already committed enough time and resources to your language that hand-writing a parser is the best choice going!
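To make the error recovery discussion above concrete, here is a minimal sketch of applying recovery strategies with `recover_with`, in the same spirit as `examples/json.rs` and `examples/brainfuck.rs`. The parser, input string, and placeholder value below are invented for illustration and are not part of the repository.

```rust
use chumsky::prelude::*;

fn main() {
    // A parenthesised identifier that recovers when its contents are malformed.
    let value = text::ident::<char, Simple<char>>()
        .delimited_by(just('('), just(')'))
        // Most specific strategy first: treat a balanced but invalid `( ... )`
        // group as a placeholder output and keep parsing.
        .recover_with(nested_delimiters('(', ')', [], |_span| String::new()))
        // Fall back to skipping input until something parseable is found.
        .recover_with(skip_then_retry_until([')']));

    // `parse_recovery` returns both a (possibly recovered) output and any errors.
    let (output, errors) = value.parse_recovery("(not an ident!)");
    println!("output: {:?}", output); // likely `Some("")`, the placeholder value
    println!("errors reported: {}", errors.len());
}
```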
144 | 145 | ## Planned Features 146 | 147 | - An optimised 'happy path' parser mode that skips error recovery & error generation 148 | - An even faster 'validation' parser mode, guaranteed to not allocate, that doesn't generate outputs but just verifies 149 | the validity of an input 150 | 151 | ## Philosophy 152 | 153 | Chumsky should: 154 | 155 | - Be easy to use, even if you don't understand exactly what the parser is doing under the hood 156 | - Be type-driven, pushing users away from anti-patterns at compile-time 157 | - Be a mature, 'batteries-included' solution for context-free parsing by default. If you need to implement either 158 | `Parser` or `Strategy` by hand, that's a problem that needs fixing 159 | - Be 'fast enough', but no faster (i.e: when there is a tradeoff between error quality and performance, Chumsky will 160 | always take the former option) 161 | - Be modular and extensible, allowing users to implement their own parsers, recovery strategies, error types, spans, and 162 | be generic over both input tokens and the output AST 163 | 164 | ## Notes 165 | 166 | My apologies to Noam for choosing such an absurd name. 167 | 168 | ## License 169 | 170 | Chumsky is licensed under the MIT license (see `LICENSE` in the main repository). 171 | -------------------------------------------------------------------------------- /src/text.rs: -------------------------------------------------------------------------------- 1 | //! Text-specific parsers and utilities. 2 | //! 3 | //! *“Ford!" he said, "there's an infinite number of monkeys outside who want to talk to us about this script for 4 | //! Hamlet they've worked out.”* 5 | //! 6 | //! The parsers in this module are generic over both Unicode ([`char`]) and ASCII ([`u8`]) characters. Most parsers take 7 | //! a type parameter, `C`, that can be either [`u8`] or [`char`] in order to handle either case. 8 | //! 9 | //! The [`TextParser`] trait is an extension on top of the main [`Parser`] trait that adds combinators unique to the 10 | //! parsing of text. 11 | 12 | use super::*; 13 | use core::iter::FromIterator; 14 | 15 | /// The type of a parser that accepts (and ignores) any number of whitespace characters. 16 | pub type Padding<I, E> = Custom<fn(&mut StreamOf<I, E>) -> PResult<I, (), E>, E>; 17 | 18 | /// The type of a parser that accepts (and ignores) any number of whitespace characters before or after another 19 | /// pattern. 20 | // pub type Padded<P, I, O> = ThenIgnore< 21 | // IgnoreThen<Padding<I, <P as Parser<I, O>>::Error>, P, (), O>, 22 | // Padding<I, <P as Parser<I, O>>::Error>, 23 | // O, 24 | // (), 25 | // >; 26 | 27 | /// A parser that accepts (and ignores) any number of whitespace characters before or after another pattern.
28 | #[derive(Copy, Clone)] 29 | pub struct Padded(A); 30 | 31 | impl, E: Error> Parser for Padded { 32 | type Error = E; 33 | 34 | #[inline] 35 | fn parse_inner( 36 | &self, 37 | debugger: &mut D, 38 | stream: &mut StreamOf, 39 | ) -> PResult { 40 | while stream.skip_if(|c| c.is_whitespace()) {} 41 | match self.0.parse_inner(debugger, stream) { 42 | (a_errors, Ok((a_out, a_alt))) => { 43 | while stream.skip_if(|c| c.is_whitespace()) {} 44 | (a_errors, Ok((a_out, a_alt))) 45 | } 46 | (a_errors, Err(err)) => (a_errors, Err(err)), 47 | } 48 | } 49 | 50 | #[inline] 51 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 52 | #[allow(deprecated)] 53 | self.parse_inner(d, s) 54 | } 55 | #[inline] 56 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 57 | #[allow(deprecated)] 58 | self.parse_inner(d, s) 59 | } 60 | } 61 | 62 | mod private { 63 | pub trait Sealed {} 64 | 65 | impl Sealed for u8 {} 66 | impl Sealed for char {} 67 | } 68 | 69 | /// A trait implemented by textual character types (currently, [`u8`] and [`char`]). 70 | /// 71 | /// Avoid implementing this trait yourself if you can: it's *very* likely to be expanded in future versions! 72 | pub trait Character: private::Sealed + Copy + PartialEq { 73 | /// The default unsized [`str`]-like type of a linear sequence of this character. 74 | /// 75 | /// For [`char`], this is [`str`]. For [`u8`], this is [`[u8]`]. 76 | type Str: ?Sized + PartialEq; 77 | 78 | /// The default type that this character collects into. 79 | /// 80 | /// For [`char`], this is [`String`]. For [`u8`], this is [`Vec`]. 81 | type Collection: Chain + FromIterator + AsRef + 'static; 82 | 83 | /// Returns true if the character is canonically considered to be whitespace. 84 | fn is_whitespace(&self) -> bool; 85 | 86 | /// Return the '0' digit of the character. 87 | fn digit_zero() -> Self; 88 | 89 | /// Returns true if the character is canonically considered to be a numeric digit. 90 | fn is_digit(&self, radix: u32) -> bool; 91 | 92 | /// Returns this character as a [`char`]. 93 | fn to_char(&self) -> char; 94 | } 95 | 96 | impl Character for u8 { 97 | type Str = [u8]; 98 | type Collection = Vec; 99 | 100 | fn is_whitespace(&self) -> bool { 101 | self.is_ascii_whitespace() 102 | } 103 | fn digit_zero() -> Self { 104 | b'0' 105 | } 106 | fn is_digit(&self, radix: u32) -> bool { 107 | (*self as char).is_digit(radix) 108 | } 109 | fn to_char(&self) -> char { 110 | *self as char 111 | } 112 | } 113 | 114 | impl Character for char { 115 | type Str = str; 116 | type Collection = String; 117 | 118 | fn is_whitespace(&self) -> bool { 119 | char::is_whitespace(*self) 120 | } 121 | fn digit_zero() -> Self { 122 | '0' 123 | } 124 | fn is_digit(&self, radix: u32) -> bool { 125 | char::is_digit(*self, radix) 126 | } 127 | fn to_char(&self) -> char { 128 | *self 129 | } 130 | } 131 | 132 | /// A trait containing text-specific functionality that extends the [`Parser`] trait. 133 | pub trait TextParser: Parser { 134 | /// Parse a pattern, ignoring any amount of whitespace both before and after the pattern. 135 | /// 136 | /// The output type of this parser is `O`, the same as the original parser. 
137 | /// 138 | /// # Examples 139 | /// 140 | /// ``` 141 | /// # use chumsky::prelude::*; 142 | /// let ident = text::ident::<_, Simple>().padded(); 143 | /// 144 | /// // A pattern with no whitespace surrounding it is accepted 145 | /// assert_eq!(ident.parse("hello"), Ok("hello".to_string())); 146 | /// // A pattern with arbitrary whitespace surrounding it is also accepted 147 | /// assert_eq!(ident.parse(" \t \n \t world \t "), Ok("world".to_string())); 148 | /// ``` 149 | fn padded(self) -> Padded 150 | where 151 | Self: Sized, 152 | { 153 | Padded(self) 154 | // whitespace().ignore_then(self).then_ignore(whitespace()) 155 | } 156 | } 157 | 158 | impl> TextParser for P {} 159 | 160 | /// A parser that accepts (and ignores) any number of whitespace characters. 161 | /// 162 | /// The output type of this parser is `()`. 163 | /// 164 | /// # Examples 165 | /// 166 | /// ``` 167 | /// # use chumsky::prelude::*; 168 | /// let whitespace = text::whitespace::<_, Simple>(); 169 | /// 170 | /// // Any amount of whitespace is parsed... 171 | /// assert_eq!(whitespace.parse(" \t \n\n \t "), Ok(())); 172 | /// // ...including none at all! 173 | /// assert_eq!(whitespace.parse(""), Ok(())); 174 | /// ``` 175 | pub fn whitespace>() -> Padding { 176 | custom(|stream: &mut StreamOf| loop { 177 | let state = stream.save(); 178 | if stream.next().2.map_or(true, |b| !b.is_whitespace()) { 179 | stream.revert(state); 180 | break (Vec::new(), Ok(((), None))); 181 | } 182 | }) 183 | } 184 | 185 | /// A parser that accepts (and ignores) any newline characters or character sequences. 186 | /// 187 | /// The output type of this parser is `()`. 188 | /// 189 | /// This parser is quite extensive, recognising: 190 | /// 191 | /// - Line feed (`\n`) 192 | /// - Carriage return (`\r`) 193 | /// - Carriage return + line feed (`\r\n`) 194 | /// - Vertical tab (`\x0B`) 195 | /// - Form feed (`\x0C`) 196 | /// - Next line (`\u{0085}`) 197 | /// - Line separator (`\u{2028}`) 198 | /// - Paragraph separator (`\u{2029}`) 199 | /// 200 | /// # Examples 201 | /// 202 | /// ``` 203 | /// # use chumsky::prelude::*; 204 | /// let newline = text::newline::>() 205 | /// .then_ignore(end()); 206 | /// 207 | /// assert_eq!(newline.parse("\n"), Ok(())); 208 | /// assert_eq!(newline.parse("\r"), Ok(())); 209 | /// assert_eq!(newline.parse("\r\n"), Ok(())); 210 | /// assert_eq!(newline.parse("\x0B"), Ok(())); 211 | /// assert_eq!(newline.parse("\x0C"), Ok(())); 212 | /// assert_eq!(newline.parse("\u{0085}"), Ok(())); 213 | /// assert_eq!(newline.parse("\u{2028}"), Ok(())); 214 | /// assert_eq!(newline.parse("\u{2029}"), Ok(())); 215 | /// ``` 216 | pub fn newline>() -> impl Parser + Copy + Clone { 217 | just('\r') 218 | .or_not() 219 | .ignore_then(just('\n')) 220 | .or(just('\r')) // Carriage return 221 | .or(just('\x0B')) // Vertical tab 222 | .or(just('\x0C')) // Form feed 223 | .or(just('\u{0085}')) // Next line 224 | .or(just('\u{2028}')) // Line separator 225 | .or(just('\u{2029}')) // Paragraph separator 226 | .ignored() 227 | } 228 | 229 | /// A parser that accepts one or more ASCII digits. 230 | /// 231 | /// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec`] 232 | /// when `C` is [`u8`]). 233 | /// 234 | /// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`. 
235 | /// 236 | /// # Examples 237 | /// 238 | /// ``` 239 | /// # use chumsky::prelude::*; 240 | /// let digits = text::digits::<_, Simple>(10); 241 | /// 242 | /// assert_eq!(digits.parse("0"), Ok("0".to_string())); 243 | /// assert_eq!(digits.parse("1"), Ok("1".to_string())); 244 | /// assert_eq!(digits.parse("01234"), Ok("01234".to_string())); 245 | /// assert_eq!(digits.parse("98345"), Ok("98345".to_string())); 246 | /// // A string of zeroes is still valid. Use `int` if this is not desirable. 247 | /// assert_eq!(digits.parse("0000"), Ok("0000".to_string())); 248 | /// assert!(digits.parse("").is_err()); 249 | /// ``` 250 | pub fn digits>( 251 | radix: u32, 252 | ) -> impl Parser + Copy + Clone { 253 | filter(move |c: &C| c.is_digit(radix)) 254 | .repeated() 255 | .at_least(1) 256 | .collect() 257 | } 258 | 259 | /// A parser that accepts a positive integer. 260 | /// 261 | /// An integer is defined as a non-empty sequence of ASCII digits, where the first digit is non-zero or the sequence 262 | /// has length one. 263 | /// 264 | /// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec`] 265 | /// when `C` is [`u8`]). 266 | /// 267 | /// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`. 268 | /// 269 | /// # Examples 270 | /// 271 | /// ``` 272 | /// # use chumsky::prelude::*; 273 | /// let dec = text::int::<_, Simple>(10) 274 | /// .then_ignore(end()); 275 | /// 276 | /// assert_eq!(dec.parse("0"), Ok("0".to_string())); 277 | /// assert_eq!(dec.parse("1"), Ok("1".to_string())); 278 | /// assert_eq!(dec.parse("1452"), Ok("1452".to_string())); 279 | /// // No leading zeroes are permitted! 280 | /// assert!(dec.parse("04").is_err()); 281 | /// 282 | /// let hex = text::int::<_, Simple>(16) 283 | /// .then_ignore(end()); 284 | /// 285 | /// assert_eq!(hex.parse("2A"), Ok("2A".to_string())); 286 | /// assert_eq!(hex.parse("d"), Ok("d".to_string())); 287 | /// assert_eq!(hex.parse("b4"), Ok("b4".to_string())); 288 | /// assert!(hex.parse("0B").is_err()); 289 | /// ``` 290 | pub fn int>( 291 | radix: u32, 292 | ) -> impl Parser + Copy + Clone { 293 | filter(move |c: &C| c.is_digit(radix) && c != &C::digit_zero()) 294 | .map(Some) 295 | .chain::, _>(filter(move |c: &C| c.is_digit(radix)).repeated()) 296 | .collect() 297 | .or(just(C::digit_zero()).map(|c| core::iter::once(c).collect())) 298 | } 299 | 300 | /// A parser that accepts a C-style identifier. 301 | /// 302 | /// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec`] 303 | /// when `C` is [`u8`]). 304 | /// 305 | /// An identifier is defined as an ASCII alphabetic character or an underscore followed by any number of alphanumeric 306 | /// characters or underscores. The regex pattern for it is `[a-zA-Z_][a-zA-Z0-9_]*`. 307 | pub fn ident>() -> impl Parser + Copy + Clone 308 | { 309 | filter(|c: &C| c.to_char().is_ascii_alphabetic() || c.to_char() == '_') 310 | .map(Some) 311 | .chain::, _>( 312 | filter(|c: &C| c.to_char().is_ascii_alphanumeric() || c.to_char() == '_').repeated(), 313 | ) 314 | .collect() 315 | } 316 | 317 | /// Like [`ident`], but only accepts an exact identifier while ignoring trailing identifier characters. 318 | /// 319 | /// The output type of this parser is `()`. 
320 | /// 321 | /// # Examples 322 | /// 323 | /// ``` 324 | /// # use chumsky::prelude::*; 325 | /// let def = text::keyword::<_, _, Simple>("def"); 326 | /// 327 | /// // Exactly 'def' was found 328 | /// assert_eq!(def.parse("def"), Ok(())); 329 | /// // Exactly 'def' was found, with non-identifier trailing characters 330 | /// assert_eq!(def.parse("def(foo, bar)"), Ok(())); 331 | /// // 'def' was found, but only as part of a larger identifier, so this fails to parse 332 | /// assert!(def.parse("define").is_err()); 333 | /// ``` 334 | pub fn keyword<'a, C: Character + 'a, S: AsRef + 'a + Clone, E: Error + 'a>( 335 | keyword: S, 336 | ) -> impl Parser + Clone + 'a { 337 | // TODO: use .filter(...), improve error messages 338 | ident().try_map(move |s: C::Collection, span| { 339 | if s.as_ref() == keyword.as_ref() { 340 | Ok(()) 341 | } else { 342 | Err(E::expected_input_found(span, None, None)) 343 | } 344 | }) 345 | } 346 | -------------------------------------------------------------------------------- /src/stream.rs: -------------------------------------------------------------------------------- 1 | //! Token streams and tools converting to and from them.. 2 | //! 3 | //! *“What’s up?” “I don’t know,” said Marvin, “I’ve never been there.”* 4 | //! 5 | //! [`Stream`] is the primary type used to feed input data into a chumsky parser. You can create them in a number of 6 | //! ways: from strings, iterators, arrays, etc. 7 | 8 | use super::*; 9 | use alloc::vec; 10 | 11 | trait StreamExtend: Iterator { 12 | /// Extend the vector with input. The actual amount can be more or less than `n`, but must be at least 1 (0 implies 13 | /// that the stream has been exhausted. 14 | fn extend(&mut self, v: &mut Vec, n: usize); 15 | } 16 | 17 | #[allow(deprecated)] 18 | impl StreamExtend for I { 19 | fn extend(&mut self, v: &mut Vec, n: usize) { 20 | v.reserve(n); 21 | v.extend(self.take(n)); 22 | } 23 | } 24 | 25 | /// A utility type used to flatten input trees. See [`Stream::from_nested`]. 26 | pub enum Flat { 27 | /// The input tree flattens into a single input. 28 | Single(I), 29 | /// The input tree flattens into many sub-trees. 30 | Many(Iter), 31 | } 32 | 33 | /// A type that represents a stream of input tokens. Unlike [`Iterator`], this type supports backtracking and a few 34 | /// other features required by the crate. 35 | #[allow(deprecated)] 36 | pub struct Stream< 37 | 'a, 38 | I, 39 | S: Span, 40 | Iter: Iterator + ?Sized = dyn Iterator + 'a, 41 | > { 42 | pub(crate) phantom: PhantomData<&'a ()>, 43 | pub(crate) eoi: S, 44 | pub(crate) offset: usize, 45 | pub(crate) buffer: Vec<(I, S)>, 46 | pub(crate) iter: Iter, 47 | } 48 | 49 | /// A [`Stream`] that pulls tokens from a boxed [`Iterator`]. 50 | pub type BoxStream<'a, I, S> = Stream<'a, I, S, Box + 'a>>; 51 | 52 | impl<'a, I, S: Span, Iter: Iterator> Stream<'a, I, S, Iter> { 53 | /// Create a new stream from an iterator of `(Token, Span)` pairs. A span representing the end of input must also 54 | /// be provided. 55 | /// 56 | /// There is no requirement that spans must map exactly to the position of inputs in the stream, but they should 57 | /// be non-overlapping and should appear in a monotonically-increasing order. 
58 | pub fn from_iter(eoi: S, iter: Iter) -> Self { 59 | Self { 60 | phantom: PhantomData, 61 | eoi, 62 | offset: 0, 63 | buffer: Vec::new(), 64 | iter, 65 | } 66 | } 67 | 68 | /// Eagerly evaluate the token stream, returning an iterator over the tokens in it (but without modifying the 69 | /// stream's state so that it can still be used for parsing). 70 | /// 71 | /// This is most useful when you wish to check the input of a parser during debugging. 72 | pub fn fetch_tokens(&mut self) -> impl Iterator + '_ 73 | where 74 | (I, S): Clone, 75 | { 76 | self.buffer.extend(&mut self.iter); 77 | self.buffer.iter().cloned() 78 | } 79 | } 80 | 81 | impl<'a, I: Clone, S: Span + 'a> BoxStream<'a, I, S> { 82 | /// Create a new `Stream` from an iterator of nested tokens and a function that flattens them. 83 | /// 84 | /// It's not uncommon for compilers to perform delimiter parsing during the lexing stage (Rust does this!). When 85 | /// this is done, the output of the lexing stage is usually a series of nested token trees. This functions allows 86 | /// you to easily flatten such token trees into a linear token stream so that they can be parsed (Chumsky currently 87 | /// only support parsing linear streams of inputs). 88 | /// 89 | /// For reference, [here](https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html) is `syn`'s `TokenTree` type that it 90 | /// uses when parsing Rust syntax. 91 | /// 92 | /// # Examples 93 | /// 94 | /// ``` 95 | /// # use chumsky::{Stream, BoxStream, Flat}; 96 | /// type Span = std::ops::Range; 97 | /// 98 | /// fn span_at(at: usize) -> Span { at..at + 1 } 99 | /// 100 | /// #[derive(Clone)] 101 | /// enum Token { 102 | /// Local(String), 103 | /// Int(i64), 104 | /// Bool(bool), 105 | /// Add, 106 | /// Sub, 107 | /// OpenParen, 108 | /// CloseParen, 109 | /// OpenBrace, 110 | /// CloseBrace, 111 | /// // etc. 112 | /// } 113 | /// 114 | /// enum Delimiter { 115 | /// Paren, // ( ... ) 116 | /// Brace, // { ... } 117 | /// } 118 | /// 119 | /// // The structure of this token tree is very similar to that which Rust uses. 120 | /// // See: https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html 121 | /// enum TokenTree { 122 | /// Token(Token), 123 | /// Tree(Delimiter, Vec<(TokenTree, Span)>), 124 | /// } 125 | /// 126 | /// // A function that turns a series of nested token trees into a linear stream that can be used for parsing. 127 | /// fn flatten_tts(eoi: Span, token_trees: Vec<(TokenTree, Span)>) -> BoxStream<'static, Token, Span> { 128 | /// use std::iter::once; 129 | /// // Currently, this is quite an explicit process: it will likely become easier in future versions of Chumsky. 130 | /// Stream::from_nested( 131 | /// eoi, 132 | /// token_trees.into_iter(), 133 | /// |(tt, span)| match tt { 134 | /// // For token trees that contain just a single token, no flattening needs to occur! 
135 | /// TokenTree::Token(token) => Flat::Single((token, span)), 136 | /// // Flatten a parenthesised token tree into an iterator of the inner token trees, surrounded by parenthesis tokens 137 | /// TokenTree::Tree(Delimiter::Paren, tree) => Flat::Many(once((TokenTree::Token(Token::OpenParen), span_at(span.start))) 138 | /// .chain(tree.into_iter()) 139 | /// .chain(once((TokenTree::Token(Token::CloseParen), span_at(span.end - 1))))), 140 | /// // Flatten a braced token tree into an iterator of the inner token trees, surrounded by brace tokens 141 | /// TokenTree::Tree(Delimiter::Brace, tree) => Flat::Many(once((TokenTree::Token(Token::OpenBrace), span_at(span.start))) 142 | /// .chain(tree.into_iter()) 143 | /// .chain(once((TokenTree::Token(Token::CloseBrace), span_at(span.end - 1))))), 144 | /// } 145 | /// ) 146 | /// } 147 | /// ``` 148 | pub fn from_nested< 149 | P: 'a, 150 | Iter: Iterator, 151 | Many: Iterator, 152 | F: FnMut((P, S)) -> Flat<(I, S), Many> + 'a, 153 | >( 154 | eoi: S, 155 | iter: Iter, 156 | mut flatten: F, 157 | ) -> Self { 158 | let mut v: Vec> = vec![iter.collect()]; 159 | Self::from_iter( 160 | eoi, 161 | Box::new(core::iter::from_fn(move || loop { 162 | if let Some(many) = v.last_mut() { 163 | match many.pop_front().map(&mut flatten) { 164 | Some(Flat::Single(input)) => break Some(input), 165 | Some(Flat::Many(many)) => v.push(many.collect()), 166 | None => { 167 | v.pop(); 168 | } 169 | } 170 | } else { 171 | break None; 172 | } 173 | })), 174 | ) 175 | } 176 | } 177 | 178 | impl<'a, I: Clone, S: Span> Stream<'a, I, S> { 179 | pub(crate) fn offset(&self) -> usize { 180 | self.offset 181 | } 182 | 183 | pub(crate) fn save(&self) -> usize { 184 | self.offset 185 | } 186 | pub(crate) fn revert(&mut self, offset: usize) { 187 | self.offset = offset; 188 | } 189 | 190 | fn pull_until(&mut self, offset: usize) -> Option<&(I, S)> { 191 | let additional = offset.saturating_sub(self.buffer.len()) + 1024; 192 | #[allow(deprecated)] 193 | (&mut &mut self.iter as &mut dyn StreamExtend<_>).extend(&mut self.buffer, additional); 194 | self.buffer.get(offset) 195 | } 196 | 197 | pub(crate) fn skip_if(&mut self, f: impl FnOnce(&I) -> bool) -> bool { 198 | match self.pull_until(self.offset).cloned() { 199 | Some((out, _)) if f(&out) => { 200 | self.offset += 1; 201 | true 202 | } 203 | Some(_) => false, 204 | None => false, 205 | } 206 | } 207 | 208 | pub(crate) fn next(&mut self) -> (usize, S, Option) { 209 | match self.pull_until(self.offset).cloned() { 210 | Some((out, span)) => { 211 | self.offset += 1; 212 | (self.offset - 1, span, Some(out)) 213 | } 214 | None => (self.offset, self.eoi.clone(), None), 215 | } 216 | } 217 | 218 | pub(crate) fn span_since(&mut self, start_offset: usize) -> S { 219 | debug_assert!( 220 | start_offset <= self.offset, 221 | "{} > {}", 222 | self.offset, 223 | start_offset 224 | ); 225 | let start = self 226 | .pull_until(start_offset) 227 | .as_ref() 228 | .map(|(_, s)| s.start()) 229 | .unwrap_or_else(|| self.eoi.start()); 230 | let end = self 231 | .pull_until(self.offset.saturating_sub(1).max(start_offset)) 232 | .as_ref() 233 | .map(|(_, s)| s.end()) 234 | .unwrap_or_else(|| self.eoi.end()); 235 | S::new(self.eoi.context(), start..end) 236 | } 237 | 238 | pub(crate) fn attempt (bool, R)>(&mut self, f: F) -> R { 239 | let old_offset = self.offset; 240 | let (commit, out) = f(self); 241 | if !commit { 242 | self.offset = old_offset; 243 | } 244 | out 245 | } 246 | 247 | pub(crate) fn try_parse PResult>( 248 | &mut self, 249 | f: F, 250 | ) 
-> PResult { 251 | self.attempt(move |stream| { 252 | let out = f(stream); 253 | (out.1.is_ok(), out) 254 | }) 255 | } 256 | } 257 | 258 | impl<'a> From<&'a str> 259 | for Stream<'a, char, Range, Box)> + 'a>> 260 | { 261 | /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to 262 | /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`]. 263 | fn from(s: &'a str) -> Self { 264 | let len = s.chars().count(); 265 | Self::from_iter( 266 | len..len, 267 | Box::new(s.chars().enumerate().map(|(i, c)| (c, i..i + 1))), 268 | ) 269 | } 270 | } 271 | 272 | impl<'a> From 273 | for Stream<'a, char, Range, Box)>>> 274 | { 275 | /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to 276 | /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`]. 277 | fn from(s: String) -> Self { 278 | let chars = s.chars().collect::>(); 279 | Self::from_iter( 280 | chars.len()..chars.len(), 281 | Box::new(chars.into_iter().enumerate().map(|(i, c)| (c, i..i + 1))), 282 | ) 283 | } 284 | } 285 | 286 | impl<'a, T: Clone> From<&'a [T]> 287 | for Stream<'a, T, Range, Box)> + 'a>> 288 | { 289 | fn from(s: &'a [T]) -> Self { 290 | let len = s.len(); 291 | Self::from_iter( 292 | len..len, 293 | Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))), 294 | ) 295 | } 296 | } 297 | 298 | impl<'a, T: Clone + 'a> From> 299 | for Stream<'a, T, Range, Box)> + 'a>> 300 | { 301 | fn from(s: Vec) -> Self { 302 | let len = s.len(); 303 | Self::from_iter( 304 | len..len, 305 | Box::new(s.into_iter().enumerate().map(|(i, x)| (x, i..i + 1))), 306 | ) 307 | } 308 | } 309 | 310 | impl<'a, T: Clone + 'a, const N: usize> From<[T; N]> 311 | for Stream<'a, T, Range, Box)> + 'a>> 312 | { 313 | fn from(s: [T; N]) -> Self { 314 | Self::from_iter( 315 | N..N, 316 | Box::new( 317 | core::array::IntoIter::new(s) 318 | .enumerate() 319 | .map(|(i, x)| (x, i..i + 1)), 320 | ), 321 | ) 322 | } 323 | } 324 | 325 | impl<'a, T: Clone, const N: usize> From<&'a [T; N]> 326 | for Stream<'a, T, Range, Box)> + 'a>> 327 | { 328 | fn from(s: &'a [T; N]) -> Self { 329 | Self::from_iter( 330 | N..N, 331 | Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))), 332 | ) 333 | } 334 | } 335 | 336 | // impl<'a, T: Clone, S: Clone + Span> From<&'a [(T, S)]> for Stream<'a, T, S, Box + 'a>> 337 | // where S::Offset: Default 338 | // { 339 | // fn from(s: &'a [(T, S)]) -> Self { 340 | // Self::from_iter(Default::default(), Box::new(s.iter().cloned())) 341 | // } 342 | // } 343 | -------------------------------------------------------------------------------- /src/recovery.rs: -------------------------------------------------------------------------------- 1 | //! Types and traits that facilitate error recovery. 2 | //! 3 | //! *“Do you find coming to terms with the mindless tedium of it all presents an interesting challenge?”* 4 | 5 | use super::*; 6 | 7 | /// A trait implemented by error recovery strategies. 8 | pub trait Strategy> { 9 | /// Recover from a parsing failure. 10 | fn recover>( 11 | &self, 12 | recovered_errors: Vec>, 13 | fatal_error: Located, 14 | parser: P, 15 | debugger: &mut D, 16 | stream: &mut StreamOf, 17 | ) -> PResult; 18 | } 19 | 20 | /// See [`skip_then_retry_until`]. 
21 | #[derive(Copy, Clone)] 22 | pub struct SkipThenRetryUntil( 23 | pub(crate) [I; N], 24 | pub(crate) bool, 25 | pub(crate) bool, 26 | ); 27 | 28 | impl SkipThenRetryUntil { 29 | /// Alters this recovery strategy so that the first token will always be skipped. 30 | /// 31 | /// This is useful when the input being searched for also appears at the beginning of the pattern that failed to 32 | /// parse. 33 | pub fn skip_start(self) -> Self { 34 | Self(self.0, self.1, true) 35 | } 36 | 37 | /// Alters this recovery strategy so that the synchronisation token will be consumed during recovery. 38 | /// 39 | /// This is useful when the input being searched for is a delimiter of a prior pattern rather than the start of a 40 | /// new pattern and hence is no longer important once recovery has occurred. 41 | pub fn consume_end(self) -> Self { 42 | Self(self.0, true, self.2) 43 | } 44 | } 45 | 46 | impl, const N: usize> Strategy 47 | for SkipThenRetryUntil 48 | { 49 | fn recover>( 50 | &self, 51 | a_errors: Vec>, 52 | a_err: Located, 53 | parser: P, 54 | debugger: &mut D, 55 | stream: &mut StreamOf, 56 | ) -> PResult { 57 | if self.2 { 58 | let _ = stream.next(); 59 | } 60 | loop { 61 | #[allow(clippy::blocks_in_if_conditions)] 62 | if !stream.attempt( 63 | |stream| match stream.next().2.map(|tok| self.0.contains(&tok)) { 64 | Some(true) => (self.1, false), 65 | Some(false) => (true, true), 66 | None => (false, false), 67 | }, 68 | ) { 69 | break (a_errors, Err(a_err)); 70 | } 71 | #[allow(deprecated)] 72 | let (mut errors, res) = debugger.invoke(&parser, stream); 73 | if let Ok(out) = res { 74 | errors.push(a_err); 75 | break (errors, Ok(out)); 76 | } 77 | } 78 | } 79 | } 80 | 81 | /// A recovery mode that simply skips to the next input on parser failure and tries again, until reaching one of 82 | /// several inputs. 83 | /// 84 | /// Also see [`SkipThenRetryUntil::consume_end`]. 85 | /// 86 | /// This strategy is very 'stupid' and can result in very poor error generation in some languages. Place this strategy 87 | /// after others as a last resort, and be careful about over-using it. 88 | pub fn skip_then_retry_until(until: [I; N]) -> SkipThenRetryUntil { 89 | SkipThenRetryUntil(until, false, false) 90 | } 91 | 92 | /// See [`skip_until`]. 93 | #[derive(Copy, Clone)] 94 | pub struct SkipUntil( 95 | pub(crate) [I; N], 96 | pub(crate) F, 97 | pub(crate) bool, 98 | pub(crate) bool, 99 | ); 100 | 101 | impl SkipUntil { 102 | /// Alters this recovery strategy so that the first token will always be skipped. 103 | /// 104 | /// This is useful when the input being searched for also appears at the beginning of the pattern that failed to 105 | /// parse. 106 | pub fn skip_start(self) -> Self { 107 | Self(self.0, self.1, self.2, true) 108 | } 109 | 110 | /// Alters this recovery strategy so that the synchronisation token will be consumed during recovery. 111 | /// 112 | /// This is useful when the input being searched for is a delimiter of a prior pattern rather than the start of a 113 | /// new pattern and hence is no longer important once recovery has occurred. 
114 | pub fn consume_end(self) -> Self { 115 | Self(self.0, self.1, true, self.3) 116 | } 117 | } 118 | 119 | impl O, E: Error, const N: usize> Strategy 120 | for SkipUntil 121 | { 122 | fn recover>( 123 | &self, 124 | mut a_errors: Vec>, 125 | a_err: Located, 126 | _parser: P, 127 | _debugger: &mut D, 128 | stream: &mut StreamOf, 129 | ) -> PResult { 130 | let pre_state = stream.save(); 131 | if self.3 { 132 | let _ = stream.next(); 133 | } 134 | a_errors.push(a_err); 135 | loop { 136 | match stream.attempt(|stream| { 137 | let (at, span, tok) = stream.next(); 138 | match tok.map(|tok| self.0.contains(&tok)) { 139 | Some(true) => (self.2, Ok(true)), 140 | Some(false) => (true, Ok(false)), 141 | None => (true, Err((at, span))), 142 | } 143 | }) { 144 | Ok(true) => break (a_errors, Ok(((self.1)(stream.span_since(pre_state)), None))), 145 | Ok(false) => {} 146 | Err(_) if stream.save() > pre_state => { 147 | break (a_errors, Ok(((self.1)(stream.span_since(pre_state)), None))) 148 | } 149 | Err((at, span)) => { 150 | break ( 151 | a_errors, 152 | Err(Located::at( 153 | at, 154 | E::expected_input_found(span, self.0.iter().cloned().map(Some), None), 155 | )), 156 | ) 157 | } 158 | } 159 | } 160 | } 161 | } 162 | 163 | /// A recovery mode that skips input until one of several inputs is found. 164 | /// 165 | /// Also see [`SkipUntil::consume_end`]. 166 | /// 167 | /// This strategy is very 'stupid' and can result in very poor error generation in some languages. Place this strategy 168 | /// after others as a last resort, and be careful about over-using it. 169 | pub fn skip_until(until: [I; N], fallback: F) -> SkipUntil { 170 | SkipUntil(until, fallback, false, false) 171 | } 172 | 173 | /// See [`nested_delimiters`]. 174 | #[derive(Copy, Clone)] 175 | pub struct NestedDelimiters( 176 | pub(crate) I, 177 | pub(crate) I, 178 | pub(crate) [(I, I); N], 179 | pub(crate) F, 180 | ); 181 | 182 | impl O, E: Error, const N: usize> Strategy 183 | for NestedDelimiters 184 | { 185 | // This looks like something weird with clippy, it warns in a weird spot and isn't fixed by 186 | // marking it at the spot. 
187 | #[allow(clippy::blocks_in_if_conditions)] 188 | fn recover>( 189 | &self, 190 | mut a_errors: Vec>, 191 | a_err: Located, 192 | _parser: P, 193 | _debugger: &mut D, 194 | stream: &mut StreamOf, 195 | ) -> PResult { 196 | let mut balance = 0; 197 | let mut balance_others = [0; N]; 198 | let mut starts = Vec::new(); 199 | let mut error = None; 200 | let pre_state = stream.save(); 201 | let recovered = loop { 202 | if match stream.next() { 203 | (_, span, Some(t)) if t == self.0 => { 204 | balance += 1; 205 | starts.push(span); 206 | true 207 | } 208 | (_, _, Some(t)) if t == self.1 => { 209 | balance -= 1; 210 | starts.pop(); 211 | true 212 | } 213 | (at, span, Some(t)) => { 214 | for (balance_other, others) in balance_others.iter_mut().zip(self.2.iter()) { 215 | if t == others.0 { 216 | *balance_other += 1; 217 | } else if t == others.1 { 218 | *balance_other -= 1; 219 | 220 | if *balance_other < 0 && balance == 1 { 221 | // stream.revert(pre_state); 222 | error.get_or_insert_with(|| { 223 | Located::at( 224 | at, 225 | P::Error::unclosed_delimiter( 226 | starts.pop().unwrap(), 227 | self.0.clone(), 228 | span.clone(), 229 | self.1.clone(), 230 | Some(t.clone()), 231 | ), 232 | ) 233 | }); 234 | } 235 | } 236 | } 237 | false 238 | } 239 | (at, span, None) => { 240 | if balance > 0 && balance == 1 { 241 | error.get_or_insert_with(|| match starts.pop() { 242 | Some(start) => Located::at( 243 | at, 244 | P::Error::unclosed_delimiter( 245 | start, 246 | self.0.clone(), 247 | span, 248 | self.1.clone(), 249 | None, 250 | ), 251 | ), 252 | None => Located::at( 253 | at, 254 | P::Error::expected_input_found( 255 | span, 256 | Some(Some(self.1.clone())), 257 | None, 258 | ), 259 | ), 260 | }); 261 | } 262 | break false; 263 | } 264 | } { 265 | match balance.cmp(&0) { 266 | Ordering::Equal => break true, 267 | // The end of a delimited section is not a valid recovery pattern 268 | Ordering::Less => break false, 269 | Ordering::Greater => (), 270 | } 271 | } else if balance == 0 { 272 | // A non-delimiter input before anything else is not a valid recovery pattern 273 | break false; 274 | } 275 | }; 276 | 277 | if let Some(e) = error { 278 | a_errors.push(e); 279 | } 280 | 281 | if recovered { 282 | if a_errors.last().map_or(true, |e| a_err.at < e.at) { 283 | a_errors.push(a_err); 284 | } 285 | (a_errors, Ok(((self.3)(stream.span_since(pre_state)), None))) 286 | } else { 287 | (a_errors, Err(a_err)) 288 | } 289 | } 290 | } 291 | 292 | /// A recovery strategy that searches for a start and end delimiter, respecting nesting. 293 | /// 294 | /// It is possible to specify additional delimiter pairs that are valid in the pattern's context for better errors. For 295 | /// example, you might want to also specify `[('[', ']'), ('{', '}')]` when recovering a parenthesised expression as 296 | /// this can aid in detecting delimiter mismatches. 297 | /// 298 | /// A function that generates a fallback output on recovery is also required. 299 | pub fn nested_delimiters( 300 | start: I, 301 | end: I, 302 | others: [(I, I); N], 303 | fallback: F, 304 | ) -> NestedDelimiters { 305 | assert!( 306 | start != end, 307 | "Start and end delimiters cannot be the same when using `NestedDelimiters`" 308 | ); 309 | NestedDelimiters(start, end, others, fallback) 310 | } 311 | 312 | /// A parser that includes a fallback recovery strategy should parsing result in an error. 
313 | #[derive(Copy, Clone)] 314 | pub struct Recovery(pub(crate) A, pub(crate) S); 315 | 316 | impl, S: Strategy, E: Error> Parser 317 | for Recovery 318 | { 319 | type Error = E; 320 | 321 | fn parse_inner( 322 | &self, 323 | debugger: &mut D, 324 | stream: &mut StreamOf, 325 | ) -> PResult { 326 | match stream.try_parse(|stream| { 327 | #[allow(deprecated)] 328 | debugger.invoke(&self.0, stream) 329 | }) { 330 | (a_errors, Ok(a_out)) => (a_errors, Ok(a_out)), 331 | (a_errors, Err(a_err)) => self.1.recover(a_errors, a_err, &self.0, debugger, stream), 332 | } 333 | } 334 | 335 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 336 | #[allow(deprecated)] 337 | self.parse_inner(d, s) 338 | } 339 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 340 | #[allow(deprecated)] 341 | self.parse_inner(d, s) 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | //! Error types, traits and utilities. 2 | //! 3 | //! *“I like the cover," he said. "Don't Panic. It's the first helpful or intelligible thing anybody's said to me all 4 | //! day.”* 5 | //! 6 | //! You can implement the [`Error`] trait to create your own parser errors, or you can use one provided by the crate 7 | //! like [`Simple`] or [`Cheap`]. 8 | 9 | use super::*; 10 | use alloc::{format, string::ToString}; 11 | use core::hash::Hash; 12 | 13 | #[cfg(not(feature = "std"))] 14 | use hashbrown::HashSet; 15 | #[cfg(feature = "std")] 16 | use std::collections::HashSet; 17 | 18 | // (ahash + std) => ahash 19 | // (ahash) => ahash 20 | // (std) => std 21 | // () => ahash 22 | #[cfg(any(feature = "ahash", not(feature = "std")))] 23 | type RandomState = hashbrown::hash_map::DefaultHashBuilder; 24 | #[cfg(all(not(feature = "ahash"), feature = "std"))] 25 | type RandomState = std::collections::hash_map::RandomState; 26 | 27 | /// A trait that describes parser error types. 28 | /// 29 | /// If you have a custom error type in your compiler, or your needs are not sufficiently met by [`Simple`], you should 30 | /// implement this trait. If your error type has 'extra' features that allow for more specific error messages, you can 31 | /// use the [`Parser::map_err`] or [`Parser::try_map`] functions to take advantage of these inline within your parser. 
32 | /// 33 | /// # Examples 34 | /// 35 | /// ``` 36 | /// # use chumsky::{prelude::*, error::Cheap}; 37 | /// type Span = std::ops::Range; 38 | /// 39 | /// // A custom error type 40 | /// #[derive(Debug, PartialEq)] 41 | /// enum MyError { 42 | /// ExpectedFound(Span, Vec>, Option), 43 | /// NotADigit(Span, char), 44 | /// } 45 | /// 46 | /// impl chumsky::Error for MyError { 47 | /// type Span = Span; 48 | /// type Label = (); 49 | /// 50 | /// fn expected_input_found>>( 51 | /// span: Span, 52 | /// expected: Iter, 53 | /// found: Option, 54 | /// ) -> Self { 55 | /// Self::ExpectedFound(span, expected.into_iter().collect(), found) 56 | /// } 57 | /// 58 | /// fn with_label(mut self, label: Self::Label) -> Self { self } 59 | /// 60 | /// fn merge(mut self, mut other: Self) -> Self { 61 | /// if let (Self::ExpectedFound(_, expected, _), Self::ExpectedFound(_, expected_other, _)) = ( 62 | /// &mut self, 63 | /// &mut other, 64 | /// ) { 65 | /// expected.append(expected_other); 66 | /// } 67 | /// self 68 | /// } 69 | /// } 70 | /// 71 | /// let numeral = filter_map(|span, c: char| match c.to_digit(10) { 72 | /// Some(x) => Ok(x), 73 | /// None => Err(MyError::NotADigit(span, c)), 74 | /// }); 75 | /// 76 | /// assert_eq!(numeral.parse("3"), Ok(3)); 77 | /// assert_eq!(numeral.parse("7"), Ok(7)); 78 | /// assert_eq!(numeral.parse("f"), Err(vec![MyError::NotADigit(0..1, 'f')])); 79 | /// ``` 80 | pub trait Error: Sized { 81 | /// The type of spans to be used in the error. 82 | type Span: Span; // TODO: Default to = Range; 83 | 84 | /// The label used to describe a syntatic structure currently being parsed. 85 | /// 86 | /// This can be used to generate errors that tell the user what syntactic structure was currently being parsed when 87 | /// the error occured. 88 | type Label; // TODO: Default to = &'static str; 89 | 90 | /// Create a new error describing a conflict between expected inputs and that which was actually found. 91 | /// 92 | /// `found` having the value `None` indicates that the end of input was reached, but was not expected. 93 | /// 94 | /// An expected input having the value `None` indicates that the end of input was expected. 95 | fn expected_input_found>>( 96 | span: Self::Span, 97 | expected: Iter, 98 | found: Option, 99 | ) -> Self; 100 | 101 | /// Create a new error describing a delimiter that was not correctly closed. 102 | /// 103 | /// Provided to this function is the span of the unclosed delimiter, the delimiter itself, the span of the input 104 | /// that was found in its place, the closing delimiter that was expected but not found, and the input that was 105 | /// found in its place. 106 | /// 107 | /// The default implementation of this function uses [`Error::expected_input_found`], but you'll probably want to 108 | /// implement it yourself to take full advantage of the extra diagnostic information. 109 | fn unclosed_delimiter( 110 | unclosed_span: Self::Span, 111 | unclosed: I, 112 | span: Self::Span, 113 | expected: I, 114 | found: Option, 115 | ) -> Self { 116 | #![allow(unused_variables)] 117 | Self::expected_input_found(span, Some(Some(expected)), found) 118 | } 119 | 120 | /// Indicate that the error occured while parsing a particular syntactic structure. 121 | /// 122 | /// How the error handles this information is up to it. It can append it to a list of structures to get a sort of 123 | /// 'parse backtrace', or it can just keep only the most recent label. If the latter, this method should have no 124 | /// effect when the error already has a label. 
125 | fn with_label(self, label: Self::Label) -> Self; 126 | 127 | /// Merge two errors that point to the same input together, combining their information. 128 | fn merge(self, other: Self) -> Self; 129 | } 130 | 131 | // /// A simple default input pattern that allows describing inputs and input patterns in error messages. 132 | // #[derive(Clone, Debug, PartialEq, Eq, Hash)] 133 | // pub enum SimplePattern { 134 | // /// A pattern with the given name was expected. 135 | // Labelled(&'static str), 136 | // /// A specific input was expected. 137 | // Token(I), 138 | // } 139 | 140 | // impl From<&'static str> for SimplePattern { 141 | // fn from(s: &'static str) -> Self { Self::Labelled(s) } 142 | // } 143 | 144 | // impl fmt::Display for SimplePattern { 145 | // fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 146 | // match self { 147 | // Self::Labelled(s) => write!(f, "{}", s), 148 | // Self::Token(x) => write!(f, "'{}'", x), 149 | // } 150 | // } 151 | // } 152 | 153 | /// A type representing possible reasons for an error. 154 | #[derive(Clone, Debug, PartialEq, Eq)] 155 | pub enum SimpleReason { 156 | /// An unexpected input was found. 157 | Unexpected, 158 | /// An unclosed delimiter was found. 159 | Unclosed { 160 | /// The span of the unclosed delimiter. 161 | span: S, 162 | /// The unclosed delimiter. 163 | delimiter: I, 164 | }, 165 | /// An error with a custom message occurred. 166 | Custom(String), 167 | } 168 | 169 | /// A simple default error type that tracks error spans, expected inputs, and the actual input found at an error site. 170 | /// 171 | /// Please note that it uses a [`HashSet`] to remember expected symbols. If you find this to be too slow, you can 172 | /// implement [`Error`] for your own error type or use [`Cheap`] instead. 173 | #[derive(Clone, Debug)] 174 | pub struct Simple> { 175 | span: S, 176 | reason: SimpleReason, 177 | expected: HashSet, RandomState>, 178 | found: Option, 179 | label: Option<&'static str>, 180 | } 181 | 182 | impl Simple { 183 | /// Create an error with a custom error message. 184 | pub fn custom(span: S, msg: M) -> Self { 185 | Self { 186 | span, 187 | reason: SimpleReason::Custom(msg.to_string()), 188 | expected: HashSet::default(), 189 | found: None, 190 | label: None, 191 | } 192 | } 193 | 194 | /// Returns the span that the error occured at. 195 | pub fn span(&self) -> S { 196 | self.span.clone() 197 | } 198 | 199 | /// Returns an iterator over possible expected patterns. 200 | pub fn expected(&self) -> impl ExactSizeIterator> + '_ { 201 | self.expected.iter() 202 | } 203 | 204 | /// Returns the input, if any, that was found instead of an expected pattern. 205 | pub fn found(&self) -> Option<&I> { 206 | self.found.as_ref() 207 | } 208 | 209 | /// Returns the reason for the error. 210 | pub fn reason(&self) -> &SimpleReason { 211 | &self.reason 212 | } 213 | 214 | /// Returns the error's label, if any. 215 | pub fn label(&self) -> Option<&'static str> { 216 | self.label 217 | } 218 | 219 | /// Map the error's inputs using the given function. 220 | /// 221 | /// This can be used to unify the errors between parsing stages that operate upon two forms of input (for example, 222 | /// the initial lexing stage and the parsing stage in most compilers). 
223 | pub fn map U>(self, mut f: F) -> Simple { 224 | Simple { 225 | span: self.span, 226 | reason: match self.reason { 227 | SimpleReason::Unclosed { span, delimiter } => SimpleReason::Unclosed { 228 | span, 229 | delimiter: f(delimiter), 230 | }, 231 | SimpleReason::Unexpected => SimpleReason::Unexpected, 232 | SimpleReason::Custom(msg) => SimpleReason::Custom(msg), 233 | }, 234 | expected: self.expected.into_iter().map(|e| e.map(&mut f)).collect(), 235 | found: self.found.map(f), 236 | label: self.label, 237 | } 238 | } 239 | } 240 | 241 | impl Error for Simple { 242 | type Span = S; 243 | type Label = &'static str; 244 | 245 | fn expected_input_found>>( 246 | span: Self::Span, 247 | expected: Iter, 248 | found: Option, 249 | ) -> Self { 250 | Self { 251 | span, 252 | reason: SimpleReason::Unexpected, 253 | expected: expected.into_iter().collect(), 254 | found, 255 | label: None, 256 | } 257 | } 258 | 259 | fn unclosed_delimiter( 260 | unclosed_span: Self::Span, 261 | delimiter: I, 262 | span: Self::Span, 263 | expected: I, 264 | found: Option, 265 | ) -> Self { 266 | Self { 267 | span, 268 | reason: SimpleReason::Unclosed { 269 | span: unclosed_span, 270 | delimiter, 271 | }, 272 | expected: core::iter::once(Some(expected)).collect(), 273 | found, 274 | label: None, 275 | } 276 | } 277 | 278 | fn with_label(mut self, label: Self::Label) -> Self { 279 | self.label.get_or_insert(label); 280 | self 281 | } 282 | 283 | fn merge(mut self, other: Self) -> Self { 284 | // TODO: Assert that `self.span == other.span` here? 285 | self.reason = match (&self.reason, &other.reason) { 286 | (SimpleReason::Unclosed { .. }, _) => self.reason, 287 | (_, SimpleReason::Unclosed { .. }) => other.reason, 288 | _ => self.reason, 289 | }; 290 | for expected in other.expected { 291 | self.expected.insert(expected); 292 | } 293 | self 294 | } 295 | } 296 | 297 | impl PartialEq for Simple { 298 | fn eq(&self, other: &Self) -> bool { 299 | self.span == other.span 300 | && self.found == other.found 301 | && self.reason == other.reason 302 | && self.label == other.label 303 | } 304 | } 305 | 306 | impl fmt::Display for Simple { 307 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 308 | // TODO: Take `self.reason` into account 309 | 310 | if let Some(found) = &self.found { 311 | write!(f, "found '{}'", found)?; 312 | } else { 313 | write!(f, "found end of input")?; 314 | } 315 | 316 | match self.expected.len() { 317 | 0 => {} //write!(f, " but end of input was expected")?, 318 | 1 => write!( 319 | f, 320 | " but {} was expected", 321 | match self.expected.iter().next().unwrap() { 322 | Some(x) => format!("'{}'", x), 323 | None => format!("end of input"), 324 | }, 325 | )?, 326 | _ => write!( 327 | f, 328 | " but one of {} was expected", 329 | self.expected 330 | .iter() 331 | .map(|expected| match expected { 332 | Some(x) => format!("'{}'", x), 333 | None => format!("end of input"), 334 | }) 335 | .collect::>() 336 | .join(", ") 337 | )?, 338 | } 339 | 340 | Ok(()) 341 | } 342 | } 343 | 344 | #[cfg(feature = "std")] 345 | impl 346 | std::error::Error for Simple 347 | { 348 | } 349 | 350 | /// A minimal error type that tracks only the error span and label. This type is most useful when you want fast parsing 351 | /// but do not particularly care about the quality of error messages. 
352 | #[derive(Clone, Debug, PartialEq, Eq)] 353 | pub struct Cheap> { 354 | span: S, 355 | label: Option<&'static str>, 356 | phantom: PhantomData, 357 | } 358 | 359 | impl Cheap { 360 | /// Returns the span that the error occured at. 361 | pub fn span(&self) -> S { 362 | self.span.clone() 363 | } 364 | 365 | /// Returns the error's label, if any. 366 | pub fn label(&self) -> Option<&'static str> { 367 | self.label 368 | } 369 | } 370 | 371 | impl Error for Cheap { 372 | type Span = S; 373 | type Label = &'static str; 374 | 375 | fn expected_input_found>>( 376 | span: Self::Span, 377 | _: Iter, 378 | _: Option, 379 | ) -> Self { 380 | Self { 381 | span, 382 | label: None, 383 | phantom: PhantomData, 384 | } 385 | } 386 | 387 | fn with_label(mut self, label: Self::Label) -> Self { 388 | self.label.get_or_insert(label); 389 | self 390 | } 391 | 392 | fn merge(self, _: Self) -> Self { 393 | self 394 | } 395 | } 396 | 397 | /// An internal type used to facilitate error prioritisation. You shouldn't need to interact with this type during 398 | /// normal use of the crate. 399 | pub struct Located { 400 | pub(crate) at: usize, 401 | pub(crate) error: E, 402 | pub(crate) phantom: PhantomData, 403 | } 404 | 405 | impl> Located { 406 | /// Create a new [`Located`] with the give input position and error. 407 | pub fn at(at: usize, error: E) -> Self { 408 | Self { 409 | at, 410 | error, 411 | phantom: PhantomData, 412 | } 413 | } 414 | 415 | /// Get the maximum of two located errors. If they hold the same position in the input, merge them. 416 | pub fn max(self, other: impl Into>) -> Self { 417 | let other = match other.into() { 418 | Some(other) => other, 419 | None => return self, 420 | }; 421 | match self.at.cmp(&other.at) { 422 | Ordering::Greater => self, 423 | Ordering::Less => other, 424 | Ordering::Equal => Self { 425 | error: self.error.merge(other.error), 426 | ..self 427 | }, 428 | } 429 | } 430 | 431 | /// Map the error with the given function. 432 | pub fn map U>(self, f: F) -> Located { 433 | Located { 434 | at: self.at, 435 | error: f(self.error), 436 | phantom: PhantomData, 437 | } 438 | } 439 | } 440 | 441 | // Merge two alternative errors 442 | pub(crate) fn merge_alts, T: IntoIterator>>( 443 | mut error: Option>, 444 | errors: T, 445 | ) -> Option> { 446 | for other in errors { 447 | match (error, other) { 448 | (Some(a), b) => { 449 | error = Some(b.max(a)); 450 | } 451 | (None, b) => { 452 | error = Some(b); 453 | } 454 | } 455 | } 456 | error 457 | } 458 | -------------------------------------------------------------------------------- /examples/nano_rust.rs: -------------------------------------------------------------------------------- 1 | //! This is an entire parser and interpreter for a dynamically-typed Rust-like expression-oriented 2 | //! programming language. See `sample.nrs` for sample source code. 3 | //! Run it with the following command: 4 | //! 
cargo run --example nano_rust -- examples/sample.nrs 5 | 6 | use ariadne::{Color, Fmt, Label, Report, ReportKind, Source}; 7 | use chumsky::{prelude::*, stream::Stream}; 8 | use std::{collections::HashMap, env, fmt, fs}; 9 | 10 | pub type Span = std::ops::Range; 11 | 12 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 13 | enum Token { 14 | Null, 15 | Bool(bool), 16 | Num(String), 17 | Str(String), 18 | Op(String), 19 | Ctrl(char), 20 | Ident(String), 21 | Fn, 22 | Let, 23 | Print, 24 | If, 25 | Else, 26 | } 27 | 28 | impl fmt::Display for Token { 29 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 30 | match self { 31 | Token::Null => write!(f, "null"), 32 | Token::Bool(x) => write!(f, "{}", x), 33 | Token::Num(n) => write!(f, "{}", n), 34 | Token::Str(s) => write!(f, "{}", s), 35 | Token::Op(s) => write!(f, "{}", s), 36 | Token::Ctrl(c) => write!(f, "{}", c), 37 | Token::Ident(s) => write!(f, "{}", s), 38 | Token::Fn => write!(f, "fn"), 39 | Token::Let => write!(f, "let"), 40 | Token::Print => write!(f, "print"), 41 | Token::If => write!(f, "if"), 42 | Token::Else => write!(f, "else"), 43 | } 44 | } 45 | } 46 | 47 | fn lexer() -> impl Parser, Error = Simple> { 48 | // A parser for numbers 49 | let num = text::int(10) 50 | .chain::(just('.').chain(text::digits(10)).or_not().flatten()) 51 | .collect::() 52 | .map(Token::Num); 53 | 54 | // A parser for strings 55 | let str_ = just('"') 56 | .ignore_then(filter(|c| *c != '"').repeated()) 57 | .then_ignore(just('"')) 58 | .collect::() 59 | .map(Token::Str); 60 | 61 | // A parser for operators 62 | let op = one_of("+-*/!=") 63 | .repeated() 64 | .at_least(1) 65 | .collect::() 66 | .map(Token::Op); 67 | 68 | // A parser for control characters (delimiters, semicolons, etc.) 69 | let ctrl = one_of("()[]{};,").map(|c| Token::Ctrl(c)); 70 | 71 | // A parser for identifiers and keywords 72 | let ident = text::ident().map(|ident: String| match ident.as_str() { 73 | "fn" => Token::Fn, 74 | "let" => Token::Let, 75 | "print" => Token::Print, 76 | "if" => Token::If, 77 | "else" => Token::Else, 78 | "true" => Token::Bool(true), 79 | "false" => Token::Bool(false), 80 | "null" => Token::Null, 81 | _ => Token::Ident(ident), 82 | }); 83 | 84 | // A single token can be one of the above 85 | let token = num 86 | .or(str_) 87 | .or(op) 88 | .or(ctrl) 89 | .or(ident) 90 | .recover_with(skip_then_retry_until([])); 91 | 92 | let comment = just("//").then(take_until(just('\n'))).padded(); 93 | 94 | token 95 | .padded_by(comment.repeated()) 96 | .map_with_span(|tok, span| (tok, span)) 97 | .padded() 98 | .repeated() 99 | } 100 | 101 | #[derive(Clone, Debug, PartialEq)] 102 | enum Value { 103 | Null, 104 | Bool(bool), 105 | Num(f64), 106 | Str(String), 107 | List(Vec), 108 | Func(String), 109 | } 110 | 111 | impl Value { 112 | fn num(self, span: Span) -> Result { 113 | if let Value::Num(x) = self { 114 | Ok(x) 115 | } else { 116 | Err(Error { 117 | span, 118 | msg: format!("'{}' is not a number", self), 119 | }) 120 | } 121 | } 122 | } 123 | 124 | impl std::fmt::Display for Value { 125 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 126 | match self { 127 | Self::Null => write!(f, "null"), 128 | Self::Bool(x) => write!(f, "{}", x), 129 | Self::Num(x) => write!(f, "{}", x), 130 | Self::Str(x) => write!(f, "{}", x), 131 | Self::List(xs) => write!( 132 | f, 133 | "[{}]", 134 | xs.iter() 135 | .map(|x| x.to_string()) 136 | .collect::>() 137 | .join(", ") 138 | ), 139 | Self::Func(name) => write!(f, "", name), 140 | } 141 | } 142 | } 143 | 144 | 
#[derive(Clone, Debug)] 145 | enum BinaryOp { 146 | Add, 147 | Sub, 148 | Mul, 149 | Div, 150 | Eq, 151 | NotEq, 152 | } 153 | 154 | pub type Spanned = (T, Span); 155 | 156 | // An expression node in the AST. Children are spanned so we can generate useful runtime errors. 157 | #[derive(Debug)] 158 | enum Expr { 159 | Error, 160 | Value(Value), 161 | List(Vec>), 162 | Local(String), 163 | Let(String, Box>, Box>), 164 | Then(Box>, Box>), 165 | Binary(Box>, BinaryOp, Box>), 166 | Call(Box>, Spanned>>), 167 | If(Box>, Box>, Box>), 168 | Print(Box>), 169 | } 170 | 171 | // A function node in the AST. 172 | #[derive(Debug)] 173 | struct Func { 174 | args: Vec, 175 | body: Spanned, 176 | } 177 | 178 | fn expr_parser() -> impl Parser, Error = Simple> + Clone { 179 | recursive(|expr| { 180 | let raw_expr = recursive(|raw_expr| { 181 | let val = filter_map(|span, tok| match tok { 182 | Token::Null => Ok(Expr::Value(Value::Null)), 183 | Token::Bool(x) => Ok(Expr::Value(Value::Bool(x))), 184 | Token::Num(n) => Ok(Expr::Value(Value::Num(n.parse().unwrap()))), 185 | Token::Str(s) => Ok(Expr::Value(Value::Str(s))), 186 | _ => Err(Simple::expected_input_found(span, Vec::new(), Some(tok))), 187 | }) 188 | .labelled("value"); 189 | 190 | let ident = filter_map(|span, tok| match tok { 191 | Token::Ident(ident) => Ok(ident.clone()), 192 | _ => Err(Simple::expected_input_found(span, Vec::new(), Some(tok))), 193 | }) 194 | .labelled("identifier"); 195 | 196 | // A list of expressions 197 | let items = expr 198 | .clone() 199 | .chain(just(Token::Ctrl(',')).ignore_then(expr.clone()).repeated()) 200 | .then_ignore(just(Token::Ctrl(',')).or_not()) 201 | .or_not() 202 | .map(|item| item.unwrap_or_else(Vec::new)); 203 | 204 | // A let expression 205 | let let_ = just(Token::Let) 206 | .ignore_then(ident) 207 | .then_ignore(just(Token::Op("=".to_string()))) 208 | .then(raw_expr) 209 | .then_ignore(just(Token::Ctrl(';'))) 210 | .then(expr.clone()) 211 | .map(|((name, val), body)| Expr::Let(name, Box::new(val), Box::new(body))); 212 | 213 | let list = items 214 | .clone() 215 | .delimited_by(just(Token::Ctrl('[')), just(Token::Ctrl(']'))) 216 | .map(Expr::List); 217 | 218 | // 'Atoms' are expressions that contain no ambiguity 219 | let atom = val 220 | .or(ident.map(Expr::Local)) 221 | .or(let_) 222 | .or(list) 223 | // In Nano Rust, `print` is just a keyword, just like Python 2, for simplicity 224 | .or(just(Token::Print) 225 | .ignore_then( 226 | expr.clone() 227 | .delimited_by(just(Token::Ctrl('(')), just(Token::Ctrl(')'))), 228 | ) 229 | .map(|expr| Expr::Print(Box::new(expr)))) 230 | .map_with_span(|expr, span| (expr, span)) 231 | // Atoms can also just be normal expressions, but surrounded with parentheses 232 | .or(expr 233 | .clone() 234 | .delimited_by(just(Token::Ctrl('(')), just(Token::Ctrl(')')))) 235 | // Attempt to recover anything that looks like a parenthesised expression but contains errors 236 | .recover_with(nested_delimiters( 237 | Token::Ctrl('('), 238 | Token::Ctrl(')'), 239 | [ 240 | (Token::Ctrl('['), Token::Ctrl(']')), 241 | (Token::Ctrl('{'), Token::Ctrl('}')), 242 | ], 243 | |span| (Expr::Error, span), 244 | )) 245 | // Attempt to recover anything that looks like a list but contains errors 246 | .recover_with(nested_delimiters( 247 | Token::Ctrl('['), 248 | Token::Ctrl(']'), 249 | [ 250 | (Token::Ctrl('('), Token::Ctrl(')')), 251 | (Token::Ctrl('{'), Token::Ctrl('}')), 252 | ], 253 | |span| (Expr::Error, span), 254 | )); 255 | 256 | // Function calls have very high precedence so we 
prioritise them 257 | let call = atom 258 | .then( 259 | items 260 | .delimited_by(just(Token::Ctrl('(')), just(Token::Ctrl(')'))) 261 | .map_with_span(|args, span| (args, span)) 262 | .repeated(), 263 | ) 264 | .foldl(|f, args| { 265 | let span = f.1.start..args.1.end; 266 | (Expr::Call(Box::new(f), args), span) 267 | }); 268 | 269 | // Product ops (multiply and divide) have equal precedence 270 | let op = just(Token::Op("*".to_string())) 271 | .to(BinaryOp::Mul) 272 | .or(just(Token::Op("/".to_string())).to(BinaryOp::Div)); 273 | let product = call 274 | .clone() 275 | .then(op.then(call).repeated()) 276 | .foldl(|a, (op, b)| { 277 | let span = a.1.start..b.1.end; 278 | (Expr::Binary(Box::new(a), op, Box::new(b)), span) 279 | }); 280 | 281 | // Sum ops (add and subtract) have equal precedence 282 | let op = just(Token::Op("+".to_string())) 283 | .to(BinaryOp::Add) 284 | .or(just(Token::Op("-".to_string())).to(BinaryOp::Sub)); 285 | let sum = product 286 | .clone() 287 | .then(op.then(product).repeated()) 288 | .foldl(|a, (op, b)| { 289 | let span = a.1.start..b.1.end; 290 | (Expr::Binary(Box::new(a), op, Box::new(b)), span) 291 | }); 292 | 293 | // Comparison ops (equal, not-equal) have equal precedence 294 | let op = just(Token::Op("==".to_string())) 295 | .to(BinaryOp::Eq) 296 | .or(just(Token::Op("!=".to_string())).to(BinaryOp::NotEq)); 297 | let compare = sum 298 | .clone() 299 | .then(op.then(sum).repeated()) 300 | .foldl(|a, (op, b)| { 301 | let span = a.1.start..b.1.end; 302 | (Expr::Binary(Box::new(a), op, Box::new(b)), span) 303 | }); 304 | 305 | compare 306 | }); 307 | 308 | // Blocks are expressions but delimited with braces 309 | let block = expr 310 | .clone() 311 | .delimited_by(just(Token::Ctrl('{')), just(Token::Ctrl('}'))) 312 | // Attempt to recover anything that looks like a block but contains errors 313 | .recover_with(nested_delimiters( 314 | Token::Ctrl('{'), 315 | Token::Ctrl('}'), 316 | [ 317 | (Token::Ctrl('('), Token::Ctrl(')')), 318 | (Token::Ctrl('['), Token::Ctrl(']')), 319 | ], 320 | |span| (Expr::Error, span), 321 | )); 322 | 323 | let if_ = recursive(|if_| { 324 | just(Token::If) 325 | .ignore_then(expr.clone()) 326 | .then(block.clone()) 327 | .then( 328 | just(Token::Else) 329 | .ignore_then(block.clone().or(if_)) 330 | .or_not(), 331 | ) 332 | .map_with_span(|((cond, a), b), span| { 333 | ( 334 | Expr::If( 335 | Box::new(cond), 336 | Box::new(a), 337 | Box::new(match b { 338 | Some(b) => b, 339 | // If an `if` expression has no trailing `else` block, we magic up one that just produces null 340 | None => (Expr::Value(Value::Null), span.clone()), 341 | }), 342 | ), 343 | span, 344 | ) 345 | }) 346 | }); 347 | 348 | // Both blocks and `if` are 'block expressions' and can appear in the place of statements 349 | let block_expr = block.or(if_).labelled("block"); 350 | 351 | let block_chain = block_expr 352 | .clone() 353 | .then(block_expr.clone().repeated()) 354 | .foldl(|a, b| { 355 | let span = a.1.start..b.1.end; 356 | (Expr::Then(Box::new(a), Box::new(b)), span) 357 | }); 358 | 359 | block_chain 360 | // Expressions, chained by semicolons, are statements 361 | .or(raw_expr.clone()) 362 | .then(just(Token::Ctrl(';')).ignore_then(expr.or_not()).repeated()) 363 | .foldl(|a, b| { 364 | let span = a.1.clone(); // TODO: Not correct 365 | ( 366 | Expr::Then( 367 | Box::new(a), 368 | Box::new(match b { 369 | Some(b) => b, 370 | None => (Expr::Value(Value::Null), span.clone()), 371 | }), 372 | ), 373 | span, 374 | ) 375 | }) 376 | }) 377 | } 378 | 379 | fn 
funcs_parser() -> impl Parser, Error = Simple> + Clone { 380 | let ident = filter_map(|span, tok| match tok { 381 | Token::Ident(ident) => Ok(ident.clone()), 382 | _ => Err(Simple::expected_input_found(span, Vec::new(), Some(tok))), 383 | }); 384 | 385 | // Argument lists are just identifiers separated by commas, surrounded by parentheses 386 | let args = ident 387 | .clone() 388 | .separated_by(just(Token::Ctrl(','))) 389 | .allow_trailing() 390 | .delimited_by(just(Token::Ctrl('(')), just(Token::Ctrl(')'))) 391 | .labelled("function args"); 392 | 393 | let func = just(Token::Fn) 394 | .ignore_then( 395 | ident 396 | .map_with_span(|name, span| (name, span)) 397 | .labelled("function name"), 398 | ) 399 | .then(args) 400 | .then( 401 | expr_parser() 402 | .delimited_by(just(Token::Ctrl('{')), just(Token::Ctrl('}'))) 403 | // Attempt to recover anything that looks like a function body but contains errors 404 | .recover_with(nested_delimiters( 405 | Token::Ctrl('{'), 406 | Token::Ctrl('}'), 407 | [ 408 | (Token::Ctrl('('), Token::Ctrl(')')), 409 | (Token::Ctrl('['), Token::Ctrl(']')), 410 | ], 411 | |span| (Expr::Error, span), 412 | )), 413 | ) 414 | .map(|((name, args), body)| (name, Func { args, body })) 415 | .labelled("function"); 416 | 417 | func.repeated() 418 | .try_map(|fs, _| { 419 | let mut funcs = HashMap::new(); 420 | for ((name, name_span), f) in fs { 421 | if funcs.insert(name.clone(), f).is_some() { 422 | return Err(Simple::custom( 423 | name_span.clone(), 424 | format!("Function '{}' already exists", name), 425 | )); 426 | } 427 | } 428 | Ok(funcs) 429 | }) 430 | .then_ignore(end()) 431 | } 432 | 433 | struct Error { 434 | span: Span, 435 | msg: String, 436 | } 437 | 438 | fn eval_expr( 439 | expr: &Spanned, 440 | funcs: &HashMap, 441 | stack: &mut Vec<(String, Value)>, 442 | ) -> Result { 443 | Ok(match &expr.0 { 444 | Expr::Error => unreachable!(), // Error expressions only get created by parser errors, so cannot exist in a valid AST 445 | Expr::Value(val) => val.clone(), 446 | Expr::List(items) => Value::List( 447 | items 448 | .iter() 449 | .map(|item| eval_expr(item, funcs, stack)) 450 | .collect::>()?, 451 | ), 452 | Expr::Local(name) => stack 453 | .iter() 454 | .rev() 455 | .find(|(l, _)| l == name) 456 | .map(|(_, v)| v.clone()) 457 | .or_else(|| Some(Value::Func(name.clone())).filter(|_| funcs.contains_key(name))) 458 | .ok_or_else(|| Error { 459 | span: expr.1.clone(), 460 | msg: format!("No such variable '{}' in scope", name), 461 | })?, 462 | Expr::Let(local, val, body) => { 463 | let val = eval_expr(val, funcs, stack)?; 464 | stack.push((local.clone(), val)); 465 | let res = eval_expr(body, funcs, stack)?; 466 | stack.pop(); 467 | res 468 | } 469 | Expr::Then(a, b) => { 470 | eval_expr(a, funcs, stack)?; 471 | eval_expr(b, funcs, stack)? 472 | } 473 | Expr::Binary(a, BinaryOp::Add, b) => Value::Num( 474 | eval_expr(a, funcs, stack)?.num(a.1.clone())? 475 | + eval_expr(b, funcs, stack)?.num(b.1.clone())?, 476 | ), 477 | Expr::Binary(a, BinaryOp::Sub, b) => Value::Num( 478 | eval_expr(a, funcs, stack)?.num(a.1.clone())? 479 | - eval_expr(b, funcs, stack)?.num(b.1.clone())?, 480 | ), 481 | Expr::Binary(a, BinaryOp::Mul, b) => Value::Num( 482 | eval_expr(a, funcs, stack)?.num(a.1.clone())? 483 | * eval_expr(b, funcs, stack)?.num(b.1.clone())?, 484 | ), 485 | Expr::Binary(a, BinaryOp::Div, b) => Value::Num( 486 | eval_expr(a, funcs, stack)?.num(a.1.clone())? 
487 | / eval_expr(b, funcs, stack)?.num(b.1.clone())?, 488 | ), 489 | Expr::Binary(a, BinaryOp::Eq, b) => { 490 | Value::Bool(eval_expr(a, funcs, stack)? == eval_expr(b, funcs, stack)?) 491 | } 492 | Expr::Binary(a, BinaryOp::NotEq, b) => { 493 | Value::Bool(eval_expr(a, funcs, stack)? != eval_expr(b, funcs, stack)?) 494 | } 495 | Expr::Call(func, (args, args_span)) => { 496 | let f = eval_expr(func, funcs, stack)?; 497 | match f { 498 | Value::Func(name) => { 499 | let f = &funcs[&name]; 500 | let mut stack = if f.args.len() != args.len() { 501 | return Err(Error { 502 | span: args_span.clone(), 503 | msg: format!("'{}' called with wrong number of arguments (expected {}, found {})", name, f.args.len(), args.len()), 504 | }); 505 | } else { 506 | f.args 507 | .iter() 508 | .zip(args.iter()) 509 | .map(|(name, arg)| Ok((name.clone(), eval_expr(arg, funcs, stack)?))) 510 | .collect::>()? 511 | }; 512 | eval_expr(&f.body, funcs, &mut stack)? 513 | } 514 | f => { 515 | return Err(Error { 516 | span: func.1.clone(), 517 | msg: format!("'{:?}' is not callable", f), 518 | }) 519 | } 520 | } 521 | } 522 | Expr::If(cond, a, b) => { 523 | let c = eval_expr(cond, funcs, stack)?; 524 | match c { 525 | Value::Bool(true) => eval_expr(a, funcs, stack)?, 526 | Value::Bool(false) => eval_expr(b, funcs, stack)?, 527 | c => { 528 | return Err(Error { 529 | span: cond.1.clone(), 530 | msg: format!("Conditions must be booleans, found '{:?}'", c), 531 | }) 532 | } 533 | } 534 | } 535 | Expr::Print(a) => { 536 | let val = eval_expr(a, funcs, stack)?; 537 | println!("{}", val); 538 | val 539 | } 540 | }) 541 | } 542 | 543 | fn main() { 544 | let src = fs::read_to_string(env::args().nth(1).expect("Expected file argument")) 545 | .expect("Failed to read file"); 546 | 547 | let (tokens, mut errs) = lexer().parse_recovery(src.as_str()); 548 | 549 | let parse_errs = if let Some(tokens) = tokens { 550 | // println!("Tokens = {:?}", tokens); 551 | let len = src.chars().count(); 552 | let (ast, parse_errs) = 553 | funcs_parser().parse_recovery(Stream::from_iter(len..len + 1, tokens.into_iter())); 554 | 555 | println!("{:#?}", ast); 556 | if let Some(funcs) = ast.filter(|_| errs.len() + parse_errs.len() == 0) { 557 | if let Some(main) = funcs.get("main") { 558 | assert_eq!(main.args.len(), 0); 559 | match eval_expr(&main.body, &funcs, &mut Vec::new()) { 560 | Ok(val) => println!("Return value: {}", val), 561 | Err(e) => errs.push(Simple::custom(e.span, e.msg)), 562 | } 563 | } else { 564 | panic!("No main function!"); 565 | } 566 | } 567 | 568 | parse_errs 569 | } else { 570 | Vec::new() 571 | }; 572 | 573 | errs.into_iter() 574 | .map(|e| e.map(|c| c.to_string())) 575 | .chain(parse_errs.into_iter().map(|e| e.map(|tok| tok.to_string()))) 576 | .for_each(|e| { 577 | let report = Report::build(ReportKind::Error, (), e.span().start); 578 | 579 | let report = match e.reason() { 580 | chumsky::error::SimpleReason::Unclosed { span, delimiter } => report 581 | .with_message(format!( 582 | "Unclosed delimiter {}", 583 | delimiter.fg(Color::Yellow) 584 | )) 585 | .with_label( 586 | Label::new(span.clone()) 587 | .with_message(format!( 588 | "Unclosed delimiter {}", 589 | delimiter.fg(Color::Yellow) 590 | )) 591 | .with_color(Color::Yellow), 592 | ) 593 | .with_label( 594 | Label::new(e.span()) 595 | .with_message(format!( 596 | "Must be closed before this {}", 597 | e.found() 598 | .unwrap_or(&"end of file".to_string()) 599 | .fg(Color::Red) 600 | )) 601 | .with_color(Color::Red), 602 | ), 603 | 
chumsky::error::SimpleReason::Unexpected => report 604 | .with_message(format!( 605 | "{}, expected {}", 606 | if e.found().is_some() { 607 | "Unexpected token in input" 608 | } else { 609 | "Unexpected end of input" 610 | }, 611 | if e.expected().len() == 0 { 612 | "something else".to_string() 613 | } else { 614 | e.expected() 615 | .map(|expected| match expected { 616 | Some(expected) => expected.to_string(), 617 | None => "end of input".to_string(), 618 | }) 619 | .collect::>() 620 | .join(", ") 621 | } 622 | )) 623 | .with_label( 624 | Label::new(e.span()) 625 | .with_message(format!( 626 | "Unexpected token {}", 627 | e.found() 628 | .unwrap_or(&"end of file".to_string()) 629 | .fg(Color::Red) 630 | )) 631 | .with_color(Color::Red), 632 | ), 633 | chumsky::error::SimpleReason::Custom(msg) => report.with_message(msg).with_label( 634 | Label::new(e.span()) 635 | .with_message(format!("{}", msg.fg(Color::Red))) 636 | .with_color(Color::Red), 637 | ), 638 | }; 639 | 640 | report.finish().print(Source::from(&src)).unwrap(); 641 | }); 642 | } 643 | -------------------------------------------------------------------------------- /tutorial.md: -------------------------------------------------------------------------------- 1 | # Chumsky: A Tutorial 2 | 3 | *Please note that this tutorial is kept up to date with the `master` branch and not the most stable release: small 4 | details may differ!* 5 | 6 | In this tutorial, we'll develop a parser (and interpreter!) for a programming language called 'Foo'. 7 | 8 | Foo is a simple language, but it's enough for us to have some fun. It isn't 9 | [Turing-complete](https://en.wikipedia.org/wiki/Turing_completeness), but it is complex enough to 10 | allow us to get to grips with parsing using Chumsky. Here's some sample code written in Foo: 11 | 12 | ``` 13 | let seven = 7; 14 | fn add x y = x + y; 15 | add(2, 3) * -seven 16 | ``` 17 | 18 | You can find the source code for the full interpreter in `examples/foo.rs` in the main repository. 19 | 20 | ## Setting up 21 | 22 | Create a new project with `cargo new --bin foo`, add the latest version of Chumsky as a dependency, and place 23 | the following in your `main.rs`: 24 | 25 | ```rust 26 | use chumsky::prelude::*; 27 | 28 | fn main() { 29 | let src = std::fs::read_to_string(std::env::args().nth(1).unwrap()).unwrap(); 30 | 31 | println!("{}", src); 32 | } 33 | ``` 34 | 35 | This code is quite simple: it treats the first command-line argument as a path, reads the corresponding file, 36 | then prints the contents to the terminal. 37 | 38 | Create a file named `test.foo` and run `cargo run -- test.foo` (the `--` tells cargo to pass the remaining 39 | arguments to the program instead of cargo itself). You should see that the contents of `test.foo`, if any, get 40 | printed to the console. 41 | 42 | Next, we'll create a data type that represents a program written in Foo. All programs in Foo are expressions, 43 | so we'll call it `Expr`. 44 | 45 | ```rust 46 | #[derive(Debug)] 47 | enum Expr { 48 | Num(f64), 49 | Var(String), 50 | 51 | Neg(Box), 52 | Add(Box, Box), 53 | Sub(Box, Box), 54 | Mul(Box, Box), 55 | Div(Box, Box), 56 | 57 | Call(String, Vec), 58 | Let { 59 | name: String, 60 | rhs: Box, 61 | then: Box, 62 | }, 63 | Fn { 64 | name: String, 65 | args: Vec, 66 | body: Box, 67 | then: Box, 68 | }, 69 | } 70 | ``` 71 | 72 | This is Foo's [Abstract Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST). 
It represents 73 | all possible Foo programs and is defined recursively in terms of itself (`Box` is used to avoid the type being 74 | infinitely large). Each expression may itself contain sub-expressions. 75 | 76 | We're also going to create a function that creates Foo's parser. Our parser takes in a `char` stream and 77 | produces an `Expr`, so we'll use those types for the `I` (input) and `O` (output) type parameters. 78 | 79 | ```rust 80 | fn parser() -> impl Parser<char, Expr, Error = Simple<char>> { 81 | // To be filled in later... 82 | } 83 | ``` 84 | 85 | The `Error` associated type allows us to customise the error type that Chumsky uses. For now, we'll stick to 86 | `Simple`, a built-in error type that does everything we need. 87 | 88 | In `main`, we'll alter the `println!` as follows: 89 | 90 | ```rust 91 | println!("{:?}", parser().parse(src)); 92 | ``` 93 | 94 | ## Parsing digits 95 | 96 | Chumsky is a 'parser combinator' library. It allows the creation of parsers by combining together many smaller 97 | parsers. The very smallest parsers are called 'primitives' and live in the 98 | [`primitive`](https://docs.rs/chumsky/latest/chumsky/primitive/index.html) module. 99 | 100 | We're going to want to start by parsing the simplest element of Foo's syntax: numbers. 101 | 102 | ```rust 103 | // In `parser`... 104 | filter(|c: &char| c.is_ascii_digit()) 105 | ``` 106 | 107 | The `filter` primitive allows us to read a single input and accept it if it passes a condition. In our case, 108 | that condition simply checks that the character is a digit. 109 | 110 | If we compile this code now, we'll encounter an error. Why? 111 | 112 | Although we promised that our parser would produce an `Expr`, the `filter` primitive only outputs the input 113 | it found. Right now, all we have is a parser from `char` to `char` instead of a parser from `char` to `Expr`! 114 | 115 | To solve this, we need to crack open the 'combinator' part of parser combinators. We'll use Chumsky's `map` 116 | method to convert the output of the parser to an `Expr`. This method is very similar to its namesake on 117 | `Iterator`. 118 | 119 | ```rust 120 | filter(|c: &char| c.is_ascii_digit()) 121 | .map(|c| Expr::Num(c.to_digit(10).unwrap() as f64)) 122 | ``` 123 | 124 | Here, we're converting the `char` digit to an `f64` (unwrapping is fine: `map` only gets applied to outputs 125 | that successfully parsed!) and then wrapping it in `Expr::Num(_)` to convert it to a Foo expression. 126 | 127 | Try running the code. You'll see that you can type a digit into `test.foo` and have our interpreter generate 128 | an AST like so: 129 | 130 | ``` 131 | Ok(Num(5.0)) 132 | ``` 133 | 134 | ## Parsing numbers 135 | 136 | If you're more than a little adventurous, you'll quickly notice that typing in a multi-digit number doesn't 137 | quite behave as expected. Inputting `42` will only produce a `Num(4.0)` AST. 138 | 139 | This is because `filter` only accepts a *single* input. But now another question arises: why did our interpreter 140 | *not* complain at the trailing digits that didn't get parsed? 141 | 142 | The answer is that Chumsky's parsers are *lazy*: they will consume all of the input that they can and then stop. 143 | If there's any trailing input, it'll be ignored. 144 | 145 | This is obviously not always desirable. If the user places random nonsense at the end of the file, we want to be 146 | able to generate an error about it!
Worse still, that 'nonsense' could be input the user intended to be part of 147 | the program, but that contained a syntax error and so was not properly parsed. How can we force the parser to consume 148 | all of the input? 149 | 150 | To do this, we can make use of two new parsers: the `then_ignore` combinator and the `end` primitive. 151 | 152 | ```rust 153 | filter(|c: &char| c.is_ascii_digit()) 154 | .map(|c| Expr::Num(c.to_digit(10).unwrap() as f64)) 155 | .then_ignore(end()) 156 | ``` 157 | 158 | The `then_ignore` combinator parses a second pattern after the first, but ignores its output in favour of that of the 159 | first. 160 | 161 | The `end` primitive succeeds if it encounters only the end of input. 162 | 163 | Combining these together, we now get an error for longer inputs. Unfortunately, this just reveals another problem 164 | (particularly if you're working on a Unix-like platform): any whitespace before or after our digit will upset our 165 | parser and trigger an error. 166 | 167 | We can handle whitespace by adding a call to `padded_by` (which ignores a given pattern before and after the first) 168 | after our digit parser, and a repeating filter for any whitespace characters. 169 | 170 | ```rust 171 | filter(|c: &char| c.is_ascii_digit()) 172 | .map(|c| Expr::Num(c.to_digit(10).unwrap() as f64)) 173 | .padded_by(filter(|c: &char| c.is_whitespace()).repeated()) 174 | .then_ignore(end()) 175 | ``` 176 | 177 | This example should have taught you a few important things about Chumsky's parsers: 178 | 179 | 1. Parsers are lazy: trailing input is ignored 180 | 181 | 2. Whitespace is not automatically ignored. Chumsky is a general-purpose parsing library, and some languages care very 182 | much about the structure of whitespace, so Chumsky does too 183 | 184 | ## Cleaning up and taking shortcuts 185 | 186 | At this point, things are starting to look a little messy. We've ended up writing 4 lines of code to properly parse a 187 | single digit. Let's clean things up a bit. We'll also make use of a bunch of text-based parser primitives that 188 | come with Chumsky to get rid of some of this cruft. 189 | 190 | ```rust 191 | let int = text::int(10) 192 | .map(|s: String| Expr::Num(s.parse().unwrap())) 193 | .padded(); 194 | 195 | int.then_ignore(end()) 196 | ``` 197 | 198 | That's better. We've also swapped out our custom digit parser with a built-in parser that parses any positive 199 | integer. 200 | 201 | ## Evaluating simple expressions 202 | 203 | We'll now take a diversion away from the parser to create a function that can evaluate our AST. This is the 'heart' of 204 | our interpreter and is the thing that actually performs the computation of programs. 205 | 206 | ```rust 207 | fn eval(expr: &Expr) -> Result { 208 | match expr { 209 | Expr::Num(x) => Ok(*x), 210 | Expr::Neg(a) => Ok(-eval(a)?), 211 | Expr::Add(a, b) => Ok(eval(a)? + eval(b)?), 212 | Expr::Sub(a, b) => Ok(eval(a)? - eval(b)?), 213 | Expr::Mul(a, b) => Ok(eval(a)? * eval(b)?), 214 | Expr::Div(a, b) => Ok(eval(a)? / eval(b)?), 215 | _ => todo!(), // We'll handle other cases later 216 | } 217 | } 218 | ``` 219 | 220 | This function is quite simple: it just recursively calls itself, evaluating each node of the AST until it has a final 221 | result. Any runtime errors simply get thrown back down the stack. 222 | 223 | We'll also change our `main` function a little so that we can pass our AST to `eval`. 
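Before we do, it's worth a quick sanity check that `eval` does what we expect on a hand-built AST. This snippet is purely illustrative (the expression `1 + 2 * 3` is made up for the example) and assumes the `Expr` and `eval` definitions above:

```rust
// Purely illustrative: evaluate a hand-built AST for `1 + 2 * 3`.
let ast = Expr::Add(
    Box::new(Expr::Num(1.0)),
    Box::new(Expr::Mul(
        Box::new(Expr::Num(2.0)),
        Box::new(Expr::Num(3.0)),
    )),
);
assert_eq!(eval(&ast), Ok(7.0));
```

With that in hand, here's the updated `main`: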
224 | 225 | ```rust 226 | fn main() { 227 | let src = std::fs::read_to_string(std::env::args().nth(1).unwrap()).unwrap(); 228 | 229 | match parser().parse(src) { 230 | Ok(ast) => match eval(&ast) { 231 | Ok(output) => println!("{}", output), 232 | Err(eval_err) => println!("Evaluation error: {}", eval_err), 233 | }, 234 | Err(parse_errs) => parse_errs 235 | .into_iter() 236 | .for_each(|e| println!("Parse error: {}", e)), 237 | } 238 | } 239 | ``` 240 | 241 | This looks like a big change, but it's actually quite simple. We're just taking the result of the parse, printing 242 | errors if they occured, or evaluating the AST otherwise. We'll allow for some evaluation operations to produce 243 | runtime errors later. 244 | 245 | ## Parsing unary operators 246 | 247 | Jumping back to our parser, let's handle unary operators. Currently, our only unary operator is `-`, the negation 248 | operator. We're looking to parse any number of `-`, followed by a number. More formally: 249 | 250 | ``` 251 | expr = op* + int 252 | ``` 253 | 254 | We'll also give our `int` parser a new name, 'atom', for reasons that will become clear later. 255 | 256 | ```rust 257 | let int = text::int(10) 258 | .map(|s: String| Expr::Num(s.parse().unwrap())) 259 | .padded(); 260 | 261 | let atom = int; 262 | 263 | let op = |c| just(c).padded(); 264 | 265 | let unary = op('-') 266 | .repeated() 267 | .then(atom) 268 | .foldr(|_op, rhs| Expr::Neg(Box::new(rhs))); 269 | 270 | unary.then_ignore(end()) 271 | ``` 272 | 273 | Here, we meet a few new combinators: 274 | 275 | - `repeated` will parse a given pattern any number of times (including zero!), collecting the outputs into a `Vec` 276 | 277 | - `then` will parse one pattern and then another immediately afterwards, collecting both outputs into a tuple pair 278 | 279 | - `foldr` will take an output of the form `(Vec, U)` and will fold it into a single `U` by repeatedly applying 280 | the given function to each element of the `Vec` 281 | 282 | This last combinator is worth a little more consideration. We're trying to parse *any number* of negation operators, 283 | followed by a single atom (for now, just a number). This might give us an output like this: 284 | 285 | ```rust 286 | (['-', '-', '-'], Num(42.0)) 287 | ``` 288 | 289 | The `foldr` function repeatedly applies the function to 'fold' the elements into a single element, like so: 290 | 291 | ``` 292 | ['-', '-', '-'], Num(42.0) 293 | | | | | 294 | | | \ / 295 | | | Neg(Num(42.0)) 296 | | | | 297 | | \ / 298 | | Neg(Neg(Num(42.0))) 299 | | | 300 | \ / 301 | Neg(Neg(Neg(Num(42.0)))) 302 | ``` 303 | 304 | This may be a little hard to conceptualise for those used to imperative programming, but for functional programmers 305 | it should come naturally: `foldr` is just equivalent to `reduce`! 306 | 307 | Give the interpreter a try. You'll be able to enter inputs as before, but also values like `-17`. You can even apply 308 | the negation operator multiple times: `--9` will yield a value of `9` in the command line. 309 | 310 | This is exciting: we've finally started to see our interpreter perform useful (sort of) computations! 311 | 312 | ## Parsing binary operators 313 | 314 | Let's keep the momentum going and move over to binary operators. Traditionally, these pose quite a problem for 315 | parsers. To parse an expression like `3 + 4 * 2`, it's necessary to understand that multiplication 316 | [binds more eagerly than addition](https://en.wikipedia.org/wiki/Order_of_operations) and hence is applied first. 
317 | Therefore, the result of this expression is `11` and not `14`. 318 | 319 | Parsers employ a range of strategies to handle these cases, but for Chumsky things are simple: the most eagerly binding 320 | (highest 'precedence') operators should be those that get considered first when parsing. 321 | 322 | It's worth noting that summation operators (`+` and `-`) are typically considered to have the *same* precedence as 323 | one-another. The same also applies to product operators (`*` and `/`). For this reason, we treat each group as a single 324 | pattern. 325 | 326 | At each stage, we're looking for a simple pattern: a unary expression, followed by any number of combinations of an 327 | operator and a unary expression. More formally: 328 | 329 | ``` 330 | expr = unary + (op + unary)* 331 | ``` 332 | 333 | Let's expand our parser. 334 | 335 | ```rust 336 | let int = text::int(10) 337 | .map(|s: String| Expr::Num(s.parse().unwrap())) 338 | .padded(); 339 | 340 | let atom = int; 341 | 342 | let op = |c| just(c).padded(); 343 | 344 | let unary = op('-') 345 | .repeated() 346 | .then(atom) 347 | .foldr(|_op, rhs| Expr::Neg(Box::new(rhs))); 348 | 349 | let product = unary.clone() 350 | .then(op('*').to(Expr::Mul as fn(_, _) -> _) 351 | .or(op('/').to(Expr::Div as fn(_, _) -> _)) 352 | .then(unary) 353 | .repeated()) 354 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 355 | 356 | let sum = product.clone() 357 | .then(op('+').to(Expr::Add as fn(_, _) -> _) 358 | .or(op('-').to(Expr::Sub as fn(_, _) -> _)) 359 | .then(product) 360 | .repeated()) 361 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 362 | 363 | sum.then_ignore(end()) 364 | ``` 365 | 366 | The `Expr::Mul as fn(_, _) -> _` syntax might look a little unfamiliar, but don't worry! In Rust, 367 | [tuple enum variants are implicitly functions](https://stackoverflow.com/questions/54802045/what-is-this-strange-syntax-where-an-enum-variant-is-used-as-a-function). 368 | All we're doing here is making sure that Rust treats each of them as if they had the same type using the `as` cast, and 369 | then letting type inference do the rest. Those functions then get passed through the internals of the parser and end up 370 | in `op` within the `foldl` call. 371 | 372 | Another three combinators are introduced here: 373 | 374 | - `or` attempts to parse a pattern and, if unsuccessful, instead attempts another pattern 375 | 376 | - `to` is similar to `map`, but instead of mapping the output, entirely overrides the output with a new value. In our 377 | case, we use it to convert each binary operator to a function that produces the relevant AST node for that operator. 378 | 379 | - `foldl` is very similar to `foldr` in the last section but, instead of operating on a `(Vec<_>, _)`, it operates 380 | upon a `(_, Vec<_>)`, going forwards to combine values together with the function 381 | 382 | Give the interpreter a try. You should find that the interpreter can correctly handle both unary and binary operations 383 | combined in arbitrary configurations, respecting precedence. You can use it as a calculator! 384 | 385 | ## Parsing parentheses 386 | 387 | A new challenger approaches: *nested expressions*. Sometimes, we want to override the default operator precedence rules 388 | entirely. We can do this by nesting expressions within parentheses, like `(3 + 4) * 2`. How do we handle this?
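Informally, and in the same loose notation we've been using for grammars, the answer is to make a parenthesised expression just another kind of atom:

```
atom = int | '(' + expr + ')'
```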
389 | 390 | The creation of the `atom` pattern a few sections before was no accident: parentheses have a greater precedence than 391 | any operator, so we should treat a parenthesised expression as if it were equivalent to a single value. We call things 392 | that behave like single values 'atoms' by convention. 393 | 394 | We're going to hoist our entire parser up into a closure, allowing us to define it in terms of itself. 395 | 396 | ```rust 397 | recursive(|expr| { 398 | let int = text::int(10) 399 | .map(|s: String| Expr::Num(s.parse().unwrap())) 400 | .padded(); 401 | 402 | let atom = int 403 | .or(expr.delimited_by(just('('), just(')'))); 404 | 405 | let op = |c| just(c).padded(); 406 | 407 | let unary = op('-') 408 | .repeated() 409 | .then(atom) 410 | .foldr(|_op, rhs| Expr::Neg(Box::new(rhs))); 411 | 412 | let product = unary.clone() 413 | .then(op('*').to(Expr::Mul as fn(_, _) -> _) 414 | .or(op('/').to(Expr::Div as fn(_, _) -> _)) 415 | .then(unary) 416 | .repeated()) 417 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 418 | 419 | let sum = product.clone() 420 | .then(op('+').to(Expr::Add as fn(_, _) -> _) 421 | .or(op('-').to(Expr::Sub as fn(_, _) -> _)) 422 | .then(product) 423 | .repeated()) 424 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 425 | 426 | sum.padded() 427 | }) 428 | .then_ignore(end()) 429 | ``` 430 | 431 | There are a few things worth paying attention to here. 432 | 433 | 1. `recursive` allows us to define a parser recursively in terms of itself by giving us a copy of it within the 434 | closure's scope 435 | 436 | 2. We use the recursive definition of `expr` within the definition of `atom`. We use the new `delimited_by` combinator 437 | to allow it to sit nested within a pair of parentheses 438 | 439 | 3. The `then_ignore(end())` call has *not* been hoisted inside the `recursive` call. This is because we only want to 440 | parse an end of input on the outermost expression, not at every level of nesting 441 | 442 | Try running the interpreter. You'll find that it can handle a surprising number of cases elegantly. Make sure that the 443 | following cases work correctly: 444 | 445 | | Expression | Expected result | 446 | |---------------|-----------------| 447 | | `3 * 4 + 2` | `14` | 448 | | `3 * (4 + 2)` | `18` | 449 | | `-4 + 2` | `-2` | 450 | | `-(4 + 2)` | `-6` | 451 | 452 | ## Parsing lets 453 | 454 | Our next step is to handle `let`. Unlike Rust and other imperative languages, `let` in Foo is an expression and not a 455 | statement (Foo has no statements). It takes the following form: 456 | 457 | ``` 458 | let <name> = <expr>; 459 | ``` 460 | 461 | We only want `let`s to appear at the outermost level of the expression, so we leave it out of the original recursive 462 | expression definition. However, we also want to be able to chain `let`s together, so we put them in their own recursive 463 | definition. We call it `decl` ('declaration') because we're eventually going to be adding `fn` syntax too.
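In the same informal notation as before, the shape we're aiming for is roughly:

```
decl = let | expr
let  = "let" + ident + "=" + expr + ";" + decl
```

The Rust below follows this structure directly.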
464 | 465 | ```rust 466 | let ident = text::ident() 467 | .padded(); 468 | 469 | let expr = recursive(|expr| { 470 | let int = text::int(10) 471 | .map(|s: String| Expr::Num(s.parse().unwrap())) 472 | .padded(); 473 | 474 | let atom = int 475 | .or(expr.delimited_by(just('('), just(')'))) 476 | .or(ident.map(Expr::Var)); 477 | 478 | let op = |c| just(c).padded(); 479 | 480 | let unary = op('-') 481 | .repeated() 482 | .then(atom) 483 | .foldr(|_op, rhs| Expr::Neg(Box::new(rhs))); 484 | 485 | let product = unary.clone() 486 | .then(op('*').to(Expr::Mul as fn(_, _) -> _) 487 | .or(op('/').to(Expr::Div as fn(_, _) -> _)) 488 | .then(unary) 489 | .repeated()) 490 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 491 | 492 | let sum = product.clone() 493 | .then(op('+').to(Expr::Add as fn(_, _) -> _) 494 | .or(op('-').to(Expr::Sub as fn(_, _) -> _)) 495 | .then(product) 496 | .repeated()) 497 | .foldl(|lhs, (op, rhs)| op(Box::new(lhs), Box::new(rhs))); 498 | 499 | sum.padded() 500 | }); 501 | 502 | let decl = recursive(|decl| { 503 | let r#let = text::keyword("let") 504 | .ignore_then(ident) 505 | .then_ignore(just('=')) 506 | .then(expr.clone()) 507 | .then_ignore(just(';')) 508 | .then(decl) 509 | .map(|((name, rhs), then)| Expr::Let { 510 | name, 511 | rhs: Box::new(rhs), 512 | then: Box::new(then), 513 | }); 514 | 515 | r#let 516 | // Must be later in the chain than `r#let` to avoid ambiguity 517 | .or(expr) 518 | .padded() 519 | }); 520 | 521 | decl 522 | .then_ignore(end()) 523 | ``` 524 | 525 | `keyword` is simply a parser that looks for an exact identifier (i.e: it doesn't match identifiers that only start with 526 | a keyword). 527 | 528 | Other than that, there's nothing in the definition of `r#let` that you haven't seen before: familiar combinators, but 529 | combined in different ways. It selectively ignores parts of the syntax that we don't care about after validating that 530 | it exists, then uses those elements that it does care about to create an `Expr::Let` AST node. 531 | 532 | Another thing to note is that the definition of `ident` will parse `"let"`. To avoid the parser accidentally deciding 533 | that `"let"` is a variable, we place `r#let` earlier in the or chain than `expr` so that it prioritises the correct 534 | interpretation. As mentioned in previous sections, Chumsky handles ambiguity simply by choosing the first successful 535 | parse it encounters, so making sure that we declare things in the right order can sometimes be important. 536 | 537 | You should now be able to run the interpreter and have it accept an input such as 538 | 539 | ``` 540 | let five = 5; 541 | five * 3 542 | ``` 543 | 544 | Unfortunately, the `eval` function will panic because we've not yet handled `Expr::Var` or `Expr::Let`. Let's do that 545 | now. 546 | 547 | ```rust 548 | fn eval<'a>(expr: &'a Expr, vars: &mut Vec<(&'a String, f64)>) -> Result { 549 | match expr { 550 | Expr::Num(x) => Ok(*x), 551 | Expr::Neg(a) => Ok(-eval(a, vars)?), 552 | Expr::Add(a, b) => Ok(eval(a, vars)? + eval(b, vars)?), 553 | Expr::Sub(a, b) => Ok(eval(a, vars)? - eval(b, vars)?), 554 | Expr::Mul(a, b) => Ok(eval(a, vars)? * eval(b, vars)?), 555 | Expr::Div(a, b) => Ok(eval(a, vars)? 
/ eval(b, vars)?), 556 | Expr::Var(name) => if let Some((_, val)) = vars.iter().rev().find(|(var, _)| *var == name) { 557 | Ok(*val) 558 | } else { 559 | Err(format!("Cannot find variable `{}` in scope", name)) 560 | }, 561 | Expr::Let { name, rhs, then } => { 562 | let rhs = eval(rhs, vars)?; 563 | vars.push((name, rhs)); 564 | let output = eval(then, vars); 565 | vars.pop(); 566 | output 567 | }, 568 | _ => todo!(), 569 | } 570 | } 571 | ``` 572 | 573 | Woo! That got a bit more complicated. Don't fear, there are only 3 important changes: 574 | 575 | 1. Because we need to keep track of variables that were previously defined, we use a `Vec` to remember them. Because 576 | `eval` is a recursive function, we also need to pass is to all recursive calls. 577 | 578 | 2. When we encounter an `Expr::Let`, we first evaluate the right-hand side (`rhs`). Once evaluated, we push it to the 579 | `vars` stack and evaluate the trailing `then` expression (i.e: all of the remaining code that appears after the 580 | semicolon). Popping it afterwards is not *technically* necessary because Foo does not permit nested declarations, 581 | but we do it anyway because it's good practice and it's what we'd want to do if we ever decided to add nesting. 582 | 583 | 3. When we encounter an `Expr::Var` (i.e: an inline variable) we search the stack *backwards* (because Foo permits 584 | [variable shadowing](https://en.wikipedia.org/wiki/Variable_shadowing) and we only want to find the most recently 585 | declared variable with the same name) to find the variables's value. If we can't find a variable of that name, we 586 | generate a runtime error which gets propagated back up the stack. 587 | 588 | Obviously, the signature of `eval` has changed so we'll update the call in `main` to become: 589 | 590 | ```rust 591 | eval(&ast, &mut Vec::new()) 592 | ``` 593 | 594 | Make sure to test the interpreter. Try experimenting with `let` declarations to make sure things aren't broken. In 595 | particular, it's worth testing variable shadowing by ensuring that the following program produces `8`: 596 | 597 | ``` 598 | let x = 5; 599 | let x = 3 + x; 600 | x 601 | ``` 602 | 603 | ## Parsing functions 604 | 605 | We're almost at a complete implementation of Foo. There's just one thing left: *functions*. 606 | 607 | Surprisingly, parsing functions is the easy part. All we need to modify is the definition of `decl` to add `r#fn`. It 608 | looks very much like the existing definition of `r#let`: 609 | 610 | ```rust 611 | let decl = recursive(|decl| { 612 | let r#let = text::keyword("let") 613 | .ignore_then(ident) 614 | .then_ignore(just('=')) 615 | .then(expr.clone()) 616 | .then_ignore(just(';')) 617 | .then(decl.clone()) 618 | .map(|((name, rhs), then)| Expr::Let { 619 | name, 620 | rhs: Box::new(rhs), 621 | then: Box::new(then), 622 | }); 623 | 624 | let r#fn = text::keyword("fn") 625 | .ignore_then(ident) 626 | .then(ident.repeated()) 627 | .then_ignore(just('=')) 628 | .then(expr.clone()) 629 | .then_ignore(just(';')) 630 | .then(decl) 631 | .map(|(((name, args), body), then)| Expr::Fn { 632 | name, 633 | args, 634 | body: Box::new(body), 635 | then: Box::new(then), 636 | }); 637 | 638 | r#let 639 | .or(r#fn) 640 | .or(expr) 641 | .padded() 642 | }); 643 | ``` 644 | 645 | There's nothing new here, you understand this all already. 
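At this point the parser will happily accept a function declaration, even though we can't *call* the function yet. For example, the following now parses (though evaluating it will still hit the `todo!()` arm of `eval` until we extend `eval` below):

```
fn add x y = x + y;
3 + 4
```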
646 | 647 | Obviously, we also need to add support for *calling* functions by modifying `atom`: 648 | 649 | ```rust 650 | let call = ident 651 | .then(expr.clone() 652 | .separated_by(just(',')) 653 | .allow_trailing() // Foo is Rust-like, so allow trailing commas to appear in arg lists 654 | .delimited_by(just('('), just(')'))) 655 | .map(|(f, args)| Expr::Call(f, args)); 656 | 657 | let atom = int 658 | .or(expr.delimited_by(just('('), just(')'))) 659 | .or(call) 660 | .or(ident.map(Expr::Var)); 661 | ``` 662 | 663 | The only new combinator here is `separated_by` which behaves like `repeated`, but requires a separator pattern between 664 | each element. It has a method called `allow_trailing` which allows for parsing a trailing separator at the end of the 665 | elements. 666 | 667 | Next, we modify our `eval` function to support a function stack. 668 | 669 | ```rust 670 | fn eval<'a>( 671 | expr: &'a Expr, 672 | vars: &mut Vec<(&'a String, f64)>, 673 | funcs: &mut Vec<(&'a String, &'a [String], &'a Expr)>, 674 | ) -> Result { 675 | match expr { 676 | Expr::Num(x) => Ok(*x), 677 | Expr::Neg(a) => Ok(-eval(a, vars, funcs)?), 678 | Expr::Add(a, b) => Ok(eval(a, vars, funcs)? + eval(b, vars, funcs)?), 679 | Expr::Sub(a, b) => Ok(eval(a, vars, funcs)? - eval(b, vars, funcs)?), 680 | Expr::Mul(a, b) => Ok(eval(a, vars, funcs)? * eval(b, vars, funcs)?), 681 | Expr::Div(a, b) => Ok(eval(a, vars, funcs)? / eval(b, vars, funcs)?), 682 | Expr::Var(name) => if let Some((_, val)) = vars.iter().rev().find(|(var, _)| *var == name) { 683 | Ok(*val) 684 | } else { 685 | Err(format!("Cannot find variable `{}` in scope", name)) 686 | }, 687 | Expr::Let { name, rhs, then } => { 688 | let rhs = eval(rhs, vars, funcs)?; 689 | vars.push((name, rhs)); 690 | let output = eval(then, vars, funcs); 691 | vars.pop(); 692 | output 693 | }, 694 | Expr::Call(name, args) => if let Some((_, arg_names, body)) = funcs 695 | .iter() 696 | .rev() 697 | .find(|(var, _, _)| *var == name) 698 | .copied() 699 | { 700 | if arg_names.len() == args.len() { 701 | let mut args = args 702 | .iter() 703 | .map(|arg| eval(arg, vars, funcs)) 704 | .zip(arg_names.iter()) 705 | .map(|(val, name)| Ok((name, val?))) 706 | .collect::>()?; 707 | vars.append(&mut args); 708 | let output = eval(body, vars, funcs); 709 | vars.truncate(vars.len() - args.len()); 710 | output 711 | } else { 712 | Err(format!( 713 | "Wrong number of arguments for function `{}`: expected {}, found {}", 714 | name, 715 | arg_names.len(), 716 | args.len(), 717 | )) 718 | } 719 | } else { 720 | Err(format!("Cannot find function `{}` in scope", name)) 721 | }, 722 | Expr::Fn { name, args, body, then } => { 723 | funcs.push((name, args, body)); 724 | let output = eval(then, vars, funcs); 725 | funcs.pop(); 726 | output 727 | }, 728 | } 729 | } 730 | ``` 731 | 732 | Another big change! On closer inspection, however, this looks a lot like the change we made previously when we added 733 | support for `let` declarations. Whenever we encounter an `Expr::Fn`, we just push the function to the `funcs` stack and 734 | continue. Whenever we encounter an `Expr::Call`, we search the function stack backwards, as we did for variables, and 735 | then execute the body of the function (making sure to evaluate and push the arguments!). 736 | 737 | As before, we'll need to change the `eval` call in `main` to: 738 | 739 | ```rust 740 | eval(&ast, &mut Vec::new(), &mut Vec::new()) 741 | ``` 742 | 743 | Give the interpreter a test - see what you can do with it! 
Here's an example program to get you started: 744 | 745 | ``` 746 | let five = 5; 747 | let eight = 3 + five; 748 | fn add x y = x + y; 749 | add(five, eight) 750 | ``` 751 | 752 | ## Conclusion 753 | 754 | Here ends our exploration into Chumsky's API. We only scratched the surface of what Chumsky can do, but now you'll need 755 | to rely on the examples in the repository and the API doc examples for further help. Nonetheless, I hope it was an 756 | interesting foray into the use of parser combinators for the development of parsers. 757 | 758 | If nothing else, you've now got a neat little calculator language to play with. 759 | 760 | Interestingly, there is a subtle bug in Foo's `eval` function that produces unexpected scoping behaviour with function 761 | calls. I'll leave finding it as an exercise for the reader. 762 | 763 | ## Extension tasks 764 | 765 | - Find the interesting function scoping bug and consider how it could be fixed 766 | 767 | - Split token lexing into a separate compilation stage to avoid the need for `.padded()` in the parser 768 | 769 | - Add more operators 770 | 771 | - Add an `if then else ` ternary operator 772 | 773 | - Add values of different types by turning `f64` into an `enum` 774 | 775 | - Add lambdas to the language 776 | 777 | - Format the error message in a more useful way, perhaps by providing a reference to the original code 778 | -------------------------------------------------------------------------------- /src/primitive.rs: -------------------------------------------------------------------------------- 1 | //! Parser primitives that accept specific token patterns. 2 | //! 3 | //! *“These creatures you call mice, you see, they are not quite as they appear. They are merely the protrusion into 4 | //! our dimension of vastly hyperintelligent pandimensional beings.”* 5 | //! 6 | //! Chumsky parsers are created by combining together smaller parsers. Right at the bottom of the pile are the parser 7 | //! primitives, a parser developer's bread & butter. Each of these primitives are very easy to understand in isolation, 8 | //! usually only doing one thing. 9 | //! 10 | //! ## The Important Ones 11 | //! 12 | //! - [`just`]: parses a specific input or sequence of inputs 13 | //! - [`filter`]: parses a single input, if the given filter function returns `true` 14 | //! - [`end`]: parses the end of input (i.e: if there any more inputs, this parse fails) 15 | 16 | use super::*; 17 | 18 | /// See [`custom`]. 19 | pub struct Custom(F, PhantomData); 20 | 21 | impl Copy for Custom {} 22 | impl Clone for Custom { 23 | fn clone(&self) -> Self { 24 | Self(self.0.clone(), PhantomData) 25 | } 26 | } 27 | 28 | impl) -> PResult, E: Error> Parser 29 | for Custom 30 | { 31 | type Error = E; 32 | 33 | fn parse_inner( 34 | &self, 35 | _debugger: &mut D, 36 | stream: &mut StreamOf, 37 | ) -> PResult { 38 | (self.0)(stream) 39 | } 40 | 41 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 42 | #[allow(deprecated)] 43 | self.parse_inner(d, s) 44 | } 45 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 46 | #[allow(deprecated)] 47 | self.parse_inner(d, s) 48 | } 49 | } 50 | 51 | /// A parser primitive that allows you to define your own custom parsers. 52 | /// 53 | /// In theory you shouldn't need to use this unless you have particularly bizarre requirements, but it's a cleaner and 54 | //// more sustainable alternative to implementing [`Parser`] by hand. 
55 | /// 56 | /// The output type of this parser is determined by the parse result of the function. 57 | pub fn custom(f: F) -> Custom { 58 | Custom(f, PhantomData) 59 | } 60 | 61 | /// See [`end`]. 62 | pub struct End(PhantomData); 63 | 64 | impl Clone for End { 65 | fn clone(&self) -> Self { 66 | Self(PhantomData) 67 | } 68 | } 69 | 70 | impl> Parser for End { 71 | type Error = E; 72 | 73 | fn parse_inner( 74 | &self, 75 | _debugger: &mut D, 76 | stream: &mut StreamOf, 77 | ) -> PResult { 78 | match stream.next() { 79 | (_, _, None) => (Vec::new(), Ok(((), None))), 80 | (at, span, found) => ( 81 | Vec::new(), 82 | Err(Located::at( 83 | at, 84 | E::expected_input_found(span, Some(None), found), 85 | )), 86 | ), 87 | } 88 | } 89 | 90 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 91 | #[allow(deprecated)] 92 | self.parse_inner(d, s) 93 | } 94 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 95 | #[allow(deprecated)] 96 | self.parse_inner(d, s) 97 | } 98 | } 99 | 100 | /// A parser that accepts only the end of input. 101 | /// 102 | /// This parser is very useful when you wish to force a parser to consume *all* of the input. It is typically combined 103 | /// with [`Parser::then_ignore`]. 104 | /// 105 | /// The output type of this parser is `()`. 106 | /// 107 | /// # Examples 108 | /// 109 | /// ``` 110 | /// # use chumsky::prelude::*; 111 | /// assert_eq!(end::>().parse(""), Ok(())); 112 | /// assert!(end::>().parse("hello").is_err()); 113 | /// ``` 114 | /// 115 | /// ``` 116 | /// # use chumsky::prelude::*; 117 | /// let digits = text::digits::<_, Simple>(10); 118 | /// 119 | /// // This parser parses digits! 120 | /// assert_eq!(digits.parse("1234"), Ok("1234".to_string())); 121 | /// 122 | /// // However, parsers are lazy and do not consume trailing input. 123 | /// // This can be inconvenient if we want to validate all of the input. 124 | /// assert_eq!(digits.parse("1234AhasjADSJAlaDJKSDAK"), Ok("1234".to_string())); 125 | /// 126 | /// // To fix this problem, we require that the end of input follows any successfully parsed input 127 | /// let only_digits = digits.then_ignore(end()); 128 | /// 129 | /// // Now our parser correctly produces an error if any trailing input is found... 130 | /// assert!(only_digits.parse("1234AhasjADSJAlaDJKSDAK").is_err()); 131 | /// // ...while still behaving correctly for inputs that only consist of valid patterns 132 | /// assert_eq!(only_digits.parse("1234"), Ok("1234".to_string())); 133 | /// ``` 134 | pub fn end() -> End { 135 | End(PhantomData) 136 | } 137 | 138 | mod private { 139 | pub trait Sealed {} 140 | 141 | impl Sealed for T {} 142 | impl Sealed for alloc::string::String {} 143 | impl<'a> Sealed for &'a str {} 144 | impl<'a, T> Sealed for &'a [T] {} 145 | impl Sealed for [T; N] {} 146 | impl<'a, T, const N: usize> Sealed for &'a [T; N] {} 147 | impl Sealed for alloc::vec::Vec {} 148 | impl Sealed for alloc::collections::LinkedList {} 149 | impl Sealed for alloc::collections::VecDeque {} 150 | impl Sealed for alloc::collections::BTreeSet {} 151 | impl Sealed for alloc::collections::BinaryHeap {} 152 | 153 | #[cfg(feature = "std")] 154 | impl Sealed for std::collections::HashSet {} 155 | #[cfg(not(feature = "std"))] 156 | impl Sealed for hashbrown::HashSet {} 157 | } 158 | 159 | /// A utility trait to abstract over linear container-like things. 160 | /// 161 | /// This trait is likely to change in future versions of the crate, so avoid implementing it yourself. 
162 | pub trait Container: private::Sealed { 163 | /// An iterator over the items within this container, by value. 164 | type Iter: Iterator; 165 | /// Iterate over the elements of the container (using internal iteration because GATs are unstable). 166 | fn get_iter(&self) -> Self::Iter; 167 | } 168 | 169 | impl Container for T { 170 | type Iter = core::iter::Once; 171 | fn get_iter(&self) -> Self::Iter { 172 | core::iter::once(self.clone()) 173 | } 174 | } 175 | 176 | impl Container for String { 177 | type Iter = alloc::vec::IntoIter; 178 | fn get_iter(&self) -> Self::Iter { 179 | self.chars().collect::>().into_iter() 180 | } 181 | } 182 | 183 | impl<'a> Container for &'a str { 184 | type Iter = alloc::str::Chars<'a>; 185 | fn get_iter(&self) -> Self::Iter { 186 | self.chars() 187 | } 188 | } 189 | 190 | impl<'a, T: Clone> Container for &'a [T] { 191 | type Iter = core::iter::Cloned>; 192 | fn get_iter(&self) -> Self::Iter { 193 | self.iter().cloned() 194 | } 195 | } 196 | 197 | impl<'a, T: Clone, const N: usize> Container for &'a [T; N] { 198 | type Iter = core::iter::Cloned>; 199 | fn get_iter(&self) -> Self::Iter { 200 | self.iter().cloned() 201 | } 202 | } 203 | 204 | impl Container for [T; N] { 205 | type Iter = core::array::IntoIter; 206 | fn get_iter(&self) -> Self::Iter { 207 | core::array::IntoIter::new(self.clone()) 208 | } 209 | } 210 | 211 | impl Container for Vec { 212 | type Iter = alloc::vec::IntoIter; 213 | fn get_iter(&self) -> Self::Iter { 214 | self.clone().into_iter() 215 | } 216 | } 217 | 218 | impl Container for alloc::collections::LinkedList { 219 | type Iter = alloc::collections::linked_list::IntoIter; 220 | fn get_iter(&self) -> Self::Iter { 221 | self.clone().into_iter() 222 | } 223 | } 224 | 225 | impl Container for alloc::collections::VecDeque { 226 | type Iter = alloc::collections::vec_deque::IntoIter; 227 | fn get_iter(&self) -> Self::Iter { 228 | self.clone().into_iter() 229 | } 230 | } 231 | 232 | #[cfg(feature = "std")] 233 | impl Container for std::collections::HashSet { 234 | type Iter = std::collections::hash_set::IntoIter; 235 | fn get_iter(&self) -> Self::Iter { 236 | self.clone().into_iter() 237 | } 238 | } 239 | 240 | #[cfg(not(feature = "std"))] 241 | impl Container for hashbrown::HashSet { 242 | type Iter = hashbrown::hash_set::IntoIter; 243 | fn get_iter(&self) -> Self::Iter { 244 | self.clone().into_iter() 245 | } 246 | } 247 | 248 | impl Container for alloc::collections::BTreeSet { 249 | type Iter = alloc::collections::btree_set::IntoIter; 250 | fn get_iter(&self) -> Self::Iter { 251 | self.clone().into_iter() 252 | } 253 | } 254 | 255 | impl Container for alloc::collections::BinaryHeap { 256 | type Iter = alloc::collections::binary_heap::IntoIter; 257 | fn get_iter(&self) -> Self::Iter { 258 | self.clone().into_iter() 259 | } 260 | } 261 | 262 | /// See [`just`]. 
263 | pub struct Just, E>(C, PhantomData<(I, E)>); 264 | 265 | impl, E> Copy for Just {} 266 | impl, E> Clone for Just { 267 | fn clone(&self) -> Self { 268 | Self(self.0.clone(), PhantomData) 269 | } 270 | } 271 | 272 | impl + Clone, E: Error> Parser for Just { 273 | type Error = E; 274 | 275 | fn parse_inner( 276 | &self, 277 | _debugger: &mut D, 278 | stream: &mut StreamOf, 279 | ) -> PResult { 280 | for expected in self.0.get_iter() { 281 | match stream.next() { 282 | (_, _, Some(tok)) if tok == expected => {} 283 | (at, span, found) => { 284 | return ( 285 | Vec::new(), 286 | Err(Located::at( 287 | at, 288 | E::expected_input_found(span, Some(Some(expected)), found), 289 | )), 290 | ) 291 | } 292 | } 293 | } 294 | 295 | (Vec::new(), Ok((self.0.clone(), None))) 296 | } 297 | 298 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 299 | #[allow(deprecated)] 300 | self.parse_inner(d, s) 301 | } 302 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 303 | #[allow(deprecated)] 304 | self.parse_inner(d, s) 305 | } 306 | } 307 | 308 | /// A parser that accepts only the given input. 309 | /// 310 | /// The output type of this parser is `C`, the input or sequence that was provided. 311 | /// 312 | /// # Examples 313 | /// 314 | /// ``` 315 | /// # use chumsky::{prelude::*, error::Cheap}; 316 | /// let question = just::<_, _, Cheap>('?'); 317 | /// 318 | /// assert_eq!(question.parse("?"), Ok('?')); 319 | /// assert!(question.parse("!").is_err()); 320 | /// // This works because parsers do not eagerly consume input, so the '!' is not parsed 321 | /// assert_eq!(question.parse("?!"), Ok('?')); 322 | /// // This fails because the parser expects an end to the input after the '?' 323 | /// assert!(question.then(end()).parse("?!").is_err()); 324 | /// ``` 325 | pub fn just, E: Error>(inputs: C) -> Just { 326 | Just(inputs, PhantomData) 327 | } 328 | 329 | /// See [`seq`]. 330 | pub struct Seq(Vec, PhantomData); 331 | 332 | impl Clone for Seq { 333 | fn clone(&self) -> Self { 334 | Self(self.0.clone(), PhantomData) 335 | } 336 | } 337 | 338 | impl> Parser for Seq { 339 | type Error = E; 340 | 341 | fn parse_inner( 342 | &self, 343 | _debugger: &mut D, 344 | stream: &mut StreamOf, 345 | ) -> PResult { 346 | for expected in &self.0 { 347 | match stream.next() { 348 | (_, _, Some(tok)) if &tok == expected => {} 349 | (at, span, found) => { 350 | return ( 351 | Vec::new(), 352 | Err(Located::at( 353 | at, 354 | E::expected_input_found(span, Some(Some(expected.clone())), found), 355 | )), 356 | ) 357 | } 358 | } 359 | } 360 | 361 | (Vec::new(), Ok(((), None))) 362 | } 363 | 364 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 365 | #[allow(deprecated)] 366 | self.parse_inner(d, s) 367 | } 368 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 369 | #[allow(deprecated)] 370 | self.parse_inner(d, s) 371 | } 372 | } 373 | 374 | /// A parser that accepts only a sequence of specific inputs. 375 | /// 376 | /// The output type of this parser is `()`. 
377 | /// 378 | /// # Examples 379 | /// 380 | /// ``` 381 | /// # use chumsky::{prelude::*, error::Cheap}; 382 | /// let hello = seq::<_, _, Cheap>("Hello".chars()); 383 | /// 384 | /// assert_eq!(hello.parse("Hello"), Ok(())); 385 | /// assert_eq!(hello.parse("Hello, world!"), Ok(())); 386 | /// assert!(hello.parse("Goodbye").is_err()); 387 | /// 388 | /// let onetwothree = seq::<_, _, Cheap>([1, 2, 3]); 389 | /// 390 | /// assert_eq!(onetwothree.parse([1, 2, 3]), Ok(())); 391 | /// assert_eq!(onetwothree.parse([1, 2, 3, 4, 5]), Ok(())); 392 | /// assert!(onetwothree.parse([2, 1, 3]).is_err()); 393 | /// ``` 394 | #[deprecated( 395 | since = "0.7.0", 396 | note = "Use `just` instead: it now works for many sequence-like types!" 397 | )] 398 | pub fn seq, E>(xs: Iter) -> Seq { 399 | Seq(xs.into_iter().collect(), PhantomData) 400 | } 401 | 402 | /// See [`one_of`]. 403 | pub struct OneOf(C, PhantomData<(I, E)>); 404 | 405 | impl Clone for OneOf { 406 | fn clone(&self) -> Self { 407 | Self(self.0.clone(), PhantomData) 408 | } 409 | } 410 | 411 | impl, E: Error> Parser for OneOf { 412 | type Error = E; 413 | 414 | fn parse_inner( 415 | &self, 416 | _debugger: &mut D, 417 | stream: &mut StreamOf, 418 | ) -> PResult { 419 | match stream.next() { 420 | (_, _, Some(tok)) if self.0.get_iter().any(|not| not == tok) => { 421 | (Vec::new(), Ok((tok, None))) 422 | } 423 | (at, span, found) => ( 424 | Vec::new(), 425 | Err(Located::at( 426 | at, 427 | E::expected_input_found(span, self.0.get_iter().map(Some), found), 428 | )), 429 | ), 430 | } 431 | } 432 | 433 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 434 | #[allow(deprecated)] 435 | self.parse_inner(d, s) 436 | } 437 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 438 | #[allow(deprecated)] 439 | self.parse_inner(d, s) 440 | } 441 | } 442 | 443 | /// A parser that accepts one of a sequence of specific inputs. 444 | /// 445 | /// The output type of this parser is `I`, the input that was found. 446 | /// 447 | /// # Examples 448 | /// 449 | /// ``` 450 | /// # use chumsky::{prelude::*, error::Cheap}; 451 | /// let digits = one_of::<_, _, Cheap>("0123456789") 452 | /// .repeated().at_least(1) 453 | /// .then_ignore(end()) 454 | /// .collect::(); 455 | /// 456 | /// assert_eq!(digits.parse("48791"), Ok("48791".to_string())); 457 | /// assert!(digits.parse("421!53").is_err()); 458 | /// ``` 459 | pub fn one_of, E: Error>(inputs: C) -> OneOf { 460 | OneOf(inputs, PhantomData) 461 | } 462 | 463 | /// See [`empty`]. 464 | pub struct Empty(PhantomData); 465 | 466 | impl Clone for Empty { 467 | fn clone(&self) -> Self { 468 | Self(PhantomData) 469 | } 470 | } 471 | 472 | impl> Parser for Empty { 473 | type Error = E; 474 | 475 | fn parse_inner( 476 | &self, 477 | _debugger: &mut D, 478 | _: &mut StreamOf, 479 | ) -> PResult { 480 | (Vec::new(), Ok(((), None))) 481 | } 482 | 483 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 484 | #[allow(deprecated)] 485 | self.parse_inner(d, s) 486 | } 487 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 488 | #[allow(deprecated)] 489 | self.parse_inner(d, s) 490 | } 491 | } 492 | 493 | /// A parser that parses no inputs. 494 | /// 495 | /// The output type of this parser is `()`. 496 | pub fn empty() -> Empty { 497 | Empty(PhantomData) 498 | } 499 | 500 | /// See [`none_of`]. 
501 | pub struct NoneOf(C, PhantomData<(I, E)>); 502 | 503 | impl Clone for NoneOf { 504 | fn clone(&self) -> Self { 505 | Self(self.0.clone(), PhantomData) 506 | } 507 | } 508 | 509 | impl, E: Error> Parser for NoneOf { 510 | type Error = E; 511 | 512 | fn parse_inner( 513 | &self, 514 | _debugger: &mut D, 515 | stream: &mut StreamOf, 516 | ) -> PResult { 517 | match stream.next() { 518 | (_, _, Some(tok)) if self.0.get_iter().all(|not| not != tok) => { 519 | (Vec::new(), Ok((tok, None))) 520 | } 521 | (at, span, found) => ( 522 | Vec::new(), 523 | Err(Located::at( 524 | at, 525 | E::expected_input_found(span, Vec::new(), found), 526 | )), 527 | ), 528 | } 529 | } 530 | 531 | fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf) -> PResult { 532 | #[allow(deprecated)] 533 | self.parse_inner(d, s) 534 | } 535 | fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf) -> PResult { 536 | #[allow(deprecated)] 537 | self.parse_inner(d, s) 538 | } 539 | } 540 | 541 | /// A parser that accepts any input that is *not* in a sequence of specific inputs. 542 | /// 543 | /// The output type of this parser is `I`, the input that was found. 544 | /// 545 | /// # Examples 546 | /// 547 | /// ``` 548 | /// # use chumsky::{prelude::*, error::Cheap}; 549 | /// let string = one_of::<_, _, Cheap>("\"'") 550 | /// .ignore_then(none_of("\"'").repeated()) 551 | /// .then_ignore(one_of("\"'")) 552 | /// .then_ignore(end()) 553 | /// .collect::(); 554 | /// 555 | /// assert_eq!(string.parse("'hello'"), Ok("hello".to_string())); 556 | /// assert_eq!(string.parse("\"world\""), Ok("world".to_string())); 557 | /// assert!(string.parse("\"421!53").is_err()); 558 | /// ``` 559 | pub fn none_of, E: Error>(inputs: C) -> NoneOf { 560 | NoneOf(inputs, PhantomData) 561 | } 562 | 563 | /// See [`take_until`]. 564 | #[derive(Copy, Clone)] 565 | pub struct TakeUntil(A); 566 | 567 | impl> Parser, O)> for TakeUntil { 568 | type Error = A::Error; 569 | 570 | fn parse_inner( 571 | &self, 572 | debugger: &mut D, 573 | stream: &mut StreamOf, 574 | ) -> PResult, O), A::Error> { 575 | let mut outputs = Vec::new(); 576 | let mut alt = None; 577 | 578 | loop { 579 | let (errors, err) = match stream.try_parse(|stream| { 580 | #[allow(deprecated)] 581 | self.0.parse_inner(debugger, stream) 582 | }) { 583 | (errors, Ok((out, a_alt))) => { 584 | break (errors, Ok(((outputs, out), merge_alts(alt, a_alt)))) 585 | } 586 | (errors, Err(err)) => (errors, err), 587 | }; 588 | 589 | match stream.next() { 590 | (_, _, Some(tok)) => outputs.push(tok), 591 | (_, _, None) => break (errors, Err(err)), 592 | } 593 | 594 | alt = merge_alts(alt.take(), Some(err)); 595 | } 596 | } 597 | 598 | fn parse_inner_verbose( 599 | &self, 600 | d: &mut Verbose, 601 | s: &mut StreamOf, 602 | ) -> PResult, O), A::Error> { 603 | #[allow(deprecated)] 604 | self.parse_inner(d, s) 605 | } 606 | fn parse_inner_silent( 607 | &self, 608 | d: &mut Silent, 609 | s: &mut StreamOf, 610 | ) -> PResult, O), A::Error> { 611 | #[allow(deprecated)] 612 | self.parse_inner(d, s) 613 | } 614 | } 615 | 616 | /// A parser that accepts any number of inputs until a terminating pattern is reached. 617 | /// 618 | /// The output type of this parser is `(Vec, O)`, a combination of the preceding inputs and the output of the 619 | /// final patterns. 
///
/// # Examples
///
/// ```
/// # use chumsky::{prelude::*, error::Cheap};
/// let single_line = just::<_, _, Simple<char>>("//")
///     .then(take_until(text::newline()))
///     .ignored();
///
/// let multi_line = just::<_, _, Simple<char>>("/*")
///     .then(take_until(just("*/")))
///     .ignored();
///
/// let comment = single_line.or(multi_line);
///
/// let tokens = text::ident()
///     .padded()
///     .padded_by(comment
///         .padded()
///         .repeated())
///     .repeated();
///
/// assert_eq!(tokens.parse(r#"
/// // These tokens...
/// these are
/// /*
/// ...have some
/// multi-line...
/// */
/// // ...and single-line...
/// tokens
/// // ...comments between them
/// "#), Ok(vec!["these".to_string(), "are".to_string(), "tokens".to_string()]));
/// ```
pub fn take_until<A>(until: A) -> TakeUntil<A> {
    TakeUntil(until)
}

/// See [`filter`].
pub struct Filter<F, E>(F, PhantomData<E>);

impl<F: Copy, E> Copy for Filter<F, E> {}
impl<F: Clone, E> Clone for Filter<F, E> {
    fn clone(&self) -> Self {
        Self(self.0.clone(), PhantomData)
    }
}

impl<I: Clone, F: Fn(&I) -> bool, E: Error<I>> Parser<I, I> for Filter<F, E> {
    type Error = E;

    fn parse_inner<D: Debugger>(
        &self,
        _debugger: &mut D,
        stream: &mut StreamOf<I, E>,
    ) -> PResult<I, I, E> {
        match stream.next() {
            (_, _, Some(tok)) if (self.0)(&tok) => (Vec::new(), Ok((tok, None))),
            (at, span, found) => (
                Vec::new(),
                Err(Located::at(
                    at,
                    E::expected_input_found(span, Vec::new(), found),
                )),
            ),
        }
    }

    fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf<I, E>) -> PResult<I, I, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
    fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf<I, E>) -> PResult<I, I, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
}

/// A parser that accepts only inputs that match the given predicate.
///
/// The output type of this parser is `I`, the input that was found.
///
/// # Examples
///
/// ```
/// # use chumsky::{prelude::*, error::Cheap};
/// let lowercase = filter::<_, _, Cheap<char>>(char::is_ascii_lowercase)
///     .repeated().at_least(1)
///     .then_ignore(end())
///     .collect::<String>();
///
/// assert_eq!(lowercase.parse("hello"), Ok("hello".to_string()));
/// assert!(lowercase.parse("Hello").is_err());
/// ```
pub fn filter<I, F: Fn(&I) -> bool, E>(f: F) -> Filter<F, E> {
    Filter(f, PhantomData)
}

/// See [`filter_map`].
pub struct FilterMap<F, E>(F, PhantomData<E>);

impl<F: Copy, E> Copy for FilterMap<F, E> {}
impl<F: Clone, E> Clone for FilterMap<F, E> {
    fn clone(&self) -> Self {
        Self(self.0.clone(), PhantomData)
    }
}

impl<I: Clone, O, F: Fn(E::Span, I) -> Result<O, E>, E: Error<I>> Parser<I, O> for FilterMap<F, E> {
    type Error = E;

    fn parse_inner<D: Debugger>(
        &self,
        _debugger: &mut D,
        stream: &mut StreamOf<I, E>,
    ) -> PResult<I, O, E> {
        let (at, span, tok) = stream.next();
        match tok.map(|tok| (self.0)(span.clone(), tok)) {
            Some(Ok(tok)) => (Vec::new(), Ok((tok, None))),
            Some(Err(err)) => (Vec::new(), Err(Located::at(at, err))),
            None => (
                Vec::new(),
                Err(Located::at(
                    at,
                    E::expected_input_found(span, Vec::new(), None),
                )),
            ),
        }
    }

    fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf<I, E>) -> PResult<I, O, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
    fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf<I, E>) -> PResult<I, O, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
}

/// A parser that accepts an input and tests it against the given fallible function.
///
/// This function allows integration with custom error types, allowing you to emit custom parser errors.
///
/// Before using this function, consider whether the [`select!`] macro would serve you better.
///
/// The output type of this parser is `O`, the output of the given fallible function.
///
/// # Examples
///
/// ```
/// # use chumsky::{prelude::*, error::Cheap};
/// let numeral = filter_map(|span, c: char| match c.to_digit(10) {
///     Some(x) => Ok(x),
///     None => Err(Simple::custom(span, format!("'{}' is not a digit", c))),
/// });
///
/// assert_eq!(numeral.parse("3"), Ok(3));
/// assert_eq!(numeral.parse("7"), Ok(7));
/// assert_eq!(numeral.parse("f"), Err(vec![Simple::custom(0..1, "'f' is not a digit")]));
/// ```
pub fn filter_map<I, O, F: Fn(E::Span, I) -> Result<O, E>, E: Error<I>>(f: F) -> FilterMap<F, E> {
    FilterMap(f, PhantomData)
}

/// See [`any`].
pub type Any<I, E> = Filter<fn(&I) -> bool, E>;

/// A parser that accepts any input (but not the end of input).
///
/// The output type of this parser is `I`, the input that was found.
///
/// # Examples
///
/// ```
/// # use chumsky::{prelude::*, error::Cheap};
/// let any = any::<char, Cheap<char>>();
///
/// assert_eq!(any.parse("a"), Ok('a'));
/// assert_eq!(any.parse("7"), Ok('7'));
/// assert_eq!(any.parse("\t"), Ok('\t'));
/// assert!(any.parse("").is_err());
/// ```
pub fn any<I, E>() -> Any<I, E> {
    Filter(|_| true, PhantomData)
}

/// See [`fn@todo`].
pub struct Todo<I, O, E>(PhantomData<(I, O, E)>);

/// A parser that can be used wherever you need to implement a parser later.
///
/// This parser is analogous to the [`todo!`] and [`unimplemented!`] macros, but will produce a panic when used to
/// parse input, not immediately when invoked.
///
/// This function is useful when developing your parser, allowing you to prototype and run parts of your parser without
/// committing to implementing the entire thing immediately.
///
/// The output type of this parser is whatever you want it to be: it'll never produce output!
///
/// # Examples
///
/// ```should_panic
/// # use chumsky::prelude::*;
/// let int = just::<_, _, Simple<char>>("0x").ignore_then(todo())
///     .or(just("0b").ignore_then(text::digits(2)))
///     .or(text::int(10));
///
/// // Decimal numbers are parsed
/// assert_eq!(int.parse("12"), Ok("12".to_string()));
/// // Binary numbers are parsed
/// assert_eq!(int.parse("0b00101"), Ok("00101".to_string()));
/// // Parsing hexadecimal numbers results in a panic because the parser is unimplemented
/// int.parse("0xd4");
/// ```
pub fn todo<I, O, E>() -> Todo<I, O, E> {
    Todo(PhantomData)
}

impl<I, O, E> Copy for Todo<I, O, E> {}
impl<I, O, E> Clone for Todo<I, O, E> {
    fn clone(&self) -> Self {
        Self(PhantomData)
    }
}

impl<I: Clone, O, E: Error<I>> Parser<I, O> for Todo<I, O, E> {
    type Error = E;

    fn parse_inner<D: Debugger>(
        &self,
        _debugger: &mut D,
        _stream: &mut StreamOf<I, E>,
    ) -> PResult<I, O, E> {
        todo!("Attempted to use an unimplemented parser.")
    }

    fn parse_inner_verbose(
        &self,
        d: &mut Verbose,
        s: &mut StreamOf<I, E>,
    ) -> PResult<I, O, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
    fn parse_inner_silent(
        &self,
        d: &mut Silent,
        s: &mut StreamOf<I, E>,
    ) -> PResult<I, O, E> {
        #[allow(deprecated)]
        self.parse_inner(d, s)
    }
}

/// See [`choice`].
pub struct Choice<T, E>(pub(crate) T, pub(crate) PhantomData<E>);

impl<T: Copy, E> Copy for Choice<T, E> {}
impl<T: Clone, E> Clone for Choice<T, E> {
    fn clone(&self) -> Self {
        Self(self.0.clone(), PhantomData)
    }
}

// Recursively implements `Parser` for `Choice` over tuples of every arity up to the length of
// the identifier list passed in the invocation below; all element parsers must share the same
// input, output, and error types.
macro_rules! impl_for_tuple {
    () => {};
    ($head:ident $($X:ident)*) => {
        impl_for_tuple!($($X)*);
        impl_for_tuple!(~ $head $($X)*);
    };
    (~ $($X:ident)*) => {
        #[allow(unused_variables, non_snake_case)]
        impl<I: Clone, O, E: Error<I>, $($X: Parser<I, O, Error = E>),*> Parser<I, O> for Choice<($($X,)*), E> {
            type Error = E;

            fn parse_inner<D: Debugger>(
                &self,
                debugger: &mut D,
                stream: &mut StreamOf<I, E>,
            ) -> PResult<I, O, E> {
                let Choice(($($X,)*), _) = self;
                let mut alt = None;
                $(
                    // Try each parser in turn; on failure, fold its error into the best
                    // alternative seen so far before moving on to the next parser.
                    match stream.try_parse(|stream| {
                        #[allow(deprecated)]
                        debugger.invoke($X, stream)
                    }) {
                        (errors, Ok(out)) => return (errors, Ok(out)),
                        (errors, Err(a_alt)) => {
                            alt = merge_alts(alt.take(), Some(a_alt));
                        },
                    };
                )*
                (Vec::new(), Err(alt.unwrap()))
            }

            fn parse_inner_verbose(
                &self,
                d: &mut Verbose,
                s: &mut StreamOf<I, E>,
            ) -> PResult<I, O, E> {
                #[allow(deprecated)]
                self.parse_inner(d, s)
            }
            fn parse_inner_silent(
                &self,
                d: &mut Silent,
                s: &mut StreamOf<I, E>,
            ) -> PResult<I, O, E> {
                #[allow(deprecated)]
                self.parse_inner(d, s)
            }
        }
    };
}

impl_for_tuple!(A_ B_ C_ D_ E_ F_ G_ H_ I_ J_ K_ L_ M_ N_ O_ P_ Q_ S_ T_ U_ V_ W_ X_ Y_ Z_);

/// Parse using a tuple of many parsers, producing the output of the first to successfully parse.
///
/// This primitive has a twofold improvement over a chain of [`Parser::or`] calls:
///
/// - Rust's trait solver seems to resolve the [`Parser`] impl for this type much faster, significantly reducing
///   compilation times.
///
/// - Parsing is likely a little faster in some cases because the resulting parser is 'less careful' about error
///   routing, and doesn't perform the same fine-grained error prioritisation that [`Parser::or`] does.
///
/// These qualities make this parser ideal for lexers.
///
/// The output type of this parser is the output type of the inner parsers.
///
/// # Examples
/// ```
/// # use chumsky::prelude::*;
/// #[derive(Clone, Debug, PartialEq)]
/// enum Token {
///     If,
///     For,
///     While,
///     Fn,
///     Int(u64),
///     Ident(String),
/// }
///
/// let tokens = choice::<_, Simple<char>>((
///     text::keyword("if").to(Token::If),
///     text::keyword("for").to(Token::For),
///     text::keyword("while").to(Token::While),
///     text::keyword("fn").to(Token::Fn),
///     text::int(10).from_str().unwrapped().map(Token::Int),
///     text::ident().map(Token::Ident),
/// ))
///     .padded()
///     .repeated();
///
/// use Token::*;
/// assert_eq!(
///     tokens.parse("if 56 for foo while 42 fn bar"),
///     Ok(vec![If, Int(56), For, Ident("foo".to_string()), While, Int(42), Fn, Ident("bar".to_string())]),
/// );
/// ```
pub fn choice<T, E>(parsers: T) -> Choice<T, E> {
    Choice(parsers, PhantomData)
}
--------------------------------------------------------------------------------