├── tests ├── fixtures │ └── structured │ │ ├── empty.json │ │ ├── empty.toml │ │ ├── empty.yaml │ │ ├── invalid.json │ │ ├── invalid.toml │ │ ├── floats.yaml │ │ ├── binary.yaml │ │ ├── floats.toml │ │ ├── empty_structures.yaml │ │ ├── invalid.yaml │ │ ├── deep_nesting.yaml │ │ ├── type_mismatch_coercion.toml │ │ ├── type_coercion.yaml │ │ ├── multi_doc.yaml │ │ ├── datetime.toml │ │ ├── null_boolean_edge_cases.yaml │ │ ├── numeric_edge_cases.json │ │ ├── large_config.yaml │ │ ├── config.yaml │ │ ├── Cargo.toml │ │ ├── nested_arrays.json │ │ ├── package.json │ │ └── unicode_strings.json ├── parser_errors.rs ├── parser_basic.rs ├── temporal_tests.rs ├── aliases.rs └── navigation_structured.rs ├── .gitignore ├── src ├── parser │ ├── mod.rs │ ├── structured_path.pest │ ├── grammar.pest │ ├── aliases.rs │ ├── ast.rs │ ├── raw.rs │ ├── structured_path.rs │ └── error.rs ├── expr │ ├── short_circuit.rs │ └── frame.rs ├── predicate_error.rs ├── util.rs ├── predicate │ └── enum_matcher.rs ├── eval.rs ├── lib.rs ├── expr.rs ├── main.rs └── eval │ └── fs.rs ├── LICENSE-MIT ├── Cargo.toml ├── CHANGELOG.md ├── docs ├── examples.md ├── operators.md └── predicates.md ├── README.md └── LICENSE-APACHE /tests/fixtures/structured/empty.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/structured/empty.toml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/structured/empty.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /tmp 3 | /test_* 4 | NEXT*.md 5 | .claude/* 6 | CLAUDE.md 7 | -------------------------------------------------------------------------------- /tests/fixtures/structured/invalid.json: -------------------------------------------------------------------------------- 1 | {"invalid": "missing closing brace", "unclosed": [1, 2, 3 -------------------------------------------------------------------------------- /tests/fixtures/structured/invalid.toml: -------------------------------------------------------------------------------- 1 | [missing 2 | closing = "bracket" 3 | bad_key with space = "value" 4 | -------------------------------------------------------------------------------- /tests/fixtures/structured/floats.yaml: -------------------------------------------------------------------------------- 1 | value: 1.5 2 | negative: -2.7 3 | zero: 0.0 4 | large: 999.99 5 | scientific: 1.23e-4 6 | -------------------------------------------------------------------------------- /tests/fixtures/structured/binary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inanna-malick/detect/HEAD/tests/fixtures/structured/binary.yaml -------------------------------------------------------------------------------- /tests/fixtures/structured/floats.toml: -------------------------------------------------------------------------------- 1 | value = 2.5 2 | negative = -3.8 3 | zero = 0.0 4 | large = 1234.56 5 | scientific = 6.022e23 6 | -------------------------------------------------------------------------------- 
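Note: the `invalid.*` fixtures above are intentionally malformed; per the limitations listed in `docs/predicates.md`, files whose structured data fails to parse simply do not match (no error is raised). A minimal illustrative check of that parse failure, assuming `serde_json` as declared in `Cargo.toml` (a sketch, not part of the actual test suite):

```rust
#[test]
fn invalid_json_fixture_fails_to_parse() {
    // tests/fixtures/structured/invalid.json is deliberately truncated JSON.
    let text = std::fs::read_to_string("tests/fixtures/structured/invalid.json").unwrap();
    assert!(serde_json::from_str::<serde_json::Value>(&text).is_err());
}
```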
/tests/fixtures/structured/empty_structures.yaml: -------------------------------------------------------------------------------- 1 | empty_array: [] 2 | empty_object: {} 3 | nested_empty: 4 | inner_array: [] 5 | inner_object: {} 6 | -------------------------------------------------------------------------------- /tests/fixtures/structured/invalid.yaml: -------------------------------------------------------------------------------- 1 | invalid: [unclosed array 2 | missing_quote: "unterminated string 3 | bad_indent: 4 | - item1 5 | - item2 6 | -------------------------------------------------------------------------------- /tests/fixtures/structured/deep_nesting.yaml: -------------------------------------------------------------------------------- 1 | a: 2 | b: 3 | c: 4 | d: 5 | e: 6 | f: "deeply_nested_value" 7 | g: 123 8 | -------------------------------------------------------------------------------- /tests/fixtures/structured/type_mismatch_coercion.toml: -------------------------------------------------------------------------------- 1 | port = 8080 2 | version_string = "1.2.3" 3 | version_int = 123 4 | enabled = true 5 | disabled = false 6 | count = 42 7 | -------------------------------------------------------------------------------- /tests/fixtures/structured/type_coercion.yaml: -------------------------------------------------------------------------------- 1 | port: 8080 2 | version: "1.2.3" 3 | count: "42" 4 | flag: true 5 | items: 6 | - id: 1 7 | name: "first" 8 | - id: 2 9 | name: "second" 10 | -------------------------------------------------------------------------------- /tests/fixtures/structured/multi_doc.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | service: web 3 | port: 8080 4 | enabled: true 5 | --- 6 | service: api 7 | port: 3000 8 | enabled: false 9 | --- 10 | service: cache 11 | port: 6379 12 | enabled: true 13 | -------------------------------------------------------------------------------- /tests/fixtures/structured/datetime.toml: -------------------------------------------------------------------------------- 1 | timestamp = 2024-01-15T10:30:00Z 2 | created_date = 2024-01-01T00:00:00Z 3 | future_date = 2025-12-31T23:59:59Z 4 | 5 | [event] 6 | start = 2024-06-15T09:00:00Z 7 | end = 2024-06-15T17:00:00Z 8 | -------------------------------------------------------------------------------- /tests/fixtures/structured/null_boolean_edge_cases.yaml: -------------------------------------------------------------------------------- 1 | null_value: null 2 | null_string: "null" 3 | bool_true: true 4 | bool_false: false 5 | string_true: "true" 6 | string_false: "false" 7 | empty_string: "" 8 | missing_is_not_present: yes 9 | -------------------------------------------------------------------------------- /tests/fixtures/structured/numeric_edge_cases.json: -------------------------------------------------------------------------------- 1 | { 2 | "int_value": 42, 3 | "float_value": 1.5, 4 | "negative_int": -10, 5 | "negative_float": -2.7, 6 | "zero": 0, 7 | "large_int": 9223372036854775806, 8 | "mixed_array": [1, 2.5, -3, 0, 100.99] 9 | } 10 | -------------------------------------------------------------------------------- /tests/fixtures/structured/large_config.yaml: -------------------------------------------------------------------------------- 1 | # This file is intentionally large to test size limits 2 | # Adding comments to increase file size beyond the test threshold 3 | # More comments here to make sure we exceed 
100 bytes easily 4 | # Yet another comment line for padding 5 | server: 6 | port: 9999 7 | host: testhost 8 | debug: false 9 | -------------------------------------------------------------------------------- /tests/fixtures/structured/config.yaml: -------------------------------------------------------------------------------- 1 | server: 2 | port: 8080 3 | host: localhost 4 | debug: true 5 | 6 | database: 7 | host: db.example.com 8 | port: 5432 9 | name: myapp 10 | credentials: 11 | username: admin 12 | password: secret123 13 | 14 | features: 15 | - name: auth 16 | enabled: true 17 | - name: logging 18 | enabled: false 19 | - name: metrics 20 | enabled: true 21 | -------------------------------------------------------------------------------- /tests/fixtures/structured/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "detect" 3 | version = "0.3.0" 4 | edition = "2021" 5 | authors = ["Test Author "] 6 | 7 | [dependencies] 8 | serde = { version = "1.0", features = ["derive"] } 9 | tokio = { version = "1.35", features = ["full"] } 10 | regex = "1.10" 11 | 12 | [dev-dependencies] 13 | tempfile = "3.8" 14 | proptest = "1.4" 15 | 16 | [features] 17 | default = ["mcp"] 18 | mcp = [] 19 | -------------------------------------------------------------------------------- /tests/fixtures/structured/nested_arrays.json: -------------------------------------------------------------------------------- 1 | { 2 | "services": [ 3 | { 4 | "name": "web", 5 | "ports": [8080, 8443], 6 | "enabled": true 7 | }, 8 | { 9 | "name": "api", 10 | "ports": [3000], 11 | "enabled": true 12 | }, 13 | { 14 | "name": "cache", 15 | "ports": [6379], 16 | "enabled": false 17 | } 18 | ], 19 | "metadata": { 20 | "version": "2.0", 21 | "author": "test" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tests/fixtures/structured/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-app", 3 | "version": "1.2.3", 4 | "description": "A sample application", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "jest", 8 | "build": "webpack", 9 | "start": "node index.js" 10 | }, 11 | "dependencies": { 12 | "express": "^4.18.0", 13 | "lodash": "^4.17.21" 14 | }, 15 | "devDependencies": { 16 | "jest": "^29.0.0", 17 | "webpack": "^5.75.0" 18 | }, 19 | "engines": { 20 | "node": ">=16.0.0" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/parser/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod aliases; 2 | pub mod ast; 3 | pub mod error; 4 | pub mod raw; 5 | pub mod structured_path; 6 | pub mod time; 7 | pub mod typechecker; 8 | pub mod typed; 9 | 10 | // Re-exports 11 | pub use aliases::{resolve_alias, suggest_aliases}; 12 | pub use ast::{test_utils, RawExpr, RawPredicate, RawValue}; 13 | pub use error::DetectError; 14 | pub use raw::RawParser; 15 | pub use structured_path::{parse_path, PathComponent, PathParseError}; 16 | pub use time::parse_time_value; 17 | pub use typechecker::Typechecker; 18 | -------------------------------------------------------------------------------- /src/expr/short_circuit.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Display; 2 | 3 | pub enum ShortCircuit { 4 | Known(bool), 5 | Unknown(X), 6 | } 7 | 8 | impl From for ShortCircuit { 9 | fn from(value: bool) -> Self { 10 | 
ShortCircuit::Known(value) 11 | } 12 | } 13 | 14 | impl Display for ShortCircuit { 15 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 16 | match self { 17 | ShortCircuit::Known(x) => write!(f, "known: {x}"), 18 | ShortCircuit::Unknown(x) => write!(f, "unknown: {x}"), 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/predicate_error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Error type for predicate parsing operations 4 | #[derive(Debug, Error)] 5 | pub enum PredicateParseError { 6 | #[error("Invalid regex pattern")] 7 | Regex(#[from] regex::Error), 8 | 9 | #[error("Invalid number")] 10 | Numeric(#[from] std::num::ParseIntError), 11 | 12 | #[error("Invalid time: {0}")] 13 | Temporal(String), 14 | 15 | #[error("DFA compilation failed: {0}")] 16 | Dfa(String), 17 | 18 | #[error("Incompatible: {0}")] 19 | Incompatible(String), 20 | 21 | #[error("Unknown selector: {0}")] 22 | UnknownSelector(String), 23 | } 24 | -------------------------------------------------------------------------------- /src/parser/structured_path.pest: -------------------------------------------------------------------------------- 1 | // Grammar for structured data path expressions 2 | // Parses paths like: .spec.replicas, [0].name, .items[*].id 3 | 4 | WHITESPACE = _{ " " | "\t" } 5 | 6 | path = { SOI ~ component+ ~ EOI } 7 | 8 | component = _{ recursive_key | key_access | index_access | wildcard_access } 9 | 10 | // Recursive descent: ..fieldname 11 | recursive_key = { ".." ~ identifier } 12 | 13 | // Key access: .fieldname 14 | key_access = { "." ~ identifier } 15 | identifier = @{ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-" | "/")* } 16 | 17 | // Array index: [42] 18 | index_access = { "[" ~ number ~ "]" } 19 | number = @{ ASCII_DIGIT+ } 20 | 21 | // Wildcard array access: [*] 22 | wildcard_access = { "[" ~ "*" ~ "]" } 23 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | 2 | 3 | Copyright 2022 Inanna Malick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "detect" 3 | description = "Expression-based file search combining name, content, metadata, and structured data predicates" 4 | license = "MIT OR Apache-2.0" 5 | version = "0.3.0" 6 | edition = "2021" 7 | rust-version = "1.70" 8 | repository = "https://github.com/inanna-malick/detect/" 9 | homepage = "https://github.com/inanna-malick/detect/" 10 | keywords = ["egrep", "grep", "pattern", "regex", "search"] 11 | categories = ["command-line-utilities", "filesystem"] 12 | 13 | [[bin]] 14 | name = "detect" 15 | path = "src/main.rs" 16 | 17 | [dependencies] 18 | clap = {version = "4.5", features = ["derive"]} 19 | futures = "0.3.31" 20 | ignore = "0.4" 21 | pest = "2.7.15" 22 | pest_derive = "2.7.15" 23 | recursion = {version = "0.5", features = ["experimental"]} 24 | regex = "1.12" 25 | regex-automata = "0.4.13" 26 | slog-term = "2.9" 27 | slog = "2.7" 28 | tokio = {version = "1.48", features = ["rt", "rt-multi-thread", "fs", "macros"]} 29 | tokio-util = {version ="0.7.17", features = ["io"] } 30 | serde_json = "1.0" 31 | chrono = "0.4" 32 | thiserror = "2.0" 33 | miette = { version = "7.6.0", features = ["fancy"] } 34 | toml = "0.9.8" 35 | yaml-rust2 = "0.10.4" 36 | 37 | [dev-dependencies] 38 | tempfile = "3" 39 | 40 | [profile.release] 41 | opt-level = 3 42 | lto = true 43 | codegen-units = 1 44 | strip = true 45 | -------------------------------------------------------------------------------- /tests/fixtures/structured/unicode_strings.json: -------------------------------------------------------------------------------- 1 | { 2 | "emoji_field": "🚀", 3 | "emoji_value": "test 🎉 value", 4 | "unicode_chars": "Ñoño αβγ 中文", 5 | "multiline": "line1\nline2\nline3", 6 | "long_string": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" 
7 | } 8 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | /// uninhabited type, used to signify that something does not exist 2 | /// provided typeclass instances never invoked but provided for 3 | /// convenience 4 | #[derive(Debug, Clone)] 5 | pub enum Done {} 6 | 7 | impl std::fmt::Display for Done { 8 | fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 9 | unreachable!() 10 | } 11 | } 12 | 13 | /// Parse size values like "1mb", "100kb", etc. into bytes 14 | /// 15 | /// Supports units: b, kb, mb, gb, tb (case-insensitive) 16 | /// 17 | /// # Examples 18 | /// ``` 19 | /// use detect::util::parse_size; 20 | /// assert_eq!(parse_size("10mb").unwrap(), 10 * 1024 * 1024); 21 | /// assert_eq!(parse_size("500KB").unwrap(), 500 * 1024); 22 | /// ``` 23 | pub fn parse_size(s: &str) -> Result { 24 | let s = s.trim().to_lowercase(); 25 | 26 | let mut unit_start = 0; 27 | for (i, ch) in s.char_indices() { 28 | if !ch.is_ascii_digit() && ch != '.' { 29 | unit_start = i; 30 | break; 31 | } 32 | } 33 | 34 | if unit_start == 0 { 35 | return Err(format!( 36 | "Invalid size '{s}': expected format like '10mb', '500kb'" 37 | )); 38 | } 39 | 40 | let number_str = &s[..unit_start]; 41 | let unit_str = &s[unit_start..]; 42 | 43 | let number: f64 = number_str 44 | .parse() 45 | .map_err(|_| format!("Invalid size '{s}': cannot parse numeric value '{number_str}'"))?; 46 | 47 | let multiplier = match unit_str { 48 | "b" | "byte" | "bytes" => 1.0, 49 | "k" | "kb" | "kilobyte" | "kilobytes" => 1024.0, 50 | "m" | "mb" | "megabyte" | "megabytes" => 1024.0 * 1024.0, 51 | "g" | "gb" | "gigabyte" | "gigabytes" => 1024.0 * 1024.0 * 1024.0, 52 | "t" | "tb" | "terabyte" | "terabytes" => 1024.0 * 1024.0 * 1024.0 * 1024.0, 53 | _ => { 54 | return Err(format!( 55 | "Invalid size '{s}': unknown unit '{unit_str}' (expected: b, kb, mb, gb, tb)" 56 | )) 57 | } 58 | }; 59 | 60 | Ok((number * multiplier) as u64) 61 | } 62 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## [0.3.0] - 2025-01-22 6 | 7 | ### Added 8 | 9 | - **Structured data selectors** for querying YAML, JSON, and TOML file contents 10 | - Dot notation for nested field access: `yaml:.server.port` 11 | - Array indexing: `json:.dependencies[0]` 12 | - Wildcards with OR semantics: `yaml:.features[*].enabled` 13 | - Recursive descent: `yaml:..field` finds field at any depth 14 | - Comparison operators: `==`, `!=`, `>`, `<`, `>=`, `<=` 15 | - String matchers: `contains`, `~=` (regex) 16 | - Automatic type coercion between numbers and strings 17 | - Fully composable with other predicates: `size < 50kb AND yaml:.server.port > 8000` 18 | - `--max-structured-size` CLI flag to configure maximum file size for structured parsing (default: 10MB) 19 | - Support for multi-document YAML with OR semantics (matches if ANY document matches) 20 | - **Single-word file type aliases**: Use `file`, `dir`, `symlink`, etc. as shorthand for `type == file`, `type == dir`, etc. Enables natural queries like `dir && depth > 0` or `file && size > 1mb`. All file type values work as aliases (case-insensitive). 
21 | - MCP (Model Context Protocol) server support for AI assistant integration 22 | - Better error messages with source location tracking and helpful suggestions 23 | - Unquoted regex pattern support - `content ~= [0-9]+` works without quotes 24 | - Parse-time validation for `type` selector values (breaking change - see below) 25 | - Relative path display in search results 26 | - Dual MIT/Apache-2.0 licensing 27 | - Greater than or equal (`>=`) and less than or equal (`<=`) operators for temporal selectors 28 | 29 | ### Changed 30 | 31 | - Two-phase parser architecture (raw parsing → type checking) for better errors 32 | - Relative time formats now support full aliases (`-7days`, `-2hours`, etc.) 33 | 34 | ### Breaking Changes 35 | 36 | - `type` selector now validates file type values at parse time. Invalid types like `type == dirq` produce parse errors instead of matching nothing. Valid types: `file`, `dir`/`directory`, `symlink`/`link`, `socket`/`sock`, `fifo`/`pipe`, `block`/`blockdev`, `char`/`chardev` (case-insensitive) 37 | -------------------------------------------------------------------------------- /src/predicate/enum_matcher.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::fmt::{self, Debug, Display}; 3 | use std::hash::Hash; 4 | 5 | /// Generic matcher for enum-valued predicates with parse-time validation 6 | #[derive(Clone, Debug, PartialEq, Eq)] 7 | pub enum EnumMatcher { 8 | Equals(E), 9 | NotEquals(E), 10 | In(HashSet), 11 | } 12 | 13 | /// Trait for enums usable as predicate values 14 | /// 15 | /// Implementors provide parsing from string aliases, validation, 16 | /// and display logic for enum-based selectors like `type`. 17 | pub trait EnumPredicate: Sized + Eq + Hash + Clone + Debug { 18 | /// Parse from string, checking all aliases. 19 | /// 20 | /// Returns error message on failure (not a structured error type, 21 | /// since it gets wrapped in `DetectError::InvalidValue` immediately). 
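    ///
    /// Illustrative example (aliases per `docs/operators.md`): for the `type`
    /// selector, `from_str("dir")` and `from_str("directory")` resolve to the
    /// same variant, while `from_str("dirq")` returns an error string.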
22 | fn from_str(s: &str) -> Result; 23 | 24 | /// All valid string representations (for error messages) 25 | fn all_valid_strings() -> &'static [&'static str]; 26 | 27 | /// Canonical string representation for this variant 28 | fn as_str(&self) -> &'static str; 29 | 30 | /// All aliases that map to this variant 31 | fn aliases(&self) -> &'static [&'static str]; 32 | } 33 | 34 | impl EnumMatcher { 35 | /// Check if a value matches this enum matcher 36 | pub fn is_match(&self, value: &E) -> bool { 37 | match self { 38 | EnumMatcher::Equals(v) => value == v, 39 | EnumMatcher::NotEquals(v) => value != v, 40 | EnumMatcher::In(set) => set.contains(value), 41 | } 42 | } 43 | } 44 | 45 | impl Display for EnumMatcher { 46 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 47 | match self { 48 | EnumMatcher::Equals(v) => write!(f, "== {}", v.as_str()), 49 | EnumMatcher::NotEquals(v) => write!(f, "!= {}", v.as_str()), 50 | EnumMatcher::In(set) => { 51 | write!(f, "in [")?; 52 | let mut items: Vec<_> = set.iter().map(EnumPredicate::as_str).collect(); 53 | items.sort_unstable(); // Deterministic display order 54 | write!(f, "{}", items.join(", "))?; 55 | write!(f, "]") 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/eval.rs: -------------------------------------------------------------------------------- 1 | use crate::expr::short_circuit::ShortCircuit; 2 | use crate::expr::Expr; 3 | use crate::predicate::{Predicate, StreamingCompiledContentPredicateRef}; 4 | use crate::util::Done; 5 | use futures::{Stream, StreamExt}; 6 | use regex_automata::dfa::Automaton; 7 | use tokio::io::{self}; 8 | 9 | pub mod fs; 10 | pub mod structured; 11 | 12 | pub async fn run_contents_predicate_stream( 13 | e: Expr>>, 14 | mut s: impl Stream>> + std::marker::Unpin, 15 | ) -> io::Result>> { 16 | let config = regex_automata::util::start::Config::new(); 17 | 18 | // Initialize state for DFA patterns 19 | let mut e: Expr> = e.map_predicate(|p| match p { 20 | Predicate::Content(pred) => { 21 | let dfa = pred.inner; 22 | let s = dfa 23 | .start_state(&config) 24 | .expect("DFA start_state failed: invalid regex configuration"); 25 | Predicate::Content((dfa, s)) 26 | } 27 | _ => unreachable!(), 28 | }); 29 | 30 | while let Some(next) = s.next().await { 31 | // read the next buffered chunk of bytes 32 | let bytes = next?; 33 | 34 | // advance each pattern appropriately 35 | e = e.reduce_predicate_and_short_circuit(move |p| match p { 36 | Predicate::Content((dfa, state)) => { 37 | // DFA streaming processing 38 | let mut next_state = state; 39 | let mut iter = bytes.iter(); 40 | 41 | loop { 42 | if let Some(byte) = iter.next() { 43 | next_state = dfa.next_state(next_state, *byte); 44 | 45 | if dfa.is_match_state(next_state) { 46 | break ShortCircuit::Known(true); 47 | } 48 | 49 | if dfa.is_dead_state(next_state) { 50 | break ShortCircuit::Known(false); 51 | } 52 | } else { 53 | break ShortCircuit::Unknown(Predicate::Content((dfa, next_state))); 54 | } 55 | } 56 | } 57 | _ => unreachable!(), 58 | }); 59 | } 60 | 61 | // Final evaluation 62 | let e = e.reduce_predicate_and_short_circuit(|p| match p { 63 | Predicate::Content((dfa, state)) => { 64 | let next_state = dfa.next_eoi_state(state); 65 | let matched = dfa.is_match_state(next_state); 66 | ShortCircuit::Known(matched) 67 | } 68 | _ => unreachable!(), 69 | }); 70 | 71 | Ok(e) 72 | } 73 | -------------------------------------------------------------------------------- /docs/examples.md: 
-------------------------------------------------------------------------------- 1 | # detect Examples 2 | 3 | **Quick tips:** 4 | - Start with cheap filters (`ext`, `size`, `type`) before expensive ones (`content`, structured) 5 | - Quote expressions with spaces/special chars: `'ext == rs AND content ~= "async "'` 6 | - Use `-i` to include gitignored files 7 | 8 | ## Progressive Examples 9 | 10 | Each line adds complexity - shows how to combine features: 11 | 12 | ```bash 13 | # Start simple 14 | detect 'ext == rs' # selector + operator 15 | 16 | # Combine with AND 17 | detect 'ext in [rs,toml] AND size > 1mb' # set membership, numeric 18 | 19 | # Add temporal predicates 20 | detect 'ext == rs AND size > 1mb AND modified > -7d' # relative time 21 | 22 | # Content matching with regex 23 | detect 'ext == ts AND content ~= "class.*Service"' # regex operator 24 | 25 | # Boolean logic: grouping, NOT 26 | detect '(ext == rs OR ext == toml) AND NOT path ~= test' # precedence, path filter 27 | 28 | # Structured data + file metadata 29 | detect 'yaml:.server.port > 8000 AND size < 100kb' # structured selector 30 | ``` 31 | 32 | ## Structured Data Patterns 33 | 34 | Navigate YAML/JSON/TOML with path syntax: 35 | 36 | ```bash 37 | # Nested field access: .field.field 38 | yaml:.server.port == 8080 39 | 40 | # Array indexing + field access: [0].field 41 | json:.items[0].name == "first" 42 | 43 | # Wildcard - matches if ANY element matches: [*] 44 | yaml:.features[*].enabled == true 45 | 46 | # Recursive descent - finds field at any depth: ..field 47 | toml:..database contains prod 48 | 49 | # Combine with file predicates 50 | yaml:.replicas > 3 AND size < 100kb AND NOT path ~= test 51 | 52 | # Multi-format queries with OR 53 | json:.version ~= "^1\\." OR toml:.package.version ~= "^1\\." 54 | ``` 55 | 56 | ## Common Patterns 57 | 58 | Real-world multi-feature queries: 59 | 60 | ```bash 61 | # Large recent files with TODOs, excluding tests 62 | detect 'size > 10kb AND modified > -7d AND content contains TODO AND NOT path ~= test' 63 | 64 | # Security: env files with secrets outside node_modules 65 | detect 'name ~= "^\.env" AND NOT path ~= node_modules AND content ~= "(password|secret|key)"' 66 | 67 | # Recent config changes 68 | detect 'ext in [json,yaml,toml] AND modified > -3d' 69 | 70 | # Kubernetes manifests with high replicas 71 | detect 'yaml:.kind == Deployment AND yaml:.spec.replicas > 3' 72 | 73 | # Find TypeScript async functions in source directories 74 | detect 'path ~= "^\./(src|lib)/" AND ext == ts AND content ~= "async\s+function"' 75 | ``` 76 | 77 | ## Migration from find/grep 78 | 79 | ```bash 80 | # find . -name "*.js" -size +1M 81 | detect 'ext == js AND size > 1mb' 82 | 83 | # find . -type f -exec grep -l "TODO" {} \; 84 | detect 'type == file AND content contains TODO' 85 | 86 | # grep -r "class.*Service" --include="*.ts" . 87 | detect 'ext == ts AND content ~= "class.*Service"' 88 | ``` 89 | -------------------------------------------------------------------------------- /docs/operators.md: -------------------------------------------------------------------------------- 1 | # detect Operators Reference 2 | 3 | All operators organized by selector type. 
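Size literals in the numeric tables below are normalized by `detect::util::parse_size` (see `src/util.rs`). A minimal sketch of that helper's behaviour, for library users; the helper itself is case-insensitive, while expressions in this reference stick to the lowercase units noted under Common Mistakes:

```rust
use detect::util::parse_size;

fn main() {
    // Unit suffixes multiply by powers of 1024 (b, kb, mb, gb, tb).
    assert_eq!(parse_size("1kb").unwrap(), 1024);
    assert_eq!(parse_size("10mb").unwrap(), 10 * 1024 * 1024);
    // Fractional sizes are accepted and truncated to whole bytes.
    assert_eq!(parse_size("2.5mb").unwrap(), 2_621_440);
    // The helper trims and lowercases its input.
    assert_eq!(parse_size("500KB").unwrap(), 500 * 1024);
}
```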
4 | 5 | ## String Operators 6 | 7 | For: `name`, `ext`, `path`, `dir`, `content` 8 | 9 | | Operator | Description | Example | 10 | |-------------|-------------|---------| 11 | | `==` | Exact match (case-sensitive) | `name == "README.md"` | 12 | | `!=` | Not equal | `ext != md` | 13 | | `contains` | Substring search (literal) | `content contains TODO` | 14 | | `~=` | Regex pattern matching | `name ~= "test.*\.rs$"` | 15 | | `in [a,b,c]` | Match any item in set | `ext in [js,ts,jsx,tsx]` | 16 | 17 | Regex uses Rust regex syntax. Set membership allows optional spaces: `ext in [rs, toml]`. 18 | 19 | ## Numeric Operators 20 | 21 | For: `size`, `depth` 22 | 23 | | Operator | Description | Example | 24 | |----------|-------------|---------| 25 | | `==` | Exact value | `size == 1024` | 26 | | `!=` | Not equal | `depth != 0` | 27 | | `>` | Greater than | `size > 1mb` | 28 | | `<` | Less than | `depth < 5` | 29 | | `>=` | Greater or equal | `size >= 100kb` | 30 | | `<=` | Less or equal | `depth <= 2` | 31 | 32 | Size units: `kb`, `mb`, `gb`, `tb` (lowercase only, e.g. `1kb`, `2.5mb`) 33 | 34 | ## Temporal Operators 35 | 36 | For: `modified`, `created`, `accessed` 37 | 38 | | Operator | Description | Example | 39 | |----------|-------------|---------| 40 | | `>` | After (newer than) | `modified > -7d` | 41 | | `<` | Before (older than) | `created < 2024-01-01` | 42 | | `>=` | At or after | `modified >= -1w` | 43 | | `<=` | At or before | `accessed <= -1d` | 44 | | `==` | Exact time | `modified == 2024-01-15` | 45 | | `!=` | Not at time | `created != 2024-01-01` | 46 | 47 | **Formats:** Relative `-7d`, `-2h`, `-30m`, `-1w` (units: `s`, `m`/`min`, `h`/`hr`, `d`/`day`, `w`/`week`, with plurals). Absolute `2024-01-15`, `2024-01-15T10:30:00`. 48 | 49 | ## Enum Operators 50 | 51 | For: `type` 52 | 53 | | Operator | Description | Example | 54 | |----------|-------------|---------| 55 | | `==` | Exact match (validated at parse-time) | `type == file` | 56 | | `!=` | Not equal | `type != dir` | 57 | | `in [a,b,c]` | Match any type in set | `type in [file,dir,symlink]` | 58 | 59 | **Valid types (case-insensitive):** `file`, `dir`/`directory`, `symlink`/`link`, `socket`/`sock`, `fifo`/`pipe`, `block`/`blockdev`, `char`/`chardev`. Invalid values caught at parse-time with suggestions. 60 | 61 | ## Boolean Operators 62 | 63 | | Operator | Description | Example | 64 | |----------|-------------|---------| 65 | | `AND` / `&&` | Both conditions true | `ext == rs AND size > 1kb` | 66 | | `OR` / `\|\|` | Either condition true | `file OR dir` | 67 | | `NOT` / `!` | Negate condition | `NOT symlink` | 68 | | `( )` | Group expressions | `(file OR dir) AND size > 1kb` | 69 | 70 | **Precedence:** `NOT` > `AND` > `OR`. Use parentheses for clarity: `(a OR b) AND c`. 71 | 72 | ## Common Mistakes 73 | 74 | **Units:** Lowercase only - `1mb` not `1MB` 75 | **Regex quotes:** Quote patterns with spaces - `content ~= "class.*"` not `content ~= class.*` 76 | **Wildcards:** Use `ext == rs` not `*.rs` (or `name ~= ".*\.rs$"` for regex) 77 | -------------------------------------------------------------------------------- /docs/predicates.md: -------------------------------------------------------------------------------- 1 | # detect Predicates Reference 2 | 3 | All selectors and their types. Aliases shown as `primary` / `alias`. 
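The single-word aliases listed below resolve through `resolve_alias` in `src/parser/aliases.rs`; a minimal sketch using the helpers re-exported from `src/parser/mod.rs`, intended as an illustration for library users rather than a documented API guarantee:

```rust
use detect::parser::{resolve_alias, suggest_aliases};

fn main() {
    // `dir` on its own is equivalent to `type == dir`.
    assert!(resolve_alias("dir").is_ok());
    // Unknown words are rejected rather than silently matching nothing.
    assert!(resolve_alias("dirq").is_err());
    // Close misspellings (edit distance <= 2) get suggestions.
    assert!(suggest_aliases("fil").contains(&"file".to_string()));
}
```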
4 | 5 | ## File Identity 6 | 7 | | Selector | Type | Description | Example | 8 | |----------|--------|-------------|---------| 9 | | `name` / `filename` | String | Full filename with extension | `name == "README.md"` | 10 | | `basename` / `stem` | String | Filename without extension | `basename == README` | 11 | | `ext` / `extension` | String | File extension without dot | `ext == rs` | 12 | | `path` | String | Full absolute path | `path ~= "/src/"` | 13 | | `dir` / `parent` / `directory` | String | Parent directory path | `dir == "/usr/bin"` | 14 | 15 | ## File Properties 16 | 17 | | Selector | Type | Description | Example | 18 | |----------|---------|-------------|---------| 19 | | `size` / `filesize` / `bytes` | Numeric | File size in bytes | `size > 1mb` | 20 | | `type` / `filetype` | Enum | File type (validated at parse-time) | `type == file` | 21 | | `depth` | Numeric | Directory depth from root | `depth <= 3` | 22 | 23 | **Size units:** `kb`, `mb`, `gb`, `tb` (e.g. `45kb`, `1.5mb`) 24 | 25 | **Valid types (case-insensitive):** `file`, `dir`/`directory`, `symlink`/`link`, `socket`/`sock`, `fifo`/`pipe`, `block`/`blockdev`, `char`/`chardev` 26 | 27 | ## Timestamps 28 | 29 | | Selector | Type | Description | Example | 30 | |----------|------|-------------|---------| 31 | | `modified` / `mtime` | Temporal | Last modification time | `modified > -7d` | 32 | | `created` / `ctime` | Temporal | File creation time | `created > 2024-01-01` | 33 | | `accessed` / `atime` | Temporal | Last access time | `accessed < -1h` | 34 | 35 | **Formats:** Relative `-7d`/`-7days`, `-2h`/`-2hours` (units: `s`, `m`/`min`, `h`/`hr`, `d`/`day`, `w`/`week` + plurals). Absolute `2024-01-15`, `2024-01-15T10:30:00`. 36 | 37 | ## Content 38 | 39 | | Selector | Type | Description | Example | 40 | |----------|------|-------------|---------| 41 | | `content` / `text` / `contents` | String | File text contents | `content contains TODO` | 42 | 43 | ## Structured Data 44 | 45 | Query YAML, JSON, TOML by navigating structure: 46 | 47 | | Selector | Description | Example | 48 | |----------|-------------|---------| 49 | | `yaml:.path` | YAML navigation | `yaml:.server.port == 8080` | 50 | | `json:.path` | JSON navigation | `json:.items[0].name == "test"` | 51 | | `toml:.path` | TOML navigation | `toml:.package.edition == "2021"` | 52 | 53 | **Navigation syntax:** 54 | 55 | | Pattern | Meaning | Example | 56 | |---------|---------|---------| 57 | | `.field` | Object field access | `yaml:.server` | 58 | | `.nested.field` | Nested fields | `json:.meta.author` | 59 | | `[0]` | Array index | `yaml:.items[0]` | 60 | | `[*]` | Wildcard - any element | `yaml:.features[*].enabled` | 61 | | `..field` | Recursive - any depth | `toml:..password` | 62 | 63 | **Operators:** `==`, `!=`, `>`, `<`, `>=`, `<=`, `contains`, `~=` (same as other selectors) 64 | 65 | **Type coercion:** Numbers/booleans convert to strings - `yaml:.port == 8080` matches both `8080` and `"8080"` 66 | 67 | **Existence check:** Use selector alone without operator - `yaml:.server` checks if field exists 68 | 69 | **Limitations:** 70 | - Files > 10MB skipped (configurable: `--max-structured-size`) 71 | - Non-UTF8 files skip structured evaluation 72 | - Invalid YAML/JSON/TOML returns false (no error) 73 | - Multi-document YAML: matches if ANY document matches 74 | -------------------------------------------------------------------------------- /src/parser/grammar.pest: -------------------------------------------------------------------------------- 1 | WHITESPACE = _{ " " 
| "\t" | NEWLINE } 2 | 3 | program = { SOI ~ expr ~ EOI } 4 | expr = { prefix* ~ primary ~ (infix ~ prefix* ~ primary )* } 5 | infix = _{ and | or } 6 | and = { "&&" | ^"and" } 7 | or = { "||" | ^"or" } 8 | prefix = _{ neg } 9 | neg = { "!" | "\\!" | ^"not" } 10 | primary = _{ predicate | single_word | "(" ~ expr ~ ")" } 11 | 12 | predicate = { selector ~ operator ~ value } 13 | selector = @{ (ASCII_ALPHANUMERIC | "." | "_" | "-" | "/" | "[" | "]" | "*" | ":")+ } 14 | // Parse operators flexibly - validate in typechecker 15 | // Start with symbols or letters, but not mix arbitrarily 16 | operator = @{ 17 | // Symbol-based operators (can combine symbols) 18 | ("=" | "!" | ">" | "<" | "~")+ | 19 | // Word-based operators (alphanumeric with underscores) 20 | // But NOT the reserved infix/prefix operators 21 | !(^"and" | ^"or" | ^"not") ~ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")* 22 | } 23 | 24 | value = { value_content ~ trailing_quote? } 25 | 26 | value_content = _{ quoted_string | unterminated_string | raw_token } 27 | 28 | // Error detection: unterminated string literals 29 | // Matches opening quote without closing quote before EOI/newline 30 | unterminated_string = @{ 31 | ("\"" ~ (!"\"" ~ (escaped | ANY))* ~ (EOI | &NEWLINE)) | 32 | ("'" ~ (!"'" ~ (escaped | ANY))* ~ (EOI | &NEWLINE)) 33 | } 34 | 35 | // Error detection: trailing quote after value content 36 | trailing_quote = @{ "\"" | "'" } 37 | 38 | // Raw tokens with recursive balanced delimiter matching 39 | // Supports nested structures like ((pub|async)\s+)* or [[a-z]] 40 | raw_token = @{ raw_char+ } 41 | 42 | raw_char = _{ 43 | "\\" ~ ANY | // Escaped character 44 | balanced_paren | // Recursive paren matching 45 | balanced_bracket | // Recursive bracket matching 46 | balanced_curly | // Recursive curly matching 47 | !WHITESPACE ~ !"&&" ~ !"||" ~ !")" ~ !"\"" ~ !"'" ~ ANY // Regular character 48 | } 49 | 50 | balanced_paren = { "(" ~ ( "\\" ~ ANY | balanced_paren | balanced_bracket | balanced_curly | !")" ~ ANY )* ~ ")" } 51 | balanced_bracket = { "[" ~ ( "\\" ~ ANY | balanced_paren | balanced_bracket | balanced_curly | !"]" ~ ANY )* ~ "]" } 52 | balanced_curly = { "{" ~ ( "\\" ~ ANY | balanced_paren | balanced_bracket | balanced_curly | !"}" ~ ANY )* ~ "}" } 53 | 54 | quoted_string = ${ "\"" ~ inner_double ~ "\"" | "'" ~ inner_single ~ "'" } 55 | inner_double = @{ (!"\"" ~ (escaped | ANY))* } 56 | inner_single = @{ (!"'" ~ (escaped | ANY))* } 57 | 58 | escaped = { "\\" ~ ("\"" | "'" | "\\" | "n" | "t" | "r") } 59 | 60 | single_word = @{ 61 | // Structured data selectors: word:.path (validated at typecheck) 62 | (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")* ~ ":" ~ (ASCII_ALPHANUMERIC | "." | "_" | "-" | "/" | "[" | "]" | "*")+ | 63 | // Regular word aliases 64 | (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")* 65 | } 66 | 67 | // Separate entry point for parsing set contents 68 | // Used by typechecker when operator is 'in' 69 | set_contents = { SOI ~ set_items? ~ EOI } 70 | set_items = { set_item? 
~ ("," ~ set_item?)* } 71 | set_item = { quoted_string | bare_set_item } 72 | // Bare items in sets: stop at comma, whitespace, or quotes 73 | bare_set_item = @{ (!("," | "\"" | "'" | WHITESPACE) ~ ANY)+ } -------------------------------------------------------------------------------- /src/expr/frame.rs: -------------------------------------------------------------------------------- 1 | use super::Expr; 2 | use futures::FutureExt; 3 | use recursion::{ 4 | experimental::frame::AsyncMappableFrame, Collapsible, MappableFrame, PartiallyApplied, 5 | }; 6 | use std::fmt::Display; 7 | use tokio::try_join; 8 | 9 | /// short-lived single layer of a filesystem entity matcher expression, used for 10 | /// expressing recursive algorithms over a single layer of a borrowed Expr 11 | pub enum ExprFrame { 12 | // borrowed predicate 13 | Predicate(P), 14 | // boolean operators 15 | Not(X), 16 | And(X, X), 17 | Or(X, X), 18 | // literal values 19 | Literal(bool), 20 | } 21 | 22 | impl
<P>
MappableFrame for ExprFrame { 23 | type Frame = ExprFrame; 24 | 25 | fn map_frame(input: Self::Frame, mut f: impl FnMut(A) -> B) -> Self::Frame { 26 | use ExprFrame::{And, Literal, Not, Or, Predicate}; 27 | match input { 28 | Not(a) => Not(f(a)), 29 | And(a, b) => And(f(a), f(b)), 30 | Or(a, b) => Or(f(a), f(b)), 31 | Predicate(p) => Predicate(p), 32 | Literal(bool) => Literal(bool), 33 | } 34 | } 35 | } 36 | 37 | async fn map_frame_async<'a, A, B, E, P>( 38 | input: ExprFrame, 39 | f: impl Fn(A) -> futures::future::BoxFuture<'a, Result> + Send + Sync + 'a, 40 | ) -> Result, E> 41 | where 42 | E: Send + 'a, 43 | A: Send + 'a, 44 | B: Send + 'a, 45 | { 46 | use ExprFrame::{And, Literal, Not, Or, Predicate}; 47 | match input { 48 | Not(a) => Ok(Not(f(a).await?)), 49 | And(a, b) => { 50 | let (a, b) = try_join!(f(a), f(b))?; 51 | Ok(And(a, b)) 52 | } 53 | Or(a, b) => { 54 | let (a, b) = try_join!(f(a), f(b))?; 55 | Ok(Or(a, b)) 56 | } 57 | Predicate(p) => Ok(Predicate(p)), 58 | Literal(bool) => Ok(Literal(bool)), 59 | } 60 | } 61 | 62 | impl AsyncMappableFrame for ExprFrame { 63 | fn map_frame_async<'a, A, B, E>( 64 | input: Self::Frame, 65 | f: impl Fn(A) -> futures::future::BoxFuture<'a, Result> + Send + Sync + 'a, 66 | ) -> futures::future::BoxFuture<'a, Result, E>> 67 | where 68 | E: Send + 'a, 69 | A: Send + 'a, 70 | B: Send + 'a, 71 | { 72 | map_frame_async(input, f).boxed() 73 | } 74 | } 75 | 76 | pub struct MapPredicateRef<'a, P>(pub &'a Expr
<P>
); 77 | 78 | impl Collapsible for &Expr
<P>
{ 79 | type FrameToken = ExprFrame; 80 | 81 | fn into_frame(self) -> ExprFrame { 82 | match self { 83 | Expr::Not(x) => ExprFrame::Not(x), 84 | Expr::And(a, b) => ExprFrame::And(a, b), 85 | Expr::Or(a, b) => ExprFrame::Or(a, b), 86 | Expr::Predicate(p) => ExprFrame::Predicate(p.clone()), 87 | Expr::Literal(b) => ExprFrame::Literal(*b), 88 | } 89 | } 90 | } 91 | 92 | impl<'a, P> Collapsible for MapPredicateRef<'a, P> { 93 | type FrameToken = ExprFrame; 94 | 95 | fn into_frame(self) -> ExprFrame { 96 | match self.0 { 97 | Expr::Not(x) => ExprFrame::Not(MapPredicateRef(x)), 98 | Expr::And(a, b) => ExprFrame::And(MapPredicateRef(a), MapPredicateRef(b)), 99 | Expr::Or(a, b) => ExprFrame::Or(MapPredicateRef(a), MapPredicateRef(b)), 100 | Expr::Predicate(p) => ExprFrame::Predicate(p), 101 | Expr::Literal(b) => ExprFrame::Literal(*b), 102 | } 103 | } 104 | } 105 | 106 | impl Display for ExprFrame<(), P> { 107 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 108 | match self { 109 | Self::Not(()) => write!(f, "NOT"), 110 | Self::And((), ()) => { 111 | write!(f, "AND") 112 | } 113 | Self::Or((), ()) => { 114 | write!(f, "OR") 115 | } 116 | Self::Predicate(arg0) => write!(f, "{arg0}"), 117 | Self::Literal(arg0) => write!(f, "{arg0}"), 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | #![warn(clippy::all)] 3 | #![warn(clippy::cargo)] 4 | 5 | pub mod eval; 6 | pub mod expr; 7 | pub mod parser; 8 | pub mod predicate; 9 | mod predicate_error; 10 | pub mod util; 11 | 12 | use std::{path::Path, sync::Arc, time::Instant}; 13 | 14 | use ignore::WalkBuilder; 15 | use parser::{error::DetectError, RawParser, Typechecker}; 16 | use predicate::Predicate; 17 | use slog::{debug, info, warn, Logger}; 18 | 19 | /// Runtime configuration for detect operations 20 | #[derive(Debug, Clone)] 21 | pub struct RuntimeConfig { 22 | /// Maximum file size (in bytes) for structured data parsing (YAML/JSON/TOML) 23 | /// Files larger than this will skip structured data evaluation 24 | pub max_structured_size: u64, 25 | } 26 | 27 | impl Default for RuntimeConfig { 28 | fn default() -> Self { 29 | Self { 30 | max_structured_size: 10 * 1024 * 1024, // 10MB default 31 | } 32 | } 33 | } 34 | 35 | pub async fn parse_and_run_fs( 36 | logger: Logger, 37 | root: &Path, 38 | respect_gitignore: bool, 39 | expr: String, 40 | config: RuntimeConfig, 41 | mut on_match: F, 42 | ) -> Result { 43 | let original_query = expr.clone(); 44 | let parse_result = RawParser::parse_raw_expr(&expr) 45 | .and_then(|raw_expr| Typechecker::typecheck(raw_expr, &expr, &config)); 46 | 47 | match parse_result { 48 | Ok(parsed_expr) => { 49 | if !root.exists() { 50 | return Err(DetectError::DirectoryNotFound { 51 | path: root.display().to_string(), 52 | }); 53 | } 54 | if !root.is_dir() { 55 | return Err(DetectError::NotADirectory { 56 | path: root.display().to_string(), 57 | }); 58 | } 59 | 60 | let walker = WalkBuilder::new(root) 61 | .hidden(false) 62 | .git_ignore(respect_gitignore) 63 | .filter_entry(|entry| { 64 | // Always exclude VCS directories, regardless of gitignore settings 65 | // This matches ripgrep's behavior 66 | !entry 67 | .file_name() 68 | .to_str() 69 | .is_some_and(|s| s == ".git" || s == ".hg" || s == ".svn") 70 | }) 71 | .build(); 72 | 73 | let expr = parsed_expr.map_predicate_ref(|p| match p { 74 | Predicate::Name(n) => 
Predicate::Name(Arc::clone(n)), 75 | Predicate::Metadata(m) => Predicate::Metadata(Arc::clone(m)), 76 | Predicate::Content(c) => Predicate::Content(c.as_ref()), 77 | Predicate::Structured(s) => Predicate::Structured(s.clone()), 78 | }); 79 | 80 | info!(logger, "parsed expression"; "expr" => %expr); 81 | 82 | let mut match_count = 0; 83 | for entry in walker { 84 | let entry = match entry { 85 | Ok(e) => e, 86 | Err(e) => { 87 | // Skip entries we can't access (permission denied, etc.) 88 | warn!(logger, "skipping entry due to walker error"; "error" => %e); 89 | continue; 90 | } 91 | }; 92 | let path = entry.path(); 93 | 94 | if path == root { 95 | continue; 96 | } 97 | 98 | let start = Instant::now(); 99 | 100 | let is_match = match eval::fs::eval(&logger, &expr, path, Some(root)).await { 101 | Ok(result) => result, 102 | Err(e) => { 103 | // Handle I/O errors gracefully - skip files we can't access 104 | if e.kind() == std::io::ErrorKind::PermissionDenied { 105 | debug!(logger, "skipping file due to permission denied"; "path" => #?path); 106 | continue; 107 | } 108 | // For other I/O errors, also skip but log at warning level 109 | warn!(logger, "skipping file due to I/O error"; "path" => #?path, "error" => %e); 110 | continue; 111 | } 112 | }; 113 | 114 | let duration = start.elapsed(); 115 | 116 | debug!(logger, "visited entity"; "path" => #?path, "duration" => #?duration, "result" => is_match); 117 | 118 | if is_match { 119 | match_count += 1; 120 | on_match(path); 121 | } 122 | } 123 | 124 | if match_count == 0 { 125 | eprintln!("No files matched the query: {original_query}"); 126 | eprintln!("Searched in: {}", root.display()); 127 | if respect_gitignore { 128 | eprintln!("Hint: Use -i flag to include gitignored files, or try broadening your search criteria"); 129 | } else { 130 | eprintln!("Hint: Try broadening your search criteria or check the path/expression syntax"); 131 | } 132 | } 133 | 134 | Ok(match_count) 135 | } 136 | Err(err) => Err(err), 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/parser/aliases.rs: -------------------------------------------------------------------------------- 1 | //! Single-word predicate aliases 2 | //! 3 | //! Provides shorthand syntax like `dir` instead of `type == dir`, 4 | //! enabling more natural queries: `dir && depth > 0` 5 | //! 6 | //! Also handles structured data selectors like `yaml:.field` as existence predicates. 7 | 8 | use std::sync::Arc; 9 | 10 | use super::typed::{parse_structured_selector, AliasError, DataFormat}; 11 | use crate::predicate::{ 12 | DetectFileType, EnumMatcher, EnumPredicate, MetadataPredicate, Predicate, 13 | StructuredDataPredicate, 14 | }; 15 | 16 | /// Resolve a single-word alias to a predicate 17 | /// 18 | /// Supports: 19 | /// - File type aliases: `file`, `dir`, `symlink`, etc. 
20 | /// - Structured data selectors: `yaml:.field`, `json:.path`, `toml:.key` (existence check) 21 | /// 22 | /// Example: `resolve_alias("dir")` is equivalent to `type == dir` 23 | /// Example: `resolve_alias("yaml:.spec")` checks if `.spec` exists in YAML file 24 | pub fn resolve_alias(word: &str) -> Result { 25 | // Check if it's a structured selector 26 | match parse_structured_selector(word) { 27 | Ok(Some((format, components))) => { 28 | // Create existence predicate based on format 29 | let predicate = match format { 30 | DataFormat::Yaml => StructuredDataPredicate::YamlExists { path: components }, 31 | DataFormat::Json => StructuredDataPredicate::JsonExists { path: components }, 32 | DataFormat::Toml => StructuredDataPredicate::TomlExists { path: components }, 33 | }; 34 | return Ok(Predicate::Structured(predicate)); 35 | } 36 | Ok(None) => { 37 | // Not a structured selector, try file type alias 38 | } 39 | Err(e) => { 40 | return Err(AliasError::Structured(e)); 41 | } 42 | } 43 | 44 | // Try to resolve as file type alias 45 | match DetectFileType::from_str(word) { 46 | Ok(file_type) => Ok(Predicate::Metadata(Arc::new(MetadataPredicate::Type( 47 | EnumMatcher::Equals(file_type), 48 | )))), 49 | Err(_) => Err(AliasError::UnknownAlias(word.to_string())), 50 | } 51 | } 52 | 53 | /// Suggest similar aliases for an unknown word 54 | /// 55 | /// Uses simple edit distance to find close matches 56 | pub fn suggest_aliases(word: &str) -> Vec { 57 | let all_aliases = DetectFileType::all_valid_strings(); 58 | 59 | all_aliases 60 | .iter() 61 | .filter(|&&alias| levenshtein_distance(word, alias) <= 2) 62 | .map(|&s| s.to_string()) 63 | .collect() 64 | } 65 | 66 | /// Simple Levenshtein distance implementation for fuzzy matching 67 | fn levenshtein_distance(a: &str, b: &str) -> usize { 68 | let a_chars: Vec = a.chars().collect(); 69 | let b_chars: Vec = b.chars().collect(); 70 | let a_len = a_chars.len(); 71 | let b_len = b_chars.len(); 72 | 73 | if a_len == 0 { 74 | return b_len; 75 | } 76 | if b_len == 0 { 77 | return a_len; 78 | } 79 | 80 | let mut prev_row: Vec = (0..=b_len).collect(); 81 | let mut curr_row = vec![0; b_len + 1]; 82 | 83 | for (i, a_char) in a_chars.iter().enumerate() { 84 | curr_row[0] = i + 1; 85 | 86 | for (j, b_char) in b_chars.iter().enumerate() { 87 | let cost = usize::from(a_char != b_char); 88 | curr_row[j + 1] = (curr_row[j] + 1) // insertion 89 | .min(prev_row[j + 1] + 1) // deletion 90 | .min(prev_row[j] + cost); // substitution 91 | } 92 | 93 | std::mem::swap(&mut prev_row, &mut curr_row); 94 | } 95 | 96 | prev_row[b_len] 97 | } 98 | 99 | #[cfg(test)] 100 | mod tests { 101 | use super::*; 102 | 103 | #[test] 104 | fn test_file_type_aliases() { 105 | // All file type aliases should resolve 106 | assert!(resolve_alias("file").is_ok()); 107 | assert!(resolve_alias("dir").is_ok()); 108 | assert!(resolve_alias("directory").is_ok()); 109 | assert!(resolve_alias("symlink").is_ok()); 110 | assert!(resolve_alias("link").is_ok()); 111 | assert!(resolve_alias("socket").is_ok()); 112 | assert!(resolve_alias("sock").is_ok()); 113 | assert!(resolve_alias("fifo").is_ok()); 114 | assert!(resolve_alias("pipe").is_ok()); 115 | assert!(resolve_alias("block").is_ok()); 116 | assert!(resolve_alias("blockdev").is_ok()); 117 | assert!(resolve_alias("char").is_ok()); 118 | assert!(resolve_alias("chardev").is_ok()); 119 | } 120 | 121 | #[test] 122 | fn test_unknown_alias() { 123 | let result = resolve_alias("unknown"); 124 | assert!(matches!(result, Err(AliasError::UnknownAlias(_)))); 
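        // Near-miss spellings are not resolved here; suggest_aliases (exercised in test_suggestions below) covers those.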
125 | } 126 | 127 | #[test] 128 | fn test_case_insensitive() { 129 | // DetectFileType::from_str is case-insensitive 130 | assert!(resolve_alias("FILE").is_ok()); 131 | assert!(resolve_alias("Dir").is_ok()); 132 | assert!(resolve_alias("SYMLINK").is_ok()); 133 | } 134 | 135 | #[test] 136 | fn test_suggestions() { 137 | // Close matches should be suggested 138 | let suggestions = suggest_aliases("fil"); 139 | assert!(suggestions.contains(&"file".to_string())); 140 | 141 | let suggestions = suggest_aliases("direktory"); 142 | assert!(suggestions.contains(&"directory".to_string())); 143 | } 144 | 145 | #[test] 146 | fn test_levenshtein_distance() { 147 | assert_eq!(levenshtein_distance("file", "file"), 0); 148 | assert_eq!(levenshtein_distance("file", "fil"), 1); 149 | assert_eq!(levenshtein_distance("directory", "dir"), 6); 150 | assert_eq!(levenshtein_distance("", "test"), 4); 151 | assert_eq!(levenshtein_distance("test", ""), 4); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/parser/ast.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, PartialEq)] 2 | pub struct RawPredicate<'a> { 3 | pub selector: &'a str, 4 | pub operator: &'a str, 5 | pub value: RawValue<'a>, 6 | pub span: pest::Span<'a>, 7 | // Subcomponent spans for precise error reporting 8 | pub selector_span: pest::Span<'a>, 9 | pub operator_span: pest::Span<'a>, 10 | pub value_span: pest::Span<'a>, 11 | } 12 | 13 | #[derive(Debug, Clone, PartialEq)] 14 | pub enum RawValue<'a> { 15 | Quoted(&'a str), // Explicitly quoted by user (quotes stripped, escapes preserved) 16 | Raw(&'a str), // Raw token - could be bare word, [set], (group), {curly}, etc. 17 | // Typechecker interprets based on operator context 18 | } 19 | 20 | #[derive(Debug, Clone, PartialEq)] 21 | pub enum RawExpr<'a> { 22 | Not(Box>), 23 | And(Box>, Box>), 24 | Or(Box>, Box>), 25 | Predicate(RawPredicate<'a>), 26 | SingleWord(pest::Span<'a>), 27 | } 28 | 29 | impl<'a> RawExpr<'a> { 30 | /// Convert to test-friendly expression without spans 31 | pub fn to_test_expr(&self) -> test_utils::RawTestExpr<'a> { 32 | match self { 33 | RawExpr::Not(expr) => test_utils::RawTestExpr::Not(Box::new(expr.to_test_expr())), 34 | RawExpr::And(left, right) => test_utils::RawTestExpr::And( 35 | Box::new(left.to_test_expr()), 36 | Box::new(right.to_test_expr()), 37 | ), 38 | RawExpr::Or(left, right) => test_utils::RawTestExpr::Or( 39 | Box::new(left.to_test_expr()), 40 | Box::new(right.to_test_expr()), 41 | ), 42 | RawExpr::Predicate(pred) => { 43 | test_utils::RawTestExpr::Predicate(pred.to_test_predicate()) 44 | } 45 | RawExpr::SingleWord(span) => test_utils::RawTestExpr::SingleWord(span.as_str()), 46 | } 47 | } 48 | } 49 | 50 | impl<'a> RawPredicate<'a> { 51 | /// Convert to test-friendly predicate without spans 52 | pub fn to_test_predicate(&self) -> test_utils::RawTestPredicate<'a> { 53 | test_utils::RawTestPredicate { 54 | selector: self.selector, 55 | operator: self.operator, 56 | value: self.value.to_test_value(), 57 | } 58 | } 59 | } 60 | 61 | impl<'a> RawValue<'a> { 62 | /// Get the string value (works for both Quoted and Raw) 63 | pub fn as_string(&self) -> &'a str { 64 | match self { 65 | RawValue::Quoted(s) | RawValue::Raw(s) => s, 66 | } 67 | } 68 | 69 | /// Check if this is a quoted value (user explicitly quoted it) 70 | pub fn is_quoted(&self) -> bool { 71 | matches!(self, RawValue::Quoted(_)) 72 | } 73 | 74 | /// Convert to test-friendly value without 
spans 75 | pub fn to_test_value(&self) -> test_utils::RawTestValue<'a> { 76 | match self { 77 | RawValue::Quoted(s) => test_utils::RawTestValue::Quoted(s), 78 | RawValue::Raw(s) => test_utils::RawTestValue::Raw(s), 79 | } 80 | } 81 | } 82 | 83 | pub mod test_utils { 84 | #[derive(Debug, Clone, PartialEq)] 85 | pub struct RawTestPredicate<'a> { 86 | pub selector: &'a str, 87 | pub operator: &'a str, 88 | pub value: RawTestValue<'a>, 89 | } 90 | 91 | #[derive(Debug, Clone, PartialEq)] 92 | pub enum RawTestValue<'a> { 93 | Quoted(&'a str), // Explicitly quoted by user 94 | Raw(&'a str), // Raw token (bare word, [brackets], (parens), {curlies}) 95 | } 96 | 97 | impl std::fmt::Display for RawTestValue<'_> { 98 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 99 | match self { 100 | RawTestValue::Quoted(s) | RawTestValue::Raw(s) => write!(f, "{s}"), 101 | } 102 | } 103 | } 104 | 105 | #[derive(Debug, Clone, PartialEq)] 106 | pub enum RawTestExpr<'a> { 107 | Not(Box>), 108 | And(Box>, Box>), 109 | Or(Box>, Box>), 110 | Predicate(RawTestPredicate<'a>), 111 | SingleWord(&'a str), 112 | } 113 | 114 | impl<'a> RawTestExpr<'a> { 115 | pub fn string_predicate(selector: &'a str, operator: &'a str, value: &'a str) -> Self { 116 | RawTestExpr::Predicate(RawTestPredicate { 117 | selector, 118 | operator, 119 | value: RawTestValue::Raw(value), 120 | }) 121 | } 122 | 123 | pub fn quoted_predicate(selector: &'a str, operator: &'a str, value: &'a str) -> Self { 124 | RawTestExpr::Predicate(RawTestPredicate { 125 | selector, 126 | operator, 127 | value: RawTestValue::Quoted(value), 128 | }) 129 | } 130 | 131 | pub fn set_predicate(selector: &'a str, operator: &'a str, values: Vec<&'a str>) -> Self { 132 | let value = format!("[{}]", values.join(",")); 133 | RawTestExpr::Predicate(RawTestPredicate { 134 | selector, 135 | operator, 136 | value: RawTestValue::Raw(Box::leak(value.into_boxed_str())), 137 | }) 138 | } 139 | 140 | pub fn single_word(word: &'a str) -> Self { 141 | RawTestExpr::SingleWord(word) 142 | } 143 | 144 | pub fn and(left: RawTestExpr<'a>, right: RawTestExpr<'a>) -> Self { 145 | RawTestExpr::And(Box::new(left), Box::new(right)) 146 | } 147 | 148 | pub fn or(left: RawTestExpr<'a>, right: RawTestExpr<'a>) -> Self { 149 | RawTestExpr::Or(Box::new(left), Box::new(right)) 150 | } 151 | 152 | #[allow(clippy::should_implement_trait)] 153 | pub fn not(expr: RawTestExpr<'a>) -> Self { 154 | RawTestExpr::Not(Box::new(expr)) 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/expr.rs: -------------------------------------------------------------------------------- 1 | pub mod frame; 2 | pub mod short_circuit; 3 | 4 | use std::{fmt::Display, sync::Arc}; 5 | 6 | use crate::{ 7 | expr::frame::ExprFrame, 8 | predicate::{self, Predicate}, 9 | }; 10 | use frame::MapPredicateRef; 11 | use recursion::CollapsibleExt; 12 | 13 | use self::short_circuit::ShortCircuit; 14 | 15 | /// Filesystem entity matcher expression with boolean logic and predicates 16 | #[derive(Debug, PartialEq, Eq)] 17 | pub enum Expr { 18 | Not(Box), 19 | And(Box, Box), 20 | Or(Box, Box), 21 | Predicate(Predicate), 22 | Literal(bool), 23 | } 24 | 25 | impl Display for Expr
<P>
{ 26 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 27 | match self { 28 | Expr::Not(e) => f.write_str(&format!("!{e}")), 29 | Expr::And(a, b) => f.write_str(&format!("{a} && {b}")), 30 | Expr::Or(a, b) => f.write_str(&format!("{a} || {b}")), 31 | Expr::Predicate(p) => write!(f, "{:?}", &p), 32 | Expr::Literal(x) => f.write_str(&x.to_string()), 33 | } 34 | } 35 | } 36 | 37 | impl Expr> { 38 | pub fn name_predicate(x: A) -> Self { 39 | Self::Predicate(Predicate::Name(Arc::new(x))) 40 | } 41 | pub fn meta_predicate(x: B) -> Self { 42 | Self::Predicate(Predicate::Metadata(Arc::new(x))) 43 | } 44 | pub fn content_predicate(x: C) -> Self { 45 | Self::Predicate(Predicate::Content(x)) 46 | } 47 | } 48 | 49 | impl Expr> { 50 | /// Check if expression contains any Structured predicates 51 | pub fn contains_structured_predicates(&self) -> bool { 52 | MapPredicateRef(self).collapse_frames(|e| match e { 53 | ExprFrame::Predicate(Predicate::Structured(_)) => true, 54 | ExprFrame::And(a, b) | ExprFrame::Or(a, b) => a || b, 55 | ExprFrame::Not(a) => a, 56 | ExprFrame::Predicate(_) | ExprFrame::Literal(_) => false, 57 | }) 58 | } 59 | 60 | /// Check if expression contains any Content predicates 61 | pub fn contains_content_predicates(&self) -> bool { 62 | MapPredicateRef(self).collapse_frames(|e| match e { 63 | ExprFrame::Predicate(Predicate::Content(_)) => true, 64 | ExprFrame::And(a, b) | ExprFrame::Or(a, b) => a || b, 65 | ExprFrame::Not(a) => a, 66 | ExprFrame::Predicate(_) | ExprFrame::Literal(_) => false, 67 | }) 68 | } 69 | } 70 | 71 | impl
<P> Expr<P>
{ 72 | pub fn map_predicate_ref<'a, B>(&'a self, f: impl Fn(&'a P) -> B) -> Expr { 73 | MapPredicateRef(self).collapse_frames(|e| match e { 74 | // apply 'f' to Predicate expressions 75 | ExprFrame::Predicate(p) => Expr::Predicate(f(p)), 76 | ExprFrame::And(a, b) => Expr::and(a, b), 77 | ExprFrame::Or(a, b) => Expr::or(a, b), 78 | ExprFrame::Not(a) => Expr::negate(a), 79 | ExprFrame::Literal(x) => Expr::Literal(x), 80 | }) 81 | } 82 | 83 | pub fn and(a: Self, b: Self) -> Self { 84 | Self::And(Box::new(a), Box::new(b)) 85 | } 86 | pub fn or(a: Self, b: Self) -> Self { 87 | Self::Or(Box::new(a), Box::new(b)) 88 | } 89 | pub fn negate(a: Self) -> Self { 90 | Self::Not(Box::new(a)) 91 | } 92 | } 93 | 94 | impl Expr
<P>
{ 95 | pub fn map_predicate(self, f: impl Fn(P) -> B) -> Expr { 96 | self.collapse_frames(|e| match e { 97 | // apply 'f' to Predicate expressions 98 | ExprFrame::Predicate(p) => Expr::Predicate(f(p)), 99 | ExprFrame::And(a, b) => Expr::and(a, b), 100 | ExprFrame::Or(a, b) => Expr::or(a, b), 101 | ExprFrame::Not(a) => Expr::negate(a), 102 | ExprFrame::Literal(x) => Expr::Literal(x), 103 | }) 104 | } 105 | 106 | pub fn map_predicate_err(self, f: impl Fn(P) -> Result) -> Result, E> { 107 | self.collapse_frames(|e| match e { 108 | // apply 'f' to Predicate expressions 109 | ExprFrame::Predicate(p) => Ok(Expr::Predicate(f(p)?)), 110 | ExprFrame::And(a, b) => Ok(Expr::and(a?, b?)), 111 | ExprFrame::Or(a, b) => Ok(Expr::or(a?, b?)), 112 | ExprFrame::Not(a) => Ok(Expr::negate(a?)), 113 | ExprFrame::Literal(x) => Ok(Expr::Literal(x)), 114 | }) 115 | } 116 | } 117 | 118 | impl Expr
<P>
{ 119 | pub fn reduce_predicate_and_short_circuit>>( 120 | &self, 121 | mut f: impl FnMut(P) -> X, 122 | ) -> Expr { 123 | self.collapse_frames(|e| match e { 124 | // apply 'f' to Predicate expressions 125 | ExprFrame::Predicate(p) => match f(p).into() { 126 | ShortCircuit::Known(b) => Expr::Literal(b), 127 | ShortCircuit::Unknown(p) => Expr::Predicate(p), 128 | }, 129 | // reduce And expressions 130 | ExprFrame::And(Expr::Literal(false), _) => Expr::Literal(false), 131 | ExprFrame::And(_, Expr::Literal(false)) => Expr::Literal(false), 132 | ExprFrame::And(x, Expr::Literal(true)) => x, 133 | ExprFrame::And(Expr::Literal(true), x) => x, 134 | ExprFrame::And(a, b) => Expr::and(a, b), 135 | // reduce Or expressions 136 | ExprFrame::Or(Expr::Literal(true), _) => Expr::Literal(true), 137 | ExprFrame::Or(_, Expr::Literal(true)) => Expr::Literal(true), 138 | ExprFrame::Or(x, Expr::Literal(false)) => x, 139 | ExprFrame::Or(Expr::Literal(false), x) => x, 140 | ExprFrame::Or(a, b) => Expr::or(a, b), 141 | // reduce Not expressions 142 | ExprFrame::Not(Expr::Literal(k)) => Expr::Literal(!k), 143 | ExprFrame::Not(x) => Expr::negate(x), 144 | // Literal expressions are unchanged 145 | ExprFrame::Literal(x) => Expr::Literal(x), 146 | }) 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{env::current_dir, io::Write, path::PathBuf, str::FromStr}; 2 | 3 | use clap::Parser; 4 | use detect::{parse_and_run_fs, RuntimeConfig}; 5 | use slog::{o, Drain, Level, Logger}; 6 | 7 | const EXAMPLES: &str = include_str!("../docs/examples.md"); 8 | const PREDICATES: &str = include_str!("../docs/predicates.md"); 9 | const OPERATORS: &str = include_str!("../docs/operators.md"); 10 | 11 | #[derive(Parser, Debug)] 12 | #[command( 13 | name = "detect", 14 | author, 15 | version, 16 | about = "Find filesystem entities using expressions", 17 | long_about = "Find filesystem entities using expressions 18 | 19 | EXIT CODES: 20 | 0 Matches found 21 | 1 No matches found 22 | 2 Error (parse error, directory not found, etc.)" 23 | )] 24 | struct Args { 25 | /// Show help on specific topics: examples, predicates, operators 26 | /// 27 | /// Without argument, lists available topics 28 | #[arg( 29 | long = "explain", 30 | value_name = "TOPIC", 31 | num_args = 0..=1, 32 | default_missing_value = "list", 33 | require_equals = false, 34 | )] 35 | explain: Option, 36 | 37 | /// filtering expr 38 | #[clap(index = 1, required_unless_present = "explain")] 39 | expr: Option, 40 | 41 | /// target dir 42 | #[clap(index = 2)] 43 | path: Option, 44 | /// include gitignored files 45 | #[arg(short = 'i')] 46 | visit_gitignored: bool, 47 | /// log level (trace/debug/info/warning/error/critical) 48 | #[arg(short = 'l', default_value = "warning")] 49 | log_level: String, 50 | /// Maximum file size for structured data parsing (yaml/json/toml) 51 | /// Supports units: kb, mb, gb (e.g., "10mb", "500kb") 52 | #[arg(long = "max-structured-size", default_value = "10mb")] 53 | max_structured_size: String, 54 | } 55 | 56 | #[tokio::main] 57 | pub async fn main() -> Result<(), Box> { 58 | let args = Args::parse(); 59 | 60 | // Handle --explain flag 61 | if let Some(topic) = args.explain { 62 | match topic.to_lowercase().as_str() { 63 | "list" => { 64 | println!("Available help topics:\n"); 65 | println!(" examples - Practical usage examples for common tasks"); 66 | println!( 67 | " predicates - Reference 
of all selectors (name, size, content, yaml, etc.)" 68 | ); 69 | println!(" operators - Reference of all operators (==, contains, ~=, etc.)"); 70 | println!("\nUsage: detect --explain "); 71 | println!(" or: detect --explain (shows this list)"); 72 | } 73 | "examples" => println!("{}", EXAMPLES), 74 | "predicates" | "selectors" => println!("{}", PREDICATES), 75 | "operators" | "ops" => println!("{}", OPERATORS), 76 | _ => { 77 | eprintln!("Error: Unknown topic '{}'\n", topic); 78 | eprintln!("Available topics: examples, predicates, operators"); 79 | eprintln!("Run 'detect --explain' to see all topics"); 80 | std::process::exit(2); 81 | } 82 | } 83 | return Ok(()); 84 | } 85 | 86 | let expr = args 87 | .expr 88 | .expect("Expression required when --explain isn't used, should be present"); 89 | 90 | let max_structured_size = 91 | detect::util::parse_size(&args.max_structured_size).unwrap_or_else(|e| { 92 | eprintln!("Error: {e}"); 93 | std::process::exit(1); 94 | }); 95 | 96 | let config = RuntimeConfig { 97 | max_structured_size, 98 | }; 99 | 100 | let log_level = Level::from_str(&args.log_level).unwrap_or_else(|_| { 101 | eprintln!( 102 | "Error: Invalid log level '{}'\nValid options: trace, debug, info, warning, error, critical", 103 | args.log_level 104 | ); 105 | std::process::exit(1); 106 | }); 107 | 108 | let plain = slog_term::PlainSyncDecorator::new(std::io::stdout()); 109 | let logger = Logger::root( 110 | RuntimeLevelFilter { 111 | drain: slog_term::FullFormat::new(plain).build(), 112 | level: log_level, 113 | } 114 | .fuse(), 115 | o!(), 116 | ); 117 | 118 | let root_path = match args.path { 119 | Some(path) => path, 120 | None => current_dir()?, 121 | }; 122 | 123 | // Canonicalize root path for relative path computation 124 | let canonical_root = root_path 125 | .canonicalize() 126 | .unwrap_or_else(|_| root_path.clone()); 127 | 128 | let mut output = std::io::stdout(); 129 | 130 | let result = parse_and_run_fs( 131 | logger, 132 | &root_path, 133 | !args.visit_gitignored, 134 | expr, 135 | config, 136 | |s| { 137 | let display_path = s 138 | .strip_prefix(&canonical_root) 139 | .unwrap_or(s) 140 | .to_string_lossy(); 141 | 142 | if let Err(e) = writeln!(output, "./{}", display_path) { 143 | if e.kind() == std::io::ErrorKind::BrokenPipe { 144 | // Unix convention: exit 0 on SIGPIPE/BrokenPipe 145 | std::process::exit(0); 146 | } else { 147 | eprintln!("Output error: {}", e); 148 | std::process::exit(1); 149 | } 150 | } 151 | }, 152 | ) 153 | .await; 154 | 155 | match result { 156 | Ok(match_count) => { 157 | if match_count > 0 { 158 | std::process::exit(0); // Matches found 159 | } else { 160 | std::process::exit(1); // No matches 161 | } 162 | } 163 | Err(e) => { 164 | eprintln!("{:?}", miette::Report::new(e)); 165 | std::process::exit(2); // Error 166 | } 167 | } 168 | } 169 | 170 | struct RuntimeLevelFilter { 171 | drain: D, 172 | level: Level, 173 | } 174 | 175 | impl Drain for RuntimeLevelFilter 176 | where 177 | D: Drain, 178 | { 179 | type Ok = Option; 180 | type Err = Option; 181 | 182 | fn log( 183 | &self, 184 | record: &slog::Record, 185 | values: &slog::OwnedKVList, 186 | ) -> Result { 187 | if record.level().is_at_least(self.level) { 188 | self.drain.log(record, values).map(Some).map_err(Some) 189 | } else { 190 | Ok(None) 191 | } 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
[![Crates.io](https://img.shields.io/crates/v/detect.svg)](https://crates.io/crates/detect) 2 | 3 | # detect 4 | 5 | A modern replacement for find/grep using an intuitive expression language. 6 | 7 | - **Readable syntax**: `ext == ts AND size > 50kb` instead of `find . -name "*.ts" -size +50k` 8 | - **Unified queries**: Combine filename + content + metadata instead of chaining multiple processes 9 | - **Lazy evaluation**: Detect checks cheap predicates first (filename, metadata) and short circuits whenever possible 10 | 11 | [Quick start](#quick-start) • [Installation](#installation) • [Query language](#query-language) • [Examples](#examples) 12 | 13 | Traditional Unix tools require chaining multiple commands with cryptic syntax: 14 | 15 | ```bash 16 | # Find Rust files importing BOTH tokio and serde 17 | detect 'ext == rs 18 | AND content contains "use tokio" 19 | AND content contains "use serde"' 20 | 21 | # Traditional approach - scan all .rs files, then scan matches a second time 22 | grep -rl 'use tokio' --include="*.rs" | xargs grep -l 'use serde' 23 | ``` 24 | 25 | Detect also supports searches inspecting structured data in YAML, TOML, and JSON files: 26 | 27 | ```bash 28 | # Find Cargo.toml files with package edition 2018 29 | detect 'name == "Cargo.toml" AND toml:.package.edition == 2018' 30 | 31 | # using regexes (may result in false positives) 32 | find . -name "Cargo.toml" -exec grep -q 'edition.*"2018"' {} \; -print 33 | 34 | # using cryptaliagy's tomlq crate 35 | find . -name "Cargo.toml" -exec sh -c ' 36 | tq -f "$1" -r ".package.edition" 2>/dev/null | grep -q "2018" 37 | ' _ {} \; -print 38 | ``` 39 | 40 | 41 | ## Installation 42 | 43 | ### From crates.io 44 | 45 | ```bash 46 | cargo install detect 47 | ``` 48 | 49 | ### Building from source 50 | 51 | **Prerequisites:** Rust toolchain (1.70+) 52 | 53 | ```bash 54 | git clone https://github.com/inanna-malick/detect.git 55 | cd detect 56 | cargo build --release 57 | 58 | # Binary will be at ./target/release/detect 59 | # Optionally install globally: 60 | cargo install --path . 
61 | ``` 62 | 63 | ## Quick start 64 | 65 | ```bash 66 | detect 'ext == rs' # selector + operator 67 | detect 'ext in [rs,toml] AND size > 1mb' # sets, AND, numeric 68 | detect 'ext == ts AND modified > -7d' # temporal predicates 69 | detect 'ext == ts AND content ~= "class.*Service"' # content, regex 70 | detect '(file OR dir) AND NOT path ~= test' # aliases, grouping, NOT 71 | detect 'yaml:.server.port > 8000 AND size < 0.5mb' # structured data 72 | ``` 73 | 74 | ## Query language 75 | 76 | ### Selectors 77 | 78 | #### File Identity 79 | | Selector | Type | Description | Example | 80 | |----------|------|-------------|---------| 81 | | `name` / `filename` | String | Full filename with extension | `name == "README.md"` | 82 | | `basename` / `stem` | String | Filename without extension | `basename == README` | 83 | | `ext` / `extension` | String | File extension (no dot) | `ext == rs` | 84 | | `path` | String | Full absolute path | `path contains /src/` | 85 | | `dir` / `parent` / `directory` | String | Parent directory path | `dir contains lib` | 86 | 87 | #### File Properties 88 | | Selector | Type | Description | Example | 89 | |----------|------|-------------|---------| 90 | | `size` | Numeric | File size in bytes | `size > 1mb` | 91 | | `type` | Enum | File type (parse-time validated) | `type == file` | 92 | | `depth` | Numeric | Directory depth from search root | `depth <= 3` | 93 | 94 | **Size units:** `kb`, `mb`, `gb`, `tb` (e.g., `1.5mb`, `500kb`) 95 | 96 | **File types** (case-insensitive): `file`, `dir`/`directory`, `symlink`/`link`, `socket`/`sock`, `fifo`/`pipe`, `block`/`blockdev`, `char`/`chardev` 97 | 98 | #### Timestamps 99 | | Selector | Type | Description | Example | 100 | |----------|------|-------------|---------| 101 | | `modified` / `mtime` | Temporal | Last modification time | `modified > -7d` | 102 | | `created` / `ctime` | Temporal | File creation time | `created > 2024-01-01` | 103 | | `accessed` / `atime` | Temporal | Last access time | `accessed < -1h` | 104 | 105 | **Time formats:** Relative `-7d`/`-7days`, `-2h`/`-2hours`, `-1w`/`-1week` (units: `s`, `m`/`min`, `h`/`hr`, `d`/`day`, `w`/`week` + plurals). Absolute `2024-01-15`, `2024-01-15T10:30:00`. 106 | 107 | #### Content 108 | | Selector | Type | Description | Example | 109 | |----------|------|-------------|---------| 110 | | `content` / `text` / `contents` | String | File text contents | `content contains TODO` | 111 | 112 | #### Structured Data 113 | 114 | Query YAML, JSON, and TOML: 115 | 116 | ```bash 117 | yaml:.server # existence check (no operator needed) 118 | yaml:.server.port == 8080 # nested field value 119 | toml:.package.edition == "2021" # value match 120 | yaml:.features[*].enabled == true # wildcard - any array element 121 | json:..password contains prod # recursive - any depth 122 | ``` 123 | 124 | Navigate with `.field`, `.nested.field`, `[0]`, `[*]`, `..field`. Auto-converts between numbers and strings (`yaml:.port == 8080` matches both `8080` and `"8080"`). Default max file size: 10MB (configurable with `--max-structured-size`). 
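
A quick sketch of the number/string coercion described above, run against a hypothetical `config.yaml` (the file contents below are illustrative, not part of this repository):

```bash
# Hypothetical config.yaml:
#   server:
#     port: "8080"        # quoted, so YAML stores it as a string
#     features:
#       - { name: tls, enabled: true }
#       - { name: http2, enabled: false }

# Numeric comparison still matches the quoted "8080" via auto-conversion
detect 'yaml:.server.port == 8080'

# Wildcard over array elements: matches if any feature has enabled == true
detect 'yaml:.server.features[*].enabled == true'
```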
125 | 126 | ### Operators 127 | 128 | | Type | Operators | Example | 129 | |------|-----------|---------| 130 | | String | `==`, `!=`, `contains`, `~=`, `in [a,b]` | `content contains TODO` | 131 | | Numeric | `==`, `!=`, `>`, `<`, `>=`, `<=` | `size > 1mb` | 132 | | Temporal | `>`, `<`, `>=`, `<=`, `==`, `!=` | `modified > -7d` | 133 | | Enum | `==`, `!=`, `in [a,b]` | `type == file` | 134 | | Boolean | `AND`/`&&`, `OR`/`||`, `NOT`/`!`, `()` | `a AND (b OR c)` | 135 | 136 | **Precedence:** `NOT` > `AND` > `OR` 137 | 138 | Full reference: `detect --operators` 139 | 140 | ## Examples 141 | 142 | ```bash 143 | # File metadata combinations 144 | detect 'ext == rs AND size > 1mb AND modified > -7d' 145 | 146 | # Content matching with regex 147 | detect 'ext == ts AND content ~= "class.*Service"' 148 | 149 | # Structured data navigation 150 | detect 'yaml:.server.port == 8080' 151 | detect 'toml:.package.edition == "2021"' 152 | 153 | # Multi-feature real-world queries 154 | detect 'size > 10kb AND modified > -7d AND content contains TODO AND NOT path ~= test' 155 | detect 'yaml:.spec.replicas > 3 AND size < 100kb' 156 | 157 | # Security scanning 158 | detect 'name ~= "^\.env" AND content ~= "(password|secret|key)" AND NOT path ~= node_modules' 159 | 160 | # Migration from find/grep 161 | find . -name "*.ts" -size +1M -mtime -7 → detect 'ext == ts AND size > 1mb AND modified > -7d' 162 | 163 | # CLI options 164 | detect 'ext == rs' ./src # search specific directory 165 | detect -i 'content contains SECRET' # include gitignored files 166 | detect --max-structured-size 50mb 'yaml:.config' # configure size limit for structured files 167 | ``` 168 | 169 | **More examples:** `detect --examples` 170 | 171 | ## Exit codes 172 | 173 | Compatible with scripting and CI/CD pipelines (same as `grep`/`ripgrep`): 174 | 175 | - **0** - Matches found 176 | - **1** - No matches 177 | - **2** - Error (parse error, directory not found, etc.) 178 | 179 | ```bash 180 | # Use in conditionals 181 | if detect 'size > 100mb'; then 182 | echo "Found large files" 183 | fi 184 | 185 | # CI: fail build if TODOs found 186 | detect 'path contains src AND content contains TODO' && exit 1 187 | ``` 188 | 189 | ## Performance 190 | 191 | Queries are evaluated in four phases: name → metadata → structured → content. Each phase can eliminate files before more expensive operations. Content is never read unless the file passes all earlier checks. 192 | 193 | Respects `.gitignore` by default. Traverses directories in parallel. Structured data parsing is limited to 10MB files (configurable). 194 | 195 | ## Contributing 196 | 197 | Contributions welcome. File an issue before major changes. 198 | 199 | ## License 200 | 201 | Licensed under either of: 202 | 203 | - Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or ) 204 | - MIT license ([LICENSE-MIT](LICENSE-MIT) or ) 205 | 206 | at your option. 
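
As a final illustration of the `NOT > AND > OR` precedence listed in the Operators section, an unparenthesized query groups as shown below (selectors and values are arbitrary examples using only syntax documented above):

```bash
# AND binds tighter than OR, so these two queries are equivalent
detect 'ext == md OR ext == rs AND size > 1mb'
detect 'ext == md OR (ext == rs AND size > 1mb)'

# NOT binds tightest: only the path predicate is negated here
detect 'NOT path ~= test AND content contains TODO'
detect '(NOT path ~= test) AND content contains TODO'
```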
207 | -------------------------------------------------------------------------------- /src/parser/raw.rs: -------------------------------------------------------------------------------- 1 | use pest::{ 2 | iterators::Pair, 3 | pratt_parser::{Assoc::Left, Op, PrattParser}, 4 | Parser, 5 | }; 6 | use pest_derive::Parser; 7 | 8 | use super::{ 9 | ast::{RawExpr, RawPredicate, RawValue}, 10 | error::{DetectError, SpanExt}, 11 | }; 12 | 13 | #[derive(Parser)] 14 | #[grammar = "parser/grammar.pest"] 15 | pub struct RawParser; 16 | 17 | impl RawParser { 18 | /// Parse an expression from input string into a Raw AST 19 | pub fn parse_raw_expr(input: &str) -> Result, DetectError> { 20 | let mut pairs = Self::parse(Rule::program, input) 21 | .map_err(|e| DetectError::from_pest(Box::new(e), input.to_string()))?; 22 | 23 | let program_pair = pairs 24 | .next() 25 | .ok_or_else(|| DetectError::internal("Grammar guarantees program exists"))?; 26 | 27 | let expr_pair = program_pair 28 | .into_inner() 29 | .next() 30 | .ok_or_else(|| DetectError::internal("Grammar guarantees program contains expr"))?; 31 | 32 | Self::parse_expr(expr_pair).map_err(|e| e.with_source(input.to_string())) 33 | } 34 | 35 | /// Parse set contents from a string like "rs, js, ts" or "foo, \"bar, baz\", qux" 36 | /// Used by typechecker for 'in' operator 37 | /// 38 | /// Properly handles: 39 | /// - Quoted items with commas: `"foo, bar", baz` 40 | /// - Bare items: `rs, js, ts` 41 | /// - Mixed: `foo, "bar baz", qux` 42 | /// - Trailing commas: `rs, js,` 43 | /// - Empty sets: `` 44 | pub fn parse_set_contents(input: &str) -> Result, DetectError> { 45 | let pairs = Self::parse(Rule::set_contents, input) 46 | .map_err(|e| DetectError::from_pest(Box::new(e), input.to_string()))?; 47 | 48 | let items: Vec = pairs 49 | .flat_map(pest::iterators::Pair::into_inner) // set_contents -> set_items or EOI 50 | .filter(|pair| pair.as_rule() == Rule::set_items) 51 | .flat_map(pest::iterators::Pair::into_inner) // set_items -> set_item* 52 | .filter_map(|item_pair| { 53 | // set_item -> quoted_string | bare_set_item 54 | item_pair.into_inner().next() 55 | }) 56 | .map(|inner| { 57 | match inner.as_rule() { 58 | Rule::quoted_string => { 59 | // quoted_string -> inner_double | inner_single (quotes stripped) 60 | // Preserve all whitespace inside quotes 61 | inner 62 | .into_inner() 63 | .next() 64 | .map(|s| s.as_str().to_string()) 65 | .unwrap_or_default() 66 | } 67 | Rule::bare_set_item => { 68 | // Trim whitespace from bare items 69 | inner.as_str().trim().to_string() 70 | } 71 | _ => String::new(), // Should never happen 72 | } 73 | }) 74 | .filter(|s| !s.is_empty()) 75 | .collect(); 76 | 77 | Ok(items) 78 | } 79 | 80 | fn parse_expr(pair: Pair<'_, Rule>) -> Result, DetectError> { 81 | let pratt = PrattParser::new() 82 | .op(Op::infix(Rule::or, Left)) 83 | .op(Op::infix(Rule::and, Left)) 84 | .op(Op::prefix(Rule::neg)); 85 | 86 | pratt 87 | .map_primary(Self::parse_primary) 88 | .map_infix(Self::parse_infix) 89 | .map_prefix(Self::parse_prefix) 90 | .parse(pair.into_inner()) 91 | } 92 | 93 | fn parse_primary(pair: Pair<'_, Rule>) -> Result, DetectError> { 94 | match pair.as_rule() { 95 | Rule::predicate => Self::parse_predicate(pair), 96 | Rule::single_word => Ok(RawExpr::SingleWord(pair.as_span())), 97 | Rule::expr => Self::parse_expr(pair), 98 | rule => Err(DetectError::internal(format!( 99 | "Unexpected primary rule: {rule:?}" 100 | ))), 101 | } 102 | } 103 | 104 | fn parse_infix<'a>( 105 | lhs: Result, DetectError>, 106 | _pair: Pair<'a, 
Rule>, 107 | rhs: Result, DetectError>, 108 | ) -> Result, DetectError> { 109 | match _pair.as_rule() { 110 | Rule::and => Ok(RawExpr::And(Box::new(lhs?), Box::new(rhs?))), 111 | Rule::or => Ok(RawExpr::Or(Box::new(lhs?), Box::new(rhs?))), 112 | rule => Err(DetectError::internal(format!( 113 | "Unexpected infix rule: {rule:?}" 114 | ))), 115 | } 116 | } 117 | 118 | fn parse_prefix<'a>( 119 | _pair: Pair<'a, Rule>, 120 | rhs: Result, DetectError>, 121 | ) -> Result, DetectError> { 122 | match _pair.as_rule() { 123 | Rule::neg => Ok(RawExpr::Not(Box::new(rhs?))), 124 | rule => Err(DetectError::internal(format!( 125 | "Unexpected prefix rule: {rule:?}" 126 | ))), 127 | } 128 | } 129 | 130 | fn parse_predicate(pair: Pair<'_, Rule>) -> Result, DetectError> { 131 | let span = pair.as_span(); 132 | let mut inner = pair.into_inner(); 133 | 134 | let selector_pair = inner 135 | .next() 136 | .ok_or_else(|| DetectError::internal("Grammar guarantees predicate has selector"))?; 137 | let selector = selector_pair.as_str(); 138 | let selector_span = selector_pair.as_span(); 139 | 140 | let operator_pair = inner 141 | .next() 142 | .ok_or_else(|| DetectError::internal("Grammar guarantees predicate has operator"))?; 143 | let operator = operator_pair.as_str(); 144 | let operator_span = operator_pair.as_span(); 145 | 146 | let value_pair = inner 147 | .next() 148 | .ok_or_else(|| DetectError::internal("Grammar guarantees predicate has value"))?; 149 | let value_span = value_pair.as_span(); 150 | let value = Self::parse_value(value_pair)?; 151 | 152 | Ok(RawExpr::Predicate(RawPredicate { 153 | selector, 154 | operator, 155 | value, 156 | span, 157 | selector_span, 158 | operator_span, 159 | value_span, 160 | })) 161 | } 162 | 163 | fn parse_value(pair: Pair<'_, Rule>) -> Result, DetectError> { 164 | match pair.as_rule() { 165 | Rule::value => { 166 | // value = { value_content ~ trailing_quote? 
} 167 | // Check if there's a trailing quote error 168 | let mut inner = pair.into_inner(); 169 | let value_content = inner 170 | .next() 171 | .ok_or_else(|| DetectError::internal("Grammar guarantees value has content"))?; 172 | 173 | // Check for trailing quote 174 | if let Some(trailing) = inner.next() { 175 | if trailing.as_rule() == Rule::trailing_quote { 176 | let span = trailing.as_span(); 177 | let quote = span.as_str().chars().next().unwrap_or('"'); 178 | return Err(DetectError::StrayQuote { 179 | span: span.to_source_span(), 180 | quote, 181 | src: String::new(), // Will be filled by with_source() 182 | }); 183 | } 184 | } 185 | 186 | // No trailing quote, parse the value content 187 | Self::parse_value(value_content) 188 | } 189 | Rule::quoted_string => { 190 | // Grammar already parsed inner content without quotes 191 | let inner = pair.into_inner().next().ok_or_else(|| { 192 | DetectError::internal("Grammar guarantees quoted_string has inner content") 193 | })?; 194 | Ok(RawValue::Quoted(inner.as_str())) 195 | } 196 | Rule::unterminated_string => { 197 | // Matched an unterminated string literal - return error with proper span 198 | let span = pair.as_span(); 199 | let text = span.as_str(); 200 | let quote = text.chars().next().unwrap_or('"'); 201 | 202 | // Point span at just the opening quote and a few chars (not extending to EOI) 203 | let start = span.start(); 204 | let length = text.len().min(10); // Show first 10 chars max 205 | let error_span = (start, length).into(); 206 | 207 | Err(DetectError::UnterminatedString { 208 | span: error_span, 209 | quote, 210 | src: String::new(), // Will be filled by with_source() 211 | }) 212 | } 213 | Rule::raw_token => { 214 | // All raw tokens stored as-is, typechecker decides meaning based on operator 215 | Ok(RawValue::Raw(pair.as_str())) 216 | } 217 | rule => Err(DetectError::internal(format!( 218 | "Unexpected value rule: {rule:?}" 219 | ))), 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /tests/parser_errors.rs: -------------------------------------------------------------------------------- 1 | use detect::parser::test_utils::RawTestExpr; 2 | use detect::parser::*; 3 | 4 | // ============================================================================== 5 | 6 | #[test] 7 | fn test_unterminated_double_quote() { 8 | // Unterminated double quote at various positions 9 | let result = RawParser::parse_raw_expr(r#"contents ~= "a"#); 10 | assert!(result.is_err(), "Unterminated double quote should fail"); 11 | 12 | let result = RawParser::parse_raw_expr(r#"name == "test"#); 13 | assert!(result.is_err(), "Unterminated double quote should fail"); 14 | 15 | let result = RawParser::parse_raw_expr(r#"ext == "some long string"#); 16 | assert!(result.is_err(), "Unterminated double quote should fail"); 17 | } 18 | 19 | #[test] 20 | fn test_unterminated_single_quote() { 21 | // Unterminated single quote at various positions 22 | let result = RawParser::parse_raw_expr("contents ~= 'a"); 23 | assert!(result.is_err(), "Unterminated single quote should fail"); 24 | 25 | let result = RawParser::parse_raw_expr("name == 'test"); 26 | assert!(result.is_err(), "Unterminated single quote should fail"); 27 | 28 | let result = RawParser::parse_raw_expr("ext == 'foo bar baz"); 29 | assert!(result.is_err(), "Unterminated single quote should fail"); 30 | } 31 | 32 | #[test] 33 | fn test_stray_double_quote_after_value() { 34 | // Stray double quote immediately after valid bare token 35 | let 
result = RawParser::parse_raw_expr(r#"contents ~= a""#); 36 | assert!(result.is_err(), "Stray double quote should fail"); 37 | 38 | let result = RawParser::parse_raw_expr(r#"name == foo""#); 39 | assert!(result.is_err(), "Stray double quote should fail"); 40 | 41 | let result = RawParser::parse_raw_expr(r#"ext == test.rs""#); 42 | assert!(result.is_err(), "Stray double quote should fail"); 43 | } 44 | 45 | #[test] 46 | fn test_stray_single_quote_after_value() { 47 | // Stray single quote immediately after valid bare token 48 | let result = RawParser::parse_raw_expr("contents ~= a'"); 49 | assert!(result.is_err(), "Stray single quote should fail"); 50 | 51 | let result = RawParser::parse_raw_expr("name == foo'"); 52 | assert!(result.is_err(), "Stray single quote should fail"); 53 | 54 | let result = RawParser::parse_raw_expr("ext == test.rs'"); 55 | assert!(result.is_err(), "Stray single quote should fail"); 56 | } 57 | 58 | #[test] 59 | fn test_quote_errors_in_complex_expressions() { 60 | // Unterminated quote in boolean expressions 61 | let result = RawParser::parse_raw_expr(r#"ext == rs AND name == "unterminated"#); 62 | assert!( 63 | result.is_err(), 64 | "Unterminated quote in AND expression should fail" 65 | ); 66 | 67 | let result = RawParser::parse_raw_expr(r#"ext == rs OR name == 'foo"#); 68 | assert!( 69 | result.is_err(), 70 | "Unterminated quote in OR expression should fail" 71 | ); 72 | 73 | // Stray quote in boolean expressions 74 | let result = RawParser::parse_raw_expr(r#"ext == rs AND name == foo""#); 75 | assert!(result.is_err(), "Stray quote in AND expression should fail"); 76 | 77 | let result = RawParser::parse_raw_expr(r#"ext == rs OR name == bar'"#); 78 | assert!(result.is_err(), "Stray quote in OR expression should fail"); 79 | } 80 | 81 | #[test] 82 | fn test_lone_quote_as_value() { 83 | // Single quote character alone should be unterminated 84 | let result = RawParser::parse_raw_expr(r#"contents ~= ""#); 85 | assert!(result.is_err(), "Lone double quote should fail"); 86 | 87 | let result = RawParser::parse_raw_expr("contents ~= '"); 88 | assert!(result.is_err(), "Lone single quote should fail"); 89 | } 90 | 91 | #[test] 92 | fn test_properly_quoted_strings_still_work() { 93 | // Verify that proper quotes continue to work after adding error detection 94 | let result = RawParser::parse_raw_expr(r#"name == "properly quoted""#); 95 | assert!(result.is_ok(), "Properly quoted double quotes should work"); 96 | let expected = RawTestExpr::quoted_predicate("name", "==", "properly quoted"); 97 | assert_eq!(result.unwrap().to_test_expr(), expected); 98 | 99 | let result = RawParser::parse_raw_expr("name == 'properly quoted'"); 100 | assert!(result.is_ok(), "Properly quoted single quotes should work"); 101 | let expected = RawTestExpr::quoted_predicate("name", "==", "properly quoted"); 102 | assert_eq!(result.unwrap().to_test_expr(), expected); 103 | 104 | // With spaces 105 | let result = RawParser::parse_raw_expr(r#"content ~= "test string with spaces""#); 106 | assert!(result.is_ok(), "Quoted string with spaces should work"); 107 | let expected = RawTestExpr::quoted_predicate("content", "~=", "test string with spaces"); 108 | assert_eq!(result.unwrap().to_test_expr(), expected); 109 | } 110 | 111 | #[test] 112 | fn test_edge_case_empty_inputs() { 113 | // Empty string 114 | let result = RawParser::parse_raw_expr(""); 115 | assert!(result.is_err(), "Empty string should fail"); 116 | 117 | // Just whitespace 118 | let result = RawParser::parse_raw_expr(" "); 119 | 
assert!(result.is_err(), "Whitespace only should fail"); 120 | 121 | // Just operators 122 | let result = RawParser::parse_raw_expr("=="); 123 | assert!(result.is_err(), "Operator only should fail"); 124 | } 125 | 126 | #[test] 127 | fn test_malformed_sets() { 128 | // Set with no closing bracket 129 | let result = RawParser::parse_raw_expr("name in [foo, bar"); 130 | assert!(result.is_err(), "Unclosed set should fail"); 131 | 132 | // Set with no opening bracket - actually just parses as "foo" bare token 133 | let result = RawParser::parse_raw_expr("name in foo, bar]"); 134 | assert!(result.is_err(), "Malformed syntax should fail"); 135 | 136 | // Nested sets - simplified grammar now allows this to parse (will fail at typecheck) 137 | let result = RawParser::parse_raw_expr("name in [foo, [bar]]"); 138 | assert!( 139 | result.is_ok(), 140 | "Simplified grammar allows nested brackets (typecheck will handle validity)" 141 | ); 142 | 143 | // Set with trailing comma - now allowed, typechecker filters empty items 144 | let result = RawParser::parse_raw_expr("name in [foo, bar,]"); 145 | assert!(result.is_ok(), "Trailing comma is now allowed"); 146 | 147 | // Set with only commas - parses as empty set after filtering 148 | let result = RawParser::parse_raw_expr("name in [,,,]"); 149 | assert!(result.is_ok(), "Only commas parses as empty set"); 150 | } 151 | 152 | #[test] 153 | fn test_malformed_quotes() { 154 | // Mismatched quotes 155 | let result = RawParser::parse_raw_expr(r#"name == "foo'"#); 156 | assert!(result.is_err(), "Mismatched quotes should fail"); 157 | 158 | let result = RawParser::parse_raw_expr(r#"name == 'foo""#); 159 | assert!(result.is_err(), "Mismatched quotes should fail"); 160 | 161 | // Escaped quote at end without closing 162 | let result = RawParser::parse_raw_expr(r#"name == "foo\""#); 163 | assert!(result.is_err(), "Escaped quote at end should fail"); 164 | 165 | // Multiple quotes 166 | let result = RawParser::parse_raw_expr(r#"name == ""foo""#); 167 | assert!(result.is_err(), "Double quotes should fail"); 168 | 169 | // Quote in the middle of bare value 170 | let result = RawParser::parse_raw_expr(r#"name == fo"o"#); 171 | assert!(result.is_err(), "Quote in middle should fail"); 172 | } 173 | 174 | #[test] 175 | fn test_boolean_logic_edge_cases() { 176 | // Incomplete boolean expressions 177 | let result = RawParser::parse_raw_expr("name == foo AND"); 178 | assert!(result.is_err(), "Incomplete AND should fail"); 179 | 180 | let result = RawParser::parse_raw_expr("OR name == foo"); 181 | assert!(result.is_err(), "Leading OR should fail"); 182 | 183 | let result = RawParser::parse_raw_expr("NOT"); 184 | assert!(result.is_err(), "Standalone NOT should fail"); 185 | 186 | // Multiple consecutive operators 187 | let result = RawParser::parse_raw_expr("name == foo AND OR bar == baz"); 188 | assert!(result.is_err(), "AND OR should fail"); 189 | 190 | let result = RawParser::parse_raw_expr("name == foo NOT AND bar == baz"); 191 | assert!(result.is_err(), "NOT AND should fail"); 192 | 193 | // Multiple NOT 194 | let result = RawParser::parse_raw_expr("NOT NOT name == foo"); 195 | let expected = RawTestExpr::not(RawTestExpr::not(RawTestExpr::string_predicate( 196 | "name", "==", "foo", 197 | ))); 198 | assert_eq!(result.unwrap().to_test_expr(), expected); 199 | 200 | // Mixed NOT usage - prefix operator with NOT as value 201 | let result = RawParser::parse_raw_expr("NOT filename == NOT"); 202 | let expected = RawTestExpr::not(RawTestExpr::string_predicate("filename", "==", 
"NOT")); 203 | assert_eq!(result.unwrap().to_test_expr(), expected); 204 | } 205 | 206 | #[test] 207 | fn test_parentheses_edge_cases() { 208 | // Unmatched parentheses 209 | let result = RawParser::parse_raw_expr("((name == foo)"); 210 | assert!(result.is_err(), "Unmatched opening parens should fail"); 211 | 212 | let result = RawParser::parse_raw_expr("(name == foo))"); 213 | assert!(result.is_err(), "Unmatched closing parens should fail"); 214 | 215 | // Empty parentheses 216 | let result = RawParser::parse_raw_expr("()"); 217 | assert!(result.is_err(), "Empty parentheses should fail"); 218 | 219 | // Parentheses around operators 220 | let result = RawParser::parse_raw_expr("name (==) foo"); 221 | assert!(result.is_err(), "Parentheses around operators should fail"); 222 | 223 | // Basic nested parentheses (deep nesting tested in test_extreme_nesting_limits) 224 | let result = RawParser::parse_raw_expr("((name == foo))"); 225 | let expected = RawTestExpr::string_predicate("name", "==", "foo"); 226 | assert_eq!(result.unwrap().to_test_expr(), expected); 227 | } 228 | -------------------------------------------------------------------------------- /src/eval/fs.rs: -------------------------------------------------------------------------------- 1 | use crate::expr::short_circuit::ShortCircuit; 2 | use crate::expr::Expr; 3 | use crate::predicate::{ 4 | MetadataPredicate, NamePredicate, Predicate, StreamingCompiledContentPredicateRef, 5 | }; 6 | use crate::util::Done; 7 | use futures::{stream, TryStreamExt}; 8 | use slog::{debug, o, Logger}; 9 | use std::path::Path; 10 | use tokio::fs::File; 11 | use tokio::io::BufStream; 12 | use tokio_util::io::ReaderStream; 13 | 14 | use crate::eval::run_contents_predicate_stream; 15 | use crate::eval::structured::{eval_structured_predicate, ParsedDocuments}; 16 | 17 | /// multipass evaluation with short circuiting, runs, in order: 18 | /// - file name matchers 19 | /// - metadata matchers 20 | /// - file content matchers 21 | pub async fn eval<'dfa>( 22 | logger: &Logger, 23 | e: &'dfa Expr< 24 | Predicate>, 25 | >, 26 | path: &Path, 27 | base_path: Option<&Path>, 28 | ) -> std::io::Result { 29 | let logger = logger.new(o!("path" => format!("{:?}", path))); 30 | 31 | debug!(logger, "visit entity"; "expr" => %e); 32 | 33 | let e: Expr>> = 34 | e.reduce_predicate_and_short_circuit(|p| p.eval_name_predicate(path, base_path)); 35 | 36 | if let Expr::Literal(b) = e { 37 | debug!(logger, "short circuit after path predicate eval"; "expr" => %e, "result" => %b); 38 | return Ok(b); 39 | } 40 | 41 | debug!(logger, "reduced expr after path predicate eval"; "expr" => %e); 42 | 43 | let file = File::open(path).await?; 44 | let metadata = file.metadata().await?; 45 | 46 | let e: Expr>> = 47 | e.reduce_predicate_and_short_circuit(|p| p.eval_metadata_predicate(&metadata)); 48 | 49 | if let Expr::Literal(b) = e { 50 | debug!(logger, "short circuit after metadata predicate eval"; "expr" => %e, "result" => %b); 51 | return Ok(b); 52 | } 53 | 54 | debug!(logger, "reduced expr after metadata predicate eval"; "expr" => %e); 55 | 56 | // Determine which predicates remain for optimized file reading 57 | let has_structured = e.contains_structured_predicates(); 58 | let has_content = e.contains_content_predicates(); 59 | 60 | if !metadata.is_file() { 61 | debug!( 62 | logger, 63 | "not a file, all structured/content predicates eval to false" 64 | ); 65 | let e: Expr> = 66 | e.reduce_predicate_and_short_circuit(|p| match p { 67 | Predicate::Content(_) => ShortCircuit::Known(false), 
68 | Predicate::Structured(_) => ShortCircuit::Known(false), 69 | _ => unreachable!( 70 | "only Content and Structured predicates should remain after metadata phase" 71 | ), 72 | }); 73 | 74 | if let Expr::Literal(b) = e { 75 | debug!(logger, "evaluation finished"; "result" => b); 76 | return Ok(b); 77 | } 78 | unreachable!("all predicates should be reduced to literals after evaluation") 79 | } 80 | 81 | match (has_structured, has_content) { 82 | (true, true) => { 83 | debug!( 84 | logger, 85 | "evaluating both structured and content predicates - single file read" 86 | ); 87 | let bytes = tokio::fs::read(path).await?; 88 | 89 | if let Ok(contents) = std::str::from_utf8(&bytes) { 90 | // UTF-8: evaluate structured predicates first 91 | let mut cache = ParsedDocuments::new(); 92 | let e = e.reduce_predicate_and_short_circuit(|p| match p { 93 | Predicate::Structured(s) => { 94 | match eval_structured_predicate(&s, contents, &mut cache) { 95 | Ok(result) => ShortCircuit::Known(result), 96 | Err(_) => ShortCircuit::Known(false), 97 | } 98 | } 99 | Predicate::Content(c) => ShortCircuit::Unknown(Predicate::Content(c)), 100 | _ => unreachable!("only Structured and Content predicates should remain"), 101 | }); 102 | 103 | if let Expr::Literal(b) = e { 104 | debug!(logger, "short circuit after structured predicates"; "result" => b); 105 | return Ok(b); 106 | } 107 | 108 | // Evaluate content predicates using in-memory stream (8KB chunks) 109 | const CHUNK_SIZE: usize = 8192; 110 | let chunks: Vec, std::io::Error>> = bytes 111 | .chunks(CHUNK_SIZE) 112 | .map(|chunk| Ok(chunk.to_vec())) 113 | .collect(); 114 | 115 | let e = run_contents_predicate_stream(e, stream::iter(chunks)).await?; 116 | 117 | if let Expr::Literal(b) = e { 118 | debug!(logger, "evaluation finished"; "result" => b); 119 | Ok(b) 120 | } else { 121 | unreachable!( 122 | "all content predicates should be reduced to literals after streaming" 123 | ) 124 | } 125 | } else { 126 | debug!( 127 | logger, 128 | "file is not UTF-8, structured predicates = false, using streaming content" 129 | ); 130 | // Non-UTF-8: structured predicates fail, stream content 131 | let e = e.reduce_predicate_and_short_circuit(|p| match p { 132 | Predicate::Structured(_) => ShortCircuit::Known(false), 133 | Predicate::Content(c) => ShortCircuit::Unknown(Predicate::Content(c)), 134 | _ => unreachable!("only Structured and Content predicates should remain"), 135 | }); 136 | 137 | if let Expr::Literal(b) = e { 138 | debug!(logger, "short circuit after structured=false"; "result" => b); 139 | return Ok(b); 140 | } 141 | 142 | const CHUNK_SIZE: usize = 8192; 143 | let chunks: Vec, std::io::Error>> = bytes 144 | .chunks(CHUNK_SIZE) 145 | .map(|chunk| Ok(chunk.to_vec())) 146 | .collect(); 147 | 148 | let e = run_contents_predicate_stream(e, stream::iter(chunks)).await?; 149 | 150 | if let Expr::Literal(b) = e { 151 | debug!(logger, "evaluation finished"; "result" => b); 152 | Ok(b) 153 | } else { 154 | unreachable!( 155 | "all content predicates should be reduced to literals after streaming" 156 | ) 157 | } 158 | } 159 | } 160 | (true, false) => { 161 | debug!(logger, "evaluating structured predicates only"); 162 | let e = match tokio::fs::read_to_string(path).await { 163 | Ok(contents) => { 164 | let mut cache = ParsedDocuments::new(); 165 | e.reduce_predicate_and_short_circuit(|p| match p { 166 | Predicate::Structured(s) => { 167 | match eval_structured_predicate(&s, &contents, &mut cache) { 168 | Ok(result) => { 169 | ShortCircuit::>::Known(result) 170 | } 171 | 
Err(_) => ShortCircuit::>::Known(false), 172 | } 173 | } 174 | _ => unreachable!( 175 | "only Structured predicates should remain when has_content is false" 176 | ), 177 | }) 178 | } 179 | Err(_) => { 180 | // Non-UTF-8 or read error: all structured predicates = false 181 | e.reduce_predicate_and_short_circuit(|p| match p { 182 | Predicate::Structured(_) => { 183 | ShortCircuit::>::Known(false) 184 | } 185 | _ => unreachable!( 186 | "only Structured predicates should remain when has_content is false" 187 | ), 188 | }) 189 | } 190 | }; 191 | 192 | if let Expr::Literal(b) = e { 193 | debug!(logger, "evaluation finished"; "result" => b); 194 | Ok(b) 195 | } else { 196 | unreachable!( 197 | "all structured predicates should be reduced to literals after evaluation" 198 | ) 199 | } 200 | } 201 | (false, true) => { 202 | debug!(logger, "evaluating content predicates only - streaming"); 203 | let e = run_contents_predicate_stream( 204 | e, 205 | ReaderStream::new(BufStream::new(file)).map_ok(|b| b.to_vec()), 206 | ) 207 | .await?; 208 | 209 | if let Expr::Literal(b) = e { 210 | debug!(logger, "evaluation finished"; "result" => b); 211 | Ok(b) 212 | } else { 213 | unreachable!("all content predicates should be reduced to literals after streaming") 214 | } 215 | } 216 | (false, false) => { 217 | // No structured or content predicates remain (already short-circuited) 218 | unreachable!( 219 | "both has_structured and has_content are false - should have short-circuited" 220 | ) 221 | } 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /src/parser/structured_path.rs: -------------------------------------------------------------------------------- 1 | //! Parser for structured data path expressions 2 | //! 3 | //! Handles paths like: 4 | //! - `.spec.replicas` → [Key("spec"), Key("replicas")] 5 | //! - `[0].name` → [Index(0), Key("name")] 6 | //! 
- `.items[*].id` → [Key("items"), `WildcardIndex`, Key("id")] 7 | 8 | use pest::{iterators::Pair, Parser}; 9 | use pest_derive::Parser; 10 | use thiserror::Error; 11 | 12 | #[derive(Parser)] 13 | #[grammar = "parser/structured_path.pest"] 14 | pub struct PathParser; 15 | 16 | /// A single component in a path expression 17 | #[derive(Debug, Clone, PartialEq, Eq)] 18 | pub enum PathComponent { 19 | /// Object field access: .fieldname 20 | Key(String), 21 | /// Recursive descent: ..fieldname (matches key at any depth) 22 | RecursiveKey(String), 23 | /// Array index access: [42] 24 | Index(usize), 25 | /// Array wildcard access: [*] 26 | WildcardIndex, 27 | } 28 | 29 | /// Errors that can occur during path parsing 30 | #[derive(Debug, Error, Clone, PartialEq, Eq)] 31 | pub enum PathParseError { 32 | /// Syntax error from Pest parser 33 | #[error("Path syntax error: {0}")] 34 | Syntax(String), 35 | 36 | /// Invalid numeric index value 37 | #[error("Invalid array index '{value}': {reason}")] 38 | InvalidIndex { value: String, reason: String }, 39 | 40 | /// Empty path (no components) 41 | #[error("Path cannot be empty")] 42 | EmptyPath, 43 | } 44 | 45 | /// Parse a path expression into a vector of components 46 | /// 47 | /// # Examples 48 | /// ``` 49 | /// use detect::parser::structured_path::{parse_path, PathComponent}; 50 | /// 51 | /// let components = parse_path(".spec.replicas").unwrap(); 52 | /// assert_eq!(components, vec![ 53 | /// PathComponent::Key("spec".to_string()), 54 | /// PathComponent::Key("replicas".to_string()), 55 | /// ]); 56 | /// 57 | /// let components = parse_path("[0].name").unwrap(); 58 | /// assert_eq!(components, vec![ 59 | /// PathComponent::Index(0), 60 | /// PathComponent::Key("name".to_string()), 61 | /// ]); 62 | /// ``` 63 | pub fn parse_path(input: &str) -> Result, PathParseError> { 64 | if input.is_empty() { 65 | return Err(PathParseError::EmptyPath); 66 | } 67 | 68 | let pairs = PathParser::parse(Rule::path, input) 69 | .map_err(|e| PathParseError::Syntax(format!("Failed to parse path '{input}': {e}")))?; 70 | 71 | let mut components = Vec::new(); 72 | 73 | for pair in pairs { 74 | match pair.as_rule() { 75 | Rule::path => { 76 | // Recurse into path components 77 | for component_pair in pair.into_inner() { 78 | if let Some(component) = parse_component(component_pair)? 
{ 79 | components.push(component); 80 | } 81 | } 82 | } 83 | Rule::EOI => {} // End of input, ignore 84 | _ => { 85 | return Err(PathParseError::Syntax(format!( 86 | "Unexpected rule: {:?}", 87 | pair.as_rule() 88 | ))) 89 | } 90 | } 91 | } 92 | 93 | if components.is_empty() { 94 | return Err(PathParseError::EmptyPath); 95 | } 96 | 97 | Ok(components) 98 | } 99 | 100 | fn parse_component(pair: Pair<'_, Rule>) -> Result, PathParseError> { 101 | match pair.as_rule() { 102 | Rule::recursive_key => { 103 | // recursive_key -> identifier 104 | let identifier = pair 105 | .into_inner() 106 | .next() 107 | .ok_or_else(|| PathParseError::Syntax("Missing identifier".to_string()))?; 108 | Ok(Some(PathComponent::RecursiveKey( 109 | identifier.as_str().to_string(), 110 | ))) 111 | } 112 | Rule::key_access => { 113 | // key_access -> identifier 114 | let identifier = pair 115 | .into_inner() 116 | .next() 117 | .ok_or_else(|| PathParseError::Syntax("Missing identifier".to_string()))?; 118 | Ok(Some(PathComponent::Key(identifier.as_str().to_string()))) 119 | } 120 | Rule::index_access => { 121 | // index_access -> number 122 | let number_pair = pair 123 | .into_inner() 124 | .next() 125 | .ok_or_else(|| PathParseError::Syntax("Missing number".to_string()))?; 126 | let number_str = number_pair.as_str(); 127 | 128 | let index = number_str 129 | .parse::() 130 | .map_err(|e| PathParseError::InvalidIndex { 131 | value: number_str.to_string(), 132 | reason: e.to_string(), 133 | })?; 134 | 135 | Ok(Some(PathComponent::Index(index))) 136 | } 137 | Rule::wildcard_access => Ok(Some(PathComponent::WildcardIndex)), 138 | _ => Ok(None), // Skip unknown rules 139 | } 140 | } 141 | 142 | #[cfg(test)] 143 | mod tests { 144 | use super::*; 145 | 146 | #[test] 147 | fn test_simple_key() { 148 | let result = parse_path(".name").unwrap(); 149 | assert_eq!(result, vec![PathComponent::Key("name".to_string())]); 150 | } 151 | 152 | #[test] 153 | fn test_nested_keys() { 154 | let result = parse_path(".spec.replicas").unwrap(); 155 | assert_eq!( 156 | result, 157 | vec![ 158 | PathComponent::Key("spec".to_string()), 159 | PathComponent::Key("replicas".to_string()), 160 | ] 161 | ); 162 | } 163 | 164 | #[test] 165 | fn test_deep_nesting() { 166 | let result = parse_path(".a.b.c.d").unwrap(); 167 | assert_eq!( 168 | result, 169 | vec![ 170 | PathComponent::Key("a".to_string()), 171 | PathComponent::Key("b".to_string()), 172 | PathComponent::Key("c".to_string()), 173 | PathComponent::Key("d".to_string()), 174 | ] 175 | ); 176 | } 177 | 178 | #[test] 179 | fn test_single_index() { 180 | let result = parse_path("[0]").unwrap(); 181 | assert_eq!(result, vec![PathComponent::Index(0)]); 182 | } 183 | 184 | #[test] 185 | fn test_index_then_key() { 186 | let result = parse_path("[0].name").unwrap(); 187 | assert_eq!( 188 | result, 189 | vec![ 190 | PathComponent::Index(0), 191 | PathComponent::Key("name".to_string()), 192 | ] 193 | ); 194 | } 195 | 196 | #[test] 197 | fn test_key_then_index() { 198 | let result = parse_path(".items[0]").unwrap(); 199 | assert_eq!( 200 | result, 201 | vec![ 202 | PathComponent::Key("items".to_string()), 203 | PathComponent::Index(0), 204 | ] 205 | ); 206 | } 207 | 208 | #[test] 209 | fn test_wildcard() { 210 | let result = parse_path("[*]").unwrap(); 211 | assert_eq!(result, vec![PathComponent::WildcardIndex]); 212 | } 213 | 214 | #[test] 215 | fn test_wildcard_with_keys() { 216 | let result = parse_path(".items[*].id").unwrap(); 217 | assert_eq!( 218 | result, 219 | vec![ 220 | 
PathComponent::Key("items".to_string()), 221 | PathComponent::WildcardIndex, 222 | PathComponent::Key("id".to_string()), 223 | ] 224 | ); 225 | } 226 | 227 | #[test] 228 | fn test_multiple_indices() { 229 | let result = parse_path("[0][1][2]").unwrap(); 230 | assert_eq!( 231 | result, 232 | vec![ 233 | PathComponent::Index(0), 234 | PathComponent::Index(1), 235 | PathComponent::Index(2), 236 | ] 237 | ); 238 | } 239 | 240 | #[test] 241 | fn test_complex_path() { 242 | let result = parse_path(".spec.containers[0].image").unwrap(); 243 | assert_eq!( 244 | result, 245 | vec![ 246 | PathComponent::Key("spec".to_string()), 247 | PathComponent::Key("containers".to_string()), 248 | PathComponent::Index(0), 249 | PathComponent::Key("image".to_string()), 250 | ] 251 | ); 252 | } 253 | 254 | #[test] 255 | fn test_underscore_in_key() { 256 | let result = parse_path(".my_field").unwrap(); 257 | assert_eq!(result, vec![PathComponent::Key("my_field".to_string())]); 258 | } 259 | 260 | #[test] 261 | fn test_mixed_case_key() { 262 | let result = parse_path(".camelCase").unwrap(); 263 | assert_eq!(result, vec![PathComponent::Key("camelCase".to_string())]); 264 | } 265 | 266 | #[test] 267 | fn test_large_index() { 268 | let result = parse_path("[999]").unwrap(); 269 | assert_eq!(result, vec![PathComponent::Index(999)]); 270 | } 271 | 272 | #[test] 273 | fn test_error_empty_path() { 274 | let result = parse_path(""); 275 | assert!(matches!(result, Err(PathParseError::EmptyPath))); 276 | } 277 | 278 | #[test] 279 | fn test_error_no_dot_before_key() { 280 | let result = parse_path("name"); 281 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 282 | } 283 | 284 | #[test] 285 | fn test_error_missing_bracket() { 286 | let result = parse_path("[0"); 287 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 288 | } 289 | 290 | #[test] 291 | fn test_error_missing_closing_bracket() { 292 | let result = parse_path(".items[0"); 293 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 294 | } 295 | 296 | #[test] 297 | fn test_error_empty_brackets() { 298 | let result = parse_path("[]"); 299 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 300 | } 301 | 302 | #[test] 303 | fn test_hyphen_in_key() { 304 | let result = parse_path(".field-name").unwrap(); 305 | assert_eq!(result, vec![PathComponent::Key("field-name".to_string())]); 306 | } 307 | 308 | #[test] 309 | fn test_error_triple_dot() { 310 | // Triple dots are invalid (recursive descent is only double dots) 311 | let result = parse_path("...field"); 312 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 313 | } 314 | 315 | #[test] 316 | fn test_error_space_in_key() { 317 | let result = parse_path(".my field"); 318 | assert!(matches!(result, Err(PathParseError::Syntax(_)))); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /tests/parser_basic.rs: -------------------------------------------------------------------------------- 1 | use detect::parser::test_utils::RawTestExpr; 2 | use detect::parser::*; 3 | 4 | // ============================================================================== 5 | // Basic Syntax Tests - Predicates, Values, Quotes 6 | // ============================================================================== 7 | 8 | #[test] 9 | fn test_simple_predicate() { 10 | let result = RawParser::parse_raw_expr("name == foo").unwrap(); 11 | let expected = RawTestExpr::string_predicate("name", "==", "foo"); 12 | assert_eq!(result.to_test_expr(), expected); 13 | 
} 14 | 15 | #[test] 16 | fn test_quoted_values() { 17 | let result = RawParser::parse_raw_expr(r#"filename == "my file.txt""#).unwrap(); 18 | let expected = RawTestExpr::quoted_predicate("filename", "==", "my file.txt"); 19 | assert_eq!(result.to_test_expr(), expected); 20 | } 21 | 22 | #[test] 23 | fn test_single_quoted_values() { 24 | let result = RawParser::parse_raw_expr("filename == 'my file.txt'").unwrap(); 25 | let expected = RawTestExpr::quoted_predicate("filename", "==", "my file.txt"); 26 | assert_eq!(result.to_test_expr(), expected); 27 | } 28 | 29 | #[test] 30 | fn test_escape_sequences() { 31 | // Test double quote escapes 32 | let result = RawParser::parse_raw_expr(r#"name == "file\"with\"quotes""#).unwrap(); 33 | let expected = RawTestExpr::quoted_predicate("name", "==", r#"file\"with\"quotes"#); 34 | assert_eq!(result.to_test_expr(), expected); 35 | 36 | // Test various escape sequences 37 | let result = RawParser::parse_raw_expr(r#"content == "line1\nline2\ttab\\backslash""#).unwrap(); 38 | let expected = 39 | RawTestExpr::quoted_predicate("content", "==", r#"line1\nline2\ttab\\backslash"#); 40 | assert_eq!(result.to_test_expr(), expected); 41 | 42 | // Test single quote escapes 43 | let result = RawParser::parse_raw_expr(r"name == 'file\'with\'quotes'").unwrap(); 44 | let expected = RawTestExpr::quoted_predicate("name", "==", r"file\'with\'quotes"); 45 | assert_eq!(result.to_test_expr(), expected); 46 | } 47 | 48 | #[test] 49 | fn test_set_values() { 50 | let result = RawParser::parse_raw_expr("ext in [rs, js, ts]").unwrap(); 51 | // With new parser, sets are raw tokens - spaces preserved 52 | let expected = RawTestExpr::string_predicate("ext", "in", "[rs, js, ts]"); 53 | assert_eq!(result.to_test_expr(), expected); 54 | } 55 | 56 | #[test] 57 | fn test_mixed_set() { 58 | let result = RawParser::parse_raw_expr(r#"name in [README, "my file", config]"#).unwrap(); 59 | let expected = RawTestExpr::string_predicate("name", "in", r#"[README, "my file", config]"#); 60 | assert_eq!(result.to_test_expr(), expected); 61 | } 62 | 63 | #[test] 64 | fn test_set_with_quotes_and_escapes() { 65 | let result = RawParser::parse_raw_expr(r#"name in ["file\"1", 'file\'2', plain]"#).unwrap(); 66 | let expected = RawTestExpr::string_predicate("name", "in", r#"["file\"1", 'file\'2', plain]"#); 67 | assert_eq!(result.to_test_expr(), expected); 68 | } 69 | 70 | #[test] 71 | fn test_empty_set() { 72 | let result = RawParser::parse_raw_expr("ext in []").unwrap(); 73 | let expected = RawTestExpr::string_predicate("ext", "in", "[]"); 74 | assert_eq!(result.to_test_expr(), expected); 75 | } 76 | 77 | // ============================================================================== 78 | // Boolean Logic Tests - AND, OR, NOT, Precedence 79 | // ============================================================================== 80 | 81 | #[test] 82 | fn test_boolean_logic() { 83 | let result = RawParser::parse_raw_expr("name == foo AND size > 1000").unwrap(); 84 | let expected = RawTestExpr::and( 85 | RawTestExpr::string_predicate("name", "==", "foo"), 86 | RawTestExpr::string_predicate("size", ">", "1000"), 87 | ); 88 | assert_eq!(result.to_test_expr(), expected); 89 | } 90 | 91 | #[test] 92 | fn test_or_logic() { 93 | let result = RawParser::parse_raw_expr("name == foo OR name == bar").unwrap(); 94 | let expected = RawTestExpr::or( 95 | RawTestExpr::string_predicate("name", "==", "foo"), 96 | RawTestExpr::string_predicate("name", "==", "bar"), 97 | ); 98 | assert_eq!(result.to_test_expr(), expected); 99 | } 
100 | 101 | #[test] 102 | fn test_negation_variants() { 103 | // Test NOT keyword 104 | let result = RawParser::parse_raw_expr("NOT name == foo").unwrap(); 105 | let expected = RawTestExpr::not(RawTestExpr::string_predicate("name", "==", "foo")); 106 | assert_eq!(result.to_test_expr(), expected); 107 | 108 | // Test ! symbol 109 | let result = RawParser::parse_raw_expr("! name == foo").unwrap(); 110 | let expected = RawTestExpr::not(RawTestExpr::string_predicate("name", "==", "foo")); 111 | assert_eq!(result.to_test_expr(), expected); 112 | 113 | // Test escaped ! symbol 114 | let result = RawParser::parse_raw_expr("\\! name == foo").unwrap(); 115 | let expected = RawTestExpr::not(RawTestExpr::string_predicate("name", "==", "foo")); 116 | assert_eq!(result.to_test_expr(), expected); 117 | } 118 | 119 | #[test] 120 | fn test_operator_precedence() { 121 | // AND should bind tighter than OR 122 | let result = RawParser::parse_raw_expr("a == b OR c == d AND e == f").unwrap(); 123 | let expected = RawTestExpr::or( 124 | RawTestExpr::string_predicate("a", "==", "b"), 125 | RawTestExpr::and( 126 | RawTestExpr::string_predicate("c", "==", "d"), 127 | RawTestExpr::string_predicate("e", "==", "f"), 128 | ), 129 | ); 130 | assert_eq!(result.to_test_expr(), expected); 131 | } 132 | 133 | #[test] 134 | fn test_parentheses() { 135 | let result = RawParser::parse_raw_expr("(a == b OR c == d) AND e == f").unwrap(); 136 | let expected = RawTestExpr::and( 137 | RawTestExpr::or( 138 | RawTestExpr::string_predicate("a", "==", "b"), 139 | RawTestExpr::string_predicate("c", "==", "d"), 140 | ), 141 | RawTestExpr::string_predicate("e", "==", "f"), 142 | ); 143 | assert_eq!(result.to_test_expr(), expected); 144 | } 145 | 146 | #[test] 147 | fn test_complex_expression() { 148 | let result = 149 | RawParser::parse_raw_expr(r#"(name == "test.rs" OR ext in [js, ts]) AND NOT size > 1mb"#) 150 | .unwrap(); 151 | 152 | let expected = RawTestExpr::and( 153 | RawTestExpr::or( 154 | RawTestExpr::quoted_predicate("name", "==", "test.rs"), 155 | RawTestExpr::string_predicate("ext", "in", "[js, ts]"), 156 | ), 157 | RawTestExpr::not(RawTestExpr::string_predicate("size", ">", "1mb")), 158 | ); 159 | assert_eq!(result.to_test_expr(), expected); 160 | } 161 | 162 | #[test] 163 | fn test_all_operators() { 164 | let test_cases = vec![ 165 | ("name == foo", "=="), 166 | ("name != foo", "!="), 167 | ("name ~= foo", "~="), 168 | ("name > foo", ">"), 169 | ("name < foo", "<"), 170 | ("name >= foo", ">="), 171 | ("name <= foo", "<="), 172 | ("name contains foo", "contains"), 173 | ("name in [foo]", "in"), 174 | ]; 175 | 176 | for (input, expected_op) in test_cases { 177 | let result = RawParser::parse_raw_expr(input).unwrap(); 178 | match result.to_test_expr() { 179 | RawTestExpr::Predicate(pred) => { 180 | assert_eq!(pred.operator, expected_op, "Failed for input: {}", input); 181 | } 182 | _ => panic!("Expected predicate for input: {}", input), 183 | } 184 | } 185 | } 186 | 187 | #[test] 188 | fn test_complex_selectors() { 189 | let result = RawParser::parse_raw_expr("name == test.rs").unwrap(); 190 | let expected = RawTestExpr::string_predicate("name", "==", "test.rs"); 191 | assert_eq!(result.to_test_expr(), expected); 192 | 193 | let result = RawParser::parse_raw_expr("meta.size > 1000").unwrap(); 194 | let expected = RawTestExpr::string_predicate("meta.size", ">", "1000"); 195 | assert_eq!(result.to_test_expr(), expected); 196 | } 197 | 198 | #[test] 199 | fn test_case_insensitive_keywords() { 200 | let result = 
RawParser::parse_raw_expr("name == foo and name == bar").unwrap(); 201 | let result_upper = RawParser::parse_raw_expr("name == foo AND name == bar").unwrap(); 202 | assert_eq!(result.to_test_expr(), result_upper.to_test_expr()); 203 | 204 | let result = RawParser::parse_raw_expr("NOT name == foo").unwrap(); 205 | let result_lower = RawParser::parse_raw_expr("not name == foo").unwrap(); 206 | assert_eq!(result.to_test_expr(), result_lower.to_test_expr()); 207 | } 208 | 209 | #[test] 210 | fn test_syntax_errors() { 211 | // Missing value 212 | let result = RawParser::parse_raw_expr("name =="); 213 | assert!(result.is_err()); 214 | 215 | // Missing operator 216 | let result = RawParser::parse_raw_expr("name foo"); 217 | assert!(result.is_err()); 218 | 219 | // Unclosed parentheses 220 | let result = RawParser::parse_raw_expr("(name == foo"); 221 | assert!(result.is_err()); 222 | 223 | // Unclosed bracket - now parses as bare token (grammar is permissive) 224 | // This is valid: searches for files named "[foo" 225 | let result = RawParser::parse_raw_expr("name in [foo"); 226 | assert!( 227 | result.is_ok(), 228 | "Permissive grammar allows [foo as bare token" 229 | ); 230 | 231 | // Unclosed quote 232 | let result = RawParser::parse_raw_expr(r#"name == "unclosed"#); 233 | assert!(result.is_err()); 234 | } 235 | 236 | #[test] 237 | fn test_invalid_escape_sequences() { 238 | // Since we're a syntax-only parser, we preserve escape sequences without validating them 239 | // This previously "invalid" escape sequence is now just preserved as-is 240 | let result = RawParser::parse_raw_expr(r#"name == "invalid\x""#); 241 | let expected = RawTestExpr::quoted_predicate("name", "==", r"invalid\x"); 242 | assert_eq!(result.unwrap().to_test_expr(), expected); 243 | 244 | // Unterminated string (this is actually a syntax error, not escape error) 245 | let result = RawParser::parse_raw_expr("name == \"unterminated"); 246 | assert!(result.is_err()); 247 | } 248 | 249 | // ============================================================================== 250 | #[test] 251 | fn test_whitespace_handling() { 252 | // Basic predicate whitespace tolerance 253 | let result1 = RawParser::parse_raw_expr("name==foo").unwrap(); 254 | let result2 = RawParser::parse_raw_expr("name == foo").unwrap(); 255 | let result3 = RawParser::parse_raw_expr(" name == foo ").unwrap(); 256 | 257 | assert_eq!(result1.to_test_expr(), result2.to_test_expr()); 258 | assert_eq!(result2.to_test_expr(), result3.to_test_expr()); 259 | 260 | // Whitespace in sets (preserved as raw token) 261 | let result = RawParser::parse_raw_expr("ext in [ rs , js , ts ]").unwrap(); 262 | let expected = RawTestExpr::string_predicate("ext", "in", "[ rs , js , ts ]"); 263 | assert_eq!(result.to_test_expr(), expected); 264 | } 265 | 266 | #[test] 267 | fn test_edge_cases() { 268 | // Empty string value 269 | let result = RawParser::parse_raw_expr(r#"name == """#).unwrap(); 270 | let expected = RawTestExpr::quoted_predicate("name", "==", ""); 271 | assert_eq!(result.to_test_expr(), expected); 272 | 273 | // Value with special characters 274 | let result = RawParser::parse_raw_expr("name == foo-bar_baz.txt").unwrap(); 275 | let expected = RawTestExpr::string_predicate("name", "==", "foo-bar_baz.txt"); 276 | assert_eq!(result.to_test_expr(), expected); 277 | 278 | // Selector with dots and underscores 279 | let result = RawParser::parse_raw_expr("path.name_with_underscores == foo").unwrap(); 280 | let expected = 
RawTestExpr::string_predicate("path.name_with_underscores", "==", "foo"); 281 | assert_eq!(result.to_test_expr(), expected); 282 | } 283 | 284 | // Bug: Reserved word substrings in bare values 285 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /src/parser/error.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_assignments)] // Fields are used by miette's derive macros 2 | 3 | use miette::{Diagnostic, SourceSpan}; 4 | use thiserror::Error; 5 | 6 | use super::raw::Rule; 7 | 8 | /// Main error type for detect expressions, using miette for diagnostics 9 | #[derive(Debug, Clone, Diagnostic, Error)] 10 | pub enum DetectError { 11 | // Syntax errors from pest 12 | #[error("Syntax error at line {line}, column {col}")] 13 | #[diagnostic(code(detect::syntax))] 14 | Syntax { 15 | #[source_code] 16 | src: String, 17 | #[label("{expected_msg}")] 18 | span: SourceSpan, 19 | #[help] 20 | help: Option<String>, 21 | expected_msg: String, 22 | line: usize, 23 | col: usize, 24 | }, 25 | 26 | // Typechecker errors with spans 27 | #[error("Unknown selector: {selector}")] 28 | #[diagnostic(code(detect::unknown_selector), help("Valid selectors: name, basename, ext, path, dir, size, type, depth, modified, created, accessed, content"))] 29 | UnknownSelector { 30 | selector: String, 31 | #[label("unknown selector")] 32 | span: SourceSpan, 33 | #[source_code] 34 | src: String, 35 | }, 36 | 37 | #[error("Invalid {format} selector path: {path}")] 38 | #[diagnostic( 39 | code(detect::invalid_structured_path), 40 | help("Structured selectors use format: {format}:.path.to.field") 41 | )] 42 | InvalidStructuredPath { 43 | format: String, 44 | path: String, 45 | #[label("invalid path: {reason}")] 46 | span: SourceSpan, 47 | reason: String, 48 | #[source_code] 49 | src: String, 50 | }, 51 | 52 | #[error("Unknown structured data format: '{format}'")] 53 | #[diagnostic(code(detect::unknown_structured_format))] 54 | UnknownStructuredFormat { 55 | format: String, 56 | #[label("unknown format")] 57 | span: SourceSpan, 58 | #[source_code] 59 | src: String, 60 | #[help] 61 | suggestions: Option<String>, 62 | }, 63 | 64 | #[error("Unknown operator: {operator}")] 65 | #[diagnostic( 66 | code(detect::unknown_operator), 67 | help("Valid operators include: ==, !=, >, <, contains, matches, etc.") 68 | )] 69 | UnknownOperator { 70 | operator: String, 71 | #[label("unknown operator")] 72 | span: SourceSpan, 73 | #[source_code] 74 | src: String, 75 | }, 76 | 77 | #[error("Unknown alias: '{word}'")] 78 | #[diagnostic(code(detect::unknown_alias))] 79 | UnknownAlias { 80 | word: String, 81 | #[label("unknown alias")] 82 | span: SourceSpan, 83 | #[source_code] 84 | src: String, 85 | #[help] 86 | suggestions: Option<String>, 87 | }, 88 | 89 | #[error("Operator '{operator}' is not compatible with selector '{selector}'")] 90 | #[diagnostic( 91 | code(detect::incompatible_operator), 92 | help("This selector requires a different type of operator") 93 | )] 94 | IncompatibleOperator { 95 | selector: String, 96 | operator: String, 97 | #[label("incompatible operator")] 98 | operator_span: SourceSpan, 99 | #[label("for this selector")] 100 | selector_span: SourceSpan, 101 | #[source_code] 102 | src: String, 103 | }, 104 | 105 | #[error("Expected {expected} value, found: {found}")] 106 | #[diagnostic( 107 | code(detect::invalid_value), 108 | help("Check the value type for this selector") 109 | )] 110 | InvalidValue { 111 | expected: String, 112 | found: String, 113 | #[label("invalid value")] 114 | span: SourceSpan, 115 | #[source_code] 116 | src: String, 117 | }, 118 | 119 | // Escape errors 120 | #[error("Invalid escape sequence '\\{char}'")] 121 | #[diagnostic( 122 | 
code(detect::invalid_escape), 123 | help("Valid escape sequences: \\n, \\t, \\\\, \\\", \\'") 124 | )] 125 | InvalidEscape { 126 | char: char, 127 | #[label("invalid escape")] 128 | span: SourceSpan, 129 | #[source_code] 130 | src: String, 131 | }, 132 | 133 | #[error("Unterminated escape sequence")] 134 | #[diagnostic(code(detect::unterminated_escape))] 135 | UnterminatedEscape { 136 | #[label("escape sequence not completed")] 137 | span: SourceSpan, 138 | #[source_code] 139 | src: String, 140 | }, 141 | 142 | // Quote errors 143 | #[error("Unterminated string literal")] 144 | #[diagnostic(code(detect::unterminated_string))] 145 | UnterminatedString { 146 | #[label("missing closing {quote} quote")] 147 | span: SourceSpan, 148 | quote: char, 149 | #[source_code] 150 | src: String, 151 | }, 152 | 153 | #[error("Stray {quote} quote")] 154 | #[diagnostic( 155 | code(detect::stray_quote), 156 | help("Remove the quote or add matching opening quote") 157 | )] 158 | StrayQuote { 159 | #[label("unexpected quote")] 160 | span: SourceSpan, 161 | quote: char, 162 | #[source_code] 163 | src: String, 164 | }, 165 | 166 | // Filesystem errors 167 | #[error("Directory not found: {path}")] 168 | #[diagnostic( 169 | code(detect::directory_not_found), 170 | help("Check that the directory path exists and is accessible") 171 | )] 172 | DirectoryNotFound { path: String }, 173 | 174 | #[error("Path is not a directory: {path}")] 175 | #[diagnostic( 176 | code(detect::not_a_directory), 177 | help("The path must be a directory, not a file") 178 | )] 179 | NotADirectory { path: String }, 180 | 181 | // I/O errors 182 | #[error("I/O error: {message}")] 183 | #[diagnostic(code(detect::io_error))] 184 | IoError { message: String }, 185 | 186 | // Internal errors 187 | #[error("Internal parser error: {message}")] 188 | #[diagnostic(code(detect::internal))] 189 | Internal { 190 | message: String, 191 | #[source_code] 192 | src: String, 193 | }, 194 | } 195 | 196 | // Extension trait for span location extraction 197 | pub trait SpanExt { 198 | fn to_location(&self) -> (usize, usize); 199 | fn to_source_span(&self) -> SourceSpan; 200 | } 201 | 202 | impl SpanExt for pest::Span<'_> { 203 | #[inline] 204 | fn to_location(&self) -> (usize, usize) { 205 | self.start_pos().line_col() 206 | } 207 | 208 | #[inline] 209 | fn to_source_span(&self) -> SourceSpan { 210 | (self.start(), self.end() - self.start()).into() 211 | } 212 | } 213 | 214 | /// Convert pest Rule enum to user-friendly names 215 | fn rule_to_friendly_name(rule: &Rule) -> &'static str { 216 | match rule { 217 | Rule::program => "program", 218 | Rule::expr => "expression", 219 | Rule::infix => "operator (AND/OR)", 220 | Rule::and => "AND", 221 | Rule::or => "OR", 222 | Rule::prefix => "prefix operator (NOT)", 223 | Rule::neg => "NOT", 224 | Rule::primary => "predicate or expression", 225 | Rule::predicate => "predicate", 226 | Rule::selector => "selector", 227 | Rule::operator => "operator", 228 | Rule::value => "value", 229 | Rule::value_content => "value", 230 | Rule::raw_token => "value", 231 | Rule::quoted_string => "quoted string", 232 | Rule::unterminated_string => "unterminated string", 233 | Rule::trailing_quote => "trailing quote", 234 | Rule::single_word => "single-word alias", 235 | Rule::set_contents => "set contents", 236 | Rule::set_items => "set items", 237 | Rule::set_item => "set item", 238 | Rule::bare_set_item => "item", 239 | Rule::inner_double => "string content", 240 | Rule::inner_single => "string content", 241 | Rule::escaped => "escape 
sequence", 242 | Rule::raw_char => "character", 243 | Rule::balanced_paren => "balanced parentheses", 244 | Rule::balanced_bracket => "balanced brackets", 245 | Rule::balanced_curly => "balanced braces", 246 | Rule::WHITESPACE => "whitespace", 247 | Rule::EOI => "end of input", 248 | } 249 | } 250 | 251 | /// Generate contextual help text based on error patterns 252 | fn generate_help_text(positives: &[Rule], found_eoi: bool) -> Option { 253 | if positives.is_empty() { 254 | return None; 255 | } 256 | 257 | // Check for common patterns 258 | if positives.contains(&Rule::value) { 259 | if found_eoi { 260 | return Some("Try adding a value after the operator, like: ext == rs".to_string()); 261 | } 262 | return Some("Expected a value here (e.g., a string, number, or [set])".to_string()); 263 | } 264 | 265 | if (positives.contains(&Rule::expr) || positives.contains(&Rule::predicate)) && found_eoi { 266 | return Some("Expression is incomplete. Add a predicate after the operator.".to_string()); 267 | } 268 | 269 | if positives.contains(&Rule::EOI) { 270 | return Some("Unexpected input. Check for unbalanced parentheses or quotes.".to_string()); 271 | } 272 | 273 | None 274 | } 275 | 276 | impl DetectError { 277 | /// Create a syntax error from pest error with diagnostic information 278 | pub fn from_pest(pest_err: Box>, src: String) -> Self { 279 | use pest::error::{ErrorVariant, InputLocation}; 280 | 281 | // Extract position information with non-zero width for miette arrow rendering 282 | let (span, _pos) = match pest_err.location { 283 | InputLocation::Pos(pos) => { 284 | // For point locations, ensure non-zero width for miette arrow 285 | // If at/past EOI, point backwards at last char; otherwise point at current position 286 | if pos >= src.len() && pos > 0 { 287 | ((pos - 1, 1).into(), pos) 288 | } else if pos < src.len() { 289 | ((pos, 1).into(), pos) 290 | } else { 291 | // Empty input 292 | ((0, 0).into(), pos) 293 | } 294 | } 295 | InputLocation::Span((start, end)) => { 296 | let width = end.saturating_sub(start).max(1); // Ensure at least width 1 297 | ((start, width).into(), start) 298 | } 299 | }; 300 | 301 | // Get line and column 302 | let (line, col) = match pest_err.line_col { 303 | pest::error::LineColLocation::Pos((line, col)) => (line, col), 304 | pest::error::LineColLocation::Span((line, col), _) => (line, col), 305 | }; 306 | 307 | // Extract expected tokens and generate user-friendly message 308 | let (expected_msg, help) = match &pest_err.variant { 309 | ErrorVariant::ParsingError { 310 | positives, 311 | negatives: _, 312 | } => { 313 | let found_eoi = match pest_err.location { 314 | InputLocation::Pos(p) => p >= src.len(), 315 | InputLocation::Span((_, end)) => end >= src.len(), 316 | }; 317 | 318 | let expected_msg = if positives.is_empty() { 319 | "Unexpected input".to_string() 320 | } else if positives.len() == 1 { 321 | format!("Expected {}", rule_to_friendly_name(&positives[0])) 322 | } else { 323 | let names: Vec<&str> = positives.iter().map(rule_to_friendly_name).collect(); 324 | if names.len() <= 3 { 325 | format!("Expected one of: {}", names.join(", ")) 326 | } else { 327 | format!("Expected one of: {}, ...", names[..3].join(", ")) 328 | } 329 | }; 330 | 331 | let help = generate_help_text(positives, found_eoi); 332 | (expected_msg, help) 333 | } 334 | ErrorVariant::CustomError { message } => (message.clone(), None), 335 | }; 336 | 337 | DetectError::Syntax { 338 | src, 339 | span, 340 | help, 341 | expected_msg, 342 | line, 343 | col, 344 | } 345 | } 346 | 347 | 
/// Create an internal error 348 | pub fn internal(msg: impl Into<String>) -> Self { 349 | DetectError::Internal { 350 | message: msg.into(), 351 | src: String::new(), 352 | } 353 | } 354 | 355 | /// Add source code to the error 356 | pub fn with_source(mut self, src: String) -> Self { 357 | match &mut self { 358 | DetectError::Syntax { src: s, .. } 359 | | DetectError::UnknownSelector { src: s, .. } 360 | | DetectError::InvalidStructuredPath { src: s, .. } 361 | | DetectError::UnknownStructuredFormat { src: s, .. } 362 | | DetectError::UnknownOperator { src: s, .. } 363 | | DetectError::UnknownAlias { src: s, .. } 364 | | DetectError::IncompatibleOperator { src: s, .. } 365 | | DetectError::InvalidValue { src: s, .. } 366 | | DetectError::InvalidEscape { src: s, .. } 367 | | DetectError::UnterminatedEscape { src: s, .. } 368 | | DetectError::UnterminatedString { src: s, .. } 369 | | DetectError::StrayQuote { src: s, .. } 370 | | DetectError::Internal { src: s, .. } => { 371 | *s = src; 372 | } 373 | // Filesystem and I/O errors don't have source code 374 | DetectError::DirectoryNotFound { .. } 375 | | DetectError::NotADirectory { .. } 376 | | DetectError::IoError { .. } => {} 377 | } 378 | self 379 | } 380 | } 381 | -------------------------------------------------------------------------------- /tests/temporal_tests.rs: -------------------------------------------------------------------------------- 1 | use slog::{o, Discard, Logger}; 2 | use std::{fs, time::SystemTime}; 3 | use tempfile::TempDir; 4 | 5 | // Shared helper to run temporal test cases 6 | async fn run_temporal_test( 7 | tmp_dir: &TempDir, 8 | expr: &str, 9 | expected_files: Vec<&str>, 10 | not_expected: Vec<&str>, 11 | ) { 12 | let mut found = Vec::new(); 13 | detect::parse_and_run_fs( 14 | Logger::root(Discard, o!()), 15 | tmp_dir.path(), 16 | false, 17 | expr.to_owned(), 18 | detect::RuntimeConfig::default(), 19 | |p| found.push(p.file_name().unwrap().to_string_lossy().to_string()), 20 | ) 21 | .await 22 | .unwrap(); 23 | 24 | for file in expected_files { 25 | assert!( 26 | found.contains(&file.to_string()), 27 | "Expression '{}' should find '{}', but found: {:?}", 28 | expr, 29 | file, 30 | found 31 | ); 32 | } 33 | 34 | for file in not_expected { 35 | assert!( 36 | !found.contains(&file.to_string()), 37 | "Expression '{}' should not find '{}', but found: {:?}", 38 | expr, 39 | file, 40 | found 41 | ); 42 | } 43 | } 44 | 45 | #[tokio::test] 46 | async fn test_relative_time_operations() { 47 | let tmp_dir = tempfile::Builder::new() 48 | .prefix("detect-temporal-relative") 49 | .tempdir() 50 | .unwrap(); 51 | 52 | // Create files with different ages 53 | let files = vec![ 54 | ("1sec.txt", 1), 55 | ("10secs.txt", 10), 56 | ("5mins.txt", 5 * 60), 57 | ("2hours.txt", 2 * 60 * 60), 58 | ("3days.txt", 3 * 24 * 60 * 60), 59 | ("1week.txt", 7 * 24 * 60 * 60 - 1), // Just under 7 days to pass > -7.days test 60 | ("30days.txt", 30 * 24 * 60 * 60), 61 | ]; 62 | 63 | for (name, age_secs) in &files { 64 | let path = tmp_dir.path().join(name); 65 | std::fs::write(&path, "content").unwrap(); 66 | let mtime = SystemTime::now() - std::time::Duration::from_secs(*age_secs); 67 | fs::File::open(&path).unwrap().set_modified(mtime).unwrap(); 68 | } 69 | 70 | // Test various relative time expressions 71 | let test_cases = vec![ 72 | // Seconds 73 | ( 74 | "modified > \"-2.seconds\"", 75 | vec!["1sec.txt"], 76 | vec!["10secs.txt"], 77 | ), 78 | ( 79 | "modified > \"-30.seconds\"", 80 | vec!["1sec.txt", "10secs.txt"], 81 | vec!["5mins.txt"], 82 | ), 83 
| // Minutes 84 | ( 85 | "modified > \"-10.minutes\"", 86 | vec!["1sec.txt", "10secs.txt", "5mins.txt"], 87 | vec!["2hours.txt"], 88 | ), 89 | ( 90 | "modified > \"-1.minute\"", 91 | vec!["1sec.txt", "10secs.txt"], 92 | vec!["5mins.txt"], 93 | ), 94 | // Hours 95 | ( 96 | "modified > \"-3.hours\"", 97 | vec!["1sec.txt", "10secs.txt", "5mins.txt", "2hours.txt"], 98 | vec!["3days.txt"], 99 | ), 100 | ( 101 | "modified > \"-1.hour\"", 102 | vec!["1sec.txt", "10secs.txt", "5mins.txt"], 103 | vec!["2hours.txt"], 104 | ), 105 | // Days 106 | ( 107 | "modified > \"-5.days\"", 108 | vec![ 109 | "1sec.txt", 110 | "10secs.txt", 111 | "5mins.txt", 112 | "2hours.txt", 113 | "3days.txt", 114 | ], 115 | vec!["1week.txt"], 116 | ), 117 | ( 118 | "modified > \"-7.days\"", 119 | vec![ 120 | "1sec.txt", 121 | "10secs.txt", 122 | "5mins.txt", 123 | "2hours.txt", 124 | "3days.txt", 125 | "1week.txt", 126 | ], 127 | vec!["30days.txt"], 128 | ), 129 | // Weeks 130 | ( 131 | "modified > \"-2.weeks\"", 132 | vec![ 133 | "1sec.txt", 134 | "10secs.txt", 135 | "5mins.txt", 136 | "2hours.txt", 137 | "3days.txt", 138 | "1week.txt", 139 | ], 140 | vec!["30days.txt"], 141 | ), 142 | // Test with different units abbreviations 143 | ( 144 | "modified > -30s", 145 | vec!["1sec.txt", "10secs.txt"], 146 | vec!["5mins.txt"], 147 | ), 148 | ( 149 | "modified > -10m", 150 | vec!["1sec.txt", "10secs.txt", "5mins.txt"], 151 | vec!["2hours.txt"], 152 | ), 153 | ( 154 | "modified > -3h", 155 | vec!["1sec.txt", "10secs.txt", "5mins.txt", "2hours.txt"], 156 | vec!["3days.txt"], 157 | ), 158 | ( 159 | "modified > -5d", 160 | vec![ 161 | "1sec.txt", 162 | "10secs.txt", 163 | "5mins.txt", 164 | "2hours.txt", 165 | "3days.txt", 166 | ], 167 | vec!["1week.txt"], 168 | ), 169 | ( 170 | "modified > -2w", 171 | vec![ 172 | "1sec.txt", 173 | "10secs.txt", 174 | "5mins.txt", 175 | "2hours.txt", 176 | "3days.txt", 177 | "1week.txt", 178 | ], 179 | vec!["30days.txt"], 180 | ), 181 | ]; 182 | 183 | for (expr, expected, not_expected) in test_cases { 184 | run_temporal_test(&tmp_dir, expr, expected, not_expected).await; 185 | } 186 | } 187 | 188 | #[tokio::test] 189 | async fn test_absolute_dates() { 190 | let tmp_dir = tempfile::Builder::new() 191 | .prefix("detect-temporal-absolute") 192 | .tempdir() 193 | .unwrap(); 194 | 195 | // Create files with specific dates 196 | let today_file = tmp_dir.path().join("today.txt"); 197 | let yesterday_file = tmp_dir.path().join("yesterday.txt"); 198 | let week_old_file = tmp_dir.path().join("week_old.txt"); 199 | let year_2020_file = tmp_dir.path().join("year_2020.txt"); 200 | let year_2023_file = tmp_dir.path().join("year_2023.txt"); 201 | 202 | // Create files 203 | std::fs::write(&today_file, "today").unwrap(); 204 | std::fs::write(&yesterday_file, "yesterday").unwrap(); 205 | std::fs::write(&week_old_file, "week").unwrap(); 206 | std::fs::write(&year_2020_file, "2020").unwrap(); 207 | std::fs::write(&year_2023_file, "2023").unwrap(); 208 | 209 | // Set modification times 210 | let now = SystemTime::now(); 211 | let yesterday = now - std::time::Duration::from_secs(24 * 60 * 60); 212 | let week_ago = now - std::time::Duration::from_secs(7 * 24 * 60 * 60); 213 | let year_2020 = SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(1577836800); // 2020-01-01 214 | let year_2023 = SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(1672531200); // 2023-01-01 215 | 216 | fs::File::open(&today_file) 217 | .unwrap() 218 | .set_modified(now) 219 | .unwrap(); 220 | fs::File::open(&yesterday_file) 221 | 
.unwrap() 222 | .set_modified(yesterday) 223 | .unwrap(); 224 | fs::File::open(&week_old_file) 225 | .unwrap() 226 | .set_modified(week_ago) 227 | .unwrap(); 228 | fs::File::open(&year_2020_file) 229 | .unwrap() 230 | .set_modified(year_2020) 231 | .unwrap(); 232 | fs::File::open(&year_2023_file) 233 | .unwrap() 234 | .set_modified(year_2023) 235 | .unwrap(); 236 | 237 | // Test absolute date queries (quoted and unquoted) 238 | run_temporal_test( 239 | &tmp_dir, 240 | "modified > \"2021-01-01\"", 241 | vec![ 242 | "today.txt", 243 | "yesterday.txt", 244 | "week_old.txt", 245 | "year_2023.txt", 246 | ], 247 | vec!["year_2020.txt"], 248 | ) 249 | .await; 250 | run_temporal_test( 251 | &tmp_dir, 252 | "modified > 2021-01-01", 253 | vec![ 254 | "today.txt", 255 | "yesterday.txt", 256 | "week_old.txt", 257 | "year_2023.txt", 258 | ], 259 | vec!["year_2020.txt"], 260 | ) 261 | .await; 262 | run_temporal_test( 263 | &tmp_dir, 264 | "modified < 2022-01-01", 265 | vec!["year_2020.txt"], 266 | vec!["year_2023.txt", "today.txt"], 267 | ) 268 | .await; 269 | } 270 | 271 | #[tokio::test] 272 | async fn test_time_selectors() { 273 | let tmp_dir = tempfile::Builder::new() 274 | .prefix("detect-temporal-selectors") 275 | .tempdir() 276 | .unwrap(); 277 | 278 | // Create test files 279 | let test_file = tmp_dir.path().join("test.txt"); 280 | let old_file = tmp_dir.path().join("old.txt"); 281 | 282 | std::fs::write(&test_file, "content").unwrap(); 283 | std::fs::write(&old_file, "old").unwrap(); 284 | 285 | // Set old file to be old 286 | let week_ago = SystemTime::now() - std::time::Duration::from_secs(7 * 24 * 60 * 60); 287 | fs::File::open(&old_file) 288 | .unwrap() 289 | .set_modified(week_ago) 290 | .unwrap(); 291 | 292 | // Test modified selector (already tested above, but verify syntax variants) 293 | run_temporal_test( 294 | &tmp_dir, 295 | "modified > -1hour", 296 | vec!["test.txt"], 297 | vec!["old.txt"], 298 | ) 299 | .await; 300 | run_temporal_test( 301 | &tmp_dir, 302 | "modified > -1hour", 303 | vec!["test.txt"], 304 | vec!["old.txt"], 305 | ) 306 | .await; 307 | 308 | // Test created selector (created - creation time is OS-specific, just verify it runs) 309 | let mut created_files = Vec::new(); 310 | detect::parse_and_run_fs( 311 | Logger::root(Discard, o!()), 312 | tmp_dir.path(), 313 | false, 314 | "created > -1hour".to_owned(), 315 | detect::RuntimeConfig::default(), 316 | |p| created_files.push(p.file_name().unwrap().to_string_lossy().to_string()), 317 | ) 318 | .await 319 | .unwrap(); 320 | // Just verify it doesn't crash - actual results are OS-dependent 321 | 322 | // Test accessed selector 323 | // Read the test file to update access time 324 | let _ = std::fs::read_to_string(&test_file).unwrap(); 325 | 326 | run_temporal_test(&tmp_dir, "accessed > -1minute", vec!["test.txt"], vec![]).await; 327 | run_temporal_test(&tmp_dir, "accessed > -1minute", vec!["test.txt"], vec![]).await; 328 | 329 | // Test created time variants 330 | let mut created_files = Vec::new(); 331 | detect::parse_and_run_fs( 332 | Logger::root(Discard, o!()), 333 | tmp_dir.path(), 334 | false, 335 | "created > -1hour".to_owned(), 336 | detect::RuntimeConfig::default(), 337 | |p| created_files.push(p.file_name().unwrap().to_string_lossy().to_string()), 338 | ) 339 | .await 340 | .unwrap(); 341 | // Just verify syntax works 342 | } 343 | 344 | #[tokio::test] 345 | async fn test_temporal_combined_queries() { 346 | let tmp_dir = tempfile::Builder::new() 347 | .prefix("detect-temporal-combined") 348 | .tempdir() 349 | 
.unwrap(); 350 | 351 | // Create various files 352 | let files = vec![ 353 | ("old.rs", "rust code", 10 * 24 * 60 * 60), // 10 days old 354 | ("new.rs", "rust code", 60), // 1 minute old 355 | ("old.txt", "text content", 10 * 24 * 60 * 60), // 10 days old 356 | ("new.txt", "text content", 60), // 1 minute old 357 | ("old_todo.rs", "// TODO: fix", 10 * 24 * 60 * 60), // 10 days old 358 | ("new_todo.rs", "// TODO: implement", 60), // 1 minute old 359 | ]; 360 | 361 | for (name, content, age_secs) in &files { 362 | let path = tmp_dir.path().join(name); 363 | std::fs::write(&path, content).unwrap(); 364 | let mtime = SystemTime::now() - std::time::Duration::from_secs(*age_secs); 365 | fs::File::open(&path).unwrap().set_modified(mtime).unwrap(); 366 | } 367 | 368 | // Test temporal + extension 369 | run_temporal_test( 370 | &tmp_dir, 371 | "ext == rs && modified > -1day", 372 | vec!["new.rs", "new_todo.rs"], 373 | vec!["old.rs", "new.txt", "old.txt", "old_todo.rs"], 374 | ) 375 | .await; 376 | 377 | // Test temporal + content 378 | run_temporal_test( 379 | &tmp_dir, 380 | r#"content contains "TODO" && modified > -1day"#, 381 | vec!["new_todo.rs"], 382 | vec!["old_todo.rs", "new.rs", "new.txt"], 383 | ) 384 | .await; 385 | 386 | // Test multiple temporal selectors 387 | run_temporal_test( 388 | &tmp_dir, 389 | "modified > -1hour && accessed > -1hour", 390 | vec!["new.rs", "new.txt", "new_todo.rs"], 391 | vec!["old.rs", "old.txt", "old_todo.rs"], 392 | ) 393 | .await; 394 | 395 | // Test temporal with size (all our test files are small) 396 | run_temporal_test( 397 | &tmp_dir, 398 | "size < 100 && modified > -1day", 399 | vec!["new.rs", "new.txt", "new_todo.rs"], 400 | vec!["old.rs", "old.txt", "old_todo.rs"], 401 | ) 402 | .await; 403 | 404 | // Test temporal with negation 405 | run_temporal_test( 406 | &tmp_dir, 407 | r#"!(basename contains "old") && modified > -1day"#, 408 | vec!["new.rs", "new.txt", "new_todo.rs"], 409 | vec!["old.rs", "old.txt", "old_todo.rs"], 410 | ) 411 | .await; 412 | } 413 | 414 | #[tokio::test] 415 | async fn test_greater_less_or_equal_operators() { 416 | let tmp_dir = tempfile::Builder::new() 417 | .prefix("detect-temporal-gte-lte") 418 | .tempdir() 419 | .unwrap(); 420 | 421 | // Create files with precise timestamps 422 | // Using slightly offset times to avoid boundary issues 423 | let now = SystemTime::now(); 424 | let thirty_mins_ago = now - std::time::Duration::from_secs(30 * 60); 425 | let ninety_mins_ago = now - std::time::Duration::from_secs(90 * 60); 426 | let two_and_half_hours_ago = now - std::time::Duration::from_secs(150 * 60); 427 | let four_hours_ago = now - std::time::Duration::from_secs(4 * 60 * 60); 428 | 429 | let recent = tmp_dir.path().join("recent.txt"); 430 | let thirty_mins = tmp_dir.path().join("thirty_mins.txt"); 431 | let ninety_mins = tmp_dir.path().join("ninety_mins.txt"); 432 | let two_half_hours = tmp_dir.path().join("two_half_hours.txt"); 433 | let four_hours = tmp_dir.path().join("four_hours.txt"); 434 | 435 | std::fs::write(&recent, "recent").unwrap(); 436 | std::fs::write(&thirty_mins, "thirty_mins").unwrap(); 437 | std::fs::write(&ninety_mins, "ninety_mins").unwrap(); 438 | std::fs::write(&two_half_hours, "two_half_hours").unwrap(); 439 | std::fs::write(&four_hours, "four_hours").unwrap(); 440 | 441 | fs::File::open(&recent).unwrap().set_modified(now).unwrap(); 442 | fs::File::open(&thirty_mins) 443 | .unwrap() 444 | .set_modified(thirty_mins_ago) 445 | .unwrap(); 446 | fs::File::open(&ninety_mins) 447 | .unwrap() 448 | 
.set_modified(ninety_mins_ago) 449 | .unwrap(); 450 | fs::File::open(&two_half_hours) 451 | .unwrap() 452 | .set_modified(two_and_half_hours_ago) 453 | .unwrap(); 454 | fs::File::open(&four_hours) 455 | .unwrap() 456 | .set_modified(four_hours_ago) 457 | .unwrap(); 458 | 459 | // Test >= operator (after or equal) - should include files at or after the threshold 460 | run_temporal_test( 461 | &tmp_dir, 462 | "modified >= -1hour", 463 | vec!["recent.txt", "thirty_mins.txt"], 464 | vec!["ninety_mins.txt", "two_half_hours.txt", "four_hours.txt"], 465 | ) 466 | .await; 467 | 468 | run_temporal_test( 469 | &tmp_dir, 470 | "modified >= -2hours", 471 | vec!["recent.txt", "thirty_mins.txt", "ninety_mins.txt"], 472 | vec!["two_half_hours.txt", "four_hours.txt"], 473 | ) 474 | .await; 475 | 476 | // Test <= operator (before or equal) - should include files at or before the threshold 477 | run_temporal_test( 478 | &tmp_dir, 479 | "modified <= -1hour", 480 | vec!["ninety_mins.txt", "two_half_hours.txt", "four_hours.txt"], 481 | vec!["recent.txt", "thirty_mins.txt"], 482 | ) 483 | .await; 484 | 485 | run_temporal_test( 486 | &tmp_dir, 487 | "modified <= -3hours", 488 | vec!["four_hours.txt"], 489 | vec![ 490 | "recent.txt", 491 | "thirty_mins.txt", 492 | "ninety_mins.txt", 493 | "two_half_hours.txt", 494 | ], 495 | ) 496 | .await; 497 | 498 | // Test combining >= and <= to create a time range 499 | run_temporal_test( 500 | &tmp_dir, 501 | "modified >= -3hours AND modified <= -1hour", 502 | vec!["ninety_mins.txt", "two_half_hours.txt"], 503 | vec!["recent.txt", "thirty_mins.txt", "four_hours.txt"], 504 | ) 505 | .await; 506 | } 507 | -------------------------------------------------------------------------------- /tests/aliases.rs: -------------------------------------------------------------------------------- 1 | //! Tests for single-word predicate aliases 2 | //! 3 | //! Verifies that file type aliases like `dir`, `file`, `symlink` work correctly 4 | //! in parsing and typechecking. 
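// --- editorial sketch, not part of the original test file ---
// A few expression strings of the shape exercised by the tests below: bare file-type
// aliases combined with depth, size, and structured selectors. Purely illustrative;
// the constant is not referenced by any test, and its name is made up.
#[allow(dead_code)]
const EXAMPLE_ALIAS_QUERIES: &[&str] = &[
    "dir && depth > 0",                  // alias plus a numeric predicate
    "file AND size > 10kb",              // word-form AND with a size comparison
    "symlink || socket",                 // two aliases joined with the symbolic OR
    "yaml:.server.port AND ext == yaml", // structured selector mixed with an ordinary predicate
];
// --- end editorial sketch ---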
5 | 6 | use detect::{ 7 | expr::Expr, 8 | parser::{RawParser, Typechecker}, 9 | predicate::{DetectFileType, EnumMatcher, MetadataPredicate, Predicate}, 10 | }; 11 | 12 | /// Helper to parse and typecheck an expression 13 | fn parse_and_typecheck(input: &str) -> Result, detect::parser::error::DetectError> { 14 | let raw = RawParser::parse_raw_expr(input)?; 15 | Typechecker::typecheck(raw, input, &detect::RuntimeConfig::default()) 16 | } 17 | 18 | #[test] 19 | fn test_all_file_type_aliases_parse() { 20 | // All file type aliases should parse and typecheck successfully 21 | let aliases = [ 22 | "file", 23 | "dir", 24 | "directory", 25 | "symlink", 26 | "link", 27 | "socket", 28 | "sock", 29 | "fifo", 30 | "pipe", 31 | "block", 32 | "blockdev", 33 | "char", 34 | "chardev", 35 | ]; 36 | 37 | for alias in &aliases { 38 | let result = parse_and_typecheck(alias); 39 | assert!( 40 | result.is_ok(), 41 | "Alias '{}' should parse successfully, got: {:?}", 42 | alias, 43 | result.err() 44 | ); 45 | } 46 | } 47 | 48 | #[test] 49 | fn test_alias_case_insensitive() { 50 | // Aliases should be case-insensitive 51 | assert!(parse_and_typecheck("FILE").is_ok()); 52 | assert!(parse_and_typecheck("Dir").is_ok()); 53 | assert!(parse_and_typecheck("DIRECTORY").is_ok()); 54 | assert!(parse_and_typecheck("SyMlInK").is_ok()); 55 | } 56 | 57 | #[test] 58 | fn test_alias_equivalence_to_explicit_predicate() { 59 | // `dir` should be equivalent to `type == dir` 60 | let alias_result = parse_and_typecheck("dir").unwrap(); 61 | let explicit_result = parse_and_typecheck("type == dir").unwrap(); 62 | 63 | // Both should produce MetadataPredicate::Type with Equals matcher 64 | match (&alias_result, &explicit_result) { 65 | (Expr::Predicate(Predicate::Metadata(a)), Expr::Predicate(Predicate::Metadata(e))) => { 66 | assert_eq!(a, e, "Alias and explicit predicate should be equal"); 67 | } 68 | _ => panic!("Both should be Predicate::Metadata"), 69 | } 70 | } 71 | 72 | #[test] 73 | fn test_alias_in_boolean_expression() { 74 | // Aliases should work in boolean expressions 75 | let result = parse_and_typecheck("dir && depth > 0"); 76 | assert!(result.is_ok(), "Boolean expression with alias should parse"); 77 | 78 | let result = parse_and_typecheck("file || symlink"); 79 | assert!(result.is_ok(), "OR with aliases should parse"); 80 | 81 | let result = parse_and_typecheck("NOT dir"); 82 | assert!(result.is_ok(), "NOT with alias should parse"); 83 | } 84 | 85 | #[test] 86 | fn test_alias_with_word_form_operators() { 87 | // Test word-form AND operator 88 | let result = parse_and_typecheck("file AND size > 10kb"); 89 | assert!( 90 | result.is_ok(), 91 | "Alias with word-form AND should parse, got: {:?}", 92 | result.err() 93 | ); 94 | 95 | // Test word-form OR operator 96 | let result = parse_and_typecheck("dir OR file"); 97 | assert!( 98 | result.is_ok(), 99 | "Alias with word-form OR should parse, got: {:?}", 100 | result.err() 101 | ); 102 | 103 | // Test case-insensitive word operators 104 | let result = parse_and_typecheck("file and size > 1mb"); 105 | assert!( 106 | result.is_ok(), 107 | "Alias with lowercase 'and' should parse, got: {:?}", 108 | result.err() 109 | ); 110 | 111 | let result = parse_and_typecheck("file or dir"); 112 | assert!( 113 | result.is_ok(), 114 | "Alias with lowercase 'or' should parse, got: {:?}", 115 | result.err() 116 | ); 117 | 118 | // Mixed case 119 | let result = parse_and_typecheck("file And size > 1kb"); 120 | assert!( 121 | result.is_ok(), 122 | "Alias with mixed-case 'And' should parse, got: 
{:?}", 123 | result.err() 124 | ); 125 | 126 | // Complex expressions with multiple word operators 127 | let result = parse_and_typecheck("file AND size > 1mb OR dir AND depth < 3"); 128 | assert!( 129 | result.is_ok(), 130 | "Complex expression with multiple word operators should parse, got: {:?}", 131 | result.err() 132 | ); 133 | 134 | // Parenthesized expressions 135 | let result = parse_and_typecheck("(file OR dir) AND size > 100kb"); 136 | assert!( 137 | result.is_ok(), 138 | "Parenthesized expression with word operators should parse, got: {:?}", 139 | result.err() 140 | ); 141 | 142 | // With NOT 143 | let result = parse_and_typecheck("NOT file AND size > 1kb"); 144 | assert!( 145 | result.is_ok(), 146 | "NOT with word-form AND should parse, got: {:?}", 147 | result.err() 148 | ); 149 | } 150 | 151 | #[test] 152 | fn test_unknown_alias_error() { 153 | // Unknown aliases should produce helpful errors 154 | let result = parse_and_typecheck("unknownalias"); 155 | assert!(result.is_err()); 156 | 157 | if let Err(err) = result { 158 | assert!( 159 | matches!(err, detect::parser::error::DetectError::UnknownAlias { .. }), 160 | "Should produce UnknownAlias error" 161 | ); 162 | } 163 | } 164 | 165 | #[test] 166 | fn test_alias_suggestions() { 167 | // Close typos should get suggestions 168 | let result = parse_and_typecheck("fil"); 169 | assert!(result.is_err()); 170 | 171 | if let Err(detect::parser::error::DetectError::UnknownAlias { suggestions, .. }) = result { 172 | assert!( 173 | suggestions.is_some(), 174 | "Should provide suggestions for typos" 175 | ); 176 | let sugg = suggestions.unwrap(); 177 | assert!( 178 | sugg.contains("file"), 179 | "Should suggest 'file' for 'fil', got: {}", 180 | sugg 181 | ); 182 | } 183 | } 184 | 185 | #[test] 186 | fn test_wildcard_rejected() { 187 | // Wildcards should no longer parse as single words 188 | let result = RawParser::parse_raw_expr("*.rs"); 189 | assert!( 190 | result.is_err(), 191 | "Wildcards should be rejected by new grammar" 192 | ); 193 | 194 | let result = RawParser::parse_raw_expr("**/*.js"); 195 | assert!(result.is_err(), "Complex glob patterns should be rejected"); 196 | } 197 | 198 | // Filesystem evaluation is tested in integration.rs 199 | // This file focuses on parsing and typechecking of aliases 200 | 201 | #[test] 202 | fn test_complex_alias_expressions() { 203 | // Test complex boolean logic with aliases 204 | let result = parse_and_typecheck("(file || dir) && depth < 5"); 205 | assert!(result.is_ok(), "Complex alias expression should parse"); 206 | 207 | let result = parse_and_typecheck("NOT (symlink || socket) && file"); 208 | assert!(result.is_ok(), "Complex negation with aliases should parse"); 209 | } 210 | 211 | #[test] 212 | fn test_alias_constructed_predicates() { 213 | // Verify that aliases construct the correct predicate internally 214 | let typed = parse_and_typecheck("file").unwrap(); 215 | 216 | match typed { 217 | Expr::Predicate(Predicate::Metadata(meta)) => match meta.as_ref() { 218 | MetadataPredicate::Type(EnumMatcher::Equals(file_type)) => { 219 | assert_eq!( 220 | file_type, 221 | &DetectFileType::File, 222 | "Alias 'file' should construct DetectFileType::File" 223 | ); 224 | } 225 | _ => panic!("Expected Type predicate with Equals matcher"), 226 | }, 227 | _ => panic!("Expected Predicate::Metadata"), 228 | } 229 | } 230 | 231 | // ============================================================================ 232 | // Structured Selector Alias Tests 233 | // 
============================================================================ 234 | 235 | #[test] 236 | fn test_structured_selector_alias_parsing() { 237 | // YAML structured selectors should parse 238 | assert!(parse_and_typecheck("yaml:.field").is_ok()); 239 | assert!(parse_and_typecheck("yaml:.server.port").is_ok()); 240 | assert!(parse_and_typecheck("yaml:.items[0]").is_ok()); 241 | assert!(parse_and_typecheck("yaml:.items[*].name").is_ok()); 242 | assert!(parse_and_typecheck("yaml:..recursive").is_ok()); 243 | 244 | // JSON structured selectors should parse 245 | assert!(parse_and_typecheck("json:.version").is_ok()); 246 | assert!(parse_and_typecheck("json:.dependencies.lodash").is_ok()); 247 | 248 | // TOML structured selectors should parse 249 | assert!(parse_and_typecheck("toml:.package.name").is_ok()); 250 | assert!(parse_and_typecheck("toml:.dependencies.serde").is_ok()); 251 | } 252 | 253 | #[test] 254 | fn test_structured_selector_case_insensitive() { 255 | // Format prefix should be case-insensitive 256 | assert!(parse_and_typecheck("YAML:.field").is_ok()); 257 | assert!(parse_and_typecheck("Json:.field").is_ok()); 258 | assert!(parse_and_typecheck("toml:.field").is_ok()); 259 | assert!(parse_and_typecheck("YaML:.field").is_ok()); 260 | } 261 | 262 | #[test] 263 | fn test_structured_selector_with_boolean_logic() { 264 | // Structured selectors should work in boolean expressions 265 | assert!(parse_and_typecheck("yaml:.field AND size > 1kb").is_ok()); 266 | assert!(parse_and_typecheck("json:.version OR toml:.version").is_ok()); 267 | assert!(parse_and_typecheck("NOT yaml:.field").is_ok()); 268 | assert!(parse_and_typecheck("(yaml:.a OR json:.b) && file").is_ok()); 269 | } 270 | 271 | #[test] 272 | fn test_structured_selector_complex_and_logic() { 273 | // Complex AND expressions with structured selectors 274 | assert!(parse_and_typecheck("yaml:.field AND json:.field").is_ok()); 275 | assert!(parse_and_typecheck("yaml:.a AND yaml:.b AND yaml:.c").is_ok()); 276 | assert!(parse_and_typecheck("yaml:.field && json:.field && toml:.field").is_ok()); 277 | 278 | // Mixed word-form and symbol operators 279 | assert!(parse_and_typecheck("yaml:.field AND json:.field && toml:.field").is_ok()); 280 | assert!(parse_and_typecheck("yaml:.a && yaml:.b AND yaml:.c").is_ok()); 281 | } 282 | 283 | #[test] 284 | fn test_structured_selector_complex_or_logic() { 285 | // Complex OR expressions with structured selectors 286 | assert!(parse_and_typecheck("yaml:.field OR json:.field").is_ok()); 287 | assert!(parse_and_typecheck("yaml:.a OR yaml:.b OR yaml:.c").is_ok()); 288 | assert!(parse_and_typecheck("yaml:.field || json:.field || toml:.field").is_ok()); 289 | 290 | // Mixed word-form and symbol operators 291 | assert!(parse_and_typecheck("yaml:.field OR json:.field || toml:.field").is_ok()); 292 | assert!(parse_and_typecheck("yaml:.a || yaml:.b OR yaml:.c").is_ok()); 293 | } 294 | 295 | #[test] 296 | fn test_structured_selector_negation_variants() { 297 | // All negation forms should work 298 | assert!(parse_and_typecheck("NOT yaml:.field").is_ok()); 299 | assert!(parse_and_typecheck("not yaml:.field").is_ok()); 300 | assert!(parse_and_typecheck("! yaml:.field").is_ok()); 301 | assert!(parse_and_typecheck("\\! yaml:.field").is_ok()); 302 | 303 | // Double negation 304 | assert!(parse_and_typecheck("NOT NOT yaml:.field").is_ok()); 305 | assert!(parse_and_typecheck("!! 
yaml:.field").is_ok()); 306 | 307 | // Negation with multiple selectors 308 | assert!(parse_and_typecheck("NOT (yaml:.a OR yaml:.b)").is_ok()); 309 | assert!(parse_and_typecheck("!(yaml:.a AND yaml:.b)").is_ok()); 310 | } 311 | 312 | #[test] 313 | fn test_structured_selector_precedence_and_grouping() { 314 | // Test operator precedence with parentheses 315 | assert!(parse_and_typecheck("yaml:.a AND (yaml:.b OR yaml:.c)").is_ok()); 316 | assert!(parse_and_typecheck("(yaml:.a OR yaml:.b) AND yaml:.c").is_ok()); 317 | assert!(parse_and_typecheck("yaml:.a OR yaml:.b AND yaml:.c").is_ok()); 318 | 319 | // Complex nested grouping 320 | assert!(parse_and_typecheck("((yaml:.a OR yaml:.b) AND yaml:.c) OR yaml:.d").is_ok()); 321 | assert!(parse_and_typecheck("yaml:.a AND (yaml:.b OR (yaml:.c AND yaml:.d))").is_ok()); 322 | 323 | // Negation with grouping 324 | assert!(parse_and_typecheck("NOT (yaml:.a AND yaml:.b)").is_ok()); 325 | assert!(parse_and_typecheck("!(yaml:.a || yaml:.b) AND yaml:.c").is_ok()); 326 | } 327 | 328 | #[test] 329 | fn test_structured_selector_mixed_with_aliases() { 330 | // Structured selectors combined with file type aliases 331 | assert!(parse_and_typecheck("yaml:.field AND file").is_ok()); 332 | assert!(parse_and_typecheck("file AND yaml:.field").is_ok()); 333 | assert!(parse_and_typecheck("dir OR yaml:.config").is_ok()); 334 | assert!(parse_and_typecheck("yaml:.field AND NOT symlink").is_ok()); 335 | 336 | // Complex combinations 337 | assert!(parse_and_typecheck("(file OR dir) AND yaml:.field").is_ok()); 338 | assert!(parse_and_typecheck("yaml:.a AND (file OR symlink) AND json:.b").is_ok()); 339 | assert!(parse_and_typecheck("NOT file OR yaml:.config").is_ok()); 340 | } 341 | 342 | #[test] 343 | fn test_structured_selector_mixed_with_predicates() { 344 | // Structured selectors with other predicate types 345 | assert!(parse_and_typecheck("yaml:.field AND size > 10kb").is_ok()); 346 | assert!(parse_and_typecheck("yaml:.field AND name == test.yaml").is_ok()); 347 | assert!(parse_and_typecheck("yaml:.field AND ext == yaml").is_ok()); 348 | assert!(parse_and_typecheck("yaml:.field AND modified > -7d").is_ok()); 349 | 350 | // Complex mixed predicates 351 | assert!(parse_and_typecheck("yaml:.field AND size > 1mb AND ext == yaml").is_ok()); 352 | assert!(parse_and_typecheck("(yaml:.field OR json:.field) AND size < 10mb").is_ok()); 353 | assert!(parse_and_typecheck("yaml:.a AND yaml:.b AND name ~= \"config.*\"").is_ok()); 354 | 355 | // With negation 356 | assert!(parse_and_typecheck("yaml:.field AND NOT (size > 1gb)").is_ok()); 357 | assert!(parse_and_typecheck("NOT yaml:.field AND ext == yaml").is_ok()); 358 | } 359 | 360 | #[test] 361 | fn test_structured_selector_de_morgan_laws() { 362 | // Test De Morgan's law equivalences parse correctly 363 | // NOT (A AND B) is equivalent to (NOT A) OR (NOT B) 364 | assert!(parse_and_typecheck("NOT (yaml:.a AND yaml:.b)").is_ok()); 365 | assert!(parse_and_typecheck("NOT yaml:.a OR NOT yaml:.b").is_ok()); 366 | 367 | // NOT (A OR B) is equivalent to (NOT A) AND (NOT B) 368 | assert!(parse_and_typecheck("NOT (yaml:.a OR yaml:.b)").is_ok()); 369 | assert!(parse_and_typecheck("NOT yaml:.a AND NOT yaml:.b").is_ok()); 370 | } 371 | 372 | #[test] 373 | fn test_structured_selector_all_formats_combined() { 374 | // All three formats in complex expressions 375 | assert!(parse_and_typecheck("yaml:.a AND json:.b AND toml:.c").is_ok()); 376 | assert!(parse_and_typecheck("yaml:.a OR json:.b OR toml:.c").is_ok()); 377 | 
assert!(parse_and_typecheck("(yaml:.a AND json:.b) OR toml:.c").is_ok()); 378 | assert!(parse_and_typecheck("yaml:.a AND (json:.b OR toml:.c)").is_ok()); 379 | 380 | // With negation 381 | assert!(parse_and_typecheck("yaml:.a AND NOT json:.b AND toml:.c").is_ok()); 382 | assert!(parse_and_typecheck("NOT (yaml:.a OR json:.b OR toml:.c)").is_ok()); 383 | 384 | // Mixed with other predicates 385 | assert!(parse_and_typecheck("yaml:.a AND json:.b AND toml:.c AND file").is_ok()); 386 | assert!(parse_and_typecheck("(yaml:.a OR json:.b OR toml:.c) AND size > 1kb").is_ok()); 387 | } 388 | 389 | #[test] 390 | fn test_structured_selector_invalid_format() { 391 | // Invalid format prefixes should produce UnknownStructuredFormat error 392 | let result = parse_and_typecheck("xml:.field"); 393 | assert!(result.is_err()); 394 | 395 | if let Err(err) = result { 396 | let err_str = format!("{:?}", err); 397 | assert!( 398 | err_str.contains("UnknownStructuredFormat") || err_str.contains("xml"), 399 | "Expected UnknownStructuredFormat error, got: {:?}", 400 | err 401 | ); 402 | } 403 | 404 | // Other invalid formats 405 | assert!(parse_and_typecheck("csv:.column").is_err()); 406 | assert!(parse_and_typecheck("ini:.section").is_err()); 407 | } 408 | 409 | #[test] 410 | fn test_structured_selector_invalid_path() { 411 | // Empty path should produce error 412 | let result = parse_and_typecheck("yaml:"); 413 | assert!(result.is_err(), "Empty path should fail"); 414 | 415 | // Invalid path syntax should produce error 416 | let result = parse_and_typecheck("yaml:["); 417 | assert!(result.is_err(), "Unclosed bracket should fail"); 418 | } 419 | 420 | #[test] 421 | fn test_structured_selector_constructs_exists_predicate() { 422 | // Verify that structured selectors construct StructuredDataPredicate::*Exists 423 | use detect::predicate::StructuredDataPredicate; 424 | 425 | let typed = parse_and_typecheck("yaml:.field").unwrap(); 426 | 427 | match typed { 428 | Expr::Predicate(Predicate::Structured(predicate)) => match predicate { 429 | StructuredDataPredicate::YamlExists { path } => { 430 | assert_eq!(path.len(), 1, "Should have one path component"); 431 | } 432 | _ => panic!("Expected YamlExists predicate, got: {:?}", predicate), 433 | }, 434 | _ => panic!("Expected Predicate::Structured, got: {:?}", typed), 435 | } 436 | 437 | let typed = parse_and_typecheck("json:.version").unwrap(); 438 | 439 | match typed { 440 | Expr::Predicate(Predicate::Structured(StructuredDataPredicate::JsonExists { .. })) => { 441 | // Success 442 | } 443 | _ => panic!("Expected JsonExists predicate"), 444 | } 445 | 446 | let typed = parse_and_typecheck("toml:.package").unwrap(); 447 | 448 | match typed { 449 | Expr::Predicate(Predicate::Structured(StructuredDataPredicate::TomlExists { .. })) => { 450 | // Success 451 | } 452 | _ => panic!("Expected TomlExists predicate"), 453 | } 454 | } 455 | -------------------------------------------------------------------------------- /tests/navigation_structured.rs: -------------------------------------------------------------------------------- 1 | //! Comprehensive tests for structured data navigation (YAML/JSON/TOML) 2 | //! 3 | //! Tests the iterative, zero-clone traversal of parsed documents. 
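// --- editorial sketch, not part of the original test file ---
// End-to-end shape of the API exercised below: parse_path turns a selector path into
// navigation steps, and navigate_yaml walks a parsed document, returning references to
// every matching node. Assumes yaml_rust2's YamlLoader::load_from_str and that the
// navigation functions return a collection of references, as the comparisons in this
// file imply; the test name and YAML snippet are made up.
#[test]
fn sketch_navigate_yaml_scalar() {
    let docs = yaml_rust2::YamlLoader::load_from_str("server:\n  port: 8080\n").unwrap();
    let path = detect::parser::structured_path::parse_path(".server.port").unwrap();
    let results = detect::eval::structured::navigate_yaml(&docs[0], &path);
    assert!(results.iter().any(|v| **v == yaml_rust2::Yaml::Integer(8080)));
}
// --- end editorial sketch ---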
4 | 5 | use detect::eval::structured::{navigate_json, navigate_toml, navigate_yaml}; 6 | use detect::parser::structured_path::parse_path; 7 | 8 | // ============================================================================ 9 | // YAML Test Helpers 10 | // ============================================================================ 11 | 12 | /// Helper to construct YAML integer 13 | fn yaml_int(i: i64) -> yaml_rust2::Yaml { 14 | yaml_rust2::Yaml::Integer(i) 15 | } 16 | 17 | /// Helper to construct YAML string 18 | fn yaml_str(s: &str) -> yaml_rust2::Yaml { 19 | yaml_rust2::Yaml::String(s.to_string()) 20 | } 21 | 22 | /// Helper to construct YAML boolean 23 | fn yaml_bool(b: bool) -> yaml_rust2::Yaml { 24 | yaml_rust2::Yaml::Boolean(b) 25 | } 26 | 27 | /// Helper to construct YAML array 28 | fn yaml_array(items: Vec<yaml_rust2::Yaml>) -> yaml_rust2::Yaml { 29 | yaml_rust2::Yaml::Array(items) 30 | } 31 | 32 | /// Helper to construct YAML hash/object 33 | fn yaml_hash(pairs: Vec<(&str, yaml_rust2::Yaml)>) -> yaml_rust2::Yaml { 34 | use yaml_rust2::yaml::Hash; 35 | let mut map = Hash::new(); 36 | for (k, v) in pairs { 37 | map.insert(yaml_rust2::Yaml::String(k.to_string()), v); 38 | } 39 | yaml_rust2::Yaml::Hash(map) 40 | } 41 | 42 | /// YAML navigation test case 43 | struct YamlNavCase { 44 | name: &'static str, 45 | document: yaml_rust2::Yaml, 46 | path: &'static str, 47 | expected: Vec<yaml_rust2::Yaml>, 48 | } 49 | 50 | /// Check if two slices contain the same elements (order-insensitive) 51 | fn yaml_sets_equal(a: &[&yaml_rust2::Yaml], b: &[yaml_rust2::Yaml]) -> bool { 52 | if a.len() != b.len() { 53 | return false; 54 | } 55 | 56 | // Check every element in 'a' exists in 'b' 57 | for item_a in a { 58 | if !b.iter().any(|item_b| item_a == &item_b) { 59 | return false; 60 | } 61 | } 62 | 63 | // Check every element in 'b' exists in 'a' 64 | for item_b in b { 65 | if !a.contains(&item_b) { 66 | return false; 67 | } 68 | } 69 | 70 | true 71 | } 72 | 73 | /// Run a batch of YAML navigation test cases 74 | fn run_yaml_tests(test_cases: &[YamlNavCase]) { 75 | for case in test_cases { 76 | let path = parse_path(case.path).unwrap_or_else(|e| { 77 | panic!( 78 | "Test '{}': Failed to parse path '{}': {:?}", 79 | case.name, case.path, e 80 | ) 81 | }); 82 | 83 | let results = navigate_yaml(&case.document, &path); 84 | 85 | // Order-insensitive comparison 86 | if !yaml_sets_equal(&results, &case.expected) { 87 | panic!( 88 | "Test '{}' failed:\n Path: {}\n Expected {} results: {:?}\n Got {} results: {:?}", 89 | case.name, 90 | case.path, 91 | case.expected.len(), 92 | case.expected, 93 | results.len(), 94 | results 95 | ); 96 | } 97 | } 98 | } 99 | 100 | // ============================================================================ 101 | // JSON Test Helpers 102 | // ============================================================================ 103 | 104 | /// Helper to construct JSON integer 105 | fn json_int(i: i64) -> serde_json::Value { 106 | serde_json::Value::Number(serde_json::Number::from(i)) 107 | } 108 | 109 | /// Helper to construct JSON string 110 | fn json_str(s: &str) -> serde_json::Value { 111 | serde_json::Value::String(s.to_string()) 112 | } 113 | 114 | /// Helper to construct JSON array 115 | fn json_array(items: Vec<serde_json::Value>) -> serde_json::Value { 116 | serde_json::Value::Array(items) 117 | } 118 | 119 | /// Helper to construct JSON object 120 | fn json_object(pairs: Vec<(&str, serde_json::Value)>) -> serde_json::Value { 121 | let map: serde_json::Map<String, serde_json::Value> = 122 | pairs.into_iter().map(|(k, v)| (k.to_string(), v)).collect();
123 | serde_json::Value::Object(map) 124 | } 125 | 126 | /// JSON navigation test case 127 | struct JsonNavCase { 128 | name: &'static str, 129 | document: serde_json::Value, 130 | path: &'static str, 131 | expected: Vec<serde_json::Value>, 132 | } 133 | 134 | /// Check if two slices contain the same JSON elements (order-insensitive) 135 | fn json_sets_equal(a: &[&serde_json::Value], b: &[serde_json::Value]) -> bool { 136 | if a.len() != b.len() { 137 | return false; 138 | } 139 | 140 | for item_a in a { 141 | if !b.iter().any(|item_b| item_a == &item_b) { 142 | return false; 143 | } 144 | } 145 | 146 | for item_b in b { 147 | if !a.contains(&item_b) { 148 | return false; 149 | } 150 | } 151 | 152 | true 153 | } 154 | 155 | /// Run a batch of JSON navigation test cases 156 | fn run_json_tests(test_cases: &[JsonNavCase]) { 157 | for case in test_cases { 158 | let path = parse_path(case.path).unwrap_or_else(|e| { 159 | panic!( 160 | "Test '{}': Failed to parse path '{}': {:?}", 161 | case.name, case.path, e 162 | ) 163 | }); 164 | 165 | let results = navigate_json(&case.document, &path); 166 | 167 | if !json_sets_equal(&results, &case.expected) { 168 | panic!( 169 | "Test '{}' failed:\n Path: {}\n Expected {} results: {:?}\n Got {} results: {:?}", 170 | case.name, 171 | case.path, 172 | case.expected.len(), 173 | case.expected, 174 | results.len(), 175 | results 176 | ); 177 | } 178 | } 179 | } 180 | 181 | // ============================================================================ 182 | // TOML Test Helpers 183 | // ============================================================================ 184 | 185 | /// Helper to construct TOML integer 186 | fn toml_int(i: i64) -> toml::Value { 187 | toml::Value::Integer(i) 188 | } 189 | 190 | /// Helper to construct TOML string 191 | fn toml_str(s: &str) -> toml::Value { 192 | toml::Value::String(s.to_string()) 193 | } 194 | 195 | /// Helper to construct TOML array 196 | fn toml_array(items: Vec<toml::Value>) -> toml::Value { 197 | toml::Value::Array(items) 198 | } 199 | 200 | /// Helper to construct TOML table 201 | fn toml_table(pairs: Vec<(&str, toml::Value)>) -> toml::Value { 202 | let map: toml::map::Map<String, toml::Value> = 203 | pairs.into_iter().map(|(k, v)| (k.to_string(), v)).collect(); 204 | toml::Value::Table(map) 205 | } 206 | 207 | /// TOML navigation test case 208 | struct TomlNavCase { 209 | name: &'static str, 210 | document: toml::Value, 211 | path: &'static str, 212 | expected: Vec<toml::Value>, 213 | } 214 | 215 | /// Check if two slices contain the same TOML elements (order-insensitive) 216 | fn toml_sets_equal(a: &[&toml::Value], b: &[toml::Value]) -> bool { 217 | if a.len() != b.len() { 218 | return false; 219 | } 220 | 221 | for item_a in a { 222 | if !b.iter().any(|item_b| item_a == &item_b) { 223 | return false; 224 | } 225 | } 226 | 227 | for item_b in b { 228 | if !a.contains(&item_b) { 229 | return false; 230 | } 231 | } 232 | 233 | true 234 | } 235 | 236 | /// Run a batch of TOML navigation test cases 237 | fn run_toml_tests(test_cases: &[TomlNavCase]) { 238 | for case in test_cases { 239 | let path = parse_path(case.path).unwrap_or_else(|e| { 240 | panic!( 241 | "Test '{}': Failed to parse path '{}': {:?}", 242 | case.name, case.path, e 243 | ) 244 | }); 245 | 246 | let results = navigate_toml(&case.document, &path); 247 | 248 | if !toml_sets_equal(&results, &case.expected) { 249 | panic!( 250 | "Test '{}' failed:\n Path: {}\n Expected {} results: {:?}\n Got {} results: {:?}", 251 | case.name, 252 | case.path, 253 | case.expected.len(), 254 | case.expected, 255 | results.len(), 256
| results 257 | ); 258 | } 259 | } 260 | } 261 | 262 | // ============================================================================ 263 | // YAML Navigation Tests 264 | // ============================================================================ 265 | 266 | #[test] 267 | fn test_yaml_basic_navigation() { 268 | let test_cases = vec![ 269 | YamlNavCase { 270 | name: "simple key", 271 | document: yaml_hash(vec![("port", yaml_int(8080))]), 272 | path: ".port", 273 | expected: vec![yaml_int(8080)], 274 | }, 275 | YamlNavCase { 276 | name: "nested keys", 277 | document: yaml_hash(vec![("server", yaml_hash(vec![("port", yaml_int(8080))]))]), 278 | path: ".server.port", 279 | expected: vec![yaml_int(8080)], 280 | }, 281 | YamlNavCase { 282 | name: "deep nesting", 283 | document: yaml_hash(vec![( 284 | "a", 285 | yaml_hash(vec![( 286 | "b", 287 | yaml_hash(vec![("c", yaml_hash(vec![("d", yaml_str("deep"))]))]), 288 | )]), 289 | )]), 290 | path: ".a.b.c.d", 291 | expected: vec![yaml_str("deep")], 292 | }, 293 | YamlNavCase { 294 | name: "missing key returns empty", 295 | document: yaml_hash(vec![("port", yaml_int(8080))]), 296 | path: ".missing", 297 | expected: vec![], 298 | }, 299 | YamlNavCase { 300 | name: "missing nested key returns empty", 301 | document: yaml_hash(vec![("server", yaml_hash(vec![("port", yaml_int(8080))]))]), 302 | path: ".server.missing", 303 | expected: vec![], 304 | }, 305 | ]; 306 | 307 | run_yaml_tests(&test_cases); 308 | } 309 | 310 | #[test] 311 | fn test_yaml_array_navigation() { 312 | let test_cases = vec![ 313 | YamlNavCase { 314 | name: "array index 0", 315 | document: yaml_hash(vec![( 316 | "items", 317 | yaml_array(vec![ 318 | yaml_str("first"), 319 | yaml_str("second"), 320 | yaml_str("third"), 321 | ]), 322 | )]), 323 | path: ".items[0]", 324 | expected: vec![yaml_str("first")], 325 | }, 326 | YamlNavCase { 327 | name: "array index middle", 328 | document: yaml_hash(vec![( 329 | "items", 330 | yaml_array(vec![ 331 | yaml_str("first"), 332 | yaml_str("second"), 333 | yaml_str("third"), 334 | ]), 335 | )]), 336 | path: ".items[1]", 337 | expected: vec![yaml_str("second")], 338 | }, 339 | YamlNavCase { 340 | name: "array index last", 341 | document: yaml_hash(vec![( 342 | "items", 343 | yaml_array(vec![ 344 | yaml_str("first"), 345 | yaml_str("second"), 346 | yaml_str("third"), 347 | ]), 348 | )]), 349 | path: ".items[2]", 350 | expected: vec![yaml_str("third")], 351 | }, 352 | YamlNavCase { 353 | name: "array out of bounds returns empty", 354 | document: yaml_hash(vec![("items", yaml_array(vec![yaml_str("first")]))]), 355 | path: ".items[999]", 356 | expected: vec![], 357 | }, 358 | YamlNavCase { 359 | name: "chained array access", 360 | document: yaml_hash(vec![( 361 | "matrix", 362 | yaml_array(vec![ 363 | yaml_array(vec![yaml_int(1), yaml_int(2)]), 364 | yaml_array(vec![yaml_int(3), yaml_int(4)]), 365 | ]), 366 | )]), 367 | path: ".matrix[1][0]", 368 | expected: vec![yaml_int(3)], 369 | }, 370 | YamlNavCase { 371 | name: "array then key", 372 | document: yaml_hash(vec![( 373 | "users", 374 | yaml_array(vec![ 375 | yaml_hash(vec![("name", yaml_str("alice"))]), 376 | yaml_hash(vec![("name", yaml_str("bob"))]), 377 | ]), 378 | )]), 379 | path: ".users[1].name", 380 | expected: vec![yaml_str("bob")], 381 | }, 382 | ]; 383 | 384 | run_yaml_tests(&test_cases); 385 | } 386 | 387 | #[test] 388 | fn test_yaml_wildcard_navigation() { 389 | let test_cases = vec![ 390 | YamlNavCase { 391 | name: "wildcard all array elements", 392 | document: yaml_hash(vec![( 393 | 
"items", 394 | yaml_array(vec![yaml_int(1), yaml_int(2), yaml_int(3)]), 395 | )]), 396 | path: ".items[*]", 397 | expected: vec![yaml_int(1), yaml_int(2), yaml_int(3)], 398 | }, 399 | YamlNavCase { 400 | name: "wildcard with mixed types", 401 | document: yaml_hash(vec![( 402 | "mixed", 403 | yaml_array(vec![yaml_int(42), yaml_str("hello"), yaml_bool(true)]), 404 | )]), 405 | path: ".mixed[*]", 406 | expected: vec![yaml_int(42), yaml_str("hello"), yaml_bool(true)], 407 | }, 408 | YamlNavCase { 409 | name: "wildcard then field", 410 | document: yaml_hash(vec![( 411 | "users", 412 | yaml_array(vec![ 413 | yaml_hash(vec![("id", yaml_int(1))]), 414 | yaml_hash(vec![("id", yaml_int(2))]), 415 | yaml_hash(vec![("id", yaml_int(3))]), 416 | ]), 417 | )]), 418 | path: ".users[*].id", 419 | expected: vec![yaml_int(1), yaml_int(2), yaml_int(3)], 420 | }, 421 | YamlNavCase { 422 | name: "wildcard on empty array", 423 | document: yaml_hash(vec![("empty", yaml_array(vec![]))]), 424 | path: ".empty[*]", 425 | expected: vec![], 426 | }, 427 | ]; 428 | 429 | run_yaml_tests(&test_cases); 430 | } 431 | 432 | #[test] 433 | fn test_yaml_recursive_descent() { 434 | let test_cases = vec![ 435 | YamlNavCase { 436 | name: "recursive finds single occurrence", 437 | document: yaml_hash(vec![( 438 | "config", 439 | yaml_hash(vec![( 440 | "database", 441 | yaml_hash(vec![("host", yaml_str("localhost"))]), 442 | )]), 443 | )]), 444 | path: "..host", 445 | expected: vec![yaml_str("localhost")], 446 | }, 447 | YamlNavCase { 448 | name: "recursive finds multiple occurrences", 449 | document: yaml_hash(vec![ 450 | ("db1", yaml_hash(vec![("host", yaml_str("server1"))])), 451 | ("db2", yaml_hash(vec![("host", yaml_str("server2"))])), 452 | ]), 453 | path: "..host", 454 | expected: vec![yaml_str("server1"), yaml_str("server2")], 455 | }, 456 | YamlNavCase { 457 | name: "recursive descent then field", 458 | document: yaml_hash(vec![ 459 | ( 460 | "app1", 461 | yaml_hash(vec![( 462 | "database", 463 | yaml_hash(vec![("host", yaml_str("db1"))]), 464 | )]), 465 | ), 466 | ( 467 | "app2", 468 | yaml_hash(vec![( 469 | "database", 470 | yaml_hash(vec![("host", yaml_str("db2"))]), 471 | )]), 472 | ), 473 | ]), 474 | path: "..database.host", 475 | expected: vec![yaml_str("db1"), yaml_str("db2")], 476 | }, 477 | YamlNavCase { 478 | name: "recursive in nested arrays", 479 | document: yaml_hash(vec![( 480 | "items", 481 | yaml_array(vec![ 482 | yaml_hash(vec![("id", yaml_int(1))]), 483 | yaml_hash(vec![("id", yaml_int(2))]), 484 | ]), 485 | )]), 486 | path: "..id", 487 | expected: vec![yaml_int(1), yaml_int(2)], 488 | }, 489 | YamlNavCase { 490 | name: "recursive descent with wildcard", 491 | document: yaml_hash(vec![( 492 | "services", 493 | yaml_array(vec![ 494 | yaml_hash(vec![( 495 | "ports", 496 | yaml_array(vec![yaml_int(8080), yaml_int(8081)]), 497 | )]), 498 | yaml_hash(vec![("ports", yaml_array(vec![yaml_int(9090)]))]), 499 | ]), 500 | )]), 501 | path: "..ports[*]", 502 | expected: vec![yaml_int(8080), yaml_int(8081), yaml_int(9090)], 503 | }, 504 | ]; 505 | 506 | run_yaml_tests(&test_cases); 507 | } 508 | 509 | // ============================================================================ 510 | // JSON Navigation Tests 511 | // ============================================================================ 512 | 513 | #[test] 514 | fn test_json_basic_navigation() { 515 | let test_cases = vec![ 516 | JsonNavCase { 517 | name: "simple key", 518 | document: json_object(vec![("port", json_int(8080))]), 519 | path: ".port", 520 | 
expected: vec![json_int(8080)], 521 | }, 522 | JsonNavCase { 523 | name: "nested keys", 524 | document: json_object(vec![( 525 | "server", 526 | json_object(vec![("port", json_int(8080))]), 527 | )]), 528 | path: ".server.port", 529 | expected: vec![json_int(8080)], 530 | }, 531 | JsonNavCase { 532 | name: "array index", 533 | document: json_object(vec![( 534 | "items", 535 | json_array(vec![json_str("a"), json_str("b")]), 536 | )]), 537 | path: ".items[1]", 538 | expected: vec![json_str("b")], 539 | }, 540 | JsonNavCase { 541 | name: "wildcard", 542 | document: json_object(vec![("nums", json_array(vec![json_int(1), json_int(2)]))]), 543 | path: ".nums[*]", 544 | expected: vec![json_int(1), json_int(2)], 545 | }, 546 | ]; 547 | 548 | run_json_tests(&test_cases); 549 | } 550 | 551 | // ============================================================================ 552 | // TOML Navigation Tests 553 | // ============================================================================ 554 | 555 | #[test] 556 | fn test_toml_basic_navigation() { 557 | let test_cases = vec![ 558 | TomlNavCase { 559 | name: "simple key", 560 | document: toml_table(vec![("port", toml_int(8080))]), 561 | path: ".port", 562 | expected: vec![toml_int(8080)], 563 | }, 564 | TomlNavCase { 565 | name: "nested table", 566 | document: toml_table(vec![("server", toml_table(vec![("port", toml_int(8080))]))]), 567 | path: ".server.port", 568 | expected: vec![toml_int(8080)], 569 | }, 570 | TomlNavCase { 571 | name: "array index", 572 | document: toml_table(vec![( 573 | "items", 574 | toml_array(vec![toml_str("a"), toml_str("b")]), 575 | )]), 576 | path: ".items[1]", 577 | expected: vec![toml_str("b")], 578 | }, 579 | TomlNavCase { 580 | name: "wildcard", 581 | document: toml_table(vec![("nums", toml_array(vec![toml_int(1), toml_int(2)]))]), 582 | path: ".nums[*]", 583 | expected: vec![toml_int(1), toml_int(2)], 584 | }, 585 | ]; 586 | 587 | run_toml_tests(&test_cases); 588 | } 589 | --------------------------------------------------------------------------------
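For orientation, here is a minimal end-to-end sketch of the pattern the table-driven tests above exercise, using only the calls the test file itself makes (parse_path and navigate_yaml). The standalone-program form and the exact return type of navigate_yaml (a Vec of borrowed references) are inferred from those call sites, not taken from the crate's public documentation:

use detect::eval::structured::navigate_yaml;
use detect::parser::structured_path::parse_path;
use yaml_rust2::{yaml::Hash, Yaml};

fn main() {
    // Build the document { server: { port: 8080 } }, the same way the yaml_hash helper does.
    let mut server = Hash::new();
    server.insert(Yaml::String("port".to_string()), Yaml::Integer(8080));
    let mut root = Hash::new();
    root.insert(Yaml::String("server".to_string()), Yaml::Hash(server));
    let doc = Yaml::Hash(root);

    // ".server.port" walks two keys; matches are references into `doc`, never clones.
    let path = parse_path(".server.port").expect("path should parse");
    let hits = navigate_yaml(&doc, &path);
    assert_eq!(hits, vec![&Yaml::Integer(8080)]);
}

This mirrors the "nested keys" case in test_yaml_basic_navigation; the JSON and TOML variants follow the same shape with serde_json::Value and toml::Value documents.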