├── .chglog ├── CHANGELOG.tpl.md └── config.yml ├── .codecov.yaml ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── benchmarks.yml │ ├── codeql-analysis.yaml │ ├── dep-review.yaml │ ├── go-lint.yaml │ ├── go-unit-tests.yaml │ └── release.yaml ├── .gitignore ├── .golangci.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── PATTERNS.md ├── README-pruner.md ├── README.md ├── REGEXP.md ├── anything_but.go ├── anything_but_test.go ├── arrays_test.go ├── benchmarks_test.go ├── case_folding.go ├── citylots_bench_test.go ├── cl2_test.go ├── code_gen ├── build_casefolding_table.go └── qtest-main.not-go ├── concurrency_test.go ├── core_matcher.go ├── core_matcher_test.go ├── doc.go ├── escaping_test.go ├── example_test.go ├── external_test.go ├── field_matcher.go ├── flatten_json.go ├── flatten_json_bench_test.go ├── flatten_json_test.go ├── flattener.go ├── generic_machine_test.go ├── go.mod ├── go.sum ├── live_pattern_state.go ├── live_pattern_state_test.go ├── match_set.go ├── match_set_test.go ├── matcher.go ├── matcher_test.go ├── monocase.go ├── monocase_test.go ├── nfa.go ├── nfa_test.go ├── numbers.go ├── numbers_test.go ├── numbits.go ├── numbits_test.go ├── pattern.go ├── pattern_test.go ├── prettyprinter.go ├── prettyprinter_test.go ├── pruner.go ├── pruner_test.go ├── quamina.go ├── quamina_test.go ├── race_test.go ├── rebuilding.go ├── rebuilding_test.go ├── regexp_end2end_test.go ├── regexp_nfa.go ├── regexp_nfa_test.go ├── regexp_parse.go ├── regexp_parse_test.go ├── regexp_reader.go ├── regexp_reader_test.go ├── regexp_samples_test.go ├── regexp_validity_test.go ├── segments_tree.go ├── segments_tree_test.go ├── segments_tree_tracker.go ├── shell_style.go ├── shell_style_test.go ├── small_table.go ├── small_table_test.go ├── stats.go ├── testdata ├── arrayEvent1.json ├── arrayEvent2.json ├── arrayEvent3.json ├── arrayEvent4.json ├── arrayRule1.json ├── arrayRule2.json ├── arrayRule3.json ├── arrayRule4.json ├── citylots.jlines.gz ├── 
citylots2.json.gz ├── cl-sample-0 ├── cl-sample-1 ├── cl-sample-2 ├── status.json └── wwords.txt ├── value_matcher.go ├── value_matcher_test.go ├── wildcard.go └── wildcard_test.go /.chglog/CHANGELOG.tpl.md: -------------------------------------------------------------------------------- 1 | {{ range .Versions }} 2 | 3 | ## {{ if .Tag.Previous }}[Release {{ .Tag.Name }}]({{ $.Info.RepositoryURL }}/compare/{{ .Tag.Previous.Name }}...{{ .Tag.Name }}){{ else }}{{ .Tag.Name }}{{ end }} 4 | 5 | > Release Date: {{ datetime "2006-01-02" .Tag.Date }} 6 | 7 | {{ range .CommitGroups -}} 8 | ### {{ .Title }} 9 | 10 | {{ range .Commits -}} 11 | - [{{ .Hash.Short }}]{{"\t"}}{{ .Subject }}{{ range .Refs }} (#{{ .Ref }}) {{ end }} 12 | {{ end }} 13 | {{ end -}} 14 | 15 | {{- if .RevertCommits -}} 16 | ### ⏮ Reverts 17 | 18 | {{ range .RevertCommits -}} 19 | - [{{ .Hash.Short }}]{{"\t"}}{{ .Revert.Header }}{{ range .Refs }} (#{{ .Ref }}) {{ end }} 20 | {{ end }} 21 | {{ end -}} 22 | 23 | ### ⚠️ BREAKING 24 | 25 | {{ range .Commits -}} 26 | {{ if .Notes -}} 27 | {{ if not .Merge -}} 28 | {{ if not (contains .Header "Update CHANGELOG for" ) -}} 29 | {{ .Subject }} [{{ .Hash.Short }}]:{{"\n"}}{{ range .Notes }}{{ .Body }} 30 | {{ end }} 31 | {{ end -}} 32 | {{ end -}} 33 | {{ end -}} 34 | {{ end -}} 35 | 36 | ### 📖 Commits 37 | 38 | {{ range .Commits -}} 39 | {{ if not .Merge -}} 40 | {{ if not (contains .Header "Update CHANGELOG for" ) -}} 41 | - [{{ .Hash.Short }}]{{"\t"}}{{ .Header }}{{ range .Refs }} (#{{ .Ref }}) {{ end }} 42 | {{ end -}} 43 | {{ end -}} 44 | {{ end -}} 45 | 46 | {{ end -}} -------------------------------------------------------------------------------- /.chglog/config.yml: -------------------------------------------------------------------------------- 1 | style: github 2 | template: CHANGELOG.tpl.md 3 | info: 4 | title: CHANGELOG 5 | repository_url: https://github.com/timbray/quamina 6 | options: 7 | commits: 8 | filters: 9 | Type: 10 | - api 11 | - pat 12 | - 
chore 13 | - fix 14 | - kaizen 15 | - docs 16 | commit_groups: 17 | title_maps: 18 | api: 🤖 API 19 | pat: 📖 Pattern Language 20 | chore: 🧹 Chore 21 | fix: 🐞 Fix 22 | kaizen: 👩‍🎨 Improve 23 | docs: 📚 Documentation 24 | header: 25 | pattern: "^(\\w*)\\:\\s(.*)$" 26 | pattern_maps: 27 | - Type 28 | - Subject 29 | refs: 30 | actions: 31 | - Closes 32 | - Fixes 33 | notes: 34 | keywords: 35 | - "BREAKING" 36 | -------------------------------------------------------------------------------- /.codecov.yaml: -------------------------------------------------------------------------------- 1 | coverage: 2 | # Commit status https://docs.codecov.io/docs/commit-status are used 3 | # to block PR based on coverage threshold. 4 | status: 5 | project: 6 | default: 7 | target: 80 8 | threshold: 1% 9 | patch: 10 | # Disable the coverage threshold of the patch, so that PRs are 11 | # only failing because of overall project coverage threshold. 12 | # See https://docs.codecov.io/docs/commit-status#disabling-a-status. 13 | default: false 14 | ignore: 15 | - "**/zz_generated*.go" # Ignore generated files. 16 | - "**/*.pb.go" # Ignore proto-generated files. 
17 | - "hack" 18 | - "pkg/client" 19 | - "third_party" 20 | - "vendor" -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.jlines filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gomod 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" -------------------------------------------------------------------------------- /.github/workflows/benchmarks.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | pull_request: 8 | branches: ["main"] 9 | 10 | jobs: 11 | benchmark: 12 | name: Benchmarks 13 | strategy: 14 | matrix: 15 | go-version: ["1.22"] 16 | platform: ["ubuntu-latest"] 17 | 18 | runs-on: ${{ matrix.platform }} 19 | timeout-minutes: 10 20 | 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 24 | 25 | - name: Set up Go ${{ matrix.go-version }} 26 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a 27 | with: 28 | go-version: ${{ matrix.go-version }} 29 | id: go 30 | 31 | - name: Run benchmark 32 | run: go test -benchmem -run="^$" -bench "^Benchmark" . 
quamina.net/go/quamina | tee output.txt 33 | 34 | - name: Download previous benchmark data 35 | uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 36 | with: 37 | path: ./cache 38 | key: ${{ runner.os }}-benchmark 39 | 40 | - name: Store benchmark result 41 | uses: benchmark-action/github-action-benchmark@d48d326b4ca9ba73ca0cd0d59f108f9e02a381c7 42 | with: 43 | name: Go Benchmark 44 | tool: "go" 45 | github-token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | # Compare results against json from cache 48 | output-file-path: output.txt 49 | external-data-json-path: ./cache/benchmark-data.json 50 | 51 | # print job summary in workflow output 52 | summary-always: true 53 | 54 | # Alert on regression 55 | alert-threshold: "120%" 56 | fail-on-alert: false 57 | comment-on-alert: true 58 | 59 | # Disable github pages, for now. 60 | auto-push: false 61 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yaml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '34 21 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'go' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae 72 | -------------------------------------------------------------------------------- /.github/workflows/dep-review.yaml: -------------------------------------------------------------------------------- 1 | name: Dependency Review 2 | 3 | on: 4 | pull_request: 5 | branches: ["main"] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | dependency-review: 12 | strategy: 13 | matrix: 14 | go-version: ["1.22"] 15 | platform: ["ubuntu-latest"] 16 | runs-on: ${{ matrix.platform }} 17 | timeout-minutes: 5 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 21 | 22 | - name: Dependency Review 23 | uses: actions/dependency-review-action@0659a74c94536054bfa5aeb92241f70d680cc78e -------------------------------------------------------------------------------- /.github/workflows/go-lint.yaml: -------------------------------------------------------------------------------- 1 | name: Code Linting 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | pull_request: 8 | branches: ["main"] 9 | 10 | jobs: 11 | lint: 12 | name: Code Linting 13 | strategy: 14 | matrix: 15 | go-version: ["1.22"] 16 | platform: ["ubuntu-latest"] 17 | runs-on: ${{ matrix.platform }} 18 | timeout-minutes: 5 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 23 | with: 24 | fetch-depth: 1 25 | 26 | - name: Set up Go ${{ matrix.go-version }} 27 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a 28 | with: 29 | go-version: ${{ matrix.go-version }} 30 | id: 
go 31 | 32 | - name: Restore Go cache 33 | uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 34 | with: 35 | path: | 36 | ~/.cache/go-build 37 | ~/go/pkg/mod 38 | key: ${{ runner.os }}-go-${{ matrix.go-version }}-${{ hashFiles('**/go.sum', 'testdata/**') }} 39 | restore-keys: | 40 | ${{ runner.os }}-go-${{ matrix.go-version }}- 41 | 42 | - name: Run golangci-lint 43 | uses: golangci/golangci-lint-action@971e284b6050e8a5849b72094c50ab08da042db8 -------------------------------------------------------------------------------- /.github/workflows/go-unit-tests.yaml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | pull_request: 8 | branches: ["main"] 9 | 10 | concurrency: 11 | group: quamina-unit-tests-${{ github.head_ref || github.run_id }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | test: 16 | name: Unit Tests 17 | strategy: 18 | matrix: 19 | go-version: ["1.22"] 20 | platform: ["ubuntu-latest"] 21 | type: ["Tests","Cover"] # run coverage as separate job w/out -race to avoid killing process 22 | include: 23 | - type: "Tests" 24 | goflags: '-v -race -count=1 -json' 25 | - type: "Cover" 26 | goflags: "-v -count=1 -json" 27 | coveropts: "-coverprofile=coverage.txt -covermode=atomic" 28 | 29 | runs-on: ${{ matrix.platform }} 30 | timeout-minutes: 20 31 | 32 | steps: 33 | - name: Checkout repository 34 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 35 | 36 | - name: Set up Go ${{ matrix.go-version }} 37 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a 38 | with: 39 | go-version: ${{ matrix.go-version }} 40 | id: go 41 | 42 | - name: Restore Go cache 43 | uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 44 | with: 45 | path: | 46 | ~/.cache/go-build 47 | ~/go/pkg/mod 48 | 49 | key: ${{ runner.os }}-go-${{ matrix.go-version }}-${{ hashFiles('**/go.sum', 'testdata/**') }} 50 | restore-keys: | 51 | ${{ 
runner.os }}-go-${{ matrix.go-version }}- 52 | 53 | - name: Install tparse 54 | run: go install github.com/mfridman/tparse@latest 55 | 56 | - name: Check for .codecov.yaml 57 | id: codecov-enabled 58 | uses: andstor/file-existence-action@076e0072799f4942c8bc574a82233e1e4d13e9d6 59 | with: 60 | files: .codecov.yaml 61 | 62 | - name: Test 63 | env: 64 | COVER_OPTS: ${{ matrix.coveropts }} 65 | GOFLAGS: ${{ matrix.goflags }} 66 | run: go test $COVER_OPTS | tparse -all -notests -format markdown >> $GITHUB_STEP_SUMMARY 67 | 68 | - if: steps.codecov-enabled.outputs.files_exists == 'true' 69 | name: Upload Codecov Report 70 | uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303 71 | with: 72 | token: ${{ secrets.CODECOV_TOKEN }} 73 | 74 | - name: Verify git clean 75 | shell: bash 76 | run: | 77 | if [[ -z "$(git status --porcelain)" ]]; then 78 | echo "${{ github.repository }} up to date." 79 | else 80 | echo "${{ github.repository }} is dirty." 81 | echo "::error:: $(git status)" 82 | exit 1 83 | fi 84 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | # release will only be created when ref is a tag starting with "v" 5 | push: 6 | tags: 7 | - "v*" 8 | 9 | workflow_dispatch: 10 | inputs: 11 | tag: 12 | required: true 13 | type: string 14 | description: Use this existing Git tag to create the release 15 | 16 | jobs: 17 | release: 18 | name: Create Release 19 | timeout-minutes: 10 20 | strategy: 21 | matrix: 22 | go-version: ["1.22"] 23 | platform: ["ubuntu-latest"] 24 | runs-on: ${{ matrix.platform }} 25 | env: 26 | TAG: ${{ github.event.inputs.tag }} 27 | 28 | steps: 29 | - name: Checkout repository 30 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 31 | with: 32 | fetch-depth: 0 33 | ref: "main" 34 | 35 | - name: Get short TAG 36 | if: ${{ github.event_name 
!= 'workflow_dispatch' }} 37 | run: | 38 | echo "Retrieving tag from Github ref" 39 | echo "TAG=$(basename "${{ github.ref }}")" >> $GITHUB_ENV 40 | 41 | - name: Create CHANGELOG for Release (tag) 42 | env: 43 | IMAGE: quay.io/git-chglog/git-chglog 44 | # https://quay.io/repository/git-chglog/git-chglog from tag v0.14.2 45 | IMAGE_SHA: 998e89dab8dd8284cfff5f8cfb9e9af41fe3fcd4671f2e86a180e453c20959e3 46 | run: | 47 | # generate CHANGELOG for this Github release tag only 48 | echo "Using tag $TAG to create release notes" 49 | docker run --rm -v $PWD:/workdir ${IMAGE}@sha256:${IMAGE_SHA} -o RELEASE_CHANGELOG.md $TAG 50 | 51 | # send to job summary 52 | cat RELEASE_CHANGELOG.md >> $GITHUB_STEP_SUMMARY 53 | 54 | - name: Create Github Release 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 57 | run: | 58 | echo "Using tag $TAG to create release" 59 | gh release create -F RELEASE_CHANGELOG.md ${TAG} LICENSE README.md 60 | 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | coverage.txt 4 | # codecov CLI and sig files 5 | codecov 6 | codecov.SHA256SUM* -------------------------------------------------------------------------------- /.golangci.yaml: -------------------------------------------------------------------------------- 1 | linters: 2 | disable-all: false 3 | # https://golangci-lint.run/usage/linters/#enabled-by-default 4 | enable: 5 | - gofmt 6 | - bodyclose 7 | - errname 8 | - errorlint 9 | - gochecknoinits 10 | - goimports 11 | - goprintffuncname 12 | - gosec 13 | # - ireturn 14 | - misspell 15 | - nilerr 16 | - nilnil 17 | - predeclared 18 | - stylecheck 19 | - thelper 20 | - tparallel 21 | - unparam 22 | # - wrapcheck 23 | - whitespace 24 | # wastedassign is disabled because of generics. 
You can track the evolution of the generics support by following the https://github.com/golangci/golangci-lint/issues/2649 25 | # - wastedassign 26 | 27 | issues: 28 | exclude-rules: 29 | - path: _test\.go 30 | text: "Use of weak random number generator" #gosec:G404 31 | - path: _test\.go 32 | text: "ST1018" 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # The only purpose of this makefile is to run code_gen/code_gen, which will rebuild the case_folding.go file if 2 | # it is more than three months out of date 3 | casefold: 4 | @ cd code_gen && go build && cd .. 5 | @ code_gen/code_gen 6 | -------------------------------------------------------------------------------- /PATTERNS.md: -------------------------------------------------------------------------------- 1 | # Patterns in Quamina 2 | 3 | **Patterns** are added to Quamina instances using the 4 | `AddPattern` API. This document specifies the syntax and 5 | semantics of Patterns. 6 | 7 | The discussion of JSON constructs in this document uses 8 | the terminology specified in [RFC 8259](https://www.rfc-editor.org/rfc/rfc8259.html). 9 | 10 | ## Fields 11 | 12 | Patterns exist to match **Fields** in incoming **Events**. At 13 | launch, Quamina supports only JSON syntax for Events. An 14 | Event **MUST** be equivalent to a JSON Object in the 15 | sense that it consists of an unordered set of members, 16 | each identified by a name which is a string composed of 17 | Unicode characters. 18 | 19 | As in JSON, the allowed member values may be strings, 20 | numbers, the literals `true`, `false`, and `null`, arrays, 21 | and objects. We refer to values which are neither arrays 22 | nor objects as **Leaf** values. 
23 | 24 | A **Field** is the combination of a Leaf value and a 25 | **Path**, a list of strings which are the Field names 26 | that must be traversed to reach it from the Event root. 27 | 28 | For example, in the Event 29 | ```json 30 | {"alpha": {"beta": 1}} 31 | ``` 32 | There is only one Field whose Leaf value is `1` 33 | and whose Path is `"alpha","beta"`. 34 | 35 | Paths omit arrays. So in the Event 36 | ```json 37 | {"alpha": [ {"beta": [1, 2]}, {"beta": [3, 4]} ] } 38 | ``` 39 | The Path for the Leaf value `1` is still `"alpha","beta"` 40 | 41 | ### Pattern Syntax and Semantics 42 | A Pattern **MUST** be a JSON object all of whose Leaf 43 | values **MUST** be in arrays. 44 | 45 | Note that a Field in an Event may have multiple Leaf Values 46 | if the Field's value is an array. 47 | 48 | To match a Field in an Event, the Pattern **MUST** contain 49 | an exactly-matching Path whose value **MUST** be an array 50 | which contains either an Event Field Leaf value or an 51 | **Extended Pattern** which matches an Event Field Leaf 52 | value. 53 | 54 | Thus, the following Pattern would match both JSON events above: 55 | ```json 56 | {"alpha": {"beta": [1]}} 57 | ``` 58 | 59 | ### Numeric Values 60 | 61 | Quamina can match numeric values with precision and range exactly the same as that provided by 62 | Go's `float64` data type, which is said to conform to IEEE 754 `binary64`. 63 | 64 | ## Extended Patterns 65 | An **Extended Pattern** **MUST** be a JSON object containing 66 | a single field whose name is called the **Pattern Type**. 67 | 68 | ### Prefix Pattern 69 | 70 | The Pattern Type of a Prefix Pattern is `prefix` and its value 71 | **MUST** be a string. 
72 | 73 | The following event: 74 | 75 | ```json 76 | {"a": "alpha"} 77 | ``` 78 | 79 | would be matched by this Prefix Pattern: 80 | 81 | ```json 82 | {"a": [ { "prefix": "al" } ] } 83 | ``` 84 | 85 | ### Exists Pattern 86 | 87 | The Pattern Type of an Exists Pattern is `exists` and its 88 | value **MUST** be `true` or `false`. Here 89 | are two Exists Patterns that would match the Events above: 90 | ```json 91 | {"alpha": {"beta": [ {"exists": true} ]}} 92 | {"alpha": {"gamma": [ {"exists": false} ]}} 93 | ``` 94 | 95 | If a Field in a Pattern contains an Exists Pattern, it 96 | **MUST NOT** contain any other values. 97 | 98 | Exists Patterns currently only work on leaf nodes. That is to 99 | say, given this event: 100 | 101 | ```json 102 | { "a": { "b": 1 } } 103 | ``` 104 | 105 | The following pattern will not match: 106 | 107 | ```json 108 | { "a": [ {"exists": true} ] } 109 | ``` 110 | 111 | We may be able to change this in future. 112 | 113 | The case of empty arrays is interesting, both in Patterns and Events. Consider this event: 114 | 115 | ```json 116 | { "a": [] } 117 | ``` 118 | 119 | Then `"exists": true` does not match but `"exists": false` does. 120 | I.e., only the first of the two sample patterns below matches. 121 | 122 | ```json 123 | { "a": [ { "exists": false } ] } 124 | ``` 125 | ```json 126 | { "a": [ { "exists": true } ] } 127 | ``` 128 | This makes sense in the context of the leaf-node semantics; there 129 | really is no value for the `"a"` field. 130 | 131 | In Patterns, the following never matches any Event: 132 | 133 | ```json 134 | { "a": [] } 135 | ``` 136 | 137 | Once again, there is nothing in the array of candidate values in the Pattern that can match any value of an `"a"` 138 | field in an Event. 139 | 140 | 141 | 142 | ### Anything-But Pattern 143 | 144 | The Pattern Type of an Anything-But Pattern is 145 | `anything-but` and its value **MUST** be an array 146 | of strings. 
It will match a string value which 147 | is not equal to any of the strings in the array. 148 | 149 | If a Field in a Pattern contains an Anything-But Pattern, 150 | it **MUST NOT** contain any other values. 151 | 152 | ### Wildcard Pattern 153 | 154 | The Pattern Type of a Wildcard Pattern is `wildcard` 155 | and its value **MUST** be a string which **MAY** contain 156 | `*` (“star”) characters. The star character 157 | functions exactly as the same character does in 158 | command-line processors which descend from Unix’s 159 | shell; i.e., matches the regular expression `.*` 160 | 161 | Adjacent `*` characters are not allowed. 162 | 163 | Consider the following Event: 164 | ```json 165 | {"img": "https://example.com/9943.jpg"} 166 | ``` 167 | The following Wildcard Patterns would match it: 168 | ```json 169 | {"img": [ {"wildcard": "*.jpg"} ] } 170 | {"img": [ {"wildcard": "https://example.com/*"} ] } 171 | {"img": [ {"wildcard": "https://example.com/*.jpg"} ] } 172 | {"img": [ {"wildcard": "https://example.*/*.jpg"} ] } 173 | ``` 174 | 175 | If it is desired to match the actual character "*", it may be “escaped” 176 | with backslash, "\". For example, consider the following Event: 177 | 178 | ```json 179 | {"example-regex": "a**\\.b"} 180 | ``` 181 | 182 | The following Wildcard pattern would match it. 183 | 184 | ```json 185 | {"example-regex": [ {"wildcard": "a\\*\\*\\\\.b"}]} 186 | ``` 187 | 188 | Note that the "\" backslashes must be doubled to deal with the 189 | fact that they are escape characters for JSON as well as for Quamina. 190 | 191 | After a "\", the appearance of any character other than "*" or "\" is an error. 192 | 193 | ### Regexp Pattern 194 | 195 | The Pattern Type of a Regexp Pattern is `regexp` and its value 196 | **MUST** be a string. For details of that string’s syntax see 197 | [Regular Expressions in Quamina](REGEXP.md). 
198 | 199 | ### Shellstyle Pattern 200 | 201 | This is an earlier version of the Wildcard pattern, differing only in that 202 | \-escaping the "*" and "\" characters is not supported. 203 | 204 | ### Equals-Ignore-Case Pattern 205 | 206 | The Pattern Type of an Equals-Ignore-Case pattern is `equals-ignore-case` 207 | and its value **MUST** be a string. Quamina attempts to match with 208 | case folding in effect, as discussed in Section 3.13 of the Unicode 209 | Standard. Quamina uses the case-folding mappings provided in the file 210 | CaseFolding.txt in the Unicode Character Database to generate its mappings. 211 | Note that case-folding is highly dependent on the specifics of the language 212 | in use and in certain locales, this default mapping may not produce satisfactory 213 | results, although results are good for ASCII and "simple" characters from 214 | other alphabets. 215 | 216 | ## EventBridge Patterns 217 | 218 | Quamina’s Patterns are inspired by those offered by 219 | the AWS EventBridge service, as documented in 220 | [Amazon EventBridge event patterns](https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-event-patterns.html). 221 | 222 | It is a goal of Quamina to eventually support all the patterns that EventBridge does. 223 | -------------------------------------------------------------------------------- /README-pruner.md: -------------------------------------------------------------------------------- 1 | # `DeletePattern` 2 | 3 | The core quamina.Matcher doesn't currently support deleting patterns. 4 | Some of the contemplated implementations would probably be pretty 5 | difficult. At least one approach is pretty easy: Wrap the current 6 | matcher to filter removed patterns from match results and periodically 7 | rebuild the matcher from scratch with the live patterns. More 8 | specifically: 9 | 10 | 1. Remember patterns that have been added 11 | 2. Remember patterns that have been removed (implicitly) 12 | 3. 
Filter `MatchesFor...()` results to remove any removed patterns 13 | 4. Rebuild the matcher state periodically with only the live 14 | patterns 15 | 5. Maintain some statistics to help decide when to rebuild 16 | 17 | The implementation of the set of live patterns is pluggable via a Go 18 | interface (`State`). The default implementation `MemState` is just a 19 | `map[quamina.X]string`. Other implementations could provide 20 | persistence. 21 | 22 | By default, rebuilding is triggered automatically (synchronously 23 | currently) during mutations. You can also force a manual `Rebuild()`, 24 | and you can `DisableRebuild()` to prevent any automatic rebuilds. 25 | The code also supports pluggable rebuilding policies, but those 26 | features are not currently exposed. 27 | 28 | -------------------------------------------------------------------------------- /REGEXP.md: -------------------------------------------------------------------------------- 1 | # Regular Expressions in Quamina 2 | 3 | **Regular Expressions** (hereinafter “regexps”) may appear in Quamina Regexp Patterns. 4 | 5 | ## Syntax 6 | 7 | The regexp syntax supported in Regexp Patterns is that specified in 8 | [RFC 9485](https://datatracker.ietf.org/doc/rfc9485/), 9 | *I-Regexp: An Interoperable Regular Expression Format*. 10 | 11 | There is one important syntactic difference. The backslash character “\” commonly 12 | used in regexp constructs for escaping metacharacters (as in `Stop\.`) and in 13 | constructs such as `\P{Lu}`, is “~” in Quamina regexps. 14 | 15 | “~” is used for this purpose because “\” is also used in Go string literals and 16 | in JSON. Thus, complexity is added to unit testing and fragments such as `\\\\` and even 17 | `\\\\\\\\` are regularly needed. Conventional regexps may be turned into Quamina regexps 18 | by replacing occurrences of “\” with “~” wherever “\” is being used as a metacharacter. 
If a 19 | Quamina regexp needs to match the literal character “\”, it need not be escaped. For 20 | example, the common Go-language syntax for representing whitespace characters in code can 21 | be matched with the Quamina regexp `\[nrt]`, but the newline character, U+000A, would 22 | be matched by the Quamina regexp `~n`. 23 | 24 | When a regexp is used in a Quamina `AddPattern()` call, an error is returned if the regexp 25 | contains a syntax error or if it uses a regexp feature that is not yet supported in the 26 | current release. 27 | 28 | ## Features 29 | 30 | Regexps are being added to Quamina incrementally. The following list identifies the full 31 | set of planned features; it is not in any particular order. Those that are supported in the 32 | current release are bold-faced. 33 | 34 | `.` : **single-character matcher** 35 | 36 | 37 | `|` : **logical alternatives** 38 | 39 | `[]` : **character-class matcher** 40 | 41 | `()` : **parenthesized sub-regexp** 42 | 43 | `*` : zero-or-more matcher 44 | 45 | `+` : one-or-more matcher 46 | 47 | `?` : optional matcher 48 | 49 | `{lo,hi}` : occurrence-count matcher 50 | 51 | `~p{}` : Unicode property matcher 52 | 53 | `~P{}` : Unicode property-complement matcher 54 | 55 | `[^]` : complementary character-class matcher 56 | 57 | ## Semantics of “.” 58 | 59 | In Quamina regexps, the `.` metacharacter matches any Unicode character whose code point is 60 | among the *Unicode Scalars* as defined in Definition D76 in Section 3.9 of the Unicode Standard. 61 | This is the range of codepoints between U+0000 - U+D7FF inclusive, and U+E000 - U+10FFFF 62 | inclusive. 63 | 64 | Put another way, `.` matches all of the Unicode code points except those defined in the Unicode Standard as “Surrogates”. 
65 | -------------------------------------------------------------------------------- /anything_but.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "io" 8 | ) 9 | 10 | func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) { 11 | t, err := pb.jd.Token() 12 | if err != nil { 13 | return 14 | } 15 | pathVals = valsIn 16 | fieldCount := 0 17 | delim, ok := t.(json.Delim) 18 | if (!ok) || delim != '[' { 19 | err = errors.New("value for anything-but must be an array") 20 | return 21 | } 22 | done := false 23 | val := typedVal{vType: anythingButType} 24 | for !done { 25 | t, err = pb.jd.Token() 26 | if errors.Is(err, io.EOF) { 27 | err = errors.New("anything-but list truncated") 28 | return 29 | } else if err != nil { 30 | return 31 | } 32 | switch tt := t.(type) { 33 | case json.Delim: 34 | if tt == ']' { 35 | done = true 36 | } else { 37 | err = fmt.Errorf("spurious %c in anything-but list", tt) 38 | } 39 | case string: 40 | fieldCount++ 41 | val.list = append(val.list, []byte(`"`+tt+`"`)) 42 | default: 43 | err = errors.New("malformed anything-but list") 44 | done = true 45 | } 46 | } 47 | if err != nil { 48 | return 49 | } 50 | if fieldCount == 0 { 51 | err = errors.New("empty list in 'anything-but' pattern") 52 | return 53 | } 54 | pathVals = append(pathVals, val) 55 | 56 | // this has to be a '}' or you're going to get an err from the tokenizer, so no point looking at the value 57 | _, err = pb.jd.Token() 58 | return 59 | } 60 | 61 | // makeMultiAnythingButDFA exists to handle constructs such as 62 | // 63 | // {"x": [ {"anything-but": [ "a", "b" ] } ] } 64 | // 65 | // A finite automaton that matches anything but one byte sequence is like this: 66 | // For each byte in val with value Z, we produce a table that leads to a nextField match on all non-Z values, 67 | // and to another such table for Z. 
After all the bytes have matched, a match on valueTerminator leads to 68 | // an empty table with no field Transitions, all others to a nexField match 69 | // 70 | // Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not 71 | // to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So 72 | // in principle we could intersect automata. 73 | func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) { 74 | nextField := newFieldMatcher() 75 | successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} 76 | success := &faNext{states: []*faState{successStep}} 77 | 78 | ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField 79 | return ret, nextField 80 | } 81 | 82 | // makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is 83 | // the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is 84 | // success but transfers to the next step on whatever the current byte in each of the vals that have not 85 | // yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition 86 | // to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but 87 | // strings. 88 | func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable { 89 | // this will be the default transition in all the anything-but tables. 90 | var u unpackedTable 91 | for i := range u { 92 | u[i] = success 93 | } 94 | 95 | // for the char at position 'index' in each val. valsWithBytesRemaining is keyed by that char (assuming that 'index' isn't 96 | // off the edge of that val. valsEndingHere[index] being true for some val means that val ends here. 
97 | valsWithBytesRemaining := make(map[byte][][]byte) 98 | valsEndingHere := make(map[byte]bool) 99 | for _, val := range vals { 100 | lastIndex := len(val) - 1 101 | switch { 102 | case index < lastIndex: 103 | // gather vals that still have characters past 'index' 104 | utf8Byte := val[index] 105 | step := valsWithBytesRemaining[utf8Byte] 106 | valsWithBytesRemaining[utf8Byte] = append(step, val) 107 | case index == lastIndex: 108 | // remember if this particular val ends here 109 | valsEndingHere[val[index]] = true 110 | case index > lastIndex: 111 | // no-op 112 | } 113 | } 114 | 115 | // for each val that still has bytes to process, recurse to process the next one 116 | for utf8Byte, val := range valsWithBytesRemaining { 117 | nextTable := makeOneMultiAnythingButStep(val, index+1, success) 118 | nextStep := &faState{table: nextTable} 119 | u[utf8Byte] = &faNext{states: []*faState{nextStep}} 120 | } 121 | 122 | // for each val that ends at 'index', put a failure-transition for this anything-but 123 | // if you hit the valueTerminator, success for everything else 124 | for utf8Byte := range valsEndingHere { 125 | failState := &faState{table: newSmallTable()} // note no transitions 126 | lastStep := &faNext{states: []*faState{failState}} 127 | lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep}) 128 | u[utf8Byte] = &faNext{states: []*faState{{table: lastTable}}} 129 | } 130 | 131 | table := newSmallTable() 132 | table.pack(&u) 133 | return table 134 | } 135 | -------------------------------------------------------------------------------- /anything_but_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestAnythingButMerging(t *testing.T) { 9 | pFoo := `{"z": [ "foo" ]}` 10 | pAbFoot := `{"z": [ {"anything-but": [ "foot"] } ]}` 11 | q, _ := New() 12 | var err error 13 | 14 | // can merge with FA? 
15 | err = q.AddPattern("pFoo", pFoo) 16 | if err != nil { 17 | t.Error("add pFoo") 18 | } 19 | err = q.AddPattern("pAbFoot", pAbFoot) 20 | if err != nil { 21 | t.Error("add pAbFoot: " + err.Error()) 22 | } 23 | var m []X 24 | m, err = q.MatchesForEvent([]byte(`{"z": "foo"}`)) 25 | if err != nil { 26 | t.Error("m4E - foo: " + err.Error()) 27 | } 28 | if len(m) != 2 { 29 | t.Errorf("len=%d?!?", len(m)) 30 | } 31 | m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) 32 | if err != nil { 33 | t.Error("m4E - foo: " + err.Error()) 34 | } 35 | if len(m) != 0 { 36 | t.Errorf("len=%d?!?", len(m)) 37 | } 38 | 39 | // can merge with NFA? 40 | pFooStar := `{"z": [ {"shellstyle": "foo*" } ]}` 41 | q, _ = New() 42 | err = q.AddPattern("pFooStar", pFooStar) 43 | if err != nil { 44 | t.Error("pFooStar: " + err.Error()) 45 | } 46 | err = q.AddPattern("pAbFoot", pAbFoot) 47 | if err != nil { 48 | t.Error("add pAbFoot: " + err.Error()) 49 | } 50 | m, err = q.MatchesForEvent([]byte(`{"z": "foo"}`)) 51 | if err != nil { 52 | t.Error("m4E: " + err.Error()) 53 | } 54 | if len(m) != 2 { 55 | t.Errorf("len=%d?!?", len(m)) 56 | } 57 | m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) 58 | if err != nil { 59 | t.Error("m4E: " + err.Error()) 60 | } 61 | if len(m) != 1 { 62 | t.Errorf("len=%d?!?", len(m)) 63 | } 64 | } 65 | 66 | func TestFootCornerCase(t *testing.T) { 67 | q, _ := New() 68 | pFoot := `{"z": ["foot"]}` 69 | err := q.AddPattern("foot", pFoot) 70 | if err != nil { 71 | t.Error("addP: " + err.Error()) 72 | } 73 | m, err := q.MatchesForEvent([]byte(`{"z": "foot"}`)) 74 | if err != nil { 75 | t.Error(err.Error()) 76 | } 77 | if len(m) != 1 || m[0] != "foot" { 78 | t.Error("foot not 1") 79 | } 80 | q, _ = New() 81 | pNotFoo := `{"z": [ { "anything-but": ["foo"]} ] }` 82 | err = q.AddPattern("notFoo", pNotFoo) 83 | if err != nil { 84 | t.Error("addP: " + err.Error()) 85 | } 86 | m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) 87 | if err != nil { 88 | t.Error(err.Error()) 89 | 
} 90 | if len(m) != 1 || m[0] != "notFoo" { 91 | t.Error("foot not 1") 92 | } 93 | q, _ = New() 94 | pFooStar := `{"z": [ { "shellstyle": "foo*" } ] }` 95 | err = q.AddPattern("foostar", pFooStar) 96 | if err != nil { 97 | t.Error("addP: " + err.Error()) 98 | } 99 | m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) 100 | if err != nil { 101 | t.Error(err.Error()) 102 | } 103 | if len(m) != 1 || m[0] != "foostar" { 104 | t.Error("foot not 1") 105 | } 106 | } 107 | 108 | func TestAnythingButAlgo(t *testing.T) { 109 | notJoeTim := `{"x": [ { "anything-but": ["joe", "tim"] } ] }` 110 | q, _ := New() 111 | err := q.AddPattern("notJoeTim", notJoeTim) 112 | if err != nil { 113 | t.Error("NJT: " + err.Error()) 114 | } 115 | event := `{"x": "toe"}` 116 | matches, err := q.MatchesForEvent([]byte(event)) 117 | if err != nil { 118 | t.Error("NJT: " + err.Error()) 119 | } 120 | if len(matches) != 1 { 121 | t.Error("NJT: Didn't match") 122 | } 123 | event = `{"x": "joe"}` 124 | matches, err = q.MatchesForEvent([]byte(event)) 125 | if err != nil { 126 | t.Error("NJT: " + err.Error()) 127 | } 128 | if len(matches) != 0 { 129 | t.Error("NJT: matched joe") 130 | } 131 | 132 | notTTT := `{"x": [ { "anything-but": ["tim", "time", "timed"] } ] }` 133 | q, _ = New() 134 | err = q.AddPattern("notTTT", notTTT) 135 | if err != nil { 136 | t.Error("NTTT: " + err.Error()) 137 | } 138 | events := []string{`{"x": "tim"}`, `{"x": "time"}`, `{"x": "timed"}`} 139 | for _, ev := range events { 140 | matches, err := q.MatchesForEvent([]byte(ev)) 141 | if err != nil { 142 | t.Error("NTTT: (" + ev + ") " + err.Error()) 143 | } 144 | if len(matches) != 0 { 145 | t.Error("NTTT: (" + ev + ") matched") 146 | } 147 | } 148 | } 149 | 150 | func TestAnythingButMatching(t *testing.T) { 151 | q, _ := New() 152 | // the idea is we're testing against all the 5-letter Wordle patterns, so we want a 4-letter prefix and 153 | // suffix of an existing wordle, a 5-letter non-wordle, and a 6-letter where the wordle 
might match at the start 154 | // and end. I tried to think of scenarios that would defeat the pretty-simple anything-but FA but couldn't. 155 | problemWords := []string{ 156 | `"bloo"`, 157 | `"aper"`, 158 | `"fnord"`, 159 | `"doubts"`, 160 | `"astern"`, 161 | } 162 | pws := strings.Join(problemWords, ",") 163 | pattern := `{"a": [ {"anything-but": [ ` + pws + `] } ] }"` 164 | err := q.AddPattern(pattern, pattern) 165 | if err != nil { 166 | t.Error("AP: " + err.Error()) 167 | } 168 | words := readWWords(t) 169 | template := `{"a": "XX"}` 170 | problemTemplate := `{"a": XX}` 171 | for _, word := range problemWords { 172 | event := strings.ReplaceAll(problemTemplate, "XX", word) 173 | matches, err := q.MatchesForEvent([]byte(event)) 174 | if err != nil { 175 | t.Error("on problem word: " + err.Error()) 176 | } 177 | if len(matches) != 0 { 178 | t.Error("Matched on : " + word) 179 | } 180 | } 181 | for _, word := range words { 182 | ws := string(word) 183 | event := strings.ReplaceAll(template, "XX", ws) 184 | matches, err := q.MatchesForEvent([]byte(event)) 185 | if err != nil { 186 | t.Error("m4E: " + err.Error()) 187 | } 188 | if len(matches) != 1 { 189 | t.Errorf("missed on (len=%d): "+event, len(matches)) 190 | } 191 | } 192 | } 193 | 194 | func TestParseAnythingButPattern(t *testing.T) { 195 | goods := []string{ 196 | `{"a": [ {"anything-but": [ "foo" ] } ] }`, 197 | `{"a": [ {"anything-but": [ "bif", "x", "y", "a;sldkfjas;lkdfjs" ] } ] }`, 198 | } 199 | bads := []string{ 200 | `{"a": [ {"anything-but": x } ] }`, 201 | `{"a": [ {"anything-but": 1 } ] }`, 202 | `{"a": [ {"anything-but": [ "a"`, 203 | `{"a": [ {"anything-but": [ x ] } ] }`, 204 | `{"a": [ {"anything-but": [ {"z": 1} ] } ] }`, 205 | `{"a": [ {"anything-but": [ true ] } ] }`, 206 | `{"a": [ {"anything-but": [ "foo" ] x`, 207 | `{"a": [ {"anything-but": [ "foo" ] ] ] }`, 208 | `{"a": [ {"anything-but": {"x":1} } ] }`, 209 | `{"a": [ {"anything-but": "foo" } ] }`, 210 | `{"a": [ 2, {"anything-but": 
[ "foo" ] } ] }`, 211 | `{"a": [ {"anything-but": [ "foo" ] }, 2 ] }`, 212 | `{"a": [ {"anything-but": [ ] } ] }`, 213 | } 214 | 215 | for i, good := range goods { 216 | fields, err := patternFromJSON([]byte(good)) 217 | if err != nil { 218 | t.Errorf("parse anything-but i=%d: "+err.Error(), i) 219 | } 220 | if len(fields[0].vals) != 1 { 221 | t.Errorf("wanted11 fields got %d", len(fields)) 222 | } 223 | } 224 | 225 | for _, bad := range bads { 226 | _, err := patternFromJSON([]byte(bad)) 227 | if err == nil { 228 | t.Errorf(`accepted anything-but "%s"`, bad) 229 | } 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /arrays_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | const bands = `{ 8 | "bands": [ 9 | { 10 | "name": "The Clash", 11 | "members": [ 12 | { 13 | "given": "Joe", 14 | "surname": "Strummer", 15 | "role": [ 16 | "guitar", 17 | "vocals" 18 | ] 19 | }, 20 | { 21 | "given": "Mick", 22 | "surname": "Jones", 23 | "role": [ 24 | "guitar", 25 | "vocals" 26 | ] 27 | }, 28 | { 29 | "given": "Paul", 30 | "surname": "Simonon", 31 | "role": [ 32 | "bass" 33 | ] 34 | }, 35 | { 36 | "given": "Topper", 37 | "surname": "Headon", 38 | "role": [ 39 | "drums" 40 | ] 41 | } 42 | ] 43 | }, 44 | { 45 | "name": "Boris", 46 | "members": [ 47 | { 48 | "given": "Wata", 49 | "role": [ 50 | "guitar", 51 | "vocals" 52 | ] 53 | }, 54 | { 55 | "given": "Atsuo", 56 | "role": [ 57 | "drums" 58 | ] 59 | }, 60 | { 61 | "given": "Takeshi", 62 | "role": [ 63 | "bass", 64 | "vocals" 65 | ] 66 | } 67 | ] 68 | } 69 | ] 70 | }` 71 | 72 | func TestArrayCorrectness(t *testing.T) { 73 | // only wataGuiterPattern should match 74 | mickStrummerPattern := `{"bands": { "members": { "given": [ "Mick" ], "surname": [ "Strummer" ] } } }` 75 | wataDrumsPattern := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "drums" ] } } }` 76 | 
wataGuiterPattern := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "guitar" ] } } }` 77 | 78 | m := newCoreMatcher() 79 | if err := m.addPattern("Mick strummer", mickStrummerPattern); err != nil { 80 | t.Errorf("Failed adding pattern: %s: %s", mickStrummerPattern, err) 81 | } 82 | 83 | if err := m.addPattern("Wata drums", wataDrumsPattern); err != nil { 84 | t.Errorf("Failed adding pattern: %s: %s", wataDrumsPattern, err) 85 | } 86 | if err := m.addPattern("Wata guitar", wataGuiterPattern); err != nil { 87 | t.Errorf("Failed adding pattern: %s: %s", wataGuiterPattern, err) 88 | } 89 | 90 | matches, err := m.matchesForJSONEvent([]byte(bands)) 91 | if err != nil { 92 | t.Errorf("Failed 'matchesForJSONEvent': %s", err) 93 | } 94 | 95 | if len(matches) != 1 || matches[0].(string) != "Wata guitar" { 96 | t.Errorf("Expected to get a single of 'Wata guiter', but got %d matches: %+v", len(matches), matches) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /citylots_bench_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func BenchmarkCityLots(b *testing.B) { 8 | var localMatches []X 9 | 10 | patterns := []string{ 11 | `{ "properties": { "STREET": [ "CRANLEIGH" ] } }`, 12 | `{ "properties": { "STREET": [ "17TH" ], "ODD_EVEN": [ "E"] } }`, 13 | `{ "geometry": { "coordinates": [ 37.807807921694092 ] } }`, 14 | `{ "properties": { "MAPBLKLOT": ["0011008"], "BLKLOT": ["0011008"]}, "geometry": { "coordinates": [ 37.807807921694092 ] } } `, 15 | } 16 | names := []string{ 17 | "CRANLEIGH", 18 | "17TH Even", 19 | "Geometry", 20 | "0011008", 21 | } 22 | 23 | var err error 24 | q, err := New() 25 | if err != nil { 26 | b.Fatalf("New(): %s", err.Error()) 27 | } 28 | for i := range names { 29 | err = q.AddPattern(names[i], patterns[i]) 30 | if err != nil { 31 | b.Fatalf("AddPattern failed: %s", err.Error()) 32 | } 33 | } 
34 | b.Log(matcherStats(q.matcher.(*coreMatcher))) 35 | 36 | lines := getCityLotsLines(b) 37 | 38 | b.ResetTimer() 39 | 40 | for i := 0; i < b.N; i++ { 41 | lineIndex := i 42 | if i >= len(lines) { 43 | lineIndex = 0 44 | } 45 | 46 | matches, err := q.MatchesForEvent(lines[lineIndex]) 47 | if err != nil { 48 | b.Errorf("Matches4JSON: %s", err.Error()) 49 | } 50 | 51 | localMatches = matches 52 | } 53 | 54 | topMatches = localMatches 55 | } 56 | -------------------------------------------------------------------------------- /code_gen/build_casefolding_table.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "os" 10 | "regexp" 11 | "strconv" 12 | "time" 13 | ) 14 | 15 | const ( 16 | CaseFoldingURL = "https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt" 17 | CaseFoldingDB = "case_folding.go" 18 | ThreeMonthsInHours = 30 * 24 * 3 19 | PairsPerLine = 6 20 | reString = `^([0-9a-fA-F]+); C; ([0-9a-fA-F]+);.*` 21 | CFFheader = `package quamina 22 | 23 | // Code generated by code_gen/code_gen - DO NOT EDIT. 
24 | // built from the "C" records in CaseFolding.txt in the Unicode character database 25 | var caseFoldingPairs = map[rune]rune{` 26 | ) 27 | 28 | func main() { 29 | // only rebuild the database if it's 3 months out of date 30 | needToRebuild := false 31 | info, err := os.Stat(CaseFoldingDB) 32 | if err != nil { 33 | needToRebuild = true 34 | } else { 35 | howOldInHours := time.Since(info.ModTime()).Hours() 36 | if howOldInHours > ThreeMonthsInHours { 37 | needToRebuild = true 38 | } 39 | } 40 | if !needToRebuild { 41 | fmt.Println("Skipping codegen.") 42 | return 43 | } 44 | resp, err := http.Get(CaseFoldingURL) 45 | if err != nil { 46 | fatal("Can't fetch CaseFolding.txt: " + err.Error()) 47 | } 48 | defer func() { _ = resp.Body.Close() }() 49 | cff, err := os.Create(CaseFoldingDB + ".tmp") 50 | if err != nil { 51 | fatal("Opening CaseFolding.txt: " + err.Error()) 52 | } 53 | _, err = cff.Write([]byte(CFFheader)) 54 | if err != nil { 55 | fatal("Write CFF header: " + err.Error()) 56 | } 57 | lines := bufio.NewReader(resp.Body) 58 | re, err := regexp.Compile(reString) 59 | if err != nil { 60 | fatal("RE compile: " + err.Error()) 61 | } 62 | mappings := make(map[string]string) 63 | 64 | for { 65 | line, err := lines.ReadBytes('\n') 66 | if errors.Is(err, io.EOF) { 67 | break 68 | } else if err != nil { 69 | fatal("Error reading CaseFolding.txt: " + err.Error()) 70 | } 71 | if line[0] == '#' || len(line) == 1 { 72 | continue 73 | } 74 | matches := re.FindSubmatch(line) 75 | if len(matches) != 3 { 76 | continue 77 | } 78 | _, err = strconv.ParseInt(string(matches[1]), 16, 64) 79 | if err != nil { 80 | fatalF("failed to parse hex string in %s", line) 81 | } 82 | _, err = strconv.ParseInt(string(matches[2]), 16, 64) 83 | if err != nil { 84 | fatalF("failed to parse hex string in %s", line) 85 | } 86 | _, ok := mappings[string(matches[1])] 87 | if !ok { 88 | mappings[string(matches[1])] = string(matches[2]) 89 | } 90 | _, ok = mappings[string(matches[2])] 91 | if !ok 
{ 92 | mappings[string(matches[2])] = string(matches[1]) 93 | } 94 | } 95 | onLine := PairsPerLine 96 | for lhs, rhs := range mappings { 97 | if onLine == PairsPerLine { 98 | _, err = cff.WriteString("\n\t") 99 | if err != nil { 100 | fatal("failed to write line-end: " + err.Error()) 101 | } 102 | onLine = 0 103 | } 104 | _, err = fmt.Fprintf(cff, "0x%s: 0x%s, ", lhs, rhs) 105 | if err != nil { 106 | fatal("failed to write pair: " + err.Error()) 107 | } 108 | onLine++ 109 | } 110 | _, _ = cff.Write([]byte("\n}\n")) 111 | _ = cff.Close() 112 | fmt.Printf("Rebuilt case_folding.go with %d codepoint pairs.\n", len(mappings)) 113 | err = os.Rename(CaseFoldingDB+".tmp", CaseFoldingDB) 114 | if err != nil { 115 | fatalF("Error switching in %s: ", err.Error()) 116 | } 117 | } 118 | 119 | func fatal(message string) { 120 | _, _ = fmt.Fprintln(os.Stderr, message) 121 | os.Exit(1) 122 | } 123 | func fatalF(format string, args ...any) { 124 | _, _ = fmt.Fprintf(os.Stderr, format, args...) 125 | os.Exit(1) 126 | } 127 | -------------------------------------------------------------------------------- /concurrency_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func updateTree(t *testing.T, m *coreMatcher, use37 bool, ch chan string) { 11 | t.Helper() 12 | 13 | var pattern string 14 | var val string 15 | if use37 { 16 | //nolint:gosec 17 | val = fmt.Sprintf("%f", 37.0+rand.Float64()) 18 | pattern = fmt.Sprintf(`{ "geometry": { "coordinates": [ %s ] } }`, val) 19 | } else { 20 | //nolint:gosec 21 | val = fmt.Sprintf(`"%d"`, rand.Int()) 22 | pattern = fmt.Sprintf(`{ "properties": { "STREET": [ %s ] } }`, val) 23 | /* TODO: alternate literal and shellstyle addition 24 | val = fmt.Sprintf(`"*%d"`, rand.Int()) 25 | pattern = fmt.Sprintf(`{ "properties": { "STREET": [ {"shellstyle": %s } ] } }`, val) 26 | */ 27 | } 28 | err := 
m.addPattern(val, pattern) 29 | if err != nil { 30 | t.Error("Concurrent: " + err.Error()) 31 | } 32 | ch <- val 33 | } 34 | 35 | func TestConcurrency(t *testing.T) { 36 | const UpdateLines = 250 37 | 38 | // this is a cut/paste of TestCityLots, except for every few lines we add another pattern to the matcher, 39 | // focusing on the fields that are being used by the patterns. The idea is to exercise concurrent 40 | // update and use of the automaton 41 | // I was initially surprised that adding 860 or so changes to the automaton while it's running doesn't seem to 42 | // cause any decrease in performance. But I guess it splits out very cleanly onto another core and really 43 | // doesn't steal any resources from the thread doing the Match calls 44 | lines := getCityLotsLines(t) 45 | 46 | patterns := []string{ 47 | `{ "properties": { "STREET": [ "CRANLEIGH" ] } }`, 48 | `{ "properties": { "STREET": [ { "shellstyle": "B*K"} ] } }`, 49 | `{ "properties": { "STREET": [ "17TH" ], "ODD_EVEN": [ "E"] } }`, 50 | `{ "geometry": { "coordinates": [ 37.807807921694092 ] } }`, 51 | `{ "properties": { "MAPBLKLOT": ["0011008"], "BLKLOT": ["0011008"]}, "geometry": { "coordinates": [ 37.807807921694092 ] } } `, 52 | } 53 | names := []string{ 54 | "CRANLEIGH", 55 | "shellstyle", 56 | "17TH Even", 57 | "Geometry", 58 | "0011008", 59 | } 60 | wanted := map[X]int{ 61 | "CRANLEIGH": 7, 62 | "shellstyle": 746, 63 | "17TH Even": 836, 64 | "Geometry": 2, 65 | "0011008": 1, 66 | } 67 | 68 | var err error 69 | m := newCoreMatcher() 70 | for i := range names { 71 | err = m.addPattern(names[i], patterns[i]) 72 | if err != nil { 73 | t.Error("addPattern: " + err.Error()) 74 | } 75 | } 76 | results := make(map[X]int) 77 | 78 | use37 := true 79 | lineCount := 0 80 | before := time.Now() 81 | ch := make(chan string, 1000) 82 | sent := 0 83 | for _, line := range lines { 84 | matches, err := m.matchesForJSONEvent(line) 85 | if err != nil { 86 | t.Error("Matches4JSON: " + err.Error()) 87 | } 88 | 
lineCount++ 89 | if lineCount%UpdateLines == 0 { 90 | use37 = !use37 91 | sent++ 92 | go updateTree(t, m, use37, ch) 93 | } 94 | for _, match := range matches { 95 | count, ok := results[match] 96 | if !ok { 97 | count = 0 98 | } 99 | results[match] = count + 1 100 | } 101 | } 102 | 103 | elapsed := float64(time.Since(before).Milliseconds()) 104 | perSecond := float64(lineCount) / (elapsed / 1000.0) 105 | fmt.Printf("\n%.2f matches/second with updates\n\n", perSecond) 106 | 107 | if len(results) != len(wanted) { 108 | t.Errorf("got %d results, wanted %d", len(results), len(wanted)) 109 | } 110 | for match, count := range results { 111 | if count != wanted[match] { 112 | t.Errorf("For %s, wanted=%d, result=%d", match, wanted[match], count) 113 | } 114 | } 115 | 116 | // now we go back and make sure that all those addPattern calls actually made it into the matcher 117 | for i := 0; i < sent; i++ { 118 | val := <-ch 119 | var event string 120 | if val[0] == '"' { 121 | event = fmt.Sprintf(`{"properties": { "STREET": %s} }`, val) 122 | } else { 123 | event = fmt.Sprintf(`{"geometry": { "coordinates": [ %s ] } }`, val) 124 | } 125 | var matches []X 126 | matches, err = m.matchesForJSONEvent([]byte(event)) 127 | if err != nil { 128 | t.Error("after concur: " + err.Error()) 129 | } 130 | if len(matches) != 1 || matches[0] != val { 131 | t.Error("problem with: " + val) 132 | } 133 | } 134 | close(ch) 135 | } 136 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package quamina instances support adding Patterns and then 2 | // presenting Events, generating a report of which Patterns 3 | // match the Event. Patterns and Events are both represented 4 | // as JSON objects, although there is a provided Flattener interface 5 | // by which structured objects in formats other than JSON can be 6 | // matched by quamina. 
Quamina instances match Events quickly and 7 | // with a latency that is not strongly affected by the number of 8 | // Patterns which have been added. 9 | package quamina 10 | -------------------------------------------------------------------------------- /escaping_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestReadMemberName(t *testing.T) { 8 | j := `{"😀💋😺": 1, "x\u0078\ud83d\udc8by": "2"}` 9 | m := fakeMatcher("😀💋😺", `xx💋y`) 10 | f := newJSONFlattener() 11 | fields, err := f.Flatten([]byte(j), m.getSegmentsTreeTracker()) 12 | if err != nil { 13 | t.Error("TRMN: " + err.Error()) 14 | } 15 | if len(fields) != 2 { 16 | t.Errorf("wanted 2 fields got %d", len(fields)) 17 | } 18 | if string(fields[0].Path) != "😀💋😺" || string(fields[0].Val) != "1" { 19 | t.Error("botched field 0") 20 | } 21 | if string(fields[1].Path) != "xx💋y" || string(fields[1].Val) != `"2"` { 22 | t.Error("botched field 0") 23 | } 24 | } 25 | 26 | func TestStringValuesWithEscapes(t *testing.T) { 27 | j := `{"a": "x\u0078\ud83d\udc8by", "b": "\ud83d\ude00\ud83d\udc8b\ud83d\ude3a"}` 28 | f := newJSONFlattener() 29 | fields, err := f.Flatten([]byte(j), fakeMatcher("a", "b").getSegmentsTreeTracker()) 30 | if err != nil { 31 | t.Error("TSVWE: " + err.Error()) 32 | } 33 | if len(fields) != 2 { 34 | t.Errorf("wanted 2 fields got %d", len(fields)) 35 | } 36 | wanted := `"xx💋y"` 37 | if string(fields[0].Path) != "a" || string(fields[0].Val) != wanted { 38 | t.Errorf("wanted %s got %s", wanted, "["+string(fields[0].Val)+"]") 39 | } 40 | if string(fields[1].Path) != "b" || string(fields[1].Val) != `"😀💋😺"` { 41 | t.Errorf("1 wanted %s got %s", `"😀💋😺"`, string(fields[1].Val)) 42 | } 43 | } 44 | 45 | func TestOneEscape(t *testing.T) { 46 | tests := map[string]string{ 47 | `\"z`: `"`, 48 | `\\z`: `\`, 49 | `\/z`: "/", 50 | `\bz`: string([]byte{8}), 51 | `\fz`: string([]byte{0xc}), 52 | `\nz`: "\n", 
53 | `\rz`: "\r", 54 | `\tz`: "\t", 55 | `\u0416\ud83d\udc8b\u4e2dz`: `Ж💋中`, 56 | } 57 | for escape, wanted := range tests { 58 | f := &flattenJSON{event: []byte(escape), fields: make([]Field, 0, 32)} 59 | unescaped, from, err := f.readTextWithEscapes(0) 60 | if err != nil { 61 | t.Errorf("for %s: %s", escape, err.Error()) 62 | } 63 | if from != len(escape)-2 { 64 | t.Errorf("for %s from %d wanted %d", escape, from, len(escape)-2) 65 | } 66 | if string(unescaped) != wanted { 67 | t.Errorf("got %s wanted %s", string(unescaped), wanted) 68 | } 69 | } 70 | } 71 | 72 | func TestUTF16Escaping(t *testing.T) { 73 | str := `?*\u0066\u006f\u006f<>` 74 | b := []byte(str) 75 | f := &flattenJSON{fields: make([]Field, 0, 32)} 76 | f.event = b 77 | f.eventIndex = 0 78 | chars, from, err := f.readHexUTF16(3) 79 | if err != nil { 80 | t.Error("TUE: " + err.Error()) 81 | } 82 | if string(f.event[from:]) != "f<>" { 83 | t.Errorf("tail=%s should be f<>", string(f.event[from:])) 84 | } 85 | if string(chars) != "foo" { 86 | t.Errorf("Chars = '%s' wanted foo", string(chars)) 87 | } 88 | str = `?*\u0066\u006f\u006f\t<>` 89 | b = []byte(str) 90 | f = &flattenJSON{fields: make([]Field, 0, 32)} 91 | f.event = b 92 | f.eventIndex = 0 93 | chars, from, err = f.readHexUTF16(3) 94 | if err != nil { 95 | t.Error("TUE: " + err.Error()) 96 | } 97 | if string(f.event[from:]) != "\\t<>" { 98 | t.Errorf("tail=%s should be \\t<>", string(f.event[from:])) 99 | } 100 | if string(chars) != "foo" { 101 | t.Errorf("Chars = '%s' wanted foo", string(chars)) 102 | } 103 | 104 | shouldBeBad := []string{ 105 | `!!!\uaabx27`, 106 | `cde\u03`, 107 | } 108 | for _, bad := range shouldBeBad { 109 | b = []byte(bad) 110 | f = &flattenJSON{fields: make([]Field, 0, 32)} 111 | f.event = b 112 | _, _, err = f.readHexUTF16(4) 113 | if err == nil { 114 | t.Error("Missed error on " + bad) 115 | } 116 | } 117 | 118 | // emoji: U+1F600 d83d de00 😀 U+1F48B d83d dc8b 💋 U+1F408 d83d de3a 😺 U+4E2D 4e2d 中 U+0416 0416 Ж 119 | // 
trying to mix up various combinations of utf-16 one-codepoint and two-codepoint encodings 120 | emojis := []string{ 121 | `😀💋😺`, 122 | `中Жy`, 123 | `x中Ж`, 124 | `x中y`, 125 | `x💋y`, 126 | `😺Ж💋`, 127 | `Ж💋中`, 128 | } 129 | utf16 := []string{ 130 | `<\ud83d\ude00\ud83d\udc8b\ud83d\ude3a>`, 131 | `<\u4e2d\u0416\u0079>`, 132 | `<\u0078\u4e2d\u0416>`, 133 | `<\u0078\u4e2d\u0079>`, 134 | `<\u0078\ud83d\udc8b\u0079>`, 135 | `<\ud83d\ude3a\u0416\ud83d\udc8b>`, 136 | `<\u0416\ud83d\udc8b\u4e2d>`, 137 | } 138 | 139 | for i, emoji := range emojis { 140 | b = []byte(utf16[i]) 141 | f = &flattenJSON{fields: make([]Field, 0, 32)} 142 | f.event = b 143 | chars, from, err = f.readHexUTF16(2) 144 | if err != nil { 145 | t.Error("Ouch: '" + emoji + "': " + err.Error()) 146 | } 147 | if from != len(b)-2 { 148 | t.Errorf("for %s wanted from %d got %d", emoji, len(b)-2, from) 149 | } 150 | if string(chars) != emoji { 151 | t.Errorf("wanted '%s' got '%s'", emoji, string(chars)) 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package quamina_test 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "quamina.net/go/quamina" 8 | ) 9 | 10 | const userRegisteredEvent = `{ 11 | "id": "1c0e1ce4-3d88-4786-a09d-7133c170d02a", 12 | "type": "UserRegistered", 13 | "user": { 14 | "name": "Doe, John", 15 | "premiumAccount": true 16 | } 17 | } 18 | ` 19 | 20 | const premiumUserPattern = `{ 21 | "type":["UserRegistered"], 22 | "user": {"premiumAccount": [true]} 23 | }` 24 | 25 | func ExampleNew() { 26 | q, err := quamina.New() 27 | if err != nil { 28 | log.Fatalf("could not create quamina instance: %v", err) 29 | } 30 | 31 | const patternName = "premium user" 32 | err = q.AddPattern(patternName, premiumUserPattern) 33 | if err != nil { 34 | log.Fatalf("could not add pattern: %v", err) 35 | } 36 | 37 | matches, err := 
q.MatchesForEvent([]byte(userRegisteredEvent)) 38 | if err != nil { 39 | log.Fatalf("could not match for event: %v", err) 40 | } 41 | 42 | for _, m := range matches { 43 | if m == patternName { 44 | fmt.Printf("pattern matched for event: %q", patternName) 45 | return 46 | } 47 | } 48 | 49 | // you would typically handle no matches cases here, but in this example no 50 | // match is a bug, hence panic :) 51 | panic("no pattern match") 52 | 53 | // Output: pattern matched for event: "premium user" 54 | } 55 | -------------------------------------------------------------------------------- /external_test.go: -------------------------------------------------------------------------------- 1 | package quamina_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "quamina.net/go/quamina" 7 | ) 8 | 9 | type fakeFlattener struct { 10 | r []quamina.Field 11 | } 12 | 13 | func (f *fakeFlattener) Flatten(_ []byte, _ quamina.SegmentsTreeTracker) ([]quamina.Field, error) { 14 | return f.r, nil 15 | } 16 | 17 | func (f *fakeFlattener) Copy() quamina.Flattener { 18 | return &fakeFlattener{r: f.r} 19 | } 20 | 21 | // TestNew proves we can actually call New() using With options 22 | func TestNew(t *testing.T) { 23 | _, err := quamina.New(quamina.WithFlattener(&fakeFlattener{})) 24 | if err != nil { 25 | t.Error("qNew: " + err.Error()) 26 | } 27 | } 28 | 29 | func TestDifferentFlattener(t *testing.T) { 30 | pos := quamina.ArrayPos{Array: 1, Pos: 1} 31 | f := quamina.Field{ 32 | Path: []byte{97}, 33 | Val: []byte{49}, 34 | ArrayTrail: []quamina.ArrayPos{pos}, 35 | } 36 | flattener := &fakeFlattener{r: []quamina.Field{f}} 37 | q, err := quamina.New(quamina.WithFlattener(flattener)) 38 | if err != nil { 39 | t.Error("q.new: " + err.Error()) 40 | } 41 | err = q.AddPattern("xyz", `{"a": [1]}`) 42 | if err != nil { 43 | t.Error("addP: " + err.Error()) 44 | } 45 | matches, err := q.MatchesForEvent([]byte(`{"a": 1}`)) 46 | if err != nil { 47 | t.Error("m4: " + err.Error()) 48 | } 49 | if len(matches) 
!= 1 || matches[0] != "xyz" { 50 | t.Error("missed!") 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /field_matcher.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "sync/atomic" 5 | ) 6 | 7 | // fieldMatcher represents a state in the matching automaton, which matches field names and dispatches to 8 | // valueMatcher to complete matching of field values. 9 | // the fields that hold state are segregated in updateable, so they can be replaced atomically and make the coreMatcher 10 | // thread-safe. 11 | type fieldMatcher struct { 12 | updateable atomic.Pointer[fmFields] 13 | } 14 | 15 | // fmFields contains the updateable fields in fieldMatcher. 16 | // transitions is a map keyed by the field paths that can start transitions from this state; for each such field, 17 | // there is a valueMatcher which, given the field's value, determines whether the automaton progresses to another 18 | // fieldMatcher. 19 | // matches contains the X values that arrival at this state implies have matched. 
20 | // existsTrue and existsFalse record those types of patterns; traversal doesn't require looking at a valueMatcher 21 | type fmFields struct { 22 | transitions map[string]*valueMatcher 23 | matches []X 24 | existsTrue map[string]*fieldMatcher 25 | existsFalse map[string]*fieldMatcher 26 | } 27 | 28 | // fields / update / addExistsFalseFailure / addMatch exist to insulate callers from dealing with 29 | // the atomic Load/Store business 30 | func (m *fieldMatcher) fields() *fmFields { 31 | return m.updateable.Load() 32 | } 33 | 34 | func (m *fieldMatcher) update(fields *fmFields) { 35 | m.updateable.Store(fields) 36 | } 37 | 38 | func (m *fieldMatcher) gatherMetadata(meta *nfaMetadata) { 39 | for _, vm := range m.fields().transitions { 40 | vm.gatherMetadata(meta) 41 | } 42 | for _, fm := range m.fields().existsTrue { 43 | fm.gatherMetadata(meta) 44 | } 45 | for _, fm := range m.fields().existsFalse { 46 | fm.gatherMetadata(meta) 47 | } 48 | } 49 | 50 | func (m *fieldMatcher) addMatch(x X) { 51 | current := m.fields() 52 | newFields := &fmFields{ 53 | transitions: current.transitions, 54 | existsTrue: current.existsTrue, 55 | existsFalse: current.existsFalse, 56 | } 57 | 58 | newFields.matches = append(newFields.matches, current.matches...) 
59 | newFields.matches = append(newFields.matches, x) 60 | m.update(newFields) 61 | } 62 | 63 | func newFieldMatcher() *fieldMatcher { 64 | fields := &fmFields{ 65 | transitions: make(map[string]*valueMatcher), 66 | existsTrue: make(map[string]*fieldMatcher), 67 | existsFalse: make(map[string]*fieldMatcher), 68 | } 69 | fm := &fieldMatcher{} 70 | fm.updateable.Store(fields) 71 | return fm 72 | } 73 | 74 | func (m *fieldMatcher) addExists(exists bool, field *patternField) []*fieldMatcher { 75 | var trans *fieldMatcher 76 | current := m.fields() 77 | freshStart := &fmFields{ 78 | transitions: current.transitions, 79 | matches: current.matches, 80 | existsTrue: make(map[string]*fieldMatcher), 81 | existsFalse: make(map[string]*fieldMatcher), 82 | } 83 | var path string 84 | for path, trans = range current.existsTrue { 85 | freshStart.existsTrue[path] = trans 86 | } 87 | for path, trans = range current.existsFalse { 88 | freshStart.existsFalse[path] = trans 89 | } 90 | var ok bool 91 | if exists { 92 | trans, ok = freshStart.existsTrue[field.path] 93 | if !ok { 94 | trans = newFieldMatcher() 95 | freshStart.existsTrue[field.path] = trans 96 | } 97 | } else { 98 | trans, ok = freshStart.existsFalse[field.path] 99 | if !ok { 100 | trans = newFieldMatcher() 101 | freshStart.existsFalse[field.path] = trans 102 | } 103 | } 104 | m.update(freshStart) 105 | return []*fieldMatcher{trans} 106 | } 107 | 108 | func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fieldMatcher { 109 | // we build the new updateable state in freshStart so that we can blast it in atomically once computed 110 | current := m.fields() 111 | freshStart := &fmFields{ 112 | matches: current.matches, 113 | existsTrue: current.existsTrue, 114 | existsFalse: current.existsFalse, 115 | } 116 | 117 | freshStart.transitions = make(map[string]*valueMatcher) 118 | for k, v := range current.transitions { 119 | freshStart.transitions[k] = v 120 | } 121 | vm, ok := 
freshStart.transitions[field.path] 122 | if !ok { 123 | vm = newValueMatcher() 124 | } 125 | freshStart.transitions[field.path] = vm 126 | 127 | // suppose I'm adding the first pattern to a matcher, and it has "x": [1, 2]. In principle the branches on 128 | // "x": 1 and "x": 2 could go to tne same next state. But we have to make a unique next state for each of them 129 | // because some future other pattern might have "x": [2, 3] and thus we need a separate branch to potentially 130 | // match two patterns on "x": 2 but not "x": 1. If you were optimizing the automaton for size you might detect 131 | // cases where this doesn't happen and reduce the number of fieldMatchStates 132 | var nextFieldMatchers []*fieldMatcher 133 | for _, val := range field.vals { 134 | nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(val, printer)) 135 | } 136 | m.update(freshStart) 137 | return nextFieldMatchers 138 | } 139 | 140 | // transitionOn returns one or more fieldMatchStates you can transition to on a field's name/value combination, 141 | // or nil if no transitions are possible. An example of name/value that could produce multiple next states 142 | // would be if you had the pattern { "a": [ "foo" ] } and another pattern that matched any value with 143 | // a prefix of "f". 144 | func (m *fieldMatcher) transitionOn(field *Field, bufs *bufpair) []*fieldMatcher { 145 | // are there transitions on this field name? 
146 | valMatcher, ok := m.fields().transitions[string(field.Path)] 147 | if !ok { 148 | return nil 149 | } 150 | return valMatcher.transitionOn(field, bufs) 151 | } 152 | -------------------------------------------------------------------------------- /flatten_json_bench_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | var ( 10 | topMatches []X 11 | topFields []Field 12 | ) 13 | 14 | const PatternContext = `{ "context": { "user_id": [9034], "friends_count": [158] } }` 15 | const PatternMiddleNestedField = `{ "payload": { "user": { "id_str": ["903487807"] } } }` 16 | const PatternLastField = `{ "payload": { "lang_value": ["ja"] } }` 17 | 18 | func Benchmark_JsonFlattener_ContextFields(b *testing.B) { 19 | RunBenchmarkWithJSONFlattener(b, "context\nuser_id", "context\nfriends_count") 20 | } 21 | 22 | func Benchmark_JsonFlattener_MiddleNestedField(b *testing.B) { 23 | RunBenchmarkWithJSONFlattener(b, "payload\nuser\nid_str") 24 | } 25 | 26 | func Benchmark_JsonFlattener_LastField(b *testing.B) { 27 | RunBenchmarkWithJSONFlattener(b, "payload\nlang_value") 28 | } 29 | 30 | func RunBenchmarkWithJSONFlattener(b *testing.B, paths ...string) { 31 | b.Helper() 32 | var localFields []Field 33 | 34 | event, err := os.ReadFile("./testdata/status.json") 35 | if err != nil { 36 | b.Fatal(err) 37 | } 38 | 39 | flattener := newJSONFlattener() 40 | 41 | t := newSegmentsIndex(paths...) 
42 | results, err := flattener.Flatten(event, t) 43 | if err != nil { 44 | b.Fatal(err) 45 | } 46 | PrintFields(b, results) 47 | 48 | b.ResetTimer() 49 | b.ReportAllocs() 50 | 51 | for i := 0; i < b.N; i++ { 52 | fields, err := flattener.Flatten(event, t) 53 | if err != nil { 54 | b.Fatal(err) 55 | } 56 | localFields = fields 57 | } 58 | topFields = localFields 59 | } 60 | 61 | func Benchmark_JsonFlattner_Evaluate_ContextFields(b *testing.B) { 62 | q, err := New() 63 | 64 | if err != nil { 65 | b.Fatal(err) 66 | } 67 | 68 | RunBenchmarkEvaluate(b, q, PatternContext) 69 | } 70 | 71 | func Benchmark_JsonFlattner_Evaluate_MiddleNestedField(b *testing.B) { 72 | q, err := New() 73 | 74 | if err != nil { 75 | b.Fatal(err) 76 | } 77 | 78 | RunBenchmarkEvaluate(b, q, PatternMiddleNestedField) 79 | } 80 | 81 | func Benchmark_JsonFlattner_Evaluate_LastField(b *testing.B) { 82 | q, err := New() 83 | 84 | if err != nil { 85 | b.Fatal(err) 86 | } 87 | 88 | RunBenchmarkEvaluate(b, q, PatternLastField) 89 | } 90 | 91 | func RunBenchmarkEvaluate(b *testing.B, q *Quamina, pattern string) { 92 | b.Helper() 93 | 94 | err := q.AddPattern(1, pattern) 95 | if err != nil { 96 | b.Fatalf("Failed adding pattern: %+v", err) 97 | } 98 | 99 | event, err := os.ReadFile("./testdata/status.json") 100 | if err != nil { 101 | b.Fatal(err) 102 | } 103 | 104 | matches, err := q.MatchesForEvent(event) 105 | if err != nil { 106 | b.Fatalf("failed matching: %s", err) 107 | } 108 | 109 | if len(matches) != 1 { 110 | b.Fatalf("in-correct matching: %+v", matches) 111 | } 112 | 113 | b.ReportAllocs() 114 | b.ResetTimer() 115 | 116 | for i := 0; i < b.N; i++ { 117 | matches, err := q.MatchesForEvent(event) 118 | if err != nil { 119 | b.Fatalf("failed matching: %s", err) 120 | } 121 | 122 | if len(matches) != 1 { 123 | b.Fatalf("in-correct matching: %+v", matches) 124 | } 125 | } 126 | } 127 | 128 | func PrintFields(tb testing.TB, fields []Field) { 129 | tb.Helper() 130 | 131 | tb.Logf("> Fields\n") 132 | 
// nolint:goimports,gofmt
// Flattener is an interface which provides methods to turn a data structure into a list of path-names and
// values. The following example illustrates how it works for a JSON object:
// { "a": 1, "b": "two", "c": true, "d": null, "e": { "e1": 2, "e2": 3.02e-5 }, "f": [33, "x"] }
// should produce
//
//	"a", "1"
//	"b", "\"two\""
//	"c", "true"
//	"d", "null"
//	"e\ne1", "2"
//	"e\ne2", "3.02e-5"
//	"f", "33"
//	"f", "\"x\""
//
// Let's call the first column, e.g. "d" and "e\ne1", the path. For each
// step in the path, e.g. "e" and "e1", the Flattener should utilize SegmentsTreeTracker to
// traverse the hierarchy and select only the needed fields.
type Flattener interface {
	// Flatten turns event into a list of Fields, consulting tracker to decide which
	// paths are needed, or returns an error.
	Flatten(event []byte, tracker SegmentsTreeTracker) ([]Field, error)
	// Copy returns another Flattener.
	// NOTE(review): presumably an independent instance so concurrent users don't share state; confirm.
	Copy() Flattener
}

// Arrays are invisible in the automaton. That is to say, if an event has
//   { "a": [ 1, 2, 3 ] }
// then the Flattener must produce a/1, a/2, and a/3. Same for {"a": [[1, 2], 3]} or any
// other permutation.
// If we're not careful, this would create a problem. If you have
//   {"a": [ { "b": 1, "c": 2}, {"b": 3, "c": 4}] }
// then a pattern like
//   { "a": { "b": [1], "c": [4] } }
// would match. To prevent that from happening, each ArrayPos contains two
// numbers; the first identifies the array in
// the event that this name/val occurred in, the second the position in the array. We don't allow
// transitioning between field values that occur in different positions in the same array.
// See the arrays_test unit for more examples, and the jsonFlattener source code to
// see how it's implemented.

// ArrayPos represents a Field's position in an Event's structure. Each array in the Event
// should get an integer which identifies it - in flattenJSON this is accomplished by keeping a counter and
// giving arrays numbers starting from 0. ArrayPos exists to ensure that Quamina MatchesForEvent will not
// return a match where two of the matching fields are in separate elements of the same array.
// Array uniquely identifies an array in an Event.
// Pos is the Field's index in the Array.
type ArrayPos struct {
	Array int32
	Pos   int32
}
57 | type Field struct { 58 | Path []byte 59 | Val []byte 60 | ArrayTrail []ArrayPos 61 | IsNumber bool 62 | } 63 | -------------------------------------------------------------------------------- /generic_machine_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | ) 8 | 9 | /* This test adopted, with thanks, from aws/event-ruler */ 10 | 11 | func TestRulerArraysBug(t *testing.T) { 12 | event := "{\n" + 13 | " \"requestContext\": { \"obfuscatedCustomerId\": \"AIDACKCEVSQ6C2EXAMPLE\" },\n" + 14 | " \"hypotheses\": [\n" + 15 | " { \"isBluePrint\": true, \"creator\": \"A123\" },\n" + 16 | " { \"isBluePrint\": false, \"creator\": \"A234\" }\n" + 17 | " ]\n" + 18 | "}" 19 | r1 := "{\n" + 20 | " \"hypotheses\": {\n" + 21 | " \"isBluePrint\": [ false ],\n" + 22 | " \"creator\": [ \"A123\" ]\n" + 23 | " }\n" + 24 | "}" 25 | r2 := "{\n" + 26 | " \"hypotheses\": {\n" + 27 | " \"isBluePrint\": [ true ],\n" + 28 | " \"creator\": [ \"A234\" ]\n" + 29 | " }\n" + 30 | "}" 31 | 32 | q, _ := New() 33 | err := q.AddPattern("r1", r1) 34 | if err != nil { 35 | t.Error("add r1") 36 | } 37 | err = q.AddPattern("r2", r2) 38 | if err != nil { 39 | t.Error("add r2") 40 | } 41 | matches, err := q.MatchesForEvent([]byte(event)) 42 | if err != nil { 43 | t.Errorf("MatchesForEvent: %s", err) 44 | } 45 | if len(matches) != 0 { 46 | t.Error("Nonzero matches") 47 | } 48 | } 49 | 50 | func readTestData(t *testing.T, fname string) []byte { 51 | t.Helper() 52 | bytes, err := os.ReadFile("testdata/" + fname) 53 | if err != nil { 54 | t.Error("couldn't read: " + fname + ": " + err.Error()) 55 | } 56 | return bytes 57 | } 58 | 59 | func TestRulerNestedArrays(t *testing.T) { 60 | event1 := readTestData(t, "arrayEvent1.json") 61 | event2 := readTestData(t, "arrayEvent2.json") 62 | event3 := readTestData(t, "arrayEvent3.json") 63 | event4 := readTestData(t, "arrayEvent4.json") 64 | 65 | rule1 := 
string(readTestData(t, "arrayRule1.json")) 66 | rule2 := string(readTestData(t, "arrayRule2.json")) 67 | rule3 := string(readTestData(t, "arrayRule3.json")) 68 | 69 | q, _ := New() 70 | for i, rule := range []string{rule1, rule2, rule3} { 71 | err := q.AddPattern(fmt.Sprintf("rule%d", i+1), rule) 72 | if err != nil { 73 | t.Errorf("add rule%d", i) 74 | } 75 | } 76 | r1, err := q.MatchesForEvent(event1) 77 | if err != nil { 78 | t.Error("Matches " + err.Error()) 79 | } 80 | if len(r1) != 2 { 81 | t.Errorf("r1 len %d", len(r1)) 82 | } 83 | 84 | r2, err := q.MatchesForEvent(event2) 85 | if err != nil { 86 | t.Error("Matches " + err.Error()) 87 | } 88 | if len(r2) != 0 { 89 | t.Errorf("r2 matchd %d", len(r2)) 90 | } 91 | 92 | r3, err := q.MatchesForEvent(event3) 93 | if err != nil { 94 | t.Error("Matches " + err.Error()) 95 | } 96 | if len(r3) != 0 { 97 | t.Errorf("r3 matchd %d", len(r2)) 98 | } 99 | 100 | r4, err := q.MatchesForEvent(event4) 101 | if err != nil { 102 | t.Error("Matches " + err.Error()) 103 | } 104 | if len(r4) != 1 || r4[0] != "rule3" { 105 | var msg string 106 | if len(r4) == 1 { 107 | msg += "match: " + r4[0].(string) 108 | } else { 109 | msg = fmt.Sprintf("r4 matches %d", len(r4)) 110 | } 111 | t.Error(msg) 112 | } 113 | } 114 | 115 | func TestRulerSimplestPossibleMachine(t *testing.T) { 116 | rule1 := "{ \"a\" : [ 1 ] }" 117 | rule2 := "{ \"b\" : [ 2 ] }" 118 | rule3 := "{ \"c\" : [ 3 ] }" 119 | 120 | q, _ := New() 121 | _ = q.AddPattern("r1", rule1) 122 | _ = q.AddPattern("r2", rule2) 123 | _ = q.AddPattern("r3", rule3) 124 | 125 | event1 := "{ \"a\" : 1 }" 126 | event2 := "{ \"b\" : 2 }" 127 | event4 := "{ \"x\" : true }" 128 | event5 := "{ \"a\" : 1, \"b\": 2, \"c\" : 3 }" 129 | 130 | var val []X 131 | var err error 132 | val, err = q.MatchesForEvent([]byte(event1)) 133 | if err != nil { 134 | t.Error("e1: " + err.Error()) 135 | } 136 | if len(val) != 1 || val[0] != "r1" { 137 | t.Error("event1 fail") 138 | } 139 | 140 | val, err = 
q.MatchesForEvent([]byte(event2)) 141 | if err != nil { 142 | t.Error("e2: " + err.Error()) 143 | } 144 | if len(val) != 1 || val[0] != "r2" { 145 | t.Error("event2 fail") 146 | } 147 | 148 | val, err = q.MatchesForEvent([]byte(event4)) 149 | if err != nil { 150 | t.Error("e2: " + err.Error()) 151 | } 152 | if len(val) != 0 { 153 | t.Error("event4 fail") 154 | } 155 | 156 | val, err = q.MatchesForEvent([]byte(event5)) 157 | if err != nil { 158 | t.Error("e2: " + err.Error()) 159 | } 160 | if len(val) != 3 { 161 | t.Error("event4 fail") 162 | } 163 | matched := 0 164 | for _, v := range val { 165 | if v == "r1" || v == "r2" || v == "r3" { 166 | matched++ 167 | } 168 | } 169 | if matched != 3 { 170 | t.Error("missing match") 171 | } 172 | } 173 | 174 | func TestRulerEmptyInput(t *testing.T) { 175 | rule1 := `{ 176 | "detail": { 177 | "c-count": [ 178 | { 179 | "exists": false 180 | } 181 | ] 182 | }, 183 | "d-count": [ 184 | { 185 | "exists": false 186 | } 187 | ], 188 | "e-count": [ 189 | { 190 | "exists": false 191 | } 192 | ] 193 | }` 194 | event := "{}" 195 | q, _ := New() 196 | err := q.AddPattern("r", rule1) 197 | if err != nil { 198 | t.Error("Empty input add pattern" + err.Error()) 199 | } 200 | matches, err := q.MatchesForEvent([]byte(event)) 201 | if err != nil { 202 | t.Error("Empty input matches: " + err.Error()) 203 | } 204 | if len(matches) != 1 || matches[0] != "r" { 205 | t.Error("Empty input match botch") 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module quamina.net/go/quamina 2 | 3 | go 1.22.0 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timbray/quamina/0526acc321a81d4df535caf790879648ace11c86/go.sum 
-------------------------------------------------------------------------------- /live_pattern_state.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | // LivePatternsState represents the required capabilities for maintaining the 8 | // set of live patterns. 9 | type LivePatternsState interface { 10 | // Add adds a new pattern or updates an old pattern. 11 | // 12 | // Note that multiple patterns can be associated with the same X. 13 | Add(x X, pattern string) error 14 | 15 | // Delete removes all patterns associated with the given X and returns the 16 | // number of removed patterns. 17 | Delete(x X) (int, error) 18 | 19 | // Iterate calls the given function for every stored pattern. 20 | Iterate(func(x X, pattern string) error) error 21 | 22 | // Contains returns true if x is in the live set; false otherwise. 23 | Contains(x X) (bool, error) 24 | } 25 | 26 | type ( 27 | stringSet map[string]nothing 28 | nothing struct{} 29 | ) 30 | 31 | var na = nothing{} 32 | 33 | // memState is a LivePatternsState that is just a map (with a RWMutex). 34 | // 35 | // Since the LivePatternsState implementation can be provided to the 36 | // application, we're keeping things simple here initially. 37 | type memState struct { 38 | lock sync.RWMutex 39 | m map[X]stringSet 40 | } 41 | 42 | func newMemState() *memState { 43 | // Accept initial size as a parameter? 
44 | return &memState{ 45 | m: make(map[X]stringSet), 46 | } 47 | } 48 | 49 | func (s *memState) Add(x X, pattern string) error { 50 | s.lock.Lock() 51 | ps, have := s.m[x] 52 | if !have { 53 | ps = make(stringSet) 54 | s.m[x] = ps 55 | } 56 | ps[pattern] = na 57 | s.lock.Unlock() 58 | return nil 59 | } 60 | 61 | func (s *memState) Contains(x X) (bool, error) { 62 | s.lock.RLock() 63 | _, have := s.m[x] 64 | s.lock.RUnlock() 65 | return have, nil 66 | } 67 | 68 | func (s *memState) Delete(x X) (int, error) { 69 | s.lock.Lock() 70 | cardinality := 0 71 | if xs, have := s.m[x]; have { 72 | cardinality = len(xs) 73 | delete(s.m, x) 74 | } 75 | s.lock.Unlock() 76 | 77 | return cardinality, nil 78 | } 79 | 80 | func (s *memState) Iterate(f func(x X, pattern string) error) error { 81 | s.lock.RLock() 82 | var err error 83 | for x, ps := range s.m { 84 | for p := range ps { 85 | if err = f(x, p); err != nil { 86 | break 87 | } 88 | } 89 | } 90 | s.lock.RUnlock() 91 | return err 92 | } 93 | -------------------------------------------------------------------------------- /live_pattern_state_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestMemIterateFerr(t *testing.T) { 9 | s := newMemState() 10 | f := func(x X, pattern string) error { 11 | return fmt.Errorf("broken") 12 | } 13 | if err := s.Add(1, "{}"); err != nil { 14 | t.Fatal(err) 15 | } 16 | if err := s.Iterate(f); err == nil { 17 | t.Fatal("expected error") 18 | } 19 | } 20 | 21 | func TestStateDelete(t *testing.T) { 22 | s := newMemState() 23 | 24 | if err := s.Add(1, `{"likes":"queso"}`); err != nil { 25 | t.Fatal(err) 26 | } 27 | 28 | if err := s.Add(1, `{"likes":"tacos"}`); err != nil { 29 | t.Fatal(err) 30 | } 31 | 32 | if n, err := s.Delete(1); err != nil { 33 | t.Fatal(err) 34 | } else if n != 2 { 35 | t.Fatal(n) 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /match_set.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | // matchSet is what it says on the tin; implements a set semantic on matches, which are of type X. These could all 4 | // be implemented as match[X]bool but this makes the calling code more readable. 5 | type matchSet struct { 6 | set map[X]bool 7 | } 8 | 9 | func newMatchSet() *matchSet { 10 | return &matchSet{set: make(map[X]bool)} 11 | } 12 | 13 | func (m *matchSet) addX(exes ...X) *matchSet { 14 | if len(exes) == 0 { 15 | return m 16 | } 17 | 18 | // for concurrency, can't update in place 19 | newSet := make(map[X]bool, len(m.set)+1) 20 | for k := range m.set { 21 | newSet[k] = true 22 | } 23 | for _, x := range exes { 24 | newSet[x] = true 25 | } 26 | return &matchSet{set: newSet} 27 | } 28 | 29 | func (m *matchSet) addXSingleThreaded(exes ...X) *matchSet { 30 | for _, x := range exes { 31 | m.set[x] = true 32 | } 33 | 34 | return m 35 | } 36 | 37 | func (m *matchSet) matches() []X { 38 | matches := make([]X, 0, len(m.set)) 39 | for x := range m.set { 40 | matches = append(matches, x) 41 | } 42 | return matches 43 | } 44 | -------------------------------------------------------------------------------- /match_set_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import "testing" 4 | 5 | func TestAddX(t *testing.T) { 6 | set := newMatchSet() 7 | 8 | // empty exes 9 | set = set.addX() 10 | if !isSameMatches(set) { 11 | t.Errorf("Expected matches to be empty: %+v", set.matches()) 12 | } 13 | 14 | newSet := set.addX(1) 15 | // existing set should be empty. 
16 | if len(set.matches()) > 0 { 17 | t.Errorf("Expected matches to be empty: %+v", set.matches()) 18 | } 19 | if !isSameMatches(newSet, 1) { 20 | t.Errorf("Expected matches to be [1]: %+v", set.matches()) 21 | } 22 | 23 | // add another two values 24 | newSet = newSet.addX(2) 25 | newSet = newSet.addX(3) 26 | if !isSameMatches(newSet, 1, 2, 3) { 27 | t.Errorf("Expected matches to be [1, 2, 3]: %+v", set.matches()) 28 | } 29 | } 30 | 31 | func TestAddXSingleThreaded(t *testing.T) { 32 | set := newMatchSet() 33 | 34 | // empty exes 35 | set.addXSingleThreaded() 36 | if !isSameMatches(set) { 37 | t.Errorf("Expected matches to be empty: %+v", set.matches()) 38 | } 39 | 40 | set.addXSingleThreaded(1) 41 | // existing set should be empty. 42 | if !isSameMatches(set, 1) { 43 | t.Errorf("Expected matches to be [1]: %+v", set.matches()) 44 | } 45 | 46 | // add another two values 47 | set.addXSingleThreaded(2) 48 | set.addXSingleThreaded(3) 49 | if !isSameMatches(set, 1, 2, 3) { 50 | t.Errorf("Expected matches to be [1, 2, 3]: %+v", set.matches()) 51 | } 52 | } 53 | 54 | func (m *matchSet) contains(x X) bool { 55 | _, ok := m.set[x] 56 | return ok 57 | } 58 | 59 | func isSameMatches(matchSet *matchSet, exes ...X) bool { 60 | if len(exes) == 0 && len(matchSet.matches()) == 0 { 61 | return true 62 | } 63 | 64 | if len(exes) != len(matchSet.matches()) { 65 | return false 66 | } 67 | 68 | for _, x := range exes { 69 | if !matchSet.contains(x) { 70 | return false 71 | } 72 | } 73 | 74 | return true 75 | } 76 | -------------------------------------------------------------------------------- /matcher.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | type matcher interface { 4 | addPattern(x X, pat string) error 5 | matchesForFields(fields []Field) ([]X, error) 6 | deletePatterns(x X) error 7 | getSegmentsTreeTracker() SegmentsTreeTracker 8 | } 9 | 
-------------------------------------------------------------------------------- /matcher_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import "testing" 4 | 5 | func TestMatcherInterface(t *testing.T) { 6 | var m matcher = newCoreMatcher() 7 | if _, ok := m.(*coreMatcher); !ok { 8 | t.Error("Can't cast") 9 | } 10 | var x X = "x" 11 | err := m.addPattern(x, `{"x": [1, 2]}`) 12 | if err != nil { 13 | t.Error("addPattern? " + err.Error()) 14 | } 15 | err = m.deletePatterns("x") 16 | if err == nil { 17 | t.Error("coreMatcher allowed Delete!?") 18 | } 19 | event := `{"x": [3, 1]}` 20 | fields, _ := newJSONFlattener().Flatten([]byte(event), m.getSegmentsTreeTracker()) 21 | matches, _ := m.matchesForFields(fields) 22 | if len(matches) != 1 || matches[0] != x { 23 | t.Error("missed match") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /monocase.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "unicode/utf8" 7 | ) 8 | 9 | func readMonocaseSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) { 10 | t, err := pb.jd.Token() 11 | if err != nil { 12 | return 13 | } 14 | pathVals = valsIn 15 | 16 | monocaseString, ok := t.(string) 17 | if !ok { 18 | err = errors.New("value for 'prefix' must be a string") 19 | return 20 | } 21 | val := typedVal{ 22 | vType: monocaseType, 23 | val: `"` + monocaseString + `"`, 24 | } 25 | pathVals = append(pathVals, val) 26 | 27 | // has to be } or tokenizer will throw error 28 | _, err = pb.jd.Token() 29 | return 30 | } 31 | 32 | // makeMonocaseFA builds a FA to match "ignore-case" patterns. The Unicode Standard specifies algorithm 3.13, 33 | // relying on the file CaseFolding.txt in the Unicode Character Database. This function uses the "Simple" flavor 34 | // of casefolding, i.e. 
the lines in CaseFolding.txt that are marked with "C". The discussion in the Unicode 35 | // standard doesn't mention this, but the algorithm essentially replaces upper-case characters with lower-case 36 | // equivalents. 37 | // We need to exercise caution to keep from creating states wastefully. For "CAT", after matching '"', 38 | // you transition on either 'c' or 'C' but in this particular case you want to transition to the same 39 | // next state. Note that there are many characters in Unicode where the upper and lower case forms are 40 | // multi-byte and in fact not even the same number of bytes. So in that case you need two paths forward that step 41 | // through the bytes of each form and then rejoin to arrive at a state. Also note 42 | // that in many cases the upper/lower case versions of a rune have leading bytes in common 43 | func makeMonocaseFA(val []byte, pp printer) (*smallTable, *fieldMatcher) { 44 | fm := newFieldMatcher() 45 | index := 0 46 | table := newSmallTable() // start state 47 | startTable := table 48 | var nextStep *faNext 49 | for index < len(val) { 50 | var orig, alt []byte 51 | r, width := utf8.DecodeRune(val[index:]) 52 | orig = val[index : index+width] 53 | altRune, ok := caseFoldingPairs[r] 54 | if ok { 55 | alt = make([]byte, utf8.RuneLen(altRune)) 56 | utf8.EncodeRune(alt, altRune) 57 | } 58 | nextStep = &faNext{states: []*faState{{table: newSmallTable()}}} 59 | pp.labelTable(nextStep.states[0].table, fmt.Sprintf("On %d, alt=%v", val[index], alt)) 60 | if alt == nil { 61 | // easy case, no casefolding issues. 
We should maybe try to coalesce these 62 | // no-casefolding sections and only call makeFAFragment once for all of them 63 | origFA := makeFAFragment(orig, nextStep, pp) 64 | table.addByteStep(orig[0], origFA) 65 | } else { 66 | // two paths to next state 67 | // but they might have a common prefix 68 | var commonPrefix int 69 | for commonPrefix = 0; orig[commonPrefix] == alt[commonPrefix]; commonPrefix++ { 70 | prefixNext := &faNext{states: []*faState{{table: newSmallTable()}}} 71 | table.addByteStep(orig[commonPrefix], prefixNext) 72 | table = prefixNext.states[0].table 73 | pp.labelTable(table, fmt.Sprintf("common prologue on %v", orig[commonPrefix])) 74 | } 75 | // now build automata for the orig and alt versions of the char 76 | // TODO: make sure that makeFAFragment works with length == 1 77 | origFA := makeFAFragment(orig[commonPrefix:], nextStep, pp) 78 | altFA := makeFAFragment(alt[commonPrefix:], nextStep, pp) 79 | table.addByteStep(orig[commonPrefix], origFA) 80 | table.addByteStep(alt[commonPrefix], altFA) 81 | } 82 | table = nextStep.states[0].table 83 | index += width 84 | } 85 | laststate := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{fm}} 86 | lastStep := &faNext{states: []*faState{laststate}} 87 | nextStep.states[0].table.addByteStep(valueTerminator, lastStep) 88 | return startTable, fm 89 | } 90 | -------------------------------------------------------------------------------- /monocase_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestABCDMono(t *testing.T) { 9 | permuteAndTest(t, "abcd", "ABCD") 10 | } 11 | func TestHungarianMono(t *testing.T) { 12 | orig := []rune{0x10C80, 0x10C9D, 0x10C95, 0x10C8B} 13 | alts := []rune{0x10CC0, 0x10CDD, 0x10CD5, 0x10CCB} 14 | permuteAndTest(t, string(orig), string(alts)) 15 | } 16 | func TestIntermittentMono(t *testing.T) { 17 | permuteAndTest(t, "a,8899bc d", 
"A,8899BC D") 18 | } 19 | 20 | func permuteAndTest(t *testing.T, origS, altsS string) { 21 | t.Helper() 22 | orig := []byte(origS) 23 | alts := []byte(altsS) 24 | t.Helper() 25 | permutations := permuteCase(t, orig, alts, nil, 0, nil) 26 | pp := newPrettyPrinter(98987) 27 | fa, fm := makeMonocaseFA(orig, pp) 28 | for _, p := range permutations { 29 | ff := traverseDFA(fa, p, nil) 30 | if len(ff) != 1 || ff[0] != fm { 31 | t.Error("FfFfAIL") 32 | } 33 | } 34 | fmt.Printf("%s/%s: %s\n", origS, altsS, pp.printNFA(fa)) 35 | } 36 | func permuteCase(t *testing.T, orig []byte, alts []byte, sofar []byte, index int, permutations [][]byte) [][]byte { 37 | t.Helper() 38 | if index == len(orig) { 39 | next := make([]byte, len(sofar)) 40 | copy(next, sofar) 41 | permutations = append(permutations, next) 42 | } else { 43 | permutations = permuteCase(t, orig, alts, append(sofar, orig[index]), index+1, permutations) 44 | permutations = permuteCase(t, orig, alts, append(sofar, alts[index]), index+1, permutations) 45 | } 46 | return permutations 47 | } 48 | 49 | func TestSingletonMonocaseMerge(t *testing.T) { 50 | cm := newCoreMatcher() 51 | var err error 52 | err = cm.addPattern("singleton", `{"x": ["singleton"] }`) 53 | if err != nil { 54 | t.Error("add singleton: " + err.Error()) 55 | } 56 | err = cm.addPattern("mono", `{"x": [ {"equals-ignore-case": "foo"}]}`) 57 | if err != nil { 58 | t.Error("add mono") 59 | } 60 | matches, _ := cm.matchesForJSONEvent([]byte(`{"x": "singleton"}`)) 61 | if len(matches) != 1 && !containsX(matches, "singleton") { 62 | t.Error("singleton match failed") 63 | } 64 | matches, _ = cm.matchesForJSONEvent([]byte(`{"x": "FoO"}`)) 65 | if len(matches) != 1 && !containsX(matches, "mono") { 66 | t.Error("singleton match failed") 67 | } 68 | } 69 | 70 | func TestEqualsIgnoreCaseMatching(t *testing.T) { 71 | rule1 := "{ \"a\" : [ { \"equals-ignore-case\": \"aBc\" } ] }" 72 | rule2 := "{ \"b\" : [ { \"equals-ignore-case\": \"XyZ\" } ] }" 73 | rule3 := "{ \"b\" 
: [ { \"equals-ignore-case\": \"xyZ\" } ] }" 74 | 75 | var err error 76 | cm := newCoreMatcher() 77 | err = cm.addPattern("r1", rule1) 78 | if err != nil { 79 | t.Error("AddPattern: " + err.Error()) 80 | } 81 | err = cm.addPattern("r2", rule2) 82 | if err != nil { 83 | t.Error("AddPattern: " + err.Error()) 84 | } 85 | err = cm.addPattern("r3", rule3) 86 | if err != nil { 87 | t.Error("AddPattern: " + err.Error()) 88 | } 89 | matches, _ := cm.matchesForJSONEvent([]byte("{\"a\" : \"abc\"}")) 90 | if len(matches) != 1 || matches[0] != "r1" { 91 | t.Error("wrong on rule1") 92 | } 93 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"XYZ\"}")) 94 | if len(matches) != 2 || !containsX(matches, "r2", "r3") { 95 | t.Error("wrong on XYZ") 96 | } 97 | matches, _ = cm.matchesForJSONEvent([]byte("{\"a\" : \"AbC\"}")) 98 | if len(matches) != 1 || !containsX(matches, "r1") { 99 | t.Error("wrong on AbC") 100 | } 101 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"xyzz\"}")) 102 | if len(matches) != 0 { 103 | t.Error("wrong on xyzz") 104 | } 105 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"aabc\"}")) 106 | if len(matches) != 0 { 107 | t.Error("wrong on aabc") 108 | } 109 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"ABCXYZ\"}")) 110 | if len(matches) != 0 { 111 | t.Error("wrong on ABCXYZ") 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /nfa.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import "fmt" 4 | 5 | // This groups the functions that traverse, merge, and debug Quamina's nondeterministic finite automata 6 | 7 | // faState is used by the valueMatcher automaton - every step through the 8 | // automaton requires a smallTable and for some of them, taking the step means you've matched a value and can 9 | // transition to a new fieldMatcher, in which case the fieldTransitions slice will be non-nil 10 | type faState struct { 
11 | table *smallTable 12 | fieldTransitions []*fieldMatcher 13 | } 14 | 15 | // struct wrapper to make this comparable to help with pack/unpack 16 | type faNext struct { 17 | states []*faState 18 | } 19 | 20 | type nfaMetadata struct { 21 | maxOutDegree int 22 | } 23 | 24 | type transmap struct { 25 | set map[*fieldMatcher]bool 26 | } 27 | 28 | func (tm *transmap) add(fms []*fieldMatcher) { 29 | for _, fm := range fms { 30 | tm.set[fm] = true 31 | } 32 | } 33 | 34 | func (tm *transmap) all() []*fieldMatcher { 35 | var all []*fieldMatcher 36 | for fm := range tm.set { 37 | all = append(all, fm) 38 | } 39 | return all 40 | } 41 | 42 | // While some Quamina patterns require the use of NFAs, many (most?) don't, and while we're still using a 43 | // NFA-capable data structure, we can traverse it deterministically if we know in advance that every 44 | // combination of an faState with a byte will transition to at most one other faState. 45 | 46 | func traverseDFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*fieldMatcher { 47 | for index := 0; index <= len(val); index++ { 48 | var utf8Byte byte 49 | if index < len(val) { 50 | utf8Byte = val[index] 51 | } else { 52 | utf8Byte = valueTerminator 53 | } 54 | next := table.dStep(utf8Byte) 55 | if next == nil { 56 | break 57 | } 58 | transitions = append(transitions, next.fieldTransitions...) 59 | table = next.table 60 | } 61 | return transitions 62 | } 63 | 64 | func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, bufs *bufpair) []*fieldMatcher { 65 | currentStates := bufs.buf1 66 | currentStates = append(currentStates, &faState{table: table}) 67 | nextStates := bufs.buf2 68 | 69 | // a lot of the transitions stuff is going to be empty, but on the other hand 70 | // a * entry with a transition could end up getting added a lot. 
71 | newTransitions := &transmap{set: make(map[*fieldMatcher]bool, len(transitions))} 72 | newTransitions.add(transitions) 73 | stepResult := &stepOut{} 74 | for index := 0; len(currentStates) != 0 && index <= len(val); index++ { 75 | var utf8Byte byte 76 | if index < len(val) { 77 | utf8Byte = val[index] 78 | } else { 79 | utf8Byte = valueTerminator 80 | } 81 | for _, state := range currentStates { 82 | state.table.step(utf8Byte, stepResult) 83 | for _, nextStep := range stepResult.steps { 84 | newTransitions.add(nextStep.fieldTransitions) 85 | nextStates = append(nextStates, nextStep) 86 | } 87 | for _, nextStep := range stepResult.epsilon { 88 | newTransitions.add(nextStep.fieldTransitions) 89 | nextStates = append(nextStates, nextStep) 90 | } 91 | } 92 | // re-use these 93 | swapStates := currentStates 94 | currentStates = nextStates 95 | nextStates = swapStates[:0] 96 | } 97 | bufs.buf1 = currentStates[:0] 98 | bufs.buf2 = nextStates[:0] 99 | return newTransitions.all() 100 | } 101 | 102 | type faStepKey struct { 103 | step1 *faState 104 | step2 *faState 105 | } 106 | 107 | // mergeFAs compute the union of two valueMatch automata. If you look up the textbook theory about this, 108 | // they say to compute the set product for automata A and B and build A0B0, A0B1 … A1BN, A1B0 … but if you look 109 | // at that you realize that many of the product states aren't reachable. So you compute A0B0 and then keep 110 | // recursing on the transitions coming out, I'm pretty sure you get a correct result. I don't know if it's 111 | // minimal or even avoids being wasteful. 
112 | // INVARIANT: neither argument is nil 113 | // INVARIANT: To be thread-safe, no existing table can be updated except when we're building it 114 | func mergeFAs(table1, table2 *smallTable, printer printer) *smallTable { 115 | state1 := &faState{table: table1} 116 | state2 := &faState{table: table2} 117 | return mergeFAStates(state1, state2, make(map[faStepKey]*faState), printer).table 118 | } 119 | 120 | func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState, printer printer) *faState { 121 | // try to memo-ize 122 | mKey := faStepKey{state1, state2} 123 | combined, ok := keyMemo[mKey] 124 | if ok { 125 | return combined 126 | } 127 | 128 | fieldTransitions := append(state1.fieldTransitions, state2.fieldTransitions...) 129 | combined = &faState{table: newSmallTable(), fieldTransitions: fieldTransitions} 130 | 131 | pretty, ok := printer.(*prettyPrinter) 132 | if ok { 133 | printer.labelTable(combined.table, fmt.Sprintf("%d∎%d", 134 | pretty.tableSerial(state1.table), pretty.tableSerial(state2.table))) 135 | } 136 | 137 | keyMemo[mKey] = combined 138 | u1 := unpackTable(state1.table) 139 | u2 := unpackTable(state2.table) 140 | var uComb unpackedTable 141 | for i, next1 := range u1 { 142 | next2 := u2[i] 143 | switch { 144 | case next1 == next2: // no need to merge 145 | uComb[i] = next1 146 | case next2 == nil: // u1 must be non-nil 147 | uComb[i] = next1 148 | case next1 == nil: // u2 must be non-nil 149 | uComb[i] = next2 150 | case i > 0 && next1 == u1[i-1] && next2 == u2[i-1]: // dupe of previous step - happens a lot 151 | uComb[i] = uComb[i-1] 152 | default: // have to recurse & merge 153 | var comboNext []*faState 154 | for _, nextStep1 := range next1.states { 155 | for _, nextStep2 := range next2.states { 156 | comboNext = append(comboNext, mergeFAStates(nextStep1, nextStep2, keyMemo, printer)) 157 | } 158 | } 159 | uComb[i] = &faNext{states: comboNext} 160 | } 161 | } 162 | combined.table.pack(&uComb) 163 | combined.table.epsilon = 
append(state1.table.epsilon, state2.table.epsilon...) 164 | 165 | return combined 166 | } 167 | -------------------------------------------------------------------------------- /nfa_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "unsafe" 7 | ) 8 | 9 | // TestArrayBehavior is here prove that (a) you can index a map with an array and 10 | // the indexing actually relies on the values in the array. This has nothing to do with 11 | // Quamina, but I'm leaving it here because I had to write this stupid test after failing 12 | // to find a straightforward question of whether this works as expected anywhere in the 13 | // Golang docs. 14 | func TestArrayBehavior(t *testing.T) { 15 | type gpig [4]int 16 | pigs := []gpig{ 17 | {1, 2, 3, 4}, 18 | {4, 3, 2, 1}, 19 | } 20 | nonPigs := []gpig{ 21 | {3, 4, 3, 4}, 22 | {99, 88, 77, 66}, 23 | } 24 | m := make(map[gpig]bool) 25 | for _, pig := range pigs { 26 | m[pig] = true 27 | } 28 | for _, pig := range pigs { 29 | _, ok := m[pig] 30 | if !ok { 31 | t.Error("missed pig") 32 | } 33 | } 34 | pigs[0][0] = 111 35 | pigs[1][3] = 777 36 | pigs = append(pigs, nonPigs...) 
37 | for _, pig := range pigs { 38 | _, ok := m[pig] 39 | if ok { 40 | t.Error("mutant pig") 41 | } 42 | } 43 | newPig := gpig{1, 2, 3, 4} 44 | _, ok := m[newPig] 45 | if !ok { 46 | t.Error("Newpig") 47 | } 48 | } 49 | 50 | func TestFocusedMerge(t *testing.T) { 51 | shellStyles := []string{ 52 | "a*b", 53 | "ab*", 54 | "*ab", 55 | } 56 | var automata []*smallTable 57 | var matchers []*fieldMatcher 58 | 59 | for _, shellStyle := range shellStyles { 60 | str := `"` + shellStyle + `"` 61 | automaton, matcher := makeShellStyleFA([]byte(str), &nullPrinter{}) 62 | automata = append(automata, automaton) 63 | matchers = append(matchers, matcher) 64 | } 65 | 66 | var cab uintptr 67 | for _, mm := range matchers { 68 | uu := uintptr(unsafe.Pointer(mm)) 69 | cab = cab ^ uu 70 | } 71 | 72 | merged := newSmallTable() 73 | for _, automaton := range automata { 74 | merged = mergeFAs(merged, automaton, sharedNullPrinter) 75 | 76 | s := statsAccum{ 77 | fmVisited: make(map[*fieldMatcher]bool), 78 | vmVisited: make(map[*valueMatcher]bool), 79 | stVisited: make(map[*smallTable]bool), 80 | } 81 | faStats(merged, &s) 82 | fmt.Println(s.stStats()) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /numbers.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "strconv" 7 | ) 8 | 9 | // You can't easily build automata to compare numbers based on either the decimal notation found 10 | // in text data or the internal floating-point bits. Therefore, we map floating-point numbers 11 | // (which is what JSON numbers basically are) to comparable slices of 7-bit bytes which preserve the 12 | // numbers' ordering. Versions of Quamina up to 1.3 used a home-grown format which used 14 hex digits 13 | // to represent a subset of numbers. This has now been replaced by Arne Hormann's "numbits" 14 | // construct, see numbits.go. 
It uses up to 10 base128 bytes to represent the entire range of float64 numbers. 15 | // Both this file and numbits.go are very short, but I'm keeping them separated because someone might 16 | // figure out a still-better serialization of numbers and then this part wouldn't have to change. 17 | // In Quamina these are called "Q numbers". 18 | 19 | // There is considerable effort to track, at the NFA level, which NFAs are built to match field values 20 | // that are Q numbers; see vmFields.hasNumbers. Similarly, the JSONFlattener, since it has to 21 | // look at all the digits in a number in order to parse it, can keep track of whether it can be made 22 | // a Q number. The key benefit of this is in valueMatcher.transitionOn, which incurs the cost of 23 | // making a Q number only if it is known that the valueMatcher's NFA can benefit from it and 24 | // that the number in the incoming event can in fact be made a Q number. 25 | 26 | type qNumber []byte 27 | 28 | // qNumFromBytes works out whether a string representing a number falls within the 29 | // limits imposed for Q numbers. It is heavily optimized and relies on the form 30 | // of the number already having been validated, e.g. by flattenJSON(). 
31 | func qNumFromBytes(bytes []byte) (qNumber, error) { 32 | numeric, err := strconv.ParseFloat(string(bytes), 64) 33 | if err != nil { 34 | return nil, errors.New("not a float") // should never happen, json parser upstream 35 | } 36 | return qNumFromFloat(numeric), nil 37 | } 38 | 39 | // qNumFromFLoat is here mostly to support testing 40 | func qNumFromFloat(f float64) qNumber { 41 | return numbitsFromFloat64(f).toQNumber() 42 | } 43 | 44 | // for debugging 45 | func (q qNumber) String() string { 46 | ret := "" 47 | for i, b := range q { 48 | if i != 0 { 49 | ret += "-" 50 | } 51 | ret += fmt.Sprintf("%02x", b) 52 | } 53 | return ret 54 | } 55 | -------------------------------------------------------------------------------- /numbers_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "math" 7 | "math/rand" 8 | "sort" 9 | "strconv" 10 | "strings" 11 | "testing" 12 | ) 13 | 14 | func BenchmarkNumberMatching(b *testing.B) { 15 | // we’re going to have a pattern that matches one of ten random floats, then we're going to throw 16 | // 10K random events at it, 10% of which will match the pattern 17 | rand.New(rand.NewSource(2325)) 18 | pattern := `{"x": [` 19 | var targets []string 20 | for i := 0; i < 10; i++ { 21 | numString := fmt.Sprintf("%.6f", rand.Float64()) 22 | targets = append(targets, numString) 23 | if i != 0 { 24 | pattern += ", " 25 | } 26 | pattern += numString 27 | } 28 | pattern += `]}` 29 | cm := newCoreMatcher() 30 | flattener := newJSONFlattener() 31 | err := cm.addPattern("P", pattern) 32 | if err != nil { 33 | b.Error("addP") 34 | } 35 | b.ResetTimer() 36 | b.ReportAllocs() 37 | targetInd := 0 38 | calls := 0 39 | for i := 0; i < b.N; i++ { 40 | if i%2 == 0 { 41 | val := targets[targetInd] 42 | event := `{"x":` + val + "}" 43 | matches, err := cm.matchesForJSONWithFlattener([]byte(event), flattener) 44 | calls++ 45 | if err != nil { 46 | 
b.Error("match target") 47 | } 48 | if len(matches) == 0 { 49 | b.Error("Missed target") 50 | } 51 | targetInd = (targetInd + 1) % len(targets) 52 | } else { 53 | event := `{"x":` + fmt.Sprintf("%.6f", rand.Float64()) + "}" 54 | _, err := cm.matchesForJSONEvent([]byte(event)) 55 | if err != nil { 56 | b.Error("match non-target") 57 | } 58 | } 59 | } 60 | } 61 | 62 | func TestWildlyVaryingNumbersAreComparable(t *testing.T) { 63 | data := []float64{ 64 | -5_000_000_000, -4_999_999_999.99999, -4_999_999_999.99998, -4_999_999_999.99997, 65 | -999999999.99999, -999999999.99, -10000, -122.413496, -0.000002, 66 | 0, 0.000001, 3.8, 3.9, 11, 12, 122.415028, 2.5e4, 999999999.999998, 999999999.999999, 67 | 4_999_999_999.99997, 4_999_999_999.99998, 4_999_999_999.99999, 5_000_000_000, 68 | } 69 | for i := 1; i < len(data); i++ { 70 | s0 := qNumFromFloat(data[i-1]) 71 | s1 := qNumFromFloat(data[i]) 72 | if bytes.Compare(s0, s1) >= 0 { 73 | t.Errorf("FOO %d / %f - %f", i, data[i-1], data[i]) 74 | fmt.Printf("lo %s %f\nhi %s %f\n", s0, data[i-1], s1, data[i]) 75 | } 76 | } 77 | } 78 | 79 | /* needs extension to ruler-style anything-but 80 | func TestNumericAnythingBut(t *testing.T) { 81 | pat := `{"x": [ { "anything-but": [3.50, 4.5e1]}` 82 | m := newCoreMatcher() 83 | err := m.addPattern("p", pat) 84 | if err != nil { 85 | t.Error("Add Pattern: " + err.Error()) 86 | } 87 | event := `{"x": 3.5}` 88 | matches, _ := m.matchesForJSONEvent([]byte(event)) 89 | if len(matches) != 0 { 90 | t.Error("NumAB") 91 | } 92 | } 93 | */ 94 | 95 | func TestShowBigSmall(t *testing.T) { 96 | lows := []string{"-5_000_000_000.00000", "-4_999_999_999.99999", "-4_999_999_999.99998"} 97 | highs := []string{"4_999_999_999.99998", "4_999_999_999.99999", "5_000_000_000.00000"} 98 | for _, low := range lows { 99 | c, err := qNumFromBytes([]byte(low)) 100 | if err != nil { 101 | t.Errorf("Problem with %s: %s", low, err.Error()) 102 | } 103 | fmt.Printf("%s <%s>\n", low, c) 104 | } 105 | for _, high := range 
highs { 106 | c, err := qNumFromBytes([]byte(high)) 107 | if err != nil { 108 | t.Errorf("Problem with %s: %s", high, err.Error()) 109 | } 110 | fmt.Printf("%s <%s>\n", high, c) 111 | } 112 | } 113 | 114 | func TestBadNumbers(t *testing.T) { 115 | var err error 116 | bads := []string{ 117 | "xy", "- 53", "124x", "1.5ee7", 118 | } 119 | for _, bad := range bads { 120 | _, err = qNumFromBytes([]byte(bad)) 121 | if err == nil { 122 | t.Error("Accepted: " + bad) 123 | } 124 | } 125 | } 126 | 127 | func TestFloatVariants(t *testing.T) { 128 | f := []float64{350, 350.0, 350.0000000000, 3.5e2} 129 | var o []qNumber 130 | for _, s := range f { 131 | c := qNumFromFloat(s) 132 | o = append(o, c) 133 | } 134 | for i := 1; i < len(o); i++ { 135 | if !bytes.Equal(o[i], o[i-1]) { 136 | t.Errorf("%s and %s differ", o[i-1], o[i]) 137 | } 138 | } 139 | } 140 | func TestByteVariants(t *testing.T) { 141 | f := []string{"350", "350.0", "350.0000", "3.5e2"} 142 | var o []qNumber 143 | for _, s := range f { 144 | c, err := qNumFromBytes([]byte(s)) 145 | if err != nil { 146 | t.Errorf("qnum err on %s: %s", s, err.Error()) 147 | } 148 | o = append(o, c) 149 | } 150 | for i := 1; i < len(o); i++ { 151 | if !bytes.Equal(o[i], o[i-1]) { 152 | t.Errorf("%s and %s differ", o[i-1], o[i]) 153 | } 154 | } 155 | } 156 | 157 | func TestOrdering(t *testing.T) { 158 | var in []float64 159 | for i := 0; i < 10000; i++ { 160 | // nolint:gosec 161 | f := rand.Float64() * math.Pow(10, 9) * 2 162 | f -= 1000000000.0 163 | in = append(in, f) 164 | } 165 | sort.Float64s(in) 166 | var out []string 167 | for _, f := range in { 168 | c := qNumFromFloat(f) 169 | out = append(out, string(c)) 170 | } 171 | if !sort.StringsAreSorted(out) { 172 | t.Errorf("Not sorted") 173 | } 174 | } 175 | 176 | func TestMatcherNumerics(t *testing.T) { 177 | p := `{"x": [35.0]}` 178 | shoulds := []string{ 179 | "35", "3.5e1", "35.000", "0.000035e6", 180 | } 181 | for _, should := range shoulds { 182 | f, err := 
strconv.ParseFloat(should, 64) 183 | if err != nil { 184 | t.Error("Parse? " + err.Error()) 185 | } 186 | q, err := qNumFromBytes([]byte(should)) 187 | if err != nil { 188 | t.Error("QF: " + err.Error()) 189 | } 190 | fmt.Printf("%f <%s>\n", f, q) 191 | if f != 35.0 { 192 | t.Error("Not 35!") 193 | } 194 | } 195 | 196 | template := `{"x": NUM}` 197 | m := newCoreMatcher() 198 | err := m.addPattern("35", p) 199 | if err != nil { 200 | t.Error("Oops " + err.Error()) 201 | } 202 | for _, should := range shoulds { 203 | event := strings.Replace(template, "NUM", should, 5) 204 | matches, err := m.matchesForJSONEvent([]byte(event)) 205 | if err != nil { 206 | t.Error("Match: " + err.Error()) 207 | } 208 | if len(matches) != 1 { 209 | t.Error("Didn't match " + should) 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /numbits.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // numbits is an alternative binary representation of float64 numbers. 8 | // float64 are stored as (sign | exponent | mantissa) 9 | // with 1 bit sign, 11 bits exponent, 52 bits mantissa 10 | // They can be represented as [8]byte or as string and can be created from 11 | // these representations. 12 | // All possible float64 values are representable as numbits. 13 | // numbits were implemented by Arne Hormann for Quamina; he later discovered 14 | // that an equivalent representation was used long ago in the disk format of DB2. 15 | // 16 | // Arne's implementation carefully handled NaN, -0, and infinities, but 17 | // Quamina ignores those issues because a combination of JSON rules and 18 | // Quamina's parsers prevent those values from occurring. 19 | type numbits uint64 20 | 21 | // numbitsFromFloat64 converts a float64 value to its numbits representation. 
22 | func numbitsFromFloat64(f float64) numbits { 23 | u := math.Float64bits(f) 24 | //nolint:gosec // disable G115 25 | // transform without branching: 26 | // if high bit is 0, xor with sign bit 1 << 63, else negate (xor with ^0). 27 | // Using a sign extending right shift was proposed by Raph Levien in 28 | // https://mastodon.online/@raph/113071041069390831 29 | mask := uint64(int64(u)>>63) | (1 << 63) 30 | return numbits(u ^ mask) 31 | } 32 | 33 | const MaxBytesInEncoding = 10 34 | 35 | // toQNumber turns a numbits into a minimal variable-width encoding that preservers equality and ordering. 36 | // Storing 8 bytes of data in base-128 would in principle require 10 bytes, but it turns out that since 37 | // the byte-string encoding is big-endian, trailing zeroes don't count, so the encoding can be as short as 38 | // one byte. 39 | // Idea and some code by Axel Wagner 40 | func (nb numbits) toQNumber() qNumber { 41 | // Iterate through the numbits 7 bits at a time, right to left, first bypassing bits that generate 42 | // trailing zeroes in the encoded form. 
Note that index could go to 0 if the numbits value was uint(0) 43 | // but that value represents NaN and can't appear in JSON 44 | trailingZeroes := 0 45 | var index int 46 | for index = MaxBytesInEncoding - 1; index >= 0; index-- { 47 | if nb&0x7f != 0 { 48 | break 49 | } 50 | trailingZeroes++ 51 | nb >>= 7 52 | } 53 | 54 | // now we fill in the byte encoding for the digits up to the last non-zero 55 | b := make([]byte, MaxBytesInEncoding-trailingZeroes) 56 | for ; index >= 0; index-- { 57 | b[index] = byte(nb & 0x7f) 58 | nb >>= 7 59 | } 60 | return b 61 | } 62 | -------------------------------------------------------------------------------- /numbits_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "bytes" 5 | "math" 6 | "math/rand" 7 | "sort" 8 | "testing" 9 | "unicode/utf8" 10 | ) 11 | 12 | func TestToQNumber(t *testing.T) { 13 | rand.New(rand.NewSource(230948)) 14 | var nbs []numbits 15 | var utf8s [][]byte 16 | for i := 0; i < 10000; i++ { 17 | nb := numbits(rand.Uint64()) 18 | nbs = append(nbs, nb) 19 | nbu := nb.toQNumber() 20 | if !utf8.Valid(nbu) { 21 | t.Error("Invalid UTF8!") 22 | } 23 | utf8s = append(utf8s, nbu) 24 | } 25 | for i := 1; i < len(nbs); i++ { 26 | uCompare := bytes.Compare(utf8s[i], utf8s[i-1]) 27 | if nbs[i] > nbs[i-1] { 28 | if uCompare <= 0 { 29 | t.Error("Out of order 1") 30 | } 31 | } else if nbs[i] < nbs[i-1] { 32 | if uCompare >= 0 { 33 | t.Error("Out of order 2") 34 | } 35 | } else if nbs[i] == nbs[i-1] { 36 | if uCompare != 0 { 37 | t.Error("Out of order 3") 38 | } 39 | } 40 | } 41 | } 42 | 43 | var ( 44 | // boundaries of floating point value ranges 45 | f64Zero = math.Float64frombits(0b0_00000000000_0000_00000000_00000000_00000000_00000000_00000000_00000000) 46 | f64SubnormLo = math.Float64frombits(0b0_00000000000_0000_00000000_00000000_00000000_00000000_00000000_00000001) 47 | f64SubnormHi = 
math.Float64frombits(0b0_00000000000_1111_11111111_11111111_11111111_11111111_11111111_11111111) 48 | f64NormLoLo = math.Float64frombits(0b0_00000000001_0000_00000000_00000000_00000000_00000000_00000000_00000000) 49 | f64NormLoHi = math.Float64frombits(0b0_00000000001_1111_11111111_11111111_11111111_11111111_11111111_11111111) 50 | f64NormHiLo = math.Float64frombits(0b0_11111111110_0000_00000000_00000000_00000000_00000000_00000000_00000000) 51 | f64NormHiHi = math.Float64frombits(0b0_11111111110_1111_11111111_11111111_11111111_11111111_11111111_11111111) 52 | specials = []float64{f64Zero, f64SubnormHi, f64SubnormLo, f64NormLoLo, f64NormLoHi, f64NormHiLo, f64NormHiHi} 53 | ) 54 | 55 | func TestNumbits_Compare(t *testing.T) { 56 | rand.New(rand.NewSource(203785)) 57 | floats := append([]float64{}, specials...) 58 | 59 | for i := 0; i < 1000; i++ { 60 | floats = append(floats, rand.Float64()) 61 | } 62 | sort.Float64s(floats) 63 | last := numbitsFromFloat64(floats[0]) 64 | for i := 1; i < len(floats); i++ { 65 | this := numbitsFromFloat64(floats[i]) 66 | if last >= this { 67 | t.Error("out of order") 68 | } 69 | last = this 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pattern.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "strings" 10 | ) 11 | 12 | type valType int 13 | 14 | const ( 15 | stringType valType = iota 16 | numberType 17 | literalType 18 | existsTrueType 19 | existsFalseType 20 | shellStyleType 21 | anythingButType 22 | prefixType 23 | monocaseType 24 | wildcardType 25 | regexpType 26 | ) 27 | 28 | // typedVal represents the value of a field in a pattern, giving the value and the type of pattern. 29 | // - list is used to handle anything-but matches with multiple values. 
30 | // - parsedRegexp only used for vType == regexpType 31 | type typedVal struct { 32 | vType valType 33 | val string 34 | list [][]byte 35 | parsedRegexp regexpRoot 36 | } 37 | 38 | // patternField represents a field in a pattern. 39 | // vals is a list because field values are always given as a JSON array. 40 | type patternField struct { 41 | path string 42 | vals []typedVal 43 | } 44 | 45 | // patternBuild tracks the progress of patternFromJSON through a pattern-compilation project. 46 | type patternBuild struct { 47 | jd *json.Decoder 48 | path []string 49 | results []*patternField 50 | } 51 | 52 | // patternFromJSON compiles a JSON text provided in jsonBytes into a list of patternField structures. 53 | // I love naked returns and I cannot lie 54 | func patternFromJSON(jsonBytes []byte) (fields []*patternField, err error) { 55 | // we can't use json.Unmarshal because it round-trips numbers through float64 and %f, so they won't end up matching 56 | // what the caller actually wrote in the patternField. 
json.Decoder is kind of slow due to excessive 57 | // memory allocation, but I haven't got around to prematurely optimizing the patternFromJSON code path 58 | var pb patternBuild 59 | pb.jd = json.NewDecoder(bytes.NewReader(jsonBytes)) 60 | pb.jd.UseNumber() 61 | 62 | // we use the tokenizer rather than pulling the pattern in with UnMarshall 63 | t, err := pb.jd.Token() 64 | if errors.Is(err, io.EOF) { 65 | err = errors.New("empty Pattern") 66 | return 67 | } else if err != nil { 68 | err = errors.New("patternField is not a JSON object" + err.Error()) 69 | return 70 | } 71 | switch tt := t.(type) { 72 | case json.Delim: 73 | if tt != '{' { 74 | err = errors.New("patternField is not a JSON object") 75 | return 76 | } 77 | default: 78 | err = errors.New("event is not a JSON object: doesn't start with '{'") 79 | return 80 | } 81 | 82 | err = readPatternObject(&pb) 83 | fields = pb.results 84 | return 85 | } 86 | 87 | func readPatternObject(pb *patternBuild) error { 88 | for { 89 | t, err := pb.jd.Token() 90 | if errors.Is(err, io.EOF) { 91 | return errors.New("event atEnd mid-object") 92 | } else if err != nil { 93 | return errors.New("pattern malformed: " + err.Error()) 94 | } 95 | 96 | switch tt := t.(type) { 97 | case string: 98 | pb.path = append(pb.path, tt) 99 | err = readPatternMember(pb) 100 | if err != nil { 101 | return err 102 | } 103 | pb.path = pb.path[:len(pb.path)-1] 104 | 105 | case json.Delim: 106 | // has to be '}' or the tokenizer would have thrown an error 107 | return nil 108 | } 109 | } 110 | } 111 | 112 | func readPatternMember(pb *patternBuild) error { 113 | t, err := pb.jd.Token() 114 | if errors.Is(err, io.EOF) { 115 | return errors.New("pattern ends mid-field") 116 | } else if err != nil { 117 | return errors.New("pattern malformed: " + err.Error()) 118 | } 119 | 120 | switch tt := t.(type) { 121 | case json.Delim: 122 | switch tt { 123 | case '[': 124 | return readPatternArray(pb) 125 | case '{': 126 | return readPatternObject(pb) 127 | 
default: // can't happen 128 | return fmt.Errorf("pattern malformed, illegal %v", tt) 129 | } 130 | default: 131 | return fmt.Errorf("pattern malformed, illegal %v", tt) 132 | } 133 | } 134 | 135 | func readPatternArray(pb *patternBuild) error { 136 | pathName := strings.Join(pb.path, SegmentSeparator) 137 | var containsExclusive string 138 | elementCount := 0 139 | var pathVals []typedVal 140 | for { 141 | t, err := pb.jd.Token() 142 | if errors.Is(err, io.EOF) { 143 | return errors.New("patternField atEnd mid-field") 144 | } else if err != nil { 145 | // can't happen 146 | return errors.New("pattern malformed: " + err.Error()) 147 | } 148 | 149 | switch tt := t.(type) { 150 | case json.Delim: 151 | if tt == ']' { 152 | if (containsExclusive != "") && (elementCount > 1) { 153 | return fmt.Errorf(`%s cannot be combined with other values in pattern`, containsExclusive) 154 | } 155 | pb.results = append(pb.results, &patternField{path: pathName, vals: pathVals}) 156 | return nil 157 | } else if tt == '{' { 158 | var ce string 159 | pathVals, ce, err = readSpecialPattern(pb, pathVals) 160 | if ce != "" { 161 | containsExclusive = ce 162 | } 163 | if err != nil { 164 | return err 165 | } 166 | } else { 167 | return fmt.Errorf("pattern malformed, illegal %v", tt) 168 | } 169 | case string: 170 | pathVals = append(pathVals, typedVal{vType: stringType, val: `"` + tt + `"`}) 171 | case json.Number: 172 | pathVals = append(pathVals, typedVal{vType: numberType, val: tt.String()}) 173 | case bool: 174 | if tt { 175 | pathVals = append(pathVals, typedVal{vType: literalType, val: "true"}) 176 | } else { 177 | pathVals = append(pathVals, typedVal{vType: literalType, val: "false"}) 178 | } 179 | case nil: 180 | pathVals = append(pathVals, typedVal{vType: literalType, val: "null"}) 181 | } 182 | elementCount++ 183 | } 184 | } 185 | 186 | func readSpecialPattern(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, containsExclusive string, err error) { 187 | containsExclusive 
= "" 188 | pathVals = valsIn 189 | t, err := pb.jd.Token() 190 | if err != nil { 191 | return 192 | } 193 | 194 | // tokenizer will throw an error if it's not a string 195 | tt := t.(string) 196 | switch tt { 197 | case "anything-but": 198 | containsExclusive = tt 199 | pathVals, err = readAnythingButSpecial(pb, pathVals) 200 | case "exists": 201 | containsExclusive = tt 202 | pathVals, err = readExistsSpecial(pb, pathVals) 203 | case "shellstyle": 204 | pathVals, err = readShellStyleSpecial(pb, pathVals) 205 | case "wildcard": 206 | pathVals, err = readWildcardSpecial(pb, pathVals) 207 | case "prefix": 208 | pathVals, err = readPrefixSpecial(pb, pathVals) 209 | case "equals-ignore-case": 210 | pathVals, err = readMonocaseSpecial(pb, pathVals) 211 | case "regexp": 212 | containsExclusive = tt 213 | pathVals, err = readRegexpSpecial(pb, pathVals) 214 | default: 215 | err = errors.New("unrecognized in special pattern: " + tt) 216 | } 217 | return 218 | } 219 | 220 | func readPrefixSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) { 221 | t, err := pb.jd.Token() 222 | if err != nil { 223 | return 224 | } 225 | pathVals = valsIn 226 | 227 | prefixString, ok := t.(string) 228 | if !ok { 229 | err = errors.New("value for 'prefix' must be a string") 230 | return 231 | } 232 | val := typedVal{ 233 | vType: prefixType, 234 | val: `"` + prefixString + `"`, 235 | } 236 | pathVals = append(pathVals, val) 237 | 238 | // has to be } or tokenizer will throw error 239 | _, err = pb.jd.Token() 240 | return 241 | } 242 | 243 | func readExistsSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) { 244 | t, err := pb.jd.Token() 245 | if err != nil { 246 | return 247 | } 248 | pathVals = valsIn 249 | switch tt := t.(type) { 250 | case bool: 251 | if tt { 252 | pathVals = append(pathVals, typedVal{vType: existsTrueType}) 253 | } else { 254 | pathVals = append(pathVals, typedVal{vType: existsFalseType}) 255 | } 256 | default: 257 | err 
= errors.New("value for 'exists' pattern must be true or false") 258 | return 259 | } 260 | 261 | t, err = pb.jd.Token() 262 | if err != nil { 263 | return 264 | } 265 | switch t.(type) { 266 | case json.Delim: 267 | // no-op, has to be } 268 | default: 269 | err = errors.New("trailing garbage in 'existsMatches' pattern") 270 | } 271 | return 272 | } 273 | -------------------------------------------------------------------------------- /pattern_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestPatternErrorHandling(t *testing.T) { 8 | _, err := patternFromJSON([]byte{}) 9 | if err == nil { 10 | t.Error("accepted empty pattern") 11 | } 12 | _, err = patternFromJSON([]byte("33")) 13 | if err == nil { 14 | t.Error("accepted non-object JSON text") 15 | } 16 | _, err = patternFromJSON([]byte("{")) 17 | if err == nil { 18 | t.Error("accepted stub JSON object") 19 | } 20 | _, err = patternFromJSON([]byte("{ =")) 21 | if err == nil { 22 | t.Error("accepted malformed JSON object") 23 | } 24 | _, err = patternFromJSON([]byte(`{ "foo": `)) 25 | if err == nil { 26 | t.Error("accepted stub JSON object") 27 | } 28 | _, err = patternFromJSON([]byte(`{ "foo": [`)) 29 | if err == nil { 30 | t.Error("accepted stub JSON array") 31 | } 32 | 33 | _, err = patternFromJSON([]byte(`{ "foo": [ { "exists" == ] }`)) 34 | if err == nil { 35 | t.Error("accepted stub JSON array") 36 | } 37 | 38 | _, err = patternFromJSON([]byte(`{ "foo": [ { "exists": false . 
] }`)) 39 | if err == nil { 40 | t.Error("accepted stub JSON array") 41 | } 42 | } 43 | 44 | func TestPatternFromJSON(t *testing.T) { 45 | bads := []string{ 46 | `x`, 47 | `{"foo": ]`, 48 | `{"foo": 11 }`, 49 | `{"foo": "x" }`, 50 | `{"foo": true}`, 51 | `{"foo": null}`, 52 | `{"oof": [ ]`, 53 | `[33,22]`, 54 | `{"xxx": { }`, 55 | `{"xxx": [ [ 22 ] }`, 56 | `{"xxx": [ {"x": 1} ]`, 57 | `{"xxx": [ { [`, 58 | `{"xxx": [ { "exists": 23 } ] }`, 59 | `{"xxx": [ { "exists": true }, 15 ] }`, 60 | `{"xxx": [ { "exists": true, "a": 3 }] }`, 61 | `{"xxx": [ { "exists": false, "x": ["a", 3 ] }] }`, 62 | `{"abc": [ {"shellstyle":15} ] }`, 63 | `{"abc": [ {"shellstyle":"15"] ] }`, 64 | `{"abc": [ {"shellstyle":"15", "x", 1} ] }`, 65 | `{"abc": [ {"shellstyle":"a**b"}, "foo" ] }`, 66 | `{"abc": [ {"prefix":23}, "foo" ] }`, 67 | `{"abc": [ {"prefix":["a", "b"]}, "foo" ] }`, 68 | `{"abc": [ {"prefix": - }, "foo" ] }`, 69 | `{"abc": [ {"prefix": - "a" }, "foo" ] }`, 70 | `{"abc": [ {"prefix": "a" {, "foo" ] }`, 71 | `{"abc": [ {"equals-ignore-case":23}, "foo" ] }`, 72 | `{"abc": [ {"wildcard":"15", "x", 1} ] }`, 73 | `{"abc": [ {"wildcard":"a**b"}, "foo" ] }`, 74 | `{"abc": [ {"wildcard":"a\\b"}, "foo" ] }`, // after JSON parsing, code sees `a/b` 75 | `{"abc": [ {"wildcard":"a\\"}, "foo" ] }`, // after JSON parsing, code sees `a\` 76 | "{\"a\": [ { \"anything-but\": { \"equals-ignore-case\": [\"1\", \"2\" \"3\"] } } ] }", // missing , 77 | "{\"a\": [ { \"anything-but\": { \"equals-ignore-case\": [1, 2, 3] } } ] }", // no numbers 78 | "{\"a\": [ { \"anything-but\": { \"equals-ignore-case\": [\"1\", \"2\" } } ] }", // missing ] 79 | "{\"a\": [ { \"anything-but\": { \"equals-ignore-case\": [\"1\", \"2\" ] } ] }", // missing } 80 | "{\"a\": [ { \"equals-ignore-case\": 5 } ] }", 81 | "{\"a\": [ { \"equals-ignore-case\": [ \"abc\" ] } ] }", 82 | } 83 | for _, b := range bads { 84 | _, err := patternFromJSON([]byte(b)) 85 | if err == nil { 86 | t.Error("accepted bad pattern: " + b) 87 | } 
88 | } 89 | 90 | goods := []string{ 91 | `{"x": [ 2 ]}`, 92 | `{"x": [ null, true, false, "hopp", 3.072e-11] }`, 93 | `{"x": { "a": [27, 28], "b": { "m": [ "a", "b" ] } } }`, 94 | `{"x": [ {"exists": true} ] }`, 95 | `{"x": { "y": [ {"exists": false} ] } }`, 96 | `{"abc": [ 3, {"shellstyle":"a*b"} ] }`, 97 | `{"abc": [ {"shellstyle":"a*b"}, "foo" ] }`, 98 | `{"abc": [ {"shellstyle":"a*b*c"} ] }`, 99 | `{"x": [ {"equals-ignore-case":"a*b*c"} ] }`, 100 | `{"abc": [ 3, {"wildcard":"a*b"} ] }`, 101 | `{"abc": [ {"wildcard":"a*b"}, "foo" ] }`, 102 | `{"abc": [ {"wildcard":"a*b*c"} ] }`, 103 | `{"abc": [ {"wildcard":"a*b\\*c"} ] }`, 104 | } 105 | w1 := []*patternField{{path: "x", vals: []typedVal{{vType: numberType, val: "2"}}}} 106 | w2 := []*patternField{{path: "x", vals: []typedVal{ 107 | {literalType, "null", nil, nil}, 108 | {literalType, "true", nil, nil}, 109 | {literalType, "false", nil, nil}, 110 | {stringType, `"hopp"`, nil, nil}, 111 | {numberType, "3.072e-11", nil, nil}, 112 | }}} 113 | w3 := []*patternField{ 114 | {path: "x\na", vals: []typedVal{ 115 | {numberType, "27", nil, nil}, 116 | {numberType, "28", nil, nil}, 117 | }}, 118 | {path: "x\nb\nm", vals: []typedVal{ 119 | {stringType, `"a"`, nil, nil}, 120 | {stringType, `"b"`, nil, nil}, 121 | }}, 122 | } 123 | w4 := []*patternField{ 124 | { 125 | path: "x", vals: []typedVal{ 126 | {vType: existsTrueType, val: ""}, 127 | }, 128 | }, 129 | } 130 | w5 := []*patternField{ 131 | { 132 | path: "x\ny", vals: []typedVal{ 133 | {vType: existsFalseType, val: ""}, 134 | }, 135 | }, 136 | } 137 | w6 := []*patternField{ 138 | { 139 | path: "abc", vals: []typedVal{ 140 | {vType: stringType, val: "3"}, 141 | {vType: shellStyleType, val: `"a*b"`}, 142 | }, 143 | }, 144 | } 145 | w7 := []*patternField{ 146 | { 147 | path: "abc", vals: []typedVal{ 148 | {vType: shellStyleType, val: `"a*b"`}, 149 | {vType: stringType, val: `"foo"`}, 150 | }, 151 | }, 152 | } 153 | w8 := []*patternField{ 154 | { 155 | path: "abc", vals: 
[]typedVal{ 156 | {vType: shellStyleType, val: `"a*b*c"`}, 157 | }, 158 | }, 159 | } 160 | w9 := []*patternField{ 161 | { 162 | path: "x", vals: []typedVal{ 163 | {vType: monocaseType, val: `"a*b*c"`}, 164 | }, 165 | }, 166 | } 167 | w10 := []*patternField{ 168 | { 169 | path: "abc", vals: []typedVal{ 170 | {vType: stringType, val: "3"}, 171 | {vType: wildcardType, val: `"a*b"`}, 172 | }, 173 | }, 174 | } 175 | w11 := []*patternField{ 176 | { 177 | path: "abc", vals: []typedVal{ 178 | {vType: wildcardType, val: `"a*b"`}, 179 | {vType: stringType, val: `"foo"`}, 180 | }, 181 | }, 182 | } 183 | w12 := []*patternField{ 184 | { 185 | path: "abc", vals: []typedVal{ 186 | {vType: wildcardType, val: `"a*b*c"`}, 187 | }, 188 | }, 189 | } 190 | w13 := []*patternField{ 191 | { 192 | path: "abc", vals: []typedVal{ 193 | {vType: wildcardType, val: `"a*b\*c"`}, 194 | }, 195 | }, 196 | } 197 | wanted := [][]*patternField{w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13} 198 | 199 | for i, good := range goods { 200 | fields, err := patternFromJSON([]byte(good)) 201 | if err != nil { 202 | t.Error("pattern:" + good + ": " + err.Error()) 203 | } 204 | w := wanted[i] 205 | if len(w) != len(fields) { 206 | t.Errorf("at %d len(w)=%d, len(fields)=%d", i, len(w), len(fields)) 207 | } 208 | for j, ww := range w { 209 | if ww.path != fields[j].path { 210 | t.Error("pathSegments mismatch: " + ww.path + "/" + fields[j].path) 211 | } 212 | for k, www := range ww.vals { 213 | if www.val != fields[j].vals[k].val { 214 | t.Errorf("At [%d][%d], val mismatch %s/%s", j, k, www.val, fields[j].vals[k].val) 215 | } 216 | } 217 | } 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /prettyprinter.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "strings" 7 | ) 8 | 9 | // printer is an interface used to generate representations of Quamina data 
structures to facilitate 10 | // debugging and optimization. It's an interface rather than a type so that a null implementation can 11 | // be provided for production that should incur very little performance cost. 12 | type printer interface { 13 | labelTable(table *smallTable, label string) 14 | printNFA(table *smallTable) string 15 | shortPrintNFA(table *smallTable) string 16 | } 17 | 18 | // nullPrinter is what the name says, a do-nothing implementation of the printer interface which ideally 19 | // should consume close to zero CPU cycles. 20 | type nullPrinter struct{} 21 | 22 | const noPP = "prettyprinting not enabled" 23 | 24 | func (*nullPrinter) labelTable(_ *smallTable, _ string) { 25 | } 26 | func (*nullPrinter) printNFA(_ *smallTable) string { 27 | return noPP 28 | } 29 | func (*nullPrinter) shortPrintNFA(_ *smallTable) string { 30 | return noPP 31 | } 32 | 33 | var sharedNullPrinter = &nullPrinter{} 34 | 35 | // prettyPrinter makes a human-readable representation of a NFA; each smallTable may be 36 | // given a label and as a side effect will get a random 3-digit serial number. 
For an example 37 | // of the output, see the functions TestPP and TestNullPP in prettyprinter_test.go 38 | type prettyPrinter struct { 39 | randInts rand.Source 40 | tableLabels map[*smallTable]string 41 | tableSerials map[*smallTable]uint 42 | } 43 | 44 | func newPrettyPrinter(seed int) *prettyPrinter { 45 | return &prettyPrinter{ 46 | randInts: rand.NewSource(int64(seed)), 47 | tableLabels: make(map[*smallTable]string), 48 | tableSerials: make(map[*smallTable]uint), 49 | } 50 | } 51 | 52 | func (pp *prettyPrinter) tableSerial(t *smallTable) uint { 53 | return pp.tableSerials[t] 54 | } 55 | func (pp *prettyPrinter) tableLabel(t *smallTable) string { 56 | return pp.tableLabels[t] 57 | } 58 | 59 | func (pp *prettyPrinter) labelTable(table *smallTable, label string) { 60 | pp.tableLabels[table] = label 61 | newSerial := pp.randInts.Int63()%500 + 500 62 | //nolint:gosec 63 | pp.tableSerials[table] = uint(newSerial) 64 | } 65 | 66 | func (pp *prettyPrinter) printNFA(t *smallTable) string { 67 | return pp.printNFAStep(&faState{table: t}, 0, make(map[*smallTable]bool)) 68 | } 69 | 70 | func (pp *prettyPrinter) printNFAStep(fas *faState, indent int, already map[*smallTable]bool) string { 71 | t := fas.table 72 | trailer := "\n" 73 | if len(fas.fieldTransitions) != 0 { 74 | trailer = fmt.Sprintf(" [%d transition(s)]\n", len(fas.fieldTransitions)) 75 | } 76 | s := " " + pp.printTable(t) + trailer 77 | for _, step := range t.steps { 78 | if step != nil { 79 | for _, state := range step.states { 80 | _, ok := already[state.table] 81 | if !ok { 82 | already[state.table] = true 83 | s += pp.printNFAStep(state, indent+1, already) 84 | } 85 | } 86 | } 87 | } 88 | return s 89 | } 90 | 91 | func (pp *prettyPrinter) printTable(t *smallTable) string { 92 | // going to build a string rep of a smallTable based on the unpacked form 93 | // each line is going to be a range like 94 | // 'c' .. 
'e' => %X 95 | // lines where the *faNext is nil are omitted 96 | // TODO: Post-nfa-rationalization, I don't think the whole defTrans thing is necessary any more? 97 | var rows []string 98 | unpacked := unpackTable(t) 99 | 100 | var rangeStart int 101 | var b int 102 | 103 | defTrans := unpacked[0] 104 | 105 | // TODO: Try to generate an NFA with a state with multiple epsilons 106 | if len(t.epsilon) != 0 { 107 | fas := "" 108 | for i, eps := range t.epsilon { 109 | ep := &faNext{states: []*faState{eps}} 110 | if i != 0 { 111 | fas += ", " 112 | } 113 | fas += pp.nextString(ep) 114 | } 115 | rows = append(rows, "ε → "+fas) 116 | } 117 | for { 118 | for b < len(unpacked) && unpacked[b] == nil { 119 | b++ 120 | } 121 | if b == len(unpacked) { 122 | break 123 | } 124 | rangeStart = b 125 | lastN := unpacked[b] 126 | for b < len(unpacked) && unpacked[b] == lastN { 127 | b++ 128 | } 129 | if lastN != defTrans { 130 | row := "" 131 | if b == rangeStart+1 { 132 | row += fmt.Sprintf("'%s'", branchChar(byte(rangeStart))) 133 | } else { 134 | row += fmt.Sprintf("'%s'…'%s'", branchChar(byte(rangeStart)), branchChar(byte(b-1))) 135 | } 136 | row += " → " + pp.nextString(lastN) 137 | rows = append(rows, row) 138 | } 139 | } 140 | serial := pp.tableSerial(t) 141 | label := pp.tableLabel(t) 142 | if defTrans != nil { 143 | dtString := "★ → " + pp.nextString(defTrans) 144 | return fmt.Sprintf("%d[%s] ", serial, label) + strings.Join(rows, " / ") + " / " + dtString 145 | } else { 146 | return fmt.Sprintf("%d[%s] ", serial, label) + strings.Join(rows, " / ") 147 | } 148 | } 149 | 150 | func (pp *prettyPrinter) nextString(n *faNext) string { 151 | var snames []string 152 | for _, step := range n.states { 153 | snames = append(snames, fmt.Sprintf("%d[%s]", 154 | pp.tableSerial(step.table), pp.tableLabel(step.table))) 155 | } 156 | return strings.Join(snames, " · ") 157 | } 158 | 159 | func branchChar(b byte) string { 160 | replaceStr := []string{ 161 | "nul", "soh", "stx", "etx", 
"eot", "enq", "ack", "bel", "bs", "ht", "nl", "vt", "np", "cr", "so", "si", "dle", 162 | "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb", "can", "em", "sub", "esc", "fs", "gs", "rs", "us", "sp", 163 | "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", 164 | "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", 165 | ":", ";", "<", "=", ">", "?", "@", 166 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", 167 | "S", "T", "U", "V", "W", "X", "Y", "Z", 168 | "[", "\\", "]", "^", "_", "`", 169 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", 170 | "s", "t", "u", "v", "w", "x", "y", "z", 171 | "{", "|", "}", "~", "del"} 172 | switch b { 173 | // TODO: Figure out how to test commented-out cases 174 | case valueTerminator: 175 | return fmt.Sprintf("%x/ℵ", valueTerminator) 176 | default: 177 | if b < 128 { 178 | return fmt.Sprintf("%x/%s", b, replaceStr[b]) 179 | } else { 180 | return fmt.Sprintf("%x/", b) 181 | } 182 | } 183 | } 184 | 185 | func (pp *prettyPrinter) shortPrintNFA(table *smallTable) string { 186 | return fmt.Sprintf("%d[%s]", pp.tableSerials[table], pp.tableLabels[table]) 187 | } 188 | -------------------------------------------------------------------------------- /prettyprinter_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestPP(t *testing.T) { 8 | pp := newPrettyPrinter(1) 9 | table, _ := makeShellStyleFA([]byte(`"x*9"`), pp) 10 | pp.labelTable(table, "START HERE") 11 | wanted := ` 758[START HERE] '22/"' → 910[on " at 0] 12 | 910[on " at 0] '78/x' → 821[gS at 2] 13 | 821[gS at 2] ε → 821[gS at 2] / '39/9' → 551[gX on 9 at 3] 14 | 551[gX on 9 at 3] '22/"' → 937[on " at 4] 15 | 937[on " at 4] 'f5/ℵ' → 820[last step at 5] 16 | 820[last step at 5] [1 transition(s)] 17 | ` 18 | s := pp.printNFA(table) 19 | if s != wanted { 20 | t.Errorf("LONG: 
wanted\n<%s>\ngot\n<%s>\n", wanted, s) 21 | } 22 | if pp.shortPrintNFA(table) != "758[START HERE]" { 23 | t.Errorf("SHORT: wanted <%s> got <%s>\n", "758[START HERE]", pp.shortPrintNFA(table)) 24 | } 25 | } 26 | 27 | func TestNullPP(t *testing.T) { 28 | np := &nullPrinter{} 29 | table := newSmallTable() 30 | table.addByteStep(3, &faNext{}) 31 | np.labelTable(table, "foo") 32 | if np.printNFA(table) != noPP || np.shortPrintNFA(table) != noPP { 33 | t.Error("didn't get noPP") 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /quamina.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | // Quamina instances provide the public APIs of this pattern-matching library. A single Quamina instance is 9 | // not thread-safe in that it cannot safely be used simultaneously in multiple goroutines. To re-use a 10 | // Quamina instance concurrently in multiple goroutines, create copies using the Copy API. 11 | type Quamina struct { 12 | flattener Flattener 13 | matcher matcher 14 | mediaTypeSpecified bool 15 | deletionSpecified bool 16 | } 17 | 18 | // Option is an interface type used in Quamina's New API to pass in options. By convention, Option names 19 | // have a prefix of "With". 20 | type Option func(q *Quamina) error 21 | 22 | // WithMediaType provides a media-type to support the selection of an appropriate Flattener. 23 | // This option call may not be provided more than once, nor can it be combined on the same 24 | // invocation of quamina.New() with the WithFlattener() option. 
25 | func WithMediaType(mediaType string) Option { 26 | return func(q *Quamina) error { 27 | if q.flattener != nil { 28 | return errors.New("flattener already specified") 29 | } 30 | if q.mediaTypeSpecified { 31 | return errors.New("media-type specified more than once") 32 | } 33 | switch mediaType { 34 | case "application/json": 35 | q.flattener = newJSONFlattener() 36 | default: 37 | return fmt.Errorf(`media type "%s" is not supported by Quamina`, mediaType) 38 | } 39 | q.mediaTypeSpecified = true 40 | return nil 41 | } 42 | } 43 | 44 | // WithFlattener allows the specification of a caller-provided Flattener instance to use on incoming Events. 45 | // This option call may not be provided more than once, nor can it be combined on the same 46 | // invocation of quamina.New() with the WithMediaType() option. 47 | func WithFlattener(f Flattener) Option { 48 | return func(q *Quamina) error { 49 | if q.mediaTypeSpecified { 50 | return errors.New("media-type already specified") 51 | } 52 | if q.flattener != nil { 53 | return errors.New("flattener specified more than once") 54 | } 55 | if f == nil { 56 | return errors.New("nil Flattener") 57 | } 58 | q.flattener = f 59 | return nil 60 | } 61 | } 62 | 63 | // WithPatternDeletion arranges, if the argument is true, that this Quamina instance will support 64 | // the DeletePatterns() method. This option call may not be provided more than once. 65 | func WithPatternDeletion(b bool) Option { 66 | return func(q *Quamina) error { 67 | if q.deletionSpecified { 68 | return errors.New("pattern deletion already specified") 69 | } 70 | if b { 71 | q.matcher = newPrunerMatcher(nil) 72 | } else { 73 | q.matcher = newCoreMatcher() 74 | } 75 | q.deletionSpecified = true 76 | return nil 77 | } 78 | } 79 | 80 | // WithPatternStorage supplies the Quamina instance with a LivePatternState 81 | // instance to be used to store the active patterns, i.e. those that have been 82 | // added with AddPattern but not deleted with DeletePattern. 
This option call 83 | // may not be provided more than once. 84 | func WithPatternStorage(ps LivePatternsState) Option { 85 | return func(q *Quamina) error { 86 | if ps == nil { 87 | return errors.New("null PatternStorage") 88 | } 89 | return errors.New(" Pattern storage option not implemented yet") 90 | } 91 | } 92 | 93 | // New returns a new Quamina instance. Consult the APIs beginning with “With” for the options 94 | // that may be used to configure the new instance. 95 | func New(opts ...Option) (*Quamina, error) { 96 | var q Quamina 97 | for _, option := range opts { 98 | if err := option(&q); err != nil { 99 | return nil, err 100 | } 101 | } 102 | if (!q.mediaTypeSpecified) && (q.flattener == nil) { 103 | q.flattener = newJSONFlattener() 104 | } 105 | if !q.deletionSpecified { 106 | q.matcher = newCoreMatcher() 107 | } 108 | return &q, nil 109 | } 110 | 111 | // Copy produces a new Quamina instance designed to be used safely in parallel with existing instances on different 112 | // goroutines. Copy'ed instances share the same underlying data structures, so a pattern added to any instance 113 | // with AddPattern will be visible in all of them. 114 | func (q *Quamina) Copy() *Quamina { 115 | return &Quamina{matcher: q.matcher, flattener: q.flattener.Copy()} 116 | } 117 | 118 | // X is used in the AddPattern and MatchesForEvent APIs to identify the patterns that are added to 119 | // a Quamina instance and are reported by that instance as matching an event. Commonly, X is a string 120 | // used to name the pattern. 121 | type X any 122 | 123 | // AddPattern adds a pattern, identified by the x argument, to a Quamina instance. 124 | // patternJSON is a JSON object. error is returned in the case that the PatternJSON is invalid JSON or 125 | // has a leaf which is not provided as an array. 
AddPattern is single-threaded; if it is invoked concurrently 126 | // from multiple goroutines (in instances created using the Copy method) calls will block until any other 127 | // AddPattern call in progress succeeds. 128 | func (q *Quamina) AddPattern(x X, patternJSON string) error { 129 | return q.matcher.addPattern(x, patternJSON) 130 | } 131 | 132 | // DeletePatterns removes patterns identified by the x argument from the Quamina instance; the effect 133 | // is that return values from future calls to MatchesForEvent will not include this x value. 134 | func (q *Quamina) DeletePatterns(x X) error { 135 | return q.matcher.deletePatterns(x) 136 | } 137 | 138 | // MatchesForEvent returns a slice of X values which identify patterns that have previously been added to this 139 | // Quamina instance and which “match” the event in the sense described in README. The matches slice may be empty 140 | // if no patterns match. error can be returned in case that the event is not a valid JSON object or contains 141 | // invalid UTF-8 byte sequences. 142 | func (q *Quamina) MatchesForEvent(event []byte) ([]X, error) { 143 | fields, err := q.flattener.Flatten(event, q.matcher.getSegmentsTreeTracker()) 144 | if err != nil { 145 | return nil, err 146 | } 147 | return q.matcher.matchesForFields(fields) 148 | } 149 | -------------------------------------------------------------------------------- /quamina_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestCopy(t *testing.T) { 10 | q, err := New() 11 | if err != nil { 12 | t.Error("New? 
" + err.Error()) 13 | } 14 | q2 := q.Copy() 15 | if q2.matcher != q.matcher || q2.flattener == q.flattener { 16 | t.Error("improper copy") 17 | } 18 | } 19 | 20 | func TestNewQOptions(t *testing.T) { 21 | var q *Quamina 22 | var err error 23 | var ok bool 24 | q, err = New(WithMediaType("application/json")) 25 | if err != nil { 26 | t.Error(err.Error()) 27 | } 28 | _, ok = q.flattener.(*flattenJSON) 29 | if !ok { 30 | t.Error("Should be flattenJSON") 31 | } 32 | _, err = New(WithMediaType("text/html")) 33 | if err == nil { 34 | t.Error("accepted text/html") 35 | } 36 | q, err = New(WithFlattener(newJSONFlattener())) 37 | if err != nil { 38 | t.Error(err.Error()) 39 | } 40 | _, ok = q.flattener.(*flattenJSON) 41 | if !ok { 42 | t.Error("should be flattenJSON") 43 | } 44 | _, err = New(WithFlattener(nil)) 45 | if err == nil { 46 | t.Error("accepted nil flattener") 47 | } 48 | _, err = New(WithPatternStorage(nil)) 49 | if err == nil { 50 | t.Error("accepted WIthPatternStorage") 51 | } 52 | q, err = New(WithPatternDeletion(true)) 53 | if err != nil { 54 | t.Error("didn't take PatternDeletion(true") 55 | } 56 | _, ok = q.matcher.(*prunerMatcher) 57 | if !ok { 58 | t.Error("should be pruner") 59 | } 60 | q, err = New(WithPatternDeletion(false)) 61 | if err != nil { 62 | t.Error("didn't take PatternDeletion(false") 63 | } 64 | _, ok = q.matcher.(*coreMatcher) 65 | if !ok { 66 | t.Error("should be core") 67 | } 68 | 69 | _, err = New(WithPatternDeletion(true), WithPatternDeletion(true)) 70 | if err == nil { 71 | t.Error("allowed 2 patternDel" + err.Error()) 72 | } 73 | _, err = New(WithFlattener(newJSONFlattener()), WithFlattener(newJSONFlattener())) 74 | if err == nil { 75 | t.Error("allowed 2 flatteners" + err.Error()) 76 | } 77 | _, err = New(WithMediaType("application/json"), WithMediaType("application/json")) 78 | if err == nil { 79 | t.Error("allowed 2 mediatypes" + err.Error()) 80 | } 81 | _, err = New(WithMediaType("application/json"), 
WithFlattener(newJSONFlattener())) 82 | if err == nil { 83 | t.Error("allowed flattener and media type" + err.Error()) 84 | } 85 | q, err = New(WithPatternDeletion(true)) 86 | if err != nil { 87 | t.Error("WithPatternDeletion failed: " + err.Error()) 88 | } 89 | _, ok = q.matcher.(*prunerMatcher) 90 | if !ok { 91 | t.Error("not a pruner matcher") 92 | } 93 | _, ok = q.flattener.(*flattenJSON) 94 | if !ok { 95 | t.Error("flattener not for JSON") 96 | } 97 | } 98 | 99 | // reduced to allow unit tests in slow GitHub actions to pass 100 | // const thresholdPerformance = 120000.0 101 | const thresholdPerformance = 1.0 102 | 103 | // TestCityLots is the benchmark that was used in most of Quamina's performance tuning. It's fairly pessimal in 104 | // that it uses geometry/co-ordintes, which will force the fj flattener to process the big arrays of numbers in 105 | // each line. A high proportion of typical Quamina workloads should run faster. 106 | func TestCityLots(t *testing.T) { 107 | patterns := []string{ 108 | `{ "properties": { "STREET": [ "CRANLEIGH" ] } }`, 109 | `{ "properties": { "STREET": [ "17TH" ], "ODD_EVEN": [ "E"] } }`, 110 | `{ "geometry": { "coordinates": [ 37.807807921694092 ] } }`, 111 | `{ "properties": { "MAPBLKLOT": ["0011008"], "BLKLOT": ["0011008"]}, "geometry": { "coordinates": [ 37.807807921694092 ] } } `, 112 | } 113 | names := []string{ 114 | "CRANLEIGH", 115 | "17TH Even", 116 | "Geometry", 117 | "0011008", 118 | } 119 | wanted := map[X]int{ 120 | "CRANLEIGH": 7, 121 | "17TH Even": 836, 122 | "Geometry": 2, 123 | "0011008": 1, 124 | } 125 | 126 | var err error 127 | q, err := New() 128 | if err != nil { 129 | t.Error("New(): " + err.Error()) 130 | } 131 | for i := range names { 132 | err = q.AddPattern(names[i], patterns[i]) 133 | if err != nil { 134 | t.Error("Addpattern: " + err.Error()) 135 | } 136 | } 137 | results := make(map[X]int) 138 | fmt.Println(matcherStats(q.matcher.(*coreMatcher))) 139 | 140 | lines := getCityLotsLines(t) 141 | 
before := time.Now() 142 | for _, line := range lines { 143 | matches, err := q.MatchesForEvent(line) 144 | if err != nil { 145 | t.Error("Matches4JSON: " + err.Error()) 146 | } 147 | for _, match := range matches { 148 | count, ok := results[match] 149 | if !ok { 150 | count = 0 151 | } 152 | results[match] = count + 1 153 | } 154 | } 155 | fmt.Println() 156 | 157 | elapsed := float64(time.Since(before).Milliseconds()) 158 | perSecond := float64(cityLotsLineCount) / (elapsed / 1000.0) 159 | fmt.Printf("%.2f matches/second\n\n", perSecond) 160 | 161 | if perSecond < thresholdPerformance { 162 | message1 := fmt.Sprintf("Events-per-second benchmark ran at %.0f events per second, below threshold of %.0f.", 163 | perSecond, thresholdPerformance) 164 | message2 := ` 165 | It may be that re-running the benchmark test will address this, or it may be that you're running on a machine 166 | that is slower than the one the software was developed on, in which case you might want to readjust the 167 | "thresholdPerformance" constant. 
However, it may be that you made a change that reduced the throughput of the 168 | library, which would be unacceptable.` 169 | t.Error(message1 + message2) 170 | } 171 | 172 | if len(results) != len(wanted) { 173 | t.Errorf("got %d results, wanted %d", len(results), len(wanted)) 174 | } 175 | for match, count := range results { 176 | if count != wanted[match] { 177 | t.Errorf("For %s, wanted=%d, result=%d", match, wanted[match], count) 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /race_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "math/rand" 7 | "sync" 8 | "testing" 9 | ) 10 | 11 | func TestConcurrencyCore(t *testing.T) { 12 | testConcurrency(t, newCoreMatcher()) 13 | } 14 | 15 | func testConcurrency(t *testing.T, m matcher) { 16 | t.Helper() 17 | 18 | var ( 19 | goroutines = 4 20 | n = 500 21 | tasks = 6 22 | ) 23 | 24 | log.Printf("TestConcurrency %T goroutines: %d, tasks: %d", 25 | m, goroutines, tasks) 26 | 27 | populate := func() { 28 | for i := 0; i < n; i++ { 29 | p := fmt.Sprintf(`{"like":["tacos","queso"],"want":[%d]}`, i) 30 | if err := m.addPattern(i, p); err != nil { 31 | t.Fatal(err) 32 | } 33 | } 34 | } 35 | 36 | query := func(verify bool) { 37 | f := newJSONFlattener() 38 | 39 | for i := 0; i < n; i++ { 40 | e := fmt.Sprintf(`{"like":"tacos","want":%d}`, i) 41 | fs, err := f.Flatten([]byte(e), m.(*coreMatcher).getSegmentsTreeTracker()) 42 | if err != nil { 43 | t.Fatal(err) 44 | } 45 | if got, err := m.matchesForFields(fs); err != nil { 46 | t.Fatal(err) 47 | } else if verify && len(got) != 1 { 48 | t.Fatal(got) 49 | } 50 | } 51 | } 52 | 53 | wg := sync.WaitGroup{} 54 | for i := 0; i < goroutines; i++ { 55 | wg.Add(1) 56 | go func(i int) { 57 | // We defer to get Done called after a t.Fatal(). 
58 | defer wg.Done() 59 | for j, k := range rand.Perm(tasks) { 60 | switch k { 61 | case 0, 1: 62 | populate() 63 | // case 1: 64 | // depopulate() 65 | default: 66 | query(false) 67 | } 68 | log.Printf("task %d,%d (%d) complete", i, j, k) 69 | } 70 | }(i) 71 | } 72 | wg.Wait() 73 | } 74 | -------------------------------------------------------------------------------- /rebuilding.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | // This file contains some experimental rebuildWhileLocked policies that are not 4 | // currently used anywhere. Here just for examples and possible 5 | // future use. 6 | 7 | // liveRatioTrigger's rebuild function returns true when there are at 8 | // least MinLive live patterns and the ratio of removed to live 9 | // patterns is greater than 1. 10 | // 11 | // This type is not used anywhere; just here as an example and maybe 12 | // for future consideration. 13 | type liveRatioTrigger struct { 14 | Ratio float64 15 | MinLive int 16 | } 17 | 18 | func newLiveRatioTrigger(ratio float64, minimum int) *liveRatioTrigger { 19 | return &liveRatioTrigger{ 20 | Ratio: ratio, 21 | MinLive: minimum, 22 | } 23 | } 24 | 25 | func (t *liveRatioTrigger) rebuild(added bool, s *prunerStats) bool { 26 | if added { 27 | return false 28 | } 29 | live := s.Live - s.Deleted 30 | if live == 0 { 31 | return false 32 | } 33 | if live < t.MinLive { 34 | return false 35 | } 36 | return t.Ratio <= float64(s.Deleted)/float64(live) 37 | } 38 | 39 | // neverTrigger is a rebuildTrigger that will never trigger a rebuild. 40 | // 41 | // Setting prunerMatcher.rebuildTrigger to nil will have the same effect. 42 | // 43 | // This type is not used anywhere; just here as an example and maybe 44 | // for future consideration. 
45 | type neverTrigger struct{} 46 | 47 | func newNeverTrigger() *neverTrigger { 48 | return &neverTrigger{} 49 | } 50 | 51 | func (t *neverTrigger) rebuild(added bool, s *prunerStats) bool { 52 | return false 53 | } 54 | -------------------------------------------------------------------------------- /rebuilding_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestLiveRatioTrigger(t *testing.T) { 9 | r := newLiveRatioTrigger(0.5, 2) 10 | 11 | s := &prunerStats{} 12 | 13 | if r.rebuild(false, s) { 14 | t.Fatal("shouldn't have fired") 15 | } 16 | 17 | s.Live = 5 18 | s.Deleted = 3 19 | 20 | if r.rebuild(true, s) { 21 | t.Fatal("shouldn't have fired") 22 | } 23 | 24 | if !r.rebuild(false, s) { 25 | t.Fatal("should have fired") 26 | } 27 | 28 | s.Live = 1 29 | if r.rebuild(false, s) { 30 | t.Fatal("shouldn't have fired") 31 | } 32 | } 33 | 34 | func TestNeverTrigger(t *testing.T) { 35 | r := newNeverTrigger() 36 | s := &prunerStats{ 37 | Live: 42, 38 | Deleted: 17, 39 | } 40 | if r.rebuild(false, s) { 41 | t.Fatal("you only had one job") 42 | } 43 | } 44 | 45 | // sane verifies that certain prunerStats are not negative. 46 | // 47 | // The types in question aren't uint(64) but maybe they should be. 
48 | func (s prunerStats) sane() error { 49 | if s.Live < 0 { 50 | return fmt.Errorf("prunerStats.Live is negative") 51 | } 52 | 53 | if s.Added < 0 { 54 | return fmt.Errorf("prunerStats.Added is negative") 55 | } 56 | 57 | if s.Deleted < 0 { 58 | return fmt.Errorf("prunerStats.Deleted is negative") 59 | } 60 | 61 | if s.Filtered < 0 { 62 | return fmt.Errorf("prunerStats.Filtered is negative") 63 | } 64 | 65 | return nil 66 | } 67 | 68 | func (m *prunerMatcher) checkStats() error { 69 | return m.getStats().sane() 70 | } 71 | -------------------------------------------------------------------------------- /regexp_end2end_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | type rxTest struct { 9 | rx string 10 | matches []string 11 | nonMatches []string 12 | } 13 | 14 | func TestRegexpEnd2End(t *testing.T) { 15 | // somewhat duplicative of the samples-based regexp_validity_test but worth 16 | // doing just to check for merge problems. 
17 | allPatternsCM := newCoreMatcher() 18 | 19 | tests := []rxTest{ 20 | {rx: "a|b", matches: []string{"a", "b"}, nonMatches: []string{"x", "Á"}}, 21 | {rx: "a", matches: []string{"a"}, nonMatches: []string{"b", ""}}, 22 | {rx: "a.b", matches: []string{"axb", "a.b", "aÉb"}, nonMatches: []string{"ab", "axxb"}}, 23 | {rx: "abc|def", matches: []string{"abc", "def"}, nonMatches: []string{"x", "Á"}}, 24 | {rx: "[hij]", matches: []string{"h", "i", "j"}, nonMatches: []string{"x", "Á"}}, 25 | {rx: "a[e-g]x", matches: []string{"aex", "afx", "agx"}, nonMatches: []string{"ax", "axx"}}, 26 | {rx: "[ae-gx]", matches: []string{"a", "e", "f", "g", "x"}, nonMatches: []string{"b", "Á"}}, 27 | {rx: "[-ab]", matches: []string{"-", "a", "b"}, nonMatches: []string{"c", "Á"}}, 28 | {rx: "[ab-]", matches: []string{"-", "a", "b"}, nonMatches: []string{"c", "Á"}}, 29 | {rx: "[~[~]]", matches: []string{"[", "]"}, nonMatches: []string{"", "Á"}}, 30 | {rx: "[~r~t~n]", matches: []string{"\\r", "\\t", "\\n"}, nonMatches: []string{"c", "Á"}}, 31 | {rx: "[a-c]|[xz]", matches: []string{"a", "b", "c", "x", "z"}, nonMatches: []string{"", "Á", "w"}}, 32 | {rx: "[ac-e]h|p[xy]", matches: []string{"ah", "ch", "dh", "eh", "px", "py"}, nonMatches: []string{"", "Á", "xp"}}, 33 | {rx: "[0-9][0-9][rtn][dh]", matches: []string{"11th", "23rd", "22nd"}, nonMatches: []string{"first", "9th"}}, 34 | {rx: "a(h|i)z", matches: []string{"ahz", "aiz"}, nonMatches: []string{"a.z", "Á"}}, 35 | {rx: "a([1-3]|ac)z", matches: []string{"a1z", "a2z", "a3z", "aacz"}, nonMatches: []string{"a.z", "Á", "a0^z"}}, 36 | {rx: "a(h|([x-z]|(1|2)))z", matches: []string{"ahz", "axz", "a1z", "a2z"}, nonMatches: []string{"a.z", "Á"}}, 37 | } 38 | 39 | for _, test := range tests { 40 | cm := newCoreMatcher() 41 | pattern := fmt.Sprintf(`{"a": [{"regexp": "%s"}]}`, test.rx) 42 | err := cm.addPattern("a", pattern) 43 | if err != nil { 44 | t.Error("addP: " + err.Error()) 45 | continue 46 | } 47 | err = allPatternsCM.addPattern(pattern, 
pattern) 48 | if err != nil { 49 | t.Error("addPAll" + err.Error()) 50 | continue 51 | } 52 | for _, match := range test.matches { 53 | event := fmt.Sprintf(`{"a": "%s"}`, match) 54 | matches, err := cm.matchesForJSONEvent([]byte(event)) 55 | if err != nil { 56 | t.Error("M4JE: " + err.Error()) 57 | } 58 | if len(matches) != 1 || matches[0] != "a" { 59 | t.Errorf("%s didn't match /%s/", match, test.rx) 60 | } 61 | } 62 | for _, match := range test.nonMatches { 63 | event := fmt.Sprintf(`{"a": "%s"}`, match) 64 | matches, err := cm.matchesForJSONEvent([]byte(event)) 65 | if err != nil { 66 | t.Error("M4JE: " + err.Error()) 67 | } 68 | if len(matches) != 0 { 69 | t.Errorf("%s matched /%s/", match, test.rx) 70 | } 71 | } 72 | } 73 | // now let's see if the merged FA's work 74 | for _, test := range tests { 75 | for _, match := range test.matches { 76 | event := fmt.Sprintf(`{"a": "%s"}`, match) 77 | matches, err := allPatternsCM.matchesForJSONEvent([]byte(event)) 78 | if err != nil { 79 | t.Error("M4JE: " + err.Error()) 80 | } 81 | if len(matches) == 0 { 82 | t.Errorf("%s didn't match in merge FA", match) 83 | } 84 | pattern := fmt.Sprintf(`{"a": [{"regexp": "%s"}]}`, test.rx) 85 | if !containsX(matches, pattern) { 86 | t.Errorf("event %s should match %s", event, pattern) 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /regexp_parse.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "unicode/utf8" 6 | ) 7 | 8 | // regexpParse represents the state of a regexp read, validate, and parse project 9 | type regexpParse struct { 10 | bytes []byte 11 | index int 12 | lastIndex int 13 | nesting []regexpRoot 14 | features *regexpFeatureChecker 15 | tree regexpRoot 16 | } 17 | 18 | func (p *regexpParse) nest() { 19 | p.nesting = append(p.nesting, p.tree) 20 | p.tree = regexpRoot{} 21 | } 22 | 23 | // unNest is only called after 
isNested. We've been building up a subtree in p.tree, so we need to 24 | // save that subtree, pop whatever was on the nesting stack back into p.tree, and then return the 25 | // sub tree so it can be built into a quantifiedAtom 26 | func (p *regexpParse) unNest() regexpRoot { 27 | subtree := p.tree 28 | p.tree = p.nesting[len(p.nesting)-1] 29 | p.nesting = p.nesting[0 : len(p.nesting)-1] 30 | return subtree 31 | } 32 | 33 | func (p *regexpParse) isNested() bool { 34 | return len(p.nesting) > 0 35 | } 36 | 37 | func newRxParseState(t []byte) *regexpParse { 38 | return ®expParse{ 39 | bytes: t, 40 | features: defaultRegexpFeatureChecker(), 41 | tree: regexpRoot{}, 42 | } 43 | } 44 | 45 | func (p *regexpParse) nextRune() (rune, error) { 46 | if p.index >= len(p.bytes) { 47 | return 0, errRegexpEOF 48 | } 49 | p.lastIndex = p.index 50 | c, length := utf8.DecodeRune(p.bytes[p.index:]) 51 | if c == utf8.RuneError { 52 | return 0, fmt.Errorf("UTF-8 encoding error at offset %d", p.lastOffset()) 53 | } 54 | p.index += length 55 | return c, nil 56 | } 57 | 58 | // require checks to see if the first rune matches the supplied argument. If it fails, it doesn't back up or 59 | // recover or anything, on the assumption that you're giving up. 
60 | func (p *regexpParse) require(wanted rune) error { 61 | got, err := p.nextRune() 62 | if err != nil { 63 | return err 64 | } 65 | if got != wanted { 66 | return fmt.Errorf("incorrect character at %d; got %c wanted %c", p.lastOffset(), got, wanted) 67 | } 68 | return nil 69 | } 70 | 71 | func (p *regexpParse) bypassOptional(c rune) (bool, error) { 72 | next, err := p.nextRune() 73 | if err != nil { 74 | return false, err 75 | } 76 | if next != c { 77 | p.backup1(next) 78 | } 79 | return next == c, nil 80 | } 81 | 82 | func (p *regexpParse) backup1(oneRune rune) { 83 | p.index -= utf8.RuneLen(oneRune) 84 | } 85 | 86 | func (p *regexpParse) offset() int { 87 | return p.index 88 | } 89 | func (p *regexpParse) lastOffset() int { 90 | return p.lastIndex 91 | } 92 | 93 | func (p *regexpParse) isEmpty() bool { 94 | return p.index >= len(p.bytes) 95 | } 96 | -------------------------------------------------------------------------------- /regexp_parse_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "unicode/utf8" 7 | ) 8 | 9 | func TestBasicRunelist(t *testing.T) { 10 | bytes := []byte("foo") 11 | r := newRxParseState(bytes) 12 | for i, b := range bytes { 13 | next, err := r.nextRune() 14 | if err != nil { 15 | t.Errorf("err at %d", i) 16 | } 17 | if next != rune(b) { 18 | t.Errorf("mismatch at %d", i) 19 | } 20 | } 21 | _, err := r.nextRune() 22 | if !errors.Is(err, errRegexpEOF) { 23 | t.Error("missed EOF") 24 | } 25 | if !r.isEmpty() { 26 | t.Error("missed empty") 27 | } 28 | } 29 | 30 | func TestBadUTF8(t *testing.T) { 31 | bad := []byte{0xF8} 32 | ps := newRxParseState(bad) 33 | _, err := ps.nextRune() 34 | if err == nil { 35 | t.Error("bad UTF8") 36 | } 37 | } 38 | 39 | func TestVariablePlaneRunelist(t *testing.T) { 40 | runes := []rune{'&', 0x416, 0x4E2D, 0x10346} 41 | lengths := []int{1, 2, 3, 4} 42 | list := newRxParseState([]byte(string(runes))) 43 | 
read := 0 44 | for i := range runes { 45 | r, err := list.nextRune() 46 | read += utf8.RuneLen(r) 47 | if err != nil { 48 | t.Errorf("err at %d", i) 49 | } 50 | if r != runes[i] { 51 | t.Errorf("mismatch at %d", i) 52 | } 53 | if utf8.RuneLen(r) != lengths[i] { 54 | t.Errorf("length mismatch at %d", i) 55 | } 56 | if read != list.offset() { 57 | t.Errorf("wrong length at %d", i) 58 | } 59 | } 60 | if !list.isEmpty() { 61 | t.Error("Missed empty") 62 | } 63 | for i := 3; i >= 0; i-- { 64 | list.backup1(runes[i]) 65 | read -= utf8.RuneLen(runes[i]) 66 | if list.offset() != read { 67 | t.Errorf("wrong offset at %d", i) 68 | } 69 | } 70 | if list.offset() != 0 { 71 | t.Error("offset not 0") 72 | } 73 | } 74 | 75 | func TestRuneListRequire(t *testing.T) { 76 | r := newRxParseState([]byte("foo")) 77 | err := r.require('f') 78 | if err != nil { 79 | t.Error("require mode 1") 80 | } 81 | r = newRxParseState([]byte("foo")) 82 | err = r.require('É') 83 | if err == nil { 84 | t.Error("require mode 2") 85 | } 86 | r = newRxParseState([]byte("Éé")) 87 | err = r.require('É') 88 | if err != nil { 89 | t.Error("require mode 3") 90 | } 91 | r = newRxParseState([]byte("Éé")) 92 | err = r.require('é') 93 | if err == nil { 94 | t.Error("require mode 4") 95 | } 96 | } 97 | 98 | func TestRuneListBypass(t *testing.T) { 99 | r := newRxParseState([]byte("Éé")) 100 | _, err := r.bypassOptional('é') 101 | if err != nil { 102 | t.Error("bypass mode 1") 103 | } 104 | next, err := r.nextRune() 105 | if err != nil || next != 'É' { 106 | t.Error("bypass mode 2") 107 | } 108 | r = newRxParseState([]byte("Éé")) 109 | _, err = r.bypassOptional('x') 110 | if err != nil { 111 | t.Error("bypass mode 3") 112 | } 113 | next, err = r.nextRune() 114 | if err != nil || next != 'É' { 115 | t.Error("bypass mode 4") 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /regexp_reader_test.go: 
-------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | // NormalChar = ( %x00-27 / "," / "-" / %x2F-3E ; '/'-'>' 11 | // / %x40-5A ; '@'-'Z' 12 | // / %x5E-7A ; '^'-'z' 13 | // / %x7E-D7FF ; skip surrogate code points 14 | // / %xE000-10FFFF ) 15 | func TestIsNormalChar(t *testing.T) { 16 | normals := []rune{ 17 | 0, 1, 0x26, 0x27, 18 | 0x40, 0x41, 0x59, 0x5a, 0x5c, 19 | 0x5e, 0x5f, 0x79, 0x7a, 20 | 0x7f, 0xd7fe, 0xd7ff, 21 | 0xe000, 0xe001, 0x10fffe, 0x10ffff, 22 | } 23 | for _, normal := range normals { 24 | if !isNormalChar(normal) { 25 | t.Errorf("%x abnormal", normal) 26 | } 27 | } 28 | abormals := []rune{ 29 | 0x28, 0x2e, 0x3f, 30 | 0x3f, 0x5b, 31 | 0x5d, 0x7b, 32 | 0x7d, 0x7e, 0xd800, 33 | 0xdfff, 34 | } 35 | for _, abnormal := range abormals { 36 | if isNormalChar(abnormal) { 37 | t.Errorf("%x normal", abnormal) 38 | } 39 | } 40 | } 41 | 42 | func TestSingleCharEscape(t *testing.T) { 43 | // SingleCharEsc = "\" ( %x28-2B ; '('-'+' 44 | // / "-" / "." / "?" 
/ %x5B-5E ; '['-'^' 45 | // / %s"n" / %s"r" / %s"t" / %x7B-7D ; '{'-'}' 46 | //) 47 | sces := []rune{ 48 | 0x28, 0x29, 0x2a, 0x2b, 49 | '-', '.', '?', 0x5B, 0x5C, 0x5D, 0x5E, 50 | 'n', 'r', 't', 0x7B, 0x7C, 0x7D, 51 | '~', 52 | } 53 | for _, sce := range sces { 54 | _, ok := checkSingleCharEscape(sce) 55 | if !ok { 56 | t.Errorf("%x not sce", sce) 57 | } 58 | } 59 | notSces := []rune{ 60 | 0x27, 0x2C, 0x5A, 0x5F, 'j', 0x7A, 0x7F, 61 | } 62 | for _, notSce := range notSces { 63 | _, ok := checkSingleCharEscape(notSce) 64 | if ok { 65 | t.Errorf("%x is sce", notSce) 66 | } 67 | } 68 | } 69 | 70 | func TestReadCCE1(t *testing.T) { 71 | goods := []string{ 72 | "a", `~n-~r`, "ab", "a-b", 73 | } 74 | bads := []string{ 75 | "a-~P{Lu}", "~P{Lu}-x", 76 | } 77 | for _, good := range goods { 78 | _, err := readRegexp("[" + good + "]") 79 | if err != nil { 80 | t.Errorf("Missed good /[%s]/: %s", good, err.Error()) 81 | } 82 | } 83 | for _, bad := range bads { 84 | _, err := readRegexp("[" + bad + "]") 85 | if err == nil { 86 | t.Errorf("Missed bad %s", bad) 87 | } 88 | } 89 | } 90 | 91 | func TestRuneRangesFromCCE1(t *testing.T) { 92 | cce1s := []string{ 93 | "[ax]", "[a]", "[abc]", 94 | "[c-g]", "[ah-mq]", 95 | "[~n-~r]", 96 | "[-bdg-h]", 97 | } 98 | wanted := []RuneRange{ 99 | {{'a', 'a'}, {'x', 'x'}}, {{'a', 'a'}}, {{'a', 'c'}}, 100 | {{'c', 'g'}}, {{'a', 'a'}, {'h', 'm'}, {'q', 'q'}}, 101 | {{10, 13}}, 102 | {{'-', '-'}, {'b', 'b'}, {'d', 'd'}, {'g', 'h'}}, 103 | } 104 | for i, cce1 := range cce1s { 105 | parse := newRxParseState([]byte(cce1[1:])) 106 | rr, err := readCCE1s(parse) 107 | if err != nil { 108 | t.Error("RC: " + err.Error()) 109 | } 110 | if !runeRangeEqual(t, wanted[i], rr) { 111 | t.Errorf("Failed on %s", cce1) 112 | } 113 | } 114 | } 115 | 116 | func TestSimplifyRR(t *testing.T) { 117 | in := []RuneRange{ 118 | {{'a', 'b'}, {'e', 'j'}, {'l', 'n'}}, 119 | {{'a', 'e'}, {'b', 'e'}, {'d', 'm'}}, 120 | {{'a', 'c'}, {'d', 'r'}, {'s', 'x'}}, 121 | } 122 | wanteds 
:= []RuneRange{ 123 | {{'a', 'b'}, {'e', 'j'}, {'l', 'n'}}, 124 | {{'a', 'm'}}, 125 | {{'a', 'x'}}, 126 | } 127 | for i, rrin := range in { 128 | wanted := wanteds[i] 129 | out := simplifyRuneRange(rrin) 130 | if !runeRangeEqual(t, out, wanted) { 131 | t.Errorf("botch at %d", i) 132 | } 133 | } 134 | } 135 | 136 | func runeRangeEqual(t *testing.T, wanted RuneRange, got RuneRange) bool { 137 | t.Helper() 138 | if len(wanted) != len(got) { 139 | return false 140 | } 141 | sort.Slice(wanted, func(i, j int) bool { return wanted[i].Lo < wanted[j].Lo }) 142 | sort.Slice(got, func(i, j int) bool { return got[i].Lo < got[j].Lo }) 143 | for i, w := range wanted { 144 | g := got[i] 145 | if w.Lo != g.Lo || w.Hi != g.Hi { 146 | return false 147 | } 148 | } 149 | return true 150 | } 151 | 152 | func TestBasicRegexpFeatureRead(t *testing.T) { 153 | type fw struct { 154 | rx string 155 | wanted []regexpFeature 156 | } 157 | 158 | var tfw = []fw{ 159 | {rx: "a.b", wanted: []regexpFeature{rxfDot}}, 160 | {rx: "ab*", wanted: []regexpFeature{rxfStar}}, 161 | {rx: "a+b", wanted: []regexpFeature{rxfPlus}}, 162 | {rx: "(ab)+", wanted: []regexpFeature{rxfParenGroup, rxfPlus}}, 163 | {rx: "zz?zz", wanted: []regexpFeature{rxfQM}}, 164 | {rx: "zzzz{3}", wanted: []regexpFeature{rxfRange}}, 165 | {rx: "zzzz{0,3}", wanted: []regexpFeature{rxfRange}}, 166 | {rx: "zzzz{3,}", wanted: []regexpFeature{rxfRange}}, 167 | {rx: "a~p{Lt}", wanted: []regexpFeature{rxfProperty}}, 168 | {rx: "a~P{Me}", wanted: []regexpFeature{rxfProperty}}, 169 | {rx: "a[fox37é]z", wanted: []regexpFeature{rxfClass}}, 170 | {rx: "a[-fox37é-]z", wanted: []regexpFeature{rxfClass}}, 171 | {rx: "a[fox33-87é]z", wanted: []regexpFeature{rxfClass}}, 172 | {rx: "a[^fox37é]z", wanted: []regexpFeature{rxfClass, rxfNegatedClass}}, 173 | {rx: "(abc)|(def)", wanted: []regexpFeature{rxfOrBar, rxfParenGroup}}, 174 | } 175 | 176 | var parse *regexpParse 177 | var err error 178 | for _, w := range tfw { 179 | fmt.Println("RX: " + w.rx) 180 
| parse, err = readRegexp(w.rx) 181 | if err != nil { 182 | t.Errorf("botch on %s: %s", w.rx, err.Error()) 183 | } 184 | if len(w.wanted) != len(parse.features.found) { 185 | t.Errorf("for %s got %d wanted %d", w.rx, len(parse.features.found), len(w.wanted)) 186 | } else { 187 | for _, f := range w.wanted { 188 | _, ok := parse.features.found[f] 189 | if !ok { 190 | t.Errorf("for %s missed feature %s", w.rx, f) 191 | } 192 | } 193 | } 194 | } 195 | parse, _ = readRegexp("a*b") 196 | unimpl := parse.features.foundUnimplemented() 197 | foundStar := false 198 | for _, u := range unimpl { 199 | if u == rxfStar { 200 | foundStar = true 201 | } 202 | } 203 | if !foundStar { 204 | t.Error("Didn't find Star") 205 | } 206 | } 207 | 208 | func TestRegexpErrors(t *testing.T) { 209 | bads := []string{ 210 | "~P{L", 211 | "~P{L*}", 212 | string([]byte{'~', 0xfe, 0xff}), 213 | string([]byte{'[', 'a', 'b', 0xfe, 0xff, ']'}), 214 | string([]byte{'[', 'a', '-', 0xff, ']'}), 215 | string([]byte{'[', 'a', '-', '~', 0xff, ']'}), 216 | string([]byte{'a', 0xff}), 217 | string([]byte{'a', '{', 0xff, '}'}), 218 | string([]byte{'a', '{', '2', 0xff, '}'}), 219 | "a{9999999999998,9999999999999}", 220 | "a{2x-3}", 221 | "a{2,", 222 | string([]byte{'a', '{', '2', 0xff}), 223 | "a{2,r}", 224 | string([]byte{'a', '{', '2', ',', 0xff}), 225 | "a{2,4", 226 | string([]byte{'a', '{', '2', ',', '4', 0xff}), 227 | "a{2,4x", 228 | "a{2,9999999999999}", 229 | "abc)", 230 | } 231 | for _, bad := range bads { 232 | _, err := readRegexp(bad) 233 | if err == nil { 234 | t.Error("Took " + bad) 235 | } 236 | } 237 | } 238 | 239 | func TestAddRegexpTransition(t *testing.T) { 240 | // TODO: Keep adding/subtracting from this as we add features 241 | goods := []string{ 242 | "a.", 243 | } 244 | bads := []string{ 245 | "a?", "a*", "a+", "a?", 246 | "a{1,3}", "~p{Lu}", "[^abc]", 247 | } 248 | template := `{"a":[{"regexp": "FOO"}]}` 249 | cm := newCoreMatcher() 250 | for _, good := range goods { 251 | pat := 
strings.Replace(template, "FOO", good, 10) 252 | err := cm.addPattern("foo", pat) 253 | if err != nil { 254 | t.Errorf("thinks it found unimplemented feature in /%s/", good) 255 | } 256 | } 257 | for _, bad := range bads { 258 | pat := strings.Replace(template, "FOO", bad, 10) 259 | err := cm.addPattern("foo", pat) 260 | if err == nil { 261 | t.Errorf("missed unimplemented feature in /%s/", bad) 262 | } 263 | } 264 | } 265 | 266 | func TestRegexpReader(t *testing.T) { 267 | pat := `{"a":[{"regexp": "a.b"}]}` 268 | cm := newCoreMatcher() 269 | err := cm.addPattern("x", pat) 270 | if err != nil { 271 | t.Error("ap: " + err.Error()) 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /regexp_validity_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func oneRegexp(t *testing.T, re string, valid bool) { 9 | t.Helper() 10 | _, err := readRegexp(re) 11 | if valid && err != nil { 12 | t.Errorf("should be valid: /%s/, but <%s>", re, err.Error()) 13 | } 14 | if (!valid) && err == nil { 15 | t.Errorf("should NOT be valid: /%s/", re) 16 | } 17 | //fmt.Println("ERR: " + err.Error()) 18 | } 19 | 20 | func TestDebugRegexp(t *testing.T) { 21 | oneRegexp(t, "[~]", false) 22 | } 23 | 24 | func TestEmptyRegexp(t *testing.T) { 25 | parse := newRxParseState([]byte{}) 26 | parse, err := readRegexpWithParse(parse) 27 | if err != nil { 28 | fmt.Println("OOPS: " + err.Error()) 29 | } 30 | table, _ := makeRegexpNFA(parse.tree, false) 31 | // raw empty string should NOT match 32 | var transitions []*fieldMatcher 33 | bufs := &bufpair{} 34 | fields := traverseNFA(table, []byte(""), transitions, bufs) 35 | if len(fields) != 0 { 36 | t.Error("Matched empty string") 37 | } 38 | 39 | // matching on a field SHOULD match 40 | pattern := `{"a": [{"regexp": ""}]}` 41 | cm := newCoreMatcher() 42 | err = cm.addPattern("a", pattern) 43 | 
if err != nil { 44 | t.Error("addPattern: " + err.Error()) 45 | } 46 | event := `{"a": ""}` 47 | mm, err := cm.matchesForJSONEvent([]byte(event)) 48 | if err != nil { 49 | t.Error("M4J: " + err.Error()) 50 | } 51 | if len(mm) == 0 { 52 | t.Error("Didn't match empty to empty") 53 | } 54 | } 55 | 56 | func TestRegexpValidity(t *testing.T) { 57 | problems := 0 58 | tests := 0 59 | implemented := 0 60 | for _, sample := range regexpSamples { 61 | tests++ 62 | parse := newRxParseState([]byte(sample.regex)) 63 | 64 | parse, err := readRegexpWithParse(parse) 65 | if sample.valid { 66 | if len(parse.features.foundUnimplemented()) == 0 { 67 | implemented++ 68 | table, dest := makeRegexpNFA(parse.tree, false) 69 | for _, should := range sample.matches { 70 | // the sample regexp tests think the empty string matches lots of regexps with which 71 | // I don't think it should 72 | if should == "" { 73 | continue 74 | } 75 | var transitions []*fieldMatcher 76 | bufs := &bufpair{} 77 | fields := traverseNFA(table, []byte(should), transitions, bufs) 78 | if !containsFM(t, fields, dest) { 79 | t.Errorf("<%s> failed to match /%s/", should, sample.regex) 80 | problems++ 81 | } 82 | } 83 | for _, shouldNot := range sample.nomatches { 84 | var transitions []*fieldMatcher 85 | bufs := &bufpair{} 86 | fields := traverseNFA(table, []byte(shouldNot), transitions, bufs) 87 | if len(fields) != 0 { 88 | t.Errorf("<%s> matched /%s/", shouldNot, sample.regex) 89 | problems++ 90 | } 91 | } 92 | } 93 | if err != nil { 94 | t.Errorf("should be valid: /%s/, but <%s> (after %d lines) ", sample.regex, err.Error(), tests) 95 | problems++ 96 | } 97 | } else { 98 | if err == nil { 99 | t.Errorf("should NOT be valid: /%s/ (after %d lines) ", sample.regex, tests) 100 | problems++ 101 | } 102 | } 103 | if problems == 10 { 104 | return 105 | } 106 | } 107 | fmt.Printf("tests: %d, implemented: %d\n", tests, implemented) 108 | } 109 | 
-------------------------------------------------------------------------------- /segments_tree.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | ) 7 | 8 | const SegmentSeparator = "\n" 9 | 10 | // segmentsTree implements the SegmentsTreeTracker interface, and includes other calls used by 11 | // the AddPattern() code to load up the tree tracker. 12 | type segmentsTree struct { 13 | root bool 14 | 15 | // nodes stores a map from a segment to its children. 16 | // in a hierarchial data format like JSON, a node can be Object or Array. 17 | // for example, in this path "context\nuser\nid", both "context" and "user" will be nodes. 18 | nodes map[string]*segmentsTree 19 | 20 | // fields maps the children of this node which are leafs rather than nodes 21 | // to the []byte representation of the Path component of the Field. 22 | // In the "context\nuser\nid" example: 23 | // leaf "id" will be mapped to []byte("context\nuser\nid") 24 | // leaf "user", if it has non-node values, will be mapped to []byte("context\nuser") 25 | fields map[string][]byte 26 | } 27 | 28 | // newSegmentsIndex creates a segmentsTree node which is the root. 29 | // The paths argument is used for testing; it auto-adds those to the tree. 30 | func newSegmentsIndex(paths ...string) *segmentsTree { 31 | st := newSegmentsIndexNode(true) 32 | for _, path := range paths { 33 | st.add(path) 34 | } 35 | return st 36 | } 37 | 38 | // newSegmentsIndexNode initializes a segmentsTree node 39 | func newSegmentsIndexNode(root bool) *segmentsTree { 40 | return &segmentsTree{ 41 | root: root, 42 | nodes: make(map[string]*segmentsTree), 43 | fields: make(map[string][]byte), 44 | } 45 | } 46 | 47 | func (p *segmentsTree) add(path string) { 48 | segments := strings.Split(path, SegmentSeparator) 49 | 50 | // If we have only one segment, it's a field on the root. 51 | if len(segments) == 1 { 52 | // It's a direct field. 
53 | p.fields[path] = []byte(path) 54 | return 55 | } 56 | 57 | var node *segmentsTree 58 | node = p 59 | 60 | for i, segment := range segments { 61 | // If this the last segment, add it as field 62 | // example: context\nuser\nid, in this case "id" is the field ("context" & "user" are nodes) 63 | if i == len(segments)-1 { 64 | node.addSegment(segment, []byte(path)) 65 | } else { 66 | node = node.getOrCreate(segment) 67 | } 68 | } 69 | } 70 | 71 | func (p *segmentsTree) getOrCreate(name string) *segmentsTree { 72 | _, ok := p.nodes[name] 73 | if !ok { 74 | p.nodes[name] = newSegmentsIndexNode(false) 75 | } 76 | return p.nodes[name] 77 | } 78 | 79 | func (p *segmentsTree) addSegment(segment string, path []byte) { 80 | _, ok := p.fields[segment] 81 | if !ok { 82 | p.fields[segment] = path 83 | } 84 | } 85 | 86 | // Get implements SegmentsTreeTracker 87 | func (p *segmentsTree) Get(name []byte) (SegmentsTreeTracker, bool) { 88 | n, ok := p.nodes[string(name)] 89 | return n, ok 90 | } 91 | 92 | // IsRoot implements SegmentsTreeTracker 93 | func (p *segmentsTree) IsRoot() bool { 94 | return p.root 95 | } 96 | 97 | // IsSegmentUsed implements SegmentsTreeTracker 98 | func (p *segmentsTree) IsSegmentUsed(segment []byte) bool { 99 | // In the next path: "context\nuser\nid" 100 | // "context" / "user" are nodes, while "id" is a field 101 | // As a result a segment can be both node and field, we need to check 102 | // in both maps. 
103 | _, isField := p.fields[string(segment)] 104 | if isField { 105 | return true 106 | } 107 | _, isNode := p.nodes[string(segment)] 108 | return isNode 109 | } 110 | 111 | // PathForSegment implements SegmentsTreeTracker 112 | func (p *segmentsTree) PathForSegment(segment []byte) []byte { 113 | return p.fields[string(segment)] 114 | } 115 | 116 | // NodesCount implements SegmentsTreeTracker 117 | func (p *segmentsTree) NodesCount() int { 118 | return len(p.nodes) 119 | } 120 | 121 | // FieldsCount implements SegmentsTreeTracker 122 | func (p *segmentsTree) FieldsCount() int { 123 | return len(p.fields) 124 | } 125 | 126 | // String used for debugging purposes 127 | func (p *segmentsTree) String() string { 128 | nodeNames := make([]string, 0) 129 | for n := range p.nodes { 130 | nodeNames = append(nodeNames, n) 131 | } 132 | 133 | fieldNames := make([]string, 0) 134 | for f := range p.fields { 135 | fieldNames = append(fieldNames, f) 136 | } 137 | 138 | return fmt.Sprintf("root: %v, nodes [%s], fields: [%s]", p.root, strings.Join(nodeNames, ","), strings.Join(fieldNames, ",")) 139 | } 140 | 141 | // copy produces a fresh copy of an existing segmentsTree which is used to support atomic update of 142 | // the Quamina automaton. 
143 | func (p *segmentsTree) copy() *segmentsTree { 144 | np := newSegmentsIndexNode(p.root) 145 | 146 | // copy fields 147 | for name, path := range p.fields { 148 | np.fields[name] = path 149 | } 150 | 151 | // copy nodes 152 | for name, node := range p.nodes { 153 | np.nodes[name] = node.copy() 154 | } 155 | 156 | return np 157 | } 158 | -------------------------------------------------------------------------------- /segments_tree_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestSegmentsTreeSanity(t *testing.T) { 8 | tree := newSegmentsIndex("field1") 9 | 10 | if !tree.IsRoot() { 11 | t.Errorf(`Expected "newSegmentsIndex" to return a root node: %s`, tree.String()) 12 | } 13 | 14 | expectCounts(t, tree, 1, 0) 15 | expectSegmentsToBeUsed(t, tree, "field1") 16 | 17 | tree.add("node\nfield") 18 | tree.add("node\nfield_2") 19 | tree.add("node\nsub_node\nleaf") 20 | 21 | if !tree.IsRoot() { 22 | t.Fatalf("Expect tree to be root: %s", tree.String()) 23 | } 24 | 25 | expectCounts(t, tree, 1, 1) 26 | expectSegmentsToBeUsed(t, tree, "field1", "node") 27 | 28 | n, ok := tree.Get([]byte("node")) 29 | if !ok { 30 | t.Fatalf(`Failed to fetch "node" from tree: %s`, tree.String()) 31 | } 32 | 33 | if n.IsRoot() { 34 | t.Fatalf("Expect node to not be root: %s", n.String()) 35 | } 36 | 37 | expectCounts(t, n, 2, 1) 38 | expectSegmentsToBeUsed(t, n, "field", "field_2") 39 | 40 | leaf, ok := n.Get([]byte("sub_node")) 41 | if !ok { 42 | t.Fatalf(`Failed to fetch "sub_node" from "node": %s`, n.String()) 43 | } 44 | 45 | if leaf.IsRoot() { 46 | t.Fatalf("Expect sub_node to not be root: %s", leaf.String()) 47 | } 48 | 49 | expectCounts(t, leaf, 1, 0) 50 | expectSegmentsToBeUsed(t, leaf, "leaf") 51 | } 52 | 53 | func TestSegmentsTreeString(t *testing.T) { 54 | tree := newSegmentsIndex("node\nsub_node\nfield", "root_field") 55 | 56 | expectedString := "root: true, nodes 
[node], fields: [root_field]" 57 | 58 | if tree.String() != expectedString { 59 | t.Errorf("Expected tree.String(): [%s] to equal [%s]", tree.String(), expectedString) 60 | } 61 | } 62 | 63 | func expectSegmentsToBeUsed(t *testing.T, tree SegmentsTreeTracker, segments ...string) { 64 | t.Helper() 65 | 66 | for _, seg := range segments { 67 | if !tree.IsSegmentUsed([]byte(seg)) { 68 | t.Fatalf("Expected '%s' segment to be used, but it's not: %s", seg, tree.String()) 69 | } 70 | } 71 | } 72 | 73 | func expectCounts(t *testing.T, tree SegmentsTreeTracker, fieldsCount, nodesCount int) { 74 | t.Helper() 75 | 76 | if tree.FieldsCount() != fieldsCount || tree.NodesCount() != nodesCount { 77 | t.Fatalf("Expected to have %v fields & %v nodes: %s", fieldsCount, nodesCount, tree.String()) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /segments_tree_tracker.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | // SegmentsTreeTracker is an interface used by Flattener to represents all the paths mentioned 4 | // Patterns added to a Quamina instance in AddPattern() calls. It allows a Flattener to determine 5 | // which Event fields may safely be ignored, and also caches the runtime form of the Field.Path 6 | // value. 7 | // 8 | // Consider this JSON example: 9 | // 10 | // { "a": {"b": 1, "c": 2}} 11 | // 12 | // The tree will look like that: 13 | // 14 | // [ root ] 15 | // | 16 | // [ "a" ] -> as node 17 | // |-> with fields of: "b" and "c" 18 | // 19 | // This allow us to traverse the hierarchial data together with the segments tree, 20 | // fetch a node and answer: 21 | // - Is the current segment is used? (JSON - is the current property needs to be selected) 22 | // - Do we need to traverse into this Node as well? (JSON - do we need traverse this object?) 23 | // - How much fields & nodes we have to traverse in the current hierarchy until we are finished? 
24 | // for example: in the current level, in the tree node we have 1 node and 2 fields 25 | // we finished selecting them, can we finish traversing this node? 26 | type SegmentsTreeTracker interface { 27 | // Get returns another level of the hierarchy, referred to as "Node" 28 | // If a node is returned we will need to traverse into (in JSON/CBOR/ProtoBuf/etc..) 29 | Get(segment []byte) (SegmentsTreeTracker, bool) 30 | 31 | // IsRoot - are we root node? 32 | // NOTE: need for early exit, can be solved differently maybe. 33 | IsRoot() bool 34 | 35 | // Called by the Flattener looking at a member name in a JSON object to ascertain 36 | // whether this particular member of the object is mentioned in any Patterns added 37 | // to the Quamina instance. 38 | IsSegmentUsed(segment []byte) bool 39 | 40 | // When a Flattener reaches the last (leaf) step of a path, this returns the full 41 | // path-name for that Field. This is an optimization; since these need to be calculated 42 | // while executing AddPattern, we might as well remember them for use during Flattening. 43 | PathForSegment(name []byte) []byte 44 | 45 | // Called by the Flattener to return the number of nodes (non-leaf children) and fields 46 | // (field values) contained in any node. When processing through the node, once we've 47 | // hit the right number of nodes and fields we can terminate the Flattening process. 48 | NodesCount() int 49 | FieldsCount() int 50 | 51 | // String is used only for debugging. 
52 | String() string 53 | } 54 | -------------------------------------------------------------------------------- /shell_style.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "strings" 8 | ) 9 | 10 | // readShellStyleSpecial parses a shellStyle object in a Pattern 11 | func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) { 12 | t, err := pb.jd.Token() 13 | if err != nil { 14 | return 15 | } 16 | pathVals = valsIn 17 | shellString, ok := t.(string) 18 | if !ok { 19 | err = errors.New("value for `shellstyle` must be a string") 20 | return 21 | } 22 | 23 | // no adjacent wildcards 24 | if strings.Contains(shellString, "**") { 25 | err = fmt.Errorf("adjacent '*' characters not allowed") 26 | return 27 | } 28 | 29 | pathVals = append(pathVals, typedVal{vType: shellStyleType, val: `"` + shellString + `"`}) 30 | 31 | t, err = pb.jd.Token() 32 | if err != nil { 33 | return 34 | } 35 | switch t.(type) { 36 | case json.Delim: 37 | // } is all that will be returned 38 | default: 39 | err = errors.New("trailing garbage in shellstyle pattern") 40 | } 41 | 42 | return 43 | } 44 | 45 | // makeShellStyleFA does what it says. It is precisely equivalent to a regex with the only operator 46 | // being a single ".*". Once we've implemented regular expressions we can use that to more or less eliminate this 47 | func makeShellStyleFA(val []byte, printer printer) (start *smallTable, nextField *fieldMatcher) { 48 | table := newSmallTable() 49 | start = table 50 | nextField = newFieldMatcher() 51 | 52 | // for each byte in the pattern 53 | valIndex := 0 54 | for valIndex < len(val) { 55 | ch := val[valIndex] 56 | if ch == '*' { 57 | // special-case handling for string ending in '*"' - transition to field match on any character. 58 | // we know the trailing '"' will be there because of JSON syntax. 
We could use an epsilon state 59 | // but then the matcher will process through all the rest of the bytes, when it doesn't need to 60 | if valIndex == len(val)-2 { 61 | step := &faState{ 62 | table: newSmallTable(), 63 | fieldTransitions: []*fieldMatcher{nextField}, 64 | } 65 | table.epsilon = []*faState{step} 66 | printer.labelTable(table, fmt.Sprintf("prefix escape at %d", valIndex)) 67 | return 68 | } 69 | globStep := &faState{table: table} 70 | printer.labelTable(table, fmt.Sprintf("gS at %d", valIndex)) 71 | table.epsilon = []*faState{globStep} 72 | 73 | valIndex++ 74 | globNext := &faState{table: newSmallTable()} 75 | printer.labelTable(globNext.table, fmt.Sprintf("gX on %c at %d", val[valIndex], valIndex)) 76 | table.addByteStep(val[valIndex], &faNext{states: []*faState{globNext}}) 77 | table = globNext.table 78 | } else { 79 | nextStep := &faState{table: newSmallTable()} 80 | printer.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex)) 81 | table.addByteStep(ch, &faNext{states: []*faState{nextStep}}) 82 | table = nextStep.table 83 | } 84 | valIndex++ 85 | } 86 | lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} 87 | printer.labelTable(lastStep.table, fmt.Sprintf("last step at %d", valIndex)) 88 | table.addByteStep(valueTerminator, &faNext{states: []*faState{lastStep}}) 89 | return 90 | } 91 | -------------------------------------------------------------------------------- /shell_style_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "strings" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | func TestLongCase(t *testing.T) { 12 | m := newCoreMatcher() 13 | pat := `{"x": [ {"shellstyle": "*abab"} ] }` 14 | err := m.addPattern("x", pat) 15 | if err != nil { 16 | t.Error("addPat? 
" + err.Error()) 17 | } 18 | shoulds := []string{ 19 | "abaabab", 20 | "ababab", 21 | "ababaabab", 22 | } 23 | for _, should := range shoulds { 24 | event := fmt.Sprintf(`{"x": "%s"}`, should) 25 | matches, err := m.matchesForJSONEvent([]byte(event)) 26 | if err != nil { 27 | t.Error("m4j " + err.Error()) 28 | } 29 | if len(matches) != 1 { 30 | t.Error("MISSED: " + should) 31 | } 32 | } 33 | } 34 | func TestMakeShellStyleFA(t *testing.T) { 35 | patterns := []string{ 36 | `"*ST"`, 37 | `"foo*"`, 38 | `"*foo"`, 39 | `"*foo*"`, 40 | `"xx*yy*zz"`, 41 | `"*xx*yy*"`, 42 | } 43 | shouldsForPatterns := [][]string{ 44 | {`"STA ST"`, `"1ST"`}, 45 | {`"fooabc"`, `"foo"`}, 46 | {`"afoo"`, `"foo"`}, 47 | {`"xxfooyy"`, `"fooyy"`, `"xxfoo"`, `"foo"`}, 48 | {`"xxabyycdzz"`, `"xxyycdzz"`, `"xxabyyzz"`, `"xxyyzz"`}, 49 | {`"abxxcdyyef"`, `"xxcdyyef"`, `"abxxyyef"`, `"abxxcdyy"`, `"abxxyy"`, `"xxcdyy"`, `"xxyyef"`, `"xxyy"`}, 50 | } 51 | shouldNotForPatterns := [][]string{ 52 | {`"STA"`, `"STAST "`}, 53 | {`"afoo"`, `"fofo"`}, 54 | {`"foox"`, `"afooo"`}, 55 | {`"afoa"`, `"fofofoxooxoo"`}, 56 | {`"xyzyxzy yy zz"`, `"zz yy xx"`}, 57 | {`"ayybyyzxx"`}, 58 | } 59 | 60 | for i, pattern := range patterns { 61 | a, wanted := makeShellStyleFA([]byte(pattern), sharedNullPrinter) 62 | vm := newValueMatcher() 63 | vmf := vmFields{startTable: a} 64 | vm.update(&vmf) 65 | var bufs bufpair 66 | for _, should := range shouldsForPatterns[i] { 67 | var transitions []*fieldMatcher 68 | gotTrans := traverseNFA(a, []byte(should), transitions, &bufs) 69 | if len(gotTrans) != 1 || gotTrans[0] != wanted { 70 | t.Errorf("Failure for %s on %s", pattern, should) 71 | } 72 | } 73 | for _, shouldNot := range shouldNotForPatterns[i] { 74 | var transitions []*fieldMatcher 75 | gotTrans := traverseNFA(a, []byte(shouldNot), transitions, &bufs) 76 | if gotTrans != nil { 77 | t.Errorf("bogus match for %s on %s", pattern, shouldNot) 78 | } 79 | } 80 | } 81 | } 82 | 83 | func TestWildCardRuler(t *testing.T) { 84 | 
rule1 := "{ \"a\" : [ { \"shellstyle\": \"*bc\" } ] }" 85 | rule2 := "{ \"b\" : [ { \"shellstyle\": \"d*f\" } ] }" 86 | rule3 := "{ \"b\" : [ { \"shellstyle\": \"d*ff\" } ] }" 87 | rule4 := "{ \"c\" : [ { \"shellstyle\": \"xy*\" } ] }" 88 | rule5 := "{ \"c\" : [ { \"shellstyle\": \"xy*\" } ] }" 89 | rule6 := "{ \"d\" : [ { \"shellstyle\": \"12*4*\" } ] }" 90 | 91 | cm := newCoreMatcher() 92 | _ = cm.addPattern("r1", rule1) 93 | _ = cm.addPattern("r2", rule2) 94 | _ = cm.addPattern("r3", rule3) 95 | _ = cm.addPattern("r4", rule4) 96 | _ = cm.addPattern("r5", rule5) 97 | _ = cm.addPattern("r6", rule6) 98 | 99 | var matches []X 100 | matches, _ = cm.matchesForJSONEvent([]byte("{\"a\" : \"bc\"}")) 101 | if len(matches) != 1 || matches[0] != "r1" { 102 | t.Error("Missed on r1") 103 | } 104 | matches, _ = cm.matchesForJSONEvent([]byte("{\"a\" : \"abc\"}")) 105 | if len(matches) != 1 || matches[0] != "r1" { 106 | t.Error("Missed on r1") 107 | } 108 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"dexef\"}")) 109 | if len(matches) != 1 || matches[0] != "r2" { 110 | t.Error("Missed on r2") 111 | } 112 | matches, _ = cm.matchesForJSONEvent([]byte("{\"b\" : \"dexeff\"}")) 113 | if len(matches) != 2 || (!containsX(matches, "r2", "r3")) { 114 | t.Error("Missed on r2/r3") 115 | } 116 | matches, _ = cm.matchesForJSONEvent([]byte("{\"c\" : \"xyzzz\"}")) 117 | if len(matches) != 2 || (!containsX(matches, "r4", "r5")) { 118 | t.Error("Missed on r4/r5") 119 | } 120 | matches, _ = cm.matchesForJSONEvent([]byte("{\"d\" : \"12345\"}")) 121 | if len(matches) != 1 || matches[0] != "r6" { 122 | t.Error("Missed on r6") 123 | } 124 | 125 | shouldNots := []string{ 126 | "{\"c\" : \"abc\"}", 127 | "{\"a\" : \"xyz\"}", 128 | "{\"c\" : \"abcxyz\"}", 129 | "{\"b\" : \"ef\"}", 130 | "{\"b\" : \"de\"}", 131 | "{\"d\" : \"1235\"}", 132 | } 133 | for _, shouldNot := range shouldNots { 134 | matches, _ := cm.matchesForJSONEvent([]byte(shouldNot)) 135 | if len(matches) != 0 { 136 | 
t.Error("shouldn't have matched: " + shouldNot) 137 | } 138 | } 139 | } 140 | 141 | func containsX(matches []X, wanteds ...string) bool { 142 | var sMatches []string 143 | for _, x := range matches { 144 | sMatches = append(sMatches, x.(string)) 145 | } 146 | for _, wanted := range wanteds { 147 | for _, sMatch := range sMatches { 148 | if wanted == sMatch { 149 | return true 150 | } 151 | } 152 | } 153 | return false 154 | } 155 | 156 | func TestShellStyleBuildTime(t *testing.T) { 157 | words := readWWords(t) 158 | fmt.Printf("WC %d\n", len(words)) 159 | starWords := make([]string, 0, len(words)) 160 | patterns := make([]string, 0, len(words)) 161 | source := rand.NewSource(293591) 162 | for _, word := range words { 163 | //nolint:gosec 164 | starAt := source.Int63() % 6 165 | starWord := string(word[:starAt]) + "*" + string(word[starAt:]) 166 | starWords = append(starWords, starWord) 167 | pattern := fmt.Sprintf(`{"x": [ {"shellstyle": "%s" } ] }`, starWord) 168 | patterns = append(patterns, pattern) 169 | } 170 | q, _ := New() 171 | for i := range words { 172 | err := q.AddPattern(starWords[i], patterns[i]) 173 | if err != nil { 174 | t.Error("AddP: " + err.Error()) 175 | } 176 | } 177 | cm := q.matcher.(*coreMatcher) 178 | 179 | fmt.Println(matcherStats(cm)) 180 | cm.analyze() 181 | fmt.Printf("MaxP: %d\n", cm.fields().nfaMeta.maxOutDegree) 182 | 183 | // make sure that all the words actually are matched 184 | before := time.Now() 185 | for _, word := range words { 186 | record := fmt.Sprintf(`{"x": "%s"}`, word) 187 | matches, err := q.MatchesForEvent([]byte(record)) 188 | if err != nil { 189 | t.Error("M4E on " + string(word)) 190 | } 191 | if len(matches) == 0 { 192 | t.Error("no matches for " + string(word)) 193 | } 194 | if len(matches) > 1 { 195 | fmt.Printf("%d matches for %s\n", len(matches), word) 196 | } 197 | } 198 | elapsed := float64(time.Since(before).Milliseconds()) 199 | eps := float64(len(words)) / (elapsed / 1000.0) 200 | 
fmt.Printf("Huge-machine events/sec: %.1f\n", eps) 201 | } 202 | 203 | func TestMixedPatterns(t *testing.T) { 204 | // let's mix up some prefix, infix, suffix, and exact-match searches 205 | x := map[string]int{ 206 | `"*ST"`: 5754, 207 | `"*TH"`: 34310, 208 | `"B*K"`: 746, 209 | `"C*L"`: 1022, 210 | `"CH*"`: 2226, 211 | `"Z*"`: 25, 212 | `"BANNOCK"`: 22, 213 | `"21ST"`: 1370, 214 | `"ZOE"`: 19, 215 | `"CRYSTAL"`: 6, 216 | } 217 | 218 | stringTemplate := `{"properties": { "STREET": [ XX ] } }` 219 | shellTemplate := `{"properties": {"STREET":[ {"shellstyle": XX} ] } }` 220 | m := newCoreMatcher() 221 | for name := range x { 222 | var pat string 223 | if strings.Contains(name, "*") { 224 | pat = strings.ReplaceAll(shellTemplate, "XX", name) 225 | } else { 226 | pat = strings.ReplaceAll(stringTemplate, "XX", name) 227 | } 228 | 229 | err := m.addPattern(name, pat) 230 | if err != nil { 231 | t.Error("addPattern: " + name + ", prob=" + err.Error()) 232 | } 233 | } 234 | fmt.Println("M: " + matcherStats(m)) 235 | 236 | got := make(map[X]int) 237 | lines := getCityLotsLines(t) 238 | for _, line := range lines { 239 | matches, err := m.matchesForJSONEvent(line) 240 | if err != nil { 241 | t.Error("Matches4JSON: " + err.Error()) 242 | } 243 | for _, match := range matches { 244 | count, ok := got[match] 245 | if !ok { 246 | got[match] = 1 247 | } else { 248 | got[match] = count + 1 249 | } 250 | } 251 | } 252 | for match, count := range got { 253 | sm := match.(string) 254 | if x[sm] != count { 255 | t.Errorf("For %s wanted %d got %d", sm, x[sm], count) 256 | } 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /small_table.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | // byteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go's byte, which is uint8. The values 4 | // 0xF5-0xFF can't appear in UTF-8 strings. 
// We use 0xF5 as a value terminator, so bytes 0xF6 and higher
// can't appear.
const byteCeiling int = 0xf6

// valueTerminator - whenever we're trying to match a value with a pattern that extends to the end of that
// value, we virtually add one of these as the last character, both to the automaton and the value at run-time.
// This simplifies things because you don't have to treat absolute-string-match (only works at last char in
// value) and prefix match differently.
const valueTerminator byte = 0xf5

// nolint:gofmt,goimports
// smallTable serves as a lookup table that encodes mappings between ranges of byte values and the
// transition on any byte in the range.
//
// The way it works is exposed in the step() function just below. Logically, it's a slice of {byte, S}
// but I imagine organizing it this way is a bit more memory-efficient. Suppose we want to model a table where
// byte values 3 and 4 map to ss1 and byte 0x34 maps to ss2. Then the smallTable would look like:
//
//	ceilings:--|3|----|5|-|0x34|-|0x35|-|byteCeiling|
//	steps:---|nil|-|&ss1|--|nil|-|&ss2|---------|nil|
//	invariant: The last element of ceilings is always byteCeiling
//
// Each entry covers the byte values from the previous ceiling (inclusive) up to its own ceiling (exclusive).
//
// The motivation is that we want to build a state machine on byte values to implement things like prefixes and
// ranges of bytes. This could be done simply with an array of size byteCeiling for each state in the machine,
// or a map[byte]S, but both would be size-inefficient, particularly in the case where you're implementing
// ranges. Now, the step function is O(N) in the number of entries, but empirically, the number of entries is
// small even in large automata, so skipping through the ceilings list is measurably about the same speed as a map
// or array construct.
// One could imagine making step() smarter and doing a binary search in the case where there are
// more than some number of entries. But I'm dubious, the ceilings field is []byte and running through a single-digit
// number of those has a good chance of minimizing memory fetches.
// Since this is used to support nondeterministic finite automata (NFAs), it is possible for a state
// to have epsilon transitions, i.e. a transition that is always taken whatever the next input symbol is.
type smallTable struct {
	// ceilings holds the exclusive upper bound of each byte range, in ascending order; the last
	// element is always byteCeiling.
	ceilings []byte
	// steps runs parallel to ceilings: steps[i] is the transition for bytes below ceilings[i] and not
	// below ceilings[i-1]. A nil entry means there is no transition on those bytes.
	steps []*faNext
	// epsilon lists the states reached regardless of the input byte.
	epsilon []*faState
}

// newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at
// the end, which smallTable.step totally depends on. The table it returns has no transition on any byte.
func newSmallTable() *smallTable {
	return &smallTable{
		ceilings: []byte{byte(byteCeiling)},
		steps:    []*faNext{nil},
	}
}

// stepOut is the caller-supplied result holder that smallTable.step fills in: steps receives the states
// reached on the input byte and epsilon receives the table's epsilon states.
type stepOut struct {
	steps   []*faState
	epsilon []*faState
}

// forbiddenBytes is the set of byte values that can never occur in well-formed UTF-8. step and dStep
// consult it when no ceiling covers the input byte, to tell an invalid input byte apart from a
// malformed table.
var forbiddenBytes = map[byte]bool{
	0xC0: true, 0xC1: true,
	0xF5: true, 0xF6: true, 0xF7: true, 0xF8: true, 0xF9: true, 0xFA: true,
	0xFB: true, 0xFC: true, 0xFD: true, 0xFE: true, 0xFF: true,
}

// step finds the list of states that result from a transition on the utf8Byte argument. The states can come
// as a result of looking in the table structure, and also the "epsilon" transitions that occur on every
// input byte. Since this is the white-hot center of Quamina's runtime CPU, we don't want to be merging
// the two lists. So to avoid any memory allocation, the caller passes in a structure with the two lists
// and step fills them in.
67 | func (t *smallTable) step(utf8Byte byte, out *stepOut) { 68 | out.epsilon = t.epsilon 69 | for index, ceiling := range t.ceilings { 70 | if utf8Byte < ceiling { 71 | if t.steps[index] == nil { 72 | out.steps = nil 73 | } else { 74 | out.steps = t.steps[index].states 75 | } 76 | return 77 | } 78 | } 79 | _, forbidden := forbiddenBytes[utf8Byte] 80 | if forbidden { 81 | return 82 | } 83 | panic("Malformed smallTable") 84 | } 85 | 86 | // dStep takes a step through an NFA in the case where it is known that the NFA in question 87 | // is deterministic, i.e. each combination of an faState and a byte value transitions to at 88 | // most one other byte value. 89 | func (t *smallTable) dStep(utf8Byte byte) *faState { 90 | for index, ceiling := range t.ceilings { 91 | if utf8Byte < ceiling { 92 | if t.steps[index] == nil { 93 | return nil 94 | } else { 95 | return t.steps[index].states[0] 96 | } 97 | } 98 | } 99 | _, forbidden := forbiddenBytes[utf8Byte] 100 | if forbidden { 101 | return nil 102 | } 103 | panic("Malformed smallTable") 104 | } 105 | 106 | // makeSmallTable creates a pre-loaded small table, with all bytes not otherwise specified having the defaultStep 107 | // value, and then a few other values with their indexes and values specified in the other two arguments. 
The 108 | // goal is to reduce memory churn 109 | // constraint: positions must be provided in order 110 | func makeSmallTable(defaultStep *faNext, indices []byte, steps []*faNext) *smallTable { 111 | t := smallTable{ 112 | ceilings: make([]byte, 0, len(indices)+2), 113 | steps: make([]*faNext, 0, len(indices)+2), 114 | } 115 | 116 | var lastIndex byte = 0 117 | for i, index := range indices { 118 | if index > lastIndex { 119 | t.ceilings = append(t.ceilings, index) 120 | t.steps = append(t.steps, defaultStep) 121 | } 122 | t.ceilings = append(t.ceilings, index+1) 123 | t.steps = append(t.steps, steps[i]) 124 | lastIndex = index + 1 125 | } 126 | if indices[len(indices)-1] < byte(byteCeiling) { 127 | t.ceilings = append(t.ceilings, byte(byteCeiling)) 128 | t.steps = append(t.steps, defaultStep) 129 | } 130 | return &t 131 | } 132 | 133 | func (t *smallTable) gatherMetadata(meta *nfaMetadata) { 134 | eps := len(t.epsilon) 135 | for _, step := range t.steps { 136 | if step != nil { 137 | if (eps + len(step.states)) > meta.maxOutDegree { 138 | meta.maxOutDegree = eps + len(step.states) 139 | } 140 | for _, state := range step.states { 141 | state.table.gatherMetadata(meta) 142 | } 143 | } 144 | } 145 | } 146 | 147 | // unpackedTable replicates the data in the smallTable ceilings and states arrays. It's quite hard to 148 | // update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update 149 | // a smallTable you unpack it, update, then re-pack it. 
Not gonna be the most efficient thing so at some future point… 150 | // TODO: Figure out how to update a smallTable in place 151 | type unpackedTable [byteCeiling]*faNext 152 | 153 | func unpackTable(t *smallTable) *unpackedTable { 154 | var u unpackedTable 155 | unpackedIndex := 0 156 | for packedIndex, c := range t.ceilings { 157 | ceiling := int(c) 158 | for unpackedIndex < ceiling { 159 | u[unpackedIndex] = t.steps[packedIndex] 160 | unpackedIndex++ 161 | } 162 | } 163 | return &u 164 | } 165 | 166 | func (t *smallTable) pack(u *unpackedTable) { 167 | ceilings := make([]byte, 0, 16) 168 | steps := make([]*faNext, 0, 16) 169 | lastStep := u[0] 170 | for unpackedIndex, ss := range u { 171 | if ss != lastStep { 172 | ceilings = append(ceilings, byte(unpackedIndex)) 173 | steps = append(steps, lastStep) 174 | } 175 | lastStep = ss 176 | } 177 | ceilings = append(ceilings, byte(byteCeiling)) 178 | steps = append(steps, lastStep) 179 | t.ceilings = ceilings 180 | t.steps = steps 181 | } 182 | 183 | func (t *smallTable) addByteStep(utf8Byte byte, step *faNext) { 184 | unpacked := unpackTable(t) 185 | unpacked[utf8Byte] = step 186 | t.pack(unpacked) 187 | } 188 | -------------------------------------------------------------------------------- /small_table_test.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestFAMergePerf(t *testing.T) { 10 | words := readWWords(t) 11 | patterns := make([]string, 0, len(words)) 12 | for _, word := range words { 13 | pattern := fmt.Sprintf(`{"x": [ "%s" ] }`, string(word)) 14 | patterns = append(patterns, pattern) 15 | } 16 | before := time.Now() 17 | q, _ := New() 18 | for _, pattern := range patterns { 19 | err := q.AddPattern(pattern, pattern) 20 | if err != nil { 21 | t.Error("ap: " + err.Error()) 22 | } 23 | } 24 | elapsed := float64(time.Since(before).Milliseconds()) 25 | 26 | for _, word := range words 
{ 27 | event := fmt.Sprintf(`{"x": "%s"}`, string(word)) 28 | matches, err := q.MatchesForEvent([]byte(event)) 29 | if err != nil { 30 | t.Error("M4: " + err.Error()) 31 | } 32 | if len(matches) != 1 { 33 | t.Errorf("wanted 1 got %d", len(matches)) 34 | } 35 | } 36 | perSecond := float64(len(patterns)) / (elapsed / 1000.0) 37 | fmt.Printf("%.2f addPatterns/second with letter patterns\n\n", perSecond) 38 | } 39 | 40 | func TestUnpack(t *testing.T) { 41 | st1 := newSmallTable() 42 | nextState := faState{ 43 | table: st1, 44 | fieldTransitions: nil, 45 | } 46 | nextStep := faNext{states: []*faState{&nextState}} 47 | 48 | st := smallTable{ 49 | ceilings: []uint8{2, 3, byte(byteCeiling)}, 50 | steps: []*faNext{nil, &nextStep, nil}, 51 | } 52 | u := unpackTable(&st) 53 | for i := range u { 54 | if i == 2 { 55 | if u[i] != &nextStep { 56 | t.Error("Not in pos 2") 57 | } 58 | } else { 59 | if u[i] != nil { 60 | t.Errorf("Non-nil at %d", i) 61 | } 62 | } 63 | } 64 | } 65 | 66 | func TestDodgeBadUTF8(t *testing.T) { 67 | st := makeSmallTable(nil, []byte{'a'}, []*faNext{{states: []*faState{{}}}}) 68 | so := &stepOut{} 69 | st.step(0xFE, so) 70 | st.dStep(0xFE) 71 | } 72 | -------------------------------------------------------------------------------- /stats.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import "fmt" 4 | 5 | // TODO: add stats for average and max smallTable fanout 6 | type statsAccum struct { 7 | fmCount int 8 | fmTblCount int 9 | fmEntries int 10 | fmMax int 11 | fmVisited map[*fieldMatcher]bool 12 | vmCount int 13 | vmVisited map[*valueMatcher]bool 14 | stCount int 15 | stTblCount int 16 | stEntries int 17 | stMax int 18 | stDepth int 19 | stEpsilon int 20 | stEpMax int 21 | stVisited map[*smallTable]bool 22 | siCount int 23 | } 24 | 25 | func (s *statsAccum) stStats() string { 26 | avgStSize := "n/a" 27 | if s.stTblCount > 0 { 28 | avgStSize = fmt.Sprintf("%.3f", 
float64(s.stEntries)/float64(s.stTblCount)) 29 | } 30 | return fmt.Sprintf("SmallTables %d (avg size %s, max %d), singletons %d", s.stCount, avgStSize, s.stMax, s.siCount) 31 | } 32 | 33 | // matcherStats gathers statistics about the size of a coreMatcher, including the average and max fanout sizes of 34 | // the transition tables, returning this information in string form 35 | func matcherStats(m *coreMatcher) string { 36 | s := statsAccum{ 37 | fmVisited: make(map[*fieldMatcher]bool), 38 | vmVisited: make(map[*valueMatcher]bool), 39 | stVisited: make(map[*smallTable]bool), 40 | } 41 | fmStats(m.fields().state, &s) 42 | avgFmSize := fmt.Sprintf("%.3f", float64(s.fmEntries)/float64(s.fmTblCount)) 43 | avgStSize := "n/a" 44 | avgEpSize := "n/a" 45 | if s.stTblCount > 0 { 46 | avgStSize = fmt.Sprintf("%.3f", float64(s.stEntries)/float64(s.stTblCount)) 47 | } 48 | if s.stEpsilon > 0 { 49 | avgEpSize = fmt.Sprintf("%.3f", float64(s.stEpsilon)/float64(s.stTblCount)) 50 | } 51 | fmPart := fmt.Sprintf("Field matchers: %d (avg size %s, max %d)", s.fmCount, avgFmSize, s.fmMax) 52 | vmPart := fmt.Sprintf("Value matchers: %d", s.vmCount) 53 | stPart := fmt.Sprintf("SmallTables %d (unique %d, avg %s, max %d, epsilon avg %s, max %d) singletons %d", 54 | s.stCount, len(s.stVisited), avgStSize, s.stMax, avgEpSize, s.stEpMax, s.siCount) 55 | 56 | return fmPart + "\n" + vmPart + "\n" + stPart 57 | } 58 | 59 | func fmStats(m *fieldMatcher, s *statsAccum) { 60 | if s.fmVisited[m] { 61 | return 62 | } 63 | s.fmVisited[m] = true 64 | s.fmCount++ 65 | tSize := len(m.fields().transitions) 66 | if tSize > 0 { 67 | if tSize > s.fmMax { 68 | s.fmMax = tSize 69 | } 70 | s.fmTblCount++ 71 | s.fmEntries += tSize 72 | } 73 | 74 | for _, val := range m.fields().transitions { 75 | vmStats(val, s) 76 | } 77 | } 78 | 79 | func vmStats(m *valueMatcher, s *statsAccum) { 80 | if s.vmVisited[m] { 81 | return 82 | } 83 | s.vmVisited[m] = true 84 | s.vmCount++ 85 | state := m.fields() 86 | if 
state.singletonMatch != nil { 87 | s.siCount++ 88 | fmStats(state.singletonTransition, s) 89 | } 90 | if state.startTable != nil { 91 | faStats(state.startTable, s) 92 | } 93 | } 94 | 95 | func faStats(t *smallTable, s *statsAccum) { 96 | s.stCount++ 97 | if s.stVisited[t] { 98 | return 99 | } 100 | s.stVisited[t] = true 101 | tSize := len(t.ceilings) 102 | if tSize > 1 { 103 | if tSize > s.stMax { 104 | s.stMax = tSize 105 | } 106 | s.stTblCount++ 107 | s.stEntries += len(t.ceilings) 108 | s.stEpsilon += len(t.epsilon) 109 | if len(t.epsilon) > s.stEpMax { 110 | s.stEpMax = len(t.epsilon) 111 | } 112 | } 113 | for _, next := range t.steps { 114 | if next != nil { 115 | for _, step := range next.states { 116 | if step.fieldTransitions != nil { 117 | for _, m := range step.fieldTransitions { 118 | fmStats(m, s) 119 | } 120 | } 121 | faStats(step.table, s) 122 | } 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /testdata/arrayEvent1.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": "aws.cloudwatch", 3 | "detail-type": "CloudWatch Alarm State Change", 4 | "detail": { 5 | "alarmName": "CPUAlarm", 6 | "state": { 7 | "value": "ALARM" 8 | }, 9 | "previousState": { 10 | "value": "OK" 11 | }, 12 | "configuration": { 13 | "evaluationPeriods": 3, 14 | "datapointsToAlarm": 3, 15 | "description": "CPU utilization of EC2 instance 1 has gone above 90%", 16 | "metrics" : [ 17 | { 18 | "id": "M1", 19 | "label": "CPU", 20 | "metricStat" : { 21 | "metric": { 22 | "dimensions": { 23 | "instanceId": "1" 24 | }, 25 | "metricName": "CPUUtilization", 26 | "namespace": "AWS/EC2" 27 | }, 28 | "period" : 60, 29 | "stat" : "avg", 30 | "unit": null 31 | } 32 | } 33 | ], 34 | "timestamp": "2017-06-21T12:04:03.125Z", 35 | "threshold": 90, 36 | "treatMissingData": "ignore" 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- 
/testdata/arrayEvent2.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": "aws.cloudwatch", 3 | "detail-type": "CloudWatch Alarm State Change", 4 | "detail": { 5 | "alarmName": "DynamoCapacityPercentage", 6 | "state": { 7 | "value": "ALARM" 8 | }, 9 | "previousState": { 10 | "value": "OK" 11 | }, 12 | "configuration": { 13 | "evaluationPeriods": 3, 14 | "datapointsToAlarm": 3, 15 | "description": "There is less than 10% of DynamoDB capacity remaining", 16 | "metrics" : [ 17 | { 18 | "id" : "M1", 19 | "label" : "UsedDynamoCapacity", 20 | "metricStat" : { 21 | "metric": { 22 | "metricName": "ConsumedReadCapacityUnits", 23 | "namespace": "AWS/DynamoDB" 24 | }, 25 | "period" : 60, 26 | "stat" : "avg", 27 | "unit": null 28 | } 29 | }, 30 | { 31 | "id": "M2", 32 | "label": "TotalDynamoCapacity", 33 | "metricStat": { 34 | "metric": { 35 | "metricName": "ProvisionedReadCapacityUnits", 36 | "namespace": "AWS/DynamoDB" 37 | }, 38 | "period" : 60, 39 | "stat" : "avg", 40 | "unit": null 41 | } 42 | }, 43 | { 44 | "id": "E1", 45 | "label": "PercentageCapacityUsed", 46 | "expression": "(M1 / M2) * 100" 47 | } 48 | ], 49 | "timestamp": "2017-06-21T12:04:03.125Z", 50 | "threshold": 90, 51 | "treatMissingData": "ignore" 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /testdata/arrayEvent3.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics": [ 3 | { 4 | "metricName": "CPUUtilization", 5 | "namespace": "AWS/EC2" 6 | }, 7 | { 8 | "metricName": "RequestCount", 9 | "namespace": "AWS/ES" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /testdata/arrayEvent4.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics": [ 3 | { 4 | "metricName": "CPUUtilization", 5 | "namespace": "AWS/ES" 6 | } 7 | ] 8 | } 
-------------------------------------------------------------------------------- /testdata/arrayRule1.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": ["aws.cloudwatch"], 3 | "detail-type": ["CloudWatch Alarm State Change"], 4 | "detail": { 5 | "state": { 6 | "value": ["ALARM"] 7 | }, 8 | "configuration": { 9 | "metrics": { 10 | "metricStat": { 11 | "metric": { 12 | "namespace": ["AWS/EC2"] 13 | } 14 | } 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /testdata/arrayRule2.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": ["aws.cloudwatch"], 3 | "detail-type": ["CloudWatch Alarm State Change"], 4 | "detail": { 5 | "state": { 6 | "value": ["ALARM"] 7 | }, 8 | "configuration": { 9 | "metrics": { 10 | "metricStat": { 11 | "metric": { 12 | "metricName": ["CPUUtilization"] 13 | } 14 | } 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /testdata/arrayRule3.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics": { 3 | "metricName": [ "CPUUtilization" ], 4 | "namespace": [ "AWS/ES" ] 5 | } 6 | } -------------------------------------------------------------------------------- /testdata/arrayRule4.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics": { 3 | "metricName": [ "CPUUtilization", "ReadLatency" ], 4 | "namespace": [ "AWS/EC2", "AWS/ES" ] 5 | } 6 | } -------------------------------------------------------------------------------- /testdata/citylots.jlines.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timbray/quamina/0526acc321a81d4df535caf790879648ace11c86/testdata/citylots.jlines.gz 
-------------------------------------------------------------------------------- /testdata/citylots2.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timbray/quamina/0526acc321a81d4df535caf790879648ace11c86/testdata/citylots2.json.gz -------------------------------------------------------------------------------- /testdata/cl-sample-0: -------------------------------------------------------------------------------- 1 | { 2 | "type": "Feature", 3 | "properties": { 4 | "MAPBLKLOT": "0522019", 5 | "BLKLOT": "0522029", 6 | "BLOCK_NUM": "0522", 7 | "LOT_NUM": "029", 8 | "FROM_ST": "1501", 9 | "TO_ST": "1501", 10 | "STREET": "GREENWICH", 11 | "ST_TYPE": "ST", 12 | "ODD_EVEN": "O" 13 | }, 14 | "geometry": { 15 | "type": "Polygon", 16 | "coordinates": [ 17 | [ 18 | [ 19 | -122.42485025893909, 20 | 37.79987617371178, 21 | 0 22 | ], 23 | [ 24 | -122.42497841626495, 25 | 37.79985988225351, 26 | 0 27 | ], 28 | [ 29 | -122.42500404454923, 30 | 37.79985662417723, 31 | 0 32 | ], 33 | [ 34 | -122.42501748155543, 35 | 37.79992324159192, 36 | 0 37 | ], 38 | [ 39 | -122.42474494351363, 40 | 37.79995793763353, 41 | 0 42 | ], 43 | [ 44 | -122.42480702649785, 45 | 37.80026572899349, 46 | 0 47 | ], 48 | [ 49 | -122.42455467486131, 50 | 37.80029785556764, 51 | 0 52 | ], 53 | [ 54 | -122.42447893872841, 55 | 37.7999233762548, 56 | 0 57 | ], 58 | [ 59 | -122.42485025893909, 60 | 37.79987617371178, 61 | 0 62 | ] 63 | ] 64 | ] 65 | } 66 | } -------------------------------------------------------------------------------- /testdata/cl-sample-1: -------------------------------------------------------------------------------- 1 | { 2 | "type": "Feature", 3 | "properties": { 4 | "MAPBLKLOT": "1137023", 5 | "BLKLOT": "1137024", 6 | "BLOCK_NUM": "1137", 7 | "LOT_NUM": "024", 8 | "FROM_ST": "2942", 9 | "TO_ST": "2942", 10 | "STREET": "TURK", 11 | "ST_TYPE": "BLVD", 12 | "ODD_EVEN": "E" 13 | }, 14 | "geometry": { 15 | "type": 
"Polygon", 16 | "coordinates": [ 17 | [ 18 | [ 19 | -122.45409388918634, 20 | 37.777883689479076, 21 | 0 22 | ], 23 | [ 24 | -122.45413030345098, 25 | 37.778062628581004, 26 | 0 27 | ], 28 | [ 29 | -122.45395950559532, 30 | 37.77808448801483, 31 | 0 32 | ], 33 | [ 34 | -122.45392309059642, 35 | 37.77790554887966, 36 | 0 37 | ], 38 | [ 39 | -122.45409388918634, 40 | 37.777883689479076, 41 | 0 42 | ] 43 | ] 44 | ] 45 | } 46 | } -------------------------------------------------------------------------------- /testdata/cl-sample-2: -------------------------------------------------------------------------------- 1 | { 2 | "type": "Feature", 3 | "properties": { 4 | "MAPBLKLOT": "1663033", 5 | "BLKLOT": "1663033", 6 | "BLOCK_NUM": "1663", 7 | "LOT_NUM": "033", 8 | "FROM_ST": "1917", 9 | "TO_ST": "1917", 10 | "STREET": "CABRILLO", 11 | "ST_TYPE": "ST", 12 | "ODD_EVEN": "O" 13 | }, 14 | "geometry": { 15 | "type": "Polygon", 16 | "coordinates": [ 17 | [ 18 | [ 19 | -122.47930560364344, 20 | 37.774349512589446, 21 | 0 22 | ], 23 | [ 24 | -122.47930063079642, 25 | 37.77428096746438, 26 | 0 27 | ], 28 | [ 29 | -122.47938698614243, 30 | 37.77427702740435, 31 | 0 32 | ], 33 | [ 34 | -122.47940687674428, 35 | 37.77455120700214, 36 | 0 37 | ], 38 | [ 39 | -122.4793205211046, 40 | 37.77455514797736, 41 | 0 42 | ], 43 | [ 44 | -122.47931554823005, 45 | 37.77448660285547, 46 | 0 47 | ], 48 | [ 49 | -122.47931057649956, 50 | 37.774418057713426, 51 | 0 52 | ], 53 | [ 54 | -122.47930560364344, 55 | 37.774349512589446, 56 | 0 57 | ] 58 | ] 59 | ] 60 | } 61 | } -------------------------------------------------------------------------------- /wildcard.go: -------------------------------------------------------------------------------- 1 | package quamina 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | ) 8 | 9 | type wcState int 10 | 11 | const ( 12 | wcChilling wcState = iota 13 | wcAfterBS 14 | wcAfterGlob 15 | ) 16 | 17 | func readWildcardSpecial(pb *patternBuild, valsIn 
[]typedVal) ([]typedVal, error) { 18 | t, err := pb.jd.Token() 19 | if err != nil { 20 | return nil, err 21 | } 22 | pathVals := valsIn 23 | wcInput, ok := t.(string) 24 | if !ok { 25 | return nil, errors.New("value for `wildcard` must be a string") 26 | } 27 | inBytes := []byte(wcInput) 28 | state := wcChilling 29 | for i, b := range inBytes { 30 | switch state { 31 | case wcChilling: 32 | switch b { 33 | case '\\': 34 | if i == len(inBytes)-1 { 35 | return nil, errors.New("'\\' at end of string not allowed") 36 | } 37 | state = wcAfterBS 38 | case '*': 39 | state = wcAfterGlob 40 | } 41 | case wcAfterBS: 42 | switch b { 43 | case '\\', '*': 44 | state = wcChilling 45 | default: 46 | return nil, errors.New("`\\` can only be followed by '\\' or '*'") 47 | } 48 | case wcAfterGlob: 49 | switch b { 50 | case '*': 51 | return nil, fmt.Errorf("adjacent '*' characters not allowed") 52 | case '\\': 53 | state = wcAfterBS 54 | default: 55 | state = wcChilling 56 | } 57 | } 58 | } 59 | pathVals = append(pathVals, typedVal{vType: wildcardType, val: `"` + wcInput + `"`}) 60 | 61 | t, err = pb.jd.Token() 62 | if err != nil { 63 | return nil, err 64 | } 65 | switch t.(type) { 66 | case json.Delim: 67 | // } is all that will be returned 68 | default: 69 | return nil, errors.New("trailing garbage in wildcard pattern") 70 | } 71 | 72 | return pathVals, nil 73 | } 74 | 75 | // makeWildcardFA is a replacement for shellstyle patterns, the only difference being that escaping is 76 | // provided for * and \. 77 | func makeWildcardFA(val []byte, printer printer) (start *smallTable, nextField *fieldMatcher) { 78 | table := newSmallTable() 79 | start = table 80 | nextField = newFieldMatcher() 81 | 82 | // for each byte in the pattern. \-escape processing is simplified because illegal constructs such as \a and \ 83 | // at the end of the value have been rejected by readWildcardSpecial. 
84 | valIndex := 0 85 | for valIndex < len(val) { 86 | ch := val[valIndex] 87 | escaped := ch == '\\' 88 | if escaped { 89 | valIndex++ 90 | ch = val[valIndex] 91 | } 92 | if ch == '*' && !escaped { 93 | // special-case handling for string ending in '*"' - transition to field match on any character. 94 | // we know the trailing '"' will be there because of JSON syntax. We could use an epsilon state 95 | // but then the matcher will process through all the rest of the bytes, when it doesn't need to 96 | if valIndex == len(val)-2 { 97 | step := &faState{ 98 | table: newSmallTable(), 99 | fieldTransitions: []*fieldMatcher{nextField}, 100 | } 101 | table.epsilon = []*faState{step} 102 | printer.labelTable(table, fmt.Sprintf("prefix escape at %d", valIndex)) 103 | return 104 | } 105 | globStep := &faState{table: table} 106 | printer.labelTable(table, fmt.Sprintf("gS at %d", valIndex)) 107 | table.epsilon = []*faState{globStep} 108 | 109 | valIndex++ 110 | // ** is forbidden, if we're seeing *\* then the second * is non-magic, if we're seeing *\\, it 111 | // just means \, so either way, all we need to do is hop over this \ 112 | if val[valIndex] == '\\' { 113 | valIndex++ 114 | } 115 | globNext := &faState{table: newSmallTable()} 116 | printer.labelTable(globNext.table, fmt.Sprintf("gX on %c at %d", val[valIndex], valIndex)) 117 | table.addByteStep(val[valIndex], &faNext{states: []*faState{globNext}}) 118 | table = globNext.table 119 | } else { 120 | nextStep := &faState{table: newSmallTable()} 121 | printer.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex)) 122 | table.addByteStep(ch, &faNext{states: []*faState{nextStep}}) 123 | table = nextStep.table 124 | } 125 | valIndex++ 126 | } 127 | lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} 128 | printer.labelTable(lastStep.table, fmt.Sprintf("last step at %d", valIndex)) 129 | table.addByteStep(valueTerminator, &faNext{states: []*faState{lastStep}}) 
130 | return 131 | } 132 | --------------------------------------------------------------------------------