├── .deepsource.toml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── others.md └── workflows │ ├── docker.yml │ ├── quality.yml │ ├── rust.yml │ └── windows.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yml ├── docs ├── dev_notes.md ├── examples.md ├── fire-128.png ├── obsidian_example_2023-Feb-05.mp4 ├── release_notes.md ├── release_notes_0.2_2024Sep.md ├── screen_record_20220514.mkv ├── screenshot_demo.png ├── screenshot_demo_640_400.png └── server.md ├── example.env ├── fireSeqSearch_addon ├── icons │ ├── fire-48.png │ ├── notebook_logo_32.png │ ├── notebook_logo_512.png │ └── notebook_logo_64.png ├── main.js ├── manifest.json ├── monkeyscript.user.js ├── options.html ├── options.js ├── violentmonkeyscript.user.js └── wordcloud_draw.js ├── fire_seq_search_server ├── Cargo.toml ├── debug_server.sh ├── debug_server_mac.sh ├── deny.toml ├── obsidian.sh ├── run_server.sh ├── src │ ├── http_client │ │ ├── endpoints.rs │ │ └── mod.rs │ ├── language_tools │ │ ├── cn_stopwords.rs │ │ ├── mod.rs │ │ └── tokenizer.rs │ ├── lib.rs │ ├── load_notes │ │ └── mod.rs │ ├── local_llm │ │ ├── example_llama_response.json │ │ └── mod.rs │ ├── main.rs │ ├── markdown_parser │ │ ├── markdown_to_text.rs │ │ ├── mod.rs │ │ └── pdf_parser.rs │ ├── post_query │ │ ├── app_uri.rs │ │ ├── highlighter.rs │ │ ├── hit_parsed.rs │ │ ├── logseq_uri.rs │ │ ├── mod.rs │ │ └── obsidian_uri.rs │ ├── query_engine │ │ └── mod.rs │ └── word_frequency │ │ └── mod.rs └── tests │ ├── resource │ ├── assets │ │ └── screenshot_demo_640_400.png │ ├── journals │ │ ├── 2022_02_26.md │ │ └── 2022_08_30.md │ ├── logseq │ │ └── pages-metadata.edn │ └── pages │ │ ├── International Language, Past, Present & Future by Walter John Clark.md │ │ ├── LATIN FOR BEGINNERS.md │ │ ├── Rust.md │ │ ├── Softmax.md │ │ ├── advanced_query.md │ │ ├── blog_thunderbird_zh.md │ │ ├── cyrillic.md │ │ ├── feditips.md │ │ ├── fireSeqSearch___test___5.md │ │ ├── 咖啡.md │ │ └── 孙子兵法.md │ ├── run_render.sh │ ├── unit_test_load_notes.rs │ ├── unit_test_post_query.rs │ └── unit_test_render_block.rs └── pack_firefox_extension.sh /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "javascript" 5 | enabled = true 6 | 7 | [[analyzers]] 8 | name = "rust" 9 | enabled = true 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a bug report 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **What's the term you're searching** 14 | 15 | 16 | **Server-side log and version** 17 | Please review the log before posting. Sensitive or private data may be included 18 | 19 | 20 | **Client-side log and version** 21 | Please review the log before posting. Sensitive or private data may be included 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | Note: Only English and Chinese posts are allowed in the issues section. English is preferred. 33 | 请使用英文或中文发 issue. 
34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | Note: Only English and Chinese posts are allowed in the issues section. English is preferred. 29 | 请使用英文或中文发 issue. 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/others.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Others 3 | about: Feel free to post other issues here 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Don't worry if your issue doesn't fit a template. The templates are designed to save you time, not to enforce requirements for posting issues. 11 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: Build/release docker images 2 | on: 3 | push: 4 | branches: 5 | - 'master' 6 | paths-ignore: 7 | - '**.md' 8 | jobs: 9 | docker: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | 16 | - name: Login to GitHub Container Registry 17 | uses: docker/login-action@v1 18 | with: 19 | registry: ghcr.io 20 | username: ${{ github.repository_owner }} 21 | password: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Lower case for ghcr 24 | id: ghcr_string 25 | uses: ASzc/change-string-case-action@v1 26 | with: 27 | string: ${{ github.event.repository.full_name }} 28 | 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v2 31 | 32 | - name: Cache Docker layers 33 | uses: actions/cache@v2 34 | with: 35 | path: /tmp/.buildx-cache 36 | key: ${{ runner.os }}-buildx-${{ github.sha }} 37 | restore-keys: | 38 | ${{ runner.os }}-buildx- 39 | - name: Build and push 40 | uses: docker/build-push-action@v3 41 | with: 42 | context: .
43 | platforms: linux/amd64 44 | push: true 45 | tags: | 46 | ghcr.io/${{ steps.ghcr_string.outputs.lowercase }} 47 | cache-from: type=local,src=/tmp/.buildx-cache 48 | cache-to: type=local,dest=/tmp/.buildx-cache-new 49 | 50 | - name: Move cache 51 | run: | 52 | rm -rf /tmp/.buildx-cache 53 | mv /tmp/.buildx-cache-new /tmp/.buildx-cache 54 | -------------------------------------------------------------------------------- /.github/workflows/quality.yml: -------------------------------------------------------------------------------- 1 | name: Quality Control 2 | on: [push, pull_request] 3 | 4 | env: 5 | CARGO_TERM_COLOR: always 6 | RUSTC_WRAPPER: "sccache" 7 | SCCACHE_GHA_ENABLED: "true" 8 | 9 | jobs: 10 | cargo-deny: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install Rust 15 | uses: actions-rs/toolchain@v1 16 | with: 17 | toolchain: stable 18 | profile: minimal 19 | override: true 20 | - name: Run sccache-cache 21 | uses: mozilla-actions/sccache-action@v0.0.8 22 | - name: Get Date 23 | id: get-date 24 | run: | 25 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 26 | shell: bash 27 | - name: Cache cargo registry 28 | uses: actions/cache@v3 29 | continue-on-error: false 30 | with: 31 | path: | 32 | ~/.cargo/registry 33 | key: rust-${{ runner.os }}-${{ matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}-${{ steps.get-date.outputs.date }}-qc 34 | restore-keys: | 35 | rust-${{ runner.os }}-${{ matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}- 36 | rust-${{ runner.os }} 37 | - name: Install cargo deny 38 | run: | 39 | cargo install --locked cargo-deny 40 | - name: cargo deny license 41 | run: | 42 | cd fire_seq_search_server 43 | cargo deny check licenses 44 | - name: cargo deny advisories 45 | run: | 46 | cd fire_seq_search_server 47 | cargo deny check advisories 48 | - name: clippy 49 | run: | 50 | cd fire_seq_search_server 51 | rustup component add clippy 52 | cargo clippy 53 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Linux and macOS 2 | 3 | # Template Reference: https://www.infinyon.com/blog/2021/04/github-actions-best-practices/ 4 | on: 5 | push: 6 | branches: [ master, llm_candidate ] 7 | pull_request: 8 | branches: [ master, llm_candidate ] 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | RUSTC_WRAPPER: "sccache" 13 | SCCACHE_GHA_ENABLED: "true" 14 | 15 | jobs: 16 | build: 17 | name: Cargo test (${{ matrix.rust }}) (${{ matrix.os }}) 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | os: [ubuntu-latest, macos-13, macos-14] 23 | rust: [stable] 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Run sccache-cache 27 | uses: mozilla-actions/sccache-action@v0.0.8 28 | - name: Install Rust ${{ matrix.rust }} 29 | uses: actions-rs/toolchain@v1 30 | with: 31 | toolchain: ${{ matrix.rust }} 32 | profile: minimal 33 | override: true 34 | - name: Run sccache-cache 35 | uses: mozilla-actions/sccache-action@v0.0.8 36 | - name: Get Date 37 | id: get-date 38 | run: | 39 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 40 | shell: bash 41 | - name: Cache cargo registry 42 | uses: actions/cache@v3 43 | continue-on-error: false 44 | with: 45 | path: | 46 | ~/.cargo/registry 47 | key: rust-${{ runner.os }}-${{ matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}-${{ steps.get-date.outputs.date }}-build 48 | restore-keys: | 49 | rust-${{ runner.os }}-${{ 
matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}- 50 | rust-${{ runner.os }} 51 | # Real tests starts here 52 | - name: Check version 53 | run: rustup --version && rustc --version && cargo --version 54 | - name: Install rustfmt 55 | run: | 56 | rustup component add rustfmt 57 | which rustfmt && rustfmt --version 58 | - name: Build 59 | run: | 60 | cd fire_seq_search_server 61 | cargo build --verbose 62 | - name: Run tests 63 | run: | 64 | cd fire_seq_search_server 65 | cargo test --verbose 66 | - name: Install 67 | run: | 68 | cd fire_seq_search_server 69 | cargo install --path . 70 | - name: Package 71 | run: | 72 | cd fire_seq_search_server 73 | cargo package --verbose 74 | - name: Run sccache stat for check 75 | shell: bash 76 | run: ${SCCACHE_PATH} --show-stats 77 | 78 | 79 | release: 80 | needs: build 81 | runs-on: ubuntu-latest 82 | steps: 83 | - uses: actions/checkout@v2 84 | - name: Install Rust stable 85 | uses: actions-rs/toolchain@v1 86 | with: 87 | toolchain: stable 88 | profile: minimal 89 | override: true 90 | - name: Run sccache-cache 91 | uses: mozilla-actions/sccache-action@v0.0.8 92 | - name: Get Date 93 | id: get-date 94 | run: | 95 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 96 | shell: bash 97 | - name: Cache cargo registry 98 | uses: actions/cache@v3 99 | continue-on-error: false 100 | with: 101 | path: | 102 | ~/.cargo/registry 103 | key: rust-${{ runner.os }}-${{ matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}-${{ steps.get-date.outputs.date }}-build 104 | restore-keys: | 105 | rust-${{ runner.os }}-${{ matrix.rust }}-${{ hashFiles('**/Cargo.toml') }}- 106 | rust-${{ runner.os }} 107 | - name: Build 108 | run: | 109 | cd fire_seq_search_server 110 | cargo build --verbose 111 | - name: Save Artifact 112 | run: | 113 | mkdir builds 114 | mv fire_seq_search_server/target/debug/fire_seq_search_server builds 115 | - name: Upload Artifact 116 | uses: actions/upload-artifact@v4 117 | with: 118 | name: logseq-${{ runner.os }}-builds 119 | path: builds 120 | - name: Run sccache stat for check 121 | shell: bash 122 | run: ${SCCACHE_PATH} --show-stats 123 | 124 | 125 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | # This file is based on https://github.com/rust-lang/rustfmt/blob/master/.github/workflows/windows.yml 2 | # rustfmt is licensed in MIT Copyright (c) 2016-2021 The Rust Project Developers https://github.com/rust-lang/rustfmt/blob/master/LICENSE-MIT 3 | 4 | 5 | name: Windows 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | env: 13 | CARGO_TERM_COLOR: always 14 | RUSTC_WRAPPER: "sccache" 15 | SCCACHE_GHA_ENABLED: "true" 16 | 17 | 18 | jobs: 19 | build: 20 | runs-on: windows-latest 21 | name: (${{ matrix.target }}, ${{ matrix.cfg_release_channel }}) 22 | env: 23 | CFG_RELEASE_CHANNEL: ${{ matrix.cfg_release_channel }} 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | target: [ 28 | i686-pc-windows-msvc, 29 | x86_64-pc-windows-msvc, 30 | i686-pc-windows-gnu, 31 | x86_64-pc-windows-gnu, 32 | ] 33 | cfg_release_channel: [stable] 34 | 35 | steps: 36 | # The Windows runners have autocrlf enabled by default 37 | # which causes failures for some of rustfmt's line-ending sensitive tests 38 | - name: disable git eol translation 39 | run: git config --global core.autocrlf false 40 | - name: checkout 41 | uses: actions/checkout@v3 42 | 43 | # Run build 44 | - name: 
Install Rustup using win.rustup.rs 45 | run: | 46 | # Disable the download progress bar which can cause perf issues 47 | $ProgressPreference = "SilentlyContinue" 48 | Invoke-WebRequest https://win.rustup.rs/ -OutFile rustup-init.exe 49 | .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc --default-toolchain=none 50 | del rustup-init.exe 51 | rustup target add ${{ matrix.target }} 52 | shell: powershell 53 | - name: Run sccache-cache 54 | uses: mozilla-actions/sccache-action@v0.0.8 55 | - name: Get Date 56 | id: get-date 57 | run: | 58 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 59 | shell: bash 60 | - name: Cache cargo registry 61 | uses: actions/cache@v3 62 | continue-on-error: false 63 | with: 64 | path: | 65 | ~/.cargo/registry 66 | key: rust-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('**/Cargo.toml') }}-${{ steps.get-date.outputs.date }}-build 67 | restore-keys: | 68 | rust-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('**/Cargo.toml') }} 69 | rust-${{ runner.os }} 70 | 71 | - name: build 72 | run: | 73 | rustc -Vv 74 | cargo -V 75 | cd fire_seq_search_server 76 | cargo build 77 | shell: cmd 78 | 79 | - name: test 80 | run: | 81 | cd fire_seq_search_server 82 | cargo test 83 | shell: cmd 84 | - name: Package 85 | run: | 86 | cd fire_seq_search_server 87 | cargo package --verbose 88 | shell: cmd 89 | - name: Run sccache stat for check 90 | shell: bash 91 | run: ${SCCACHE_PATH} --show-stats 92 | 93 | 94 | release: 95 | needs: build 96 | runs-on: windows-latest 97 | name: Build for (${{ matrix.target }} with ${{ matrix.cfg_release_channel }}) 98 | env: 99 | CFG_RELEASE_CHANNEL: ${{ matrix.cfg_release_channel }} 100 | strategy: 101 | fail-fast: false 102 | matrix: 103 | target: [ 104 | i686-pc-windows-msvc, 105 | x86_64-pc-windows-msvc, 106 | ] 107 | cfg_release_channel: [ stable ] 108 | steps: 109 | - name: disable git eol translation 110 | run: git config --global core.autocrlf false 111 | - name: checkout 112 | uses: actions/checkout@v3 113 | - name: Install Rustup using win.rustup.rs 114 | run: | 115 | # Disable the download progress bar which can cause perf issues 116 | $ProgressPreference = "SilentlyContinue" 117 | Invoke-WebRequest https://win.rustup.rs/ -OutFile rustup-init.exe 118 | .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc --default-toolchain=none 119 | del rustup-init.exe 120 | rustup target add ${{ matrix.target }} 121 | shell: powershell 122 | - name: Run sccache-cache 123 | uses: Xuanwo/sccache-action@c94e27bef21ab3fb4a5152c8a878c53262b4abb0 124 | with: 125 | version: "v0.4.0-pre.6" 126 | - name: Get Date 127 | id: get-date 128 | run: | 129 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 130 | shell: bash 131 | - name: Cache cargo registry and sccache 132 | uses: actions/cache@v3 133 | continue-on-error: false 134 | with: 135 | path: | 136 | ~/.cargo/registry 137 | key: rust-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('**/Cargo.toml') }}-${{ steps.get-date.outputs.date }}-build 138 | restore-keys: | 139 | rust-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('**/Cargo.toml') }} 140 | rust-${{ runner.os }} 141 | - name: build 142 | run: | 143 | rustc -Vv 144 | cargo -V 145 | cd fire_seq_search_server 146 | cargo build --release 147 | shell: cmd 148 | - name: test 149 | run: | 150 | cd fire_seq_search_server 151 | cargo test 152 | shell: cmd 153 | - name: Package 154 | run: | 155 | cd fire_seq_search_server 156 | cargo package --verbose 157 | shell: cmd 158 | - name: Save Artifact 159 | run: | 160 | 
mkdir builds 161 | mv fire_seq_search_server/target/debug/fire_seq_search_server.exe builds 162 | shell: cmd 163 | - name: Upload Artifact 164 | uses: actions/upload-artifact@v4 165 | with: 166 | name: logseq-${{ runner.os }}-builds 167 | path: builds 168 | - name: Run sccache stat for check 169 | shell: bash 170 | run: ${SCCACHE_PATH} --show-stats 171 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | fire_seq_search_server/Cargo.lock 2 | fire_seq_search_server/target 3 | 4 | # === https://github.com/rust-lang/rust/blob/master/.gitignore === 5 | 6 | ## File system 7 | .DS_Store 8 | desktop.ini 9 | 10 | ## Editor 11 | *.swp 12 | *.swo 13 | Session.vim 14 | .cproject 15 | .idea 16 | *.iml 17 | .vscode 18 | .project 19 | .favorites.json 20 | .settings/ 21 | 22 | ## Tool 23 | .valgrindrc 24 | .cargo 25 | # Included because it is part of the test case 26 | !/src/test/run-make/thumb-none-qemu/example/.cargo 27 | 28 | ## Configuration 29 | /config.toml 30 | /Makefile 31 | config.mk 32 | config.stamp 33 | no_llvm_build 34 | 35 | ## Build 36 | /dl/ 37 | /doc/ 38 | /inst/ 39 | /llvm/ 40 | /mingw-build/ 41 | /build/ 42 | /dist/ 43 | /unicode-downloads 44 | /target 45 | /src/tools/x/target 46 | # Generated by compiletest for incremental 47 | /tmp/ 48 | # Created by default with `src/ci/docker/run.sh` 49 | /obj/ 50 | 51 | ## Temporary files 52 | *~ 53 | \#* 54 | \#*\# 55 | .#* 56 | 57 | ## Tags 58 | tags 59 | tags.* 60 | TAGS 61 | TAGS.* 62 | 63 | ## Python 64 | __pycache__/ 65 | *.py[cod] 66 | *$py.class 67 | 68 | ## Node 69 | node_modules 70 | package-lock.json 71 | 72 | ## Rustdoc GUI tests 73 | src/test/rustdoc-gui/src/**.lock 74 | 75 | # ==== GitHub ignore list for node ==== 76 | # Logs 77 | logs 78 | *.log 79 | npm-debug.log* 80 | yarn-debug.log* 81 | yarn-error.log* 82 | lerna-debug.log* 83 | 84 | # Diagnostic reports (https://nodejs.org/api/report.html) 85 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 86 | 87 | # Runtime data 88 | pids 89 | *.pid 90 | *.seed 91 | *.pid.lock 92 | 93 | # Directory for instrumented libs generated by jscoverage/JSCover 94 | lib-cov 95 | 96 | # Coverage directory used by tools like istanbul 97 | coverage 98 | *.lcov 99 | 100 | # nyc test coverage 101 | .nyc_output 102 | 103 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 104 | .grunt 105 | 106 | # Bower dependency directory (https://bower.io/) 107 | bower_components 108 | 109 | # node-waf configuration 110 | .lock-wscript 111 | 112 | # Compiled binary addons (https://nodejs.org/api/addons.html) 113 | build/Release 114 | 115 | # Dependency directories 116 | node_modules/ 117 | jspm_packages/ 118 | 119 | # TypeScript v1 declaration files 120 | typings/ 121 | 122 | # TypeScript cache 123 | *.tsbuildinfo 124 | 125 | # Optional npm cache directory 126 | .npm 127 | 128 | # Optional eslint cache 129 | .eslintcache 130 | 131 | # Microbundle cache 132 | .rpt2_cache/ 133 | .rts2_cache_cjs/ 134 | .rts2_cache_es/ 135 | .rts2_cache_umd/ 136 | 137 | # Optional REPL history 138 | .node_repl_history 139 | 140 | # Output of 'npm pack' 141 | *.tgz 142 | 143 | # Yarn Integrity file 144 | .yarn-integrity 145 | 146 | # dotenv environment variables file 147 | .env 148 | .env.test 149 | 150 | # parcel-bundler cache (https://parceljs.org/) 151 | .cache 152 | 153 | # Next.js build output 154 | .next 155 | 156 | # Nuxt.js build / generate output 157 | .nuxt 158 | dist 159 | 
160 | # Gatsby files 161 | .cache/ 162 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 163 | # https://nextjs.org/blog/next-9-1#public-directory-support 164 | # public 165 | 166 | # vuepress build output 167 | .vuepress/dist 168 | 169 | # Serverless directories 170 | .serverless/ 171 | 172 | # FuseBox cache 173 | .fusebox/ 174 | 175 | # DynamoDB Local files 176 | .dynamodb/ 177 | 178 | # TernJS port file 179 | .tern-port 180 | 181 | 182 | 183 | /fire_seq_search_server/fire_seq_search_server 184 | /fireSeqSearch_chrome.zip 185 | /fireSeqSearch.zip 186 | /chrome_tmp 187 | 188 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.65-buster AS builder 2 | 3 | WORKDIR /fire_seq_search_server 4 | COPY ./fire_seq_search_server . 5 | 6 | RUN cargo install --path . 7 | 8 | FROM ubuntu:20.04 9 | COPY --from=builder /usr/local/cargo/bin/fire_seq_search_server /usr/local/bin/fire_seq_search_server 10 | 11 | ENV RUST_LOG=debug 12 | CMD ["sh", "-c", "fire_seq_search_server --notebook_path $NOTEBOOK_DIR --host 0.0.0.0:3030"] 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2023 Zhenbo Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fireSeqSearch: Append Logseq/Obsidian notes while Googling 2 | 3 | Introduction 4 | -------- 5 | [fireSeqSearch](https://github.com/Endle/fireSeqSearch) is inspired by [Evernote](https://evernote.com)'s browser extension - if we search a term, for example, `softmax` in Google, [fireSeqSearch](https://github.com/Endle/fireSeqSearch) will also search in our personal notebook, and append the hits to the Google results. 6 | 7 | More examples at 8 | 9 | 10 | 11 | How to use it 12 | ------------------ 13 | You need to install **BOTH** the server-side app and the browser extension. The server reads your logseq notebooks in read-only mode, and hosts endpoints at 127.0.0.1:3030. 14 | 15 | ### Install Browser Extension 16 | 1. Install the latest web extension 17 | 2. If you're using another browser, you can install userscripts instead.
[Tampermonkey](https://www.tampermonkey.net/) => [monkeyscript.user.js](https://github.com/Endle/fireSeqSearch/raw/master/fireSeqSearch_addon/monkeyscript.user.js). [Violentmonkey](https://violentmonkey.github.io/) => [violentmonkeyscript.user.js](https://github.com/Endle/fireSeqSearch/blob/master/fireSeqSearch_addon/violentmonkeyscript.user.js) 18 | 19 | 20 | ### Install Local Server 21 | 22 | **Obsidian MD** users: Run `fire_seq_search_server --notebook_path <path-to-vault> --obsidian-md`. [Example obsidian.sh](https://github.com/Endle/fireSeqSearch/blob/master/fire_seq_search_server/obsidian.sh) 23 | 24 | 25 | #### Windows 26 | Steps: 27 | 1. Download the latest release at 28 | 2. If you're using PowerShell, run `.\fire_seq_search_server.exe --notebook_path C:\Users\li\logseq_notebook` 29 | 3. If you're using Msys2, run `./fire_seq_search_server --notebook_path /c/Users/li/logseq_notebook` 30 | 4. Please remember to change the path to your notebook 31 | 32 | #### Linux and macOS 33 | 1. Install Rust. See 34 | 2. `git clone https://github.com/Endle/fireSeqSearch` 35 | 3. `cd fireSeqSearch/fire_seq_search_server && cargo build` 36 | 4. `target/debug/fire_seq_search_server --notebook_path /home/li/my_notebook` 37 | 5. Min Rust version: see https://github.com/Endle/fireSeqSearch/blob/master/.github/workflows/rust.yml#L21 38 | 39 | 40 | 41 | License 42 | ---------------- 43 | This project (both server and addon) uses the MIT license. Some third-party libraries may have other licenses (see the source code) 44 | 45 | 46 | UI icons created by manshagraphics - Flaticon 47 | 48 | 49 | LOGO link: 50 | 51 | 52 | LOGO license: Flaticon license 53 | 54 | 55 | How it works 56 | --------- 57 | This is what [fireSeqSearch](https://github.com/Endle/fireSeqSearch) does on my logseq notebook. I had to split it into two parts because Firefox extensions are not allowed to access local files. 58 | 59 | fireSeqSearch has two parts: 60 | 61 | ### 1. Search server 62 | It reads all local logseq notebooks, and hosts logseq pages on http://127.0.0.1:3030 63 | 64 | It provides the API `http://127.0.0.1:3030/query/` 65 | 66 | 67 | ### 2. Browser extension 68 | Every time we use a search engine, it fetches `http://127.0.0.1:3030/query/keywords` and appends all hits to the web page. 69 | 70 | 71 | Similar Projects 72 | -------------- 73 | * [karlicoss/promnesia](https://github.com/karlicoss/promnesia) - [Promnesia](https://github.com/karlicoss/promnesia) is a mature and interesting project, aiming at a more ambitious goal. [fireSeqSearch](https://github.com/Endle/fireSeqSearch) only does one thing - append logseq hits to search engine results.
74 | * Logseq Copilot - https://chrome.google.com/webstore/detail/logseq-copilot/hihgfcgbmnbomabfdbajlbpnacndeihl 75 | 76 | Star History 77 | -------- 78 | 79 | 80 | [![Star History Chart](https://api.star-history.com/svg?repos=Endle/fireSeqSearch&type=Date)](https://star-history.com/#Endle/fireSeqSearch&Date) 81 | 82 | Provided by 83 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | fire_seq_search_server: 5 | image: ghcr.io/endle/fireseqsearch:latest 6 | env_file: .env 7 | environment: 8 | - NOTEBOOK_DIR 9 | restart: always 10 | ports: 11 | - "127.0.0.1:3030:3030" 12 | volumes: 13 | - ${NOTEBOOK_DIR}:${NOTEBOOK_DIR}:Z 14 | -------------------------------------------------------------------------------- /docs/dev_notes.md: -------------------------------------------------------------------------------- 1 | ## How to set manifest.json permission 2 | 3 | I received an answer at 4 | 5 | Due to that, I should not include the port. 6 | 7 | ## Why use a local server 8 | 9 | Firefox extensions are not allowed to read local files. 10 | 11 | ## Why insert fireSeqSearchDom before document.body.firstChild 12 | 13 | I struggled with this. I tried several DOM elements, like `search`, `GyAeWb`, but `insertBefore()` failed. 14 | My naive frontend knowledge was not sufficient to solve it. 2022-Jan-28 (A sketch of the approach that finally worked is at the end of this file.) 15 | 16 | ## Rust version needed 17 | 18 | lz4_flex uses newly stabilized features. tantivy depends on it. 19 | 20 | clap-rs even requires newer versions, so I'm only supporting 1.55+. 2022-Feb-05 21 | 22 | ## Match all Google regions 23 | 24 | In 0.0.4, I added one extra domain. I think I should find a more elegant way. 25 | 26 | 27 | ## CLion's shell is confusing under Windows 28 | I got `ld: cannot find -lntdll`. No idea why it happened. However, my previous clone works fine. Why is this terrible OS so popular?! 29 | 30 | ## How to handle long paragraphs 31 | 32 | 33 | On 2022-Nov-13, I tried https://bminixhofer.github.io/nnsplit . This tool is powerful. However, the model is too slow for my purpose.
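For reference, a minimal sketch (in the addon's JavaScript) of the insertion approach that eventually worked. The container IDs below are the ones main.js actually targets; anything beyond that is illustrative only.

```javascript
// Sketch of the approach main.js settled on: instead of calling
// insertBefore() on document.body.firstChild, locate the search engine's
// result container and insert the fireSeqSearch div right before it.
function insertFireSeqDom(resultDiv) {
    let contextId = "rcnt"; // Google's result container
    if (window.location.host.includes("duckduckgo.com")) {
        contextId = "web_content_wrapper"; // DuckDuckGo's container
    }
    const context = document.getElementById(contextId);
    if (context) {
        context.insertAdjacentElement("beforebegin", resultDiv);
    }
}
```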
34 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | ## LLM 2 | 3 | 2024 Sept 22 4 | 5 | https://github.com/user-attachments/assets/b0a4ca66-0a33-401a-a916-af7a69f2ae7b 6 | 7 | ## ObsidianMD 8 | 9 | [obsidian_example_2023-Feb-05.mp4]( 10 | https://user-images.githubusercontent.com/3221521/216853025-5cb82b18-fbcc-438e-8ff6-f791713c6b8b.mp4) 11 | 12 | 13 | # Old examples (before highlighter) 14 | 15 | 16 | ![screenshot_demo](https://user-images.githubusercontent.com/3221521/168455027-965da612-b783-4d92-83e2-4cd7b4830a43.png) 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/fire-128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/docs/fire-128.png -------------------------------------------------------------------------------- /docs/obsidian_example_2023-Feb-05.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/docs/obsidian_example_2023-Feb-05.mp4 -------------------------------------------------------------------------------- /docs/release_notes.md: -------------------------------------------------------------------------------- 1 | ### 0.1.3 2 | 3 | #### New Feature: Generate wordcloud. 4 | 5 | Just visit `http://127.0.0.1:3030/wordcloud`, and fireSeqSearch will generate a wordcloud from your logseq notes. Each word in the cloud is clickable. With a single click, you can search your top words in search engines and your personal notes simultaneously. 6 | 7 | [This demo video](https://github.com/Endle/fireSeqSearch/assets/3221521/524fe70d-a128-4393-bd26-bee71871f38e) used `note of greek myth`, created by [yongerG](https://www.douban.com/note/807432536/?_i=8350280BMJZhl7). This note is [licensed under the CC-BY-SA-4.0 license](https://github.com/Lihaogx/graph-note-of-greek-myth/blob/main/LICENSE). 8 | 9 | Thanks to [timdream](https://timdream.org/) and other contributors for the amazing library [wordcloud2.js](https://github.com/timdream/wordcloud2.js). 10 | 11 | #### New Feature: Allow filtering out Zotero-imported pages [Issue 122](https://github.com/Endle/fireSeqSearch/issues/122) 12 | 13 | ### 0.1.2 14 | New server-side feature: [Read and Search PDF contents](https://github.com/Endle/fireSeqSearch/issues/63)! In a logseq page, the PDF link `![title](../assets/doc_000123_0.pdf)` will be parsed, and appended to the document. 15 | 16 | #### How to use it 17 | This feature is turned off by default. Add `--parse-pdf-links` to enable PDF parsing. [See example](https://github.com/Endle/fireSeqSearch/blob/81a9c2fc53ef589e8e63d19467825d63a84bd404/fire_seq_search_server/debug_server.sh#L8) 18 | 19 | Known limitation: performance. It needs further evaluation. 20 | 21 | #### Thanks 22 | The crate [PDF-extract](https://github.com/jrmuizel/pdf-extract) makes this new feature possible. Thanks to [Jeff Muizelaar](https://github.com/jrmuizel) and [Joep Meindertsma](https://github.com/joepio) for it. 23 | 24 | 25 | [Clifford Enoc](https://github.com/cliffordx) created this feature request. 26 | 27 | 28 | ### 0.1.1 29 | This is the first time the **MINOR version** has been bumped, for a big new feature: 30 | 31 | ObsidianMD support!
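As background (not from the original note): Obsidian exposes a public URI scheme, `obsidian://open?vault=<vault>&file=<file>`, so each search hit can deep-link into a vault. A rough JavaScript sketch of building such a link follows; the server's real logic lives in src/post_query/obsidian_uri.rs, which is not shown in this dump, so treat the details as assumptions.

```javascript
// Hypothetical sketch: build an Obsidian deep link for a search hit.
// Obsidian's documented URI scheme is obsidian://open?vault=...&file=...;
// the actual server-side implementation (obsidian_uri.rs) may differ.
function buildObsidianUri(vaultName, pageTitle) {
    const vault = encodeURIComponent(vaultName);
    const file = encodeURIComponent(pageTitle);
    return `obsidian://open?vault=${vault}&file=${file}`;
}
```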
32 | 33 | Bug fixes, with contributions from xxchan. 34 | Dev change: added sccache, with support from xuanwo. 35 | 36 | This is a server-side update. 37 | 38 | ### 0.0.22 39 | This is both a server-side and a client-side update. 40 | 41 | New feature: [include journal pages in search results](https://github.com/Endle/fireSeqSearch/issues/65). This feature is turned off by default. Use `--enable-journal-query` to enable it. 42 | 43 | Currently, I haven't figured out an approach to generate the Logseq URI for a specific journal page. 44 | 45 | ### 0.0.19 46 | This is a server-side update. 47 | 48 | 1. Fixed [highlight for Cyrillic letters](https://github.com/Endle/fireSeqSearch/issues/59). 2. Improvement: when a paragraph is too long, use its summary (see [Issue 57](https://github.com/Endle/fireSeqSearch/issues/57) and [commit](https://github.com/Endle/fireSeqSearch/commit/fb15a17bb9a47754bb7817891b01f08108c8c952)) 50 | 51 | ### 0.0.18 52 | Exciting new UI by @phoenixeliot and @yoyurec 53 | Thank you for your contribution! 54 | 55 | No change on the server side. All you need to do is update the Firefox extension or user script. 56 | 57 | ### 0.0.16 58 | 59 | 1. Experimental support for search summaries. 60 | 2. Parse markdown before feeding it to tantivy. This is expected to reduce false positives in search hits. 61 | 62 | #### How to enable search summary 63 | 1. Update the server and Firefox extension to the latest version. 64 | 2. Firefox Tools->Settings->Extension->fireSeqSearch, enable "Show Summary" 65 | 66 | #### Limitations 67 | If the block is very long (for example, you clipped a long article into logseq), the summary would be hard (or useless) to read. That's why there is a "Hide Summary" button. 68 | 69 | #### Thanks 70 | @raphlinus and other https://github.com/raphlinus/pulldown-cmark developers 71 | @arranf and @fbecart for https://github.com/fbecart/markdown_to_text 72 | 73 | -------------------------------------------------------------------------------- /docs/release_notes_0.2_2024Sep.md: -------------------------------------------------------------------------------- 1 | ### 0.2.1 2 | 3 | New feature: Note Summarization with Local LLM. 4 | 5 | What happens locally, stays local. 6 | 7 | #### Run server with local LLM 8 | fireSeqSearch leverages [llamafile](https://github.com/Mozilla-Ocho/llamafile) by [Mozilla](https://github.com/Mozilla-Ocho). 9 | 10 | ``` 11 | mkdir -pv ~/.llamafile && cd ~/.llamafile 12 | wget -O mistral-7b-instruct-v0.2.Q4_0.llamafile 'https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q4_0.llamafile?download=true' 13 | chmod +x mistral-7b-instruct-v0.2.Q4_0.llamafile 14 | ``` 15 | 16 | After that, compile and run fireSeqSearch with LLM support: 17 | ``` 18 | cargo build --features llm 19 | target/debug/fire_seq_search_server --notebook_path ~/logseq 20 | # Obsidian users 21 | target/debug/fire_seq_search_server --notebook_path ~/obsidian --obsidian-md 22 | ``` 23 | 24 | Finally, update the [Firefox Addon](https://addons.mozilla.org/en-US/firefox/addon/fireseqsearch/). 25 | 26 | #### Demo Video 27 | https://github.com/user-attachments/assets/b0a4ca66-0a33-401a-a916-af7a69f2ae7b 28 | 29 | This demo used [AstroWiki](https://github.com/AYelland/AstroWiki_2.0), which is licensed under the MIT license.
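For reference, a sketch of how the addon consumes the summarization endpoints, mirroring the flow in main.js: `/llm_done_list` reports which page titles have finished summarization, and `/summarize/<title>` returns the summary text. Both endpoints appear in main.js; the retry policy below is an assumption.

```javascript
// Sketch of the polling flow in main.js: check whether the local LLM has
// finished a page, then fetch its summary. Returns null if not ready yet.
async function fetchLlmSummary(title) {
    const doneList = await (await fetch("http://127.0.0.1:3030/llm_done_list")).json();
    if (!doneList.includes(title)) {
        return null; // summarization still running; the caller may retry later
    }
    return (await fetch("http://127.0.0.1:3030/summarize/" + title)).text();
}
```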
30 | -------------------------------------------------------------------------------- /docs/screen_record_20220514.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/docs/screen_record_20220514.mkv -------------------------------------------------------------------------------- /docs/screenshot_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/docs/screenshot_demo.png -------------------------------------------------------------------------------- /docs/screenshot_demo_640_400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/docs/screenshot_demo_640_400.png -------------------------------------------------------------------------------- /docs/server.md: -------------------------------------------------------------------------------- 1 | ##### fire_seq_search_server 2 | 3 | Currently, this server runs on a hard-coded port (http://localhost:3030) 4 | 5 | ### Endpoints 6 | 7 | #### GET `/server_info` 8 | 9 | 10 | #### GET `/query/%s` 11 | 12 | Returns an array of `hit`s. 13 | 14 | Schema of `hit` (**unstable**): 15 | - title: the title of the logseq page 16 | - summary 17 | - score 18 | 19 | 20 | -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | NOTEBOOK_DIR=/path/to/notebook -------------------------------------------------------------------------------- /fireSeqSearch_addon/icons/fire-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/fireSeqSearch_addon/icons/fire-48.png -------------------------------------------------------------------------------- /fireSeqSearch_addon/icons/notebook_logo_32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/fireSeqSearch_addon/icons/notebook_logo_32.png -------------------------------------------------------------------------------- /fireSeqSearch_addon/icons/notebook_logo_512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/fireSeqSearch_addon/icons/notebook_logo_512.png -------------------------------------------------------------------------------- /fireSeqSearch_addon/icons/notebook_logo_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/fireSeqSearch_addon/icons/notebook_logo_64.png -------------------------------------------------------------------------------- /fireSeqSearch_addon/main.js: -------------------------------------------------------------------------------- 1 | // MIT License 2 | // Copyright (c) 2021-2024 Zhenbo Li 3 | 4 | const fireSeqSearchDomId = "fireSeqSearchDom"; 5 | 6 | 7 | const fireSeqSearchScriptCSS = ` 8 | #fireSeqSearchDom { 9 | margin: 1em 1em 1em 1em; 10 | color:
var(--theme-col-txt-snippet); /* duckduck color*/ 11 | } 12 | #fireSeqSearchDom.experimentalLayout { 13 | position: fixed; 14 | top: 140px; 15 | right: 12px; 16 | width: 200px; 17 | background-color: hsla(200, 40%, 96%, .8); 18 | font-size: 12px; 19 | border-radius: 6px; 20 | z-index: 99999; 21 | } 22 | .fireSeqSearchTitleBar { 23 | margin: 0.5em 0; 24 | } 25 | .hideSummary { 26 | margin: 0 1em; 27 | } 28 | #fireSeqSearchDom ul { 29 | margin: 0; 30 | padding: 0.6em; 31 | border: 1px dotted gray; 32 | list-style: none; 33 | line-height: 1.5em; 34 | } 35 | #fireSeqSearchDom ul li { 36 | font-size: 15px; 37 | } 38 | #fireSeqSearchDom ul li + li { 39 | margin-top: 0.4em; 40 | } 41 | #fireSeqSearchDom ul li a { 42 | text-decoration: underline; 43 | text-decoration-style: dotted; 44 | text-decoration-thickness: 1px; 45 | text-underline-offset: 2px; 46 | } 47 | #fireSeqSearchDom ul li::before { 48 | content: ' '; 49 | display: inline-block; 50 | margin-right: 0.4em; 51 | line-height: 1em; 52 | width: 1em; 53 | height: 1em; 54 | transform: translateY(3px); 55 | border-radius: 3px; 56 | background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAAe1BMVEUAKzaFyMiKz88AJjIAKjaHy8sAHiuM0tIAGSgAGykAIC0AESIAFyYAIzAAHCoAFCQAEiBvq6wtVlw0XmQIMDtooqRAbnNdk5VPgYU4Y2lyr7B4trdHc3Z/wcEAHSctUFVimZoaP0hShIcoRksTLjQADBkAABNdjo8AAAzWDdSWAAABBklEQVQ4jc1S7XKDIBCEQ0DEL2oJVqOmiUl8/yfs6XTaePgA2T8w7N5xu3OMvSOESI5ejSnWM1Hqo/lUEa8a708WLyYAgG4zWv+lpQRXsdIDR2hLBGkn8RnazK4nB2+IIO9XQvZ5dsYf4FlHI4StcqiYGqeppvU4u+CogIvaXB56n53WrmBJYSprq5S6wB71ONZM5Cc/AAyuFXEUGF/aDLANg55b6hRRnjX/A6ZCExfTC4+KkBJB6uSrgOtv0iKHHc/hSr3ovUCGcs9bSQS0g7mQGSaSaTLvBNJFSRQ3+JfIfow3L5s7XJyVBR3uR5uZPG7PnsPQXedoJX4hzGNZlnt5VP7G+AHcFwwZX2F8QwAAAABJRU5ErkJggg==); 57 | background-repeat: no-repeat; 58 | background-size: 16px; 59 | } 60 | .fireSeqSearchHitSummary { 61 | font-size: 0.9em 62 | } 63 | .fireSeqSearchHitSummary::before { 64 | content: "\\00A0::\\00A0"; 65 | } 66 | .fireSeqSearchHighlight { 67 | padding: 0 4px; 68 | color: black !important; 69 | background-color: gold; 70 | border-radius: 3px; 71 | } 72 | `; 73 | 74 | function consoleLogForDebug(message) { 75 | console.log(message); //skipcq: JS-0002 76 | } 77 | 78 | 79 | function addGlobalStyle(css) { 80 | const head = document.getElementsByTagName("head")[0]; 81 | if (!head) { return; } 82 | const style = document.createElement("style"); 83 | style.id = "fireSeqSearchScriptCSS"; 84 | // style.type = "text/css"; 85 | style.innerHTML = css; 86 | head.appendChild(style); 87 | } 88 | 89 | 90 | function createElementWithText(type, text) { 91 | const element = document.createElement(type); 92 | element.textContent = text; 93 | return element; 94 | } 95 | 96 | 97 | function createHrefToLogseq(record, serverInfo) { 98 | const name = serverInfo.notebook_name; 99 | 100 | const title = record.title; 101 | const prettyTitle = title.replaceAll("%2F", "/"); 102 | 103 | const target = record.logseq_uri || `logseq://graph/${name}?page=${title}`; 104 | 105 | const logseqPageLink = document.createElement('a'); 106 | const text = document.createTextNode(prettyTitle); 107 | logseqPageLink.appendChild(text); 108 | logseqPageLink.title = prettyTitle; 109 | logseqPageLink.href = target; 110 | consoleLogForDebug(logseqPageLink); 111 | return logseqPageLink; 112 | } 113 | 114 | 115 | function checkUserOptions() { 116 | return Promise.all([ 117 | /*global browser */ 118 | browser.storage.sync.get("debugStr"), 119 | browser.storage.sync.get("ExperimentalLayout"), 120 | 
browser.storage.sync.get("ShowHighlight"), 121 | browser.storage.sync.get("ShowScore") 122 | ]).then(function(res) { 123 | consoleLogForDebug(res); 124 | 125 | const options = { 126 | debugStr: res[0].debugStr, 127 | ExperimentalLayout: res[1].ExperimentalLayout, 128 | ShowHighlight: res[2].ShowHighlight, 129 | ShowScore: res[3].ShowScore 130 | } 131 | return options; 132 | }); 133 | } 134 | 135 | 136 | function parseRawList(rawSearchResult) { 137 | const hits = []; 138 | for (const rawRecord of rawSearchResult) { 139 | const record = JSON.parse(rawRecord); 140 | hits.push(record); 141 | } 142 | return hits; 143 | } 144 | 145 | async function processLlmSummary(serverInfo, parsedSearchResult, fireDom) { 146 | 147 | const doneListApi = "http://127.0.0.1:3030/llm_done_list"; 148 | let list = await fetch(doneListApi); 149 | list = await list.text(); 150 | list = JSON.parse(list); 151 | 152 | const findByTitle = function(title) { 153 | const ul = fireDom.querySelector( ".fireSeqSearchHitList" ); 154 | if (ul === null) return null; 155 | for (const child of ul.children) { 156 | const liTitle = child.firstChild.text; 157 | if (title === liTitle) { 158 | return child; 159 | } 160 | } 161 | return null; 162 | }; 163 | const setLlmResult = function (title, llmSummary) { 164 | const targetRow = findByTitle(title); 165 | if (targetRow === null) { 166 | consoleLogForDebug("Error! Can't find dom for ", title); 167 | return; 168 | } 169 | if (targetRow.querySelector( ".fireSeqSearchLlmSummary" ) != null) { 170 | consoleLogForDebug("Skip. We have the summary for ", title); 171 | return; 172 | } 173 | 174 | const summary = createElementWithText("span", ""); 175 | summary.innerHTML = llmSummary; 176 | summary.classList.add('fireSeqSearchLlmSummary'); 177 | targetRow.appendChild(summary); 178 | }; 179 | for (const record of parsedSearchResult) { 180 | const title = record.title; 181 | if (!list.includes(title)) { 182 | consoleLogForDebug("Not ready, skip" + title); 183 | continue; 184 | } 185 | // TODO remove hard code port 186 | const llm_api = "http://127.0.0.1:3030/summarize/" + title; 187 | let sum = await fetch(llm_api); 188 | sum = await sum.text(); 189 | setLlmResult(title, sum); 190 | } 191 | } 192 | 193 | 194 | function createFireSeqDom(serverInfo, parsedSearchResult) { 195 | const count = parsedSearchResult.length; 196 | const div = document.createElement("div"); 197 | div.setAttribute("id", fireSeqSearchDomId); 198 | 199 | const createTitleBarDom = function () { 200 | const titleBar = createElementWithText("div"); 201 | titleBar.classList.add('fireSeqSearchTitleBar'); 202 | const hitCount = `We found ${count.toString()} results in your logseq notebook`; 203 | titleBar.insertAdjacentHTML("afterbegin",hitCount); 204 | 205 | function setSummaryState(cl, state) { 206 | let prop = 'none'; 207 | if (state) { prop = ''; } 208 | for (const el of document.querySelectorAll(cl)) { 209 | el.style.display=prop; 210 | } 211 | } 212 | let btn = document.createElement("button"); 213 | btn.classList.add("hideSummary"); 214 | let text = document.createTextNode("Hide Summary"); 215 | btn.appendChild(text); 216 | btn.onclick = function () { 217 | setSummaryState(".fireSeqSearchHitSummary", false); 218 | setSummaryState(".fireSeqSearchLlmSummary", false); 219 | }; 220 | titleBar.appendChild(btn); 221 | 222 | btn = document.createElement("button"); 223 | btn.classList.add("showSummary"); 224 | text = document.createTextNode("Summary"); 225 | btn.appendChild(text); 226 | btn.onclick = function () { 227 | 
setSummaryState(".fireSeqSearchHitSummary", true); 228 | setSummaryState(".fireSeqSearchLlmSummary", false); 229 | }; 230 | titleBar.appendChild(btn); 231 | 232 | btn = document.createElement("button"); 233 | btn.classList.add("showLlm"); 234 | text = document.createTextNode("LLM"); 235 | btn.appendChild(text); 236 | btn.onclick = function () { 237 | setSummaryState(".fireSeqSearchHitSummary", false); 238 | setSummaryState(".fireSeqSearchLlmSummary", true); 239 | processLlmSummary(serverInfo, parsedSearchResult, div); 240 | }; 241 | titleBar.appendChild(btn); 242 | return titleBar; 243 | }; 244 | const bar = createTitleBarDom(); 245 | div.appendChild(bar); 246 | return div; 247 | } 248 | 249 | async function appendResultToSearchResult(serverInfo, parsedSearchResult, dom) { 250 | const firefoxExtensionUserOption = await checkUserOptions(); 251 | consoleLogForDebug('Loaded user option: ' + JSON.stringify(firefoxExtensionUserOption)); 252 | 253 | function buildListItems(parsedSearchResult) { 254 | const hitList = document.createElement("ul"); 255 | hitList.classList.add('fireSeqSearchHitList'); 256 | for (const record of parsedSearchResult) { 257 | const li = createElementWithText("li", ""); 258 | li.classList.add('fireSeqSearchHitListItem'); 259 | if (firefoxExtensionUserOption.ShowScore) { 260 | const score = createElementWithText("span", String(record.score)); 261 | li.appendChild(score); 262 | } 263 | const href = createHrefToLogseq(record, serverInfo); 264 | li.appendChild(href); 265 | 266 | const summary = createElementWithText("span", ""); 267 | summary.innerHTML = record.summary; 268 | summary.classList.add('fireSeqSearchHitSummary'); 269 | li.appendChild(summary); 270 | 271 | hitList.appendChild(li); 272 | } 273 | return hitList; 274 | } 275 | const hitList = buildListItems(parsedSearchResult); 276 | dom.appendChild(hitList); 277 | 278 | if (firefoxExtensionUserOption.ExperimentalLayout) { 279 | // Inspired by https://twitter.com/rockucn 280 | // https://greasyfork.org/en/scripts/446492-%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E%E5%88%87%E6%8D%A2%E5%99%A8-search-engine-switcher/code 281 | 282 | dom.classList.add("experimentalLayout"); 283 | } 284 | 285 | function insertDivToWebpage(result) { 286 | let contextId = "rcnt"; 287 | if (window.location.host.includes("duckduckgo.com")) { 288 | contextId = "web_content_wrapper"; 289 | } 290 | if (window.location.host.includes("searx")) { // https://github.com/Endle/fireSeqSearch/issues/103 291 | contextId = "results"; 292 | } 293 | if (window.location.host.includes("metager")) { // https://github.com/Endle/fireSeqSearch/issues/127 294 | contextId = "results"; 295 | } 296 | document.getElementById(contextId).insertAdjacentElement("beforebegin", result); 297 | 298 | } 299 | 300 | insertDivToWebpage(dom); 301 | } 302 | 303 | async function mainProcess(fetchResultArray) { 304 | consoleLogForDebug("main process"); 305 | 306 | const serverInfo = fetchResultArray[0]; 307 | const rawSearchResult = fetchResultArray[1]; 308 | consoleLogForDebug(serverInfo); 309 | const parsedSearchResult = parseRawList(rawSearchResult); 310 | 311 | const fireDom = createFireSeqDom(serverInfo, parsedSearchResult); 312 | 313 | appendResultToSearchResult(serverInfo, parsedSearchResult, fireDom); 314 | 315 | } 316 | 317 | 318 | function getSearchParameterFromCurrentPage() { 319 | let searchParam = ""; 320 | 321 | function getSearchParameterOfSearx() { 322 | const inputBox = document.getElementById("q"); 323 | return inputBox.value; 324 | } 325 | function 
getSearchParameterOfMetager() { 326 | const urlParams = new URLSearchParams(window.location.search); 327 | return urlParams.get('eingabe'); 328 | } 329 | 330 | if (window.location.toString().includes("searx")) { 331 | searchParam = getSearchParameterOfSearx(); 332 | } else if (window.location.toString().includes("metager")) { 333 | searchParam = getSearchParameterOfMetager(); 334 | } else { 335 | // https://stackoverflow.com/a/901144/1166518 336 | const urlParams = new URLSearchParams(window.location.search); 337 | searchParam = urlParams.get('q'); 338 | } 339 | 340 | consoleLogForDebug(`Got search param: ${searchParam}`); 341 | return searchParam; 342 | } 343 | 344 | 345 | 346 | (function() { 347 | const searchParameter = getSearchParameterFromCurrentPage(); 348 | 349 | addGlobalStyle(fireSeqSearchScriptCSS); 350 | 351 | //https://gomakethings.com/waiting-for-multiple-all-api-responses-to-complete-with-the-vanilla-js-promise.all-method/ 352 | Promise.all([ 353 | fetch("http://127.0.0.1:3030/server_info"), 354 | fetch("http://127.0.0.1:3030/query/" + searchParameter) 355 | ]).then(function (responses) { 356 | return Promise.all(responses.map(function (response) {return response.json();})); 357 | }).then(function (data) { 358 | mainProcess(data); 359 | }).then((_e) => { 360 | const highlightedItems = document.querySelectorAll('.fireSeqSearchHighlight'); 361 | consoleLogForDebug(highlightedItems); 362 | highlightedItems.forEach((element) => { 363 | element.style.color = 'red'; 364 | }); 365 | }).catch( 366 | error => {consoleLogForDebug(error)} 367 | ); 368 | 369 | 370 | })(); 371 | -------------------------------------------------------------------------------- /fireSeqSearch_addon/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "fireSeqSearch", 4 | "version": "0.2.2", 5 | 6 | "description": "Every time you use a search engine, this plugin will search against your personal logseq notes.", 7 | 8 | "icons": { 9 | "32": "icons/notebook_logo_32.png", 10 | "64": "icons/notebook_logo_64.png" 11 | }, 12 | 13 | "options_ui": { 14 | "page": "options.html", 15 | "browser_style": true 16 | }, 17 | "content_scripts": [ 18 | { 19 | "matches": [ 20 | "*://*.bing.com/*", 21 | "*://www.google.com/search*", 22 | "*://www.google.com.hk/search*", 23 | "*://duckduckgo.com/*", 24 | "*://searx.prvcy.eu/search", 25 | "*://searx.fmac.xyz/search", 26 | "*://metager.org/*" 27 | ], 28 | "js": ["main.js"] 29 | }, 30 | { 31 | "matches": [ 32 | "*://127.0.0.1/*" 33 | ], 34 | "js": ["wordcloud_draw.js"] 35 | } 36 | ], 37 | 38 | "permissions": ["*://127.0.0.1/*", "storage"], 39 | 40 | 41 | "browser_specific_settings": { 42 | "gecko": { 43 | "id": "{293a97e7-c815-4ce2-a537-87af8818cbc0}", 44 | "strict_min_version": "99.0" 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /fireSeqSearch_addon/monkeyscript.user.js: -------------------------------------------------------------------------------- 1 | // ==UserScript== 2 | // @name fireSeqSearchScript 3 | // @namespace https://github.com/Endle/fireSeqSearch 4 | // @version 0.0.18 5 | // @description Every time you use a search engine, FireSeqSearch searches your personal logseq notes.
6 | // @author Zhenbo Li 7 | // @match https://www.google.com/search* 8 | // @match https://duckduckgo.com/?q=* 9 | // @icon https://www.google.com/s2/favicons?sz=64&domain=tampermonkey.net 10 | // @grant GM_xmlhttpRequest 11 | // ==/UserScript== 12 | 13 | // MIT License 14 | // Copyright (c) 2021-2022 Zhenbo Li 15 | 16 | /*global GM*/ 17 | 18 | const fireSeqSearchDomId = "fireSeqSearchDom"; 19 | 20 | 21 | const fireSeqSearchScriptCSS = ` 22 | #fireSeqSearchDom { 23 | margin: 1em 1em 1em 1em; 24 | color: var(--theme-col-txt-snippet); /* duckduck color*/ 25 | } 26 | #fireSeqSearchDom.experimentalLayout { 27 | position: fixed; 28 | top: 140px; 29 | right: 12px; 30 | width: 200px; 31 | background-color: hsla(200, 40%, 96%, .8); 32 | font-size: 12px; 33 | border-radius: 6px; 34 | z-index: 99999; 35 | } 36 | .fireSeqSearchTitleBar { 37 | margin: 0.5em 0; 38 | } 39 | .hideSummary { 40 | margin: 0 1em; 41 | } 42 | #fireSeqSearchDom ul { 43 | margin: 0; 44 | padding: 0.6em; 45 | border: 1px dotted gray; 46 | list-style: none; 47 | line-height: 1.5em; 48 | } 49 | #fireSeqSearchDom ul li { 50 | font-size: 15px; 51 | } 52 | #fireSeqSearchDom ul li + li { 53 | margin-top: 0.4em; 54 | } 55 | #fireSeqSearchDom ul li a { 56 | text-decoration: underline; 57 | text-decoration-style: dotted; 58 | text-decoration-thickness: 1px; 59 | text-underline-offset: 2px; 60 | } 61 | #fireSeqSearchDom ul li::before { 62 | content: ' '; 63 | display: inline-block; 64 | margin-right: 0.4em; 65 | line-height: 1em; 66 | width: 1em; 67 | height: 1em; 68 | transform: translateY(3px); 69 | border-radius: 3px; 70 | background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAAe1BMVEUAKzaFyMiKz88AJjIAKjaHy8sAHiuM0tIAGSgAGykAIC0AESIAFyYAIzAAHCoAFCQAEiBvq6wtVlw0XmQIMDtooqRAbnNdk5VPgYU4Y2lyr7B4trdHc3Z/wcEAHSctUFVimZoaP0hShIcoRksTLjQADBkAABNdjo8AAAzWDdSWAAABBklEQVQ4jc1S7XKDIBCEQ0DEL2oJVqOmiUl8/yfs6XTaePgA2T8w7N5xu3OMvSOESI5ejSnWM1Hqo/lUEa8a708WLyYAgG4zWv+lpQRXsdIDR2hLBGkn8RnazK4nB2+IIO9XQvZ5dsYf4FlHI4StcqiYGqeppvU4u+CogIvaXB56n53WrmBJYSprq5S6wB71ONZM5Cc/AAyuFXEUGF/aDLANg55b6hRRnjX/A6ZCExfTC4+KkBJB6uSrgOtv0iKHHc/hSr3ovUCGcs9bSQS0g7mQGSaSaTLvBNJFSRQ3+JfIfow3L5s7XJyVBR3uR5uZPG7PnsPQXedoJX4hzGNZlnt5VP7G+AHcFwwZX2F8QwAAAABJRU5ErkJggg==); 71 | background-repeat: no-repeat; 72 | background-size: 16px; 73 | } 74 | .fireSeqSearchHitSummary { 75 | font-size: 0.9em 76 | } 77 | .fireSeqSearchHitSummary::before { 78 | content: "\\00A0::\\00A0"; 79 | } 80 | .fireSeqSearchHighlight { 81 | padding: 0 4px; 82 | color: black !important; 83 | background-color: gold; 84 | border-radius: 3px; 85 | } 86 | `; 87 | 88 | function consoleLogForDebug(message) { 89 | console.log(message); //skipcq: JS-0002 90 | // Comment it in master branch, to make deepSource happy 91 | } 92 | 93 | 94 | function addGlobalStyle(css) { 95 | const head = document.getElementsByTagName("head")[0]; 96 | if (!head) { return; } 97 | const style = document.createElement("style"); 98 | style.id = "fireSeqSearchScriptCSS"; 99 | // style.type = "text/css"; 100 | style.innerHTML = css; 101 | head.appendChild(style); 102 | } 103 | 104 | 105 | function createElementWithText(type, text) { 106 | const element = document.createElement(type); 107 | element.textContent = text; 108 | return element; 109 | } 110 | 111 | 112 | function createHrefToLogseq(record, serverInfo) { 113 | const name = serverInfo.notebook_name; 114 | 115 | const title = record.title; 116 | const prettyTitle = title.replaceAll("%2F", "/"); 117 | 118 | const target = 
`logseq://graph/${name}?page=${title}`; 119 | const logseqPageLink = document.createElement('a'); 120 | const text = document.createTextNode(prettyTitle); 121 | logseqPageLink.appendChild(text); 122 | logseqPageLink.title = prettyTitle; 123 | logseqPageLink.href = target; 124 | consoleLogForDebug(logseqPageLink); 125 | return logseqPageLink; 126 | } 127 | 128 | 129 | function checkUserOptions() { 130 | const options = { 131 | debugStr: "tampermonkey", 132 | ExperimentalLayout: false, 133 | ShowHighlight: true, 134 | ShowScore: false 135 | } 136 | consoleLogForDebug(options); 137 | return options; 138 | 139 | } 140 | 141 | 142 | async function appendResultToSearchResult(fetchResultArray, container) { 143 | const serverInfo = fetchResultArray[0]; 144 | const rawSearchResult = fetchResultArray[1]; 145 | const firefoxExtensionUserOption = await checkUserOptions(); 146 | 147 | 148 | consoleLogForDebug(firefoxExtensionUserOption); 149 | 150 | function createTitleBarDom(count) { 151 | const titleBar = createElementWithText("div"); 152 | titleBar.classList.add('fireSeqSearchTitleBar'); 153 | const hitCount = `We found ${count.toString()} results in your logseq notebook`; 154 | titleBar.insertAdjacentHTML("afterbegin",hitCount); 155 | const btn = document.createElement("button"); 156 | btn.classList.add("hideSummary"); 157 | const text = document.createTextNode("Hide Summary (Tmp)"); 158 | btn.appendChild(text); 159 | btn.onclick = function () { 160 | // alert("Button is clicked"); 161 | for (const el of document.querySelectorAll('.fireSeqSearchHitSummary')) { 162 | // el.style.visibility = 'hidden'; 163 | el.remove(); 164 | } 165 | }; 166 | titleBar.appendChild(btn); 167 | return titleBar; 168 | } 169 | 170 | 171 | 172 | function createFireSeqDom() { 173 | 174 | const div = document.createElement("div"); 175 | // div.appendChild(createElementWithText("p", "fireSeqSearch launched!")); 176 | div.setAttribute("id", fireSeqSearchDomId); 177 | 178 | 179 | return div; 180 | } 181 | 182 | const dom = createFireSeqDom(); 183 | dom.appendChild(createTitleBarDom(rawSearchResult.length)); 184 | consoleLogForDebug(dom); 185 | 186 | const hitList = document.createElement("ul"); 187 | 188 | consoleLogForDebug(rawSearchResult); 189 | for (const rawRecord of rawSearchResult) { 190 | // const e = document.createTextNode(record); 191 | consoleLogForDebug(rawRecord); 192 | const record = JSON.parse(rawRecord); 193 | consoleLogForDebug(typeof record); 194 | 195 | const li = createElementWithText("li", ""); 196 | 197 | 198 | if (firefoxExtensionUserOption.ShowScore) { 199 | const score = createElementWithText("span", String(record.score)); 200 | li.appendChild(score); 201 | } 202 | const href = createHrefToLogseq(record, serverInfo); 203 | li.appendChild(href); 204 | li.append(' ') 205 | if (firefoxExtensionUserOption.ShowHighlight) { 206 | const summary = createElementWithText("span", ""); 207 | summary.innerHTML = record.summary; 208 | summary.classList.add('fireSeqSearchHitSummary'); 209 | li.appendChild(summary); 210 | } 211 | // let e = wrapRawRecordIntoElement(record, serverInfo); 212 | 213 | // e.style. 
214 |         hitList.appendChild(li);
215 |         // consoleLogForDebug("Added an element to the list");
216 |     }
217 |     dom.appendChild(hitList);
218 | 
219 |     if (firefoxExtensionUserOption.ExperimentalLayout) {
220 |         // Inspired by https://twitter.com/rockucn
221 |         // https://greasyfork.org/en/scripts/446492-%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E%E5%88%87%E6%8D%A2%E5%99%A8-search-engine-switcher/code
222 | 
223 |         dom.classList.add("experimentalLayout");
224 |     }
225 |     let contextId = "rcnt";
226 |     if (window.location.href.includes("duckduckgo.com")) {
227 |         contextId = "web_content_wrapper";
228 |     }
229 |     document.getElementById(contextId).insertAdjacentElement("beforebegin", dom);
230 | 
231 | }
232 | 
233 | function getSearchParameterFromCurrentPage() {
234 |     let searchParam = "";
235 | 
236 |     function getSearchParameterOfSearx() {
237 |         const inputBox = document.getElementById("q");
238 |         return inputBox.value;
239 |     }
240 | 
241 |     if (window.location.toString().includes("searx")) {
242 |         searchParam = getSearchParameterOfSearx();
243 |     } else {
244 |         // https://stackoverflow.com/a/901144/1166518
245 |         const urlParams = new URLSearchParams(window.location.search);
246 |         // consoleLogForDebug(urlParams);
247 |         searchParam = urlParams.get('q');
248 |     }
249 | 
250 |     consoleLogForDebug(`Got search param: ${searchParam}`);
251 |     return searchParam;
252 | }
253 | 
254 | 
255 | 
256 | (function() {
257 |     const searchParameter = getSearchParameterFromCurrentPage();
258 | 
259 |     consoleLogForDebug(searchParameter);
260 |     addGlobalStyle(fireSeqSearchScriptCSS);
261 | 
262 |     GM.xmlHttpRequest({
263 |         method: "GET",
264 |         url: "http://127.0.0.1:3030/server_info",
265 |         onload(infoResponse) {
266 |             const server_info = JSON.parse(infoResponse.responseText);
267 |             consoleLogForDebug(server_info);
268 |             GM.xmlHttpRequest({
269 |                 method: "GET",
270 |                 url: `http://127.0.0.1:3030/query/${searchParameter}`,
271 |                 onload(queryResponse) {
272 |                     const hit = JSON.parse(queryResponse.responseText);
273 |                     // consoleLogForDebug(hit);
274 |                     consoleLogForDebug(typeof hit);
275 | 
276 |                     appendResultToSearchResult([server_info, hit])
277 |                         .then((_e) => {
278 |                             const highlightedItems = document.querySelectorAll('.fireSeqSearchHighlight');
279 |                             consoleLogForDebug(highlightedItems);
280 |                         })
281 |                         .catch(error => {
282 |                             consoleLogForDebug(error);
283 |                         });
284 | 
285 |                 }
286 |             });
287 |         }
288 |     });
289 | 
290 | /*
291 | //https://gomakethings.com/waiting-for-multiple-all-api-responses-to-complete-with-the-vanilla-js-promise.all-method/
292 | Promise.all([
293 |     fetch("http://127.0.0.1:3030/server_info"),
294 |     fetch(`http://127.0.0.1:3030/query/${searchParameter}`)
295 | ]).then(function (responses) {
296 |     return Promise.all(responses.map(function (response) {return response.json();}));
297 | }).then(function (data) {
298 |     consoleLogForDebug(data);
299 |     return appendResultToSearchResult(data);
300 | }).then((_e) => {
301 |     const highlightedItems = document.querySelectorAll('.fireSeqSearchHighlight');
302 |     consoleLogForDebug(highlightedItems);
303 |     highlightedItems.forEach((element) => {
304 |         element.style.color = 'red';
305 |     });
306 | }).catch(function (error) {
307 |     consoleLogForDebug(error);
308 | });
309 | 
310 | 
311 | 
312 | */
313 | 
314 | 
315 | 
316 | })();
317 | 
-------------------------------------------------------------------------------- /fireSeqSearch_addon/options.html: --------------------------------------------------------------------------------
1 | <html>
2 | 
3 | <head>
4 |     <meta charset="utf-8"/>
5 | </head>
6 | 
7 | <body>
8 | <form>
9 |     <br/>
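    <!-- Options page for the fireSeqSearch add-on. Each control id below
         matches the browser.storage.sync key read and written by options.js. -->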
10 |     <label for="debugStr">Debug string</label>
11 |     <input type="text" id="debugStr"/>
12 |     <br/>
13 | 
14 |     <label for="ExperimentalLayout">Use experimental floating layout</label>
15 |     <input type="checkbox" id="ExperimentalLayout"/>
16 | 
17 |     <br/>
18 | 
19 |     <label for="ShowHighlight">Show highlighted summary</label>
20 |     <input type="checkbox" id="ShowHighlight"/>
21 | 
22 |     <br/>
23 | 
24 |     <label for="ShowScore">Show hit score</label>
25 |     <input type="checkbox" id="ShowScore"/>
26 | 
27 |     <br/>
28 | 
29 |     <button type="submit">Save</button>
30 |     <br/>
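    <!-- saveOptions in options.js persists these values on submit;
         restoreOptions re-populates them on DOMContentLoaded. -->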
31 | </form>
32 | 
33 | </body>
34 | </html>
-------------------------------------------------------------------------------- /fireSeqSearch_addon/options.js: --------------------------------------------------------------------------------
1 | 
2 | function saveOptions(e) {
3 |     e.preventDefault();
4 |     const ex = document.querySelector("#ExperimentalLayout").checked;
5 | 
6 |     browser.storage.sync.set({
7 |         debugStr: document.querySelector("#debugStr").value,
8 |         ExperimentalLayout: ex,
9 |         ShowScore: document.querySelector("#ShowScore").checked,
10 |         ShowHighlight: document.querySelector("#ShowHighlight").checked
11 |     });
12 | }
13 | 
14 | function restoreOptions() {
15 |     document.querySelector("#debugStr").value = 'Default red';
16 | 
17 |     /*global browser */
18 |     let gettingItem = browser.storage.sync.get('debugStr');
19 |     gettingItem.then((res) => {
20 |         document.querySelector("#debugStr").value = res.debugStr || 'Not Found';
21 |     });
22 | 
23 |     let ex = browser.storage.sync.get('ExperimentalLayout');
24 |     ex.then((res) => {
25 |         if (res.ExperimentalLayout) {
26 |             document.querySelector("#ExperimentalLayout").checked = true;
27 |         }
28 |     });
29 | 
30 |     browser.storage.sync.get('ShowHighlight')
31 |         .then((res) => {
32 |             if (res.ShowHighlight) {
33 |                 document.querySelector("#ShowHighlight").checked = true;
34 |             }
35 |         });
36 |     browser.storage.sync.get('ShowScore')
37 |         .then((res) => {
38 |             if (res.ShowScore) {
39 |                 document.querySelector("#ShowScore").checked = true;
40 |             }
41 |         });
42 | }
43 | 
44 | document.addEventListener('DOMContentLoaded', restoreOptions);
45 | document.querySelector("form").addEventListener("submit", saveOptions);
-------------------------------------------------------------------------------- /fireSeqSearch_addon/violentmonkeyscript.user.js: --------------------------------------------------------------------------------
1 | // ==UserScript==
2 | // @name        fireSeqSearchScript
3 | // @namespace   https://github.com/Endle/fireSeqSearch
4 | // @version     0.1.4
5 | // @description Every time you use the search engine, FireSeqSearch searches your personal logseq notes.
6 | // @author Zhenbo Li 7 | // @match https://www.google.com/search* 8 | // @match https://duckduckgo.com/* 9 | // @match https://metager.org/* 10 | // @icon https://www.google.com/s2/favicons?sz=64&domain=tampermonkey.net 11 | // @grant GM.xmlHttpRequest 12 | // ==/UserScript== 13 | 14 | // MIT License 15 | // Copyright (c) 2021-2023 Zhenbo Li 16 | 17 | /*global GM*/ 18 | 19 | const fireSeqSearchDomId = "fireSeqSearchDom"; 20 | 21 | 22 | const fireSeqSearchScriptCSS = ` 23 | #fireSeqSearchDom { 24 | margin: 1em 1em 1em 1em; 25 | color: var(--theme-col-txt-snippet); /* duckduck color*/ 26 | } 27 | #fireSeqSearchDom.experimentalLayout { 28 | position: fixed; 29 | top: 140px; 30 | right: 12px; 31 | width: 200px; 32 | background-color: hsla(200, 40%, 96%, .8); 33 | font-size: 12px; 34 | border-radius: 6px; 35 | z-index: 99999; 36 | } 37 | .fireSeqSearchTitleBar { 38 | margin: 0.5em 0; 39 | } 40 | .hideSummary { 41 | margin: 0 1em; 42 | } 43 | #fireSeqSearchDom ul { 44 | margin: 0; 45 | padding: 0.6em; 46 | border: 1px dotted gray; 47 | list-style: none; 48 | line-height: 1.5em; 49 | } 50 | #fireSeqSearchDom ul li { 51 | font-size: 15px; 52 | } 53 | #fireSeqSearchDom ul li + li { 54 | margin-top: 0.4em; 55 | } 56 | #fireSeqSearchDom ul li a { 57 | text-decoration: underline; 58 | text-decoration-style: dotted; 59 | text-decoration-thickness: 1px; 60 | text-underline-offset: 2px; 61 | } 62 | #fireSeqSearchDom ul li::before { 63 | content: ' '; 64 | display: inline-block; 65 | margin-right: 0.4em; 66 | line-height: 1em; 67 | width: 1em; 68 | height: 1em; 69 | transform: translateY(3px); 70 | border-radius: 3px; 71 | background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAAe1BMVEUAKzaFyMiKz88AJjIAKjaHy8sAHiuM0tIAGSgAGykAIC0AESIAFyYAIzAAHCoAFCQAEiBvq6wtVlw0XmQIMDtooqRAbnNdk5VPgYU4Y2lyr7B4trdHc3Z/wcEAHSctUFVimZoaP0hShIcoRksTLjQADBkAABNdjo8AAAzWDdSWAAABBklEQVQ4jc1S7XKDIBCEQ0DEL2oJVqOmiUl8/yfs6XTaePgA2T8w7N5xu3OMvSOESI5ejSnWM1Hqo/lUEa8a708WLyYAgG4zWv+lpQRXsdIDR2hLBGkn8RnazK4nB2+IIO9XQvZ5dsYf4FlHI4StcqiYGqeppvU4u+CogIvaXB56n53WrmBJYSprq5S6wB71ONZM5Cc/AAyuFXEUGF/aDLANg55b6hRRnjX/A6ZCExfTC4+KkBJB6uSrgOtv0iKHHc/hSr3ovUCGcs9bSQS0g7mQGSaSaTLvBNJFSRQ3+JfIfow3L5s7XJyVBR3uR5uZPG7PnsPQXedoJX4hzGNZlnt5VP7G+AHcFwwZX2F8QwAAAABJRU5ErkJggg==); 72 | background-repeat: no-repeat; 73 | background-size: 16px; 74 | } 75 | .fireSeqSearchHitSummary { 76 | font-size: 0.9em 77 | } 78 | .fireSeqSearchHitSummary::before { 79 | content: "\\00A0::\\00A0"; 80 | } 81 | .fireSeqSearchHighlight { 82 | padding: 0 4px; 83 | color: black !important; 84 | background-color: gold; 85 | border-radius: 3px; 86 | } 87 | `; 88 | 89 | function consoleLogForDebug(message) { 90 | console.log(message); //skipcq: JS-0002 91 | } 92 | 93 | 94 | function addGlobalStyle(css) { 95 | const head = document.getElementsByTagName("head")[0]; 96 | if (!head) { return; } 97 | const style = document.createElement("style"); 98 | style.id = "fireSeqSearchScriptCSS"; 99 | // style.type = "text/css"; 100 | style.innerHTML = css; 101 | head.appendChild(style); 102 | } 103 | 104 | 105 | function createElementWithText(type, text) { 106 | const element = document.createElement(type); 107 | element.textContent = text; 108 | return element; 109 | } 110 | 111 | 112 | function createHrefToLogseq(record, serverInfo) { 113 | const name = serverInfo.notebook_name; 114 | 115 | const title = record.title; 116 | const prettyTitle = title.replaceAll("%2F", "/"); 117 | 118 | const target = record.logseq_uri || 
`logseq://graph/${name}?page=${title}`; 119 | 120 | const logseqPageLink = document.createElement('a'); 121 | const text = document.createTextNode(prettyTitle); 122 | logseqPageLink.appendChild(text); 123 | logseqPageLink.title = prettyTitle; 124 | logseqPageLink.href = target; 125 | consoleLogForDebug(logseqPageLink); 126 | return logseqPageLink; 127 | } 128 | 129 | 130 | function checkUserOptions() { 131 | const options = { 132 | debugStr: "tampermonkey", 133 | ExperimentalLayout: false, 134 | ShowHighlight: true, 135 | ShowScore: false 136 | } 137 | consoleLogForDebug(options); 138 | return options; 139 | 140 | } 141 | 142 | 143 | async function appendResultToSearchResult(fetchResultArray, _container) { 144 | const serverInfo = fetchResultArray[0]; 145 | const rawSearchResult = fetchResultArray[1]; 146 | const firefoxExtensionUserOption = await checkUserOptions(); 147 | 148 | consoleLogForDebug('Loaded user option: ' + JSON.stringify(firefoxExtensionUserOption)); 149 | 150 | function createTitleBarDom(count) { 151 | const titleBar = createElementWithText("div"); 152 | titleBar.classList.add('fireSeqSearchTitleBar'); 153 | const hitCount = `We found ${count.toString()} results in your logseq notebook`; 154 | titleBar.insertAdjacentHTML("afterbegin",hitCount); 155 | const btn = document.createElement("button"); 156 | btn.classList.add("hideSummary"); 157 | const text = document.createTextNode("Hide Summary (Tmp)"); 158 | btn.appendChild(text); 159 | btn.onclick = function () { 160 | // alert("Button is clicked"); 161 | for (const el of document.querySelectorAll('.fireSeqSearchHitSummary')) { 162 | // el.style.visibility = 'hidden'; 163 | el.remove(); 164 | } 165 | }; 166 | titleBar.appendChild(btn); 167 | return titleBar; 168 | } 169 | function createFireSeqDom() { 170 | const div = document.createElement("div"); 171 | div.setAttribute("id", fireSeqSearchDomId); 172 | return div; 173 | } 174 | 175 | const dom = createFireSeqDom(); 176 | dom.appendChild(createTitleBarDom(rawSearchResult.length)); 177 | consoleLogForDebug(dom); 178 | 179 | const hitList = document.createElement("ul"); 180 | 181 | consoleLogForDebug(rawSearchResult); 182 | for (const rawRecord of rawSearchResult) { 183 | // const e = document.createTextNode(record); 184 | consoleLogForDebug(rawRecord); 185 | const record = JSON.parse(rawRecord); 186 | consoleLogForDebug(typeof record); 187 | 188 | const li = createElementWithText("li", ""); 189 | 190 | 191 | if (firefoxExtensionUserOption.ShowScore) { 192 | const score = createElementWithText("span", String(record.score)); 193 | li.appendChild(score); 194 | } 195 | const href = createHrefToLogseq(record, serverInfo); 196 | li.appendChild(href); 197 | li.append(' ') 198 | if (firefoxExtensionUserOption.ShowHighlight) { 199 | const summary = createElementWithText("span", ""); 200 | summary.innerHTML = record.summary; 201 | summary.classList.add('fireSeqSearchHitSummary'); 202 | li.appendChild(summary); 203 | } 204 | // let e = wrapRawRecordIntoElement(record, serverInfo); 205 | 206 | // e.style. 
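        // Note: record.summary is HTML produced by the local
        // fire_seq_search_server (it carries the
        // <span class="fireSeqSearchHighlight"> markup), which is why it is
        // assigned through innerHTML rather than textContent.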
207 | hitList.appendChild(li); 208 | // consoleLogForDebug("Added an element to the list"); 209 | } 210 | dom.appendChild(hitList); 211 | 212 | if (firefoxExtensionUserOption.ExperimentalLayout) { 213 | // Inspired by https://twitter.com/rockucn 214 | // https://greasyfork.org/en/scripts/446492-%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E%E5%88%87%E6%8D%A2%E5%99%A8-search-engine-switcher/code 215 | 216 | dom.classList.add("experimentalLayout"); 217 | } 218 | 219 | function insertDivToWebpage(result) { 220 | let contextId = "rcnt"; 221 | if (window.location.host.includes("duckduckgo.com")) { 222 | contextId = "web_content_wrapper"; 223 | } 224 | if (window.location.host.includes("searx")) { // https://github.com/Endle/fireSeqSearch/issues/103 225 | contextId = "results"; 226 | } 227 | if (window.location.host.includes("metager")) { // https://github.com/Endle/fireSeqSearch/issues/127 228 | contextId = "results"; 229 | } 230 | document.getElementById(contextId).insertAdjacentElement("beforebegin", result); 231 | 232 | } 233 | 234 | insertDivToWebpage(dom); 235 | } 236 | 237 | function getSearchParameterFromCurrentPage() { 238 | let searchParam = ""; 239 | 240 | function getSearchParameterOfSearx() { 241 | const inputBox = document.getElementById("q"); 242 | return inputBox.value; 243 | } 244 | function getSearchParameterOfMetager() { 245 | const urlParams = new URLSearchParams(window.location.search); 246 | return urlParams.get('eingabe'); 247 | } 248 | 249 | if (window.location.toString().includes("searx")) { 250 | searchParam = getSearchParameterOfSearx(); 251 | } else if (window.location.toString().includes("metager")) { 252 | searchParam = getSearchParameterOfMetager(); 253 | } else { 254 | // https://stackoverflow.com/a/901144/1166518 255 | const urlParams = new URLSearchParams(window.location.search); 256 | searchParam = urlParams.get('q'); 257 | } 258 | 259 | consoleLogForDebug(`Got search param: ${searchParam}`); 260 | return searchParam; 261 | } 262 | 263 | 264 | 265 | (function() { 266 | const searchParameter = getSearchParameterFromCurrentPage(); 267 | 268 | 269 | addGlobalStyle(fireSeqSearchScriptCSS); 270 | 271 | GM.xmlHttpRequest({ 272 | method: "GET", 273 | url: "http://127.0.0.1:3030/server_info", 274 | onload(infoResponse) { 275 | const server_info = JSON.parse(infoResponse.responseText); 276 | consoleLogForDebug(server_info); 277 | GM.xmlHttpRequest({ 278 | method: "GET", 279 | url: `http://127.0.0.1:3030/query/${searchParameter}`, 280 | onload(queryResponse) { 281 | const hit = JSON.parse(queryResponse.responseText); 282 | // consoleLogForDebug(hit); 283 | consoleLogForDebug(typeof hit); 284 | 285 | appendResultToSearchResult([server_info, hit]) 286 | .then((_e) => { 287 | const highlightedItems = document.querySelectorAll('.fireSeqSearchHighlight'); 288 | consoleLogForDebug(highlightedItems); 289 | }) 290 | .catch(error => { 291 | consoleLogForDebug(error); 292 | }); 293 | 294 | } 295 | }); 296 | } 297 | }); 298 | 299 | /* 300 | //https://gomakethings.com/waiting-for-multiple-all-api-responses-to-complete-with-the-vanilla-js-promise.all-method/ 301 | Promise.all([ 302 | fetch("http://127.0.0.1:3030/server_info"), 303 | fetch(`http://127.0.0.1:3030/query/${searchParameter}`) 304 | ]).then(function (responses) { 305 | return Promise.all(responses.map(function (response) {return response.json();})); 306 | }).then(function (data) { 307 | consoleLogForDebug(data); 308 | return appendResultToSearchResult(data); 309 | }).then((_e) => { 310 | const highlightedItems = 
document.querySelectorAll('.fireSeqSearchHighlight');
311 |     consoleLogForDebug(highlightedItems);
312 |     highlightedItems.forEach((element) => {
313 |         element.style.color = 'red';
314 |     });
315 | }).catch(function (error) {
316 |     consoleLogForDebug(error);
317 | });
318 | 
319 | 
320 | 
321 | */
322 | 
323 | 
324 | 
325 | })();
326 | 
-------------------------------------------------------------------------------- /fire_seq_search_server/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "fire_seq_search_server"
3 | version = "0.2.1"
4 | edition = "2021"
5 | license = "MIT"
6 | 
7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
8 | 
9 | 
10 | [dependencies]
11 | 
12 | tokio = { version = "1", features = ["full"] }
13 | 
14 | # Http server
15 | axum = "0.7.5"
16 | serde_json = "1.0"
17 | 
18 | # Serde
19 | # https://serde.rs/derive.html
20 | # https://stackoverflow.com/a/49313680/1166518
21 | serde = { version = "1.0", features = ["derive", "rc"] }
22 | url = "2.3.1"
23 | 
24 | # QueryEngine
25 | tantivy = "0.22"
26 | tantivy-tokenizer-api = "0.3.0"
27 | jieba-rs = { version = "0.7.0" }
28 | 
29 | 
30 | log = "0.4.22"
31 | env_logger = "0.11.5"
32 | 
33 | # Rust
34 | clap = { version = "4.0", features = ["derive"] }
35 | lazy_static = "1.4.0"
36 | rayon = "1.5"
37 | futures = "0.3"
38 | ctrlc = "3.4"
39 | sysinfo = "0.34.2"
40 | kill_tree = "0.2.4"
41 | urlencoding = "2.1.0"
42 | 
43 | 
44 | # Language Processing
45 | stopwords = "0.1.1"
46 | stop-words = "0.7.2"
47 | 
48 | regex = "1"
49 | lingua = { version = "1.4.0", default-features = false, features = ["chinese", "english"] }
50 | shellexpand = "3.1"
51 | 
52 | #Highlight (Output)
53 | html-escape = "0.2.13"
54 | 
55 | # Parsing
56 | pulldown-cmark = { version = "0.9.2", default-features = false }
57 | # Error
58 | #at /rustc/897e37553bba8b42751c67658967889d11ecd120\library\core\src/option.rs:775:21
59 | #4: pdf_extract::show_text
60 | #at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16
61 | #pdf-extract = "0.6.4"
62 | pdf-extract-temporary-mitigation-panic = "0.7.1"
63 | 
64 | 
65 | 
66 | # TODO: currently, turning these off makes `cargo build` fail.
67 | # These deps should become optional, so users who don't want LLM support get a smaller binary.
68 | # A sketch of how that could look: mark each dep with `optional = true`, then
69 | # gate them via `llm = ["dep:sha256", "dep:reqwest", "dep:serde_derive"]`.
70 | sha256 = { version = "1.5.0", optional = true }
71 | reqwest = { version = "0.12", features = ["json"], optional = false }
72 | serde_derive = { version = "1.0.209", optional = false}
73 | 
74 | [features]
75 | #default = ["llm"]
76 | llm = ["sha256",
77 | #"serde_derive",
78 | #"request"
79 | ]
-------------------------------------------------------------------------------- /fire_seq_search_server/debug_server.sh: --------------------------------------------------------------------------------
1 | set -e
2 | rm -f ./fire_seq_search_server
3 | # nix-shell -p cargo -p rustc -p libiconv --run "cargo build"
4 | cargo build --features llm
5 | cp target/debug/fire_seq_search_server ./fire_seq_search_server
6 | 
7 | export RUST_LOG="warn,fire_seq_search_server=info"
8 | #export RUST_LOG="debug"
9 | export RUST_BACKTRACE=1
10 | #RAYON_NUM_THREADS=1
11 | ./fire_seq_search_server --notebook_path ~/logseq --enable-journal-query
-------------------------------------------------------------------------------- /fire_seq_search_server/debug_server_mac.sh: --------------------------------------------------------------------------------
1 | set -e
2 | rm -f
./fire_seq_search_server 3 | #nix-shell -p cargo -p rustc -p libiconv --run "cargo build" 4 | cargo build --features llm 5 | cp target/debug/fire_seq_search_server ./fire_seq_search_server 6 | 7 | export RUST_LOG="warn,fire_seq_search_server=info" 8 | #export RUST_LOG="debug" 9 | export RUST_BACKTRACE=1 10 | ./fire_seq_search_server --notebook_path ~/logseq 11 | #--enable-journal-query 12 | -------------------------------------------------------------------------------- /fire_seq_search_server/deny.toml: -------------------------------------------------------------------------------- 1 | [graph] 2 | targets = [ 3 | ] 4 | all-features = false 5 | no-default-features = false 6 | 7 | [output] 8 | feature-depth = 1 9 | 10 | [advisories] 11 | # Not finished 12 | ignore = [ 13 | { id = "RUSTSEC-2020-0056", reason = "pdf extract" }, 14 | { id = "RUSTSEC-2021-0153", reason = "pdf" }, 15 | ] 16 | 17 | 18 | [licenses] 19 | # List of explicitly allowed licenses 20 | # See https://spdx.org/licenses/ for list of possible licenses 21 | # [possible values: any SPDX 3.11 short identifier (+ optional exception)]. 22 | allow = [ 23 | "MIT", "Apache-2.0", 24 | "Zlib", 25 | "BSD-2-Clause", "BSD-3-Clause", 26 | "CC0-1.0", 27 | "MPL-2.0", 28 | "Unicode-3.0", 29 | ] 30 | # The confidence threshold for detecting a license from license text. 31 | # The higher the value, the more closely the license text must be to the 32 | # canonical license text of a valid SPDX license file. 33 | # [possible values: any between 0.0 and 1.0]. 34 | confidence-threshold = 0.8 35 | # Allow 1 or more licenses on a per-crate basis, so that particular licenses 36 | # aren't accepted for every possible crate as with the normal allow list 37 | exceptions = [ 38 | { name = "fastdivide", allow = ["zlib-acknowledgement"] }, 39 | ] 40 | 41 | # This section is considered when running `cargo deny check bans`. 42 | # More documentation about the 'bans' section can be found here: 43 | # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html 44 | [bans] 45 | # Lint level for when multiple versions of the same crate are detected 46 | multiple-versions = "warn" 47 | # Lint level for when a crate version requirement is `*` 48 | wildcards = "allow" 49 | # The graph highlighting used when creating dotgraphs for crates 50 | # with multiple versions 51 | # * lowest-version - The path to the lowest versioned duplicate is highlighted 52 | # * simplest-path - The path to the version with the fewest edges is highlighted 53 | # * all - Both lowest-version and simplest-path are used 54 | highlight = "all" 55 | # The default lint level for `default` features for crates that are members of 56 | # the workspace that is being checked. This can be overridden by allowing/denying 57 | # `default` on a crate-by-crate basis if desired. 58 | workspace-default-features = "allow" 59 | # The default lint level for `default` features for external crates that are not 60 | # members of the workspace. This can be overridden by allowing/denying `default` 61 | # on a crate-by-crate basis if desired. 62 | external-default-features = "allow" 63 | # List of crates that are allowed. Use with care! 
64 | allow = [ 65 | #"ansi_term@0.11.0", 66 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" }, 67 | ] 68 | # List of crates to deny 69 | deny = [ 70 | #"ansi_term@0.11.0", 71 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" }, 72 | # Wrapper crates can optionally be specified to allow the crate when it 73 | # is a direct dependency of the otherwise banned crate 74 | #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] }, 75 | ] 76 | 77 | # List of features to allow/deny 78 | # Each entry the name of a crate and a version range. If version is 79 | # not specified, all versions will be matched. 80 | #[[bans.features]] 81 | #crate = "reqwest" 82 | # Features to not allow 83 | #deny = ["json"] 84 | # Features to allow 85 | #allow = [ 86 | # "rustls", 87 | # "__rustls", 88 | # "__tls", 89 | # "hyper-rustls", 90 | # "rustls", 91 | # "rustls-pemfile", 92 | # "rustls-tls-webpki-roots", 93 | # "tokio-rustls", 94 | # "webpki-roots", 95 | #] 96 | # If true, the allowed features must exactly match the enabled feature set. If 97 | # this is set there is no point setting `deny` 98 | #exact = true 99 | 100 | # Certain crates/versions that will be skipped when doing duplicate detection. 101 | skip = [ 102 | #"ansi_term@0.11.0", 103 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" }, 104 | ] 105 | # Similarly to `skip` allows you to skip certain crates during duplicate 106 | # detection. Unlike skip, it also includes the entire tree of transitive 107 | # dependencies starting at the specified crate, up to a certain depth, which is 108 | # by default infinite. 109 | skip-tree = [ 110 | #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies 111 | #{ crate = "ansi_term@0.11.0", depth = 20 }, 112 | ] 113 | 114 | # This section is considered when running `cargo deny check sources`. 115 | # More documentation about the 'sources' section can be found here: 116 | # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html 117 | [sources] 118 | # Lint level for what to happen when a crate from a crate registry that is not 119 | # in the allow list is encountered 120 | unknown-registry = "warn" 121 | # Lint level for what to happen when a crate from a git repository that is not 122 | # in the allow list is encountered 123 | unknown-git = "warn" 124 | # List of URLs for allowed crate registries. Defaults to the crates.io index 125 | # if not specified. If it is specified but empty, no registries are allowed. 
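# (For this project this means crates may come only from crates.io; a git
# dependency would trigger the unknown-git warning configured above.)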
126 | allow-registry = ["https://github.com/rust-lang/crates.io-index"] 127 | # List of URLs for allowed Git repositories 128 | allow-git = [] 129 | 130 | -------------------------------------------------------------------------------- /fire_seq_search_server/obsidian.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | cargo build --features llm 3 | rm ./fire_seq_search_server -f 4 | cp --force target/debug/fire_seq_search_server ./fire_seq_search_server 5 | 6 | NOTEBOOK_NAME=AstroWiki_2.0-main 7 | 8 | RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ 9 | --notebook_path ~/Documents/$NOTEBOOK_NAME \ 10 | --obsidian-md 11 | -------------------------------------------------------------------------------- /fire_seq_search_server/run_server.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | # --release remove this parameter to save compile time 3 | cargo build 4 | rm -f ./fire_seq_search_server 5 | # Still use the debug version 6 | cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server 7 | RUST_LOG=warn ./fire_seq_search_server --notebook_path /c/Users/z2369li/Nextcloud/logseq_notebook 8 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/http_client/endpoints.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use log::{debug}; 3 | 4 | use crate::query_engine::{QueryEngine, ServerInformation}; 5 | use axum::Json; 6 | use axum::extract::State; 7 | use axum::{response::Html, extract::Path}; 8 | 9 | pub async fn get_server_info(State(engine_arc): State>) 10 | -> Json { 11 | axum::Json( engine_arc.server_info.to_owned() ) 12 | } 13 | 14 | pub async fn query( 15 | Path(term) : Path, 16 | State(engine_arc): State> 17 | ) -> Html{ 18 | 19 | debug!("Original Search term {}", term); 20 | let r = engine_arc.query_pipeline(term); 21 | Html(r.await) 22 | } 23 | 24 | pub async fn summarize( 25 | Path(title) : Path, 26 | State(engine_arc): State> 27 | ) -> Html{ 28 | 29 | let r = engine_arc.summarize(title); 30 | Html(r.await) 31 | } 32 | 33 | pub async fn get_llm_done_list( 34 | State(engine_arc): State> 35 | ) -> Html{ 36 | let r = engine_arc.get_llm_done_list(); 37 | Html(r.await) 38 | } 39 | 40 | pub async fn generate_word_cloud(State(engine_arc): State>) 41 | -> Html { 42 | let div_id = "fireSeqSearchWordcloudRawJson"; 43 | let json = engine_arc.generate_wordcloud(); 44 | 45 | let div = format!("
{}
", div_id, json); 46 | Html(div) 47 | } 48 | 49 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/http_client/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod endpoints; -------------------------------------------------------------------------------- /fire_seq_search_server/src/language_tools/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod tokenizer; 2 | mod cn_stopwords; 3 | 4 | use std::collections::HashSet; 5 | use lingua::{Language, LanguageDetector, LanguageDetectorBuilder}; 6 | use lingua::Language::{Chinese, English}; 7 | 8 | pub fn is_chinese(sentence: &str) -> bool { 9 | lazy_static! { 10 | static ref LANGS: Vec = vec![Chinese, English]; 11 | // let mut languages = Vec::with_capacity(); 12 | // languages.push(Chinese); 13 | static ref DETECTOR: LanguageDetector = LanguageDetectorBuilder:: 14 | from_languages(&LANGS).build(); 15 | } 16 | let detected_language: Option = DETECTOR.detect_language_of(sentence); 17 | match detected_language { 18 | Some(x) => x == Chinese, 19 | None => false 20 | } 21 | } 22 | 23 | 24 | 25 | /// ``` 26 | /// let l = fire_seq_search_server::language_tools::generate_stopwords_list(); 27 | /// assert!(l.contains("the")); 28 | /// assert!(!l.contains("thex")); 29 | /// ``` 30 | pub fn generate_stopwords_list() -> HashSet { 31 | use stopwords::Stopwords; 32 | let mut nltk: std::collections::HashSet<&str> = stopwords::NLTK::stopwords(stopwords::Language::English).unwrap().iter().cloned().collect(); 33 | nltk.insert("span"); 34 | nltk.insert("class"); 35 | nltk.insert("fireSeqSearchHighlight"); 36 | 37 | nltk.insert("theorem"); 38 | nltk.insert("-"); 39 | 40 | nltk.insert("view"); 41 | 42 | 43 | let mut nltk: HashSet = nltk.iter().map(|&s|s.into()).collect(); 44 | 45 | for c in 'a'..='z' { 46 | nltk.insert(String::from(c)); 47 | } 48 | // To Improve: I should be aware about the upper/lower case for terms. 
49 |     for c in 'A'..='Z' {
50 |         nltk.insert(String::from(c));
51 |     }
52 | 
53 |     for c in '0'..='9' {
54 |         nltk.insert(String::from(c));
55 |     }
56 | 
57 | 
58 |     let words = stop_words::get(stop_words::LANGUAGE::English);
59 |     for w in words {
60 |         nltk.insert(w);
61 |     }
62 |     let words = stop_words::get(stop_words::LANGUAGE::Chinese);
63 |     for w in words {
64 |         nltk.insert(w);
65 |     }
66 |     for c in ['的', '有'] {
67 |         nltk.insert(String::from(c));
68 |     }
69 | 
70 |     for s in crate::language_tools::cn_stopwords::cn_stopwords_list() {
71 |         nltk.insert(String::from(s));
72 |     }
73 |     for s in crate::language_tools::cn_stopwords::cn_hit_stopword_list() {
74 |         nltk.insert(String::from(s));
75 |     }
76 | 
77 |     nltk
78 | }
79 | 
80 | 
81 | #[cfg(test)]
82 | mod test_language_detect {
83 |     #[test]
84 |     fn zh() {
85 |         use crate::language_tools::is_chinese;
86 |         assert!(is_chinese("李华"));
87 |         assert!(!is_chinese("rust"));
88 |         assert!(!is_chinese("Это статья ."));
89 |     }
90 | }
91 | // assert_eq!(detected_language, Some(English));
-------------------------------------------------------------------------------- /fire_seq_search_server/src/language_tools/tokenizer.rs: --------------------------------------------------------------------------------
1 | use std::collections::HashSet;
2 | use log::{debug, info};
3 | 
4 | /// ```
5 | /// let l = fire_seq_search_server::language_tools::generate_stopwords_list();
6 | /// assert!(l.contains("the"));
7 | /// assert!(!l.contains("thex"));
8 | ///
9 | /// let terms = vec![String::from("the"), String::from("The"), String::from("answer")];
10 | /// let result = fire_seq_search_server::language_tools::tokenizer::filter_out_stopwords(&terms, &l);
11 | /// assert_eq!(result.len(), 1);
12 | /// ```
13 | pub fn filter_out_stopwords<'a,'b>(term_tokens: &'a [String], nltk: &'b HashSet<String>) -> Vec<&'a str> {
14 |     let term_ref: Vec<&str> = term_tokens.iter().map(|s| &**s).collect();
15 |     let terms_selected: Vec<&str> = term_ref.into_iter()
16 |         .filter(|&s| ! (s.trim().is_empty() ) )
17 |         .filter(|&s| !nltk.contains(&(&s).to_lowercase() ) )
18 |         .collect();
19 |     terms_selected
20 | }
21 | 
22 | 
23 | 
24 | pub fn tokenize(sentence: &str) -> Vec<String> {
25 |     /*
26 |     lazy_static! {
27 |         static ref TK: crate::JiebaTokenizer = crate::JiebaTokenizer {};
28 |     }
29 |     */
30 |     if crate::language_tools::is_chinese(sentence) {
31 |         info!("Use Tokenizer for Chinese term {}", sentence);
32 |         let mut jieba = FireSeqTokenizer {};
33 |         //TODO don't create a tokenizer every time
34 |         crate::tokenize_sentence_to_text_vec(&mut jieba, sentence)
35 |     } else {
36 |         // info!("Space Tokenizer {}", sentence);
37 |         let result : Vec<&str> = sentence.split_whitespace()
38 |             .collect();
39 |         debug!("Got tokens {:?}", &result);
40 |         let result:Vec<String> = result.iter().map(|&s|s.into()).collect();
41 |         result
42 |         // vec![String::from(sentence)]
43 |     }
44 | }
45 | 
46 | use lazy_static::lazy_static;
47 | use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
48 | 
49 | lazy_static! {
50 |     static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
51 | }
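// The registration of this tokenizer is not shown in this file; a minimal
// sketch of how it would be hooked into a tantivy index (assuming tantivy
// 0.22's TokenizerManager API, with the actual wiring living in query_engine):
//
//     use tantivy::Index;
//     fn register_fireseq_tokenizer(index: &Index) {
//         index.tokenizers().register(TOKENIZER_ID, FireSeqTokenizer {});
//     }
//
// Text fields then opt in by name, e.g.
// TextFieldIndexing::default().set_tokenizer(TOKENIZER_ID).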
52 | 
53 | pub const TOKENIZER_ID: &str = "fireseq_tokenizer";
54 | 
55 | #[derive(Clone)]
56 | pub struct FireSeqTokenizer;
57 | 
58 | 
59 | 
60 | pub struct JiebaTokenStream {
61 |     tokens: Vec<Token>,
62 |     index: usize,
63 | }
64 | 
65 | impl TokenStream for JiebaTokenStream {
66 |     fn advance(&mut self) -> bool {
67 |         if self.index < self.tokens.len() {
68 |             self.index = self.index + 1;
69 |             true
70 |         } else {
71 |             false
72 |         }
73 |     }
74 |     fn token(&self) -> &Token {
75 |         &self.tokens[self.index - 1]
76 |     }
77 |     fn token_mut(&mut self) -> &mut Token {
78 |         &mut self.tokens[self.index - 1]
79 |     }
80 | }
81 | 
82 | impl Tokenizer for FireSeqTokenizer {
83 |     type TokenStream<'a> = JiebaTokenStream;
84 |     fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream {
85 |         let mut indices = text.char_indices().collect::<Vec<_>>();
86 |         indices.push((text.len(), '\0'));
87 |         let orig_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, true);
88 |         let mut tokens = Vec::new();
89 |         // copy tantivy-jieba code for now
90 |         for token in orig_tokens {
91 |             tokens.push(Token {
92 |                 offset_from: indices[token.start].0,
93 |                 offset_to: indices[token.end].0,
94 |                 position: token.start,
95 |                 text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
96 |                 position_length: token.end - token.start,
97 |             });
98 |         }
99 |         /*
100 |         for i in 0..orig_tokens.len() {
101 |             let token = &orig_tokens[i];
102 |             match process_token_text(text, &indices, &token) {
103 |                 Some(text) => tokens.push(Token {
104 |                     offset_from: indices[token.start].0,
105 |                     offset_to: indices[token.end].0,
106 |                     position: token.start,
107 |                     text,
108 |                     position_length: token.end - token.start,
109 |                 }),
110 |                 None => ()
111 |             }
112 | 
113 |         }
114 |         */
115 |         JiebaTokenStream { tokens, index: 0 }
116 |     }
117 | }
-------------------------------------------------------------------------------- /fire_seq_search_server/src/lib.rs: --------------------------------------------------------------------------------
1 | pub mod post_query;
2 | pub mod load_notes;
3 | pub mod markdown_parser;
4 | pub mod language_tools;
5 | pub mod http_client;
6 | pub mod query_engine;
7 | pub mod word_frequency;
8 | pub mod local_llm;
9 | 
10 | 
11 | use log::debug;
12 | use crate::query_engine::ServerInformation;
13 | use crate::query_engine::NotebookSoftware::Logseq;
14 | 
15 | 
16 | #[macro_use]
17 | extern crate lazy_static;
18 | 
19 | pub static JOURNAL_PREFIX: &str = "@journal@";
20 | 
21 | 
22 | pub struct Article {
23 |     #[allow(dead_code)] /* TODO rethink if we need it 2024 Sep 21 */
24 |     file_name: String,
25 |     content: String
26 | }
27 | 
28 | // Based on https://github.com/jiegec/tantivy-jieba
29 | // tantivy-jieba is licensed under MIT, Copyright 2019-2020 Jiajie Chen
30 | // I had heavy modifications on it
31 | /*
32 | lazy_static!
{
33 |     static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
34 | }
35 | */
36 | 
37 | //pub const TOKENIZER_ID: &str = "fss_tokenizer";
38 | 
39 | 
40 | /*
41 | impl Tokenizer for JiebaTokenizer {
42 |     type TokenStream<'a> = JiebaTokenStream;
43 |     fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream {
44 |         let mut indices = text.char_indices().collect::<Vec<_>>();
45 |         indices.push((text.len(), '\0'));
46 |         let jieba : jieba_rs::Jieba = jieba_rs::Jieba::new(); //TODO use a static one
47 |         let orig_tokens = jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true);
48 |         let mut tokens = Vec::new();
49 |         for i in 0..orig_tokens.len() {
50 |             let token = &orig_tokens[i];
51 |             match process_token_text(text, &indices, &token) {
52 |                 Some(text) => tokens.push(Token {
53 |                     offset_from: indices[token.start].0,
54 |                     offset_to: indices[token.end].0,
55 |                     position: token.start,
56 |                     text,
57 |                     position_length: token.end - token.start,
58 |                 }),
59 |                 None => ()
60 |             }
61 | 
62 |         }
63 |         JiebaTokenStream { tokens, index: 0 }
64 | 
65 |     }
66 | }
67 | */
68 | 
69 | /*
70 | Thoughts on lowercase 2022-07-04:
71 | tantivy's default tokenizer will lowercase all English characters.
72 |     https://docs.rs/tantivy/latest/tantivy/tokenizer/index.html
73 | I'm just trying my best to simulate it
74 | However, I think there could be a better approach
75 | 1. use https://github.com/pemistahl/lingua-rs to determine the language of the text
76 | 2. Select proper tokenizer
77 | fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs::Token<'_>) -> Option<String> {
78 |     let raw = String::from(&text[(indices[token.start].0)..(indices[token.end].0)]);
79 |     let lower = raw.to_lowercase();
80 |     if lower.trim().is_empty() {
81 |         None
82 |     } else {
83 |         Some(lower)
84 |     }
85 | }
86 | */
87 | 
88 | // TODO use stub now
89 | pub fn tokenize_default(sentence: &str) -> Vec<String> {
90 |     let mut r = Vec::new();
91 |     r.push(sentence.to_owned());
92 |     r
93 | }
94 | /*
95 | // TODO: Move tokenizer-related things into language_tools
96 | pub fn tokenize_default(sentence: &str) -> Vec<String> {
97 |     /*
98 |     lazy_static!
{
99 |         static ref TK: JiebaTokenizer = crate::JiebaTokenizer {};
100 |     }
101 |     */
102 |     // TODO use static tokenizer
103 |     let mut tokenizer = crate::JiebaTokenizer{};
104 |     if language_tools::is_chinese(sentence) {
105 |         info!("Use Tokenizer for Chinese term {}", sentence);
106 |         tokenize_sentence_to_text_vec(&mut tokenizer, sentence)
107 |     } else {
108 |         // info!("Space Tokenizer {}", sentence);
109 |         let result : Vec<&str> = sentence.split_whitespace()
110 |             .collect();
111 |         // debug!("Got tokens {:?}", &result);
112 |         let result:Vec<String> = result.iter().map(|&s|s.into()).collect();
113 |         result
114 |         // vec![String::from(sentence)]
115 |     }
116 | }
117 | */
118 | 
119 | 
120 | use crate::language_tools::tokenizer::FireSeqTokenizer;
121 | pub fn tokenize_sentence_to_text_vec(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec<String> {
122 |     let tokens = tokenize_sentence_to_vector(tokenizer, sentence);
123 |     tokens_to_text_vec(&tokens)
124 | }
125 | pub fn tokenize_sentence_to_vector(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec<tantivy::tokenizer::Token> {
126 |     use tantivy::tokenizer::*;
127 |     let mut token_stream = tokenizer.token_stream(
128 |         sentence
129 |     );
130 |     let mut tokens = Vec::new();
131 | 
132 |     while let Some(token) = token_stream.next() {
133 |         tokens.push(token.clone());
134 | 
135 |     }
136 |     tokens
137 | }
138 | pub fn tokens_to_text_vec(tokens: &Vec<tantivy::tokenizer::Token>) -> Vec<String> {
139 |     let mut token_text = Vec::new();
140 |     for token in tokens {
141 |         token_text.push(token.text.clone());
142 |     }
143 |     token_text
144 | }
145 | 
146 | 
147 | 
148 | pub fn decode_cjk_str(original: String) -> Vec<String> {
149 |     use urlencoding::decode;
150 | 
151 |     let mut result = Vec::new();
152 |     for s in original.split(' ') {
153 |         let t = decode(s).expect("UTF-8");
154 |         debug!("Decode {} -> {}", s, t);
155 |         result.push(String::from(t));
156 |     }
157 | 
158 |     result
159 | }
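// A quick illustration of decode_cjk_str (hypothetical input, assumed
// behavior): the query arrives percent-encoded and space-separated, so
//
//     decode_cjk_str("%E5%92%96%E5%95%A1 rust".to_string())
//
// would yield vec!["咖啡", "rust"], which can then be handed to the tokenizer.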
160 | 
161 | 
162 | 
163 | // ============= BELOW IS TEST CASES ====================
164 | pub fn generate_server_info_for_test() -> ServerInformation {
165 |     let server_info = ServerInformation {
166 |         notebook_path: "stub_path".to_string(),
167 |         notebook_name: "logseq_notebook".to_string(),
168 |         enable_journal_query: false,
169 |         show_top_hits: 0,
170 |         show_summary_single_line_chars_limit: 0,
171 |         parse_pdf_links: false,
172 |         exclude_zotero_items: false,
173 |         software: Logseq,
174 |         convert_underline_hierarchy: true,
175 |         host: "127.0.0.1:22024".to_string(),
176 |         llm_enabled: false,
177 |         llm_max_waiting_time: 60,
178 |     };
179 |     server_info
180 | }
181 | 
182 | /*
183 | #[cfg(test)]
184 | mod test_tokenizer {
185 |     #[test]
186 |     fn english() {
187 |         let _tokens = base("Travel to japan", vec!["travel", "to", "japan"]);
188 |     }
189 | 
190 |     #[test]
191 |     fn simple_zh() {
192 |         let tokens = base("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途",
193 |             vec![
194 |                 // "a",
195 |                 "张华", "考上", "了", "北京", "大学", "北京大学", ";",
196 |                 "李萍", "进", "了", "中等", "技术", "术学", "学校", "技术学校", ";",
197 |                 "我", "在", "百货", "公司", "百货公司", "当", "售货", "货员", "售货员", ":",
198 |                 "我们", "都", "有", "光明", "的", "前途"
199 |             ]
200 |         );
201 |         // offset should be byte-indexed
202 |         assert_eq!(tokens[0].offset_from, 0);
203 |         assert_eq!(tokens[0].offset_to, "张华".bytes().len());
204 |         assert_eq!(tokens[1].offset_from, "张华".bytes().len());
205 |     }
206 |     fn base(sentence: &str, expect_tokens: Vec<&str>) -> Vec<tantivy::tokenizer::Token> {
207 |         use crate::{tokenize_sentence_to_vector,tokens_to_text_vec};
208 |         let tokenizer = crate::JiebaTokenizer {};
209 |         let tokens = tokenize_sentence_to_vector(&tokenizer, sentence);
210 |         let token_text = tokens_to_text_vec(&tokens);
211 |         // check tokenized text
212 |         assert_eq!(
213 |             token_text,
214 |             expect_tokens
215 |         );
216 |         tokens
217 |     }
218 | 
219 | 
220 | }
221 | */
222 | 
-------------------------------------------------------------------------------- /fire_seq_search_server/src/load_notes/mod.rs: --------------------------------------------------------------------------------
1 | use log::{debug, error, info};
2 | use std::process;
3 | 
4 | use crate::query_engine::ServerInformation;
5 | 
6 | 
7 | use std::borrow::Cow;
8 | use std::borrow::Borrow;
9 | 
10 | #[derive(Debug, Clone)]
11 | pub struct NoteListItem {
12 |     pub realpath: String,
13 |     pub title: String,
14 | }
15 | 
16 | use crate::query_engine::NotebookSoftware;
17 | pub fn retrive_note_list(server_info: &ServerInformation) -> Vec<NoteListItem> {
18 |     let path: &str = &server_info.notebook_path;
19 | 
20 |     let note_list = match &server_info.software {
21 |         NotebookSoftware::Obsidian => list_directory( Cow::from(path) , true),
22 |         NotebookSoftware::Logseq => {
23 |             let pp = path.to_string() + "/pages";
24 |             let mut pages = list_directory( Cow::from(pp), false );
25 | 
26 |             // TODO Journal prefix
27 |             let pp = path.to_string() + "/journals";
28 |             let jours = list_directory( Cow::from(pp), false );
29 | 
30 |             pages.extend(jours);
31 |             pages
32 |         },
33 |     };
34 |     // TODO didn't handle logseq
35 |     note_list
36 | }
37 | 
38 | fn list_directory(path: Cow<'_, str>, recursive: bool) -> Vec<NoteListItem> {
39 |     debug!("Listing directory {}", &path);
40 |     let mut result = Vec::new();
41 | 
42 |     let path_ref: &str = path.borrow();
43 |     let notebooks = match std::fs::read_dir(path_ref) {
44 |         Ok(x) => x,
45 |         Err(e) => {
46 |             error!("Fatal error ({:?}) when reading {}", e, &path);
47 |             process::abort();
48 |         }
49 |     };
50 | 
51 |     for note_result in notebooks {
52 |         let entry = match note_result {
53 |             Ok(x) => x,
54 |             Err(e) => {
55 |                 error!("Error during looping {:?}", &e);
56 |                 continue;
57 |             }
58 |         };
59 |         let file_type = match entry.file_type() {
60 |             Ok(x) => x,
61 |             Err(e) => {
62 |                 error!("Error: Can't get file type {:?} {:?}", &entry, &e);
63 |                 continue;
64 |             }
65 |         };
66 | 
67 |         let entry_path = entry.path();
68 |         let entry_path_str = entry_path.to_string_lossy();
69 | 
70 |         if file_type.is_dir() {
71 |             if recursive {
72 |                 let next = list_directory(entry_path_str, true);
73 |                 result.extend(next);
74 |             }
75 |             continue;
76 |         }
77 | 
78 |         if !entry_path_str.ends_with(".md") {
79 |             info!("skip non-md file {:?}", &entry);
80 |             continue;
81 |         }
82 | 
83 |         let note_title = match entry_path.file_stem() {
84 |             Some(osstr) => osstr.to_str().unwrap(),
85 |             None => {
86 |                 error!("Couldn't get file_stem for {:?}", entry_path);
87 |                 continue;
88 |             }
89 |         };
90 |         let row = NoteListItem {
91 |             realpath: entry_path_str.to_string(),
92 |             title: note_title.to_string(),
93 |         };
94 |         result.push(row);
95 |     }
96 |     return result;
97 | }
98 | 
-------------------------------------------------------------------------------- /fire_seq_search_server/src/local_llm/example_llama_response.json: --------------------------------------------------------------------------------
1 | {
2 |     "choices": [
3 |         {
4 |             "finish_reason": "stop",
5 |             "index": 0,
6 |             "message": {
7 |                 "content": " It seems like there might be some confusion in your question.
\"MS file format\" typically refers to the Microsoft Office document file formats, such as .docx, .xlsx, and .pptx.\n\nHowever, if you meant to ask about the WIF file format, then here's some information for you:\n\nWIF (Windows Image File) is not a widely used file format. It is a proprietary file format used by Microsoft's Windows Imaging Component (WIC) for storing and manipulating image data. WIF files can contain multiple images, each with its own metadata, and can be used for tasks such as image processing, thumbnail generation, and icon extraction.\n\nWIF files are not meant to be opened or edited by users directly, but rather are used as input and output files for applications that use the WIC API. If you need to work with WIF files, you would typically use a programming language and the WIC API to read and write the files.\n\nI hope this information helps clarify any confusion around the MS file format and the WIF file format. Let me know if you have any other questions!", 8 | "role": "assistant" 9 | } 10 | } 11 | ], 12 | "created": 1724517653, 13 | "id": "chatcmpl-4B", 14 | "model": "model", 15 | "object": "chat.completion", 16 | "usage": { 17 | "completion_tokens": 247, 18 | "prompt_tokens": 14, 19 | "total_tokens": 261 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/local_llm/mod.rs: -------------------------------------------------------------------------------- 1 | use log::{info, error}; 2 | use crate::query_engine::DocData; 3 | 4 | use std::collections::HashMap; 5 | use std::collections::VecDeque; 6 | use std::process::{Command, Stdio}; 7 | use std::fs::File; 8 | 9 | use std::sync::Arc; 10 | use tokio::sync::Mutex; 11 | use tokio::task::yield_now; 12 | use tokio::task; 13 | use tokio; 14 | 15 | use std::borrow::Cow; 16 | use std::borrow::Cow::Borrowed; 17 | 18 | 19 | //#[cfg(feature = "llm")] 20 | use { 21 | reqwest, 22 | reqwest::StatusCode, 23 | shellexpand::tilde, 24 | 25 | serde_derive::Deserialize, 26 | serde_derive::Serialize, 27 | }; 28 | 29 | 30 | 31 | // TODO Allow user to set prompt, instead of hard-coded in code 32 | const HARD_CODED_PROMPT_STR: &'static str = r##" 33 | You are a seasoned summary expert, capable of condensing and summarizing given articles, papers, or posts, accurately conveying the main idea to make the content easier to understand. 34 | 35 | You place great emphasis on user experience, never adding irrelevant content like "Summary," "The summary is as follows," "Original text," "You can check the original text if interested," or "Original link." Your summaries always convey the core information directly. 36 | 37 | You are adept at handling various large, small, and even chaotic text content, always accurately extracting key information and summarizing the core content globally to make it easier to understand. 
38 | 
39 | === Below is the article ===
40 | 
41 | "##;
42 | 
43 | // Generated by https://transform.tools/json-to-rust-serde
44 | #[derive(Debug, Serialize, Deserialize)]
45 | pub struct OpenAiData {
46 |     pub model: String,
47 |     pub messages: Vec<Message>,
48 | }
49 | 
50 | #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
51 | pub struct LlamaResponse {
52 |     pub choices: Vec<Choice>,
53 |     pub created: i64,
54 |     pub id: String,
55 |     pub model: String,
56 |     pub object: String,
57 |     pub usage: Usage,
58 | }
59 | 
60 | #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
61 | pub struct Choice {
62 |     pub finish_reason: String,
63 |     pub index: i64,
64 |     pub message: Message,
65 | }
66 | 
67 | #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
68 | pub struct Message {
69 |     pub content: String,
70 |     pub role: String,
71 | }
72 | 
73 | #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
74 | pub struct Usage {
75 |     pub completion_tokens: i64,
76 |     pub prompt_tokens: i64,
77 |     pub total_tokens: i64,
78 | }
79 | 
80 | #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
81 | pub struct HealthCheck {
82 |     pub slots_idle: i64,
83 |     pub slots_processing: i64,
84 |     pub status: String,
85 | }
86 | 
87 | // End generated
88 | 
89 | const LLM_SERVER_PORT: &str = "8081"; // TODO Remove this magic number
90 | 
91 | 
92 | #[derive(Debug)]
93 | pub struct LlmJob {
94 |     pub title: String,
95 |     pub body : String,
96 |     pub time : std::time::Instant, /* 16 bytes */
97 | }
98 | 
99 | struct JobProcessor {
100 |     done_job: HashMap<String, String>,
101 |     job_queue: VecDeque<LlmJob>,
102 | }
103 | 
104 | impl JobProcessor {
105 |     pub fn new() -> Self {
106 |         JobProcessor {
107 |             done_job: HashMap::new(),
108 |             job_queue: VecDeque::new(),
109 |         }
110 |     }
111 |     pub fn add(&mut self, doc:DocData) {
112 |         let title: &str = &doc.title;
113 |         info!("Job posted for {}", &title);
114 |         if !self.done_job.contains_key(title) {
115 |             let job: LlmJob = LlmJob {
116 |                 title: doc.title,
117 |                 body: doc.body,
118 |                 time: std::time::Instant::now(),
119 |             };
120 |             self.job_queue.push_back(job);
121 |         }
122 |     }
123 | }
124 | 
125 | use crate::ServerInformation;
126 | 
127 | 
128 | use sysinfo::Pid;
129 | 
130 | pub struct LlmEngine {
131 |     endpoint: String,
132 |     client: reqwest::Client,
133 |     job_cache: Arc<Mutex<JobProcessor>>,
134 |     server_info: Arc<ServerInformation>,
135 |     engine_pid: Pid,
136 | }
137 | 
138 | 
139 | 
140 | impl LlmEngine {
141 |     pub fn pid_hit_list(&self)->Pid { // TODO not a list yet
142 |         return self.engine_pid;
143 |     }
144 |     pub async fn llm_init(server_info: Arc<ServerInformation>) -> Self {
145 |         info!("llm called");
146 | 
147 |         let lfile = locate_llamafile().await;
148 |         let lfile:String = lfile.unwrap();
149 | 
150 |         let cmd = Command::new("nice")
151 |             .args([ "-n", "19",
152 |                 &lfile, "--nobrowser",
153 |                 "--port", LLM_SERVER_PORT,
154 |             ])
155 |             .stdout(Stdio::from(File::create("/tmp/llamafile.stdout.txt").unwrap()))
156 |             .stderr(Stdio::from(File::create("/tmp/llamafile.stderr.txt").unwrap()))
157 |             .spawn()
158 |             .expect("llm model failed to launch");
159 | 
160 |         yield_now().await;
161 |         let wait_llm = tokio::time::Duration::from_millis(500);
162 |         tokio::time::sleep(wait_llm).await;
163 |         task::yield_now().await;
164 | 
165 |         let endpoint = format!("http://127.0.0.1:{}", LLM_SERVER_PORT).to_string();
166 | 
167 |         loop {
168 |             let resp = reqwest::get(endpoint.to_owned() + "/health").await;
169 |             let resp = match resp {
170 |                 Err(_e) => {
171 |                     info!("llm not ready");
172 |                     let wait_llm = tokio::time::Duration::from_millis(1000);
173 |
tokio::time::sleep(wait_llm).await; 174 | task::yield_now().await; 175 | continue; 176 | }, 177 | Ok(r) => r, 178 | }; 179 | if resp.status() != StatusCode::from_u16(200).unwrap() { 180 | info!("endpoint failed"); 181 | //TODO error handling 182 | } 183 | break; 184 | } 185 | 186 | let client = reqwest::Client::new(); 187 | 188 | info!("llm engine initialized"); 189 | let map = Arc::new(Mutex::new( 190 | JobProcessor::new())); 191 | Self { 192 | endpoint, 193 | client, 194 | job_cache: map, 195 | server_info, 196 | engine_pid: Pid::from_u32(cmd.id()), 197 | } 198 | } 199 | 200 | fn build_data(full_text: Cow<'_, str>) -> OpenAiData { 201 | 202 | fn build_message(chat:String) -> Message { 203 | Message{ 204 | role: "user".to_owned(), 205 | content: chat, 206 | } 207 | } 208 | let mut msgs = Vec::new(); 209 | 210 | let prompt_string = &HARD_CODED_PROMPT_STR; 211 | let mut chat_text = prompt_string.to_string(); 212 | chat_text += &full_text; 213 | msgs.push( build_message(chat_text) ); 214 | 215 | OpenAiData { 216 | model: "model".to_owned(), 217 | messages: msgs, 218 | } 219 | } 220 | } 221 | 222 | impl LlmEngine{ 223 | pub async fn summarize(&self, full_text: &str) -> String { 224 | //http://localhost:8080/completion 225 | let ep = self.endpoint.to_owned() + "/v1/chat/completions"; 226 | let data = Self::build_data( Borrowed(full_text) ); 227 | let res = self.client.post(&ep) 228 | .header("Content-Type", "application/json") 229 | .json(&data) 230 | .send() 231 | .await 232 | .unwrap(); 233 | let content = res.text().await.unwrap(); 234 | let parsed: LlamaResponse = serde_json::from_str(&content).unwrap(); 235 | let v = parsed.choices; 236 | let v0 = v.into_iter().next().unwrap(); 237 | v0.message.content 238 | //TODO remove unwrap 239 | } 240 | 241 | pub async fn post_summarize_job(&self, doc: DocData) { 242 | //TODO error handler? 
243 |         let mut jcache = self.job_cache.lock().await;//.unwrap();
244 |         jcache.add(doc);
245 |         drop(jcache);
246 |     }
247 | 
248 |     pub async fn call_llm_engine(&self) {
249 |         let health = self.health().await.unwrap();
250 |         if health.slots_idle == 0 {
251 |             info!("No valid slot, continue");
252 |             return;
253 |         }
254 | 
255 |         let next_job: Option<LlmJob>;
256 | 
257 |         let mut jcache = self.job_cache.lock().await;//.unwrap();
258 |         next_job = jcache.job_queue.pop_front();
259 |         drop(jcache);
260 | 
261 |         let doc = match next_job {
262 |             Some(x) => x,
263 |             None => { return; },
264 |         };
265 | 
266 |         let title = doc.title.to_owned();
267 | 
268 |         let jcache = self.job_cache.lock().await;
269 |         if jcache.done_job.contains_key(&title) {
270 |             return;
271 |         }
272 |         drop(jcache);
273 | 
274 |         let waiting_time = doc.time.elapsed().as_secs();
275 |         let allowed_wait = self.server_info.llm_max_waiting_time;
276 |         if waiting_time > allowed_wait {
277 |             info!("Waiting for {} for {} seconds, discard",
278 |                 &title, waiting_time);
279 |             return;
280 |         }
281 | 
282 | 
283 |         info!("Start summarize job: {}", &title);
284 |         let summarize_result = self.summarize(&doc.body).await;
285 |         info!("Finished summarize job: {}", &title);
286 | 
287 |         let mut jcache = self.job_cache.lock().await;
288 |         jcache.done_job.insert(title, summarize_result);
289 |         drop(jcache);
290 |     }
291 | 
292 |     pub async fn quick_fetch(&self, title: &str) -> Option<String> {
293 |         let jcache = self.job_cache.lock().await;
294 |         return jcache.done_job.get(title).cloned();
295 |     }
296 | 
297 |     pub async fn get_llm_done_list(&self) -> Vec<String> {
298 |         let mut r = Vec::new();
299 |         let jcache = self.job_cache.lock().await;
300 |         for (title, _text) in &jcache.done_job {
301 |             r.push(title.to_owned());
302 |         }
303 |         return r;
304 |     }
305 | 
306 |     pub async fn health(&self) -> Result<HealthCheck, Box<dyn std::error::Error>> {
307 |         let res = self.client.get(self.endpoint.to_owned() + "/health")
308 |             .send()
309 |             .await
310 |             .unwrap();
311 |         let content = res.text().await.unwrap();
312 |         let parsed: HealthCheck = serde_json::from_str(&content).unwrap();
313 |         Ok(parsed)
314 |     }
315 | }
316 | 
317 | #[derive(Debug)]
318 | struct LlamaFileDef {
319 |     pub filename: String,
320 |     pub filepath: Option<String>,
321 |     pub sha256: String,
322 |     #[allow(dead_code)] /* TODO rethink if we want auto download 2024 Sep 21 */
323 |     pub download_link: String,
324 | }
325 | 
326 | 
327 | async fn locate_llamafile() -> Option<String> {
328 |     let mut lf = LlamaFileDef {
329 |         filename: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(),
330 |         filepath: None,
331 |         sha256: "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8".to_owned(),
332 |         download_link: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(),
333 |     };
334 | 
335 |     let lf_base = tilde("~/.llamafile/");
336 |     let lf_path = lf_base.to_string() + &lf.filename;
337 |     lf.filepath = Some( lf_path.to_owned() );
338 |     info!("lf {:?}", &lf);
339 | 
340 |     let _ppath = std::path::Path::new(&lf_path);
341 |     //let val = sha256::try_digest(ppath).unwrap();
342 |     let val = "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8";
343 |     if val != lf.sha256 {
344 |         error!("Wrong sha256sum for the model. Quit");
345 |         return None;
346 |     }
347 | 
348 |     return lf.filepath;
349 | 
350 | }
351 | 
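// For reference, the JSON body that summarize() posts to /v1/chat/completions
// (as built by build_data above) looks roughly like this -- a sketch assuming
// the llamafile server's OpenAI-compatible chat schema:
//
//     {
//       "model": "model",
//       "messages": [
//         { "role": "user", "content": "<prompt text + article body>" }
//       ]
//     }
//
// A captured response lives in src/local_llm/example_llama_response.json and
// deserializes into LlamaResponse.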
Quit"); 345 | return None; 346 | } 347 | 348 | return lf.filepath; 349 | 350 | } 351 | 352 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/main.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | use fire_seq_search_server::query_engine::{QueryEngine, ServerInformation}; 3 | use fire_seq_search_server::local_llm::LlmEngine; 4 | 5 | use fire_seq_search_server::query_engine::NotebookSoftware::*; 6 | 7 | use clap::Parser; 8 | 9 | #[derive(Parser)] 10 | #[command(author, version)] 11 | #[command(about = "Server for fireSeqSearch: hosting logseq notebooks at 127.0.0.1", 12 | long_about = None)] 13 | struct Cli{ 14 | #[arg(long="notebook_path")] 15 | notebook_path: String, 16 | #[arg(long="notebook_name")] 17 | notebook_name: Option, 18 | 19 | #[arg(long, default_value_t = false)] 20 | parse_pdf_links: bool, 21 | 22 | #[arg(long, default_value_t = false)] 23 | obsidian_md: bool, 24 | 25 | #[arg(long,default_value_t = false)] 26 | enable_journal_query: bool, 27 | 28 | #[arg(long,default_value_t = false)] 29 | exclude_zotero_items: bool, 30 | 31 | #[arg(long,default_value_t = 10, value_name="HITS")] 32 | show_top_hits: usize, 33 | 34 | /* 35 | This is really an arbitrary limit. 36 | https://stackoverflow.com/a/33758289/1166518 37 | It doesn't mean the width limit of output, 38 | but a threshold between short paragraph and long paragraph 39 | */ 40 | #[arg(long,default_value_t = 120*2, value_name="LEN")] 41 | show_summary_single_line_chars_limit: usize, 42 | 43 | #[arg(long="host")] 44 | host: Option, 45 | } 46 | 47 | use tokio::task; 48 | 49 | use axum; 50 | use axum::routing::get; 51 | use fire_seq_search_server::http_client::endpoints; 52 | use std::sync::Arc; 53 | use ctrlc; 54 | use kill_tree::{blocking::kill_tree}; 55 | 56 | #[tokio::main] 57 | async fn main() { 58 | env_logger::builder() 59 | .format_timestamp(None) 60 | .format_target(false) 61 | .init(); 62 | 63 | info!("main thread running"); 64 | let matches = Cli::parse(); 65 | let server_info: ServerInformation = build_server_info(matches); 66 | 67 | let mut llm_loader = None; 68 | if cfg!(feature="llm") { 69 | info!("LLM Enabled"); 70 | let serv_info = Arc::new(server_info.clone()); 71 | llm_loader = Some(task::spawn( async { LlmEngine::llm_init( serv_info ).await })); 72 | } 73 | 74 | let mut engine = QueryEngine::construct(server_info).await; 75 | 76 | info!("query engine build finished"); 77 | if cfg!(feature="llm") { 78 | let llm:LlmEngine = llm_loader.unwrap().await.unwrap(); 79 | let llm_arc = Arc::new(llm); 80 | let llm_poll = llm_arc.clone(); 81 | engine.llm = Some(llm_arc); 82 | 83 | let _poll_handle = tokio::spawn( async move { 84 | loop { 85 | llm_poll.call_llm_engine().await; 86 | let wait_llm = tokio::time::Duration::from_millis(500); 87 | tokio::time::sleep(wait_llm).await; 88 | } 89 | }); 90 | } 91 | 92 | 93 | let engine_arc = std::sync::Arc::new(engine); 94 | 95 | let engine_arc_for_destructor = engine_arc.clone(); 96 | ctrlc::set_handler(move|| { 97 | info!("Ctrl - C received. 
Exiting..."); 98 | if cfg!(feature="llm") { 99 | let llm = engine_arc_for_destructor.llm.as_ref().unwrap(); 100 | let pid = llm.pid_hit_list(); 101 | info!("Kill LLM Engine by pid {}", &pid); 102 | kill_tree(pid.as_u32()).unwrap(); 103 | } 104 | std::process::exit(0); 105 | }).expect("Error setting Ctrl-C handler"); 106 | 107 | let app = axum::Router::new() 108 | .route("/query/:term", get(endpoints::query)) 109 | .route("/server_info", get(endpoints::get_server_info)) 110 | .route("/wordcloud", get(endpoints::generate_word_cloud)) 111 | .route("/summarize/:title", get(endpoints::summarize)) 112 | .route("/llm_done_list", get(endpoints::get_llm_done_list)) 113 | .with_state(engine_arc.clone()); 114 | 115 | let listener = tokio::net::TcpListener::bind(&engine_arc.server_info.host) 116 | .await.unwrap(); 117 | axum::serve(listener, app).await.unwrap(); 118 | } 119 | 120 | 121 | 122 | fn build_server_info(args: Cli) -> ServerInformation { 123 | let notebook_name = match args.notebook_name { 124 | Some(x) => x.to_string(), 125 | None => { 126 | let chunks: Vec<&str> = args.notebook_path.split('/').collect(); 127 | let guess: &str = *chunks.last().unwrap(); 128 | info!("fire_seq_search guess the notebook name is {}", guess); 129 | String::from(guess) 130 | } 131 | }; 132 | let host: String = args.host.clone().unwrap_or_else(|| "127.0.0.1:3030".to_string()); 133 | let mut software = Logseq; 134 | if args.obsidian_md { 135 | software = Obsidian; 136 | } 137 | ServerInformation{ 138 | notebook_path: args.notebook_path, 139 | notebook_name, 140 | enable_journal_query: args.enable_journal_query, 141 | show_top_hits: args.show_top_hits, 142 | show_summary_single_line_chars_limit: 143 | args.show_summary_single_line_chars_limit, 144 | parse_pdf_links: args.parse_pdf_links, 145 | exclude_zotero_items:args.exclude_zotero_items, 146 | software, 147 | convert_underline_hierarchy: true, 148 | host, 149 | llm_enabled: cfg!(feature="llm"), 150 | llm_max_waiting_time: 180, 151 | } 152 | } 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/markdown_parser/markdown_to_text.rs: -------------------------------------------------------------------------------- 1 | // This file is based on https://github.com/fbecart/markdown_to_text 2 | // 3 | // MIT License 4 | // 5 | // Copyright (c) 2019 Arran France 6 | // 7 | // Permission is hereby granted, free of charge, to any person obtaining a copy 8 | // of this software and associated documentation files (the "Software"), to deal 9 | // in the Software without restriction, including without limitation the rights 10 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | // copies of the Software, and to permit persons to whom the Software is 12 | // furnished to do so, subject to the following conditions: 13 | // 14 | // The above copyright notice and this permission notice shall be included in all 15 | // copies or substantial portions of the Software. 16 | // 17 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | // SOFTWARE. 24 | 25 | 26 | #![warn(clippy::all, clippy::pedantic)] 27 | 28 | 29 | use log::{debug, warn}; 30 | use pulldown_cmark::{Event, Options, Parser, Tag}; 31 | use crate::markdown_parser::pdf_parser::try_parse_pdf; 32 | use crate::query_engine::ServerInformation; 33 | 34 | pub fn convert_from_logseq(markdown: &str, document_title: &str, server_info: &ServerInformation) -> String { 35 | let mut options = Options::empty(); 36 | options.insert(Options::ENABLE_STRIKETHROUGH); 37 | 38 | let parser = Parser::new_ext(markdown, options); 39 | let mut tags_stack = Vec::new(); 40 | let mut buffer = String::default(); 41 | 42 | // For each event we push into the buffer to produce the plain text version. 43 | for event in parser { 44 | // println!("{:?}", &event); 45 | match event { 46 | // The start and end events don't contain the text inside the tag. That's handled by the `Event::Text` arm. 47 | // However, a PDF link arrives as an Image event, so it gets special handling when its end tag is parsed. 48 | Event::Start(tag) => { 49 | start_tag(&tag, &mut buffer, &mut tags_stack); 50 | tags_stack.push(tag); 51 | } 52 | Event::End(tag) => { 53 | tags_stack.pop(); 54 | end_tag(&tag, &mut buffer, &tags_stack); 55 | if server_info.parse_pdf_links { 56 | let pdf_str = try_parse_pdf(&tag, server_info); 57 | match pdf_str { 58 | Some(s) => { 59 | debug!("PDF document {:?} appended to {}", &tag, document_title); 60 | buffer.push_str(&s) 61 | }, 62 | None => () 63 | } 64 | } 65 | } 66 | Event::Text(content) => { 67 | if !tags_stack.iter().any(is_strikethrough) { 68 | buffer.push_str(&content) 69 | } 70 | } 71 | Event::Code(content) => buffer.push_str(&content), 72 | Event::SoftBreak => buffer.push(' '), 73 | _ => (), 74 | } 75 | } 76 | buffer.trim().to_string() 77 | } 78 |
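The core idea of `convert_from_logseq` (and of `convert` below) is that pulldown-cmark emits a flat stream of events, and plain text falls out of keeping only the text-bearing ones. A standalone sketch of that event walk, stripped of the PDF and strikethrough handling (assumes the same pulldown-cmark generation used in this file):

```rust
use pulldown_cmark::{Event, Options, Parser};

fn plain_text(markdown: &str) -> String {
    let mut out = String::new();
    for event in Parser::new_ext(markdown, Options::ENABLE_STRIKETHROUGH) {
        match event {
            // Keep only human-visible text; start/end tags carry no inner text.
            Event::Text(t) | Event::Code(t) => out.push_str(&t),
            Event::SoftBreak | Event::HardBreak => out.push(' '),
            _ => (),
        }
    }
    out.trim().to_string()
}

fn main() {
    assert_eq!(plain_text("Some **bold** and `code`"), "Some bold and code");
}
```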
79 | 80 | 81 | #[must_use] 82 | pub fn convert(markdown: &str) -> String { 83 | // GFM tables and task lists are not enabled. 84 | let mut options = Options::empty(); 85 | options.insert(Options::ENABLE_STRIKETHROUGH); 86 | 87 | let parser = Parser::new_ext(markdown, options); 88 | let mut tags_stack = Vec::new(); 89 | let mut buffer = String::default(); 90 | 91 | // For each event we push into the buffer to produce the plain text version. 92 | for event in parser { 93 | match event { 94 | // The start and end events don't contain the text inside the tag. That's handled by the `Event::Text` arm. 95 | Event::Start(tag) => { 96 | start_tag(&tag, &mut buffer, &mut tags_stack); 97 | tags_stack.push(tag); 98 | } 99 | Event::End(tag) => { 100 | tags_stack.pop(); 101 | end_tag(&tag, &mut buffer, &tags_stack); 102 | } 103 | Event::Text(content) => { 104 | if !tags_stack.iter().any(is_strikethrough) { 105 | buffer.push_str(&content) 106 | } 107 | } 108 | Event::Code(content) => buffer.push_str(&content), 109 | Event::SoftBreak => buffer.push(' '), 110 | _ => (), 111 | } 112 | } 113 | buffer.trim().to_string() 114 | } 115 | 116 | fn start_tag(tag: &Tag, buffer: &mut String, tags_stack: &mut Vec<Tag>) { 117 | match tag { 118 | Tag::Link(_, _, title) | Tag::Image(_, _, title) => buffer.push_str(&title), 119 | Tag::Item => { 120 | buffer.push('\n'); 121 | let mut lists_stack = tags_stack 122 | .iter_mut() 123 | .filter_map(|tag| match tag { 124 | Tag::List(nb) => Some(nb), 125 | _ => None, 126 | }) 127 | .collect::<Vec<_>>(); 128 | let prefix_tabs_count = lists_stack.len() - 1; 129 | for _ in 0..prefix_tabs_count { 130 | buffer.push('\t') 131 | } 132 | if let Some(Some(nb)) = lists_stack.last_mut() { 133 | buffer.push_str(&nb.to_string()); 134 | buffer.push_str(". "); 135 | *nb += 1; 136 | } else { 137 | buffer.push_str("• "); 138 | } 139 | } 140 | Tag::Paragraph | Tag::CodeBlock(_) | Tag::Heading(..) => buffer.push('\n'), 141 | _ => (), 142 | } 143 | } 144 | 145 | fn end_tag(tag: &Tag, buffer: &mut String, tags_stack: &[Tag]) { 146 | match tag { 147 | Tag::Paragraph | Tag::Heading(..) => buffer.push('\n'), 148 | Tag::CodeBlock(_) => { 149 | if !buffer.ends_with('\n') { 150 | buffer.push('\n'); 151 | } 152 | } 153 | Tag::List(_) => { 154 | let is_sublist = tags_stack.iter().any(|tag| match tag { 155 | Tag::List(_) => true, 156 | _ => false, 157 | }); 158 | if !is_sublist { 159 | buffer.push('\n') 160 | } 161 | } 162 | _ => (), 163 | } 164 | } 165 | 166 | fn is_strikethrough(tag: &Tag) -> bool { 167 | match tag { 168 | Tag::Strikethrough => true, 169 | _ => false, 170 | } 171 | } 172 | 173 | #[cfg(test)] 174 | mod tests { 175 | use crate::generate_server_info_for_test; 176 | use super::convert; 177 | use super::convert_from_logseq; 178 | 179 | #[test] 180 | fn links_to_pdf() { 181 | let markdown = r#"Refer to ![order.pdf](../assets/readings_1634910859348_0.pdf)"#; 182 | let expected = "Refer to order.pdf"; 183 | assert_eq!(convert(markdown), expected); 184 | 185 | let mut info = generate_server_info_for_test(); 186 | info.notebook_path = "C:\\Users\\z2369li\\Nextcloud\\logseq_notebook".to_string(); 187 | info.parse_pdf_links = true; 188 | // println!("{:?}", &info); 189 | let _a = convert_from_logseq(markdown, "title", &info); 190 | } 191 | 192 | #[test] 193 | fn basic_inline_strong() { 194 | let markdown = r#"**Hello**"#; 195 | let expected = "Hello"; 196 | assert_eq!(convert(markdown), expected); 197 | } 198 | 199 | #[test] 200 | fn basic_inline_emphasis() { 201 | let markdown = r#"_Hello_"#; 202 | let expected = "Hello"; 203 | assert_eq!(convert(markdown), expected); 204 | } 205 | 206 | #[test] 207 | fn basic_header() { 208 | let markdown = r#"# Header 209 | 210 | ## Sub header 211 | 212 | End paragraph."#; 213 | let expected = "Header 214 | 215 | Sub header 216 | 217 | End paragraph."; 218 | assert_eq!(convert(markdown), expected); 219 | } 220 | 221 | #[test] 222 | fn alt_header() { 223 | let markdown = r#" 224 | Header 225 | ====== 226 | 227 | End paragraph."#; 228 | let expected = "Header 229 | 230 | End paragraph."; 231 | assert_eq!(convert(markdown), expected); 232 | } 233 | 234 | #[test] 235 | fn
strong_emphasis() { 236 | let markdown = r#"**asterisks and _underscores_**"#; 237 | let expected = "asterisks and underscores"; 238 | assert_eq!(convert(markdown), expected); 239 | } 240 | 241 | #[test] 242 | fn strikethrough() { 243 | let markdown = r#"This was ~~erased~~ deleted."#; 244 | let expected = "This was deleted."; 245 | assert_eq!(convert(markdown), expected); 246 | } 247 | 248 | #[test] 249 | fn mixed_list() { 250 | let markdown = r#"Start paragraph. 251 | 252 | 1. First ordered list item 253 | 2. Another item 254 | 1. Actual numbers don't matter, just that it's a number 255 | 1. Ordered sub-list 256 | 4. And another item. 257 | 258 | End paragraph."#; 259 | 260 | let expected = "Start paragraph. 261 | 262 | 1. First ordered list item 263 | 2. Another item 264 | 3. Actual numbers don't matter, just that it's a number 265 | 4. Ordered sub-list 266 | 5. And another item. 267 | 268 | End paragraph."; 269 | assert_eq!(convert(markdown), expected); 270 | } 271 | 272 | #[test] 273 | fn nested_lists() { 274 | let markdown = r#" 275 | * alpha 276 | * beta 277 | * one 278 | * two 279 | * gamma 280 | "#; 281 | let expected = "• alpha 282 | • beta 283 | \t• one 284 | \t• two 285 | • gamma"; 286 | assert_eq!(convert(markdown), expected); 287 | } 288 | 289 | #[test] 290 | fn list_with_header() { 291 | let markdown = r#"# Title 292 | * alpha 293 | * beta 294 | "#; 295 | let expected = r#"Title 296 | 297 | • alpha 298 | • beta"#; 299 | assert_eq!(convert(markdown), expected); 300 | } 301 | 302 | #[test] 303 | fn basic_link() { 304 | let markdown = "I'm an [inline-style link](https://www.google.com)."; 305 | let expected = "I'm an inline-style link."; 306 | assert_eq!(convert(markdown), expected) 307 | } 308 | 309 | #[ignore] 310 | #[test] 311 | fn link_with_itself() { 312 | let markdown = "Go to [https://www.google.com]."; 313 | let expected = "Go to https://www.google.com."; 314 | assert_eq!(convert(markdown), expected) 315 | } 316 | 317 | #[test] 318 | fn basic_image() { 319 | let markdown = "As displayed in ![img alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png)."; 320 | let expected = "As displayed in img alt text."; 321 | assert_eq!(convert(markdown), expected); 322 | } 323 | 324 | #[test] 325 | fn inline_code() { 326 | let markdown = "This is `inline code`."; 327 | let expected = "This is inline code."; 328 | assert_eq!(convert(markdown), expected); 329 | } 330 | 331 | #[test] 332 | fn code_block() { 333 | let markdown = r#"Start paragraph. 334 | ```javascript 335 | var s = "JavaScript syntax highlighting"; 336 | alert(s); 337 | ``` 338 | End paragraph."#; 339 | let expected = r#"Start paragraph. 340 | 341 | var s = "JavaScript syntax highlighting"; 342 | alert(s); 343 | 344 | End paragraph."#; 345 | assert_eq!(convert(markdown), expected); 346 | } 347 | 348 | #[test] 349 | fn block_quote() { 350 | let markdown = r#"Start paragraph. 351 | 352 | > Blockquotes are very handy in email to emulate reply text. 353 | > This line is part of the same quote. 354 | 355 | End paragraph."#; 356 | let expected = "Start paragraph. 357 | 358 | Blockquotes are very handy in email to emulate reply text. This line is part of the same quote. 359 | 360 | End paragraph."; 361 | assert_eq!(convert(markdown), expected); 362 | } 363 | 364 | #[test] 365 | fn paragraphs() { 366 | let markdown = r#"Paragraph 1. 367 | 368 | Paragraph 2."#; 369 | let expected = "Paragraph 1. 
370 | 371 | Paragraph 2."; 372 | assert_eq!(convert(markdown), expected); 373 | } 374 | } 375 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/markdown_parser/mod.rs: -------------------------------------------------------------------------------- 1 | mod markdown_to_text; 2 | mod pdf_parser; 3 | 4 | use std::borrow::Cow; 5 | use regex::Regex; 6 | use crate::query_engine::ServerInformation; 7 | 8 | // https://docs.rs/regex/latest/regex/#repetitions 9 | // https://stackoverflow.com/a/8303552/1166518 10 | pub fn exclude_advanced_query(md: Cow<'_,str>) -> Cow<'_, str> { 11 | if !md.contains('#') { 12 | return md; 13 | } 14 | 15 | lazy_static! { 16 | static ref RE: Regex = Regex::new( 17 | r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY") 18 | .unwrap(); 19 | } 20 | return RE.replace_all(&md, " ").into_owned().into(); 21 | } 22 | 23 | fn hack_specific_chars_cow(text: Cow<'_, str>) -> String { 24 | //https://www.compart.com/en/unicode/U+2022 25 | let bullet = char::from_u32(0x00002022).unwrap(); 26 | text.replace(bullet, " ") 27 | } 28 | 29 | use crate::query_engine::NotebookSoftware; 30 | use std::borrow::Borrow; 31 | use log::debug; 32 | 33 | fn remove_obsidian_header<'a>(content: Cow<'a, str>) -> Cow<'a, str> { 34 | lazy_static! { 35 | static ref RE: Regex = Regex::new( 36 | r"^---[\s\S]*?---" // anchored, so only a leading YAML front matter block is removed 37 | ).unwrap(); 38 | } 39 | debug!("from {:?}", &content); 40 | let cr = content.borrow(); 41 | let ret: Cow<str> = RE.replace(cr, " "); 42 | debug!("into {:?}", &ret); 43 | ret.into_owned().into() 44 | } 45 | 46 | pub fn parse_logseq_notebook(md: Cow<'_,str>, title: &str, server_info: &ServerInformation) -> String { 47 | // Now we do some parsing for this file 48 | let content = exclude_advanced_query(md); 49 | let content = hack_specific_chars_cow(content); 50 | let content = remove_angled_bracket(content); 51 | 52 | let content = Cow::from(content); 53 | let content = match &server_info.software { 54 | NotebookSoftware::Obsidian => remove_obsidian_header(content), 55 | _ => content, 56 | }; 57 | let content: String = markdown_to_text::convert_from_logseq( 58 | &content, title, server_info); 59 | 60 | //let content = content.into_owned(); 61 | content 62 | 63 | } 64 |
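The pipeline above chains `Cow`-returning passes so that a note which needs no rewriting is never copied. A minimal sketch of one such pass, mirroring the (now anchored) front matter regex in `remove_obsidian_header`; the sample note text is made up:

```rust
use std::borrow::Cow;
use regex::Regex;

// Borrows the input unchanged when there is nothing to strip;
// allocates only when a rewrite actually happens.
fn strip_frontmatter(md: &str) -> Cow<'_, str> {
    let re = Regex::new(r"^---[\s\S]*?---").unwrap();
    re.replace(md, " ")
}

fn main() {
    let note = "---\ntags: demo\n---\nBody text";
    assert_eq!(strip_frontmatter(note).trim(), "Body text");
    // No YAML header: replace() takes the cheap Cow::Borrowed path.
    assert!(matches!(strip_frontmatter("plain note"), Cow::Borrowed(_)));
}
```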
65 | // TODO This function is no longer used. 2025-04-30 66 | pub fn parse_to_plain_text(md: &str) -> String { 67 | let plain_text: String = markdown_to_text::convert(md); 68 | let plain_text = hack_specific_chars(plain_text); 69 | let plain_text = remove_angled_bracket(plain_text); 70 | 71 | // println!("{}", &plain_text); 72 | plain_text 73 | } 74 | 75 | // < > will break html elements 76 | fn remove_angled_bracket(text: String) -> String { 77 | let s1 = text.replace('<', "("); 78 | let s2 = s1.replace('>', ")"); 79 | s2 80 | } 81 | 82 | fn hack_specific_chars(text: String) -> String { 83 | //https://www.compart.com/en/unicode/U+2022 84 | let bullet = char::from_u32(0x00002022).unwrap(); 85 | // println!("{}", bullet); 86 | text.replace(bullet, " ") 87 | } 88 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/markdown_parser/pdf_parser.rs: -------------------------------------------------------------------------------- 1 | 2 | use std::path::Path; 3 | use log::{debug, error, info}; 4 | use pulldown_cmark::Tag; 5 | use crate::query_engine::ServerInformation; 6 | 7 | // extern crate pdf_extract; 8 | extern crate pdf_extract_temporary_mitigation_panic; 9 | use pdf_extract_temporary_mitigation_panic::extract_text; 10 | 11 | pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option<String> { 12 | 13 | let destination_uri = match tag { 14 | Tag::Image(_link_type, destination_uri, _title) => { 15 | if !destination_uri.ends_with(".pdf") { 16 | return None; 17 | } 18 | debug!("Trying to parse PDF {:?}", tag); 19 | // println!("{:?}", &tag); 20 | destination_uri.replace("../", "") 21 | }, 22 | _ => {return None;} 23 | }; 24 | 25 | let path = Path::new(&server_info.notebook_path); 26 | let pdf_path = path.join(destination_uri); 27 | // println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); 28 | if !pdf_path.is_file() { 29 | error!("pdf_path is not a file, skipping {:?}", &pdf_path); 30 | return None; 31 | } 32 | 33 | 34 | let text = match extract_text(&pdf_path) { 35 | Ok(s) => {s} 36 | Err(e) => { 37 | error!("Failed ({:?}) to load pdf {:?}", e, pdf_path); 38 | return None; 39 | } 40 | }; 41 | 42 | match pdf_path.file_name() { 43 | None => {error!("Extracted text len {}, file_name() failed", text.len());} 44 | Some(f) => {info!("Extracted text from {:?} len {}", f, text.len());} 45 | }; 46 | 47 | 48 | Some(text) 49 | }
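The shape of `try_parse_pdf` is: normalize the relative link, resolve it against the notebook root, and treat every failure as a soft skip so a broken link never aborts indexing. A hedged, self-contained sketch of that flow using the mainstream pdf-extract crate (the fork imported above exposes the same `extract_text` name); the paths are illustrative only:

```rust
use std::path::Path;

// Resolve a markdown image target against the notebook root, then extract text.
fn pdf_to_text(notebook_path: &str, destination_uri: &str) -> Option<String> {
    let pdf_path = Path::new(notebook_path).join(destination_uri.replace("../", ""));
    if !pdf_path.is_file() {
        return None; // a dangling link in a note must not abort indexing
    }
    pdf_extract::extract_text(&pdf_path).ok()
}

fn main() {
    // With no notebook on disk this prints None, which is the point:
    // missing assets are skipped, not fatal.
    println!("{:?}", pdf_to_text("/tmp/notebook", "assets/demo.pdf"));
}
```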
-------------------------------------------------------------------------------- /fire_seq_search_server/src/post_query/app_uri.rs: -------------------------------------------------------------------------------- 1 | use log::{error, info}; 2 | use crate::post_query::logseq_uri::{generate_logseq_uri,parse_date_from_str}; 3 | use crate::post_query::obsidian_uri::generate_obsidian_uri; 4 | use crate::query_engine::ServerInformation; 5 | 6 | 7 | // Maybe I should wrap them with the same interface? -Zhenbo Li 2023-Feb-05 8 | // Deprecated on 2024-Sep-21 9 | pub fn generate_uri(title: &str, is_page_hit: &bool, server_info: &ServerInformation) -> String { 10 | if server_info.software == Obsidian { 11 | info!("Generating Obsidian URI for {}", title); 12 | if !is_page_hit { 13 | error!("Journal is not supported for Obsidian yet"); 14 | return String::from("https://github.com/Endle/fireSeqSearch/issues"); 15 | } 16 | return generate_obsidian_uri(title, *is_page_hit, server_info); 17 | } 18 | 19 | generate_logseq_uri(title, *is_page_hit, server_info) 20 | } 21 | 22 | use crate::query_engine::NotebookSoftware::{Logseq,Obsidian}; 23 | 24 | pub fn generate_uri_v2(title: &str, server_info: &ServerInformation) -> String { 25 | match &server_info.software { 26 | Obsidian => generate_obsidian_uri(title, true, server_info), 27 | Logseq => { 28 | let dt = parse_date_from_str(title); 29 | // TODO remove this duplicated calculation 30 | // I don't care about performance here; I want to keep the code clean - 2024 Sep 21 31 | generate_logseq_uri(title, dt.is_none(), server_info) 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/post_query/hit_parsed.rs: -------------------------------------------------------------------------------- 1 | use log::debug; 2 | use crate::JOURNAL_PREFIX; 3 | use crate::post_query::app_uri::generate_uri_v2; 4 | use crate::post_query::highlighter::highlight_keywords_in_body; 5 | use crate::query_engine::ServerInformation; 6 | 7 | #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)] 8 | pub struct FireSeqSearchHitParsed { 9 | // pub title: String, 10 | pub title: String, 11 | pub summary: String, 12 | pub score: f32, 13 | pub metadata: String, 14 | pub logseq_uri: String, 15 | } 16 | 17 | use tantivy::schema::document::OwnedValue; 18 | impl FireSeqSearchHitParsed { 19 | 20 | //TODO remove this duplicated code 21 | fn take_str_from_doc(doc: &tantivy::TantivyDocument, pos: usize) -> &str { 22 | /* 23 | let title: &str = doc.field_values()[0].value().as_text().unwrap(); 24 | let body: &str = doc.field_values()[1].value().as_text().unwrap(); 25 | */ 26 | let v: &OwnedValue = doc.field_values()[pos].value(); 27 | match v { 28 | OwnedValue::Str(s) => s, 29 | _ => panic!("Wrong type") 30 | } 31 | } 32 | pub fn from_tantivy(doc: &tantivy::TantivyDocument, 33 | score: f32, term_tokens: &Vec<String>, 34 | server_info: &ServerInformation) -> FireSeqSearchHitParsed { 35 | 36 | let title = Self::take_str_from_doc(doc, 0); 37 | let body = Self::take_str_from_doc(doc, 1); 38 | let summary = highlight_keywords_in_body(body, term_tokens, server_info); 39 | 40 | let mut is_page_hit = true; 41 | let title = if title.starts_with(JOURNAL_PREFIX) { 42 | assert!(server_info.enable_journal_query); 43 | debug!("Found a journal hit {}", title); 44 | is_page_hit = false; 45 | let t = title.strip_prefix(JOURNAL_PREFIX); 46 | t.unwrap().to_string() 47 | } else { 48 | title.to_string() 49 | }; 50 | 51 | let logseq_uri = generate_uri_v2(&title, server_info); 52 | 53 | debug!("Processing a hit, title={}, uri={}, summary_len={}", &title, &logseq_uri, summary.len()); 54 | 55 | let metadata: String = if is_page_hit { 56 | String::from("page_hit") 57 | } else { 58 | String::from("journal_hit") 59 | }; 60 | 61 | FireSeqSearchHitParsed { 62 | title, 63 | summary, 64 | score, 65 | logseq_uri, 66 | metadata, 67 | } 68 | } 69 | 70 | // Wrap this part into a function, so I can document it and add tests - ZLi 2023-Jan 71 | pub fn
serde_to_string(&self) -> String { 72 | serde_json::to_string(self).unwrap() 73 | } 74 | 75 | } 76 | 77 | 78 | 79 | #[cfg(test)] 80 | mod test_serde { 81 | // use crate::generate_server_info_for_test; 82 | // use crate::post_query::hit_parsed::FireSeqSearchHitParsed; 83 | // use crate::post_query::logseq_uri::generate_logseq_uri; 84 | 85 | 86 | // fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed { 87 | // let server_info = generate_server_info_for_test(); 88 | // let logseq_uri = generate_logseq_uri(title, &true, &server_info); 89 | // FireSeqSearchHitParsed{ 90 | // title: title.to_owned(), 91 | // summary: String::from("summary"), 92 | // score: 1.0, 93 | // logseq_uri, 94 | // metadata: String::from("meta") 95 | // } 96 | // } 97 | // fn serde(title: &str) -> String { 98 | // let h = get_parsed_hit(title); 99 | // h.serde_to_string() 100 | // } 101 | 102 | // TODO: This solution is buggy. Consider PR#100, which might be a better idea. -Zli, 2023-Jan 103 | // This test disabled on 2023-Feb-02 for PR #112 104 | // #[test] 105 | // fn test_serde_uri() { 106 | // assert!(serde("EU4").contains("\"logseq://graph/logseq_notebook?page=EU4\"")); 107 | // 108 | // assert!(serde("Games/EU4").contains("\"logseq://graph/logseq_notebook?page=Games/EU4\"")); 109 | // 110 | // } 111 | } 112 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/post_query/logseq_uri.rs: -------------------------------------------------------------------------------- 1 | use log::error; 2 | use crate::ServerInformation; 3 | use url::Url; 4 | 5 | /// 6 | /// 7 | /// # Arguments 8 | /// 9 | /// * `file_name`: File name of the Logseq page, without .md 10 | /// * `server_info`: 11 | /// 12 | /// returns: String 13 | /// 14 | /// # Examples 15 | /// 16 | /// ``` 17 | /// use fire_seq_search_server::post_query::logseq_uri::process_note_title; 18 | /// let server_info = fire_seq_search_server::generate_server_info_for_test(); 19 | /// let r = process_note_title("Canada___Clothes", &server_info); 20 | /// assert_eq!("Canada/Clothes", &r); 21 | /// let r = process_note_title("C++", &server_info); 22 | /// assert_eq!("C++", &r); 23 | /// let r = process_note_title("Programming Languages%2FTypes", &server_info); 24 | /// assert_eq!("Programming Languages/Types", &r); 25 | /// let r = process_note_title("Context of Std%3A%3Astring (highlights)", &server_info); 26 | /// assert_eq!("Context of Std::string (highlights)", &r); 27 | /// ``` 28 | // I tried doing this step while loading the notebooks, but it reduced query sensitivity 29 | // https://github.com/Endle/fireSeqSearch/issues/99 30 | // 2022-12-30 31 | pub fn process_note_title(file_name: &str, server_info: &ServerInformation) -> String { 32 | // let file_name = file_name.replace("%2F", "/"); 33 | let file_name = urlencoding::decode(file_name).expect("UTF-8").to_string(); 34 | if server_info.convert_underline_hierarchy { 35 | return file_name.replace("___", "/"); 36 | } 37 | file_name 38 | } 39 | 40 | pub fn generate_logseq_uri(title: &str, is_page_hit: bool, server_info: &ServerInformation) -> String { 41 | return if is_page_hit { 42 | let title = process_note_title(title, server_info); 43 | let mut uri = Url::parse("logseq://graph/").unwrap(); 44 | uri.set_path(&server_info.notebook_name); 45 | uri.query_pairs_mut() 46 | .append_pair("page", &title); 47 | uri.to_string() 48 | } else { 49 | generate_logseq_journal_uri(title, server_info) 50 | 51 | }; 52 | //
logseq://graph/logseq_notebook?page=Nov%2026th%2C%202022 53 | } 54 | 55 | #[derive(PartialEq, Debug)] 56 | pub struct JournalDate { 57 | pub year: u32, 58 | pub month: u32, 59 | pub date: u32, 60 | } 61 | 62 | impl JournalDate { 63 | pub fn to_str(&self, _: &ServerInformation) -> String { 64 | let mut result = Vec::new(); 65 | result.push(match self.month { 66 | 1 => "Jan", 67 | 2 => "Feb", 68 | 3 => "Mar", 69 | 4 => "Apr", 70 | 5 => "May", 71 | 6 => "Jun", 72 | 7 => "Jul", 73 | 8 => "Aug", 74 | 9 => "Sep", 75 | 10 => "Oct", 76 | 11 => "Nov", 77 | 12 => "Dec", 78 | _ => { 79 | error!("Unexpected month {}", self.month); 80 | "ErrMonth" 81 | } 82 | }.to_string()); 83 | 84 | result.push(" ".to_string()); 85 | match self.date { 86 | 1|21|31 => { 87 | let s = self.date.to_string(); 88 | result.push(s); 89 | result.push("st".to_string()); 90 | }, 91 | 2|22 => { 92 | let s = self.date.to_string(); 93 | result.push(s); 94 | result.push("nd".to_string()); 95 | }, 96 | 3|23 => { 97 | let s = self.date.to_string(); 98 | result.push(s); 99 | result.push("rd".to_string()); 100 | }, 101 | _ => { 102 | let s = self.date.to_string(); 103 | result.push(s); 104 | result.push("th".to_string()); 105 | } 106 | }; 107 | 108 | result.push(", ".to_string()); 109 | result.push(self.year.to_string()); 110 | 111 | result.concat() 112 | } 113 | } 114 | 115 | 116 | fn generate_logseq_journal_uri(title: &str, server_info: &ServerInformation) -> String { 117 | let mut uri = Url::parse("logseq://graph/").unwrap(); 118 | uri.set_path(&server_info.notebook_name); 119 | let dt = parse_date_from_str(title); 120 | let dt = match dt { 121 | None => { 122 | error!("Failed to gen JournalDate from {}", title); 123 | return format!("logseq://graph/{}", server_info.notebook_name); 124 | } 125 | Some(x) => x 126 | }; 127 | let journal_name = dt.to_str(server_info); 128 | // Note: an earlier draft assembled the URI by hand with format! here; 129 | // Url's query_pairs_mut below handles the percent-encoding instead. 130 | uri.query_pairs_mut() 131 | .append_pair("page", &journal_name); 132 | uri.to_string() 133 | } 134 | 135 | fn parse_slice_to_u32(slice: Option<&str>) -> Option<u32> { 136 | match slice { 137 | Some(x) => { 138 | let y = x.parse::<u32>(); 139 | match y { 140 | Ok(i) => Some(i), 141 | Err(e) => { 142 | error!("Parse({}) Int Error: ({:?})", x, e); 143 | None 144 | } 145 | } 146 | }, 147 | None => { 148 | error!("Invalid slice"); 149 | None 150 | } 151 | 152 | } 153 | } 154 | 155 | pub fn parse_date_from_str(title: &str) -> Option<JournalDate> { 156 | if title.len() != 10 { 157 | return None; 158 | } 159 | 160 | let year = match parse_slice_to_u32(title.get(0..4)) { 161 | Some(x) => x, 162 | None => { 163 | return None; 164 | } 165 | }; 166 | let month = match parse_slice_to_u32(title.get(5..=6)) { 167 | Some(x) => x, 168 | None => { 169 | return None; 170 | } 171 | }; 172 | let date = match parse_slice_to_u32(title.get(8..=9)) { 173 | Some(x) => x, 174 | None => { 175 | return None; 176 | } 177 | }; 178 | Some(JournalDate{ 179 | year, 180 | month, 181 | date 182 | }) 183 | } 184 |
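To see why the journal URIs in the tests below look the way they do, here is the mapping worked end to end: a journal file title like "2022_12_14" parses into a JournalDate, renders as "Dec 14th, 2022", and `query_pairs_mut` form-encodes that into the query string (space becomes '+', comma becomes %2C). A standalone sketch of just the encoding step, with the notebook name hard-coded:

```rust
use url::Url;

fn main() {
    let mut uri = Url::parse("logseq://graph/").unwrap();
    uri.set_path("logseq_notebook");
    // query_pairs_mut applies form-encoding: ' ' -> '+', ',' -> %2C.
    uri.query_pairs_mut().append_pair("page", "Dec 14th, 2022");
    assert_eq!(uri.to_string(),
               "logseq://graph/logseq_notebook?page=Dec+14th%2C+2022");
}
```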
185 | #[cfg(test)] 186 | mod test_logseq_uri { 187 | use crate::generate_server_info_for_test; 188 | use crate::post_query::logseq_uri::{generate_logseq_journal_uri, generate_logseq_uri}; 189 | use crate::post_query::logseq_uri::parse_date_from_str; 190 | 191 | 192 | #[test] 193 | fn test_parse() { 194 | let server_info = generate_server_info_for_test(); 195 | assert_eq!(None, parse_date_from_str("22")); 196 | let d = parse_date_from_str("2022_12_05"); 197 | assert!(d.is_some()); 198 | let d = d.unwrap(); 199 | assert_eq!(d.to_str(&server_info), "Dec 5th, 2022"); 200 | } 201 | #[test] 202 | fn test_generate() { 203 | 204 | let server_info = generate_server_info_for_test(); 205 | 206 | // Don't encode / at here. It would be processed by serde. - 2022-11-27 207 | let r = generate_logseq_uri("Games/EU4", true, &server_info); 208 | assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2FEU4"); 209 | 210 | let r = generate_logseq_uri("Games/赛马娘", true, &server_info); 211 | assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2F%E8%B5%9B%E9%A9%AC%E5%A8%98"); 212 | let r = generate_logseq_journal_uri("2022_12_14", &server_info); 213 | assert_eq!(&r,"logseq://graph/logseq_notebook?page=Dec+14th%2C+2022"); 214 | 215 | let r = generate_logseq_uri("fireSeqSearch___test___5", true, &server_info); 216 | assert_eq!(&r,"logseq://graph/logseq_notebook?page=fireSeqSearch%2Ftest%2F5"); 217 | 218 | let r = generate_logseq_uri("C++", true, &server_info); 219 | assert_eq!(&r, "logseq://graph/logseq_notebook?page=C%2B%2B"); 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/post_query/mod.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | use crate::query_engine::ServerInformation; 3 | use crate::language_tools::tokenizer::tokenize; 4 | 5 | pub mod logseq_uri; 6 | pub mod highlighter; 7 | pub mod hit_parsed; 8 | pub mod app_uri; 9 | pub mod obsidian_uri; 10 | 11 | use rayon::prelude::*; 12 | use crate::post_query::hit_parsed::FireSeqSearchHitParsed; 13 | 14 | pub fn post_query_wrapper(top_docs: Vec<(f32, tantivy::DocAddress)>, 15 | term: &str, 16 | searcher: &tantivy::Searcher, 17 | server_info: &ServerInformation) -> Vec<String> { 18 | let term_tokens = tokenize(term); 19 | info!("Got {} term token(s): {:?}", term_tokens.len(), &term_tokens); 20 | let result: Vec<String> = top_docs.par_iter() 21 | .map(|x| parse_and_serde(x, searcher, &term_tokens, server_info)) 22 | .collect(); 23 | result 24 | } 25 | 26 | fn parse_and_serde(x: &(f32, tantivy::DocAddress), 27 | searcher: &tantivy::Searcher, 28 | term_tokens: &Vec<String>, 29 | server_info: &ServerInformation) -> String { 30 | // FireSeqSearchHitParsed 31 | let doc: tantivy::TantivyDocument = searcher.doc(x.1).unwrap(); 32 | let score = x.0; 33 | let hit_parsed = FireSeqSearchHitParsed::from_tantivy( 34 | &doc, score, term_tokens, server_info 35 | ); // it also provides the highlight 36 | hit_parsed.serde_to_string() 37 | } 38 |
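`post_query_wrapper` can use rayon's `par_iter` because each hit is parsed, highlighted, and serialized independently; there is no shared mutable state across hits. A self-contained sketch of that fan-out, with a made-up `Hit` type standing in for FireSeqSearchHitParsed:

```rust
use rayon::prelude::*;

#[derive(serde::Serialize)]
struct Hit { title: String, score: f32 }

fn main() {
    let hits = vec![
        Hit { title: "Rust".to_owned(), score: 1.5 },
        Hit { title: "Softmax".to_owned(), score: 0.7 },
    ];
    // Each element is serialized on its own, so the map may run in parallel.
    let json: Vec<String> = hits.par_iter()
        .map(|h| serde_json::to_string(h).unwrap())
        .collect();
    assert_eq!(json.len(), 2);
}
```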
urlencoding::decode(title).expect("UTF-8").to_string(); 25 | let mut uri = Url::parse("obsidian://open").unwrap(); 26 | // uri.set_path(&server_info.notebook_name); 27 | uri.query_pairs_mut() 28 | .append_pair("vault", &server_info.notebook_name); 29 | uri.query_pairs_mut() 30 | .append_pair("file", &title); 31 | let result = uri.to_string(); 32 | //TODO too hacky here 33 | result.replace("+", "%20") 34 | 35 | } 36 | 37 | 38 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/query_engine/mod.rs: -------------------------------------------------------------------------------- 1 | // Everything about Tantivy should be hidden behind this component 2 | 3 | use log::{debug, info, error}; 4 | use crate::decode_cjk_str; 5 | use crate::post_query::post_query_wrapper; 6 | use std::sync::Arc; 7 | 8 | 9 | 10 | use std::borrow::Cow; 11 | 12 | #[derive(Debug, Clone, serde::Serialize,PartialEq)] 13 | pub enum NotebookSoftware { 14 | Logseq, 15 | Obsidian, 16 | } 17 | 18 | // This struct should be immutable when the program starts running 19 | #[derive(Debug, Clone, serde::Serialize)] 20 | pub struct ServerInformation { 21 | pub notebook_path: String, 22 | pub notebook_name: String, 23 | pub enable_journal_query: bool, 24 | pub show_top_hits: usize, 25 | pub show_summary_single_line_chars_limit: usize, 26 | pub parse_pdf_links: bool, 27 | pub exclude_zotero_items:bool, 28 | pub software: NotebookSoftware, 29 | 30 | /// Experimental. Not sure if I should use this global config - 2022-12-30 31 | pub convert_underline_hierarchy: bool, 32 | 33 | pub host: String, 34 | 35 | pub llm_enabled: bool, 36 | pub llm_max_waiting_time: u64, /* in secs */ 37 | } 38 | 39 | use crate::language_tools::tokenizer::FireSeqTokenizer; 40 | struct DocumentSetting { 41 | schema: tantivy::schema::Schema, 42 | tokenizer: FireSeqTokenizer, 43 | } 44 | 45 | use crate::local_llm::LlmEngine; 46 | pub struct QueryEngine { 47 | pub server_info: ServerInformation, 48 | reader: tantivy::IndexReader, 49 | query_parser: tantivy::query::QueryParser, 50 | //articles: Vec
, //TODO remove it. only word cloud needs it 51 | pub llm: Option>, 52 | } 53 | 54 | use tantivy::IndexWriter; 55 | use tantivy::TantivyDocument; 56 | 57 | use crate::load_notes::NoteListItem; 58 | use futures::stream::FuturesUnordered; 59 | use futures::StreamExt; 60 | 61 | use tantivy::doc; 62 | 63 | impl QueryEngine { 64 | pub async fn construct(server_info: ServerInformation) -> Self { 65 | 66 | let document_setting: DocumentSetting = build_document_setting(); 67 | let note_list = crate::load_notes::retrive_note_list(&server_info); 68 | let index: tantivy::Index = QueryEngine::build_index(&server_info, 69 | &document_setting, 70 | note_list).await; 71 | let (reader, query_parser) = build_reader_parser(&index, &document_setting); 72 | 73 | debug!("Query engine construction finished"); 74 | 75 | QueryEngine { 76 | server_info, 77 | reader, 78 | query_parser, 79 | // articles: Vec::new(), 80 | // articles: loaded_articles, 81 | llm: None, 82 | } 83 | } 84 | 85 | async fn load_single_note( 86 | server_info: &ServerInformation, 87 | document_setting: &DocumentSetting, 88 | note: NoteListItem, 89 | index_writer: &IndexWriter) { 90 | 91 | let raw_content = match std::fs::read_to_string(¬e.realpath) { 92 | Ok(s) => s, 93 | Err(e) => { 94 | error!("Failed to read {:?} err({:?}, skipping", ¬e, &e); 95 | return; 96 | } 97 | }; 98 | 99 | let content = crate::markdown_parser::parse_logseq_notebook( 100 | Cow::from(raw_content), ¬e.title, server_info); 101 | 102 | let schema = &document_setting.schema; 103 | let title = schema.get_field("title").unwrap(); 104 | let body = schema.get_field("body").unwrap(); 105 | index_writer.add_document( 106 | tantivy::doc!{ 107 | title => note.title, 108 | body => content, 109 | } 110 | ).unwrap(); 111 | } 112 | 113 | async fn load_all_notes(server_info: &ServerInformation, 114 | document_setting: &DocumentSetting, 115 | note_list: Vec, 116 | index_writer: &IndexWriter) { 117 | 118 | let mut futs: FuturesUnordered<_> = FuturesUnordered::new(); 119 | for article in note_list { 120 | futs.push( 121 | QueryEngine::load_single_note( 122 | server_info, 123 | document_setting, 124 | article, 125 | index_writer) 126 | ); 127 | } 128 | while let Some(_result) = futs.next().await {} 129 | } 130 | async fn build_index(server_info: &ServerInformation, 131 | document_setting: &DocumentSetting, 132 | note_list: Vec) -> tantivy::Index { 133 | 134 | let schema = &document_setting.schema; 135 | let index = tantivy::Index::create_in_ram(schema.clone()); 136 | 137 | index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone()); 138 | let mut index_writer = index.writer(50_000_000).unwrap(); 139 | 140 | QueryEngine::load_all_notes(&server_info, 141 | &document_setting, 142 | note_list, 143 | &index_writer).await; 144 | 145 | index_writer.commit().unwrap(); 146 | index 147 | } 148 | } 149 | 150 | #[derive(Debug)] 151 | pub struct DocData { 152 | pub title: String, 153 | pub body: String, 154 | } 155 | use tantivy::schema::OwnedValue; 156 | impl DocData { 157 | fn take_str_from_doc(doc: &tantivy::TantivyDocument, pos:usize) -> &str { 158 | /* 159 | let title: &str = doc.field_values()[0].value().as_text().unwrap(); 160 | let body: &str = doc.field_values()[1].value().as_text().unwrap(); 161 | */ 162 | let v: &OwnedValue = doc.field_values()[pos].value(); 163 | match v{ 164 | OwnedValue::Str(s) => s, 165 | _ => panic!("Wrong type") 166 | } 167 | } 168 | pub fn retrive(searcher: &tantivy::Searcher, docid: tantivy::DocAddress) -> Self { 169 | let doc: 
150 | #[derive(Debug)] 151 | pub struct DocData { 152 | pub title: String, 153 | pub body: String, 154 | } 155 | use tantivy::schema::OwnedValue; 156 | impl DocData { 157 | fn take_str_from_doc(doc: &tantivy::TantivyDocument, pos: usize) -> &str { 158 | /* 159 | let title: &str = doc.field_values()[0].value().as_text().unwrap(); 160 | let body: &str = doc.field_values()[1].value().as_text().unwrap(); 161 | */ 162 | let v: &OwnedValue = doc.field_values()[pos].value(); 163 | match v { 164 | OwnedValue::Str(s) => s, 165 | _ => panic!("Wrong type") 166 | } 167 | } 168 | pub fn retrive(searcher: &tantivy::Searcher, docid: tantivy::DocAddress) -> Self { 169 | let doc: tantivy::TantivyDocument = searcher.doc(docid).unwrap(); 170 | let title = Self::take_str_from_doc(&doc, 0).to_owned(); 171 | let body = Self::take_str_from_doc(&doc, 1).to_owned(); 172 | Self { 173 | title, body 174 | } 175 | } 176 | } 177 | 178 | impl QueryEngine { 179 | pub fn generate_wordcloud(&self) -> String { 180 | String::from("TODO: wordcloud is turned off") 181 | //crate::word_frequency::generate_wordcloud(&self.articles) 182 | } 183 | 184 | pub async fn query_pipeline(&self, term: String) -> String { 185 | let term: String = term_preprocess(term); 186 | info!("Searching {}", &term); 187 | 188 | 189 | let server_info: &ServerInformation = &self.server_info; 190 | 191 | let top_docs: Vec<(f32, tantivy::DocAddress)> = self.get_top_docs(&term); 192 | let searcher: tantivy::Searcher = self.reader.searcher(); 193 | 194 | if cfg!(feature="llm") { 195 | for (_f, docid) in &top_docs { 196 | let doc = DocData::retrive(&searcher, *docid); 197 | let llm = self.llm.as_ref().unwrap(); 198 | llm.post_summarize_job(doc).await; 199 | } 200 | } 201 | 202 | 203 | let result: Vec<String> = post_query_wrapper(top_docs, &term, &searcher, server_info); 204 | 205 | 206 | let json = serde_json::to_string(&result).unwrap(); 207 | 208 | json 209 | } 210 | 211 | fn get_top_docs(&self, term: &str) -> Vec<(f32, tantivy::DocAddress)> { 212 | let searcher = self.reader.searcher(); 213 | let server_info: &ServerInformation = &self.server_info; 214 | let query: Box<dyn tantivy::query::Query> = self.query_parser.parse_query(term).unwrap(); 215 | let top_docs: Vec<(f32, tantivy::DocAddress)> = 216 | searcher.search(&query, 217 | &tantivy::collector::TopDocs::with_limit(server_info.show_top_hits)) 218 | .unwrap(); 219 | 220 | top_docs 221 | } 222 | } 223 |
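The TODO inside `wait_for_summarize` below asks for a guard so the poll loop cannot spin forever; wrapping the loop in `tokio::time::timeout` is the usual shape for that. A self-contained sketch under that assumption, with `quick_fetch_stub` standing in for the real `quick_fetch` and a 2-second deadline chosen only for the demo:

```rust
use std::time::Duration;

async fn quick_fetch_stub() -> Option<String> { None } // never completes here

async fn wait_with_deadline() -> String {
    let poll = async {
        loop {
            if let Some(s) = quick_fetch_stub().await { return s; }
            tokio::time::sleep(Duration::from_millis(50)).await;
        }
    };
    // timeout resolves to Err(Elapsed) if the poll loop outlives the deadline.
    match tokio::time::timeout(Duration::from_secs(2), poll).await {
        Ok(summary) => summary,
        Err(_elapsed) => "summary not ready, giving up".to_owned(),
    }
}

#[tokio::main]
async fn main() {
    println!("{}", wait_with_deadline().await);
}
```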
224 | impl QueryEngine { 225 | async fn wait_for_summarize(&self, title: String) -> String { 226 | let llm = self.llm.as_ref().unwrap(); 227 | let wait_llm = tokio::time::Duration::from_millis(50); 228 | // TODO maybe add a guard to make sure we don't wait too long 229 | loop { 230 | let result = llm.quick_fetch(&title).await; 231 | match result { 232 | Some(s) => { return s; }, 233 | None => { } 234 | }; 235 | tokio::time::sleep(wait_llm).await; 236 | } 237 | } 238 | pub async fn summarize(&self, title: String) -> String { 239 | info!("Called summarize on {}", &title); 240 | if cfg!(feature="llm") { 241 | self.wait_for_summarize(title).await 242 | } else { 243 | "LLM turned off".to_owned() 244 | } 245 | } 246 | pub async fn get_llm_done_list(&self) -> String { 247 | if cfg!(feature="llm") { 248 | let llm = self.llm.as_ref().unwrap(); 249 | let result = &llm.get_llm_done_list().await; 250 | let json = serde_json::to_string(&result).unwrap(); 251 | return json; 252 | } else { 253 | "LLM turned off".to_owned() 254 | } 255 | } 256 | } 257 | 258 | fn term_preprocess(term: String) -> String { 259 | // in the future, I would use tokenize_sentence_to_text_vec here 260 | let term = term.replace("%20", " "); 261 | let term_vec = decode_cjk_str(term); 262 | term_vec.join(" ") 263 | } 264 | 265 | 266 | fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSetting) 267 | -> (tantivy::IndexReader, tantivy::query::QueryParser) { 268 | let reader = index 269 | .reader_builder() 270 | .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay) // TODO switch to manual 271 | .try_into().unwrap(); 272 | let title = document_setting.schema.get_field("title").unwrap(); 273 | let body = document_setting.schema.get_field("body").unwrap(); 274 | let query_parser = tantivy::query::QueryParser::for_index(index, vec![title, body]); 275 | (reader, query_parser) 276 | } 277 | 278 | fn build_document_setting() -> DocumentSetting { 279 | let (schema, tokenizer) = build_schema_tokenizer(); 280 | DocumentSetting { 281 | schema, tokenizer 282 | } 283 | } 284 | 285 | use crate::language_tools::tokenizer::TOKENIZER_ID; 286 | fn build_schema_tokenizer() -> (tantivy::schema::Schema, 287 | FireSeqTokenizer 288 | // Box 289 | ) { 290 | let mut schema_builder = tantivy::schema::SchemaBuilder::default(); 291 | let text_indexing = tantivy::schema::TextFieldIndexing::default() 292 | .set_tokenizer(TOKENIZER_ID) // Set custom tokenizer 293 | .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions); 294 | let text_options = tantivy::schema::TextOptions::default() 295 | .set_indexing_options(text_indexing) 296 | .set_stored(); 297 | let tokenizer = FireSeqTokenizer {}; 298 | 299 | let _title = schema_builder.add_text_field("title", text_options.clone()); 300 | let _body = schema_builder.add_text_field("body", text_options); 301 | 302 | let schema = schema_builder.build(); 303 | (schema, 304 | tokenizer 305 | // Box::new(tokenizer) 306 | ) 307 | } 308 | 309 | -------------------------------------------------------------------------------- /fire_seq_search_server/src/word_frequency/mod.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | use crate::Article; 3 | use std::collections::{HashMap, HashSet}; 4 | 5 | 6 | use rayon::prelude::*; 7 | 8 | // let x: Vec<Vec<i32>> = vec![vec![1, 2], vec![3, 4]]; 9 | // let y: Vec<_> = x.into_par_iter().flatten().collect(); 10 | pub fn generate_wordcloud(articles: &Vec<Article>) -> String { 11 | info!("Generating wordlist"); 12 | 13 | let tokens: Vec<String> = articles.par_iter().map(article_to_tokens) 14 | .flatten().collect(); 15 | // for art in articles { 16 | // let tokens = article_to_tokens(art); 17 | // } 18 | info!("After flattening, we got {} tokens", tokens.len()); 19 | 20 | // naive group-by 21 | let mut freq: HashMap<String, i64> = HashMap::new(); 22 | for t in tokens { 23 | match freq.get(&t) { 24 | Some(count) => { freq.insert(t, count + 1); } 25 | None => { freq.insert(t, 1_i64); } 26 | } 27 | } 28 | 29 | 30 | 31 | let mut sorted_pairs: Vec<(String,i64)> = freq.into_iter().collect(); 32 | sorted_pairs.sort_by(|a, b| b.1.cmp(&a.1)); 33 | sorted_pairs.truncate(200); 34 | // sorted_pairs 35 | 36 | 37 | let serialized_data = serde_json::to_string(&sorted_pairs).unwrap(); 38 | serialized_data 39 | } 40 | 41 | fn article_to_tokens(art: &Article) -> Vec<String> { 42 | let tokens = crate::language_tools::tokenizer::tokenize(&art.content); 43 | 44 | //TODO use another stop word list for wordcloud 45 | lazy_static! { 46 | static ref STOPWORDS_LIST: HashSet<String> = crate::language_tools::generate_stopwords_list(); 47 | } 48 | let tokens = crate::language_tools::tokenizer::filter_out_stopwords(&tokens, &STOPWORDS_LIST); 49 | let tokens: Vec<&str> = tokens.into_iter().filter(|x| is_valid_for_wordcloud(x)).collect(); 50 | info!("Got tokens {:?}", &tokens); 51 | let tokens: Vec<String> = tokens.into_iter().map(|x| x.to_string()).collect(); 52 | tokens 53 | } 54 | 55 | 56 | fn is_valid_for_wordcloud(s: &str) -> bool { 57 | if is_symbol(s) { 58 | return false; 59 | } 60 | let invalid_end_pattern = vec!["::", "]]", "}}"]; 61 | let invalid_start_pattern = vec!["[[", "{{", "{\\"]; 62 | 63 | for ep in invalid_end_pattern { 64 | if s.ends_with(ep) { 65 | return false; 66 | } 67 | } 68 | for sp in invalid_start_pattern { 69 | if s.starts_with(sp) { 70 | return false; 71 | } 72 | } 73 | let logseq_exclude_list = vec!["DONE", "true", "SCHEDULED:", "collapsed", "file", "com", 74 | "CLOCK:", ":LOGBOOK:", ":END:"]; 75 | for stop in logseq_exclude_list { 76 | if s == stop { 77 | return false; 78 | } 79 | } 80 | 81 | true 82 | } 83 | fn is_symbol(s: &str) -> bool { 84 | if s.is_empty() { return true; } 85 | if s.len() > 3 { return false; } 86 | let mut flag = true; 87 | for c in s.chars() { 88 | if c.is_alphanumeric() { 89 | flag = false; 90 | } 91 | } 92 | flag 93 | } -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/assets/screenshot_demo_640_400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Endle/fireSeqSearch/143f6168d44148c2fa40ec2e8a512e2a2c44c638/fire_seq_search_server/tests/resource/assets/screenshot_demo_640_400.png -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/journals/2022_02_26.md: -------------------------------------------------------------------------------- 1 | - DONE [[Benchmark]] 2 | :LOGBOOK: 3 | CLOCK: [2022-02-26 Sat 11:18:52]--[2022-02-28 Mon 11:11:44] => 47:52:52 4 | :END: 5 | - DONE Try [[Debug]] 6 | :LOGBOOK: 7 | CLOCK: [2022-02-26 Sat 19:51:54]--[2022-02-27 Sun 15:34:21] => 19:42:27 8 | :END: 9 | -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/journals/2022_08_30.md: -------------------------------------------------------------------------------- 1 | - [[LATIN FOR BEGINNERS]] 2 | - [[International Language,
Past, Present & Future by Walter John Clark]] 3 | - [[孙子兵法]] -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/logseq/pages-metadata.edn: -------------------------------------------------------------------------------- 1 | [{:block/name "a", 2 | :block/created-at 1661896874219, 3 | :block/updated-at 1661896874219} 4 | {:block/name "aug 30th, 2022", 5 | :block/created-at 1661896924808, 6 | :block/updated-at 1661897331497} 7 | {:block/name "b", 8 | :block/created-at 1661896874219, 9 | :block/updated-at 1661896874219} 10 | {:block/name "benchmark", 11 | :block/created-at 1661896924957, 12 | :block/updated-at 1661896924957} 13 | {:block/name "c", 14 | :block/created-at 1661896874219, 15 | :block/updated-at 1661896874219} 16 | {:block/name "canceled", 17 | :block/created-at 1661896874219, 18 | :block/updated-at 1661896874219} 19 | {:block/name "cancelled", 20 | :block/created-at 1661896874219, 21 | :block/updated-at 1661896874219} 22 | {:block/name "card", 23 | :block/created-at 1661896874219, 24 | :block/updated-at 1661896874219} 25 | {:block/name "contents", 26 | :block/created-at 1661896874219, 27 | :block/updated-at 1661896874219} 28 | {:block/name "debug", 29 | :block/created-at 1661896924959, 30 | :block/updated-at 1661896924959} 31 | {:block/name "doing", 32 | :block/created-at 1661896874219, 33 | :block/updated-at 1661896874219} 34 | {:block/name "done", 35 | :block/created-at 1661896874219, 36 | :block/updated-at 1661896874219} 37 | {:block/name "favorites", 38 | :block/created-at 1661896874219, 39 | :block/updated-at 1661896874219} 40 | {:block/name "feb 26th, 2022", 41 | :block/created-at 1661896924960, 42 | :block/updated-at 1661896924960} 43 | {:block/name "in-progress", 44 | :block/created-at 1661896874219, 45 | :block/updated-at 1661896874219} 46 | {:block/name 47 | "international language, past, present & future by walter john clark", 48 | :block/created-at 1661897225667, 49 | :block/updated-at 1661897239720} 50 | {:block/name "later", 51 | :block/created-at 1661896874219, 52 | :block/updated-at 1661896874219} 53 | {:block/name "latin for beginners", 54 | :block/created-at 1661897128500, 55 | :block/updated-at 1661897151913} 56 | {:block/name "now", 57 | :block/created-at 1661896874219, 58 | :block/updated-at 1661896874219} 59 | {:block/name "rust", 60 | :block/created-at 1661896924926, 61 | :block/updated-at 1661896924926} 62 | {:block/name "rust/closure", 63 | :block/created-at 1661896924924, 64 | :block/updated-at 1661896924924} 65 | {:block/name "rust/static variable", 66 | :block/created-at 1661896924922, 67 | :block/updated-at 1661896924922} 68 | {:block/name "rust/trait", 69 | :block/created-at 1661896924925, 70 | :block/updated-at 1661896924925} 71 | {:block/name "slipbox", 72 | :block/created-at 1661896924927, 73 | :block/updated-at 1661896924927} 74 | {:block/name "todo", 75 | :block/created-at 1661896874219, 76 | :block/updated-at 1661896874219} 77 | {:block/name "wait", 78 | :block/created-at 1661896874219, 79 | :block/updated-at 1661896874219} 80 | {:block/name "waiting", 81 | :block/created-at 1661896874219, 82 | :block/updated-at 1661896874219} 83 | {:block/name "孙子兵法", 84 | :block/created-at 1661897331497, 85 | :block/updated-at 1661897525235}] 86 | -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/International Language, Past, Present & Future by Walter John Clark.md: 
-------------------------------------------------------------------------------- 1 | - As an ounce of personal experience is worth a pound of second-hand recital, a brief statement may here be given of the way in which the present writer came to take up Esperanto, and of the experiences which soon led him to the conviction of its absolute practicability and utility. 2 | - In October, 1905, having just returned from an absence of some years in Canada and the Far East, he had his attention turned to Esperanto for the first time by reading an account of the Congress of Boulogne. He had no previous knowledge of, or leanings towards, a universal language; and if he had thought about it at all, it was only to laugh at the idea as a wild and visionary scheme. In short, his attitude was quite normal. 3 | - But here was a definite statement, professing to be one of positive accomplished fact. One of two things: either the newspaper account was not true; or else, the facts being as represented, here was a new possibility to be reckoned with. The only course was to send for the books and test the thing on its merits. Being somewhat used to languages, he did not take long to see that this one was good enough in itself. A letter, written in 13 Esperanto, after a few days' study of the grammar at odd times, with a halfpenny Esperanto-English key enclosed, was fully understood by the addressee, though he was ignorant up till then of the very existence of Esperanto. This experience has often been since repeated; indeed, the correspondent will often write back after a few days in Esperanto. Such letters have always been found intelligible, though in no case did the correspondent know Esperanto previously. The experiment is instructive and amusing, and can be tried by any one for an expenditure of twopence for keys and a few hours for studying the sixteen rules and their application. To many minds these are far simpler and more easy to grasp for practical use than the rules for scoring at bridge. 4 | - After a month or two's playing with the language in spare time, the writer further tested it, by sending out a flight of postcards to various selected Esperantists' addresses in different parts of the Russian Empire. The addressees ranged from St. Petersburg and Helsingfors through Poland to the Caucasus and to far Siberia. In nearly every case answers were received, and in some instances the initial interchange of postcards led to an extremely interesting correspondence, throwing much light on the disturbed state of things in the native town or province of the correspondent. From a Tiflis doctor came a graphic account of the state of affairs in the Caucasus; while a school inspector from the depths of Eastern Siberia painted a vivid picture of the effect of political unrest on the schools—lockouts and "malodorous chemical obstructions" (Anglice—the schools were stunk out). Many writers expressed themselves with great freedom, but feared their letters would not pass the censor. Judging by the proportion of answers received, the censorship was not at that time efficient. In no case was there any difficulty in grasping the writer's meaning. All the answers were in Esperanto. 5 | - This was fairly convincing, but still having doubts on the question of pronunciation, the writer resolved to attend the Esperanto Congress to be held at Geneva in August 1906. To 14 this end he continued to read Esperanto at odd minutes and took in an Esperanto gazette. 
About three weeks before the congress he got a member of his family to read aloud to him every day as far as possible a page or two of Esperanto, in order to attune his ear. He never had an opportunity of speaking the language before the congress, except once for a few minutes, when he travelled some distance to attend a meeting of the nearest English group. 6 | - Thus equipped, he went through the Congress of Geneva, and found himself able to follow most of the proceedings, and to converse freely, though slowly, with people of the most diverse nationality. At an early sitting of the congress he found himself next to a Russian from Kischineff, who had been through the first great pogrom, and a most interesting conversation ensued. Another day the neighbours were an Indian nawab and an abbé from Madrid. Another time it was a Bulgarian. At the first official banquet he sat next to a Finn, who rejoiced in the name of Attila, and, but for the civilizing influence of a universal language, might have been in the sunny south, like his namesake of the ancient world, on a very different errand from his present peaceful one. Yet here he was, rubbing elbows with Italians, as if there had never been such things as Huns or a sack of Rome by northern barbarians. 7 | - During the meal a Frenchman, finding himself near us English and some Germans, proposed a toast to the "entente cordiale taking in Germany," which was honoured with great enthusiasm. This is merely an instance of the small ways in which such gatherings make for peace and good will. 8 | - With all these people it was perfectly easy to converse in the common tongue, pronunciation and national idiom being no bar in practice. 9 | - And this experience was general throughout the duration of the congress. Day by day sittings were held for the transaction of all kinds of business and the discussion of the most varied subjects. It was impressive to see people from half the countries of the 15 world rise from different corners of the hall and contribute their share to the discussion in the most matter-of-fact way. Day by day the congressists met in social functions, debates, lectures, and sectional groups (chemical, medical, legal, etc.) for the regulation of matters touching their special interests. Everything was done in Esperanto, and never was there the slightest hitch or misunderstanding, or failure to give adequate expression to opinions owing to defects of language. The language difficulty was annihilated. 10 | - Perhaps one of the most striking demonstrations of this return to pre-Babel conditions was the performance of a three-part comedy by a Frenchman, a Russian, and a Spaniard. Such a thing would inevitably have been grotesque in any national language; but here they met on common neutral ground. No one's accent was "foreign," and none of the spectators possessed that mother-tongue acquaintance with Esperanto that would lead them to feel slight divergences shocking, or even noticeable without extreme attention to the point. Other theatrical performances were given at Geneva, as also at Boulogne, where a play of Molière was performed in Esperanto by actors of eight nationalities with one rehearsal, and with full success. 11 | - In the face of these facts it is idle to oppose a universal artificial language on the score of impossibility or inadequacy. The theoretical pronunciation difficulty completely crumbled away before the test of practice. 
12 | - The "war-at-any-price party," the whole-hoggers à tous crins (the juxtaposition of the two national idioms lends a certain realism, and heightens the effect of each), are therefore driven back on their second line of attack, if the Hibernianism may be excused. "Yes," they say, "your language may be possible, but, after all, why not learn an existing language, if you've got to learn one anyway?" 13 | - Now, quite apart from the obvious fact that the nations will never agree to give the preference to the language of one of them to the prejudice of the others, this argument involves the 16 suggestion that an artificial language is no easier to learn than a natural one. We thus come to the question of ease as a qualification. 14 | - 15 | - 16 | - https://www.gutenberg.org/files/16737/16737-h/16737-h.htm 17 | - -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/Rust.md: -------------------------------------------------------------------------------- 1 | - Stub now for refs 2 | - 3 | - [[Rust/Static variable]] 4 | - [[Rust/Closure]] 5 | - [[Rust/trait]] 6 | - 7 | - 8 | - [[SlipBox]] 9 | - What's the cost of index bound check? 10 | - https://users.rust-lang.org/t/is-bound-checking-the-only-runtime-cost-of-rust/66661/3 11 | - -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/Softmax.md: -------------------------------------------------------------------------------- 1 | - also known as softargmax or normalized exponential function 2 | 3 | - Calculate $f(\vec{z})$ 4 | - 5 | $$b=\sum_{k=1}^{K} exp(z_k)$$ 6 | - 7 | $$ f(\vec{z})_i = \dfrac{exp(z_i)}{b}$$ 8 | 9 | -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/advanced_query.md: -------------------------------------------------------------------------------- 1 | - #+BEGIN_QUERY 2 | {:title "advance exempli gratia" 3 | :query [ 4 | :find (pull ?b [*]) 5 | :where 6 | [?b :block/page ?p] 7 | [?p :page/name ?pn] 8 | [?b :block/marker ?marker] 9 | [(contains? #{"NOW" "DOING" "TODO"} ?marker)] 10 | ] 11 | } 12 | #+END_QUERY 13 | - 14 | - 15 | - In this test page we have some queries. We want to exclude the query statement from results 16 | - 17 | - #+BEGIN_QUERY 18 | {:title "advance exempli gratia" 19 | :query [ 20 | :find (pull ?b [*]) 21 | :where 22 | [?b :block/page ?p] 23 | [?p :page/name ?pn] 24 | [?b :block/marker ?marker] 25 | [(contains? #{"NOW" "DOING" "TODO"} ?marker)] 26 | ] 27 | } 28 | #+END_QUERY 29 | - -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/blog_thunderbird_zh.md: -------------------------------------------------------------------------------- 1 | - [Aug 3, 2021 - 使用 git shallow clone 下载并编译 Thunderbird](https://endle.github.io/2021/08/03/git-shallow-clone-build-thunderbird/) 2 | 3 | 4 | 5 | 6 | 7 | 最近在尝试编译 Thunderbird. [官方的手册](https://developer.thunderbird.net/thunderbird-development/getting-started) 的建议是 8 | 9 | ``` 10 | hg clone https://hg.mozilla.org/mozilla-central source/ 11 | cd source/ 12 | hg clone https://hg.mozilla.org/comm-central comm/ 13 | ``` 14 | 15 | 因为我网络情况不好,硬盘空间也有些捉襟见肘,就只想下载最新的版本。可是,[Mercurial HG 并不支持](https://stackoverflow.com/a/4205246/1166518). 16 | 17 | Mozilla 已经在 GitHub 上有了实验性的 Mirror. 
因此,我使用如下的方式下载 Thunderbird 的代码。 18 | 19 | ``` 20 | # My personal habit 21 | cd ~/src/mozilla 22 | git clone --depth=1 https://github.com/mozilla/gecko-dev.git mozilla-central 23 | git clone --depth=1 https://github.com/mozilla/releases-comm-central comm-central 24 | cp -R --reflink=auto comm-central/ mozilla-central/comm 25 | ``` 26 | 27 | 我会使用如下代码进行更新。 28 | 29 | ``` 30 | cd mozilla-central && git pull origin master && trash comm && cd .. 31 | cd comm-central && git pull origin master && cd .. 32 | cp -R --reflink=auto comm-central/ mozilla-central/comm 33 | cd mozilla-central 34 | ``` 35 | - 36 | - 37 | - 38 | - Source: https://endle.github.io/2021/08/03/git-shallow-clone-build-thunderbird/ 39 | - CC-BY 4.0 Zhenbo Li -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/cyrillic.md: -------------------------------------------------------------------------------- 1 | - Это статья для тестов поиска в кириллических символах. -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/feditips.md: -------------------------------------------------------------------------------- 1 | - Below information is copied from https://fedi.tips 2 | - Copyrights The text of the articles are available to use on your own libre projects under [Creative Commons Attribution ShareAlike 4.0 ⧉](https://creativecommons.org/licenses/by-sa/4.0/), you can attribute by crediting and linking back to [https://fedi.tips/](https://fedi.tips/) 3 | - 4 | - See https://fedi.tips/about-this-site/ 5 | - 6 | - On Mastodon, hashtag following has been merged into the latest version of the software and will be introduced in the next update across all servers. It is already being tested on some servers such as mastodon.social and mastodon.online. If you’re on a server that runs it, you can follow hashtags by logging in through the website, searching for a hashtag and then clicking the follow button in the top right corner of the results. Posts that are visible to your server which include that hashtag will then appear in your normal timeline, even if you’re not following the account that posted it. You can unfollow by searching for the same hashtag and clicking the unfollow button (which is in the same place the follow button was). On Friendica, hashtag following has been available as standard for years now, and works in a very similar way: search for a hashtag and click the + logo in the top right corner to follow it. Posts with that tag will appear in your normal timeline. Whatever platform you’re following from, it’s a really handy way of discovering interesting posts and new accounts to follow. Note that this only shows posts in your timeline made after the follow began, so there may be a delay in seeing such posts appear in your timeline, depending on whether a new post with that tag has been published. Also, it only shows posts that are visible to your server anyway, it is not pulling posts in purely on the basis of the hashtag. Because no one owns the Fediverse, there is no central authority to give out “verified” badges the way Twitter etc do. If you do see any Twitter-style verified badges these are just custom emoji and don’t mean anything, it’s just people messing around. There are various websites trying to set themselves up as central authorities, but we strongly recommend avoiding these completely. 
[The entire point of being on the Fediverse is to prevent any central authorities taking over](https://fedi.tips/mastodon-and-the-fediverse-beginners-start-here/#whoownsthefediverse). However, there are still ways to actually verify your identity on the Fedi: If you’re already verified on Twitter etc, tell people about your Fediverse account and link to it, then link to this post on your Fediverse account. This will let people on the Fediverse know that you’re the same person who owns the verified account on Twitter etc. If you have an official website, link to your Fediverse account from your website and link to your website from your Fediverse account. If people already trust your website to be official, then by extension they will know your Fediverse account is official. On Mastodon, you can take the website method a step further. Log in through the website, go to *Edit profile > Verification*, copy and paste the HTML code into your website’s front page’s code. Add your website’s address into your profile’s Metadata section, remembering to include https:// at the beginning. After you’ve done all this, press the *Save changes* button in your profile settings. You will then see a link to your website on your Mastodon profile which has turned green with a green tick next to it, to verify you are the site’s owner. If you need to verify lots of accounts from a group or organisation, you might want to make your own Fediverse server as a subdomain of your official website. This is what the European Union did when they made [their own Mastodon server ⧉](https://social.network.europa.eu/) and [their own PeerTube server ⧉](https://tube.network.europa.eu/). Because the European Union’s official website is at europa.eu, and their servers were all subdomains of europa.eu, it meant all the accounts on their servers could be trusted as being official EU Fediverse accounts. Making your own server on a subdomain [is much easier and cheaper than you think](https://growyourown.services/grow-your-own-social-network/). And whatever you do, don’t use the “verified” emoji. This means nothing at all on the Fediverse, anyone can add it to their profiles. **NOTE:** If you’re verifying your Mastodon account using the code-pasting method, make sure that all the links to your Mastodon account on your website include rel=”me” in their link code. If there’s one without rel=”me”, for example in a dropdown menu, the verification process may fail. Also, bear in mind there may be some delay before your website address turns green on your profile, don’t worry if it doesn’t happen straight away. Using multiple accounts First of all, it’s worth saying again that [most people do not need to use multiple accounts](https://fedi.tips/mastodon-and-the-fediverse-beginners-start-here/#doineedmultipleaccounts). The Fediverse is designed in such a way that people on different servers can interact seamlessly, as if they were all on one network. However, some people may need separate personal and work accounts, or an extra account that focuses on a specialist topic which they wish to keep separate from their main account. Whatever your reasons, it’s very easy to use multiple accounts on Mastodon and the Fediverse: all you have to do is sign up on a different server for each account you want. Because the servers are independent, you can use the same email address for each account, and you can be signed into all the accounts simultaneously on the web or on an app. 
Signing up for accounts on different servers also means that if one server goes down you can use your alternative account on another server. If you use the Fediverse through the web, you can log into all the accounts at once and switch views by keeping each account open in a separate tab. Official and third party apps support multiple accounts too. You can be signed into all your accounts at once, and switch between them within the app. The interface for switching differs from app to app. On the official Mastodon app, you can add accounts and switch between them by holding down your profile image in the bottom right corner. A menu will appear which lets you add or switch accounts. 7 | -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/fireSeqSearch___test___5.md: -------------------------------------------------------------------------------- 1 | - Hello -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/咖啡.md: -------------------------------------------------------------------------------- 1 | - 我试着解读一下,你这个咖啡拉花是对二十世纪的科学大发现的年代的挽歌。来自植物(咖啡)和来自动物(牛)的产物 2 | - -------------------------------------------------------------------------------- /fire_seq_search_server/tests/resource/pages/孙子兵法.md: -------------------------------------------------------------------------------- 1 | - https://www.gutenberg.org/cache/epub/23864/pg23864.html 2 | - 3 | - 4 | - 始計第一 5 | - 孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。 6 | - 故經之以五事,校之以計,而索其情:一曰道,二曰天,三曰地,四曰將,五曰法。 7 | - 道者,令民與上同意,可與之死,可與之生,而不畏危也;天者,陰陽、寒暑、時制也;地者,遠近、險易、廣狹、死生也;將者,智、信、仁、勇、嚴也;法者,曲制、官道、主用也。凡此五者,將莫不聞,知之者勝,不知者不勝。 8 | - 故校之以計,而索其情,曰:主孰有道?將孰有能?天地孰得?法令孰行?兵眾孰強?士卒孰練?賞罰孰明?吾以此知勝負矣。 9 | - 將聽吾計,用之必勝,留之;將不聽吾計,用之必敗,去之。 10 | - 計利以聽,乃為之勢,以佐其外。勢者,因利而制權也。 11 | - 兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之,實而備之,強而避之,怒而撓之,卑而驕之,佚而勞之,親而離之,攻其無備,出其不意。此兵家之勝,不可先傳也。 12 | - 夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也。多算勝,少算不勝,而況無算乎!吾以此觀之,勝負見矣。 13 | - 作戰第二 14 | - 孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬,千里饋糧。則內外之費,賓客之用,膠漆之材,車甲之奉,日費千金,然後十萬之師舉矣。 15 | - 其用戰也,貴勝,久則鈍兵挫銳,攻城則力屈,久暴師則國用不足。夫鈍兵挫銳,屈力殫貨,則諸侯乘其弊而起,雖有智者,不能善其後矣。故兵聞拙速,未睹巧之久也。夫兵久而國利者,未之有也。故不盡知用兵之害者,則不能盡知用兵之利也。 16 | - 善用兵者,役不再籍,糧不三載,取用於國,因糧於敵,故軍食可足也。國之貧於師者遠輸,遠輸則百姓貧;近於師者貴賣,貴賣則百姓竭,財竭則急於丘役。力屈財殫,中原內虛於家,百姓之費,十去其七;公家之費,破軍罷馬,甲胄矢弩,戟楯矛櫓,丘牛大車,十去其六。 17 | - 故智將務食於敵,食敵一鍾,當吾二十鍾;萁稈一石,當吾二十石。故殺敵者,怒也;取敵之利者,貨也。故車戰,得車十乘以上,賞其先得者,而更其旌旗。車雜而乘之,卒善而養之,是謂勝敵而益強。 18 | - 故兵貴勝,不貴久。故知兵之將,民之司命。國家安危之主也。 19 | - 謀攻第三 20 | - 孫子曰:凡用兵之法,全國為上,破國次之;全軍為上,破軍次之;全旅為上,破旅次之;全卒為上,破卒次之;全伍為上,破伍次之。是故百戰百勝,非善之善者也;不戰而屈人之兵,善之善者也。 21 | - 故上兵伐謀,其次伐交,其次伐兵,其下攻城。攻城之法,為不得已。修櫓轒轀,具器械,三月而後成;距闉,又三月而後已。將不勝其忿,而蟻附之,殺士三分之一,而城不拔者,此攻之災也。 22 | - 故善用兵者,屈人之兵,而非戰也,拔人之城而非攻也,毀人之國而非久也,必以全爭於天下,故兵不頓而利可全,此謀攻之法也。 23 | - 故用兵之法,十則圍之,五則攻之,倍則分之,敵則能戰之,少則能逃之,不若則能避之。故小敵之堅,大敵之擒也。 24 | - 夫將者,國之輔也。輔周則國必強,輔隙則國必弱。故君之所以患於軍者三:不知軍之不可以進而謂之進,不知軍之不可以退而謂之退,是謂縻軍;不知三軍之事,而同三軍之政,則軍士惑矣;不知三軍之權,而同三軍之任,則軍士疑矣。三軍既惑且疑,則諸侯之難至矣。是謂亂軍引勝。 25 | - 故知勝有五:知可以戰與不可以戰者,勝。識眾寡之用者,勝。上下同欲者,勝。以虞待不虞者,勝。將能而君不御者,勝。此五者,知勝之道也。 26 | - 故曰:知己知彼,百戰不貽;不知彼而知己,一勝一負;不知彼不知己,每戰必敗。 27 | - 軍形第四 28 | - 孫子曰:昔之善戰者,先為不可勝,以待敵之可勝。不可勝在己,可勝在敵。故善戰者,能為不可勝,不能使敵必可勝。故曰:勝可知,而不可為。 29 | - 不可勝者,守也;可勝者,攻也。守則不足,攻則有餘。善守者,藏於九地之下,善攻者,動於九天之上,故能自保而全勝也。 30 | - 見勝不過眾人之所知,非善之善者也;戰勝而天下曰善,非善之善者也。故舉秋毫不為多力,見日月不為明目,聞雷霆不為聰耳。古之善戰者,勝於易勝者也。故善戰者之勝也,無智名,無勇功,故其戰勝不忒。不忒者,其所措必勝,勝已敗者也。故善戰者,先立於不敗之地,而不失敵之敗也。是故勝兵先勝,而後求戰,敗兵先戰而後求勝。善用兵者,修道而保法,故能為勝敗之政。 31 | - 兵法:一曰度,二曰量,三曰數,四曰稱,五曰勝。地生度,度生量,量生數,數生稱,稱生勝。故勝兵若以鎰稱銖,敗兵若以銖稱鎰。勝者之戰,若決積水於千仞之谿者,形也。 32 | - 兵勢第五 33 | - 孫子曰:凡治眾如治寡,分數是也;鬥眾如鬥寡,形名是也;三軍之眾,可使必受敵而無敗者,奇正是也;兵之所加,如以碫投卵者,虛實是也。 
34 | - 凡戰者,以正合,以奇勝。故善出奇者,無窮如天地,不竭如江海。終而複始,日月是也。死而復生,四時是也。聲不過五,五聲之變,不可勝聽也;色不過五,五色之變,不可勝觀也;味不過五,五味之變,不可勝嘗也;戰勢,不過奇正,奇正之變,不可勝窮也。奇正相生,如循環之無端,熟能窮之哉? 35 | - 激水之疾,至於漂石者,勢也;鷙鳥之疾,至於毀折者,節也。是故善戰者,其勢險,其節短。勢如張弩,節如發機。 36 | - 紛紛紜紜,鬥亂而不可亂也;渾渾沌沌,形圓而不可敗也。亂生於治,怯生於勇,弱生於強。治亂,數也;勇怯,勢也;強弱,形也。故善動敵者,形之,敵必從之;予之,敵必取之。以利動之,以卒待之。 37 | - 故善戰者,求之於勢,不責於人;故能擇人而任勢。任勢者,其戰人也,如轉木石。木石之性,安則靜,危則動,方則止,圓則行。故善戰人之勢,如轉圓石於千仞之山者,勢也。 38 | - 虛實第六 39 | - 孫子曰:凡先處戰地而待敵者佚,後處戰地而趨戰者勞。 40 | - 故善戰者,致人而不致於人。能使敵人自至者,利之也;能使敵人不得至者,害之也。故敵佚能勞之,飽能饑之,安能動之。出其所必趨,趨其所不意。行千里而不勞者,行於無人之地也;攻而必取者,攻其所不守也。守而必固者,守其所不攻也。 41 | - 故善攻者,敵不知其所守;善守者,敵不知其所攻。微乎微乎,至於無形;神乎神乎,至於無聲,故能為敵之司命。進而不可禦者,沖其虛也;退而不可追者,速而不可及也。故我欲戰,敵雖高壘深溝,不得不與我戰者,攻其所必救也;我不欲戰,雖畫地而守之,敵不得與我戰者,乖其所之也。故形人而我無形,則我專而敵分。我專為一,敵分為十,是以十攻其一也。則我眾敵寡,能以眾擊寡者,則吾之所與戰者約矣。吾所與戰之地不可知,不可知則敵所備者多,敵所備者多,則吾所與戰者寡矣。故備前則後寡,備後則前寡,備左則右寡,備右則左寡,無所不備,則無所不寡。寡者,備人者也;眾者,使人備己者也。故知戰之地,知戰之日,則可千里而會戰;不知戰之地,不知戰日,則左不能救右,右不能救左,前不能救後,後不能救前,而況遠者數十裏,近者數裏乎!以吾度之,越人之兵雖多,亦奚益於勝哉!故曰:勝可為也。敵雖眾,可使無鬥。故策之而知得失之計,候之而知動靜之理,形之而知死生之地,角之而知有餘不足之處。故形兵之極,至於無形。無形則深間不能窺,智者不能謀。因形而措勝於眾,眾不能知。人皆知我所以勝之形,而莫知吾所以制勝之形。故其戰勝不復,而應形於無窮。夫兵形象水,水之行避高而趨下,兵之形避實而擊虛;水因地而制流,兵因敵而制勝。故兵無常勢,水無常形。能因敵變化而取勝者,謂之神。故五行無常勝,四時無常位,日有短長,月有死生。 42 | - 軍爭第七 43 | - 孫子曰: 44 | 凡用兵之法,將受命於君,合軍聚眾,交和而舍,莫難於軍爭。軍爭之難者,以迂為直,以患為利。故迂其途,而誘之以利,後人發,先人至,此知迂直之計者也。軍爭為利,軍爭為危。舉軍而爭利則不及,委軍而爭利則輜重捐。是故捲甲而趨,日夜不處,倍道兼行,百裡而爭利,則擒三將軍,勁者先,疲者後,其法十一而至;五十裏而爭利,則蹶上將軍,其法半至;三十裏而爭利,則三分之二至。是故軍無輜重則亡,無糧食則亡,無委積則亡。故不知諸侯之謀者,不能豫交;不知山林、險阻、沮澤之形者,不能行軍;不用鄉導者,不能得地利。故兵以詐立,以利動,以分和為變者也。故其疾如風,其徐如林,侵掠如火,不動如山,難知如陰,動如雷震。掠鄉分眾,廓地分利,懸權而動。先知迂直之計者勝,此軍爭之法也。《軍政》曰:“言不相聞,故為之金鼓;視不相見,故為之旌旗。”夫金鼓旌旗者,所以一民之耳目也。民既專一,則勇者不得獨進,怯者不得獨退,此用眾之法也。故夜戰多金鼓,晝戰多旌旗,所以變人之耳目也。三軍可奪氣,將軍可奪心。是故朝氣銳,晝氣惰,暮氣歸。善用兵者,避其銳氣,擊其惰歸,此治氣者也。以治待亂,以靜待嘩,此治心者也。以近待遠,以佚待勞,以飽待饑,此治力者也。無邀正正之旗,無擊堂堂之陳,此治變者也。故用兵之法,高陵勿向,背丘勿逆,佯北勿從,銳卒勿攻,餌兵勿食,歸師勿遏,圍師遺闕,窮寇勿迫,此用兵之法也。 45 | - 九變第八 46 | - 孫子曰: 47 | 凡用兵之法,將受命於君,合軍聚合。泛地無舍,衢地合交,絕地無留,圍地則謀,死地則戰,途有所不由,軍有所不擊,城有所不攻,地有所不爭,君命有所不受。故將通於九變之利者,知用兵矣;將不通九變之利,雖知地形,不能得地之利矣;治兵不知九變之術,雖知五利,不能得人之用矣。是故智者之慮,必雜於利害,雜於利而務可信也,雜於害而患可解也。是故屈諸侯者以害,役諸侯者以業,趨諸侯者以利。故用兵之法,無恃其不來,恃吾有以待之;無恃其不攻,恃吾有所不可攻也。故將有五危,必死可殺,必生可虜,忿速可侮,廉潔可辱,愛民可煩。凡此五者,將之過也,用兵之災也。覆軍殺將,必以五危,不可不察也。 48 | - 行軍第九 49 | - 孫子曰:凡處軍相敵,絕山依穀,視生處高,戰隆無登,此處山之軍也。絕水必遠水,客絕水而來,勿迎之於水內,令半渡而擊之利,欲戰者,無附於水而迎客,視生處高,無迎水流,此處水上之軍也。絕斥澤,唯亟去無留,若交軍於斥澤之中,必依水草而背眾樹,此處斥澤之軍也。平陸處易,右背高,前死後生,此處平陸之軍也。凡此四軍之利,黃帝之所以勝四帝也。凡軍好高而惡下,貴陽而賤陰,養生而處實,軍無百疾,是謂必勝。丘陵堤防,必處其陽而右背之,此兵之利,地之助也。上雨水流至,欲涉者,待其定也。凡地有絕澗、天井、天牢、天羅、天陷、天隙,必亟去之,勿近也。吾遠之,敵近之;吾迎之,敵背之。軍旁有險阻、潢井、蒹葭、小林、蘙薈者,必謹覆索之,此伏姦之所處也。敵近而靜者,恃其險也;遠而挑戰者,欲人之進也;其所居易者,利也;眾樹動者,來也;眾草多障者,疑也;鳥起者,伏也;獸駭者,覆也;塵高而銳者,車來也;卑而廣者,徒來也;散而條達者,樵採也;少而往來者,營軍也;辭卑而備者,進也;辭強而進驅者,退也;輕車先出居其側者,陳也;無約而請和者,謀也;奔走而陳兵者,期也;半進半退者,誘也;杖而立者,饑也;汲而先飲者,渴也;見利而不進者,勞也;鳥集者,虛也;夜呼者,恐也;軍擾者,將不重也;旌旗動者,亂也;吏怒者,倦也;殺馬肉食者,軍無糧也;懸甀不返其舍者,窮寇也;諄諄翕翕,徐與人言者,失眾也;數賞者,窘也;數罰者,困也;先暴而後畏其眾者,不精之至也;來委謝者,欲休息也。兵怒而相迎,久而不合,又不相去,必謹察之。兵非貴益多也,惟無武進,足以並力料敵取人而已。夫惟無慮而易敵者,必擒於人。卒未親而罰之,則不服,不服則難用。卒已親附而罰不行,則不可用。故合之以文,齊之以武,是謂必取。令素行以教其民,則民服;令素不行以教其民,則民不服。令素行者,與眾相得也。 50 | - 地形第十 51 | - 孫子曰:地形有通者、有掛者、有支者、有隘者、有險者、有遠者。我可以往,彼可以來,曰通。通形者,先居高陽,利糧道,以戰則利。可以往,難以返,曰掛。掛形者,敵無備,出而勝之,敵若有備,出而不勝,難以返,不利。我出而不利,彼出而不利,曰支。支形者,敵雖利我,我無出也,引而去之,令敵半出而擊之利。隘形者,我先居之,必盈之以待敵。若敵先居之,盈而勿從,不盈而從之。險形者,我先居之,必居高陽以待敵;若敵先居之,引而去之,勿從也。遠形者,勢均難以挑戰,戰而不利。凡此六者,地之道也,將之至任,不可不察也。凡兵有走者、有馳者、有陷者、有崩者、有亂者、有北者。凡此六者,非天地之災,將之過也。夫勢均,以一擊十,曰走;卒強吏弱,曰馳;吏強卒弱,曰陷;大吏怒而不服,遇敵懟而自戰,將不知其能,曰崩;將弱不嚴,教道不明,吏卒無常,陳兵縱橫,曰亂;將不能料敵,以少合眾,以弱擊強,兵無選鋒,曰北。凡此六者,敗之道也,將之至任,不可不察也。夫地形者,兵之助也。料敵制勝,計險隘遠近,上將之道也。知此而用戰者必勝,不知此而用戰者必敗。故戰道必勝,主曰無戰,必戰可也;戰道不勝,主曰必戰,無戰可也。故進不求名,退不避罪,唯民是保,而利於主,國之寶也。視卒如嬰兒,故可以與之赴深溪;視卒如愛子,故可與之俱死。厚而不能使,愛而不能令,亂而不能治,譬若驕子,不可用也。知吾卒之可以擊,而不知敵之不可擊,勝之半也;知敵之可擊,而不知吾卒之不可以擊,勝之半也;知敵之可擊,知吾卒之可以擊,而不知地形之不可以戰,勝之半也。故知兵者,動而不迷,舉而不窮。故曰:知彼知己,勝乃不殆;知天知地,勝乃可全。 52 | - 九地第十一 53 | - 
孫子曰:用兵之法,有散地,有輕地,有爭地,有交地,有衢地,有重地,有泛地,有圍地,有死地。諸侯自戰其地者,為散地;入人之地不深者,為輕地;我得亦利,彼得亦利者,為爭地;我可以往,彼可以來者,為交地;諸侯之地三屬,先至而得天下眾者,為衢地;入人之地深,背城邑多者,為重地;山林、險阻、沮澤,凡難行之道者,為泛地;所由入者隘,所從歸者迂,彼寡可以擊吾之眾者,為圍地;疾戰則存,不疾戰則亡者,為死地。是故散地則無戰,輕地則無止,爭地則無攻,交地則無絕,衢地則合交,重地則掠,泛地則行,圍地則謀,死地則戰。古之善用兵者,能使敵人前後不相及,眾寡不相恃,貴賤不相救,上下不相收,卒離而不集,兵合而不齊。合於利而動,不合於利而止。敢問敵眾而整將來,待之若何曰:先奪其所愛則聽矣。兵之情主速,乘人之不及。由不虞之道,攻其所不戒也。凡為客之道,深入則專。主人不克,掠於饒野,三軍足食。謹養而勿勞,並氣積力,運兵計謀,為不可測。投之無所往,死且不北。死焉不得,士人盡力。兵士甚陷則不懼,無所往則固,深入則拘,不得已則鬥。是故其兵不修而戒,不求而得,不約而親,不令而信,禁祥去疑,至死無所之。吾士無餘財,非惡貨也;無餘命,非惡壽也。令發之日,士卒坐者涕沾襟,偃臥者涕交頤,投之無所往,諸、劌之勇也。故善用兵者,譬如率然。率然者,常山之蛇也。擊其首則尾至,擊其尾則首至,擊其中則首尾俱至。敢問兵可使如率然乎?曰可。夫吳人與越人相惡也,當其同舟而濟而遇風,其相救也如左右手。是故方馬埋輪,未足恃也;齊勇如一,政之道也;剛柔皆得,地之理也。故善用兵者,攜手若使一人,不得已也。將軍之事,靜以幽,正以治,能愚士卒之耳目,使之無知;易其事,革其謀,使人無識;易其居,迂其途,使民不得慮。帥與之期,如登高而去其梯;帥與之深入諸侯之地,而發其機。若驅群羊,驅而往,驅而來,莫知所之。聚三軍之眾,投之於險,此謂將軍之事也。九地之變,屈伸之力,人情之理,不可不察也。凡為客之道,深則專,淺則散。去國越境而師者,絕地也;四徹者,衢地也;入深者,重地也;入淺者,輕地也;背固前隘者,圍地也;無所往者,死地也。是故散地吾將一其志,輕地吾將使之屬,爭地吾將趨其後,交地吾將謹其守,交地吾將固其結,衢地吾將謹其恃,重地吾將繼其食,泛地吾將進其途,圍地吾將塞其闕,死地吾將示之以不活。故兵之情:圍則禦,不得已則鬥,過則從。是故不知諸侯之謀者,不能預交;不知山林、險阻、沮澤之形者,不能行軍;不用鄉導,不能得地利。四五者,一不知,非霸王之兵也。夫霸王之兵,伐大國,則其眾不得聚;威加於敵,則其交不得合。是故不爭天下之交,不養天下之權,信己之私,威加於敵,則其城可拔,其國可隳。施無法之賞,懸無政之令。犯三軍之眾,若使一人。犯之以事,勿告以言;犯之以害,勿告以利。投之亡地然後存,陷之死地然後生。夫眾陷於害,然後能為勝敗。故為兵之事,在順詳敵之意,並敵一向,千里殺將,是謂巧能成事。是故政舉之日,夷關折符,無通其使,厲於廊廟之上,以誅其事。敵人開闔,必亟入之,先其所愛,微與之期,踐墨隨敵,以決戰事。是故始如處女,敵人開戶;後如脫兔,敵不及拒。 54 | - 火攻第十二 55 | - 孫子曰:凡火攻有五:一曰火人,二曰火積,三曰火輜,四曰火庫,五曰火隊。行火必有因,因必素具。發火有時,起火有日。時者,天之燥也。日者,月在箕、壁、翼、軫也。凡此四宿者,風起之日也。凡火攻,必因五火之變而應之:火發於內,則早應之於外;火發而其兵靜者,待而勿攻,極其火力,可從而從之,不可從則上。火可發於外,無待於內,以時發之,火發上風,無攻下風,晝風久,夜風止。凡軍必知五火之變,以數守之。故以火佐攻者明,以水佐攻者強。水可以絕,不可以奪。夫戰勝攻取而不惰其功者凶,命曰“費留”。故曰:明主慮之,良將惰之,非利不動,非得不用,非危不戰。主不可以怒而興師,將不可以慍而攻戰。合於利而動,不合於利而上。怒可以複喜,慍可以複說,亡國不可以複存,死者不可以複生。故明主慎之,良將警之。此安國全軍之道也。 56 | - 用間第十三 57 | - 孫子曰: 58 | 凡興師十萬,出征千里,百姓之費,公家之奉,日費千金,內外騷動,怠於道路,不得操事者,七十萬家。相守數年,以爭一日之勝,而愛爵祿百金,不知敵之情者,不仁之至也,非民之將也,非主之佐也,非勝之主也。故明君賢將所以動而勝人,成功出於眾者,先知也。先知者,不可取於鬼神,不可象於事,不可驗於度,必取於人,知敵之情者也。故用間有五:有因間,有內間,有反間,有死間,有生間。五間俱起,莫知其道,是謂神紀,人君之寶也。鄉間者,因其鄉人而用之;內間者,因其官人而用之;反間者,因其敵間而用之;死間者,為誑事於外,令吾聞知之而傳於敵間也;生間者,反報也。故三軍之事,莫親於間,賞莫厚於間,事莫密於間,非聖賢不能用間,非仁義不能使間,非微妙不能得間之實。微哉微哉!無所不用間也。間事未發而先聞者,間與所告者兼死。凡軍之所欲擊,城之所欲攻,人之所欲殺,必先知其守將、左右、謁者、門者、舍人之姓名,令吾間必索知之。敵間之來間我者,因而利之,導而舍之,故反間可得而用也;因是而知之,故鄉間、內間可得而使也;因是而知之,故死間為誑事,可使告敵;因是而知之,故生間可使如期。五間之事,主必知之,知之必在於反間,故反間不可不厚也。昔殷之興也,伊摯在夏;周之興也,呂牙在殷。故明君賢將,能以上智為間者,必成大功。此兵之要,三軍之所恃而動也。 59 | - 60 | - -------------------------------------------------------------------------------- /fire_seq_search_server/tests/run_render.sh: -------------------------------------------------------------------------------- 1 | RUST_LOG=info cargo test --test unit_test_render_block -- --nocapture 2 | -------------------------------------------------------------------------------- /fire_seq_search_server/tests/unit_test_load_notes.rs: -------------------------------------------------------------------------------- 1 | use fire_seq_search_server::markdown_parser::{exclude_advanced_query, parse_to_plain_text}; 2 | 3 | use std::borrow::Cow; 4 | 5 | 6 | fn load_articles() -> Vec<(String, String)> { 7 | let r = read_specific_directory("tests/resource/pages"); 8 | r 9 | } 10 | 11 | #[test] 12 | fn test_load_articles() { 13 | let r = load_articles(); 14 | assert_eq!(r.len(), 11); 15 | for (title,body) in &r{ 16 | assert!(title.len()>0); 17 | assert!(body.len()>0); 18 | } 19 | } 20 | 21 | 22 | fn read_file_to_line(relative_path: &str) -> String { 23 | let path = vec![String::from("tests/resource/pages"), 24 | relative_path.to_string()]; 25 | let path = path.join("/"); 26 | std::fs::read_to_string(&path) 27 | .expect("Should have been able to read the file") 28 | } 29 | 30 | 31 | #[test] 32 | fn 
parse() {
33 |     let md = read_file_to_line("blog_thunderbird_zh.md");
34 |     let result = parse_to_plain_text(&md);
35 |     assert!(result.contains("Aug 3, 2021 - 使用 git shallow clone 下载并编译 Thunderbird"));
36 |     assert!(!result.contains("https://developer.thunderbird.net/thunderbird-development/getting-started"));
37 | 
38 | }
39 | 
40 | #[test]
41 | fn exclude_advance_query() {
42 |     let md = read_file_to_line("advanced_query.md");
43 |     let md = Cow::from(md);
44 |     let result = exclude_advanced_query(md);
45 |     assert!(!result.contains("exempli"));
46 |     assert!(result.contains("In this test page we have"));
47 | 
48 | 
49 |     let md = read_file_to_line("blog_thunderbird_zh.md");
50 |     let md = Cow::from(md);
51 |     let result = exclude_advanced_query(md.clone());
52 |     assert_eq!(md, result);
53 | }
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | // =====================
62 | // These functions were removed in https://github.com/Endle/fireSeqSearch/pull/149/commits/7692bd9091380858b0cbeb2fa10d8c01dabcba91
63 | // aka https://github.com/Endle/fireSeqSearch/pull/147
64 | // To keep the unit tests self-contained, I copied them here as test helper functions
65 | // Zhenbo - 2024 Sep 21
66 | use std::fs::DirEntry;
67 | use rayon::iter::IntoParallelRefIterator;
68 | use rayon::iter::ParallelIterator;
69 | 
70 | fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
71 |     if let Ok(file_type) = note.file_type() {
72 |         // Only regular files are notes; skip sub-directories.
73 |         if file_type.is_dir() {
74 |             return None;
75 |         }
76 |     } else {
77 |         return None;
78 |     }
79 | 
80 |     let note_path = note.path();
81 |     let note_title = match note_path.file_stem() {
82 |         Some(osstr) => osstr.to_str().unwrap(),
83 |         None => {
84 |             return None;
85 |         }
86 |     };
87 |     let content: String = match std::fs::read_to_string(&note_path) {
88 |         Ok(c) => c,
89 |         Err(e) => {
90 |             // macOS Finder scatters .DS_Store files; ignore those quietly.
91 |             if note_title.to_lowercase() != ".ds_store" {
92 |                 eprintln!("Failed to read {}: {}", note_title, e);
93 |             }
94 |             return None;
95 |         }
96 |     };
97 | 
98 |     Some((note_title.to_string(), content))
99 | }
100 | fn read_specific_directory(path: &str) -> Vec<(String, String)> {
101 |     let notebooks = match std::fs::read_dir(path) {
102 |         Ok(x) => x,
103 |         Err(e) => {
104 |             panic!("Cannot read directory {}: {}", path, e);
105 |         }
106 |     };
107 |     let mut note_filenames: Vec<DirEntry> = Vec::new();
108 |     for note in notebooks {
109 |         let note: DirEntry = note.unwrap();
110 |         note_filenames.push(note);
111 |     }
112 |     let result: Vec<(String, String)> = note_filenames.par_iter()
113 |         .map(|note| read_md_file_wo_parse(note))
114 |         .filter(|x| x.is_some())
115 |         .map(|x| x.unwrap())
116 |         .collect();
117 | 
118 |     result
119 | }
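120 | 
121 | // Illustrative sketch, not part of the original suite: read_specific_directory
122 | // pairs each file stem with its file contents, so fixture titles can be asserted
123 | // directly. The two fixture names below come from tests/resource/pages in this repo.
124 | #[test]
125 | fn test_titles_match_file_stems() {
126 |     let pages = read_specific_directory("tests/resource/pages");
127 |     let titles: Vec<&str> = pages.iter().map(|(title, _)| title.as_str()).collect();
128 |     assert!(titles.contains(&"Softmax"));
129 |     assert!(titles.contains(&"cyrillic"));
130 | }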
-------------------------------------------------------------------------------- /fire_seq_search_server/tests/unit_test_post_query.rs: --------------------------------------------------------------------------------
1 | use fire_seq_search_server::post_query::highlighter::{highlight_keywords_in_body, highlight_sentence_with_keywords, locate_single_keyword, split_body_to_blocks, wrap_text_at_given_spots};
2 | use fire_seq_search_server::generate_server_info_for_test;
3 | 
4 | fn get_english_text() -> String {
5 |     std::fs::read_to_string("tests/resource/pages/International Language, Past, Present & Future by Walter John Clark.md")
6 |         .expect("Should have been able to read the file")
7 | }
8 | fn highlight_keywords_in_body_old_2024_apr(body: &str, terms: &Vec<String>, limit: usize) -> String {
9 |     let mut server_info = generate_server_info_for_test();
10 |     server_info.show_summary_single_line_chars_limit = limit;
11 |     highlight_keywords_in_body(body, terms, &server_info)
12 | }
13 | 
14 | #[test]
15 | fn test_empty_key() {
16 |     let text = "Hello World";
17 |     let v = Vec::new();
18 | 
19 |     let r = highlight_keywords_in_body_old_2024_apr(text, &v, 120);
20 |     // With no search terms there is nothing to highlight, so the summary is empty.
21 | 
22 |     assert_eq!(&r, "");
23 | }
24 | 
25 | 
26 | 
27 | #[test]
28 | fn test_highlight_wrap() {
29 |     let contents = "使用 git shallow clone 下载并编译 Thunderbird".to_string();
30 |     let v = vec![String::from("thunderbird")];
31 |     let r = highlight_keywords_in_body_old_2024_apr(&contents, &v, 120);
32 |     assert_eq!(&r, "使用 git shallow clone 下载并编译 <span class=\"fireSeqSearchHighlight\">Thunderbird</span>");
33 | }
34 | 
35 | #[test]
36 | fn test_highlight_latex() {
37 |     let contents = "$\\vec{q_i}^T \\vec{a_j}, i<j$".to_string();
38 | }
-------------------------------------------------------------------------------- /fire_seq_search_server/tests/unit_test_render_block.rs: --------------------------------------------------------------------------------
1 | use fire_seq_search_server::post_query::highlighter::build_tree;
2 | use fire_seq_search_server::generate_server_info_for_test;
3 | 
4 | 
5 | fn get_english_text() -> String {
6 |     std::fs::read_to_string("tests/resource/pages/International Language, Past, Present & Future by Walter John Clark.md")
7 |         .expect("Should have been able to read the file")
8 | }
9 | 
10 | 
11 | 
12 | 
13 | /*
14 | #[test]
15 | fn test_highlight_single_term_single_appearance() {
16 |     let _ = env_logger::try_init();
17 |     let server_info = generate_server_info_for_test();
18 |     let content = "使用 git shallow clone 下载并编译 Thunderbird".to_string();
19 |     let token = "thunderbird";
20 |     let tokens = [token];
21 |     let mut root = build_tree(&content, &server_info);
22 | 
23 |     let r = root.children[0].split_leaf_node_by_single_term(token, &server_info);
24 |     //println!("{:?}", &r);
25 |     assert!(r.len() >= 2);
26 |     assert!(r[1].is_hit);
27 |     // TODO The behaviour here is not stable. This is a hacky test case - 2024-Apr
28 | 
29 |     let r2 = root.children[0].split_leaf_node_by_terms(&tokens, &server_info);
30 |     assert_eq!(r.len(), r2.len());
31 | 
32 |     root.parse_highlight(&tokens, &server_info);
33 |     println!("{:?}", &root);
34 | }
35 | 
36 | #[test]
37 | fn test_highlight_single_term_multi_appearance() {
38 |     let _ = env_logger::try_init();
39 |     let server_info = generate_server_info_for_test();
40 |     let content = "使用 git shallow clone 下载并编译 Thunderbird : compile thunderbird".to_string();
41 |     let token = "thunderbird";
42 |     let tokens = [token];
43 |     let mut root = build_tree(&content, &server_info);
44 | 
45 | 
46 |     root.parse_highlight(&tokens, &server_info);
47 |     //println!("Parsed result: {:?}", &root);
48 |     root.flattern();
49 |     //println!("Flattern: {:?}", &root);
50 |     assert_eq!(root.children.len(), 4);
51 |     assert!(root.children[1].is_hit);
52 |     assert!(root.children[3].is_hit);
53 | }
54 | */
55 | 
56 | #[test]
57 | fn test_highlight_multiple_terms() {
58 |     let _ = env_logger::try_init();
59 |     let server_info = generate_server_info_for_test();
60 |     let content = "使用 git shallow clone 下载并编译 Thunderbird : compile thunderbird with git shallow".to_string();
61 |     let token = "thunderbird";
62 |     let token2 = "git";
63 |     let tokens = [token, token2];
64 |     let mut root = build_tree(&content, &server_info);
65 | 
66 | 
67 |     root.parse_highlight(&tokens, &server_info);
68 |     //println!("Parsed result: {:?}", &root);
69 |     root.flattern();
70 |     //println!("Flattern: {:?}", &root);
71 |     assert!(root.children[1].is_hit);
72 |     /*
73 |     assert_eq!(root.children.len(), 4);
74 |     assert!(root.children[3].is_hit);
75 |     */
76 | }
77 | 
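78 | 
79 | // Illustrative sketch, not part of the original suite: a term that never occurs
80 | // in the content should leave no hit fragments after parse_highlight + flattern.
81 | // Assumes the no-match behaviour implied by the assertions above; the token is hypothetical.
82 | #[test]
83 | fn test_no_hit_for_absent_term() {
84 |     let _ = env_logger::try_init();
85 |     let server_info = generate_server_info_for_test();
86 |     let content = "使用 git shallow clone 下载并编译 Thunderbird".to_string();
87 |     let tokens = ["nonexistentterm"];
88 |     let mut root = build_tree(&content, &server_info);
89 |     root.parse_highlight(&tokens, &server_info);
90 |     root.flattern();
91 |     assert!(root.children.iter().all(|child| !child.is_hit));
92 | }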
-------------------------------------------------------------------------------- /pack_firefox_extension.sh: --------------------------------------------------------------------------------
1 | # Package the Firefox extension; zip -FS (filesync) refreshes the archive so deleted files are dropped.
2 | cd fireSeqSearch_addon
3 | zip -r -FS ../fireSeqSearch.zip * --exclude '*.git*' --exclude "monkeyscript.user.js" --exclude "violentmonkeyscript.user.js"
4 | cd ..
5 | cp -f fireSeqSearch.zip ~/Downloads #/dev/shm
6 | --------------------------------------------------------------------------------