├── .editorconfig
├── .eslintignore
├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── build.yml
├── .gitignore
├── .npmrc
├── .nvmrc
├── .prettierignore
├── .prettierrc.yaml
├── LICENSE
├── README.md
├── build
    ├── entitlements.mac.plist
    ├── ico2048.png
    ├── icon.icns
    ├── icon.ico
    ├── icon.png
    ├── icon1024.png
    ├── icon256.png
    └── icon512.png
├── dev-app-update.yml
├── docs
    └── img
    │   ├── mac-codesigning-approval-dialog.png
    │   ├── mac-codesigning-errormessage.png
    │   ├── mac-codesigning-install.png
    │   ├── mac-codesigning-rightclick-menu.png
    │   └── search-result.png
├── e2e
    ├── features
    │   ├── api-key-status.feature
    │   ├── initial_application_view.feature
    │   ├── search-page.feature
    │   ├── settings-page.feature
    │   └── upload-process.feature
    ├── step-definitions
    │   ├── api-key-status.steps.ts
    │   ├── common.steps.ts
    │   ├── initial_application_view.steps.ts
    │   ├── search_page.steps.ts
    │   ├── settings_page.steps.ts
    │   └── upload_process.steps.ts
    └── test-storage
    │   ├── constellation-test.csv
    │   └── newline-test.csv
├── electron-builder.yml
├── electron.vite.config.ts
├── eslint.config.cjs
├── package-lock.json
├── package.json
├── python-prototype
    ├── embed.ipynb
    ├── embed.py
    ├── non_stupid_csv_reader.py
    ├── non_stupid_sentence_splitter.py
    ├── preprocessing.ipynb
    ├── requirements.txt
    ├── search.ipynb
    ├── search.py
    └── understanding_sentence_splitter.ipynb
├── resources
    ├── icon.icns
    └── icon.png
├── src
    ├── docs
    │   └── img
    │   │   └── groceries_screenshot.png
    ├── main
    │   ├── DocumentSetManager.ts
    │   ├── DocumentSetService.ts
    │   ├── api
    │   │   ├── embedding.test.ts
    │   │   └── embedding.ts
    │   ├── index.ts
    │   ├── services
    │   │   ├── csvLoader.test.ts
    │   │   ├── csvLoader.ts
    │   │   ├── embeddings.test.ts
    │   │   ├── embeddings.ts
    │   │   ├── loggingOpenAIEmbedding.ts
    │   │   ├── mockEmbedding.ts
    │   │   ├── optional_trim_sentence_tokenizer.js
    │   │   ├── sentenceSplitter.test.ts
    │   │   ├── sentenceSplitter.ts
    │   │   ├── sploder.ts
    │   │   └── weaviateService.ts
    │   ├── types
    │   │   └── index.ts
    │   └── utils.ts
    ├── preload.ts
    ├── preload
    │   ├── index.d.ts
    │   └── index.ts
    └── renderer
    │   ├── index.html
    │   └── src
    │       ├── App.svelte
    │       ├── assets
    │           ├── base.css
    │           ├── electron.svg
    │           └── main.css
    │       ├── components
    │           ├── ApiKeyPage.svelte
    │           ├── ApiKeyStatus.svelte
    │           ├── CsvUpload.svelte
    │           ├── ExistingDatabases.svelte
    │           ├── FrontPage.svelte
    │           ├── HelpPage.svelte
    │           ├── Preview.svelte
    │           ├── Results.svelte
    │           ├── SearchPage.svelte
    │           └── Table.svelte
    │       ├── env.d.ts
    │       └── main.ts
├── svelte.config.mjs
├── tsconfig.json
├── tsconfig.node.json
└── wdio.conf.ts


/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | charset = utf-8
5 | indent_style = space
6 | indent_size = 2
7 | end_of_line = lf
8 | insert_final_newline = true
9 | trim_trailing_whitespace = true


--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 | out
4 | .gitignore
5 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: jeremybmerrill
14 | thanks_dev: # Replace with a single thanks.dev username
15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
16 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | name: Build Distributions
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - '*'
  7 |   workflow_dispatch:
  8 | 
  9 | jobs:
 10 |   build-apple:
 11 |     name: Build macOS (Apple Silicon)
 12 |     runs-on: macos-latest
 13 |     environment: "mac build" 
 14 |     outputs:
 15 |       artifact-folder: ${{ steps.upload.outputs.artifact-folder }}
 16 |     steps:
 17 |       - uses: actions/checkout@v4
 18 | 
 19 |       - name: Configure Node caching
 20 |         uses: actions/cache@v4
 21 |         with:
 22 |           path: ~/.npm
 23 |           key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
 24 |           restore-keys: ${{ runner.os }}-npm-
 25 | 
 26 |       - name: Install dependencies
 27 |         run: npm install
 28 | 
 29 |       - name: Run tests
 30 |         run: npm test
 31 |         timeout-minutes: 5
 32 | 
 33 |       - name: Build distribution (macOS ARM)
 34 |         env:
 35 |           CSC_KEY_PASSWORD: ${{ secrets.CSC_KEY_PASSWORD }}
 36 |           CSC_LINK: ${{ secrets.CSC_LINK }}
 37 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 38 |         run: npm run build:mac
 39 | 
 40 |       - name: Create README
 41 |         run: |
 42 |           echo "More information: https://github.com/jeremybmerrill/meaningfully" > dist/README.txt
 43 | 
 44 |       - name: Upload artifact (macOS)
 45 |         id: upload
 46 |         uses: actions/upload-artifact@v4
 47 |         with:
 48 |           name: meaningfully-macOS
 49 |           path: |
 50 |             dist/meaningfully-${{ github.ref_name }}.arm64.dmg
 51 |             dist/README.txt
 52 | 
 53 |   build-intel:
 54 |     name: Build macOS (Intel)
 55 |     runs-on: macos-13
 56 |     environment: "mac build"
 57 |     outputs:
 58 |       artifact-folder: ${{ steps.upload.outputs.artifact-folder }}
 59 |     steps:
 60 |       - uses: actions/checkout@v4
 61 | 
 62 |       - name: Configure Node caching
 63 |         uses: actions/cache@v4
 64 |         with:
 65 |           path: ~/.npm
 66 |           key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
 67 |           restore-keys: ${{ runner.os }}-npm-
 68 | 
 69 |       - name: Install brew dependencies
 70 |         run: brew install pkg-config cairo pango libjpeg giflib librsvg
 71 | 
 72 |       - name: Install npm deps
 73 |         run: npm install
 74 | 
 75 |       - name: Run tests
 76 |         run: npm test
 77 |         timeout-minutes: 5
 78 | 
 79 |       - name: Build distribution (macOS Intel)
 80 |         env:
 81 |           CSC_KEY_PASSWORD: ${{ secrets.CSC_KEY_PASSWORD }}
 82 |           CSC_LINK: ${{ secrets.CSC_LINK }}
 83 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 84 |         run: npm run build:mac -- --publish always
 85 | 
 86 |       - name: Create README
 87 |         run: 'echo "More information: https://github.com/jeremybmerrill/meaningfully" > dist/README.txt'
 88 | 
 89 |       - name: Upload artifact (macOS Intel)
 90 |         id: upload
 91 |         uses: actions/upload-artifact@v4
 92 |         with:
 93 |           name: meaningfully-macOS-intel
 94 |           path: |
 95 |             dist/meaningfully-${{ github.ref_name }}.x64.dmg
 96 |             dist/README.txt
 97 | 
 98 |   build-linux:
 99 |     name: Build Linux
100 |     runs-on: ubuntu-latest
101 |     environment: "mac build" 
102 |     steps:
103 |       - uses: actions/checkout@v4
104 | 
105 |       - name: Configure Node caching
106 |         uses: actions/cache@v4
107 |         with:
108 |           path: ~/.npm
109 |           key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
110 |           restore-keys: ${{ runner.os }}-npm-
111 | 
112 |       - name: Install Linux Build Dependencies
113 |         run: |
114 |           sudo apt-get update
115 |           sudo apt-get install -y build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev libarchive-tools libfuse2 libgtk-3-0 libnss3 libxshmfence1 libatk-bridge2.0-0 libx11-xcb1 libxcb-dri3-0 libxcomposite1 libxcursor1 libxdamage1 libxfixes3 libxi6 libxrandr2 libxtst6 libgbm1 libpangocairo-1.0-0 libpango-1.0-0 libcairo2 libatspi2.0-0
116 |           # sudo snap install snapcraft --classic
117 | 
118 |       - name: Install dependencies
119 |         run: npm install
120 |     
121 |       - name: Run tests
122 |         run: npm test
123 |         timeout-minutes: 5
124 | 
125 |       - name: Build distribution (Linux)
126 |         env:
127 |           CSC_KEY_PASSWORD: ${{ secrets.CSC_KEY_PASSWORD }}
128 |           CSC_LINK: ${{ secrets.CSC_LINK }}
129 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
130 |           # SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.SNAPCRAFT_STORE_CREDENTIALS }}          
131 |         run: npm run build:linux -- --publish always
132 | 
133 |       - name: Create README
134 |         run: 'echo "More information: https://github.com/jeremybmerrill/meaningfully" > dist/README.txt'
135 | 
136 |       - name: Upload artifact (Linux)
137 |         uses: actions/upload-artifact@v4
138 |         with:
139 |           name: meaningfully-linux
140 | 
141 |           # add back later under path.  dist/meaningfully-${{ github.ref_name }}.snap
142 |           path: |
143 |             dist/meaningfully-${{ github.ref_name }}.deb
144 |             dist/meaningfully-${{ github.ref_name }}.AppImage
145 |             dist/README.txt
146 |   integration-test:
147 |     name: Integration Test
148 |     runs-on: ubuntu-latest
149 |     needs: [build-apple, build-intel, build-linux]
150 |     steps:
151 |       - uses: actions/checkout@v4
152 | 
153 |       - name: Configure Node caching
154 |         uses: actions/cache@v4
155 |         with:
156 |           path: ~/.npm
157 |           key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
158 |           restore-keys: ${{ runner.os }}-npm-
159 | 
160 |       - name: Install Linux Build Dependencies
161 |         run: |
162 |           sudo apt-get update
163 |           sudo apt-get install -y build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev libarchive-tools libfuse2 libgtk-3-0 libnss3 libxshmfence1 libatk-bridge2.0-0 libx11-xcb1 libxcb-dri3-0 libxcomposite1 libxcursor1 libxdamage1 libxfixes3 libxi6 libxrandr2 libxtst6 libgbm1 libpangocairo-1.0-0 libpango-1.0-0 libcairo2 libatspi2.0-0
164 |           # sudo snap install snapcraft --classic
165 | 
166 |       - name: Install dependencies
167 |         run: npm install
168 |     
169 |       - name: Download Linux artifact
170 |         uses: actions/download-artifact@v4
171 |         with:
172 |           name: meaningfully-linux
173 |           path: ./dist
174 |       - name: Run integration tests 
175 |         run: npm run wdio
176 |   release:
177 |     name: Create GitHub Release
178 |     runs-on: ubuntu-latest
179 |     environment: "mac build"
180 |     needs:
181 |       - build-apple
182 |       - build-intel
183 |       # - build-windows
184 |       - build-linux
185 |       - integration-test
186 |     steps:
187 |       - uses: actions/checkout@v4
188 | 
189 |       - name: Download macOS artifact
190 |         uses: actions/download-artifact@v4
191 |         with:
192 |           name: meaningfully-macOS
193 |           path: ./artifacts/macOS
194 | 
195 |       - name: Download macOS (Intel) artifact
196 |         uses: actions/download-artifact@v4
197 |         with:
198 |           name: meaningfully-macOS-intel
199 |           path: ./artifacts/macOS-intel
200 | 
201 |       - name: Download Linux artifact
202 |         uses: actions/download-artifact@v4
203 |         with:
204 |           name: meaningfully-linux
205 |           path: ./artifacts/linux
206 | 
207 |       - name: Create Release
208 |         uses: softprops/action-gh-release@v2
209 |         env:
210 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
211 |         with:
212 |           tag_name: ${{ github.ref_name }}
213 |           name: Release ${{ github.ref_name }}
214 |           draft: false
215 |           prerelease: false
216 |           # add back later under files:              ./artifacts/linux/meaningfully-${{ github.ref_name }}.snap
217 |           files: |
218 |             ./artifacts/linux/meaningfully-${{ github.ref_name }}.deb
219 |             ./artifacts/linux/meaningfully-${{ github.ref_name }}.AppImage
220 |             ./artifacts/macOS-intel/meaningfully-${{ github.ref_name }}.x64.dmg
221 |             ./artifacts/macOS/meaningfully-${{ github.ref_name }}.arm64.dmg
222 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | .env
 3 | sample-data/
 4 | .ipynb_checkpoints/
 5 | *.duckdb
 6 | python-prototype/chroma/
 7 | __pycache__
 8 | js/meaningfully/storage
 9 | e2e/test-storage/metadata.db
10 | .vscode/
11 | node_modules
12 | dist
13 | out
14 | .DS_Store
15 | *.log*
16 | e2e/test-storage/metadata.db
17 | certs/
18 | 


--------------------------------------------------------------------------------
/.npmrc:
--------------------------------------------------------------------------------
1 | electron_mirror=https://npmmirror.com/mirrors/electron/
2 | electron_builder_binaries_mirror=https://npmmirror.com/mirrors/electron-builder-binaries/
3 | 


--------------------------------------------------------------------------------
/.nvmrc:
--------------------------------------------------------------------------------
1 | 23
2 | 


--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | out
2 | dist
3 | pnpm-lock.yaml
4 | LICENSE.md
5 | tsconfig.json
6 | tsconfig.*.json
7 | 


--------------------------------------------------------------------------------
/.prettierrc.yaml:
--------------------------------------------------------------------------------
 1 | singleQuote: true
 2 | semi: false
 3 | printWidth: 100
 4 | trailingComma: none
 5 | plugins:
 6 |   - prettier-plugin-svelte
 7 | overrides:
 8 |   - files: '*.svelte'
 9 |     options:
10 |       parser: svelte
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2025 Jeremy B. F. Merrill
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Meaningfully (is still in pre-alpha but you can try it!)
  2 | 
  3 | Meaningfully is a semantic search tool for text data in spreadsheets. 
  4 | 
  5 | Keyword searching in Excel or Google Sheets is painful because text data is displayed awkwardly and because keywords miss circumlocutions, typos, unexpected wording and foreign-language data. Semantic search solves all of that. Meaningfully works best for *semi-structured* data, where you have thousands of instances of a type and want to find certain instances.
  6 | 
  7 | For example:
  8 | 
  9 |   - consumer complaints about a product or business
 10 |   - credit card transactions
 11 |   - descriptions of government contracts
 12 |   - responses to a survey
 13 | 
 14 | ## Who is this for?
 15 | 
 16 | Journalists, researchers, academics, people who do surveys or solicit submissions, anybody.
 17 | 
 18 | ## What is semantic search?
 19 | 
 20 | It's a middle ground between AI chatbot search and keyword search. It uses the smarts of AI to 'understand' human language, but doesn't risk making stuff up the way an AI chatbot can.
 21 | 
 22 | ## Is Meaningfully ready to use?
 23 | 
 24 | Not really, but you can try it! It is kind of the minimum viable semantic search app. If people like it, I hope to sand down the rough edges and build extra features. Right now, I make zero promises about whether it will work. **Please email me or open a ticket to tell me about how Meaningfully worked (or didn't work) for you.**
 25 | 
 26 | In particular, Meaningfully is _slow_ and can't handle large document sets (>10,000 rows, let's say) yet.
 27 | 
 28 | ## How do I search with meaningfully?
 29 | 
 30 | Once you've uploaded a CSV with a text column, search is simple.
 31 | 
 32 | ![a screenshot of the search page, with a query "he got fired" and a result saying "There are no modifications to Mr. Smith's compensation arrangements in connection with his departure.  He will not receive severance and will forfeit all equity that has not vested as of his termination date."](https://github.com/jeremybmerrill/meaningfully/blob/main/docs/img/search-result.png)
 33 | 
 34 | 
 35 | 1. 🤔 Just imagine what you're looking for, then imagine a phrase or sentence from that perfect result. Like, "he got fired."
 36 | 2. Type the imagined phrase or sentence in the search box, and then click search.
 37 | 3. Hopefully the closest results from the CSV will appear at the top of the search results.
 38 | 
 39 | You can also filter by metadata attributes.
 40 | 
 41 | <!-- 
 42 | ## How do I upload a CSV to meaningfully?
 43 | 
 44 | 1. 
 45 | 2.  -->
 46 | 
 47 | ## Is meaningfully free?
 48 | 
 49 | Mostly. Semantic search requires "embedding" snippets of your document into numbers. You can do this on your own computer, which is free (apart from your electric bill) but very slow. I recommend you get an OpenAI API key, put it into meaningfully, and use that; you'll be responsible for the OpenAI charges, but meaningfully doesn't cost any extra on top of that. (And it's generally very cheap. Most spreadsheets, even with tens of thousands of rows, will cost a few pennies.)
 50 | 
 51 | Eventually, meaningfully may include some paid options.
 52 | 
 53 | ## How can I run this app myself?
 54 | 
 55 | ### Install
 56 | 
 57 | Visit meaningfully's [release page](https://github.com/jeremybmerrill/meaningfully/releases), download the appropriate installer for your platform, and install it. 
 58 | 
 59 | There might be some platform-specific instructions.
 60 | 
 61 | #### Mac-specific instructions:
 62 | 
 63 | Install `meaningfully-<version>.arm64.dmg` (with `arm64`) if your Mac has Apple Silicon. Install the `x64` version if your Mac has an Intel chip.
 64 | 
 65 | I haven't yet set up code-signing for this app, so once you install the app, you might get an error message that says "'meaningfully' cannot be opened because the developer cannot be verified." (pictured below).
 66 | 
 67 | <img alt="a screenshot of a warning that meaningfully cannot be opened because the developer cannot be verified."
 68 | src="https://github.com/jeremybmerrill/meaningfully/blob/main/docs/img/mac-codesigning-errormessage.png?raw=true" width="300">
 69 | 
 70 | ##### Here are the steps to work around this error
 71 | 
 72 | 1. Install the app as usual, by copying it from the disk image (dmg) to your Applications folder.
 73 | 
 74 | <img alt="a screenshot of a Finder folder with the meaningfully icon and the Applications folder" src="https://github.com/jeremybmerrill/meaningfully/blob/main/docs/img/mac-codesigning-install.png?raw=true" width="300">
 75 | 
 76 | 2. Right-click (or Control-click) the app, then click Open.
 77 | 
 78 | <img alt="a screenshot of the right-click menu you get when you right-click on the meaningfully app, with the Open option" src="https://github.com/jeremybmerrill/meaningfully/blob/main/docs/img/mac-codesigning-rightclick-menu.png?raw=true" width="300">
 79 | 
 80 | 3. Then click "Open" on the pop-up dialog that says "macOS cannot verify the developer of 'meaningfully'. Are you sure you want to open it?"
 81 | 
 82 | <img alt="a dialog that says macOS cannot verify the developer of 'meaningfully'. Are you sure you want to open it" src="https://github.com/jeremybmerrill/meaningfully/blob/main/docs/img/mac-codesigning-approval-dialog.png?raw=true" width="300">
 83 | 
 84 | Sometimes you might have to try several times. But once it works, it should stay working until you update the app. If you'd like to eliminate this obstacle, please consider sponsoring this project -- as the code-signing workflow for Macs costs like $100, and I don't want to spend that until I'm sure that this project benefits people.
 85 | 
 86 | #### Windows
 87 | 
 88 | I couldn't get the Windows builds to work. If you use Windows and want to try meaningfully, please try development mode below, or help me get the Windows builds working.
 89 | 
 90 | #### Linux
 91 | 
 92 | Snaps coming soon, I hope.
 93 | 
 94 | ### Development mode
 95 | You'll need Node v22 or higher. You might try installing [nvm](https://github.com/nvm-sh/nvm) and then running `nvm install 22` and `nvm use 22` but troubleshooting and other methods are outside the scope of this document.
 96 | 
 97 | ```
 98 | npm install
 99 | npm run dev
100 | ```
101 | 
102 | There's a weird bug where sometimes I think the storage directory isn't created right. If you get weird errors like `Error searching document set: Error: ENOENT: no such file or directory`, maybe try running `mkdir ~/Library/Application\ Support/meaningfully/simple_vector_store/` and trying again. I'm trying to fix it. :D
103 | 
104 | ### Testing:
105 | 
106 | Run the unit tests for the backend with `npm test`. To run the integration tests for the frontend, first build the app (`npm run build:<platform>`), then run `npm run wdio`; to run only one feature file, set `CUCUMBER_TEST_ONLY_FEATURE`, e.g. `CUCUMBER_TEST_ONLY_FEATURE=upload-process npm run wdio`.
107 | 
108 | ## My documents are PDFs, not spreadsheets. Can I use Meaningfully?
109 | 
110 | Try [Semantra](https://github.com/freedmand/semantra).


--------------------------------------------------------------------------------
/build/entitlements.mac.plist:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 3 | <plist version="1.0">
 4 |   <dict>
 5 |     <key>com.apple.security.cs.allow-jit</key>
 6 |     <true/>
 7 |     <key>com.apple.security.cs.allow-unsigned-executable-memory</key>
 8 |     <true/>
 9 |     <key>com.apple.security.cs.allow-dyld-environment-variables</key>
10 |     <true/>
11 |   </dict>
12 | </plist>
13 | 


--------------------------------------------------------------------------------
/build/ico2048.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/ico2048.png


--------------------------------------------------------------------------------
/build/icon.icns:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon.icns


--------------------------------------------------------------------------------
/build/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon.ico


--------------------------------------------------------------------------------
/build/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon.png


--------------------------------------------------------------------------------
/build/icon1024.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon1024.png


--------------------------------------------------------------------------------
/build/icon256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon256.png


--------------------------------------------------------------------------------
/build/icon512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/build/icon512.png


--------------------------------------------------------------------------------
/dev-app-update.yml:
--------------------------------------------------------------------------------
1 | provider: generic
2 | url: https://example.com/auto-updates
3 | updaterCacheDirName: meaningfully-updater
4 | 


--------------------------------------------------------------------------------
/docs/img/mac-codesigning-approval-dialog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/docs/img/mac-codesigning-approval-dialog.png


--------------------------------------------------------------------------------
/docs/img/mac-codesigning-errormessage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/docs/img/mac-codesigning-errormessage.png


--------------------------------------------------------------------------------
/docs/img/mac-codesigning-install.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/docs/img/mac-codesigning-install.png


--------------------------------------------------------------------------------
/docs/img/mac-codesigning-rightclick-menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/docs/img/mac-codesigning-rightclick-menu.png


--------------------------------------------------------------------------------
/docs/img/search-result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/docs/img/search-result.png


--------------------------------------------------------------------------------
/e2e/features/api-key-status.feature:
--------------------------------------------------------------------------------
 1 | Feature: API Key Status Banner
 2 | 
 3 |   Scenario: Verify API Key Status Banner is shown if setting store is empty
 4 |     Given the application has started
 5 |     When the settings store is empty
 6 |     And the page has been reloaded
 7 |     Then the "API Key Status" component should be visible
 8 | 
 9 |   Scenario: Verify API Key Status Banner is not shown if API Key is set
10 |     Given the application has started
11 |     When the setting store has an OpenAI API Key value    
12 |     And the page has been reloaded
13 |     Then the "API Key Status" component should not be visible
14 | 


--------------------------------------------------------------------------------
/e2e/features/initial_application_view.feature:
--------------------------------------------------------------------------------
 1 | Feature: Initial Application View
 2 | 
 3 |   Scenario: Verify initial components are displayed
 4 |     Given the application has started
 5 |     Then the "Upload a Spreadsheet" component should be visible
 6 |     And the "Existing Spreadsheets" component should be visible
 7 | 
 8 |   Scenario: Verify empty state for Existing Spreadsheets
 9 |     Given the application has started
10 |     And the metadata store is empty
11 |     And the page has been reloaded
12 |     Then the "Existing Spreadsheets" component should be visible
13 |     And no datasets should be listed
14 | 
15 |   Scenario: Verify Existing Spreadsheets with 1 dataset
16 |     Given the application has started
17 | 
18 |     # set an API key
19 |     
20 |     And the settings store is empty 
21 |     And the app is navigated to the 'Settings / API Keys' link
22 |     And the uploadCsv function has been mocked
23 |     And the OpenAI API Key value is set on the page
24 |     And the "Save" component has been clicked
25 | 
26 |     And a file has been selected in the "Upload a Spreadsheet" component
27 |     And the column "paragraph" has been selected as column to embed
28 |     And the metadata column with name "font-size" has been selected 
29 |     And the metadata column with name "cik" has been selected
30 |     And the "Dataset Name input" component has been set to "Test Dataset 1"
31 |     And the "Upload button" component has been clicked
32 |     And the app is navigated to the "Home" link
33 |     Then the "Existing Spreadsheets" component should be visible
34 |     And 1 datasets should be listed
35 | 
36 |     And the dataset "Test Dataset 1" should be listed
37 | 
38 |     # And the page has been reloaded
39 |     # Then the "Existing Spreadsheets" component should be visible
40 |     # And 2 datasets should be listed
41 |     # And the dataset "Test Dataset 1" should be listed
42 |     # And the dataset "Test Dataset 2" should be listed


--------------------------------------------------------------------------------
/e2e/features/search-page.feature:
--------------------------------------------------------------------------------
 1 | Feature: Search page
 2 | 
 3 |   Scenario: Verify initial components are displayed
 4 |     Given the application has started
 5 |     And the settings store is empty 
 6 |     And the app is navigated to the 'Settings / API Keys' link
 7 |     And the uploadCsv function has been mocked
 8 |     And the OpenAI API Key value is set on the page
 9 |     And the "Save" component has been clicked
10 | 
11 |     And a file has been selected in the "Upload a Spreadsheet" component
12 |     And the column "paragraph" has been selected as column to embed
13 |     And the metadata column with name "font-size" has been selected 
14 |     And the metadata column with name "cik" has been selected
15 |     And the "Dataset Name input" component has been set to "Test Dataset 1"
16 |     And the "Upload button" component has been clicked
17 |     # And the app is navigated to the "Home" link
18 | 
19 |     # And the app is navigated to the "Test Dataset 1" dataset link
20 |     Then the "Document Set name" component should be visible
21 |     And the "Search bar" component should be visible
22 |     And the "Search button" component should be visible
23 | 
24 |   Scenario: Verify search button is disabled if there is no query
25 |     Given the application has started
26 |     And the app is navigated to the 'Home' link
27 |     And the app is navigated to the "Test Dataset 1" dataset link
28 |     And no search query has been entered
29 |     Then the "Search button" component should be visible
30 |     And the "Search button" component should be disabled
31 | 
32 |   Scenario: Verify search button is enabled if there is a query
33 |     Given the application has started
34 |     And the app is navigated to the 'Home' link
35 |     And the app is navigated to the "Test Dataset 1" dataset link
36 |     And a search query has been entered
37 |     Then the "Search button" component should be enabled
38 | 
39 |   Scenario: Verify results are shown
40 |     Given the application has started
41 |     And the app is navigated to the 'Home' link
42 |     And the app is navigated to the "Test Dataset 1" dataset link
43 |     And a search query has been entered
44 |     And the search button has been clicked
45 |     Then the "Results" component should be visible
46 |     And the "Results" component should have multiple rows shown
47 | 
48 |   Scenario: Verify the result modal is shown
49 |     Given the application has started
50 |     And the app is navigated to the 'Home' link
51 |     And the app is navigated to the "Test Dataset 1" dataset link
52 |     And a search query has been entered
53 |     And the search button has been clicked
54 |     And a result row modal button has been clicked
55 |     Then the "Details" component should be visible
56 |     # And the details component should be scrollable # can't be handled by the current test case whose text is too short to need scrolling


--------------------------------------------------------------------------------
/e2e/features/settings-page.feature:
--------------------------------------------------------------------------------
 1 | Feature: Settings Page
 2 | 
 3 |   Scenario: OpenAI API Key value is empty if settings store is empty
 4 |     Given the application has started
 5 |     When the settings store is empty
 6 |     And the app is navigated to the 'Settings / API Keys' link
 7 |     Then the "OpenAI API Key input" component should be visible
 8 |     And the "OpenAI API Key input" component should be empty
 9 | 
10 |   Scenario: new OpenAI API Key values are persisted and masked
11 |     Given the application has started
12 |     When the settings store is empty 
13 |     And the app is navigated to the 'Settings / API Keys' link
14 |     And the OpenAI API Key value is set on the page
15 |     And the "Save" component has been clicked
16 |     And the app is navigated to the 'Settings / API Keys' link
17 |     Then the "OpenAI API Key input" component should be visible
18 |     And the text of the "OpenAI API Key input" component is masked
19 |     And the text of the "OpenAI API Key input" component is a masked version of the set value.
20 | 
21 |   Scenario: after save, the app is navigated back to the home page
22 |     Given the application has started
23 |     When the settings store is empty 
24 |     And the app is navigated to the 'Settings / API Keys' link
25 |     And the OpenAI API Key value is set on the page
26 |     And the "Save" component has been clicked
27 |     Then the "Upload a Spreadsheet" component should be visible
28 |     And the "Existing Spreadsheets" component should be visible


--------------------------------------------------------------------------------
/e2e/features/upload-process.feature:
--------------------------------------------------------------------------------
 1 | Feature: Upload page
 2 | 
 3 |   Scenario: Verify upload page is shown once a file is selected
 4 |     Given the application has started
 5 |     And a file has been selected in the "Upload a Spreadsheet" component
 6 |     Then the "CSV Upload Settings" component should be visible
 7 |     And the "Preview" component should not be visible
 8 | 
 9 |   Scenario: Verify preview is shown if a column is selected
10 |     Given the application has started
11 |     And a file has been selected in the "Upload a Spreadsheet" component
12 |     And the column "paragraph" has been selected as column to embed
13 |     Then the "CSV Upload Settings" component should be visible
14 |     And the "Preview" component should be visible
15 |     And the "Cost Estimate" component should be visible
16 |     And the "Preview" component should contain a header row with name "paragraph"
17 |     And the "Preview" component should contain HTML linebreaks not unescaped newlines
18 | 
19 |   Scenario: Verify metadata columns are shown if metadata columns are selected
20 |     Given the application has started
21 |     And a file has been selected in the "Upload a Spreadsheet" component
22 |     And the column "paragraph" has been selected as column to embed
23 |     And the metadata column with name "font-size" has been selected 
24 |     And the metadata column with name "cik" has been selected 
25 |     Then the "CSV Upload Settings" component should be visible
26 |     And the "Preview" component should be visible
27 |     And the "Preview" component should contain a header row with name "font-size"
28 |     And the "Preview" component should contain a header row with name "cik"
29 | 
30 |   Scenario: Verify upload button is disabled if no column is selected
31 |     Given the application has started
32 |     # todo there needs to be a navigation to the homepage first, to reset things.
33 |     And a file has been selected in the "Upload a Spreadsheet" component
34 |     And no column has been selected as column to embed
35 |     Then the "CSV Upload Settings" component should be visible
36 |     And the "Preview" component should not be visible
37 |     And the "Upload button" component should be disabled
38 | 
39 |   Scenario: Verify upload button is enabled if a column is selected
40 |     Given the application has started
41 |     And a file has been selected in the "Upload a Spreadsheet" component
42 |     And the column "paragraph" has been selected as column to embed
43 |     Then the "CSV Upload Settings" component should be visible
44 |     And the "Preview" component should be visible
45 |     And the "Upload button" component should be enabled
46 | 
47 | 


--------------------------------------------------------------------------------
/e2e/step-definitions/api-key-status.steps.ts:
--------------------------------------------------------------------------------
 1 | import { Given, When, Then } from '@wdio/cucumber-framework';
 2 | import { expect, $$, $ } from '@wdio/globals';
 3 | import { execSync } from 'child_process';
 4 | 
 5 | 
 6 | //execSync('sqlite3  ./e2e/test-storage/metadata.db "CREATE TABLE IF NOT EXISTS meaningfully_settings (settings_id INTEGER PRIMARY KEY AUTOINCREMENT,  settings TEXT NOT NULL );" "CREATE TABLE IF NOT EXISTS document_sets ( set_id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL UNIQUE, upload_date TEXT NOT NULL, parameters TEXT NOT NULL, total_documents INTEGER NOT NULL DEFAULT 0);"');
 7 | 
 8 | // --- Steps ---
 9 | 
10 | Given("the settings store is empty", async () => {
11 |     // Reset the persisted settings by writing null for every field through window.api.setSettings.
12 |     await browser.execute(async () => {
13 |         // @ts-ignore
14 |         if (window.api && window.api.setSettings) {
15 |             await window.api.setSettings({ openAIKey: null, oLlamaModelType: null, oLlamaBaseURL: null }); // awaited so the step does not return before the call settles
16 |         }
17 |     });
18 |     await browser.pause(500); // Optional extra settle time: adjust as needed
19 | });
20 | 
21 | Given("the setting store has an OpenAI API Key value", async () => {
22 |     // Store a fake OpenAI key (and null Ollama options) through window.api.setSettings.
23 |     // The call is awaited inside the page so the step does not return before it settles.
24 |     await browser.execute(async () => {
25 |         // @ts-ignore
26 |         if (window.api && window.api.setSettings) {
27 |             await window.api.setSettings({ openAIKey: "sk-proj-meaningfullytesting-1234567890123456789012345678901234567890", oLlamaModelType: null, oLlamaBaseURL: null });
28 |         }
29 |     });
30 |     await browser.pause(500); // Optional extra settle time: adjust as needed
31 | });
32 | 
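33 | // NOTE: the active steps above already go through window.api.setSettings rather than shelling out to sqlite3; the commented-out drafts below are older variants.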
34 | // // Step: Simulate empty settings store.
35 | // When("the settings store is empty", async () => {
36 | //     // Implement your method to clear the settings store.
37 | //     // For example, using browser.execute to call your electronAPI:
38 | //     await browser.execute(() => {
39 | //         // @ts-ignore
40 | //         if (window.api && window.api.clearSettings) {
41 | //             window.api.clearSettings();
42 | //         }
43 | //     });
44 | //     await browser.pause(500);
45 | // });
46 | 
47 | // // Step: Simulate settings store with an OpenAI API Key value.
48 | // When("the setting store has an OpenAI API Key value", async () => {
49 | //     // Implement your method to add a key.
50 | //     await browser.execute(() => {
51 | //         // @ts-ignore
52 | //         if (window.api && window.api.setSettings) {
53 | //             // Set a dummy API key.
54 | //             window.api.setSettings({ openAIKey: "sk-dummyapikeyvalue", oLlamaModelType: "", oLlamaBaseURL: "" });
55 | //         }
56 | //     });
57 | //     await browser.pause(500);
58 | // });


--------------------------------------------------------------------------------
/e2e/step-definitions/common.steps.ts:
--------------------------------------------------------------------------------
 1 | import { Given, When, Then } from '@wdio/cucumber-framework';
 2 | import { expect, $$, $ } from '@wdio/globals';
 3 | 
 4 | Given("the application has started", async function () {
 5 |     // The WebdriverIO Electron service typically launches the app by itself,
 6 |     // so this step only leaves a short moment for the UI to stabilize.
 7 |     await browser.pause(500); // Optional: tune as needed
 8 | });
 9 | 
10 | Given("the page has been reloaded", async function () {
11 |     // Reload by navigating to the URL the browser is already showing
12 |     // (the alternative `browser.reloadSession()` call is left disabled).
13 |     const sameUrl = await browser.getUrl();
14 |     await browser.url(sameUrl);
15 | });
16 | 
17 | 
18 | // Step: click the component whose data-testid is derived from the given name.
19 | When('the {string} component has been clicked', async (componentName: string) => {
20 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
21 |     const target = await $(`[data-testid="${testId}"]`);
22 |     await target.waitForDisplayed({ timeout: 5000 });
23 |     await target.click();
24 |     await browser.pause(500);
25 | });
26 | 
27 | 
28 | 
29 | // These depend on the idea that the Feature file specifies a name that,
30 | // by convention, is the same as the data-testid attribute in the component
31 | // subject to lowercasing and spaces-to-dashes.
32 | Then("the {string} component should be visible", async (componentName: string) => {
33 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
34 |     // Look the component up by its conventional data-testid and assert it is displayed.
35 |     await expect(await $(`[data-testid="${testId}"]`)).toBeDisplayed();
36 | });
37 | 
38 | Then("the {string} component should not be visible", async (componentName: string) => {
39 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
40 |     // Look the component up by its conventional data-testid and assert it is not displayed.
41 |     await expect(await $(`[data-testid="${testId}"]`)).not.toBeDisplayed();
42 | });
43 | 
44 | // Navigation step: click the link inside `.navbar` whose text contains the given string.
45 | // The partial-text selector (a*=) lets a feature file name just part of the link label.
46 | Given("the app is navigated to the {string} link", async (linkText: string) => {
47 |     const navLink = await $('.navbar').$(`a*=${linkText}`);
48 |     // Wait for the link to be displayed first, as the other click steps do, so the click does not race rendering.
49 |     await navLink.waitForDisplayed({ timeout: 5000 });
50 |     await navLink.click();
51 | });
52 | 
53 | const DATASET_ROW_SELECTOR = '[data-testid="existing-spreadsheet-row"]'; // Selector for a single dataset row/item
54 | Given("the app is navigated to the {string} dataset link", async (linkText: string) => {
55 |     // `a=` matches an anchor whose text is exactly the dataset name (no partial match here).
56 |     const datasetLink = await $(`a=${linkText}`);
57 |     // Wait for the link to be displayed first so the click does not race the dataset list rendering.
58 |     await datasetLink.waitForDisplayed({ timeout: 5000 });
59 |     await datasetLink.click();
60 |     // const datasetRow = await $$(DATASET_ROW_SELECTOR);
61 |     // const filteredRows = await datasetRow.filter((elem) => !!elem.$(`a*="${linkText}"`));
62 |     // if (filteredRows.length === 0) {
63 |     //     throw new Error(`No link found with text: ${linkText}`);
64 |     // }
65 |     // const link = await filteredRows[0].$(`a*=${linkText}`);
66 |     // await link.waitForDisplayed({ timeout: 5000 });
67 |     // await link.click();
68 |     // The commented-out draft above scoped the lookup to DATASET_ROW_SELECTOR rows; it is not active.
69 | });
70 | 
71 | When("the {string} component has been set to {string}", async (componentName: string, val: string) => {
72 |     // Derive the data-testid from the component name (lowercased, spaces turned into dashes).
73 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
74 |     const field = await $(`[data-testid="${testId}"]`);
75 |     await field.waitForDisplayed({ timeout: 5000 });
76 |     // Wipe whatever the field currently holds, then enter the requested value.
77 |     await field.clearValue();
78 |     await field.setValue(val);
79 |     await browser.pause(500);
80 | });
81 | 
82 | 


--------------------------------------------------------------------------------
/e2e/step-definitions/initial_application_view.steps.ts:
--------------------------------------------------------------------------------
  1 | import { Given, When, Then } from '@wdio/cucumber-framework';
  2 | import { expect, $$, $ } from '@wdio/globals';
  3 | import { execSync } from 'child_process';
  4 | import path from 'path';
  5 | 
  6 | // --- Selectors ---
  7 | const DATASET_ROW_SELECTOR = '[data-testid="existing-spreadsheet-row"]'; // Selector for a single dataset row/item
  8 | 
  9 | // execSync('sqlite3  ./e2e/test-storage/metadata.db "CREATE TABLE IF NOT EXISTS meaningfully_settings (settings_id INTEGER PRIMARY KEY AUTOINCREMENT,  settings TEXT NOT NULL );" "CREATE TABLE IF NOT EXISTS document_sets ( set_id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL UNIQUE, upload_date TEXT NOT NULL, parameters TEXT NOT NULL, total_documents INTEGER NOT NULL DEFAULT 0);"');
 10 | 
 11 | Given("the metadata store is empty", async () => {
 12 |     // execSync('sqlite3  ./e2e/test-storage/metadata.db "DELETE FROM document_sets"');
 13 |     // Intentionally a no-op: the metadata store starts out empty,
 14 |     // so there is nothing to clear in this step.
 15 | });
 16 | 
 17 | Given("the uploadCsv function has been mocked", async () => {
 18 |     // Wraps window.api.uploadCsv, through the window.testHooks.overrideUploadCsv hook, so that
 19 |     // every upload is sent on with modelProvider "mock" instead of the provider the UI chose.
 20 |     return browser.execute(() => {
 21 |         // @ts-ignore
 22 |         if (window.testHooks && window.testHooks.overrideUploadCsv) {
 23 |             const originalUploadCsv = window.api.uploadCsv;
 24 |             window.testHooks.overrideUploadCsv(async (formData: {
 25 |                     file: File,
 26 |                     datasetName: string,
 27 |                     description: string,
 28 |                     textColumns: string[],
 29 |                     metadataColumns: string[],
 30 |                     splitIntoSentences: boolean,
 31 |                     combineSentencesIntoChunks: boolean,
 32 |                     sploderMaxSize: number,
 33 |                     chunkSize: number,
 34 |                     chunkOverlap: number,
 35 |                     modelName: string,
 36 |                     modelProvider: string
 37 |                   }) => {
 38 |                 console.log("Mock uploadCsv called with:", formData);
 39 |                 // Forward a copy instead of mutating the caller's object; only modelProvider is
 40 |                 // overridden, to "mock", so we don't hit a paid API.
 41 |                 return originalUploadCsv({ ...formData, modelProvider: "mock" });
 42 |             });
 43 |             // When the overrideUploadCsv hook is absent, uploadCsv is left untouched.
 44 |         }
 45 |     });
 46 | });
 47 | // And a dataset "Test Dataset 1" has been uploaded
 48 | // And a dataset "Test Dataset 2" has been uploaded
 49 | // Given("a dataset {string} has been uploaded", async (datasetName: string) => {
 50 | //     const localFilePath = path.resolve(process.cwd(), 'e2e/test-storage/constellation-test.csv');
 51 | //     // Upload the file to the Selenium/Electron server.
 52 | //     const remoteFilePath = await browser.uploadFile(localFilePath);
 53 | 
 54 | //     await browser.execute((index, remotePath) => {
 55 | //         // In the browser context, use fetch to retrieve the uploaded file as a blob.
 56 | //         fetch(remotePath)
 57 | //             .then(response => response.blob())
 58 | //             .then(blob => {
 59 | //                 const file = new File([blob], 'constellation-test.csv', { type: "text/csv" });
 60 | //                 // Use your app’s API to simulate the upload.
 61 | //                 if (window.api && window.api.uploadCsv) {
 62 | //                     window.api.uploadCsv({
 63 | //                         file: file,
 64 | //                         datasetName: datasetName,
 65 | //                         description: "",
 66 | //                         textColumns: ["paragraph"],
 67 | //                         metadataColumns: ["cik", "classification"],
 68 | //                         splitIntoSentences: true,
 69 | //                         combineSentencesIntoChunks: true,
 70 | //                         sploderMaxSize: 500,
 71 | //                         chunkSize: 100,
 72 | //                         chunkOverlap: 20,
 73 | //                         modelName: "text-embedding-3-small",
 74 | //                         modelProvider: "mock"
 75 | //                     });
 76 | //                 }
 77 | //             });
 78 | //     }, remoteFilePath, datasetName);
 79 | //     await browser.pause(500);
 80 | // });
 81 | 
 82 | 
 83 | Given("the metadata store contains {int} entries", async (count: number) => {
 84 |     // Resolve the local path for the CSV you want to upload.
 85 |     const localFilePath = path.resolve(process.cwd(), 'e2e/test-storage/constellation-test.csv');
 86 |     // Upload the file to the Selenium/Electron server.
 87 |     const remoteFilePath = await browser.uploadFile(localFilePath);
 88 |     // Create the datasets one after another; they are named "Test 1", "Test 2", ...
 89 |     for (let i = 0; i < count; i++) {
 90 |         // Pass the loop index and the remote file path into browser.execute.
 91 |         await browser.execute(async (index, remotePath) => {
 92 |             // In the browser context, use fetch to retrieve the uploaded file as a blob.
 93 |             // Everything is awaited so browser.execute waits for the upload instead of firing and forgetting it.
 94 |             const response = await fetch(remotePath);
 95 |             const blob = await response.blob();
 96 |             const file = new File([blob], 'constellation-test.csv', { type: "text/csv" });
 97 |             // Use your app’s API to simulate the upload.
 98 |             if (window.api && window.api.uploadCsv) {
 99 |                 await window.api.uploadCsv({
100 |                     file: file,
101 |                     datasetName: `Test ${index + 1}`,
102 |                     description: "",
103 |                     textColumns: ["paragraph"],
104 |                     metadataColumns: ["cik", "classification"],
105 |                     splitIntoSentences: true,
106 |                     combineSentencesIntoChunks: true,
107 |                     sploderMaxSize: 500,
108 |                     chunkSize: 100,
109 |                     chunkOverlap: 20,
110 |                     modelName: "text-embedding-3-small",
111 |                     modelProvider: "mock"
112 |                 });
113 |             }
114 |         }, i, remoteFilePath);
115 |     }
116 |     // A failed fetch or upload now rejects the step instead of being silently dropped.
117 |     // Pause a bit longer to let the UI reflect the uploads.
118 |     await browser.pause(500);
119 | });
120 | 
121 | Then("no datasets should be listed", async () => {
122 |     // An empty listing means no element matches the dataset-row selector.
123 |     await expect(await $$(DATASET_ROW_SELECTOR)).toBeElementsArrayOfSize(0);
124 | });
125 | 
126 | Then("{int} datasets should be listed", async (expectedCount: number) => {
127 |     // The number of rendered dataset rows must equal the count named in the step.
128 |     await expect(await $$(DATASET_ROW_SELECTOR)).toBeElementsArrayOfSize(expectedCount);
129 | });
130 | 
131 | Then("the dataset {string} should be listed", async (datasetName: string) => {
132 |     const firstCellTexts = await $$(DATASET_ROW_SELECTOR).map((row) => row.$$('td')[0].getText());
133 |     expect(firstCellTexts).toContain(datasetName);
134 | });
135 | 
136 | ///////////////////////////////////////////////////////////////////////////////////////////
137 | 
138 | 
139 | 
140 | 
141 | // Given("the metadata store is empty", async () => {
142 | //     // TODO: Implement logic to ensure the metadata store is empty.
143 | //     // This might involve:
144 | //     // - Calling a specific function via browser.execute:
145 | //     //   await browser.execute(() => (window as any).electronAPI.clearMetadataStore());
146 | //     // - Interacting with the UI to clear data if applicable.
147 | //     // - Restarting the app in a clean state (might be handled by wdio setup).
148 | //     console.warn("Step 'the metadata store is empty' requires implementation.");
149 | // });
150 | 
151 | // Given("the metadata store contains {int} entries", async (count: number) => {
152 | //     // TODO: Implement logic to populate the metadata store with 'count' entries.
153 | //     // Similar to the empty state, this might involve:
154 | //     // - Calling a function via browser.execute:
155 | //     //   await browser.execute((num) => (window as any).electronAPI.addMockMetadata(num), count);
156 | //     // - UI interactions to add data.
157 | //     console.warn(`Step 'the metadata store contains ${count} entries' requires implementation.`);
158 | //     // Add a small pause if data loading is asynchronous
159 | //     await browser.pause(200);
160 | // });
161 | 
162 | 
163 | // Then("no datasets should be listed", async () => {
164 | //     const datasets = await $$(DATASET_ROW_SELECTOR);
165 | //     await expect(datasets).toBeElementsArrayOfSize(0);
166 | // });
167 | 
168 | // Then("{int} datasets should be listed", async (expectedCount: number) => {
169 | //     const datasets = await $$(DATASET_ROW_SELECTOR);
170 | //     await expect(datasets).toBeElementsArrayOfSize(expectedCount);
171 | // });


--------------------------------------------------------------------------------
/e2e/step-definitions/search_page.steps.ts:
--------------------------------------------------------------------------------
 1 | import { Given, When, Then } from '@wdio/cucumber-framework';
 2 | import { expect, $, $$ } from '@wdio/globals';
 3 | 
 4 | 
 5 | // Step: type a fixed sample query into the search bar.
 6 | When("a search query has been entered", async () => {
 7 |     const queryBox = await $('[data-testid="search-bar"]');
 8 |     // Make sure the search bar is displayed before typing into it.
 9 |     await queryBox.waitForDisplayed({ timeout: 5000 });
10 |     await queryBox.setValue("test search query");
11 |     await browser.pause(500);
12 | });
13 | 
14 | When("no search query has been entered", async () => {
15 |     const queryBox = await $('[data-testid="search-bar"]');
16 |     await queryBox.waitForDisplayed({ timeout: 5000 });
17 |     // Empty the search bar so that no query is present.
18 |     await queryBox.clearValue();
19 |     await browser.pause(500);
20 | });
21 | 
22 | 
23 | // Step: compare the search button's `disabled` attribute with the requested state.
24 | Then("the search button is {string}", async (state: string) => {
25 |     const button = await $('[data-testid="search-button"]');
26 |     await button.waitForDisplayed({ timeout: 5000 });
27 |     // A null attribute value is read as "enabled"; anything else as "disabled".
28 |     const disabledAttr = await button.getAttribute("disabled");
29 |     switch (state) {
30 |         case "disabled": expect(disabledAttr).not.toBeNull(); break;
31 |         case "enabled": expect(disabledAttr).toBeNull(); break;
32 |         // Any other word in the feature file is rejected outright.
33 |         default: throw new Error(`Unknown state: ${state}`);
34 |     }
35 | });
36 | 
37 | // Step: press the search button and leave time for the results to appear.
38 | When("the search button has been clicked", async () => {
39 |     const button = await $('[data-testid="search-button"]');
40 |     await button.waitForDisplayed({ timeout: 5000 });
41 |     await button.click();
42 |     // Fixed one-second wait so the search results can load.
43 |     await browser.pause(1000);
44 | });
45 | 
46 | // Step: verify that the named component shows more than one row.
47 | // Only "Results" is supported; its rows are taken to be the <tr> elements inside it.
48 | Then("the {string} component should have multiple rows shown", async (componentName: string) => {
49 |     // Guard: reject any component this step does not know how to count rows for.
50 |     if (componentName !== "Results") {
51 |         throw new Error(`Unknown component for rows: ${componentName}`);
52 |     }
53 |     // Every <tr> below the element with data-testid="results" counts as a row.
54 |     // Adjust this selector if the Results markup is different.
55 |     const rowSelector = '[data-testid="results"] tr';
56 |     const resultRows = await $$(rowSelector);
57 |     // "Multiple" means at least two matching rows.
58 |     const rowCount = resultRows.length;
59 |     expect(rowCount).toBeGreaterThan(1);
60 | });
61 | 
62 | // Step: open the details modal from a result row.
63 | When("a result row modal button has been clicked", async () => {
64 |     // Result rows are assumed to carry a button with data-testid="result-modal-button".
65 |     // `$` resolves a single element, so only the first matching button is used.
66 |     const firstModalButton = await $('[data-testid="result-modal-button"]');
67 |     // No explicit existence check is made here; a missing button surfaces
68 |     // as a failure of the click below.
69 |     await firstModalButton.click();
70 |     // Fixed one-second wait after the click.
71 |     await browser.pause(1000);
72 | });
73 | 
74 | // Step: verify that the details component has more content than it can show at once.
75 | Then("the details component should be scrollable", async () => {
76 |     const detailsPane = await $('[data-testid="details"]');
77 |     await detailsPane.waitForDisplayed({ timeout: 5000 });
78 |     // Scrollable here means scrollHeight is strictly greater than clientHeight.
79 |     const contentHeight = await detailsPane.getProperty("scrollHeight");
80 |     const visibleHeight = await detailsPane.getProperty("clientHeight");
81 |     expect(contentHeight).toBeGreaterThan(visibleHeight);
82 | });


--------------------------------------------------------------------------------
/e2e/step-definitions/settings_page.steps.ts:
--------------------------------------------------------------------------------
 1 | import { Given, When, Then } from '@wdio/cucumber-framework';
 2 | import { expect, $ } from '@wdio/globals';
 3 | 
 4 | // Selectors
 5 | const OPENAI_API_KEY_INPUT = '[data-testid="openai-api-key-input"]';
 6 | const SAVE_BUTTON = '[data-testid="save"]';
 7 | const FAKE_API_KEY = "sk-proj-meaningfullytesting-1234567890123456789012345678901234567890"
 8 | 
 9 | // Step: type the fake OpenAI API key into the key input on the page.
10 | When("the OpenAI API Key value is set on the page", async () => {
11 |     const keyField = await $(OPENAI_API_KEY_INPUT);
12 |     await keyField.waitForDisplayed({ timeout: 5000 });
13 |     // Remove any value already in the field...
14 |     await keyField.clearValue();
15 |     // ...then enter the test key.
16 |     await keyField.setValue(FAKE_API_KEY);
17 |     await browser.pause(500);
18 | });
19 | 
20 | // // Step: Simulate clicking the Save button.
21 | // When('the "Save" component has been clicked', async () => {
22 | //     const btn = await $(SAVE_BUTTON);
23 | //     await btn.waitForDisplayed({ timeout: 5000 });
24 | //     await btn.click();
25 | //     await browser.pause(500);
26 | // });
27 | 
28 | Then('the {string} component should be empty', async (componentName: string) => {
29 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
30 |     const field = await $(`[data-testid="${testId}"]`);
31 |     await field.waitForDisplayed({ timeout: 5000 });
32 |     // An empty component is one whose current value is the empty string.
33 |     const currentValue = await field.getValue();
34 |     expect(currentValue).toBe("");
35 | });
36 | 
37 | // Then: the named component's value must look masked.
38 | Then('the text of the {string} component is masked', async (componentName: string) => {
39 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
40 |     const field = await $(`[data-testid="${testId}"]`);
41 |     await field.waitForDisplayed({ timeout: 5000 });
42 |     // Masking is assumed to insert "*******" into the displayed value.
43 |     const shownValue = await field.getValue();
44 |     expect(shownValue).toContain("*******");
45 | });
46 | 
47 | // Then: the named component must show a masked form of the key, never the key itself.
48 | Then('the text of the {string} component is a masked version of the set value.', async (componentName: string) => {
49 |     const testId = componentName.toLowerCase().replace(/ /g, '-');
50 |     const field = await $(`[data-testid="${testId}"]`);
51 |     await field.waitForDisplayed({ timeout: 5000 });
52 |     const shownValue = await field.getValue();
53 |     // First check: the mask marker "*******" is present in the value.
54 |     expect(shownValue).toContain("*******");
55 |     // Second check: the value is not the plain fake key.
56 |     expect(shownValue).not.toEqual(FAKE_API_KEY);
57 | });


--------------------------------------------------------------------------------
/e2e/step-definitions/upload_process.steps.ts:
--------------------------------------------------------------------------------
  1 | import { Given, When, Then } from '@wdio/cucumber-framework';
  2 | import path from 'path';
  3 | import { expect, $$, $ } from '@wdio/globals';
  4 | 
  5 | // Selectors – adjust these if needed.
  6 | const UPLOAD_COMPONENT_SELECTOR = '[data-testid="upload-a-spreadsheet"]';
  7 | const CSV_UPLOAD_PAGE_SELECTOR = '[data-testid="csv-upload-settings"]';
  8 | const PREVIEW_COMPONENT_SELECTOR = '[data-testid="preview"]';
  9 | 
 10 | const TEST_CSV_FILE_NAME = "newline-test.csv"; // The name of the test CSV file to use.
 11 | const INDEX_OF_COLUMN_TO_EMBED = 4; 
 12 | 
 13 | // Step: pick the test CSV in the file input that sits inside the named component.
 14 | Given(
 15 |     "a file has been selected in the {string} component",
 16 |     async (componentName: string) => {
 17 |         // The file input is looked up beneath the component's conventional data-testid.
 18 |         const testId = componentName.toLowerCase().replace(/ /g, '-');
 19 |         const picker = await $(`[data-testid="${testId}"] input[type="file"]`);
 20 |         // Absolute path of the fixture, resolved from the current working directory.
 21 |         const localCsv = path.resolve(process.cwd(), `e2e/test-storage/${TEST_CSV_FILE_NAME}`);
 22 |         // uploadFile returns the path the file can be referenced by afterwards
 23 |         // (presumably a temporary copy on the driver side).
 24 |         const driverSidePath = await browser.uploadFile(localCsv);
 25 |         await picker.setValue(driverSidePath);
 26 |         // Dispatch a bubbling change event by hand, in case setting the value
 27 |         // alone did not notify the app.
 28 |         await browser.execute((input: HTMLInputElement) => {
 29 |             const changeEvent = new Event('change', { bubbles: true });
 30 |             input.dispatchEvent(changeEvent);
 31 |         }, picker);
 32 |         // Fixed one-second wait for the file selection to be processed.
 33 |         await browser.pause(1000);
 34 |     }
 35 | );
 36 | 
 37 | // Step: Choose the column to embed by its visible name.
 38 | When("the column {string} has been selected as column to embed", async (columnName: string) => {
 39 |     // The CSV Upload page contains a <select> for the text column.
 40 |     // Target it by its data-testid inside the CSV Upload page.
 41 |     const selectSelector = `${CSV_UPLOAD_PAGE_SELECTOR} select[data-testid="column-to-embed-select"]`;
 42 |     const selectElem = await $(selectSelector);
 43 |     await selectElem.waitForDisplayed({ timeout: 5000 });
 44 |     // Select the option whose visible text equals the column name given in the feature file.
 45 |     await selectElem.selectByVisibleText(columnName);
 46 |     await browser.pause(500);
 47 | });
 48 | When("no column has been selected as column to embed", async () => {
 49 |     const selectSelector = `${CSV_UPLOAD_PAGE_SELECTOR} select[data-testid="column-to-embed-select"]`;
 50 |     // Reset the select to its default option; this assumes that option has an empty value attribute.
 51 |     await $(selectSelector).selectByAttribute('value', '');
 52 |     await browser.pause(500);
 53 | });
 54 | 
 55 | // Step: Select one metadata column by clicking its checkbox (element id "metadata-<column name>").
 56 | When("the metadata column with name {string} has been selected", async (columnName: string) => {
 57 |     const checkboxSelector = `${CSV_UPLOAD_PAGE_SELECTOR} input[type="checkbox"][id="metadata-${columnName}"]`;
 58 |     const checkbox = await $(checkboxSelector);
 59 |     await checkbox.click();
 60 |     await browser.pause(500);
 61 | });
 62 | 
 63 | // Step: Verify that the header row of the Preview table contains the given column name.
 64 | Then(
 65 |     'the {string} component should contain a header row with name {string}',
 66 |     async (componentName: string, columnName: string) => {
 67 |         // Only "Preview" is supported; any other component name throws. Assumes Preview renders a table with a <thead> row.
 68 |         let selector = "";
 69 |         if (componentName === "Preview") {
 70 |             selector = `${PREVIEW_COMPONENT_SELECTOR} table thead tr`;
 71 |         } else {
 72 |             throw new Error(`Unknown component: ${componentName}`);
 73 |         }
 74 |         const headerRow = await $(selector);
 75 |         await headerRow.waitForDisplayed({ timeout: 5000 });
 76 |         const headerText = await headerRow.getText();
 77 |         expect(headerText).toContain(columnName);
 78 |     }
 79 | );
 80 | Then ('the "Preview" component should contain HTML linebreaks not unescaped newlines', async () => {
 81 |         // Inspect the first data cell (<td>) of the table rendered by the Preview component.
 82 |         let selector = `${PREVIEW_COMPONENT_SELECTOR} table td`;
 83 |         const dataRows = await $$(selector);
 84 |         await dataRows[0].waitForDisplayed({ timeout: 5000 });
 85 |         const cellText = await dataRows[0].getText(); 
 86 |         const cellHTML = await dataRows[0].getHTML(); // TIGHT-COUPLING: This assumes that the first cell of newline-test.csv contains text with linebreaks (with a \n in the CSV, which should be a <br> in the component under test).
 87 |         console.log('cellText: ', cellText);
 88 |         console.log('cellHTML: ', cellHTML); // reuse the HTML fetched above instead of querying the element again
 89 |         // Check if the cell markup contains HTML linebreaks
 90 |         const hasLineBreaks = cellHTML.includes('<br />');
 91 |         // Check that the visible text does not contain a literal backslash-n sequence
 92 |         const hasUnescapedNewlines = cellText.includes('\\n');
 93 |         expect(hasLineBreaks).toBe(true);
 94 |         expect(hasUnescapedNewlines).toBe(false);
 95 |     }
 96 | );
 97 | Then(
 98 |     'the {string} component should be disabled',
 99 |     async (componentName: string) => {
100 |         // Look the component up by data-testid (lower-cased, hyphenated component name).
101 |         const selector = `[data-testid="${componentName
102 |             .toLowerCase()
103 |             .replace(/ /g, '-')}"]`;
104 |         const component = await $(selector);
105 |         await component.waitForDisplayed({ timeout: 5000 });
106 |         await expect(component).toBeDisabled(); // element matchers are async: await so a failure fails this step
107 |     }
108 | );
109 | 
110 | Then(
111 |     'the {string} component should be enabled',
112 |     async (componentName: string) => {
113 |         // Look the component up by data-testid (lower-cased, hyphenated component name).
114 |         const selector = `[data-testid="${componentName
115 |             .toLowerCase()
116 |             .replace(/ /g, '-')}"]`;
117 |         const component = await $(selector);
118 |         await component.waitForDisplayed({ timeout: 5000 });
119 |         await expect(component).toBeEnabled(); // element matchers are async: await so a failure fails this step
120 |     }
121 | );


--------------------------------------------------------------------------------
/e2e/test-storage/constellation-test.csv:
--------------------------------------------------------------------------------
1 | ,Unnamed: 0,cik,fn,paragraph,classification,paragraph_index,font-size,font-family,font-style,font-weight,line-height,text-align,width,margin-bottom,margin-top,text-indent,vertical-align,color,text_len,pct_numbers
2 | 1363,756,19617,filings_raw/0000019617-2024Q1-8-K-jpm-20240116.html,"JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.  Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.  He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.",body,22,10pt,"""Amplitude TF"", sans-serif",,400,120%,,,,,,,#000,482.0,0.029045643153526972
3 | 


--------------------------------------------------------------------------------
/e2e/test-storage/newline-test.csv:
--------------------------------------------------------------------------------
1 | ,Unnamed: 0,cik,fn,paragraph,classification,paragraph_index,font-size,font-family,font-style,font-weight,line-height,text-align,width,margin-bottom,margin-top,text-indent,vertical-align,color,text_len,pct_numbers
2 | 1363,756,19617,filings_raw/0000019617-2024Q1-8-K-jpm-20240116.html,"JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.\n Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.\n He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.",body,22,10pt,"""Amplitude TF"", sans-serif",,400,120%,,,,,,,#000,482.0,0.029045643153526972
3 | 


--------------------------------------------------------------------------------
/electron-builder.yml:
--------------------------------------------------------------------------------
 1 | appId: com.electron.app
 2 | productName: meaningfully
 3 | directories:
 4 |   buildResources: build
 5 | files:
 6 |   - '!**/.vscode/*'
 7 |   - '!src/*'
 8 |   - '!electron.vite.config.{js,ts,mjs,cjs}'
 9 |   - '!{.eslintignore,.eslintrc.cjs,.prettierignore,.prettierrc.yaml,dev-app-update.yml,CHANGELOG.md,README.md}'
10 |   - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
11 |   - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
12 |   - '!{.git,.github,.husky,.idea,.vscode}'
13 |   - '!{coverage,docs,public,e2e}'
14 |   - '!{scripts}'
15 | asarUnpack:
16 |   - resources/**
17 | win:
18 |   executableName: meaningfully
19 | nsis:
20 |   artifactName: ${name}-${version}-setup.${ext}
21 |   shortcutName: ${productName}
22 |   uninstallDisplayName: ${productName}
23 |   createDesktopShortcut: always
24 | mac:
25 |   extendInfo:
26 |     - NSDocumentsFolderUsageDescription: Application requests access to the user's Documents folder.
27 |     - NSDownloadsFolderUsageDescription: Application requests access to the user's Downloads folder.
28 |   notarize: false
29 |   category: public.app-category.utilities # from Claude, for building w/o signing
30 |   target: "dmg" # from Claude, for building w/o signing
31 |   identity: null # from Claude, for building w/o signing
32 |   hardenedRuntime: false # from Claude, for building w/o signing
33 |   gatekeeperAssess: false # from Claude, for building w/o signing
34 |   entitlements: null # from Claude, for building w/o signing
35 |   entitlementsInherit: null # from Claude, for building w/o signing
36 |   # entitlementsInherit: build/entitlements.mac.plist # re-enable for building WITH signing.
37 |   signIgnore: # from Claude, for building w/o signing
38 |     - "node_modules"
39 |   cscLink: ${CSC_LINK}
40 |   cscKeyPassword: "${CSC_KEY_PASSWORD}"
41 | dmg:
42 |   artifactName: ${name}-${version}.${arch}.${ext}
43 |   sign: false
44 | linux:
45 |   target:
46 |     - AppImage
47 |     - deb
48 | #    - snap
49 |   maintainer: electronjs.org
50 |   category: Utility
51 | appImage:
52 |   artifactName: ${name}-${version}.${ext}
53 | deb: 
54 |   artifactName: ${name}-${version}.${ext}
55 |   depends:
56 |     - libappindicator1
57 |     - libgconf-2-4
58 |     - libgtk-3-0
59 |     - libnotify4
60 |     - libnss3
61 |     - libx11-xcb1
62 |     - libxss1
63 |     - libasound2
64 |     - libgbm-dev
65 |     - gconf-service-backend
66 |   priority: optional
67 |   description: "Meaningfully is a semantic search tool for text data in spreadsheets."
68 | npmRebuild: false
69 | publish:
70 |   provider: generic
71 |   url: https://example.com/auto-updates
72 | electronDownload:
73 |   mirror: https://npmmirror.com/mirrors/electron/
74 | # afterSign: "./scripts/notarize.js" # from Claude, for building w/o signing
75 | 


--------------------------------------------------------------------------------
/electron.vite.config.ts:
--------------------------------------------------------------------------------
 1 | import { defineConfig, externalizeDepsPlugin } from 'electron-vite'
 2 | import { svelte } from '@sveltejs/vite-plugin-svelte'
 3 | import tailwindcss from '@tailwindcss/vite'
 4 | 
 5 | export default defineConfig({ // one section per build target: main, preload, renderer
 6 |   main: {
 7 |     plugins: [externalizeDepsPlugin()]
 8 |   },
 9 |   preload: {
10 |     plugins: [externalizeDepsPlugin()]
11 |   },
12 |   renderer: {
13 |     plugins: [svelte(), tailwindcss()] // only the renderer target uses the Svelte and Tailwind plugins
14 |   }
15 | })
16 | 


--------------------------------------------------------------------------------
/eslint.config.cjs:
--------------------------------------------------------------------------------
 1 | module.exports = { // NOTE(review): eslintrc-style keys (extends/overrides/parserOptions) in a file named like a flat config — confirm which config format ESLint loads here
 2 |   parserOptions: {
 3 |     extraFileExtensions: ['.svelte']
 4 |   },
 5 |   extends: [
 6 |     'eslint:recommended',
 7 |     'plugin:svelte/recommended',
 8 |     '@electron-toolkit/eslint-config-ts/recommended',
 9 |     '@electron-toolkit/eslint-config-prettier'
10 |   ],
11 |   overrides: [
12 |     {
13 |       files: ['*.svelte'], // Svelte files use the Svelte parser, with the TypeScript parser as its nested parser
14 |       parser: 'svelte-eslint-parser',
15 |       parserOptions: {
16 |         parser: '@typescript-eslint/parser'
17 |       }
18 |     }
19 |   ],
20 |   rules: {
21 |     'svelte/no-unused-svelte-ignore': 'off',
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "meaningfully",
 3 |   "version": "0.0.2",
 4 |   "description": "Semantic search over CSVs",
 5 |   "main": "./out/main/index.js",
 6 |   "author": "example.com",
 7 |   "homepage": "https://electron-vite.org",
 8 |   "type": "module",
 9 |   "scripts": {
10 |     "format": "prettier --plugin prettier-plugin-svelte --write .",
11 |     "lint": "eslint . --ext .js,.jsx,.cjs,.mjs,.ts,.tsx,.cts,.mts --fix",
12 |     "typecheck:node": "tsc --noEmit -p tsconfig.node.json --composite false",
13 |     "svelte-check": "svelte-check --tsconfig ./tsconfig.json",
14 |     "typecheck": "npm run typecheck:node && npm run svelte-check",
15 |     "start": "electron-vite preview",
16 |     "dev": "electron-vite dev",
17 |     "build": "npm run typecheck && electron-vite build",
18 |     "postinstall": "electron-builder install-app-deps",
19 |     "build:unpack": "npm run build && electron-builder --dir",
20 |     "build:win": "npm run build && electron-builder --win",
21 |     "build:mac": "npm run build && electron-builder --mac",
22 |     "build:linux": "npm run build && electron-builder --linux",
23 |     "test": "vitest",
24 |     "wdio": "NODE_ENV=test wdio run ./wdio.conf.ts"
25 |   },
26 |   "dependencies": {
27 |     "@electron-toolkit/preload": "^3.0.1",
28 |     "@electron-toolkit/utils": "^3.0.0",
29 |     "@llamaindex/weaviate": "^0.0.20",
30 |     "better-sqlite3": "^11.8.1",
31 |     "electron-updater": "^6.1.7",
32 |     "js-tiktoken": "^1.0.8",
33 |     "llamaindex": "^0.8.37",
34 |     "lodash": "^4.17.21",
35 |     "natural": "^8.0.1",
36 |     "svelte-routing": "^2.13.0",
37 |     "weaviate-client": "^3.5.3",
38 |     "weaviate-ts-embedded": "github:jeremybmerrill/typescript-embedded"
39 |   },
40 |   "devDependencies": {
41 |     "@electron-toolkit/eslint-config-prettier": "^2.0.0",
42 |     "@electron-toolkit/eslint-config-ts": "^2.0.0",
43 |     "@electron-toolkit/tsconfig": "^1.0.1",
44 |     "@sveltejs/vite-plugin-svelte": "^4.0.4",
45 |     "@tailwindcss/vite": "^4.0.6",
46 |     "@testing-library/webdriverio": "^3.2.1",
47 |     "@types/lodash": "^4.17.17",
48 |     "@types/natural": "^5.1.5",
49 |     "@types/node": "^20.14.8",
50 |     "@wdio/cli": "^9.12.6",
51 |     "@wdio/cucumber-framework": "^9.12.6",
52 |     "@wdio/local-runner": "^9.12.6",
53 |     "@wdio/spec-reporter": "^9.12.6",
54 |     "electron": "^31.0.2",
55 |     "electron-builder": "^24.13.3",
56 |     "electron-vite": "^2.3.0",
57 |     "eslint": "^8.57.0",
58 |     "eslint-plugin-svelte": "^2.45.1",
59 |     "prettier": "^3.3.2",
60 |     "prettier-plugin-svelte": "^3.2.6",
61 |     "svelte": "^5.0.0",
62 |     "svelte-check": "^4.0.0",
63 |     "tailwindcss": "^4.0.6",
64 |     "tslib": "^2.6.3",
65 |     "typescript": "^5.7.3",
66 |     "vite": "^5.4.4",
67 |     "vite-node": "^3.0.8",
68 |     "vitest": "^3.0.8",
69 |     "wdio-electron-service": "^8.1.0",
70 |     "wdio-wait-for": "^3.1.0"
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/python-prototype/embed.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | # In[1]:
  5 | 
  6 | 
  7 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
  8 | from glob import glob
  9 | 
 10 | from functools import wraps
 11 | from time import time
 12 | from datetime import datetime
 13 | 
 14 | 
 15 | # In[3]:
 16 | 
 17 | 
 18 | 
 19 | # In[5]:
 20 | 
 21 | def load_csv_to_documents(fname, text_column_name):
 22 |     from non_stupid_csv_reader import NonStupidCSVReader
 23 |     #TODO: automatically guess the text column name
 24 |     # TODO: let users select multiple text columns (sewing them together into one entry? or embedding them separately? idk.)
 25 |     documents = NonStupidCSVReader().load_data(fname, "paragraph")
 26 | 
 27 |     # TODO: if embedding multiple columns, this is a way to embed them together.
 28 |     # bizarrely, by default, LlamaIndex embeds the metadata too.
 29 |     # we don't want that. Just embed the dang'd text.
 30 |     # cf. https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents/
 31 |     for document in documents:
 32 |         document.excluded_embed_metadata_keys = document.metadata.keys()
 33 | 
 34 | 
 35 | # In[6]:
 36 | 
 37 | 
 38 | # just for lookin'
 39 | 
 40 | len(documents)
 41 | 
 42 | 
 43 | # In[19]:
 44 | 
 45 | 
 46 | from llama_index.core.schema import TextNode, TransformComponent, NodeRelationship
 47 | from llama_index.core.node_parser import SentenceSplitter
 48 | 
 49 | class Sploder(TransformComponent):  # adds combined-sentence nodes alongside each short input node
 50 |     def __call__(self, nodes, max_string_token_count=50, **kwargs):  # NOTE(review): get_text_transformations passes max_string_token_count to the constructor, not to this call — confirm it takes effect
 51 |         new_nodes = []
 52 |         splitter = SentenceSplitter() # just for the token_size method
 53 |                                       # TODO just go get the token_size method
 54 |         for node in nodes:
 55 |             new_nodes.append(node)  # the input node itself is always kept
 56 |             if splitter._token_size(node.text) > max_string_token_count: continue  # only nodes at or under the token limit get combined variants
 57 |             if NodeRelationship.NEXT in node.relationships:  # variant 1: this node's text + the next node's original_text
 58 |                 b_c_node = TextNode(
 59 |                                 text =  node.text + \
 60 |                                         node.relationships[NodeRelationship.NEXT].metadata["original_text"], 
 61 |                                 metadata=node.metadata)
 62 |                 new_nodes.append(b_c_node)
 63 |             if NodeRelationship.NEXT in node.relationships and NodeRelationship.PREVIOUS in node.relationships:  # variant 2: previous + this + next
 64 |                 a_b_c_node = TextNode(text=node.relationships[NodeRelationship.PREVIOUS].metadata["original_text"] + \
 65 |                                        node.text + \
 66 |                                        node.relationships[NodeRelationship.NEXT].metadata["original_text"],
 67 |                                   metadata=node.metadata
 68 |                                  )
 69 |                 new_nodes.append(a_b_c_node)
 70 |         return new_nodes
 71 | 
 72 | 
 73 | # In[20]:
 74 | 
 75 | 
 76 | from typing import Any, Callable, List
 77 | from llama_index.core import Document
 78 | from llama_index.embeddings.openai import OpenAIEmbedding
 79 | from llama_index.core.node_parser import SentenceWindowNodeParser
 80 | from llama_index.core.extractors import TitleExtractor
 81 | from llama_index.core.ingestion import IngestionPipeline, IngestionCache
 82 | from llama_index.core.node_parser.text.utils import split_by_sentence_tokenizer_internal
 83 | 
 84 | import openai
 85 | import os
 86 | from dotenv import load_dotenv
 87 | load_dotenv()
 88 | 
 89 | import tiktoken
 90 | 
 91 | 
 92 | openai.api_key = os.environ["OPENAI_API_KEY"]
 93 | # Settings.embed_model = OpenAIEmbedding(model=MODEL_NAME)
 94 | 
 95 | 
 96 | # Returns a text -> list-of-sentences callable; adapted from https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/node_parser/text/utils.py
 97 | def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
 98 |     # Punkt tokenizer with a hand-picked abbreviation list, via https://stackoverflow.com/questions/14095971/how-to-tweak-the-nltk-sentence-tokenizer
 99 |     from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
100 |     punkt_param = PunktParameters()    
101 |     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'corp', 'ltd'])
102 |     tokenizer = PunktSentenceTokenizer(punkt_param)
103 |     return lambda text: split_by_sentence_tokenizer_internal(text, tokenizer)
104 | 
105 | def get_text_transformations(use_sploder=True):
106 |     text_transformations = [
107 |         # automatically splits by sentences in a logical way
108 |         # my sentence splitter handlers abbreviations better.
109 |         SentenceWindowNodeParser(sentence_splitter=split_by_sentence_tokenizer()), 
110 |     ]
111 |     if USE_SPLODER:
112 |         text_transformations.append(
113 |             Sploder(max_string_token_count=SPLODER_MAX_SIZE),
114 |                                         # for reasonably short sentences
115 |                                         # add nodes for:
116 |                                         # - the sentence and the sentence after
117 |                                         # - the sentence before, tjhe sentence, and the sentence after
118 |                                         # in hopes of capturing paragraph-level meaning too (but only for short sentences)
119 |         )
120 |     return text_transformations
121 | 
122 | 
123 | def create_preview_nodes(documents, text_transformations):
124 |     price_estimation_pipeline = IngestionPipeline(
125 |         transformations=text_transformations
126 |     )
127 |     preview_nodes = price_estimation_pipeline.run(documents=documents)
128 | 
129 | price_per_1M = {"text-embedding-3-small": 0.020,
130 |                 "text-embedding-3-large": 0.130}
131 | 
132 | def estimate_cost(preview_nodes):
133 |     enc = tiktoken.encoding_for_model(MODEL_NAME)
134 | 
135 |     token_count = sum([len(enc.encode(node.text)) for node in preview_nodes])
136 |     estimated_price = token_count * (price_per_1M[MODEL_NAME] / 1_000_000)
137 |     print("cost estimate: ${:.2f} ({:,.0f} tokens)".format(estimated_price, token_count))
138 | 
139 | 
140 | # this is just monkeypatching to look at what's going on.
141 | # from llama_index.embeddings.openai.base import get_embeddings
142 | # def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
143 | #     """Get text embeddings.
144 | 
145 | #     By default, this is a wrapper around _get_text_embedding.
146 | #     Can be overridden for batch queries.
147 | 
148 | #     """
149 | #     client = self._get_client()
150 | #     retry_decorator = self._create_retry_decorator()
151 | #     print(texts)
152 | #     @retry_decorator
153 | #     def _retryable_get_embeddings():
154 | #         return get_embeddings(
155 | #             client,
156 | #             texts,
157 | #             engine=self._text_engine,
158 | #             **self.additional_kwargs,
159 | #         )
160 | 
161 | #     return _retryable_get_embeddings()
162 | 
163 | # OpenAIEmbedding._get_text_embeddings = _get_text_embeddings
164 | 
165 | 
166 | # In[24]:
167 | 
168 | def embed_documents(documents, text_transformations, embedding_step):  # returns the nodes produced by the full pipeline
169 |     # run the pipeline: the text transformations followed by the embedding step
170 |     pipeline = IngestionPipeline(
171 |         transformations=text_transformations + [
172 |             embedding_step
173 |         ]
174 |     )
175 |     start_time = datetime.now()
176 |     nodes = pipeline.run(documents=documents)
177 |     end_time = datetime.now()
178 |     duration = end_time - start_time
179 |     print("took {}s to embed {} documents".format(duration.total_seconds(), len(documents)))  # wall-clock time of pipeline.run
180 |     return nodes
181 | 
182 | 
183 | from llama_index.vector_stores.postgres import PGVectorStore
184 | from llama_index.vector_stores.duckdb import DuckDBVectorStore
185 | from llama_index.core import VectorStoreIndex, StorageContext
186 | from llama_index.vector_stores.chroma import ChromaVectorStore
187 | import chromadb
188 | 
189 | model_dims = {
190 |     "text-embedding-3-small": 1536,
191 |     "text-embedding-3-large": 3072
192 | }
193 | 
194 | 
195 | def create_vector_store_and_persist(project_name, vector_store_type, nodes):
196 |     start_time = datetime.now()
197 |     if vector_store_type == "chroma":
198 |         db = chromadb.PersistentClient(path="./chroma")
199 |         chroma_collection = db.get_or_create_collection("{}".format(project_name))
200 |         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
201 |     elif vector_store_type == "duckdb":
202 |         vector_store = DuckDBVectorStore("{}2.duckdb".format(project_name), persist_dir="../duckdb/")
203 |     elif vector_store_type == "postgres":
204 |         vector_store = PGVectorStore.from_params(
205 |             database="meaningfully",
206 |             host="localhost",
207 |             # password=url.password,
208 |             port=5432,
209 |             user="jeremybmerrill",
210 |             table_name=project_name,
211 |             embed_dim=model_dims[MODEL_NAME],  # openai embedding dimension
212 |             hnsw_kwargs={
213 |                 "hnsw_m": 16,
214 |                 "hnsw_ef_construction": 64,
215 |                 "hnsw_ef_search": 40,
216 |                 "hnsw_dist_method": "vector_cosine_ops",
217 |             },
218 |         )
219 |     storage_context = StorageContext.from_defaults(vector_store=vector_store)
220 |     index = VectorStoreIndex(
221 |         nodes, storage_context=storage_context, show_progress=True
222 |     )
223 |     end_time = datetime.now()
224 |     duration = end_time - start_time
225 |     print("took {}s to store {} nodes".format(duration.total_seconds(), len(nodes)))
226 |     #index = VectorStoreIndex.from_documents(documents)
227 |     return index
228 | 
229 | from llama_index.core.vector_stores.types import (
230 |     MetadataFilter,
231 |     MetadataFilters,
232 |     FilterOperator,
233 | )
234 | def search(index, query, n_results=10):
235 |     retriever = index.as_retriever(
236 |         similarity_top_k=n_results,
237 |         embed_model=OpenAIEmbedding(model_name=MODEL_NAME)
238 |     )
239 |     result_nodes = retriever.retrieve(query)
240 |     return result_nodes
241 |     
242 | 
243 | def node_to_dict(node, metadata_columns_to_display):  # flatten a node into a dict: "text", the selected metadata keys, and "score" when present
244 |     d = {"text": node.text, **{k:node.metadata[k] for k in metadata_columns_to_display}}
245 |     if hasattr(node, "score"):  # only some nodes carry a score attribute
246 |         d["score"] = node.score
247 |     return d
248 | 
249 | def nodes_to_df(nodes, metadata_columns_to_display):
250 |     return pd.DataFrame(
251 |         [node_to_dict(node, metadata_columns_to_display) for node in nodes]
252 |     )
253 | 
254 | #
255 | def display_nodes(nodes, metadata_columns_to_display):
256 |     # just for lookin'
257 |     with pd.option_context('display.max_colwidth', 500):
258 |         display(nodes_to_df(nodes, metadata_columns_to_display))
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | def main():
266 |     # for each document set (eventually)
267 |     # read CSV probably using NonStupidCSVReader
268 |     import re
269 |     import os
270 | 
271 |     fname = glob('../sample-data/*.csv')[0]
272 |     # fname = "../sample-data/Tweets by @NYCFireWire - Sheet1.csv"
273 |     fname = "../sample-data/constellation-10q.csv"
274 |     project_name = re.sub(r"\W", "_", os.path.basename(fname).split(".")[0])
275 | 
276 | 
277 |     VECTOR_STORE = "chroma"
278 |     MODEL_NAME = "text-embedding-3-small"
279 |     USE_SPLODER = True
280 |     SPLODER_MAX_SIZE = 50
281 | 
282 |     metadata = {
283 |         "../sample-data/Tweets by @NYCFireWire - Sheet1.csv":["date", "acct", "url", "addr", "notes"],
284 |         "../sample-data/constellation-10q.csv": ["fn", "paragraph_index"]
285 |     }
286 | 
287 |     documents = load_csv_to_documents(fname)
288 |     text_transformations = get_text_transformations()
289 |     preview_nodes = create_preview_nodes(documents, text_transformations)
290 |     
291 |     # display an arbitrary sample of the preview nodes from the middle, but retaining the order 
292 |     display_nodes(preview_nodes[len(preview_nodes)//2:(len(preview_nodes)//2)+10], metadata[fname])
293 | 
294 |     estimate_cost(preview_nodes) # prints cost estimate
295 | 
296 |     embedding_step = OpenAIEmbedding(model=MODEL_NAME )
297 |     nodes = embed_documents(documents, text_transformations, embedding_step)
298 | 
299 |     index = create_vector_store_and_persist(project_name, vector_store_type=, nodes)
300 |     display_nodes(search(index, "snow at our factory cost us a lot of money"), metadata[fname])
301 | 


--------------------------------------------------------------------------------
/python-prototype/non_stupid_csv_reader.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional, Dict, List
 2 | from fsspec import AbstractFileSystem
 3 | from pathlib import Path
 4 | 
 5 | import pandas as pd
 6 | from llama_index.core.readers.base import BaseReader
 7 | from llama_index.core.schema import Document
 8 | 
 9 | class NonStupidCSVReader(BaseReader):
10 |     r"""Pandas-based CSV parser.
11 | 
12 |     Parses CSVs using the separator detection from Pandas `read_csv`function.
13 |     If special parameters are required, use the `pandas_config` dict.
14 | 
15 |     Args:
16 |         text_column_name: the CSV column containing the text to be embedded 
17 |         
18 |         pandas_config (dict): Options for the `pandas.read_csv` function call.
19 |             Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
20 |             for more information.
21 |             Set to empty dict by default, this means pandas will try to figure
22 |             out the separators, table head, etc. on its own.
23 | 
24 |     """
25 | 
26 |     def __init__(
27 |         self,
28 |         *args: Any,
29 |         pandas_config: dict = {},
30 |         **kwargs: Any
31 |     ) -> None:
32 |         """Init params."""
33 |         super(BaseReader, self).__init__(*args, **kwargs)
34 |         self._pandas_config = pandas_config
35 | 
36 |     def load_data(
37 |         self,
38 |         file: Path,
39 |         text_column_name: str,
40 |         extra_info: Optional[Dict] = None,
41 |         fs: Optional[AbstractFileSystem] = None,
42 |     ) -> List[Document]:
43 |         """Parse file."""
44 |         if fs:
45 |             with fs.open(file) as f:
46 |                 df = pd.read_csv(f, **self._pandas_config)
47 |         else:
48 |             df = pd.read_csv(file, **self._pandas_config)
49 | 
50 |         df.fillna('', inplace=True) # Postgres chokes on NaNs in metadata.
51 | 
52 |         return [
53 |             Document(text=row[text_column_name], metadata=row[[col for col in df.columns if col != text_column_name]].to_dict() or {}) for _, row in df.iterrows()
54 |         ]
55 | 


--------------------------------------------------------------------------------
/python-prototype/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas
 2 | llama-index
 3 | jupyter
 4 | gradio
 5 | python-dotenv
 6 | llama-index-vector-stores-postgres
 7 | # duckdb
 8 | # llama-index-vector-stores-duckdb
 9 | chromadb
10 | llama-index-vector-stores-chroma


--------------------------------------------------------------------------------
/python-prototype/search.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | # In[2]:
  5 | 
  6 | 
  7 | from glob import glob
  8 | import os
  9 | import re
 10 | 
 11 | csvs = glob('../sample-data/*.csv')
 12 | print("Possible project names:")
 13 | for fname in csvs:
 14 |     possible_project_name = re.sub(r"\W", "_", os.path.basename(fname).split(".")[0])  # text before the first dot of the file name, non-word characters replaced by "_"
 15 |     print(possible_project_name)
 16 | 
 17 | 
 18 | # In[4]:
 19 | 
 20 | 
 21 | from llama_index.vector_stores.postgres import PGVectorStore
 22 | from llama_index.vector_stores.duckdb import DuckDBVectorStore
 23 | from llama_index.vector_stores.chroma import ChromaVectorStore
 24 | import chromadb
 25 | 
 26 | from llama_index.core import VectorStoreIndex, StorageContext
 27 | import re
 28 | project_name = 'constellation_10q'
 29 | 
 30 | VECTOR_STORE = "chroma"
 31 | 
 32 | if VECTOR_STORE == 'chroma':
 33 |     chroma_client = chromadb.PersistentClient(path="./chroma")
 34 |     chroma_collection = chroma_client.get_or_create_collection("{}".format(project_name))
 35 |     vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
 36 | elif VECTOR_STORE == "duckdb":
 37 |     vector_store = DuckDBVectorStore("{}2.duckdb".format(project_name), persist_dir="../duckdb/")
 38 | elif VECTOR_STORE == "postgres":
 39 |     vector_store = PGVectorStore.from_params(
 40 |         database="meaningfully",
 41 |         host="localhost",
 42 |         # password=url.password,
 43 |         port=5432,
 44 |         user="jeremybmerrill",
 45 |         table_name=project_name,
 46 |         embed_dim=model_dims[MODEL_NAME],  # openai embedding dimension
 47 |         hnsw_kwargs={
 48 |             "hnsw_m": 16,
 49 |             "hnsw_ef_construction": 64,
 50 |             "hnsw_ef_search": 40,
 51 |             "hnsw_dist_method": "vector_cosine_ops",
 52 |         },
 53 |     )
 54 | 
 55 | 
 56 | # In[5]:
 57 | 
 58 | 
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the environment before any
# client is created (presumably this supplies the OpenAI API key - confirm).
load_dotenv()

# from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model used to embed the search queries below.
MODEL_NAME = "text-embedding-3-small"
# Settings.embed_model = OpenAIEmbedding(model=MODEL_NAME)


# Wrap the already-populated vector store in an index; nothing is embedded here.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)


# In[6]:


from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Placeholder metadata filters: the list is empty and the retriever below does
# not receive it (its `filters=` argument is commented out).
filters = MetadataFilters(
    filters=[
        #MetadataFilter(key="date", value="February 15", operator=FilterOperator.TEXT_MATCH),
        # MetadataFilter(key="author", value="sven@timescale.com"),
    ],
    # condition="or",
)

# Retrieve the 10 nearest nodes for the query, embedding the query with MODEL_NAME.
retriever = index.as_retriever(
    similarity_top_k=10,
    #filters=filters,
    embed_model = OpenAIEmbedding(model=MODEL_NAME)
)
result_nodes = retriever.retrieve("we fired an executive")


# In[7]:


# Metadata columns shown next to each result; columns missing from the
# results are skipped by the display code.
METADATA_COLUMNS_TO_DISPLAY = ["date", "acct", "url", "addr", "notes"]
102 | 
103 | 
104 | # In[8]:
105 | 
106 | 
import pandas as pd


def show_results(nodes_with_scores):
    """Render retrieved nodes as a table and return the underlying DataFrame.

    Each row holds the node text (newlines flattened to spaces), the node's
    metadata and its similarity score. Only the text, the columns listed in
    METADATA_COLUMNS_TO_DISPLAY that are actually present, and the score are
    displayed.

    `display` is the notebook built-in; this script is a notebook export and
    is expected to run under IPython/Jupyter.
    """
    # just for lookin'
    rows = []
    for node_with_score in nodes_with_scores:
        row = {"text": node_with_score.node.text.replace("\n", ' '), **node_with_score.node.metadata}
        # TODO add shingles
        row["score"] = node_with_score.score
        rows.append(row)
    results_df = pd.DataFrame(rows)
    if results_df.empty:
        # An empty frame has no "text"/"score" columns, so selecting them would raise KeyError.
        print("No results.")
        return results_df
    shown_columns = ["text"] + [c for c in METADATA_COLUMNS_TO_DISPLAY if c in results_df.columns] + ["score"]
    with pd.option_context('display.max_colwidth', 500):
        display(results_df[shown_columns])
    return results_df


result_nodes_df = show_results(result_nodes)


# In[9]:


# mimicking asking for more results
# TODO: Figure out how to not re-embed the query
retriever.similarity_top_k=30
result_nodes = retriever.retrieve("we fired an executive and he isn't getting paid")


# In[10]:


result_nodes_df = show_results(result_nodes)
143 | 
144 | 
145 | # In[ ]:
146 | 
147 | 
148 | 
149 | 
150 | 


--------------------------------------------------------------------------------
/resources/icon.icns:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/resources/icon.icns


--------------------------------------------------------------------------------
/resources/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/resources/icon.png


--------------------------------------------------------------------------------
/src/docs/img/groceries_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/src/docs/img/groceries_screenshot.png


--------------------------------------------------------------------------------
/src/main/DocumentSetManager.ts:
--------------------------------------------------------------------------------
  1 | import Database from 'better-sqlite3';
  2 | import type { DocumentSetMetadata, Settings } from './types';
  3 | import { join } from 'path';
  4 | export class DocumentSetManager {
  5 |   private sqliteDb: Database;
  6 | 
  7 |   constructor(storagePath: string) {
  8 |     // Initialize SQLite database for document set metadata
  9 |     this.sqliteDb = new Database(join(storagePath, 'metadata.db'));
 10 |     
 11 |     // Initialize the collection
 12 |     this.initializeDatabase();
 13 |   }
 14 | 
 15 |   private async initializeDatabase() {
 16 |     // Create SQLite table for document sets
 17 |     this.sqliteDb.exec(`
 18 |       CREATE TABLE IF NOT EXISTS document_sets (
 19 |         set_id INTEGER PRIMARY KEY AUTOINCREMENT,
 20 |         name TEXT NOT NULL UNIQUE,
 21 |         upload_date TEXT NOT NULL,
 22 |         parameters TEXT NOT NULL,
 23 |         total_documents INTEGER NOT NULL DEFAULT 0
 24 |       );
 25 |     `);
 26 |     this.sqliteDb.exec(`
 27 |       CREATE TABLE IF NOT EXISTS meaningfully_settings (
 28 |         settings_id INTEGER PRIMARY KEY AUTOINCREMENT,
 29 |         settings TEXT NOT NULL
 30 |       );
 31 |     `);    
 32 |   }
 33 | 
 34 |   async addDocumentSet(metadata: Omit<DocumentSetMetadata, 'documentSetId'>): Promise<number> {
 35 |     const stmt = this.sqliteDb.prepare(`
 36 |       INSERT INTO document_sets (name, upload_date, parameters, total_documents)
 37 |       VALUES (?, ?, ?, ?)
 38 |     `);
 39 | 
 40 |     const result = stmt.run(
 41 |       metadata.name,
 42 |       metadata.uploadDate.toISOString(),
 43 |       JSON.stringify(metadata.parameters),
 44 |       metadata.totalDocuments
 45 |     );
 46 | 
 47 |     return result.lastInsertRowid as number;
 48 |   }
 49 | 
 50 |   async getDocumentSet(documentSetId: number): Promise<DocumentSetMetadata | null> {
 51 |     const stmt = this.sqliteDb.prepare(`
 52 |       SELECT * FROM document_sets WHERE set_id = ?
 53 |     `);
 54 |     
 55 |     const row = stmt.get(documentSetId);
 56 |     if (!row) return null;
 57 | 
 58 |     return {
 59 |       documentSetId: row.set_id,
 60 |       name: row.name,
 61 |       uploadDate: new Date(row.upload_date),
 62 |       parameters: JSON.parse(row.parameters),
 63 |       totalDocuments: row.total_documents
 64 |     };
 65 |   }
 66 | 
 67 |   async getDocumentSets(page: number = 1, pageSize: number = 10): Promise<{documents: DocumentSetMetadata[], total: number}> {
 68 |     const offset = (page - 1) * pageSize;
 69 |     
 70 |     // Get total count
 71 |     const countStmt = this.sqliteDb.prepare('SELECT COUNT(*) as count FROM document_sets');
 72 |     const totalCount = countStmt.get().count;
 73 | 
 74 |     // Get paginated results
 75 |     const stmt = this.sqliteDb.prepare(`
 76 |         SELECT * FROM document_sets 
 77 |         ORDER BY upload_date DESC 
 78 |         LIMIT ? OFFSET ?
 79 |     `);
 80 |     const rows = stmt.all(pageSize, offset);
 81 | 
 82 |     const documents = rows.map(row => ({
 83 |         documentSetId: row.set_id,
 84 |         name: row.name,
 85 |         uploadDate: new Date(row.upload_date),
 86 |         parameters: JSON.parse(row.parameters),
 87 |         totalDocuments: row.total_documents
 88 |     }));
 89 | 
 90 |     return {
 91 |         documents,
 92 |         total: totalCount
 93 |     };
 94 |   }
 95 | 
 96 |   async updateDocumentCount(documentSetId: number, count: number) {
 97 |     const stmt = this.sqliteDb.prepare(`
 98 |       UPDATE document_sets 
 99 |       SET total_documents = total_documents + ?
100 |       WHERE set_id = ?
101 |     `);
102 |     
103 |     stmt.run(count, documentSetId);
104 |   }
105 | 
106 |   async deleteDocumentSet(documentSetId: number) {
107 |     const stmt = this.sqliteDb.prepare(`
108 |       DELETE FROM document_sets
109 |       WHERE set_id = ?
110 |     `);
111 |     
112 |     stmt.run(documentSetId);
113 |   }
114 | 
115 |   async getSettings() { 
116 |     const DEFAULT_SETTINGS = {
117 |       "openAIKey": null,
118 |       "oLlamaModelType": null,
119 |       "oLlamaBaseURL": null,
120 |     }
121 |     const stmt = this.sqliteDb.prepare(`
122 |       SELECT * FROM meaningfully_settings WHERE settings_id = 1
123 |     `);
124 |     
125 |     const row = stmt.get();
126 |     let settings;
127 |     if (row){
128 |       try {
129 |         settings = JSON.parse(row.settings) as Settings;
130 |       } catch (error) {
131 |         console.error("Error parsing settings JSON:", error);
132 |         settings = DEFAULT_SETTINGS;
133 |       }
134 |     }else{
135 |       settings = DEFAULT_SETTINGS;
136 |     }
137 |     settings = Object.assign({}, DEFAULT_SETTINGS, settings)
138 |     return settings; 
139 |   }
140 | 
141 |   async setSettings(settings: Settings){
142 |     const stmt = this.sqliteDb.prepare(`
143 |       INSERT OR REPLACE INTO meaningfully_settings (settings_id, settings)
144 |       VALUES (1, ?)
145 |     `);
146 |     
147 |     stmt.run(JSON.stringify(settings));
148 |     return Object.assign(settings, {"success": true});
149 |   }
150 | 
151 |   close() {
152 |     this.sqliteDb.close();
153 |   }
154 | }


--------------------------------------------------------------------------------
/src/main/DocumentSetService.ts:
--------------------------------------------------------------------------------
  1 | import { DocumentSetManager } from './DocumentSetManager';
  2 | import { loadDocumentsFromCsv } from './services/csvLoader';
  3 | import { createEmbeddings, getIndex, search, previewResults, getDocStore } from './api/embedding';
  4 | import { capitalizeFirstLetter, unescapeNodeMetadataKeys } from './utils';
  5 | import { join } from 'path';
  6 | import { DocumentSetParams, Settings, MetadataFilter, Clients } from './types';
  7 | import fs from 'fs';
  8 | 
  9 | type HasFilePath = {filePath: string};
 10 | type DocumentSetParamsFilePath = DocumentSetParams & HasFilePath;
 11 | 
 12 | const maskKey = (key: string, n: number = 20): string => {
 13 |   return (key && key.length > (n*2)) ? key.slice(0, n) + "*******" + key.slice(key.length - n) : key;
 14 | };
 15 | 
 16 | 
 17 | export class DocumentService {
 18 |   private manager: DocumentSetManager;
 19 |   private storagePath: string;
 20 |   private clients: Clients;
 21 | 
 22 |   constructor({ storagePath, weaviateClient }: { storagePath: string, weaviateClient?: any }) {
 23 |     this.storagePath = storagePath;
 24 |     this.manager = new DocumentSetManager(this.storagePath);
 25 |     this.clients = {
 26 |       weaviateClient: weaviateClient,
 27 |       postgresClient: null
 28 |     };
 29 |   }
 30 | 
 31 |   setClients(clients: Clients) {
 32 |     this.clients = { ...this.clients, ...clients };
 33 |   }
 34 |   getClients() {
 35 |     return this.clients;
 36 |   }
 37 | 
 38 |   async listDocumentSets(page: number = 1, pageSize: number = 10) {
 39 |     return await this.manager.getDocumentSets(page, pageSize);
 40 |   }
 41 | 
 42 |   async getDocumentSet(documentSetId: number) {
 43 |     return await this.manager.getDocumentSet(documentSetId);
 44 |   }
 45 |   async deleteDocumentSet(documentSetId: number) {
 46 |     // Delete the document set from the database
 47 |     const result = await this.manager.getDocumentSet(documentSetId);
 48 |     if (result){
 49 |       // Delete the document set from the database
 50 |       await this.manager.deleteDocumentSet(documentSetId);
 51 |       // Delete the associated files from the filesystem
 52 |       fs.rmSync(join(this.storagePath, 'simple_vector_store', result.name), { recursive: true, force: true });
 53 |       fs.rmSync(join(this.storagePath, 'weaviate_data', capitalizeFirstLetter(result.name)), { recursive: true, force: true });
 54 |     }
 55 |     return { success: true };
 56 |   }
 57 | 
 58 |   getVectorStoreType() {
 59 |     return this.clients.weaviateClient ? 'weaviate' : 'simple';
 60 |   }
 61 | 
 62 |   async generatePreviewData(data: DocumentSetParamsFilePath) {
 63 |     const vectorStoreType = this.getVectorStoreType();
 64 |     try {
 65 |       return await previewResults(data.filePath, data.textColumns[0], {
 66 |         modelName: data.modelName, // needed to tokenize, estimate costs
 67 |         modelProvider: data.modelProvider,
 68 |         splitIntoSentences: data.splitIntoSentences,
 69 |         combineSentencesIntoChunks: data.combineSentencesIntoChunks,
 70 |         sploderMaxSize: 100,
 71 |         vectorStoreType: vectorStoreType,
 72 |         projectName: data.datasetName,
 73 |         storagePath: this.storagePath,
 74 |         chunkSize: data.chunkSize,
 75 |         chunkOverlap: data.chunkOverlap
 76 |     });
 77 |   } catch (error) {
 78 |     throw error;
 79 |   }
 80 | }
 81 | 
 82 |     async uploadCsv(data: DocumentSetParamsFilePath) {
 83 |     // figure out if weaviate is available
 84 |     const vectorStoreType = this.getVectorStoreType();
 85 | 
 86 |     // First create the document set record
 87 |     const documentSetId = await this.manager.addDocumentSet({
 88 |       name: data.datasetName,
 89 |       uploadDate: new Date(),
 90 |       parameters: {
 91 |         description: data.description,
 92 |         textColumns: data.textColumns,
 93 |         metadataColumns: data.metadataColumns,
 94 |         splitIntoSentences: data.splitIntoSentences,
 95 |         combineSentencesIntoChunks: data.combineSentencesIntoChunks,
 96 |         sploderMaxSize: data.sploderMaxSize,
 97 |         chunkSize: data.chunkSize,
 98 |         chunkOverlap: data.chunkOverlap,
 99 |         modelName: data.modelName,
100 |         modelProvider: data.modelProvider,
101 |         vectorStoreType: vectorStoreType,
102 |       },
103 |       totalDocuments: 0 // We'll update this after processing
104 |     });
105 | 
106 |     const embedSettings = await this.manager.getSettings()
107 | 
108 |     // Load and process the documents
109 |     try {
110 |       // Process each text column
111 |       for (const textColumn of data.textColumns) {
112 |         const documents = await loadDocumentsFromCsv(data.filePath, textColumn);
113 |         
114 |         // Update total documents count
115 |         await this.manager.updateDocumentCount(documentSetId, documents.length);
116 | 
117 |         // Create embeddings for this column
118 |         let ret = await createEmbeddings(data.filePath, textColumn, {
119 |           modelName: data.modelName,
120 |           modelProvider: data.modelProvider,
121 |           splitIntoSentences: data.splitIntoSentences,
122 |           combineSentencesIntoChunks: data.combineSentencesIntoChunks,
123 |           sploderMaxSize: 100, // TODO: make configurable
124 |           vectorStoreType: vectorStoreType,
125 |           projectName: data.datasetName,
126 |                         // via https://medium.com/cameron-nokes/how-to-store-user-data-in-electron-3ba6bf66bc1e
127 |           storagePath:  this.storagePath,
128 |           chunkSize: data.chunkSize,
129 |           chunkOverlap: data.chunkOverlap,
130 |         }, embedSettings, this.clients);
131 |         if (!ret.success) {
132 |           throw new Error(ret.error);
133 |         }
134 |       }
135 |       return { success: true, documentSetId };
136 |     } catch (error) {
137 |       // If something fails, we should probably delete the document set
138 |       await this.manager.deleteDocumentSet(documentSetId);
139 |       console.error("deleting document set due to failure ", documentSetId, error);
140 |       throw error;
141 |     }
142 |   }
143 | 
144 | 
145 |     async searchDocumentSet(documentSetId: number, query: string, n_results: number = 10,   filters?: MetadataFilter[]  ) {
146 |     const documentSet = await this.manager.getDocumentSet(documentSetId);
147 |     const settings = await this.manager.getSettings();
148 |     if (!documentSet) {
149 |       throw new Error('Document set not found');
150 |     } 
151 |     const index = await getIndex({
152 |       modelName: documentSet.parameters.modelName as string,
153 |       modelProvider: documentSet.parameters.modelProvider as string,
154 |       splitIntoSentences: documentSet.parameters.splitIntoSentences as boolean,
155 |       combineSentencesIntoChunks: documentSet.parameters.combineSentencesIntoChunks as boolean,
156 |       sploderMaxSize: 100,
157 |       vectorStoreType: documentSet.parameters.vectorStoreType as 'simple' | 'weaviate',
158 |       projectName: documentSet.name,
159 |       storagePath: this.storagePath,
160 |       chunkSize: 1024, // not actually used, we just re-use a config object that has this option
161 |       chunkOverlap: 20, // not actually used, we just re-use a config object that has this option
162 |     }, settings, this.clients);
163 |     const results = await search(index, query, n_results, filters);
164 |     return results;
165 |   }   
166 | 
167 |   async getDocument(documentSetId: number, documentNodeId: string){
168 |     const documentSet = await this.manager.getDocumentSet(documentSetId);
169 |     if (!documentSet) {
170 |       throw new Error('Document set not found');
171 |     } 
172 |     const docStore = await getDocStore({
173 |       modelName: documentSet.parameters.modelName as string,
174 |       modelProvider: documentSet.parameters.modelProvider as string,
175 |       splitIntoSentences: documentSet.parameters.splitIntoSentences as boolean,
176 |       combineSentencesIntoChunks: documentSet.parameters.combineSentencesIntoChunks as boolean,
177 |       sploderMaxSize: 100,
178 |       vectorStoreType: documentSet.parameters.vectorStoreType as 'simple' | 'weaviate',
179 |       projectName: documentSet.name,
180 |       storagePath: this.storagePath,
181 |       chunkSize: 1024, // not actually used, we just re-use a config object that has this option
182 |       chunkOverlap: 20, // not actually used, we just re-use a config object that has this option
183 |     });
184 |     const document = unescapeNodeMetadataKeys(await docStore.getNode(documentNodeId));
185 |     if (!document) {
186 |       throw new Error('Document not found');
187 |     }
188 |     return document;
189 |   }
190 | 
191 | 
192 |   async getSettings() {
193 |     return this.manager.getSettings();
194 |   }
195 |   async setSettings(settings: Settings) {
196 |     return this.manager.setSettings(settings);
197 |   } 
198 | 
199 |   async getMaskedSettings() {
200 |     const settings = await this.manager.getSettings();
201 |     return {
202 |       openAIKey: maskKey(settings.openAIKey),
203 |       oLlamaModelType: settings.oLlamaModelType,
204 |       oLlamaBaseURL: settings.oLlamaBaseURL
205 |     };
206 |   }
207 |   async setMaskedSettings(newSettings: Settings) { 
208 |     const oldSettings = await this.manager.getSettings();
209 |     const settings = {
210 |       ...newSettings,
211 |       openAIKey: newSettings.openAIKey == maskKey(oldSettings.openAIKey) ? oldSettings.openAIKey : newSettings.openAIKey
212 |     };
213 |     return this.manager.setSettings(settings);
214 |   }
215 | 
216 | }


--------------------------------------------------------------------------------
/src/main/api/embedding.test.ts:
--------------------------------------------------------------------------------
  1 | //@ts-nocheck
  2 | import { describe, it, expect, vi } from 'vitest';
  3 | import { createEmbeddings, previewResults, getDocStore, getIndex, search } from './embedding';
  4 | import { loadDocumentsFromCsv } from '../services/csvLoader';
  5 | import { embedDocuments, createPreviewNodes, estimateCost, searchDocuments, getExistingVectorStoreIndex, persistNodes, getExistingDocStore } from '../services/embeddings';
  6 | import { MetadataMode } from 'llamaindex';
  7 | 
  8 | // filepath: /Users/jeremybmerrill/code/meaningfully/src/main/api/embedding.test.ts
  9 | 
 10 | 
 11 | vi.mock('../services/csvLoader');
 12 | vi.mock('../services/embeddings');
 13 | 
 14 | describe('embedding.ts', () => {
 15 |     describe('createEmbeddings', () => {
 16 |         it('should create embeddings and return success', async () => {
 17 |             const mockDocuments = [{ text: 'doc1' }, { text: 'doc2' }];
 18 |             const mockNodes = [{ node: 'node1' }, { node: 'node2' }];
 19 |             const mockIndex = 'index1';
 20 |             loadDocumentsFromCsv.mockResolvedValue(mockDocuments);
 21 |             embedDocuments.mockResolvedValue(mockNodes);
 22 |             persistNodes.mockResolvedValue(mockIndex);
 23 | 
 24 |             const result = await createEmbeddings('path/to/csv', 'text', {}, {});
 25 | 
 26 |             expect(result).toEqual({ success: true, index: mockIndex });
 27 |         });
 28 | 
 29 |         it('should return error on failure', async () => {
 30 |             loadDocumentsFromCsv.mockRejectedValue(new Error('Failed to load documents'));
 31 | 
 32 |             const result = await createEmbeddings('path/to/csv', 'text', {}, {});
 33 | 
 34 |             expect(result).toEqual({ success: false, error: 'Failed to load documents' });
 35 |         });
 36 | 
 37 |         it('should handle empty documents', async () => {
 38 |             loadDocumentsFromCsv.mockResolvedValue([]);
 39 | 
 40 |             const result = await createEmbeddings('path/to/csv', 'text', {}, {});
 41 | 
 42 |             expect(result).toEqual({ success: false, error: 'That CSV does not appear to contain any documents. Please check the file and try again.' });
 43 |         });
 44 |     });
 45 | 
 46 |     describe('previewResults', () => {
 47 |         it('should return preview results and estimated cost', async () => {
 48 |             const mockDocuments = Array(20).fill({ text: 'doc' });
 49 |             const mockNodes = [{ text: 'node1', metadata: {} }, { text: 'node2', metadata: {} }];
 50 |             const mockPreviewNodes = [{ text: 'node1', metadata: {} }, { text: 'node2', metadata: {} }];
 51 |             const mockEstimate = { estimatedPrice: 10, tokenCount: 100, pricePer1M: 0.01 };
 52 |             loadDocumentsFromCsv.mockResolvedValue(mockDocuments);
 53 |             createPreviewNodes.mockResolvedValue(mockNodes);
 54 |             estimateCost.mockReturnValue(mockEstimate);
 55 | 
 56 |             const result = await previewResults('path/to/csv', 'text', {});
 57 | 
 58 |             expect(result).toEqual({
 59 |                 success: true,
 60 |                 nodes: mockPreviewNodes,
 61 |                 ...mockEstimate
 62 |             });
 63 |         });
 64 | 
 65 |         it('should return error on failure', async () => {
 66 |             loadDocumentsFromCsv.mockRejectedValue(new Error('Failed to load documents'));
 67 | 
 68 |             const result = await previewResults('path/to/csv', 'text', {});
 69 | 
 70 |             expect(result).toEqual({ success: false, error: 'Failed to load documents' });
 71 |         });
 72 | 
 73 |         it('should handle empty documents', async () => {
 74 |             loadDocumentsFromCsv.mockResolvedValue([]);
 75 | 
 76 |             const result = await previewResults('path/to/csv', 'text', {});
 77 | 
 78 |             expect(result).toEqual({ success: false, error: 'That CSV does not appear to contain any documents. Please check the file and try again.' });
 79 |         });
 80 |     });
 81 | 
 82 |     describe('getDocStore', () => {
 83 |         it('should return existing doc store', async () => {
 84 |             const mockDocStore = 'docStore';
 85 |             getExistingDocStore.mockResolvedValue(mockDocStore);
 86 | 
 87 |             const result = await getDocStore({});
 88 | 
 89 |             expect(result).toBe(mockDocStore);
 90 |         });
 91 |     });
 92 | 
 93 |     describe('getIndex', () => {
 94 |         it('should return existing vector store index', async () => {
 95 |             const mockIndex = 'index';
 96 |             getExistingVectorStoreIndex.mockResolvedValue(mockIndex);
 97 | 
 98 |             const result = await getIndex({}, {});
 99 | 
100 |             expect(result).toBe(mockIndex);
101 |         });
102 |     });
103 | 
104 |     describe('search', () => {
105 |         it('should return search results', async () => {
106 |             const mockResults = [
107 |                 { node: { getContent: () => 'content1', metadata: {} }, score: 1 },
108 |                 { node: { getContent: () => 'content2', metadata: {} }, score: 2 }
109 |             ];
110 |             searchDocuments.mockResolvedValue(mockResults);
111 | 
112 |             const result = await search('index', 'query');
113 | 
114 |             expect(result).toEqual([
115 |                 { text: 'content1', score: 1, metadata: {} },
116 |                 { text: 'content2', score: 2, metadata: {} }
117 |             ]);
118 |         });
119 | 
120 |         it('should handle no search results', async () => {
121 |             searchDocuments.mockResolvedValue([]);
122 | 
123 |             const result = await search('index', 'query');
124 | 
125 |             expect(result).toEqual([]);
126 |         });
127 | 
128 |         it('should handle search results with null scores', async () => {
129 |             const mockResults = [
130 |                 { node: { getContent: () => 'content1', metadata: {} }, score: null },
131 |                 { node: { getContent: () => 'content2', metadata: {} }, score: null }
132 |             ];
133 |             searchDocuments.mockResolvedValue(mockResults);
134 | 
135 |             const result = await search('index', 'query');
136 | 
137 |             expect(result).toEqual([
138 |                 { text: 'content1', score: 0, metadata: {} },
139 |                 { text: 'content2', score: 0, metadata: {} }
140 |             ]);
141 |         });
142 |     });
143 | });
144 | 
145 |   describe('previewResults', () => {
146 |     it('should return preview results and estimated cost', async () => {
147 |       const mockDocuments = Array(20).fill({ text: 'doc' });
148 |       const mockNodes = [{ text: 'node1', metadata: {} }, { text: 'node2', metadata: {} }];
149 |       const mockPreviewNodes = [{ text: 'node1', metadata: {} }, { text: 'node2', metadata: {} }];
150 |       const mockEstimate = { estimatedPrice: 10, tokenCount: 100, pricePer1M: 0.01 };
151 |       loadDocumentsFromCsv.mockResolvedValue(mockDocuments);
152 |       createPreviewNodes.mockResolvedValue(mockNodes);
153 |       estimateCost.mockReturnValue(mockEstimate);
154 | 
155 |       const result = await previewResults('path/to/csv', 'text', {});
156 | 
157 |       expect(result).toEqual({
158 |         success: true,
159 |         nodes: mockPreviewNodes,
160 |         ...mockEstimate
161 |       });
162 |     });
163 | 
164 |     it('should return error on failure', async () => {
165 |       loadDocumentsFromCsv.mockRejectedValue(new Error('Failed to load documents'));
166 | 
167 |       const result = await previewResults('path/to/csv', 'text', {});
168 | 
169 |       expect(result).toEqual({ success: false, error: 'Failed to load documents' });
170 |     });
171 |   });
172 | 
173 |   describe('getDocStore', () => {
174 |     it('should return existing doc store', async () => {
175 |       const mockDocStore = 'docStore';
176 |       getExistingDocStore.mockResolvedValue(mockDocStore);
177 | 
178 |       const result = await getDocStore({});
179 | 
180 |       expect(result).toBe(mockDocStore);
181 |     });
182 |   });
183 | 
184 |   describe('getIndex', () => {
185 |     it('should return existing vector store index', async () => {
186 |       const mockIndex = 'index';
187 |       getExistingVectorStoreIndex.mockResolvedValue(mockIndex);
188 | 
189 |       const result = await getIndex({}, {});
190 | 
191 |       expect(result).toBe(mockIndex);
192 |     });
193 |   });
194 |     describe('search', () => {
195 |         it('should return search results', async () => {
196 |         const mockResults = [
197 |             { node: { getContent: () => 'content1', metadata: {} }, score: 1 },
198 |             { node: { getContent: () => 'content2', metadata: {} }, score: 2 }
199 |         ];
200 |         searchDocuments.mockResolvedValue(mockResults);
201 |     
202 |         const result = await search('index', 'query');
203 |     
204 |         expect(result).toEqual([
205 |             { text: 'content1', score: 1, metadata: {} },
206 |             { text: 'content2', score: 2, metadata: {} }
207 |         ]);
208 |         });
209 |     
210 |         it('should handle no search results', async () => {
211 |         searchDocuments.mockResolvedValue([]);
212 |     
213 |         const result = await search('index', 'query');
214 |     
215 |         expect(result).toEqual([]);
216 |         });
217 |     
218 |         it('should handle search results with null scores', async () => {
219 |         const mockResults = [
220 |             { node: { getContent: () => 'content1', metadata: {} }, score: null },
221 |             { node: { getContent: () => 'content2', metadata: {} }, score: null }
222 |         ];
223 |         searchDocuments.mockResolvedValue(mockResults);
224 |     
225 |         const result = await search('index', 'query');
226 |     
227 |         expect(result).toEqual([
228 |             { text: 'content1', score: 0, metadata: {} },
229 |             { text: 'content2', score: 0, metadata: {} }
230 |         ]);
231 |         });
232 |     });


--------------------------------------------------------------------------------
/src/main/api/embedding.ts:
--------------------------------------------------------------------------------
  1 | import { embedDocuments, createPreviewNodes, estimateCost, searchDocuments, getExistingVectorStoreIndex, persistNodes, persistDocuments, getExistingDocStore } from "../services/embeddings";
  2 | import type { EmbeddingConfig, EmbeddingResult, SearchResult, PreviewResult, Settings, MetadataFilter, Clients} from "../types";
  3 | import { loadDocumentsFromCsv } from "../services/csvLoader";
  4 | import { MetadataMode } from "llamaindex";
  5 | 
  6 | export async function createEmbeddings(
  7 |   csvPath: string,
  8 |   textColumnName: string,
  9 |   config: EmbeddingConfig,
 10 |   settings: Settings,
 11 |   clients: Clients
 12 | ): Promise<EmbeddingResult> {
 13 |   try {
 14 |     console.time("createEmbeddings Run Time");
 15 | 
 16 |     const documents = await loadDocumentsFromCsv(csvPath, textColumnName);
 17 |     if (documents.length === 0) {
 18 |       console.timeEnd("createEmbeddings Run Time");
 19 |       return {
 20 |         success: false,
 21 |         error: "That CSV does not appear to contain any documents. Please check the file and try again.",
 22 |       };
 23 |     }
 24 |     const nodes = await embedDocuments(documents, config, settings);
 25 |     const index = await persistNodes(nodes, config, settings, clients);
 26 |     await persistDocuments(documents, config, settings, clients);
 27 |     console.timeEnd("createEmbeddings Run Time");
 28 |     return {
 29 |       success: true,
 30 |       index,
 31 |     };
 32 |   } catch (error) {
 33 |     return {
 34 |       success: false,
 35 |       error: error instanceof Error ? error.message : "Unknown error occurred",
 36 |     };
 37 |   }
 38 | }
 39 | 
 40 | // TODO: rename this to be parallel to createEmbeddings
 41 | export async function previewResults(
 42 |   csvPath: string,
 43 |   textColumnName: string,
 44 |   config: EmbeddingConfig
 45 | ): Promise<PreviewResult> {
 46 |   try {
 47 |     const documents = await loadDocumentsFromCsv(csvPath, textColumnName);
 48 |     if (documents.length === 0) {
 49 |       return {
 50 |         success: false,
 51 |         error: "That CSV does not appear to contain any documents. Please check the file and try again.",
 52 |       };
 53 |     }
 54 |     // Take 10 rows from the middle of the dataset for preview
 55 |     // we take a consistent 10 so that the results of the preview are consistent (i.e. with a larger chunk size, you have fewer, longer results, but more shorter ones if you adjust it)
 56 |     // and we take from the middle because the initial rows may be idiosyncratic.
 57 |     const previewDocumentsSubset = documents.slice(
 58 |       Math.floor(documents.length / 2),
 59 |       Math.floor(documents.length / 2) + 10
 60 |     );
 61 | 
 62 |     const previewNodes = await createPreviewNodes(documents, config);
 63 |     const previewSubsetNodes = await createPreviewNodes(previewDocumentsSubset, config);
 64 |     const { estimatedPrice, tokenCount, pricePer1M } = estimateCost(previewNodes, config.modelName);
 65 | 
 66 |     return {
 67 |       success: true,
 68 |       nodes: previewSubsetNodes.map(node => ({
 69 |         text: node.text,
 70 |         metadata: node.metadata
 71 |       })),
 72 |       estimatedPrice,
 73 |       tokenCount,
 74 |       pricePer1M
 75 |     };
 76 |   } catch (error) {
 77 |     return {
 78 |       success: false,
 79 |       error: error instanceof Error ? error.message : "Unknown error occurred"
 80 |     };
 81 |   }
 82 | } 
 83 | 
 84 | export async function getDocStore(config: EmbeddingConfig) {
 85 |   return await getExistingDocStore(config);
 86 | }
 87 | 
 88 | export async function getIndex(config: EmbeddingConfig, settings: Settings, clients: Clients) {
 89 |   return await getExistingVectorStoreIndex(config, settings, clients);
 90 | }
 91 | 
 92 | export async function search(
 93 |   index: any,
 94 |   query: string,
 95 |   numResults: number = 10,
 96 |   filters?: MetadataFilter[]
 97 | ): Promise<SearchResult[]> {
 98 |   const results = await searchDocuments(index, query, numResults, filters);
 99 |   return results.map((result) => ({
100 |     text: result.node.getContent(MetadataMode.NONE),
101 |     score: result.score ?? 0,
102 |     metadata: result.node.metadata,
103 |     //  @ts-ignore
104 |     sourceNodeId: result.node.relationships?.SOURCE?.nodeId
105 |   }));
106 | }
107 | 


--------------------------------------------------------------------------------
/src/main/index.ts:
--------------------------------------------------------------------------------
  1 | import { app, shell, BrowserWindow, ipcMain } from 'electron'
  2 | import { join } from 'path'
  3 | import { electronApp, optimizer, is } from '@electron-toolkit/utils'
  4 | import icon from '../../resources/icon.png?asset'
  5 | import { DocumentService } from './DocumentSetService'
  6 | import { writeFileSync, readFileSync } from 'fs'
  7 | import { tmpdir } from 'os'
  8 | import { join as pathJoin } from 'path'
  9 | import { DocumentSetParams, MetadataFilter } from './types';
 10 | import { create_weaviate_database, teardown_weaviate_database } from './services/weaviateService';
 11 | 
 12 | type HasFilePathAndName =  { file: { path: string, name: string }};
 13 | type DocumentSetParamsFileAndPath = DocumentSetParams & HasFilePathAndName;
 14 | 
 15 | const storageArg = process.argv.find(arg => arg.startsWith('--storage-path='));
 16 | const storagePath = storageArg ? storageArg.split('=')[1] : app.getPath('userData');;
 17 | 
 18 | const docService = new DocumentService({ 
 19 |   storagePath,
 20 |   weaviateClient: null // Initially set to null, will be updated after DB service is init'ec.
 21 | });
 22 | 
 23 | create_weaviate_database(storagePath).then((weaviateClient) => {
 24 |   docService.setClients({ weaviateClient, postgresClient: null });
 25 |   // weaviateClient.collections.listAll().then((res) => console.log(res)).catch((error) => {
 26 |   //   console.error('Error listing Weaviate collections:', error);
 27 |   // });
 28 | }).catch((error) => {
 29 |   console.error('Error creating Weaviate database:', error);
 30 |   // fall back to not using weaviate (using SimpleVectorstore)
 31 | });
 32 | 
 33 | function createWindow(): void {
 34 |   // Create the browser window.
 35 |   const mainWindow = new BrowserWindow({
 36 |     width: 900,
 37 |     height: 670,
 38 |     show: false,
 39 |     autoHideMenuBar: true,
 40 |     ...(process.platform === 'linux' ? { icon } : {}),
 41 |     webPreferences: {
 42 |       preload: join(__dirname, '../preload/index.mjs'), // was index.js, but this caused it to note work. maybe related to package.json's type: module
 43 |       sandbox: false
 44 |     }
 45 |   })
 46 | 
 47 |   mainWindow.on('ready-to-show', () => {
 48 |     mainWindow.maximize();
 49 |     mainWindow.show();
 50 |   })
 51 | 
 52 |   mainWindow.webContents.setWindowOpenHandler((details) => {
 53 |     shell.openExternal(details.url)
 54 |     return { action: 'deny' }
 55 |   })
 56 | 
 57 |   // HMR for renderer base on electron-vite cli.
 58 |   // Load the remote URL for development or the local html file for production.
 59 |   if (is.dev && process.env['ELECTRON_RENDERER_URL']) {
 60 |     mainWindow.loadURL(process.env['ELECTRON_RENDERER_URL'])
 61 |   } else {
 62 |     mainWindow.loadFile(join(__dirname, '../renderer/index.html'))
 63 |   }
 64 | }
 65 | 
 66 | // This method will be called when Electron has finished
 67 | // initialization and is ready to create browser windows.
 68 | // Some APIs can only be used after this event occurs.
 69 | app.whenReady().then(() => {
 70 |   // Set app user model id for windows
 71 |   electronApp.setAppUserModelId('com.electron')
 72 | 
 73 |   // Default open or close DevTools by F12 in development
 74 |   // and ignore CommandOrControl + R in production.
 75 |   // see https://github.com/alex8088/electron-toolkit/tree/master/packages/utils
 76 |   app.on('browser-window-created', (_, window) => {
 77 |     optimizer.watchWindowShortcuts(window)
 78 |   })
 79 | 
 80 |   // IPC test
 81 |   //ipc stuff could go in its own file.
 82 |   ipcMain.handle('list-document-sets', async (_, page: number = 1, pageSize: number = 10) => {
 83 |     try {
 84 |       return await docService.listDocumentSets(page, pageSize);
 85 |     } catch (error) {
 86 |       console.error('Error listing document sets:', error);
 87 |       throw error;
 88 |     }
 89 |   });
 90 | 
 91 |   ipcMain.handle('get-document-set', async (_, documentSetId: number) => {
 92 |     try {
 93 |       return await docService.getDocumentSet(documentSetId);
 94 |     } catch (error) {
 95 |       console.error('Error getting document set:', error);
 96 |       throw error;
 97 |     }
 98 |   });
 99 |   ipcMain.handle('delete-document-set', async (_, documentSetId: number) => {
100 |     try {
101 |       return await docService.deleteDocumentSet(documentSetId);
102 |     } catch (error) {
103 |       console.error('Error deleting document set:', error); 
104 |       throw error;
105 |     }
106 |   });
107 | 
108 |   ipcMain.handle('search-document-set', async (_, params: { documentSetId: number, query: string, n_results: number, filters?: MetadataFilter[]}) => {
109 |     try {
110 |       return await docService.searchDocumentSet(params.documentSetId, params.query, params.n_results, params.filters);
111 |     } catch (error) {
112 |       console.error('Error searching document set:', error, params.documentSetId, params.query, params.n_results, params.filters);
113 |       throw error;  
114 |     }
115 |   });
116 | 
117 |   ipcMain.handle('get-document', async (_, params: { documentSetId: number, documentId: string}) => {
118 |     try {
119 |       return await docService.getDocument(params.documentSetId, params.documentId);
120 |     } catch (error) {
121 |       console.error('Error searching document set:', error, params.documentSetId, params.documentId);
122 |       throw error;  
123 |     }
124 |   });
125 | 
126 |   ipcMain.handle('upload-csv', async (_, formData: DocumentSetParamsFileAndPath) => {
127 |     try {
128 |       // For files from renderer, we need to handle the Buffer data
129 |       const tempPath = pathJoin(tmpdir(), `${Date.now()}-${formData.file.name}`)
130 |       await writeFileSync(tempPath, readFileSync(formData.file.path))
131 |       return await docService.uploadCsv({
132 |         ...formData,
133 |         filePath: tempPath
134 |       });
135 |     } catch (error) {
136 |       console.error('Error uploading CSV:', error);
137 |       throw error;
138 |     }
139 |   });
140 | 
141 |   ipcMain.handle('generate-preview-data', async (_, formData: DocumentSetParamsFileAndPath) => {
142 |     try {
143 |       const tempPath = pathJoin(tmpdir(), `${Date.now()}-${formData.file.name}`)
144 |       await writeFileSync(tempPath, readFileSync(formData.file.path))
145 |       return await docService.generatePreviewData({
146 |         ...formData,
147 |         filePath: tempPath
148 |       });
149 | 
150 |     } catch (error) {
151 |       console.error('Error generating preview data:', error);
152 |       throw error;
153 |     }
154 |   });
155 | 
156 |   ipcMain.handle('get-settings', async () => {   
157 |     try {
158 |       return await docService.getMaskedSettings();
159 |     } catch (error) {
160 |       console.error('Error getting settings:', error);
161 |       throw error;
162 |     }
163 |   }
164 |   );
165 | 
166 |   ipcMain.handle('set-settings', async (_, settings) => {
167 |     try {
168 |       return await docService.setMaskedSettings(settings);
169 |     } catch (error) {
170 |       console.error('Error setting settings:', error);
171 |       throw error;
172 |     }
173 |   });
174 | 
175 | 
176 |   createWindow()
177 |   app.on('activate', function () {
178 |     // On macOS it's common to re-create a window in the app when the
179 |     // dock icon is clicked and there are no other windows open.
180 |     if (BrowserWindow.getAllWindows().length === 0) createWindow()
181 |   })
182 | })
183 | 
// Guards against starting a second teardown if another quit is requested
// while the first teardown is still running.
let isTearingDownWeaviate = false;

// Tear the Weaviate database down before the app exits. Quitting is deferred
// while the asynchronous teardown runs and is re-issued afterwards; by then
// the client has been cleared, so the second pass quits immediately.
app.on('before-quit', async (event) => {
  const { weaviateClient } = docService.getClients();
  if (!weaviateClient) {
    return;
  }

  event.preventDefault() // Prevent quitting until cleanup is done
  if (isTearingDownWeaviate) {
    return;
  }
  isTearingDownWeaviate = true;

  console.log('Cleaning up Weaviate database before quitting...');
  try {
    await teardown_weaviate_database(weaviateClient);
    console.log('Weaviate database cleaned up.');
  } catch (error) {
    // A failed teardown must not leave the app unable to quit.
    console.error('Error tearing down Weaviate database:', error);
  } finally {
    docService.setClients({weaviateClient: null, postgresClient: null}); // Clear clients
    isTearingDownWeaviate = false;
    app.quit() // Now actually quit
  }
})
194 | 


--------------------------------------------------------------------------------
/src/main/services/csvLoader.test.ts:
--------------------------------------------------------------------------------
 1 | //@ts-nocheck
 2 | import { describe, it, expect, vi } from 'vitest';
 3 | import { readFileSync } from 'fs';
 4 | import { loadDocumentsFromCsv } from './csvLoader';
 5 | import { Document } from 'llamaindex';
 6 | import Papa from 'papaparse';
 7 | 
 8 | 
 9 | vi.mock('fs');
10 | vi.mock('papaparse');
11 | 
describe('csvLoader.ts', () => {
  const CSV_PATH = 'path/to/csv';

  // Primes the mocked `fs` and `papaparse` modules with the raw file content
  // and the rows the parser should report for it.
  const stubCsv = (rawContent, parsedRows) => {
    readFileSync.mockReturnValue(rawContent);
    Papa.parse.mockReturnValue({ data: parsedRows });
  };

  describe('loadDocumentsFromCsv', () => {
    it('should load documents from CSV and return Document instances', async () => {
      stubCsv('text,metadata1,metadata2\ncontent1,meta1,meta2\ncontent2,meta3,meta4', [
        { text: 'content1', metadata1: 'meta1', metadata2: 'meta2' },
        { text: 'content2', metadata1: 'meta3', metadata2: 'meta4' }
      ]);

      const loaded = await loadDocumentsFromCsv(CSV_PATH, 'text');

      const expected = [
        new Document({ text: 'content1', metadata: { metadata1: 'meta1', metadata2: 'meta2' } }),
        new Document({ text: 'content2', metadata: { metadata1: 'meta3', metadata2: 'meta4' } })
      ];
      expect(remove_id(loaded)).toEqual(remove_id(expected));
    });

    it('should handle empty CSV file', async () => {
      stubCsv('', []);

      const loaded = await loadDocumentsFromCsv(CSV_PATH, 'text');

      expect(loaded).toEqual([]);
    });

    it('should handle missing text column', async () => {
      stubCsv('metadata1,metadata2\nmeta1,meta2\nmeta3,meta4', [
        { metadata1: 'meta1', metadata2: 'meta2' },
        { metadata1: 'meta3', metadata2: 'meta4' }
      ]);

      const loaded = await loadDocumentsFromCsv(CSV_PATH, 'text');

      // Rows without the text column still become documents, with undefined text.
      const expected = [
        new Document({ text: undefined, metadata: { metadata1: 'meta1', metadata2: 'meta2' } }),
        new Document({ text: undefined, metadata: { metadata1: 'meta3', metadata2: 'meta4' } })
      ];
      expect(remove_id(loaded)).toEqual(remove_id(expected));
    });

    it('should handle null values in metadata', async () => {
      stubCsv('text,metadata1,metadata2\ncontent1,,meta2\ncontent2,meta3,', [
        { text: 'content1', metadata1: null, metadata2: 'meta2' },
        { text: 'content2', metadata1: 'meta3', metadata2: null }
      ]);

      const loaded = await loadDocumentsFromCsv(CSV_PATH, 'text');

      // Null metadata values are expected to come back as empty strings.
      const expected = [
        new Document({ text: 'content1', metadata: { metadata1: '', metadata2: 'meta2' } }),
        new Document({ text: 'content2', metadata: { metadata1: 'meta3', metadata2: '' } })
      ];
      expect(remove_id(loaded)).toEqual(remove_id(expected));
    });
  });
});
82 | 
/**
 * Returns shallow copies of the given documents with the `id_` property
 * left out, so that documents can be compared without their ids.
 */
function remove_id(list_of_documents): Document[] {
  const stripped = [];
  for (const document of list_of_documents) {
    const { id_, ...everything_else } = document;
    stripped.push(everything_else);
  }
  return stripped;
}


--------------------------------------------------------------------------------
/src/main/services/csvLoader.ts:
--------------------------------------------------------------------------------
 1 | import { Document } from "llamaindex";
 2 | import { readFileSync } from "fs";
 3 | import Papa from "papaparse";
 4 | 
 5 | export async function loadDocumentsFromCsv(
 6 |   filePath: string,
 7 |   textColumnName: string
 8 | ): Promise<Document[]> {
 9 |   const fileContent = readFileSync(filePath, "utf-8");
10 |   const { data: records } = Papa.parse(fileContent, {
11 |     header: true,
12 |     skipEmptyLines: true,
13 |   });
14 | 
15 |   return records.map((record: any) => {
16 |     const { [textColumnName]: text, ...metadata } = record;
17 |     return new Document({
18 |       text,
19 |       metadata: Object.fromEntries(
20 |         Object.entries(metadata).map(([k, v]) => [k, v ?? ""])
21 |       ),
22 |     });
23 |   });
24 | }


--------------------------------------------------------------------------------
/src/main/services/embeddings.test.ts:
--------------------------------------------------------------------------------
 1 | //@ts-nocheck
 2 | /*
 3 | 
  4 | This needs some human brain attention to make it not ACTUALLY embed the documents
 5 | but still return SOMETHING so that the test can pass.
 6 | we need a fake OpenAIEmbedding that returns nothing.
 7 | 
 8 | */
 9 | 
10 | 
11 | import { describe, it, expect, vi } from 'vitest';
12 | import { Document, TextNode } from 'llamaindex';
13 | import { transformDocuments, embedDocuments } from './embeddings';
14 | 
// Partially mock the embeddings module: keep every real export and replace
// only `transformDocuments` with a stub.
vi.mock(import("./embeddings"), async (importOriginal) => {
  const realModule = await importOriginal();
  return {
    ...realModule,
    transformDocuments: vi.fn(),
  };
});
23 |   
describe('embedDocuments', () => {
  // Configuration shared by every test; uses the 'mock' model provider.
  const embeddingConfig = {
    chunkSize: 100,
    chunkOverlap: 10,
    combineSentencesIntoChunks: true,
    sploderMaxSize: 500,
    modelProvider: 'mock',
    modelName: 'text-embedding-3-small',
    vectorStoreType: "simple" as "simple",
    storagePath: './storage',
    projectName: 'test_project',
    splitIntoSentences: true,
  };

  const appSettings = {
    openAIKey: 'mock-api-key',
    oLlamaBaseURL: 'http://localhost',
    oLlamaModelType: 'mock-model',
  };

  // Projects a list of nodes onto their text for comparison.
  const textsOf = (nodes) => nodes.map((n) => n.text);

  it('should process documents and return nodes', async () => {
    const inputDocuments = [
      new Document({ text: 'Document 1', metadata: { key1: 'value1' } }),
      new Document({ text: 'Document 2', metadata: { key2: 'value2' } }),
    ];
    const stubbedNodes = [
      new TextNode({ text: 'Document 1' }),
      new TextNode({ text: 'Document 2' }),
    ];
    (transformDocuments as vi.Mock).mockResolvedValue(stubbedNodes);

    const embedded = await embedDocuments(inputDocuments, embeddingConfig, appSettings);

    expect(textsOf(embedded)).toEqual(textsOf(stubbedNodes));
    // TODO: I can't get these to work. Apparently you can't spyOn a function that is imported from the same file.
    // all well and good but ... why did CoPilot generate a test that can't work?
    // expect(getEmbedModel).toHaveBeenCalledWith(mockConfig, mockSettings);
    // expect(transformDocuments).toHaveBeenCalledWith(mockDocuments, expect.any(Array));
  });

  it('should filter out documents with null, undefined, or zero-length text', async () => {
    const inputDocuments = [
      new Document({ text: 'Valid Document', metadata: { key1: 'value1' } }),
      new Document({ text: undefined, metadata: { key3: 'value3' } }),
      new Document({ text: '', metadata: { key4: 'value4' } }),
    ];
    // Only referenced by the disabled assertion below.
    const filteredDocuments = [inputDocuments[0]];
    const stubbedNodes = [new TextNode({ text: 'Valid Document' })];
    (transformDocuments as vi.Mock).mockResolvedValue(stubbedNodes);

    const embedded = await embedDocuments(inputDocuments, embeddingConfig, appSettings);

    expect(textsOf(embedded)).toEqual(textsOf(stubbedNodes));

    // TODO: I can't get these to work. Apparently you can't spyOn a function that is imported from the same file.
    // all well and good but ... why did CoPilot generate a test that can't work?
    // expect(transformDocuments).toHaveBeenCalledWith(filteredDocuments, expect.any(Array));
  });

  it('should exclude all metadata keys from embedding', async () => {
    const inputDocuments = [
      new Document({ text: 'Document 1', metadata: { key1: 'value1', key2: 'value2' } }),
    ];
//    const mockNodes = [new TextNode({ text: 'Document 1' })];

//    (getEmbedModel as vi.Mock).mockReturnValue(mockEmbeddingModel);
//    (transformDocuments as vi.Mock).mockResolvedValue(mockNodes);

    await embedDocuments(inputDocuments, embeddingConfig, appSettings);

    // embedDocuments is expected to mark every metadata key as excluded on the document itself.
    const [onlyDocument] = inputDocuments;
    expect(onlyDocument.excludedEmbedMetadataKeys).toEqual(['key1', 'key2']);
  });
});


--------------------------------------------------------------------------------
/src/main/services/loggingOpenAIEmbedding.ts:
--------------------------------------------------------------------------------
 1 | 
 2 | // temporary
 3 | // this is a wrapper around OpenAIEmbedding that logs the input of the embedding
 4 | // it's used to debug the embedding process (to make sure random metadata isn't wrongfully included)
 5 | // it's not used in the production code
 6 | 
 7 | import { OpenAIEmbedding } from "llamaindex";
 8 | import type {
 9 |   AzureClientOptions,
10 |   OpenAI as OpenAILLM,
11 | } from "openai";
12 | type LLMInstance = Pick<OpenAILLM, "embeddings" | "apiKey">;
13 | 
14 | 
/**
 * OpenAIEmbedding subclass that swaps in its own `getOpenAIEmbedding`
 * implementation on each instance.
 */
export class LoggingOpenAIEmbedding extends OpenAIEmbedding {
  constructor(
    init?: Omit<Partial<OpenAIEmbedding>, "lazySession"> & {
      session?: LLMInstance | undefined;
      azure?: AzureClientOptions;
    },
  ) {
    super(init);
    // Overwrite the base class's private member "getOpenAIEmbedding" on this
    // instance. A regular function (not an arrow) is used so `this` is the
    // receiver at call time.
    (this as any).getOpenAIEmbedding = async function(input: string[]): Promise<number[][]> {
      // TODO: ensure this for every sub class by calling it in the base class
      const truncatedInput = this.truncateMaxTokens(input);

      const client = await this.session;
      // `dimensions` is only sent to OpenAI if set by user
      const request = this.dimensions
        ? {
            model: this.model,
            dimensions: this.dimensions,
            input: truncatedInput,
          }
        : {
            model: this.model,
            input: truncatedInput,
          };
      const response = await client.embeddings.create(request);

      return response.data.map((item) => item.embedding);
    }
  }
}
47 | 


--------------------------------------------------------------------------------
/src/main/services/mockEmbedding.ts:
--------------------------------------------------------------------------------
 1 | //@ts-nocheck
 2 | import { BaseEmbedding } from "llamaindex";
 3 | 
 4 | export class MockEmbedding extends BaseEmbedding {
 5 |     constructor() {
 6 |         super();
 7 |     }   
 8 |     async getTextEmbedding(text: string): Promise<number[]> {
 9 |         return new Promise((resolve) => {
10 |             resolve([1, 0, 0, 0, 0, 0]);
11 |         });
12 |     }
13 | };


--------------------------------------------------------------------------------
/src/main/services/optional_trim_sentence_tokenizer.js:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2024, Hugo W.L. ter Doest
  3 | 
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | 
 11 | The above copyright notice and this permission notice shall be included in
 12 | all copies or substantial portions of the Software.
 13 | 
 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | THE SOFTWARE.
 21 | */
 22 | 
 23 | // JEREMY:
 24 | // just copied from natural's tokenizers/sentence_tokenizer.js
 25 | // with one substantive modification (making the per-sentence trim() controlled by the trimSentences constructor argument)
 26 | 
 27 | // Strings that will be used to create placeholders
 28 | const NUM = 'NUMBER'
 29 | const DELIM = 'DELIM'
 30 | const URI = 'URI'
 31 | const ABBREV = 'ABBREV'
 32 | 
 33 | const DEBUG = false
 34 | 
 35 | function generateUniqueCode (base, index) {
 36 |   // Surround the placeholder with {{}} to prevent shorter numbers to be recognized
 37 |   // in larger numbers
 38 |   return `{{${base}_${index}}}`
 39 | }
 40 | 
 41 | function escapeRegExp (string) {
 42 |   return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
 43 | }
 44 | 
 45 | // from natural's tokenizers/tokenizer.js (oddly not exported)
 46 | class Tokenizer {
 47 |   trim (array) {
 48 |     while (array[array.length - 1] === '') { array.pop() }
 49 | 
 50 |     while (array[0] === '') { array.shift() }
 51 | 
 52 |     return array
 53 |   }
 54 | }
 55 | 
 56 | 
 57 | export class SentenceTokenizer extends Tokenizer {
 58 |   constructor (abbreviations, trimSentences) {
 59 |     super()
 60 |     if (abbreviations) {
 61 |       this.abbreviations = abbreviations
 62 |     } else {
 63 |       this.abbreviations = []
 64 |     }
 65 |     if (trimSentences === undefined) {
 66 |       this.trimSentences = true
 67 |     }else{
 68 |       this.trimSentences = trimSentences;
 69 |     }
 70 |     this.replacementMap = null
 71 |     this.replacementCounter = 0
 72 |   }
 73 | 
 74 |   replaceUrisWithPlaceholders (text) {
 75 |     const urlPattern = /(https?:\/\/\S+|www\.\S+|ftp:\/\/\S+|(mailto:)?[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|file:\/\/\S+)/gi
 76 | 
 77 |     const modifiedText = text.replace(urlPattern, (match) => {
 78 |       const placeholder = generateUniqueCode(URI, this.replacementCounter++)
 79 |       this.replacementMap.set(placeholder, match)
 80 |       return placeholder
 81 |     })
 82 | 
 83 |     return modifiedText
 84 |   }
 85 | 
 86 |   replaceAbbreviations (text) {
 87 |     if (this.abbreviations.length === 0) {
 88 |       return text
 89 |     }
 90 |     const pattern = new RegExp(`(${this.abbreviations.map(abbrev => escapeRegExp(abbrev)).join('|')})`, 'gi')
 91 |     const replacedText = text.replace(pattern, match => {
 92 |       const code = generateUniqueCode(ABBREV, this.replacementCounter++)
 93 |       this.replacementMap.set(code, match)
 94 |       return code
 95 |     })
 96 | 
 97 |     return replacedText
 98 |   }
 99 | 
100 |   replaceDelimitersWithPlaceholders (text) {
101 |     // Regular expression for sentence delimiters optionally followed by a bracket or quote
102 |     // Multiple delimiters with spaces in between are allowed
103 |     // The expression makes sure that the sentence delimiter group ends with a sentence delimiter
104 |     const delimiterPattern = /([.?!… ]*)([.?!…])(["'”’)}\]]?)/g
105 |     const modifiedText = text.replace(delimiterPattern, (match, p1, p2, p3) => {
106 |       const placeholder = generateUniqueCode(DELIM, this.replacementCounter++)
107 |       this.delimiterMap.set(placeholder, p1 + p2 + p3)
108 |       return placeholder
109 |     })
110 | 
111 |     return modifiedText
112 |   }
113 | 
114 |   splitOnPlaceholders (text, placeholders) {
115 |     if (this.delimiterMap.size === 0) {
116 |       return [text]
117 |     }
118 | 
119 |     const keys = Array.from(this.delimiterMap.keys())
120 |     const pattern = new RegExp(`(${keys.map(escapeRegExp).join('|')})`)
121 |     const parts = text.split(pattern)
122 | 
123 |     const sentences = []
124 |     for (let i = 0; i < parts.length; i += 2) {
125 |       const sentence = parts[i]
126 |       const placeholder = parts[i + 1] || ''
127 |       sentences.push(sentence + placeholder)
128 |     }
129 | 
130 |     return sentences
131 |   }
132 | 
133 |   replaceNumbersWithCode (text) {
134 |     // Regular expression to match numbers, including decimal points and commas
135 |     const numberPattern = /\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b/g
136 | 
137 |     const replacedText = text.replace(numberPattern, match => {
138 |       const code = generateUniqueCode(NUM, this.replacementCounter++)
139 |       this.replacementMap.set(code, match)
140 |       return code
141 |     })
142 | 
143 |     return replacedText
144 |   }
145 | 
146 |   revertReplacements (text) {
147 |     let originalText = text
148 |     for (const [placeholder, replacement] of this.replacementMap.entries()) {
149 |       const pattern = new RegExp(escapeRegExp(placeholder), 'g')
150 |       originalText = originalText.replace(pattern, replacement)
151 |     }
152 | 
153 |     return originalText
154 |   }
155 | 
156 |   revertDelimiters (text) {
157 |     let originalText = text
158 |     for (const [placeholder, replacement] of this.delimiterMap.entries()) {
159 |       const pattern = new RegExp(escapeRegExp(placeholder), 'g')
160 |       originalText = originalText.replace(pattern, replacement)
161 |     }
162 | 
163 |     return originalText
164 |   }
165 | 
166 |   tokenize (text) {
167 |     this.replacementCounter = 0
168 |     this.replacementMap = new Map()
169 |     this.delimiterMap = new Map()
170 | 
171 |     DEBUG && console.log('---Start of sentence tokenization-----------------------')
172 |     DEBUG && console.log('Original input: >>>' + text + '<<<')
173 |     // Replace abbreviations
174 |     const result1 = this.replaceAbbreviations(text)
175 |     DEBUG && console.log('Phase 1: replacing abbreviations: ' + JSON.stringify(result1))
176 | 
177 |     // Replace URIs
178 |     const result2 = this.replaceUrisWithPlaceholders(result1)
179 |     DEBUG && console.log('Phase 2: replacing URIs: ' + JSON.stringify(result2))
180 | 
181 |     // Replace delimiters followed by optional quotes, brackets, and braces
182 |     const result3 = this.replaceNumbersWithCode(result2)
183 |     DEBUG && console.log('Phase 3: replacing numbers with placeholders: ' + JSON.stringify(result3))
184 | 
185 |     // Replace delimiters followed by optional quotes, brackets, and braces
186 |     const result4 = this.replaceDelimitersWithPlaceholders(result3)
187 |     DEBUG && console.log('Phase 4: replacing delimiters with placeholders: ' + JSON.stringify(result4))
188 | 
189 |     // Split on placeholders for sentence delimiters
190 |     const sentences = this.splitOnPlaceholders(result4)
191 |     DEBUG && console.log('Phase 5: splitting into sentences on placeholders: ' + JSON.stringify(sentences))
192 | 
193 |     // Replace back all abbreviations, URIs, and delimiters
194 |     const newSentences = sentences.map(s => {
195 |       const s1 = this.revertReplacements(s)
196 |       return this.revertDelimiters(s1)
197 |     })
198 |     DEBUG && console.log('Phase 6: replacing back abbreviations, URIs, numbers and delimiters: ' + JSON.stringify(newSentences))
199 | 
200 |     const trimmedSentences = this.trim(newSentences)
201 |     DEBUG && console.log('Phase 7: trimming array of empty sentences: ' + JSON.stringify(trimmedSentences))
202 | 
203 |     const trimmedSentences2 = trimmedSentences.map(sent => this.trimSentences ? sent.trim() : sent ) // Jeremy's modification to preserve spaces.
204 |     DEBUG && console.log('Phase 8: trimming sentences from surrounding whitespace: ' + JSON.stringify(trimmedSentences2))
205 |     DEBUG && console.log('---End of sentence tokenization--------------------------')
206 |     DEBUG && console.log('---Replacement map---------------------------------------')
207 |     DEBUG && console.log([...this.replacementMap.entries()])
208 |     DEBUG && console.log('---Delimiter map-----------------------------------------')
209 |     DEBUG && console.log([...this.delimiterMap.entries()])
210 |     DEBUG && console.log('---------------------------------------------------------')
211 | 
212 |     return trimmedSentences2
213 |   }
214 | }
215 | 


--------------------------------------------------------------------------------
/src/main/services/sentenceSplitter.test.ts:
--------------------------------------------------------------------------------
 1 | //@ts-nocheck
 2 | import { expect, test } from 'vitest'
 3 | import { CustomSentenceSplitter } from './sentenceSplitter'
 4 | import { SentenceSplitter, IngestionPipeline, Document } from "llamaindex";
 5 | 
 6 | // do these tests just to make sure that we can factor out my hacky fixes when llamaindex is fixed.
 7 | // test that original sentenceSplitter splits on abbreviations
 8 | // test that original sentenceSplitter splits on abbreviations even when specified
 9 | 
10 | // test that my modified sentenceSplitter excludes metadata when arg is specified
11 | // test that my modified sentenceSplitter includes metadata when arg is specified the other way
12 | 
13 | 
14 | 
// Shared fixture: one multi-sentence document containing abbreviations ("Co.", "Mr.", "N.A.")
// followed by spaces, which is what the splitters under test are expected to trip over.
const documents = [
  new Document({
    text: "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.  Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.  He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.",
  }),
];

// Pipeline built on the stock llamaindex splitter.
const originalSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [new SentenceSplitter({ chunkSize: 50, chunkOverlap: 10 })],
});

// Pipeline built on the project's splitter, using its default abbreviation list.
const customSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [new CustomSentenceSplitter({ chunkSize: 50, chunkOverlap: 10 })],
});
29 | 
// The pipeline run is awaited so the assertions execute (and can fail) inside the test;
// previously the promise was dropped and the test completed before any `expect` ran.
test("my modified sentenceSplitter doesn't eliminate spaces", async () => {
    const nodes = await customSentenceSplitterPipeline.run({documents: documents});
    expect(nodes.some((node) => node["text"].indexOf("Co.elected") > -1)).toEqual(false);
    expect(nodes.some((node) => node["text"].indexOf("Mr.Weinberger") > -1)).toEqual(false);
    expect(nodes.some((node) => node["text"].indexOf("A.and") > -1)).toEqual(false);
});
37 | 
// Documents the upstream behavior the custom splitter works around. The run is awaited so
// that this test actually fails once llamaindex stops gluing sentences together without spaces.
test("original sentenceSplitter does eliminate spaces", async () => {
    const nodes = await originalSentenceSplitterPipeline.run({documents: documents});
    expect(nodes.some((node) => node["text"].indexOf("Co.elected") > -1)).toEqual(true);
    expect(nodes.some((node) => node["text"].indexOf("Mr.Weinberger") > -1)).toEqual(true);
    expect(nodes.some((node) => node["text"].indexOf("A.and") > -1)).toEqual(true);
});
45 | 
// Same custom splitter, but with an explicitly empty abbreviation list.
const noAbbrevsCustomSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [
    new CustomSentenceSplitter({ chunkSize: 50, chunkOverlap: 10, abbreviations: [] }),
  ],
});
51 | 
52 | 
53 |   test("my modified sentenceSplitter doesn't split on specified abbreviations", () => {
54 |     customSentenceSplitterPipeline.run({documents: documents}).then((nodes) => {
55 |         expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).not.toContainEqual(true);
56 |     });
57 | });
58 | 
59 | // this is only a problem on branch fix/sentence-splitter-spaces
60 | // where the chunker is eliminated entirely in favor of just splitting by sentences with natural.
// With an empty abbreviation list at least one chunk is expected to end in "Mr.".
// The run is awaited so the assertion is evaluated before the test completes.
test("original sentenceSplitter splits in silly places, like Mr", async () => {
    const nodes = await noAbbrevsCustomSentenceSplitterPipeline.run({documents: documents});
    expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).toContainEqual(true);
});
66 | 
// Each entry is [input, expected single-chunk output]: short inputs that contain
// abbreviations and must come back from the custom splitter as exactly one unchanged node.
const testcases = [
    ["USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41", "USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41"],
    ["JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.", "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee."],
    ["Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.", "Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019."],
    ["He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.", "He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time."],

];
testcases.forEach(([testcase_input, testcase_expected_output]) => {
    // The pipeline run is awaited so a wrong node count or text fails this test
    // instead of surfacing later as an unhandled rejection.
    test(`my sentenceSplitter correctly handles short sentence ${testcase_input}`, async () => {
        const nodes = await customSentenceSplitterPipeline.run({documents: [new Document({text: testcase_input})]});
        expect(nodes.length).toEqual(1);
        expect(nodes[0]["text"]).toEqual(testcase_expected_output);
    })
});
82 | 


--------------------------------------------------------------------------------
/src/main/services/sentenceSplitter.ts:
--------------------------------------------------------------------------------
  1 | import { SentenceSplitter, splitBySep, splitByRegex, splitByChar, Settings } from "llamaindex";
  2 | import { SentenceTokenizer } from "./optional_trim_sentence_tokenizer"; // like the version from natural, but with trimSentence option added.
  3 | 
  4 | /*
  5 | LlamaIndex's SentenceSplitter includes the length of the metadata as part of the size of the chunk when splitting by sentences. 
  6 | This produces very unintuitive behavior: e.g. when the user specifies a chunk-size of 50 and nodes have metadata of length 40, 
  7 | the resulting split sentences are about 10 tokens long -- as opposed to the specified 50.
  8 | 
  9 | This modified SentenceSplitter adds an `INCLUDE_METADATA_IN_CHUNKSIZE` flag that, when false, disables the above behavior,
 10 | ignoring metadata when calculating chunk size (i.e. only including the size of the text data when calculating chunk size).
 11 | 
 12 | Additionally, splitTextMetadataAware does some bizarre stuff where it will split sentences at abbreviations -- even if the 
 13 | underlying tokenizer knows about the abbreviations, I think due to some weird sub-sentence splitting. It also sews sentence
 14 | chunks back together in a way that eliminates spaces, e.g. `JPMorgan Chase & Co.elected Mark Weinberger` and  `Mr.Weinberger was Global Chairman`.
 15 | 
 16 | I also tried making SentenceSplitter just split on sentences (with Natural) but this misbehaved by splitting TOO much. I do need short sentences grouped
 17 | together (whether they are true short sentences, or false-positives like "USA v. one 12 ft. I.B.M. mainframe").
 18 | 
 19 | 
 20 | */
 21 | // TODO: make this configurable
 22 | const INCLUDE_METADATA_IN_CHUNKSIZE = false;
 23 | SentenceSplitter.prototype.splitTextMetadataAware = function(text: string, metadata: string): string[] {
 24 |   const metadataLength = this.tokenSize(metadata);
 25 |   const effectiveChunkSize = INCLUDE_METADATA_IN_CHUNKSIZE ? this.chunkSize - metadataLength : this.chunkSize;
 26 |   if (effectiveChunkSize <= 0) {
 27 |     throw new Error(
 28 |       `Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`,
 29 |     );
 30 |   } else if (effectiveChunkSize < 50) {
 31 |     console.log(
 32 |       `Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`,
 33 |     );
 34 |   }
 35 |   return this._splitText(text, effectiveChunkSize);
 36 | }
 37 | 
 38 | const default_abbreviations= ['dr.', 'vs.', 'mr.', 'ms.', 'mx.', 'mrs.', 'prof.', 'inc.', 'corp.', 'co.', 'llc.', 'ltd.', 'etc.', "i.e.",
 39 |   "etc.",
 40 |   "vs.",
 41 |   "A.S.A.P.",
 42 | ];
 43 | 
 44 | // verbatim copies
/** A function that breaks one string into an ordered list of pieces. */
type TextSplitterFn = (text: string) => string[];
/** One piece of text produced while splitting, together with bookkeeping used when merging. */
type _Split = {
  /** The text of this piece. */
  text: string;
  /** True when the piece came from a paragraph/sentence-level splitter (or was not split at all). */
  isSentence: boolean;
  /** Size of `text` as reported by the splitter's `tokenSize`. */
  tokenSize: number;
};
 51 | 
 52 | 
 53 | // This varies from SentenceSplitter in two ways:
 54 | // 1. it uses abbreviations set here.
 55 | // 2. it uses a custom SentenceTokenizer with a second trimSentences argument that controls
 56 | //    whether or not leading/trailing whitespace is preserved.
 57 | //    We want to preserve it, so that when sentences are merged back again, we don't end up with 
 58 | //    sentences that are not separated by spaces.
 59 | // Because JavaScript is stupid, we have to copy over almost the whole SentenceSplitter just to make those few small changes.
/**
 * A `SentenceSplitter` whose sentence-level splitting goes through the project's
 * `SentenceTokenizer` (configured with an abbreviation list and with trimming disabled),
 * while the split/merge machinery is re-declared locally so it can use that tokenizer.
 */
export class CustomSentenceSplitter extends SentenceSplitter {

  // this function is new.
  /**
   * Builds the sentence-level splitter function.
   * The returned function tokenizes `text` with `this.tokenizer`; if tokenizing throws,
   * it falls back to returning the whole text as a single piece.
   */
  chunkingTokenizerFn = (): TextSplitterFn => {
    return (text: string) => {
      try {
        return this.tokenizer.tokenize(text, false); // this false argument does all the work of preserving spaces.
      } catch {
        return [text];
      }
    };
  };
  // Paragraph/sentence-level splitters, tried in insertion order by #getSplitsByFns.
  #splitFns: Set<TextSplitterFn> = new Set();
  // Finer-grained fallbacks, tried (in insertion order) only when no entry of #splitFns splits the text.
  #subSentenceSplitFns: Set<TextSplitterFn> = new Set();
  // Abbreviation list passed to the SentenceTokenizer.
  abbreviations: string[];
  // Tokenizer used by chunkingTokenizerFn to split text into sentences.
  tokenizer: SentenceTokenizer;

  /**
   * @param params Optional settings. `chunkSize` and `chunkOverlap` are forwarded to the
   *   superclass; `abbreviations` replaces `default_abbreviations` when provided
   *   (an empty array is kept as-is, since it is truthy).
   */
  constructor(params: { chunkSize?: number; chunkOverlap?: number; abbreviations?: string[] } = {}) {
    super(params);
    // Create custom tokenizer with abbreviations
    this.abbreviations = params.abbreviations || default_abbreviations;
    this.tokenizer = new SentenceTokenizer(this.abbreviations, false); // false is don't trim sentences

    // copied from the superclass.
    this.#splitFns.add(splitBySep(this.paragraphSeparator));

    this.#splitFns.add(this.chunkingTokenizerFn()); // the ONLY change here in the constructor.

    // copied from the superclass.
    this.#subSentenceSplitFns.add(splitByRegex(this.secondaryChunkingRegex));
    this.#subSentenceSplitFns.add(splitBySep(this.separator));
    this.#subSentenceSplitFns.add(splitByChar());

    // left over from a failed attempt to JUST use natural.SentenceTokenizer
    // but I DO in fact need the merge stuff.
    // const tokenizer = 
    // Override the default splitText method
    // this.splitText = (text: string): string[] => {
    //   return tokenizer.tokenize(text);
    // };
    // /* tslint:disable:no-unused-variable */
    // this.splitTextMetadataAware = (text: string, metadata: string): string[] => {
    //   return tokenizer.tokenize(text);
    // }
  }


  //just verbatim copies of the parent class

  /**
   * Splits `text` into chunks of at most `chunkSize` tokens.
   * Emits "chunking-start" and "chunking-end" events on `Settings.callbackManager`.
   * An empty string is returned as `[""]` without emitting any event.
   */
  _splitText(text: string, chunkSize: number): string[] {
    if (text === "") return [text];

    const callbackManager = Settings.callbackManager;

    callbackManager.dispatchEvent("chunking-start", {
      text: [text],
    });
    const splits = this.#split(text, chunkSize);
    const chunks = this.#merge(splits, chunkSize);

    callbackManager.dispatchEvent("chunking-end", {
      chunks,
    });
    return chunks;
  }

  /**
   * Recursively breaks `text` into pieces whose token size does not exceed `chunkSize`.
   * Text that already fits is returned as a single piece flagged as a sentence.
   */
  #split(text: string, chunkSize: number): _Split[] {
    const tokenSize = this.tokenSize(text);
    if (tokenSize <= chunkSize) {
      return [
        {
          text,
          isSentence: true,
          tokenSize,
        },
      ];
    }
    const [textSplitsByFns, isSentence] = this.#getSplitsByFns(text);
    const textSplits: _Split[] = [];

    for (const textSplit of textSplitsByFns) {
      const tokenSize = this.tokenSize(textSplit);
      if (tokenSize <= chunkSize) {
        textSplits.push({
          text: textSplit,
          isSentence,
          tokenSize,
        });
      } else {
        // Still too large: split this piece again.
        const recursiveTextSplits = this.#split(textSplit, chunkSize);
        textSplits.push(...recursiveTextSplits);
      }
    }
    return textSplits;
  }

  /**
   * Applies the first splitter that yields more than one piece.
   * @returns The pieces and whether they came from a sentence-level splitter (`true`)
   *   or a sub-sentence fallback (`false`). If nothing splits the text, returns `[[text], true]`.
   */
  #getSplitsByFns(text: string): [splits: string[], isSentence: boolean] {
    for (const splitFn of this.#splitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, true];
      }
    }
    for (const splitFn of this.#subSentenceSplitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, false];
      }
    }
    return [[text], true];
  }

  /**
   * Greedily concatenates consecutive pieces into chunks of at most `chunkSize` tokens,
   * seeding each new chunk with trailing pieces of the previous one up to `this.chunkOverlap` tokens.
   * Consumes (empties) the `splits` array.
   * @throws Error if a single piece is larger than `chunkSize`.
   */
  #merge(splits: _Split[], chunkSize: number): string[] {
    const chunks: string[] = [];
    let currentChunk: [string, number][] = [];
    let lastChunk: [string, number][] = [];
    let currentChunkLength = 0;
    let newChunk = true;

    // Emits the current chunk, then starts the next one pre-filled with as many trailing
    // pieces of the emitted chunk as fit within the overlap budget.
    const closeChunk = (): void => {
      chunks.push(currentChunk.map(([text]) => text).join(""));
      lastChunk = currentChunk;
      currentChunk = [];
      currentChunkLength = 0;
      newChunk = true;

      let lastIndex = lastChunk.length - 1;
      while (
        lastIndex >= 0 &&
        currentChunkLength + lastChunk[lastIndex]![1] <= this.chunkOverlap
      ) {
        const [text, length] = lastChunk[lastIndex]!;
        currentChunkLength += length;
        currentChunk.unshift([text, length]);
        lastIndex -= 1;
      }
    };

    while (splits.length > 0) {
      const curSplit = splits[0]!;
      if (curSplit.tokenSize > chunkSize) {
        throw new Error("Single token exceeded chunk size");
      }
      if (currentChunkLength + curSplit.tokenSize > chunkSize && !newChunk) {
        // The piece does not fit and the chunk already received a piece: close it and retry.
        closeChunk();
      } else {
        if (
          curSplit.isSentence ||
          currentChunkLength + curSplit.tokenSize <= chunkSize ||
          newChunk
        ) {
          currentChunkLength += curSplit.tokenSize;
          currentChunk.push([curSplit.text, curSplit.tokenSize]);
          splits.shift();
          newChunk = false;
        } else {
          closeChunk();
        }
      }
    }

    // Handle the last chunk
    if (!newChunk) {
      chunks.push(currentChunk.map(([text]) => text).join(""));
    }

    return this.#postprocessChunks(chunks);
  }

  /** Trims every chunk and drops the ones that are empty after trimming. */
  #postprocessChunks(chunks: string[]): string[] {
    const newChunks: string[] = [];
    for (const chunk of chunks) {
      const trimmedChunk = chunk.trim();
      if (trimmedChunk !== "") {
        newChunks.push(trimmedChunk);
      }
    }
    return newChunks;
  }
} 


--------------------------------------------------------------------------------
/src/main/services/sploder.ts:
--------------------------------------------------------------------------------
 1 | import { TextNode, BaseNode, TransformComponent } from "llamaindex";
 2 | import { encodingForModel } from "js-tiktoken";
 3 | 
/** Construction options for `Sploder`. */
interface SploderConfig {
  /** Nodes whose text has more tokens than this are passed through without combined variants. */
  maxStringTokenCount: number;
}
 7 | 
 8 | export class Sploder extends TransformComponent {
 9 |   private maxTokenCount: number;
10 |   private tokenizer: any; // js-tiktoken encoder
11 | 
12 |   // TODO: this is a hack to get the tokenizer for the embedding model
13 |   // TODO: this should be a singleton
14 |   constructor(config: SploderConfig) {
15 |     super(async (nodes: BaseNode[]) => nodes); // no-op, to be replaced later
16 |     this.maxTokenCount = config.maxStringTokenCount;
17 |     this.tokenizer = encodingForModel("text-embedding-3-small");
18 |   }
19 | 
20 |   private getTokenCount(text: string): number {
21 |     return this.tokenizer.encode(text).length;
22 |   }
23 | 
24 |   async transform(nodes: TextNode[]): Promise<TextNode[]> {
25 |     const newNodes: TextNode[] = [];
26 | 
27 |     nodes.forEach((node, index) => {
28 |       // Keep original node
29 |       newNodes.push(node);
30 | 
31 |       // Skip if text is too long
32 |       if (this.getTokenCount(node.text) > this.maxTokenCount) {
33 |         return;
34 |       }
35 | 
36 |       const prevNode = index > 0 ? nodes[index - 1] : null;
37 |       const nextNode = index < nodes.length - 1 ? nodes[index + 1] : null;
38 | 
39 |       // Create node with current + next if available
40 |       if (nextNode) {
41 |         newNodes.push(
42 |           new TextNode({
43 |             text: node.text + " " + nextNode.text,
44 |             metadata: { ...node.metadata, isExpanded: true }
45 |           })
46 |         );
47 |       }
48 | 
49 |       // Create node with prev + current + next if both available
50 |       if (prevNode && nextNode) {
51 |         newNodes.push(
52 |           new TextNode({
53 |             text: prevNode.text + " " + node.text + " " + nextNode.text,
54 |             metadata: { ...node.metadata, isExpanded: true }
55 |           })
56 |         );
57 |       }
58 |     });
59 | 
60 |     return newNodes;
61 |   }
62 | } 


--------------------------------------------------------------------------------
/src/main/services/weaviateService.ts:
--------------------------------------------------------------------------------
 1 | import weaviate, { EmbeddedOptions, EmbeddedDB } from 'weaviate-ts-embedded';
 2 | import path from 'path';
 3 | 
 4 | export async function create_weaviate_database(storagePath: string) {
 5 |     let embedded_db: any = null;
 6 |   
 7 |     const embeddedOptions = new EmbeddedOptions({
 8 |       host: '127.0.0.1',
 9 |       port: 9898,
10 |       persistenceDataPath: path.join(storagePath, "weaviate_data"),
11 |     });
12 |     
13 |     embedded_db = new EmbeddedDB(embeddedOptions);
14 |     await embedded_db.start();
15 |     
16 |     const client = await weaviate.client(embeddedOptions,   {
17 |     //   scheme: 'http',
18 |       host: '127.0.0.1',
19 |       port: 9898
20 |     });
21 |     client.embedded = embedded_db;
22 |     console.log('Weaviate binary:', client.embedded.options.binaryPath);
23 |     console.log('Weaviate data path:', client.embedded.options.persistenceDataPath);
24 |     
25 |     console.info('\nEmbedded DB started\n');
26 |     return client
27 |   }
28 |   
29 | 
/**
 * Stops the embedded database attached to `client`.
 * Does nothing when `client` is falsy.
 *
 * @param client A client as returned by `create_weaviate_database` (must expose `embedded.stop()`).
 */
export async function teardown_weaviate_database(client) {
  if (!client) {
    return;
  }
  await client.embedded.stop();
}


--------------------------------------------------------------------------------
/src/main/types/index.ts:
--------------------------------------------------------------------------------
/** One search hit: a piece of text, its score, and free-form metadata. */
export interface SearchResult {
  text: string;
  // NOTE(review): presumably a similarity score from the vector store; scale not visible here.
  score: number;
  metadata: Record<string, any>;
}
 6 | 
/** Result envelope for an embedding operation: a success flag plus optional error text and index. */
export interface EmbeddingResult {
  success: boolean;
  error?: string;
  // NOTE(review): untyped; presumably the vector index that was built — confirm against the producer.
  index?: any;
}
12 | 
/** Result envelope for a preview run: a success flag plus optional nodes and cost figures. */
export interface PreviewResult {
  success: boolean;
  error?: string;
  /** Preview nodes, each with its text and metadata. */
  nodes?: Array<{
    text: string;
    metadata: Record<string, any>;
  }>;
  // NOTE(review): currency/units of the three figures below are not visible here.
  estimatedPrice?: number;
  tokenCount?: number;
  pricePer1M?: number;
}
24 | 
25 | 
// Define types for our document set metadata
/** Descriptive record for one document set. */
export interface DocumentSetMetadata {
  documentSetId: number;
  name: string;
  uploadDate: Date;
  /** Arbitrary parameters stored with the set; shape is not constrained by this type. */
  parameters: Record<string, unknown>;
  totalDocuments: number;
}
34 | 
/**
 * Parameters describing a document set: dataset naming, column selection,
 * splitting/chunking options and the embedding model to use.
 */
export interface DocumentSetParams {
  datasetName: string,
  description: string,
  /** Names of the columns treated as text. */
  textColumns: string[],
  /** Names of the columns treated as metadata. */
  metadataColumns: string[],
  splitIntoSentences: boolean,
  combineSentencesIntoChunks: boolean,
  // NOTE(review): presumably forwarded to Sploder as its max token count — confirm at the call site.
  sploderMaxSize: number,
  chunkSize: number,
  chunkOverlap: number,
  modelName: string,
  modelProvider: string
}
48 | 
49 | 
/** Configuration for an embedding run: model, vector store, storage location and chunking options. */
export interface EmbeddingConfig {
  modelName: string;
  modelProvider: string
  /** Which vector store backend to use. */
  vectorStoreType: "simple" | "postgres" | "weaviate";
  projectName: string;
  storagePath: string;
  splitIntoSentences: boolean;
  combineSentencesIntoChunks: boolean;
  sploderMaxSize: number;
  chunkSize: number;
  chunkOverlap: number;
}
62 | 
63 | 
/** User-configurable settings; each value may be `null`. */
export interface Settings {
  openAIKey: string | null;
  oLlamaModelType: string | null;
  oLlamaBaseURL: string | null;
}
69 | 
/** A single filter condition: a metadata key, a comparison operator, and the value to compare with. */
export interface MetadataFilter{ 
  key: string, 
  /** One of the supported comparison operators. */
  operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty", 
  value: any 
}
75 | 
/** Bundle of database client handles (both untyped here). */
export interface Clients {
  weaviateClient: any;
  postgresClient: any;
}


--------------------------------------------------------------------------------
/src/main/utils.ts:
--------------------------------------------------------------------------------
  1 | import { 
  2 |   Document, 
  3 |   NodeWithScore,
  4 |   BaseNode
  5 | } from "llamaindex";
  6 | 
  7 | export function sanitizeProjectName(projectName: string) {
  8 |   return projectName.replace(/[^a-zA-Z0-9]/g, "_");
  9 | }
 10 | export function capitalizeFirstLetter(val) {
 11 |   return String(val).charAt(0).toUpperCase() + String(val).slice(1);
 12 | }
 13 | 
 14 | function escapeKey(key: string): string {
 15 |   const validKeyRegex = /^[_A-Za-z][_0-9A-Za-z]*$/;
 16 |   const ESCAPE_PREFIX = "__ESC__";
 17 |   if (validKeyRegex.test(key)) {
 18 |     return key;
 19 |   } else {
 20 |     // Replace each invalid character with _X<hex>_
 21 |     let escaped = "";
 22 |     for (let i = 0; i < key.length; i++) {
 23 |       const char = key[i];
 24 |       if (
 25 |         (i === 0 && !/[A-Za-z_]/.test(char)) ||
 26 |         (i > 0 && !/[A-Za-z0-9_]/.test(char))
 27 |       ) {
 28 |         escaped += `_X${char.charCodeAt(0).toString(16).toUpperCase()}_`;
 29 |       } else {
 30 |         escaped += char;
 31 |       }
 32 |     }
 33 |     return ESCAPE_PREFIX + escaped;
 34 |   }
 35 | }
 36 | 
 37 | function unescapeKey(key: string): string {
 38 |   const ESCAPE_PREFIX = "__ESC__";
 39 |   if (key.startsWith(ESCAPE_PREFIX)) {
 40 |     // Replace all _X<hex>_ with the corresponding character
 41 |     return key
 42 |       .slice(ESCAPE_PREFIX.length)
 43 |       .replace(/_X([0-9A-F]+)_/g, (_, hex) =>
 44 |         String.fromCharCode(parseInt(hex, 16))
 45 |       );
 46 |   } else {
 47 |     return key;
 48 |   }
 49 | }
 50 | 
 51 | // Escapes metadata keys that don't match /^[_A-Za-z][_0-9A-ZaZ]*$/ using URI encoding with a prefix
 52 | export function escapeDocumentMetadataKeys(document: Document): Document {
 53 |   const newMetadata: Record<string, any> = {};
 54 | 
 55 |   for (const key in document.metadata) {
 56 |     newMetadata[escapeKey(key)] = document.metadata[key];
 57 |   }
 58 |   document.metadata = newMetadata;
 59 |   return document;
 60 | }
 61 | 
 62 | // Reverses the escaping done by escapeMetadataKeys
 63 | export function unescapeDocumentMetadataKeys(document: Document): Document {
 64 |   const ESCAPE_PREFIX = "__ESC__";
 65 |   const newMetadata: Record<string, any> = {};
 66 | 
 67 |   for (const key in document.metadata) {
 68 |     if (key.startsWith(ESCAPE_PREFIX)) {
 69 |       const originalKey = unescapeKey(key)
 70 |       newMetadata[originalKey] = document.metadata[key];
 71 |     } else {
 72 |       newMetadata[key] = document.metadata[key];
 73 |     }
 74 |   }
 75 |   document.metadata = newMetadata;
 76 |   return document;
 77 | }
 78 | 
 79 | 
 80 | // Reverses the escaping done by escapeMetadataKeys
 81 | export function unescapeNodeWithScoreMetadataKeys(node_with_score: NodeWithScore): NodeWithScore {
 82 |   const ESCAPE_PREFIX = "__ESC__";
 83 |   const newMetadata: Record<string, any> = {};
 84 | 
 85 |   for (const key in node_with_score.node.metadata) {
 86 |     if (key.startsWith(ESCAPE_PREFIX)) {
 87 |       const originalKey = unescapeKey(key)
 88 |       newMetadata[originalKey] = node_with_score.node.metadata[key];
 89 |     } else {
 90 |       newMetadata[key] = node_with_score.node.metadata[key];
 91 |     }
 92 |   }
 93 |   node_with_score.node.metadata = newMetadata;
 94 |   return node_with_score
 95 | }
 96 | 
 97 | // Reverses the escaping done by escapeMetadataKeys
 98 | export function unescapeNodeMetadataKeys(node: BaseNode): BaseNode {
 99 |   const ESCAPE_PREFIX = "__ESC__";
100 |   const newMetadata: Record<string, any> = {};
101 | 
102 |   for (const key in node.metadata) {
103 |     if (key.startsWith(ESCAPE_PREFIX)) {
104 |       const originalKey = unescapeKey(key)
105 |       newMetadata[originalKey] = node.metadata[key];
106 |     } else {
107 |       newMetadata[key] = node.metadata[key];
108 |     }
109 |   }
110 |   node.metadata = newMetadata;
111 |   return node
112 | }


--------------------------------------------------------------------------------
/src/preload.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremybmerrill/meaningfully/f2cb3bc2c4627556e2c93818330108f47635dbff/src/preload.ts


--------------------------------------------------------------------------------
/src/preload/index.d.ts:
--------------------------------------------------------------------------------
 1 | import { ElectronAPI } from '@electron-toolkit/preload'
 2 | 
 3 | 
 4 | // copy pasta from ../main/types/index.ts
/** One search hit as seen by the renderer: a piece of text, its score, and free-form metadata. */
export interface SearchResult {
  text: string;
  score: number;
  metadata: Record<string, any>;
}
10 | 
/** Result envelope for an embedding operation: a success flag plus optional error text and index. */
export interface EmbeddingResult {
  success: boolean;
  error?: string;
  index?: any;
}
16 | 
/** Result envelope for a preview run: a success flag plus optional nodes and cost figures. */
export interface PreviewResult {
  success: boolean;
  error?: string;
  /** Preview nodes, each with its text and metadata. */
  nodes?: Array<{
    text: string;
    metadata: Record<string, any>;
  }>;
  estimatedPrice?: number;
  tokenCount?: number;
  pricePer1M?: number;
}
28 | 
/** Descriptive record for one document set. */
export interface DocumentSetMetadata {
  documentSetId: number;
  name: string;
  // NOTE(review): typed as Date, but this value reaches the renderer over IPC — confirm it is not a string at runtime.
  uploadDate: Date;
  parameters: Record<string, unknown>;
  totalDocuments: number;
}
36 | 
/**
 * User-configurable settings as seen by the renderer.
 * NOTE(review): the main-process `Settings` type declares each of these as `string | null`;
 * the non-null types here may be too narrow — confirm.
 */
export interface Settings {
  openAIKey: string;
  oLlamaModelType: string;
  oLlamaBaseURL: string;
}
42 | 
/**
 * Types for the objects the preload script exposes on `window`:
 * `window.electron` and the `window.api` bridge to the main process.
 */
declare global {
  interface Window {
    electron: ElectronAPI
    api: {
      /** Lists document sets; both paging arguments are optional, matching the preload implementation. */
      listDocumentSets: (page?: number, pageSize?: number) => Promise<{documents: DocumentSetMetadata[], total: number}> ,
      getDocumentSet: (documentSetId: number) => Promise<DocumentSetMetadata>,
      // NOTE(review): the preload implementation of uploadCsv also declares splitIntoSentences,
      // combineSentencesIntoChunks, sploderMaxSize, chunkSize and chunkOverlap on formData.
      // They are not added here because making them required could break existing callers — confirm and align.
      uploadCsv: (formData: {
        file: File,
        datasetName: string,
        description: string,
        textColumns: string[],
        metadataColumns: string[]
      }) => Promise<{ success: true, documentSetId: number }>,
      searchDocumentSet: (params: {
        documentSetId: number;
        query: string;
        n_results: number;
        filters?: { 
          key: string, 
          operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty", 
          value: any 
        }[];
      }) => Promise<SearchResult[]>,
      getDocument(params: {
        documentSetId: number;
        documentId: string;
      }): Promise<{ text: string, metadata: Record<string, any> }>,
      getSettings: () => Promise<Settings>, 
      setSettings: (settings: Settings) => Promise<void>,
      deleteDocumentSet: (documentSetId: number) => Promise<{ success: boolean }>,
      generatePreviewData: (formData: {
        file: File,
        datasetName: string,
        description: string,
        textColumns: string[],
        metadataColumns: string[],
        splitIntoSentences: boolean,
        combineSentencesIntoChunks: boolean,
        sploderMaxSize: number,
        chunkSize: number,
        chunkOverlap: number,
        modelName: string,
        modelProvider: string
      }) => Promise<{ success: boolean, nodes: Record<string, any>[], estimatedPrice: number, tokenCount: number }>,
    }
  }
}
90 | 


--------------------------------------------------------------------------------
/src/preload/index.ts:
--------------------------------------------------------------------------------
 1 | import { contextBridge, ipcRenderer } from 'electron'
 2 | // import type { Settings } from './index.d'
 3 | // Expose protected methods that allow the renderer process to use
 4 | // the ipcRenderer without exposing the entire object
 5 | 
 6 | // Options the renderer collects for a CSV. uploadCsv and generatePreviewData
 7 | // take exactly the same arguments, so both share this single declaration.
 8 | type CsvFormData = {
 9 |   file: File,
10 |   datasetName: string,
11 |   description: string,
12 |   textColumns: string[],
13 |   metadataColumns: string[],
14 |   splitIntoSentences: boolean,
15 |   combineSentencesIntoChunks: boolean,
16 |   sploderMaxSize: number,
17 |   chunkSize: number,
18 |   chunkOverlap: number,
19 | }
20 | 
21 | // Convert the File object to a format that can be sent over IPC: every other
22 | // field is copied as-is and `file` is replaced by just its name and path.
23 | // NOTE(review): File.path is an Electron extension; newer Electron releases replace it
24 | // with webUtils.getPathForFile -- confirm against the Electron version this project pins.
25 | const toIpcPayload = ({ file, ...rest }: CsvFormData) => ({
26 |   ...rest,
27 |   file: {
28 |     name: file.name,
29 |     path: file.path
30 |   }
31 | })
32 | 
33 | contextBridge.exposeInMainWorld('api', {
34 |   listDocumentSets: (page?: number, pageSize?: number) =>
35 |     ipcRenderer.invoke('list-document-sets', page, pageSize),
36 |   getDocumentSet: (documentSetId: number) => ipcRenderer.invoke('get-document-set', documentSetId),
37 |   deleteDocumentSet: (documentSetId: number) => ipcRenderer.invoke('delete-document-set', documentSetId),
38 | 
39 |   // uploadCsv and generatePreviewData have exactly the same argument signature
40 |   // but different return types: uploadCsv mutates the state of the various
41 |   // databases and returns an ID, generatePreviewData just returns a list of records.
42 |   uploadCsv: (formData: CsvFormData) => {
43 |     const payload = toIpcPayload(formData);
44 |     // Under test, force modelProvider to "mock" so we don't hit a paid API.
45 |     // Set on the outgoing copy, so the caller's formData is never modified.
46 |     return ipcRenderer.invoke(
47 |       'upload-csv',
48 |       process.env.NODE_ENV === 'test' ? { ...payload, modelProvider: 'mock' } : payload
49 |     );
50 |   },
51 |   generatePreviewData: (formData: CsvFormData) =>
52 |     ipcRenderer.invoke('generate-preview-data', toIpcPayload(formData)),
53 | 
54 |   searchDocumentSet: (params: {
55 |     documentSetId: number;
56 |     query: string;
57 |     n_results: number;
58 |     filters?: Record<string, any>;
59 |   }) => ipcRenderer.invoke('search-document-set', params),
60 | 
61 |   getDocument: (params: {
62 |     documentSetId: number;
63 |     documentId: string;
64 |   }) => ipcRenderer.invoke('get-document', params),
65 | 
66 |   getSettings: () => ipcRenderer.invoke('get-settings'),
67 |   setSettings: (settings: {
68 |     openAIKey: string;
69 |     oLlamaModelType: string;
70 |     oLlamaBaseURL: string;
71 |   }) => ipcRenderer.invoke('set-settings', settings)
72 | })
73 | 
74 | // Expose electron utilities
75 | // NOTE(review): send() forwards any channel name the renderer passes; consider
76 | // restricting it to the channels the main process actually listens on.
77 | contextBridge.exposeInMainWorld('electron', {
78 |   ipcRenderer: {
79 |     send: (channel: string, ...args: any[]) => {
80 |       ipcRenderer.send(channel, ...args)
81 |     }
82 |   },
83 |   process: {
84 |     versions: process.versions
85 |   }
86 | })


--------------------------------------------------------------------------------
/src/renderer/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="UTF-8" />
 5 |     <title>Electron</title>
 6 |     <!-- https://developer.mozilla.org/en-US/docs/Web/HTTP/CSP -->
 7 |     <meta
 8 |       http-equiv="Content-Security-Policy"
 9 |       content="default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data:"
10 |     />
11 |   </head>
12 | 
13 |   <body>
14 |     <div id="app"></div>
15 |     <script type="module" src="/src/main.ts"></script>
16 |   </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/src/renderer/src/App.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |   import { onMount } from 'svelte';
 3 |   import { Router, Route, Link } from "svelte-routing"; // also Link
 4 |   import SearchPage from './components/SearchPage.svelte'
 5 |   import FrontPage from './components/FrontPage.svelte'
 6 |   import ApiKeyPage from './components/ApiKeyPage.svelte'
 7 |   import HelpPage from './components/HelpPage.svelte'
 8 |   import ApiKeyStatus from './components/ApiKeyStatus.svelte'
 9 | //  import electronLogo from './assets/electron.svg'
10 |   let url = $state("");
11 |   let settings: Settings | null = $state(null); // stays null until getSettings() has resolved once
12 |   // Fetch settings through the preload bridge; a failure is logged and `settings` keeps its previous value.
13 |   const getSettings = async () => {
14 |       try {
15 |           settings = await window.api.getSettings();
16 |       } catch (error) {
17 |           console.error('Error fetching settings:', error);
18 |       }
19 |   };
20 |   // True when an OpenAI key is set, or when both Ollama fields are. Boolean() keeps this a real boolean (false, not null) while settings is still null, matching the `boolean` prop the child components declare.
21 |   let validApiKeysSet = $derived(Boolean(settings && (settings.openAIKey || (settings.oLlamaModelType && settings.oLlamaBaseURL))));
22 |   onMount(getSettings);
23 | 
24 | </script>
25 | 
26 | <!-- <img alt="logo" class="logo" src={electronLogo} /> -->
27 | 
28 | <Router url={url} >
29 |   <Link to="/">
30 |     <h1 class="text-2xl font-bold">
31 |       Meaningfully
32 |     </h1>
33 |   </Link>
34 | 
35 |   <h2 class="text-xl font-semibold">
36 |     Semantic search for your spreadsheets
37 |   </h2>
38 | 
39 |   {#if settings}
40 |     <ApiKeyStatus settings={settings} validApiKeysSet={validApiKeysSet} />
41 |   {/if}
42 | 
43 |   <main class="container mx-auto px-4 py-8">
44 |     <Route path="">
45 |       <FrontPage validApiKeysSet={validApiKeysSet} />
46 |     </Route>
47 |     <Route path="search/:id"><SearchPage validApiKeysSet={validApiKeysSet} /></Route>
48 |     <Route path="help" component={HelpPage} />
49 |     <Route path="settings">
50 |       {#if settings}
51 |         <ApiKeyPage settings={settings} settingsUpdated={() => getSettings() } />
52 |       {/if}
53 |     </Route>
54 |   </main>
55 | 
56 |   <nav class="navbar">
57 |     <Link to="" class="nav-link underline text-blue-600 hover:text-blue-800 visited:text-purple-600">Home</Link>
58 |     <Link to="help" class="nav-link underline text-blue-600 hover:text-blue-800 visited:text-purple-600">Help</Link>
59 |     <Link to="settings" class="nav-link underline text-blue-600 hover:text-blue-800 visited:text-purple-600">Settings / API Keys</Link>
60 |     <span class="nav-link">Built with ✨ by Jeremy</span>
61 |     <span class="nav-link">© 2025</span>
62 |   </nav>
63 | 
64 | </Router>
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/src/renderer/src/assets/base.css:
--------------------------------------------------------------------------------
 1 | :root {
 2 |   --ev-c-white: #ffffff;
 3 |   --ev-c-white-soft: #f8f8f8;
 4 |   --ev-c-white-mute: #f2f2f2;
 5 | 
 6 |   --ev-c-black: #1b1b1f;
 7 |   --ev-c-black-soft: #222222;
 8 |   --ev-c-black-mute: #282828;
 9 | 
10 |   --ev-c-gray-1: #515c67;
11 |   --ev-c-gray-2: #414853;
12 |   --ev-c-gray-3: #32363f;
13 | 
14 |   --ev-c-text-1: rgba(255, 255, 245, 0.86);
15 |   --ev-c-text-2: rgba(235, 235, 245, 0.6);
16 |   --ev-c-text-3: rgba(235, 235, 245, 0.38);
17 | 
18 |   --ev-button-alt-border: transparent;
19 |   --ev-button-alt-text: var(--ev-c-text-1);
20 |   --ev-button-alt-bg: var(--ev-c-gray-3);
21 |   --ev-button-alt-hover-border: transparent;
22 |   --ev-button-alt-hover-text: var(--ev-c-text-1);
23 |   --ev-button-alt-hover-bg: var(--ev-c-gray-2);
24 | }
25 | 
26 | :root {
27 |   --color-background: var(--ev-c-black);
28 |   --color-background-soft: var(--ev-c-black-soft);
29 |   --color-background-mute: var(--ev-c-black-mute);
30 | 
31 |   --color-text: var(--ev-c-text-1);
32 | }
33 | 
34 | body {
35 |   min-height: 100vh;
36 |   color: var(--color-text);
37 |   background: var(--color-background);
38 |   line-height: 1.6;
39 |   font-family:
40 |     Inter,
41 |     -apple-system,
42 |     BlinkMacSystemFont,
43 |     'Segoe UI',
44 |     Roboto,
45 |     Oxygen,
46 |     Ubuntu,
47 |     Cantarell,
48 |     'Fira Sans',
49 |     'Droid Sans',
50 |     'Helvetica Neue',
51 |     sans-serif;
52 |   text-rendering: optimizeLegibility;
53 |   -webkit-font-smoothing: antialiased;
54 |   -moz-osx-font-smoothing: grayscale;
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/src/renderer/src/assets/electron.svg:
--------------------------------------------------------------------------------
 1 | <svg viewBox="0 0 128 128" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 |   <circle cx="64" cy="64" r="64" fill="#2F3242"/>
 3 |   <ellipse cx="63.9835" cy="23.2036" rx="4.48794" ry="4.495" stroke="#A2ECFB" stroke-width="3.6" stroke-linecap="round"/>
 4 |   <path d="M51.3954 39.5028C52.3733 39.6812 53.3108 39.033 53.4892 38.055C53.6676 37.0771 53.0194 36.1396 52.0414 35.9612L51.3954 39.5028ZM28.6153 43.5751L30.1748 44.4741L30.1748 44.4741L28.6153 43.5751ZM28.9393 60.9358C29.4332 61.7985 30.5329 62.0976 31.3957 61.6037C32.2585 61.1098 32.5575 60.0101 32.0636 59.1473L28.9393 60.9358ZM37.6935 66.7457C37.025 66.01 35.8866 65.9554 35.1508 66.6239C34.415 67.2924 34.3605 68.4308 35.029 69.1666L37.6935 66.7457ZM53.7489 81.7014L52.8478 83.2597L53.7489 81.7014ZM96.9206 89.515C97.7416 88.9544 97.9526 87.8344 97.3919 87.0135C96.8313 86.1925 95.7113 85.9815 94.8904 86.5422L96.9206 89.515ZM52.0414 35.9612C46.4712 34.9451 41.2848 34.8966 36.9738 35.9376C32.6548 36.9806 29.0841 39.1576 27.0559 42.6762L30.1748 44.4741C31.5693 42.0549 34.1448 40.3243 37.8188 39.4371C41.5009 38.5479 46.1547 38.5468 51.3954 39.5028L52.0414 35.9612ZM27.0559 42.6762C24.043 47.9029 25.2781 54.5399 28.9393 60.9358L32.0636 59.1473C28.6579 53.1977 28.1088 48.0581 30.1748 44.4741L27.0559 42.6762ZM35.029 69.1666C39.6385 74.24 45.7158 79.1355 52.8478 83.2597L54.6499 80.1432C47.8081 76.1868 42.0298 71.5185 37.6935 66.7457L35.029 69.1666ZM52.8478 83.2597C61.344 88.1726 70.0465 91.2445 77.7351 92.3608C85.359 93.4677 92.2744 92.6881 96.9206 89.515L94.8904 86.5422C91.3255 88.9767 85.4902 89.849 78.2524 88.7982C71.0793 87.7567 62.809 84.8612 54.6499 80.1432L52.8478 83.2597ZM105.359 84.9077C105.359 81.4337 102.546 78.6127 99.071 78.6127V82.2127C100.553 82.2127 101.759 83.4166 101.759 84.9077H105.359ZM99.071 78.6127C95.5956 78.6127 92.7831 81.4337 92.7831 84.9077H96.3831C96.3831 83.4166 97.5892 82.2127 99.071 82.2127V78.6127ZM92.7831 84.9077C92.7831 88.3817 95.5956 91.2027 99.071 91.2027V87.6027C97.5892 87.6027 96.3831 86.3988 96.3831 84.9077H92.7831ZM99.071 91.2027C102.546 91.2027 105.359 88.3817 105.359 84.9077H101.759C101.759 86.3988 100.553 87.6027 99.071 87.6027V91.2027Z" fill="#A2ECFB"/>
 5 |   <path d="M91.4873 65.382C90.8456 66.1412 90.9409 67.2769 91.7002 67.9186C92.4594 68.5603 93.5951 68.465 94.2368 67.7058L91.4873 65.382ZM99.3169 43.6354L97.7574 44.5344L99.3169 43.6354ZM84.507 35.2412C83.513 35.2282 82.6967 36.0236 82.6838 37.0176C82.6708 38.0116 83.4661 38.8279 84.4602 38.8409L84.507 35.2412ZM74.9407 39.8801C75.9127 39.6716 76.5315 38.7145 76.323 37.7425C76.1144 36.7706 75.1573 36.1517 74.1854 36.3603L74.9407 39.8801ZM53.7836 46.3728L54.6847 47.931L53.7836 46.3728ZM25.5491 80.9047C25.6932 81.8883 26.6074 82.5688 27.5911 82.4247C28.5747 82.2806 29.2552 81.3664 29.1111 80.3828L25.5491 80.9047ZM94.2368 67.7058C97.8838 63.3907 100.505 58.927 101.752 54.678C103.001 50.4213 102.9 46.2472 100.876 42.7365L97.7574 44.5344C99.1494 46.9491 99.3603 50.0419 98.2974 53.6644C97.2323 57.2945 94.9184 61.3223 91.4873 65.382L94.2368 67.7058ZM100.876 42.7365C97.9119 37.5938 91.7082 35.335 84.507 35.2412L84.4602 38.8409C91.1328 38.9278 95.7262 41.0106 97.7574 44.5344L100.876 42.7365ZM74.1854 36.3603C67.4362 37.8086 60.0878 40.648 52.8826 44.8146L54.6847 47.931C61.5972 43.9338 68.5948 41.2419 74.9407 39.8801L74.1854 36.3603ZM52.8826 44.8146C44.1366 49.872 36.9669 56.0954 32.1491 62.3927C27.3774 68.63 24.7148 75.2115 25.5491 80.9047L29.1111 80.3828C28.4839 76.1026 30.4747 70.5062 35.0084 64.5802C39.496 58.7143 46.2839 52.7889 54.6847 47.931L52.8826 44.8146Z" fill="#A2ECFB"/>
 6 |   <path d="M49.0825 87.2295C48.7478 86.2934 47.7176 85.8059 46.7816 86.1406C45.8455 86.4753 45.358 87.5055 45.6927 88.4416L49.0825 87.2295ZM78.5635 96.4256C79.075 95.5732 78.7988 94.4675 77.9464 93.9559C77.0941 93.4443 75.9884 93.7205 75.4768 94.5729L78.5635 96.4256ZM79.5703 85.1795C79.2738 86.1284 79.8027 87.1379 80.7516 87.4344C81.7004 87.7308 82.71 87.2019 83.0064 86.2531L79.5703 85.1795ZM84.3832 64.0673H82.5832H84.3832ZM69.156 22.5301C68.2477 22.1261 67.1838 22.535 66.7799 23.4433C66.3759 24.3517 66.7848 25.4155 67.6931 25.8194L69.156 22.5301ZM45.6927 88.4416C47.5994 93.7741 50.1496 98.2905 53.2032 101.505C56.2623 104.724 59.9279 106.731 63.9835 106.731V103.131C61.1984 103.131 58.4165 101.765 55.8131 99.0249C53.2042 96.279 50.8768 92.2477 49.0825 87.2295L45.6927 88.4416ZM63.9835 106.731C69.8694 106.731 74.8921 102.542 78.5635 96.4256L75.4768 94.5729C72.0781 100.235 68.0122 103.131 63.9835 103.131V106.731ZM83.0064 86.2531C85.0269 79.7864 86.1832 72.1831 86.1832 64.0673H82.5832C82.5832 71.8536 81.4723 79.0919 79.5703 85.1795L83.0064 86.2531ZM86.1832 64.0673C86.1832 54.1144 84.4439 44.922 81.4961 37.6502C78.5748 30.4436 74.3436 24.8371 69.156 22.5301L67.6931 25.8194C71.6364 27.5731 75.3846 32.1564 78.1598 39.0026C80.9086 45.7836 82.5832 54.507 82.5832 64.0673H86.1832Z" fill="#A2ECFB"/>
 7 |   <path fill-rule="evenodd" clip-rule="evenodd" d="M103.559 84.9077C103.559 82.4252 101.55 80.4127 99.071 80.4127C96.5924 80.4127 94.5831 82.4252 94.5831 84.9077C94.5831 87.3902 96.5924 89.4027 99.071 89.4027C101.55 89.4027 103.559 87.3902 103.559 84.9077V84.9077Z" stroke="#A2ECFB" stroke-width="3.6" stroke-linecap="round"/>
 8 |   <path fill-rule="evenodd" clip-rule="evenodd" d="M28.8143 89.4027C31.2929 89.4027 33.3023 87.3902 33.3023 84.9077C33.3023 82.4252 31.2929 80.4127 28.8143 80.4127C26.3357 80.4127 24.3264 82.4252 24.3264 84.9077C24.3264 87.3902 26.3357 89.4027 28.8143 89.4027V89.4027V89.4027Z" stroke="#A2ECFB" stroke-width="3.6" stroke-linecap="round"/>
 9 |   <path fill-rule="evenodd" clip-rule="evenodd" d="M64.8501 68.0857C62.6341 68.5652 60.451 67.1547 59.9713 64.9353C59.4934 62.7159 60.9007 60.5293 63.1167 60.0489C65.3326 59.5693 67.5157 60.9798 67.9954 63.1992C68.4742 65.4186 67.066 67.6052 64.8501 68.0857Z" fill="#A2ECFB"/>
10 | </svg>
11 | 


--------------------------------------------------------------------------------
/src/renderer/src/assets/main.css:
--------------------------------------------------------------------------------
  1 | @import './base.css';
  2 | @import "tailwindcss";
  3 | 
  4 | /* body {
  5 |   display: flex;
  6 |   align-items: center;
  7 |   justify-content: center;
  8 |   background-image: url('./wavy-lines.svg');
  9 |   background-size: cover;
 10 |   user-select: none;
 11 | } */
 12 | 
 13 | code {
 14 |   font-weight: 600;
 15 |   padding: 3px 5px;
 16 |   border-radius: 2px;
 17 |   background-color: var(--color-background-mute);
 18 |   font-family:
 19 |     ui-monospace,
 20 |     SFMono-Regular,
 21 |     SF Mono,
 22 |     Menlo,
 23 |     Consolas,
 24 |     Liberation Mono,
 25 |     monospace;
 26 |   font-size: 85%;
 27 | }
 28 | 
 29 | #app {
 30 |   display: flex;
 31 |   align-items: center;
 32 |   justify-content: center;
 33 |   flex-direction: column;
 34 |   margin-bottom: 80px;
 35 | }
 36 | 
 37 | .logo {
 38 |   margin-bottom: 20px;
 39 |   -webkit-user-drag: none;
 40 |   height: 128px;
 41 |   width: 128px;
 42 |   will-change: filter;
 43 |   transition: filter 300ms;
 44 | }
 45 | 
 46 | .logo:hover {
 47 |   filter: drop-shadow(0 0 1.2em #6988e6aa);
 48 | }
 49 | 
 50 | .creator {
 51 |   font-size: 14px;
 52 |   line-height: 16px;
 53 |   color: var(--ev-c-text-2);
 54 |   font-weight: 600;
 55 |   margin-bottom: 10px;
 56 | }
 57 | 
 58 | .text {
 59 |   font-size: 28px;
 60 |   color: var(--ev-c-text-1);
 61 |   font-weight: 700;
 62 |   line-height: 32px;
 63 |   text-align: center;
 64 |   margin: 0 10px;
 65 |   padding: 16px 0;
 66 | }
 67 | h1 { font-size: var(--text-2xl);  } 
 68 | h2 { font-size: var(--text-xl);  }
 69 | 
 70 | .tip {
 71 |   font-size: 16px;
 72 |   line-height: 24px;
 73 |   color: var(--ev-c-text-2);
 74 |   font-weight: 600;
 75 | }
 76 | 
 77 | .svelte {
 78 |   background: -webkit-linear-gradient(315deg, #ff3e00 35%, #647eff);
 79 |   background-clip: text;
 80 |   -webkit-background-clip: text;
 81 |   -webkit-text-fill-color: transparent;
 82 |   font-weight: 700;
 83 | }
 84 | 
 85 | .ts {
 86 |   background: -webkit-linear-gradient(315deg, #3178c6 45%, #f0dc4e);
 87 |   background-clip: text;
 88 |   -webkit-background-clip: text;
 89 |   -webkit-text-fill-color: transparent;
 90 |   font-weight: 700;
 91 | }
 92 | 
 93 | .actions {
 94 |   display: flex;
 95 |   padding-top: 32px;
 96 |   margin: -6px;
 97 |   flex-wrap: wrap;
 98 |   justify-content: flex-start;
 99 | }
100 | 
101 | .action {
102 |   flex-shrink: 0;
103 |   padding: 6px;
104 | }
105 | 
106 | .action a {
107 |   cursor: pointer;
108 |   text-decoration: none;
109 |   display: inline-block;
110 |   border: 1px solid transparent;
111 |   text-align: center;
112 |   font-weight: 600;
113 |   white-space: nowrap;
114 |   border-radius: 20px;
115 |   padding: 0 20px;
116 |   line-height: 38px;
117 |   font-size: 14px;
118 |   border-color: var(--ev-button-alt-border);
119 |   color: var(--ev-button-alt-text);
120 |   background-color: var(--ev-button-alt-bg);
121 | }
122 | 
123 | .action a:hover {
124 |   border-color: var(--ev-button-alt-hover-border);
125 |   color: var(--ev-button-alt-hover-text);
126 |   background-color: var(--ev-button-alt-hover-bg);
127 | }
128 | 
129 | .versions {
130 |   position: absolute;
131 |   bottom: 30px;
132 |   margin: 0 auto;
133 |   padding: 15px 0;
134 |   font-family: 'Menlo', 'Lucida Console', monospace;
135 |   display: inline-flex;
136 |   overflow: hidden;
137 |   align-items: center;
138 |   border-radius: 22px;
139 |   background-color: #202127;
140 |   backdrop-filter: blur(24px);
141 | }
142 | 
143 | .versions li {
144 |   display: block;
145 |   float: left;
146 |   border-right: 1px solid var(--ev-c-gray-1);
147 |   padding: 0 20px;
148 |   font-size: 14px;
149 |   line-height: 14px;
150 |   opacity: 0.8;
151 |   &:last-child {
152 |     border: none;
153 |   }
154 | }
155 | 
156 | @media (max-width: 720px) {
157 |   .text {
158 |     font-size: 20px;
159 |   }
160 | }
161 | 
162 | @media (max-width: 620px) {
163 |   .versions {
164 |     display: none;
165 |   }
166 | }
167 | 
168 | @media (max-width: 350px) {
169 |   .tip,
170 |   .actions {
171 |     display: none;
172 |   }
173 | }
174 | 


--------------------------------------------------------------------------------
/src/renderer/src/components/ApiKeyPage.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |     import { navigate } from 'svelte-routing'
 3 | 
 4 |     interface Props {
 5 |         settings: Settings;
 6 |         settingsUpdated: () => void;
 7 |     }
 8 | 
 9 |     let { settings, settingsUpdated }: Props = $props();
10 |     let openAIKey: string = $state(settings.openAIKey);
11 |     let oLlamaModelType: string = $state(settings.oLlamaModelType);
12 |     let oLlamaBaseURL: string = $state(settings.oLlamaBaseURL);
13 | 
14 |     /**
15 |      * Save the current form values through the preload bridge (window.api.setSettings).
16 |      * The three fields are read when the handler runs, not when the component was created.
17 |      * On success the parent is notified via settingsUpdated() and the app navigates to "/".
18 |      * Any failure is logged to the console and reported to the user with an alert.
19 |      */
20 |     const saveSettings = async () => {
21 |         try {
22 |             const payload = { openAIKey, oLlamaModelType, oLlamaBaseURL };
23 |             const { success } = await window.api.setSettings(payload);
24 |             if (!success) throw new Error('Failed to save settings');
25 |             settingsUpdated();
26 |             navigate("/");
27 |         } catch (err) {
28 |             // Reached for a rejected IPC call as well as for the failure thrown above.
29 |             console.error(err);
30 |             alert('Error saving settings');
31 |         }
32 |     };
33 | </script>
34 | 
35 | <div>
36 |     <h1>Settings and API Keys</h1>
37 |     <p>Configure API keys for at least one provider.</p>
38 |     <div class="settings-section">
39 |         <h2>OpenAI</h2>
40 |         <p>OpenAI provides embeddings at a (generally very cheap) cost.</p>
41 |         <input type="text" data-testid="openai-api-key-input" placeholder="sk-proj-test-1234567890" bind:value={openAIKey} />
42 |     </div>
43 |     <div class="settings-section">
44 |         <h2>OLlama</h2>
45 |         <p>OLlama lets you run embedding models on your computer. This is free (except for electricity, wear-and-tear, etc.).</p>
46 |         <input type="text" placeholder="mxbai-embed-large" bind:value={oLlamaModelType} />
47 |         <input type="text" placeholder="http://localhost:11434" bind:value={oLlamaBaseURL} />
48 |     </div>
49 | 
50 |     <button data-testid="save" onclick={saveSettings}>Save</button>
51 | 
52 | </div>
53 | 
54 | <style>
55 |     .settings-section {
56 |         margin-bottom: 20px;
57 |     }
58 |     .settings-section h2 {
59 |         margin-bottom: 10px;
60 |     }
61 |     .settings-section input {
62 |         display: block;
63 |         margin-bottom: 10px;
64 |         padding: 8px;
65 |         width: 100%;
66 |         box-sizing: border-box;
67 |     }
68 |     button {
69 |         padding: 10px 20px;
70 |         background-color: #007BFF;
71 |         color: white;
72 |         border: none;
73 |         cursor: pointer;
74 |     }
75 |     button:hover {
76 |         background-color: #0056b3;
77 |     }
78 | </style>
79 | 


--------------------------------------------------------------------------------
/src/renderer/src/components/ApiKeyStatus.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |     import { Link } from 'svelte-routing'
 3 |     interface Props {
 4 |         settings: Settings; // declared so App.svelte can pass it, but this component never reads it
 5 |         validApiKeysSet: boolean; // when false, the warning banner in the template is rendered
 6 |     }
 7 | 
 8 |     let {  validApiKeysSet }: Props = $props(); // only validApiKeysSet is destructured; settings is ignored
 9 | </script>
10 | 
11 | {#if !validApiKeysSet }
12 |     <div class="alert alert-warning" data-testid="api-key-status">
13 |         <p>No OpenAI API key is set. Please <Link to="/settings" class="text-blue text-decoration-line"><span class="text-blue text-decoration-line">add one</span></Link> (or details for another provider) in order to use Meaningfully.</p>
14 |     </div>
15 | {/if}
16 | 
17 | 
18 | <style>
19 | .alert {
20 |     padding: 10px;
21 |     background-color: #f8d7da;
22 |     color: #721c24;
23 |     border: 1px solid #f5c6cb;
24 |     border-radius: 5px;
25 |     margin-top: 20px;
26 | }
27 | .alert-warning {
28 |     background-color: #fff3cd;
29 |     color: #856404;
30 |     border-color: #ffeeba;
31 | }
32 | .alert :global(a) , .alert-warning :global(a) { /* hack */
33 |     color: #0d6efd;
34 |     text-decoration: underline;
35 |     cursor: pointer;
36 |     font-weight: bold;
37 | }
38 | </style>


--------------------------------------------------------------------------------
/src/renderer/src/components/ExistingDatabases.svelte:
--------------------------------------------------------------------------------
  1 | <script lang="ts">
  2 |   import { onMount } from 'svelte';
  3 |   import type { DocumentSet } from '../main';
  4 |   import { Link } from 'svelte-routing';
  5 | 
  6 |   let documentSets: DocumentSet[] = $state([]);
  7 |   let loading = $state(true);
  8 |   let error: string | null = $state(null); // rendered instead of the table; cleared at the start of every load
  9 |   let hidden = $state(false);
 10 |   let currentPage = $state(1); // 1-based; only updated after a page has loaded successfully
 11 |   let totalPages = $state(1);
 12 |   let totalDocuments = $state(0);
 13 |   const pageSize = 10;
 14 | 
 15 |   /** Blank the panel; FrontPage calls this once a CSV file has been selected. */
 16 |   export function hide() { hidden = true; }
 17 |   /** Show the panel again; FrontPage calls this after an upload completes. */
 18 |   export function show() { hidden = false; }
 19 | 
 20 |   // Fetch one page of document sets and sync the pagination state to it.
 21 |   // If `page` lies past the last page (e.g. the only row of the last page was
 22 |   // just deleted), the last existing page is loaded instead of an empty one.
 23 |   export async function loadDocumentSets(page: number = 1) {
 24 |     loading = true;
 25 |     error = null; // otherwise one failed call would hide the table for good
 26 |     try {
 27 |       let target = page;
 28 |       let result = await window.api.listDocumentSets(target, pageSize);
 29 |       const lastPage = Math.max(1, Math.ceil(result.total / pageSize));
 30 |       if (target > lastPage) {
 31 |         target = lastPage;
 32 |         result = await window.api.listDocumentSets(target, pageSize);
 33 |       }
 34 |       documentSets = result.documents.map(set => ({ ...set, uploadDate: new Date(set.uploadDate) }));
 35 |       totalDocuments = result.total;
 36 |       totalPages = Math.max(1, Math.ceil(totalDocuments / pageSize));
 37 |       currentPage = target; // keeps the footer right when callers (e.g. FrontPage) reload page 1
 38 |     } catch (e) {
 39 |       error = e instanceof Error ? e.message : 'Failed to load document sets';
 40 |     } finally {
 41 |       loading = false;
 42 |     }
 43 |   }
 44 | 
 45 |   /** Ask for confirmation, delete the set, then reload the page being shown. */
 46 |   async function handleDelete(documentSetId: number, name: string, e: Event) {
 47 |     e.preventDefault();
 48 |     if (!confirm(`Are you sure you want to delete "${name}"? This cannot be undone.`)) return;
 49 |     try {
 50 |       await window.api.deleteDocumentSet(documentSetId);
 51 |       await loadDocumentSets(currentPage);
 52 |     } catch (err) {
 53 |       error = err instanceof Error ? err.message : 'Failed to delete document set';
 54 |     }
 55 |   }
 56 | 
 57 |   // Paging handlers request the neighbouring page; currentPage only moves once that load succeeds.
 58 |   function nextPage(e: Event) {
 59 |     e.preventDefault();
 60 |     if (currentPage < totalPages) loadDocumentSets(currentPage + 1);
 61 |   }
 62 | 
 63 |   function previousPage(e: Event) {
 64 |     e.preventDefault();
 65 |     if (currentPage > 1) loadDocumentSets(currentPage - 1);
 66 |   }
 67 | 
 68 |   onMount(() => loadDocumentSets(1));
 69 | </script>
 70 | 
 71 | {#if hidden}
 72 |   <div class="my-10 flex justify-center p-8">
 73 |   </div>
 74 | {:else if loading}
 75 |   <div class="my-10 flex justify-center p-8">
 76 |     <div class="animate-spin h-8 w-8 border-4 border-blue-500 rounded-full border-t-transparent"></div>
 77 |   </div>
 78 | {:else if error}
 79 |   <div class="my-10 p-4 bg-red-100 text-red-700 rounded-md">
 80 |     {error}
 81 |   </div>
 82 | {:else}
 83 |   <div class="my-10 bg-white p-6 rounded-lg shadow space-y-6 text-black" data-testid="existing-spreadsheets">
 84 |     <h2 class="text-2xl font-bold">Existing Spreadsheets</h2>
 85 |     {#if documentSets.length === 0}
 86 |       <p class="text-gray-500">No spreadsheets found. Upload one to get started.</p>
 87 |     {:else}
 88 |       <div class="overflow-x-auto">
 89 |         <table class="min-w-full table-auto">
 90 |           <thead>
 91 |             <tr class="">
 92 |               <th class="px-4 py-2 text-left">Name</th>
 93 |               <th class="px-4 py-2 text-left">Upload Date</th>
 94 |               <th class="px-4 py-2 text-left">Documents</th>
 95 |               <th class="px-4 py-2 text-left">Parameters</th>
 96 |               <th class="px-4 py-2 text-left"><span class="sr-only">Actions</span></th>
 97 |             </tr>
 98 |           </thead>
 99 |           <tbody>
100 |             {#each documentSets as set}
101 |               <tr 
102 |                 class="border-t hover:bg-gray-50 transition-colors" 
103 |                 data-testid="existing-spreadsheet-row"
104 |               >
105 |                 <td class="px-4 py-2 font-medium">
106 |                   <Link 
107 |                     to={`/search/${set.documentSetId}`} 
108 |                     class="underline text-blue-600 hover:text-blue-800 visited:text-purple-600"
109 |                   >
110 |                     {set.name}
111 |                   </Link>
112 |                 </td>
113 |                 <td class="px-4 py-2 text-gray-600">{set.uploadDate.toLocaleString()}</td>
114 |                 <td class="px-4 py-2 text-gray-600">{set.totalDocuments}</td>
115 |                 <td class="px-4 py-2">
116 |                   {#if Object.keys(set.parameters).length > 0}
117 |                     <details>
118 |                       <summary class="cursor-pointer text-sm text-blue-600">View Parameters</summary>
119 |                       <pre class="mt-2 p-2 bg-gray-50 rounded text-sm">{JSON.stringify(set.parameters, null, 2)}</pre>
120 |                     </details>
121 |                   {:else}
122 |                     <span class="text-gray-400">None</span>
123 |                   {/if}
124 |                 </td>
125 |                 <td class="px-4 py-2">
126 |                   <button
127 |                     type="button"
128 |                     class="text-gray-500 hover:text-red-600 transition-colors"
129 |                     aria-label="Delete {set.name}"
130 |                     title="Delete {set.name}"
131 |                     onclick={(e) => handleDelete(set.documentSetId, set.name, e)}
132 |                   >
133 |                     <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5" viewBox="0 0 20 20" fill="currentColor">
134 |                       <path fill-rule="evenodd" d="M4.293 4.293a1 1 0 011.414 0L10 8.586l4.293-4.293a1 1 0 111.414 1.414L11.414 10l4.293 4.293a1 1 0 01-1.414 1.414L10 11.414l-4.293 4.293a1 1 0 01-1.414-1.414L8.586 10 4.293 5.707a1 1 0 010-1.414z" clip-rule="evenodd" />
135 |                     </svg>
136 |                   </button>
137 |                 </td>
138 |               </tr>
139 |             {/each}
140 |           </tbody>
141 |         </table>
142 |         
143 |         <!-- Add pagination controls -->
144 |         <div class="mt-4 flex items-center justify-between px-4">
145 |           <div class="text-sm text-gray-700">
146 |             Showing page {currentPage} of {totalPages}
147 |           </div>
148 |           <div class="flex gap-2">
149 |             <button
150 |               class="px-4 py-2 border rounded-md disabled:opacity-50 disabled:cursor-not-allowed"
151 |               onclick={previousPage}
152 |               disabled={currentPage === 1}
153 |             >
154 |               Previous
155 |             </button>
156 |             <button
157 |               class="px-4 py-2 border rounded-md disabled:opacity-50 disabled:cursor-not-allowed"
158 |               onclick={nextPage}
159 |               disabled={currentPage === totalPages}
160 |             >
161 |               Next
162 |             </button>
163 |           </div>
164 |         </div>
165 |       </div>
166 |     {/if}
167 |   </div>
168 | {/if}
169 | 


--------------------------------------------------------------------------------
/src/renderer/src/components/FrontPage.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |   import ExistingDatabases from './ExistingDatabases.svelte'
 3 |   import CsvUpload from './CsvUpload.svelte'
 4 | 
 5 |   interface Props {
 6 |     validApiKeysSet: boolean;
 7 |   }
 8 | 
 9 |   let { validApiKeysSet }: Props = $props();
10 |   // Filled in by bind:this below, so it is undefined until ExistingDatabases has mounted.
11 |   let databasesComponent: ExistingDatabases | undefined = $state();
12 | </script>
13 | 
14 | <div class="container mx-auto px-4 space-y-8">
15 |     <CsvUpload 
16 |         validApiKeysSet={validApiKeysSet}
17 |         fileSelected={() => {databasesComponent?.hide()}}
18 |         uploadComplete={() => {
19 |           databasesComponent?.loadDocumentSets(); 
20 |           databasesComponent?.show();
21 |         }}/>
22 |     <ExistingDatabases bind:this={databasesComponent}/>
23 | </div>
24 |   


--------------------------------------------------------------------------------
/src/renderer/src/components/HelpPage.svelte:
--------------------------------------------------------------------------------
  1 | <script lang="ts">
  2 | </script>
  3 | 
  4 | 
  5 | <div class="container mx-auto px-4 space-y-8">
  6 |     <h1>Help</h1>
  7 |     <section>
  8 |         <h2>What is Semantic Search?</h2>
  9 |         <p>
 10 |             Keyword search has been the only kind of search for decades. Sometimes it fails:
 11 |         </p>
 12 | 
 13 |             <ul>
 14 |                 <li>Ambiguity and synonyms (people getting fired and things catching on fire, cars and automobiles)</li>
 15 |                 <li>Circumlocutions and legalese</li>
 16 |                 <li>Documents written by laypeople describing complex situations in widely-varied language</li>
 17 |                 <li>Typos</li>
 18 |                 <li>Multilingual documents</li>
 19 |             </ul>
 20 |         <p>
 21 |             Semantic search works better in these situations, by finding results that <i>mean</i> something
 22 |             similar to the query. Even if the words are completely different, semantic search can still
 23 |             surface the results you need.
 24 |         </p>
 25 |     </section>
 26 |     <section>
 27 |         <h2>Uploading a CSV</h2>
 28 |         <p>
 29 |             You can select a CSV from your computer to "upload" it to Meaningfully. Then, select one column
 30 |             from the CSV to search semantically, and any number of other columns to be shown alongside it in results.
 31 |         </p>
 32 |         <p>
 33 |             Once you upload the CSV, each entry in the chosen column will be embedded, with the results stored on your
 34 |             computer. If you choose a remote embedding API -- like OpenAI's text-embedding-3-small or text-embedding-3-large --
 35 |             then the entries in your column will be sent
 36 |             to that service; if you choose a local one, then the data will not leave your computer.
 37 |         </p>
 38 |         <p>
 39 |             Meaningfully provides additional options:
 40 |         </p>
 41 |         <ul>
 42 |             <li>Split long text into sentences. When a single row contains many ideas, splitting it by sentence helps the search surface 
 43 |                 results that match a single idea.
 44 |             </li>
 45 |             <li>Combine short sentences into chunks. You can adjust how large the chunks are and how much overlap exists between chunks.</li>
 46 |         </ul>
 47 |         <p>
 48 |             Embedding can take a while, especially for large CSVs with 10,000 or more rows. Once it finishes, you'll
 49 |             be able to search your CSV.
 50 |         </p>
 51 |     </section>
 52 |     <section>
 53 |         <h2>How should I write my search query?</h2>
 54 |         <p>
 55 |             <strong>Do: </strong> Imagine the perfect version of what you're looking for, that you wish existed in your spreadsheet. <span class="block whitespace-pre overflow-x-scroll">My car caught on fire as I was driving on the highway.</span>
 56 |         </p>
 57 |         <p>
 58 |             <strong>Don't: </strong> Just write keywords. <span class="block whitespace-pre overflow-x-scroll">crash OR fire OR aflame</span>
 59 |         </p>
 60 |         <p>
 61 |             <strong>Don't: </strong> Ask a question, like you would to a chatbot. <span class="block whitespace-pre overflow-x-scroll">Please find me examples about cars catching on fire.</span>
 62 |         </p>
 63 | 
 64 |     </section>
 65 |     <section>
 66 |         <h2>How much does it cost?</h2>
 67 |         <p>
 68 |             Generally less than a dollar per document set, but you're paying OpenAI, not me, and you're responsible for all costs, no matter what.
 69 |         </p>
 70 |         <p>
 71 |             Eventually, some Meaningfully features may require payment.
 72 |         </p>
 73 |     </section>
 74 |     <section>
 75 |         <h2>How can I support development of Meaningfully?</h2>
 76 |         <p>
 77 |             You can <a href="https://buymeacoffee.com/jeremybmerrill">Buy Me A Coffee</a>.
 78 |         </p>
 79 | 
 80 |     </section>
 81 | </div>
 82 |   
 83 | <style>
 84 |     .container { /* centered column, at most 800px wide */
 85 |         max-width: 800px;
 86 |         margin: 0 auto;
 87 |         padding: 20px;
 88 |     }
 89 | 
 90 |     h1 {
 91 |         font-size: 2rem;
 92 |         margin-bottom: 1rem;
 93 |     }
 94 | 
 95 |     h2 { /* section headings: larger top margin than bottom margin */
 96 |         font-size: 1.5rem;
 97 |         margin-top: 1.5rem;
 98 |         margin-bottom: 0.5rem;
 99 |     }
100 | 
101 |     p {
102 |         margin-bottom: 0.5rem;
103 |         margin-top: 0.5rem;
104 |     }
105 | 
106 |     /* Disabled ordered-list rules, kept for reference: ol {
107 |         margin-left: 20px;
108 |     }
109 |     ol li {
110 |         list-style-type: decimal;
111 |         list-style-position: inside;
112 |         padding-left: 0.5rem;
113 |     } */
114 |     ul li { /* disc bullets, positioned inside the list item's box */
115 |         list-style-type: disc;
116 |         list-style-position: inside;
117 |         padding-left: 0.5rem;
118 |     }
119 | </style>
120 | 


--------------------------------------------------------------------------------
/src/renderer/src/components/Preview.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |   import Table from './Table.svelte';
 3 | 
 4 |   // Inputs for the preview panel.
 5 |   // Only textColumn is required; the rest fall back to the defaults below.
 6 |   interface Props {
 7 |     previewData?: Array<Record<string, any>>;
 8 |     textColumn: string;
 9 |     metadataColumns?: string[];
10 |     loading?: boolean;
11 |   }
12 | 
13 |   // Defaults: no rows, no metadata columns, not loading.
14 |   let { previewData = [], textColumn, metadataColumns = [], loading = false }: Props = $props();
15 | </script>
16 | 
17 | <!-- While loading, show a spinner; otherwise render the preview rows in a Table
18 |      with the similarity column switched off. -->
19 | <div data-testid="preview">
20 |   {#if loading}
21 |     <div class="flex justify-center items-center h-full">
22 |       <div class="animate-spin rounded-full h-12 w-12 border-t-2 border-b-2 border-violet-500"></div>
23 |     </div>
24 |   {:else}
25 |     <div class="space-y-4">
26 |       <h2 class="text-xl font-semibold">Preview</h2>
27 |       <div class="bg-white rounded-lg shadow text-black">
28 |         <Table
29 |           data={previewData}
30 |           textColumn={textColumn}
31 |           metadataColumns={metadataColumns}
32 |           showSimilarity={false}
33 |         />
34 |       </div>
35 |     </div>
36 |   {/if}
37 | </div>


--------------------------------------------------------------------------------
/src/renderer/src/components/Results.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |   import Table from './Table.svelte';
 3 |   
 4 |   interface Props {
 5 |     results?: Array<Record<string, any>>;
 6 |     textColumn: string;
 7 |     metadataColumns?: string[];
 8 |     loading?: boolean;
 9 |     originalDocumentClick?: (sourceNodeId: string) => void;
10 |   }
11 | 
12 |   let {
13 |     results = [],
14 |     textColumn,
15 |     metadataColumns = [],
16 |     loading = false,
17 |     originalDocumentClick = () => {},
18 |   }: Props = $props();
19 | 
20 |   // Page size: how many rows are shown initially and how many each "Show More" click adds.
21 |   const initialDisplayCount = 10;
22 |   let displayCount = $state(initialDisplayCount);
23 |   // A new result set (e.g. a fresh search) starts collapsed again instead of inheriting
24 |   // the expanded count left over from the previous results. Reading `results` registers the dependency.
25 |   $effect(() => { results; displayCount = initialDisplayCount; });
26 |   // Reveal one more page of results.
27 |   const showMore = () => { displayCount += initialDisplayCount; };
28 | 
29 |   // Rows actually rendered: the first `displayCount` results.
30 |   let visibleResults = $derived(results.slice(0, displayCount));
31 | </script>
32 | 
33 | <div class="space-y-4">
34 |   <h2 class="text-xl font-semibold">Search Results</h2>
35 |   
36 |   {#if loading}
37 |   <div class="flex justify-center items-center h-full">
38 |     <div class="animate-spin rounded-full h-12 w-12 border-t-2 border-b-2 border-violet-500"></div>
39 |   </div>
40 |   {:else if results.length === 0}
41 |     <div class="bg-white rounded-lg shadow text-black">
42 |       <p>No results found. Is it possible there is no data in the dataset?</p>
43 |     </div>
44 |   {:else}
45 |     <div class="bg-white rounded-lg shadow text-black">
46 |       <Table
47 |         data={visibleResults}
48 |         {textColumn}
49 |         {metadataColumns}
50 |         showSimilarity={true}
51 |         showShowOriginal={true}
52 |         originalDocumentClick={originalDocumentClick}
53 |       />
54 |     </div>
55 |     
56 |     {#if displayCount < results.length}
57 |       <div class="flex justify-center mt-4">
58 |         <button
59 |           onclick={showMore}
60 |           class="px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600 transition-colors"
61 |         >
62 |           Show More
63 |         </button>
64 |       </div>
65 |     {/if}
66 |   {/if}
67 | </div>


--------------------------------------------------------------------------------
/src/renderer/src/components/SearchPage.svelte:
--------------------------------------------------------------------------------
  1 | <script lang="ts">
  2 |   import { navigate } from 'svelte-routing';
  3 |   import type { DocumentSet } from '../main';
  4 |   import Results from './Results.svelte';
  5 | 
  6 |   interface Props {
  7 |     validApiKeysSet: boolean;
  8 |   }
  9 | 
 10 |   let { validApiKeysSet }: Props = $props();
 11 | 
 12 |   let documentSetId = parseInt(window.location.href.split("?")[0].split('/').pop());
 13 |   let documentSet: DocumentSet | null = $state(null);
 14 |   let documentSetLoading = $state(true);
 15 |   let metadataColumns: string[] = $state([]);
 16 |   let textColumn: string = $state('');
 17 |   let loading = $state(false);
 18 |   let hasResults = $state(false);
 19 |   let showModal = $state(false);
 20 |   let modalContent: Record<string, any> | null = $state(null);
 21 | 
 22 |   const blankSearchQuery = '';
 23 |   let searchQuery = $state(blankSearchQuery);
 24 |   let metadataFilters: Array<{ key: string, operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty", value: any }> = $state([]);
 25 | 
 26 |   let results: Array<Record<string, any>> = $state([]);
 27 |   let error: string | null = $state(null);
 28 | 
 29 |   window.api.getDocumentSet(documentSetId).then(receivedDocumentSet => { // load the document set whose id was parsed from the URL
 30 |     documentSet = receivedDocumentSet;
 31 |     metadataColumns = (documentSet.parameters.metadataColumns ?? []) as string[];
 32 |     textColumn = documentSet.parameters.textColumns[0] as string; // only the first text column is used on this page
 33 |     documentSetLoading = false;
 34 |   }).catch(error => { // also runs if the handler above throws
 35 |     console.error('Error fetching document set:', error);
 36 |     navigate('/'); // fall back to the root route
 37 |   });
 38 | 
 39 |   const placeholderQueries = [
 40 |     "The CEO got fired",
 41 |     "My car caught on fire as I was driving on the highway",
 42 |     "I surprised my closest friends by starting a business selling handmade candles",
 43 |     "Our company's stock price could plummet if we don't address the recent scandal involving our CEO",
 44 |     "Don't tell anyone that I was the one who leaked the confidential information about our competitor's new product launch",
 45 |     "I can't believe I got fired for accidentally sending a company-wide email with a meme instead of the quarterly report",
 46 |   ];
 47 |   const placeholderQuery = placeholderQueries[Math.floor(Math.random() * placeholderQueries.length)];
 48 | 
 49 |   async function handleSearch() { // runs the search via window.api and fills `results`; sets `error` on failure
 50 |     if (!searchQuery.trim()) return; // blank query: nothing to search for
 51 |     hasResults = true; // reveals the Results panel, which shows a spinner while `loading` is true
 52 |     loading = true;
 53 |     try {
 54 |       const searchResults = await window.api.searchDocumentSet({
 55 |         documentSetId: documentSet.documentSetId,
 56 |         query: searchQuery,
 57 |         n_results: 100,
 58 |         filters: metadataFilters.map(filter => ({
 59 |           key: filter.key,
 60 |           operator: filter.operator,
 61 |           value: filter.value
 62 |         }))
 63 |       });
 64 |       results = searchResults.map(result => ({ // TODO Factor this out if preview and search use the same data structure.
 65 |         ...result.metadata, // flatten the metadata so that this object is the same shape as a CSV row.
 66 |         similarity: result.score, // keep the raw number: Table rounds when it formats the percentage, so pre-rounding here lost the tenths digit
 67 |         [textColumn]: result.text,
 68 |         sourceNodeId: result.sourceNodeId
 69 |       }));
 70 |       error = null;
 71 |     } catch (error_) {
 72 |       console.error('Search failed:', error_);
 73 |       error = String(error_); // `error` is string | null: store the text that the banner renders, not the thrown object
 74 |     } finally {
 75 |       loading = false;
 76 |     }
 77 |   }
 78 | 
 79 |   function addFilter() { // appends an empty "==" filter row for the user to fill in
 80 |     metadataFilters = metadataFilters.concat([{ key: '', operator: '==', value: '' }]);
 81 |   }
 82 | 
 83 |   function removeFilter(index: number) { // drops the filter row at position `index`
 84 |     metadataFilters = metadataFilters.filter((_filter, position) => position !== index);
 85 |   }
 86 | 
 87 |   async function handleOriginalDocumentClick(documentId: string) { // fetches one source document and shows it in the modal
 88 |     try {
 89 |       modalContent = await window.api.getDocument({ documentSetId, documentId });
 90 |       showModal = true;
 91 |     } catch (error_) { // named error_ so it does not shadow the component-level `error` state
 92 |       console.error('Error fetching document:', error_);
 93 |       error = String(error_); // show the failure in the page's error banner rather than only logging it
 94 |     }
 95 |   }
 96 | 
 97 |   function closeModal() { // hides the document modal and discards its content
 98 |     modalContent = null;
 99 |     showModal = false;
100 |   }
101 | </script>
102 | 
103 | <div class="p-6 space-y-6">
104 |   <div class="flex items-center space-x-4">
105 |     <button 
106 |       class="text-blue-500 hover:text-blue-600 flex items-center space-x-1"
107 |       onclick={() => history.back()}
108 |     >
109 |       <svg class="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
110 |         <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M15 19l-7-7 7-7" />
111 |       </svg>
112 |       <span>Back to Document Sets</span>
113 |     </button>
114 |   </div>
115 | 
116 |   {#if documentSetLoading}
117 |     <p>Loading document set...</p>
118 |   {:else if !documentSet}
119 |     <p>Document set not found. {documentSetId}</p>
120 |   {:else}
121 |     <div class="space-y-2">
122 |       <h1 class="text-2xl font-bold" data-testid="document-set-name">{documentSet.name}</h1>
123 |       <p class="text-gray-600">
124 |         {documentSet.totalDocuments} documents • Uploaded {documentSet.uploadDate.toLocaleDateString()}
125 |       </p>
126 |     </div>
127 | 
128 |     <div class="space-y-4 max-w-3xl">
129 |       <!-- Search Input -->
130 |       <div class="space-y-2">
131 |         <label for="search" class="block text-sm font-medium text-gray-700">
132 |           Semantic Search
133 |         </label>
134 |         <p class="text-xs text-gray-500">
135 |           Imagine the perfect document that you hope might exist in your spreadsheet. Type it here. Meaningfully will find the real documents that mean 
136 |           about the same thing -- even if they have no keywords in common.
137 |         </p>
138 |         <div class="flex space-x-4">
139 |           <input
140 |             id="search"
141 |             type="text"
142 |             bind:value={searchQuery}
143 |             placeholder={placeholderQuery}
144 |             data-testid="search-bar"
145 |             class="flex-1 px-4 py-2 border border-gray-300 rounded-md shadow-sm focus:ring-blue-500 focus:border-blue-500 placeholder-gray-400"
146 |           />
147 |           <button
148 |             onclick={handleSearch}
149 |             disabled={loading || !validApiKeysSet}
150 |             data-testid="search-button"
151 |             class="px-4 py-2 bg-blue-500 text-white rounded-md hover:bg-blue-600 disabled:opacity-50 disabled:cursor-not-allowed"
152 |           >
153 |             {loading ? 'Searching...' : 'Search'}
154 |           </button>
155 |         </div>
156 |       </div>
157 | 
158 |       <!-- Metadata Filters -->
159 |       {#if metadataColumns.length > 0}  
160 |       <div class="space-y-2">
161 |         <p class="block text-sm font-medium text-gray-700">
162 |           Search only records that match...
163 |         </p>
164 |         <div class="space-y-4">
165 |           {#each metadataFilters as filter, index}
166 |             <div class="flex space-x-2 items-center">
167 |               <select bind:value={filter.key} class="px-2 py-1 border border-gray-300 rounded-md">
168 |                 <option value="" disabled>Select column</option>
169 |                 {#each metadataColumns as column}
170 |                   <option value={column}>{column}</option>
171 |                 {/each}
172 |               </select>
173 |               <select bind:value={filter.operator} class="px-2 py-1 border border-gray-300 rounded-md">
174 |                 <option value="==">==</option>
175 |                 <option value="in">in</option>
176 |                 <option value=">">&gt;</option>
177 |                 <option value="<">&lt;</option>
178 |                 <option value="!=">!=</option>
179 |                 <option value=">=">&gt;=</option>
180 |                 <option value="<=">&lt;=</option>
181 |                 <option value="nin">not in</option>
182 |                 <option value="any">any</option>
183 |                 <option value="all">all</option>
184 |                 <option value="text_match">text matches</option>
185 |                 <option value="contains">contains</option>
186 |                 <option value="is_empty">is empty</option>
187 |               </select>
188 |               <input
189 |                 type="text"
190 |                 bind:value={filter.value}
191 |                 placeholder="Value"
192 |                 class="flex-1 px-2 py-1 border border-gray-300 rounded-md"
193 |               />
194 |               <button onclick={() => removeFilter(index)} class="text-red-500 hover:text-red-600">
195 |                 Remove
196 |               </button>
197 |             </div>
198 |           {/each}
199 |           <button onclick={addFilter} class="text-blue-500 hover:text-blue-600">
200 |             Add Filter
201 |           </button>
202 |         </div>
203 |       </div>
204 |       {/if}
205 |     </div>
206 | 
207 |     <!-- Results -->
208 |     {#if error}
209 |       <div class="my-10 p-4 bg-red-100 text-red-700 rounded-md">
210 |         {error}
211 |       </div>
212 |     {/if}
213 |     {#if (searchQuery != blankSearchQuery || metadataFilters.length > 0) && hasResults}
214 |       <!-- Wrap Results component for easier selection -->
215 |       <div data-testid="results">
216 |         <Results
217 |           {results}
218 |           {loading}
219 |           {textColumn}
220 |           {metadataColumns}
221 |           originalDocumentClick={handleOriginalDocumentClick}
222 |           />
223 |       </div>
224 |     {/if}
225 |   {/if}
226 | </div>
227 | 
228 | <!-- modal for showing a whole document -->
229 | {#if showModal && modalContent}
230 |   <div data-testid="details" class="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
231 |     <div class="bg-white text-black p-6 rounded-lg shadow-lg max-w-xl w-full max-h-screen overflow-y-auto">
232 |       <h2 class="text-xl font-semibold mb-4">Original Document</h2>
233 |       <table>
234 |         <thead>
235 |           <tr>
236 |           </tr>
237 |         </thead>
238 |         <tbody>
239 |           <tr>
240 |             <td class="px-4 py-2 text-left border-b text-black">Original text</td>
241 |             <td class="px-4 py-2 border-b text-black">{modalContent.text}</td>
242 |           </tr>
243 |           {#each metadataColumns as key}
244 |             <tr>
245 |               <td class="px-4 py-2 text-left border-b text-black">{key}</td>
246 |               <td class="px-4 py-2 border-b text-black">{modalContent.metadata[key]}</td>
247 |             </tr>
248 |           {/each}
249 |         </tbody>
250 |       </table>
251 |       <button class="mt-4 px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600" onclick={closeModal}>Close</button>
252 |     </div>
253 |   </div>
254 | {/if}


--------------------------------------------------------------------------------
/src/renderer/src/components/Table.svelte:
--------------------------------------------------------------------------------
 1 | <script lang="ts">
 2 |   interface Props {
 3 |     data?: Array<Record<string, any>>;
 4 |     textColumn: string;
 5 |     metadataColumns?: string[];
 6 |     showSimilarity?: boolean;
 7 |     showShowOriginal?: boolean;
 8 |     originalDocumentClick?: (sourceNodeId: string) => void;
 9 |   }
10 | 
11 |   let {
12 |     data = [],
13 |     textColumn,
14 |     metadataColumns = [],
15 |     showSimilarity = false,
16 |     showShowOriginal = false,
17 |     originalDocumentClick = () => {},
18 |   }: Props = $props();
19 | 
20 |   // Combine all columns in display order: metadata, similarity
21 |   // text column is always called text internally, but we rename just the header.
22 |   let columns = $derived([textColumn, ...metadataColumns, ...(showSimilarity ? ['similarity'] : [])]);
23 | 
24 |   function sanitizeAndFormatText(text: string): string { // returns HTML-escaped markup that is safe to pass to {@html}
25 |     // Coerce first (cell values may not be strings), then drop one leading/trailing literal "\n" sequence.
26 |     text = String(text ?? '').trim().replace(/^\\n/, '').replace(/\\n$/, '');
27 |     const escaped = text.replace(/[&<>"']/g, char => ({ // escape before inserting our own <br> tags
28 |       '&': '&amp;',
29 |       '<': '&lt;',
30 |       '>': '&gt;',
31 |       '"': '&quot;',
32 |       "'": '&#39;'
33 |     }[char]));
34 |     
35 |     // Convert both literal "\n" sequences and real line breaks to <br> tags
36 |     return escaped.replace(/\\n|\r\n|\r|\n/g, '<br>');
37 |   }
38 | </script>
39 | 
40 | <div class="w-full overflow-x-auto"> <!-- lets wide tables scroll horizontally -->
41 |   <table class="min-w-full table-auto border-collapse">
42 |     <thead>
43 |       <tr class="bg-gray-100">
44 |         {#each columns as column}
45 |           <th class="px-4 py-2 text-left border-b">{column}</th>
46 |         {/each}
47 |         {#if showShowOriginal}
48 |           <th class="px-4 py-2 text-left border-b"></th><!-- blank column for show all button-->
49 |         {/if}
50 |       </tr>
51 |     </thead>
52 |     <tbody>
53 |       {#each data as row}
54 |         <tr class="border-b hover:bg-gray-50">
55 |           {#each columns as column}
56 |             <td class="px-4 py-2">
57 |               {#if column === 'similarity' && row[column] !== undefined}
58 |                 {(row[column] * 100).toFixed(1)}%
59 |               {:else if column === textColumn}
60 |                 {@html sanitizeAndFormatText(String(row[column] ?? ''))}<!-- String(): the sanitizer calls .trim(), so non-string cells must be coerced -->
61 |               {:else}
62 |                 {row[column] ?? ''}<!-- ?? rather than ||, so legitimate 0 / false metadata values are still displayed -->
63 |               {/if}
64 |             </td>
65 |           {/each}
66 |           {#if showShowOriginal}
67 |             <td class="px-4 py-2">
68 |               <button data-testid="result-modal-button" class="text-blue-500 hover:text-blue-700" onclick={() => originalDocumentClick(row.sourceNodeId)}>Original Document</button>
69 |             </td>
70 |           {/if}
71 |         </tr>
72 |       {/each}
73 |     </tbody>
74 |   </table>
75 | </div>


--------------------------------------------------------------------------------
/src/renderer/src/env.d.ts:
--------------------------------------------------------------------------------
 1 | /// <reference types="svelte" />
 2 | /// <reference types="vite/client" />
 3 | 
 4 | 
 5 | interface Settings {
 6 |   openAIKey: string;
 7 |   oLlamaModelType: string;
 8 |   oLlamaBaseURL: string;
 9 | }
10 | interface SearchResult { // NOTE(review): SearchPage.svelte reads `text`, `score` and `metadata` off each result, which only type-check via the index signature below — confirm these declared fields match what searchDocumentSet actually returns
11 |   content: string;
12 |   similarity: number;
13 |   [key: string]: any; // For metadata fields
14 |   sourceNodeId: string | undefined; // passed to the "Original Document" click handler by the results table
15 | } 
16 | 
17 | interface Window {
18 |     electron: {
19 |       ipcRenderer: {
20 |         send: (channel: string, ...args: any[]) => void
21 |       }
22 |       process: {
23 |         versions: Record<string, string>
24 |       }
25 |     }
26 |     api: {
27 |       listDocumentSets: (page: number, pageSize: number) => Promise<{documents: DocumentSetMetadata[], total: number}> ,
28 |       uploadCsv: (formData: {
29 |         file: File,
30 |         datasetName: string,
31 |         description: string,
32 |         textColumns: string[],
33 |         metadataColumns: string[],
34 |         splitIntoSentences: boolean,
35 |         combineSentencesIntoChunks: boolean,
36 |         sploderMaxSize: number,
37 |         chunkSize: number,
38 |         chunkOverlap: number,
39 |         modelName: string,
40 |         modelProvider: string
41 |       }) => Promise<{ success: true, documentSetId: number }>,
42 |       generatePreviewData: (formData: {
43 |         file: File,
44 |         datasetName: string, // not really needed
45 |         description: string, // not really needed
46 |         textColumns: string[],
47 |         metadataColumns: string[],
48 |         splitIntoSentences: boolean,
49 |         combineSentencesIntoChunks: boolean,
50 |         sploderMaxSize: number,
51 |         chunkSize: number,
52 |         chunkOverlap: number,
53 |         modelName: string,
54 |         modelProvider: string
55 |       }) => Promise<{ success: boolean, nodes: Record<string, any>[], estimatedPrice: number, tokenCount: number, pricePer1M: number }>,
56 |       searchDocumentSet: (params: {
57 |         documentSetId: number;
58 |         query: string;
59 |         n_results: number;
60 |         filters?: { 
61 |           key: string, 
62 |           operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty", 
63 |           value: any 
64 |         }[];
65 |       }) => Promise<SearchResult[]>;
66 |       getDocument: (params: {documentSetId: number, documentId: string}) => Promise<{ text: string, metadata: Record<string, any> }>;
67 |       getSettings: () => Promise<Settings>;
68 |       setSettings: (settings: Settings) => Promise<{success: boolean}>;
69 |       deleteDocumentSet: (documentSetId: number) => Promise<void>;
70 |       getDocumentSet: (documentSetId: number) => Promise<DocumentSet>;
71 |     }
72 |   }
73 |   


--------------------------------------------------------------------------------
/src/renderer/src/main.ts:
--------------------------------------------------------------------------------
 1 | import './assets/main.css'
 2 | 
 3 | import App from './App.svelte'
 4 | import { mount } from "svelte";
 5 | 
 6 | const app = mount(App, { // mount the root component into the element with id "app"
 7 |   target: document.getElementById('app') // null if no such element exists — TODO confirm the host page always provides it
 8 | })
 9 | 
10 | export default app
11 | 
12 | export interface DocumentSet { // SearchPage.svelte stores the result of window.api.getDocumentSet in this type
13 |   documentSetId: number;
14 |   name: string;
15 |   uploadDate: Date;
16 |   parameters: Record<string, unknown>; // SearchPage.svelte reads `textColumns` and `metadataColumns` from here
17 |   totalDocuments: number;
18 | }
19 | 


--------------------------------------------------------------------------------
/svelte.config.mjs:
--------------------------------------------------------------------------------
1 | import { vitePreprocess } from '@sveltejs/vite-plugin-svelte'
2 | 
3 | export default {
4 |   // Consult https://svelte.dev/docs#compile-time-svelte-preprocess
5 |   // for more information about preprocessors
6 |   preprocess: vitePreprocess()
7 | }
8 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": "@electron-toolkit/tsconfig/tsconfig.json",
 3 |   "include": [
 4 |     "src/renderer/src/env.d.ts",
 5 |     "src/renderer/src/**/*",
 6 |     "src/renderer/src/**/*.svelte",
 7 |     "src/preload/*.d.ts"
 8 |   ],
 9 |   "compilerOptions": {
10 |     "verbatimModuleSyntax": true,
11 |     "useDefineForClassFields": true,
12 |     "strict": false,
13 |     "allowJs": true,
14 |     "checkJs": true,
15 |     "lib": ["ESNext", "DOM", "DOM.Iterable"]
16 |   },
17 |   "references": [{ "path": "./tsconfig.node.json" }]
18 | }
19 | 


--------------------------------------------------------------------------------
/tsconfig.node.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": "@electron-toolkit/tsconfig/tsconfig.node.json",
 3 |   "include": ["electron.vite.config.*", "src/main/**/*", "src/preload/**/*"],
 4 |   "compilerOptions": {
 5 |     "composite": true,
 6 |     "types": ["electron-vite/node"],
 7 |     "moduleResolution": "bundler"    
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------