├── .eslintrc.js
├── .github
    └── workflows
    │   └── deploy.yml
├── .gitignore
├── LICENSE
├── README.md
├── SemanticFinder.gif
├── SemanticFinder_Chrome_Extension.gif
├── SemanticFinder_Chrome_Extension_en.zip
├── SemanticFinder_gource.png
├── extension
    ├── .gitignore
    ├── README.md
    ├── package-lock.json
    ├── package.json
    ├── public
    │   ├── icons
    │   │   ├── logo128.png
    │   │   └── logo48.png
    │   └── manifest.json
    ├── src
    │   ├── content
    │   │   ├── content.css
    │   │   └── content.js
    │   ├── options
    │   │   ├── options.css
    │   │   ├── options.html
    │   │   └── options.js
    │   ├── popup
    │   │   ├── AnimatedInput.vue
    │   │   ├── popup.css
    │   │   ├── popup.html
    │   │   ├── popup.js
    │   │   ├── popup.vue
    │   │   └── result.vue
    │   ├── serviceworkers
    │   │   ├── background.js
    │   │   ├── pdf.js
    │   │   ├── pdf.sandbox.js
    │   │   ├── pdf.worker.entry.js
    │   │   ├── pdf.worker.js
    │   │   └── semantic.js
    │   └── utils
    │   │   ├── cache.js
    │   │   └── utils.js
    └── webpack.config.js
├── index.html
├── jsconfig.json
├── logo.png
├── misc
    ├── Generate_large_textfile_from_books.ipynb
    └── README.md
├── package-lock.json
├── package.json
├── src
    ├── css
    │   └── styles.css
    ├── js
    │   ├── SemanticFinder.svg
    │   ├── index.js
    │   ├── semantic.js
    │   ├── utils.js
    │   └── worker.js
    └── models
    │   ├── feature-extraction_downloads.json
    │   ├── feature-extraction_downloads_sizes.json
    │   ├── feature-extraction_likes.json
    │   ├── feature-extraction_likes_sizes.json
    │   ├── feature-extraction_modified.json
    │   ├── feature-extraction_modified_sizes.json
    │   ├── feature-extraction_trending.json
    │   ├── feature-extraction_trending_sizes.json
    │   ├── model_miner.js
    │   ├── model_miner_simple.js
    │   ├── model_size_miner.ipynb
    │   ├── text2text_downloads.json
    │   ├── text2text_likes.json
    │   ├── text2text_modified.json
    │   └── text2text_trending.json
└── webpack.config.js


/.eslintrc.js:
--------------------------------------------------------------------------------
 1 | module.exports = { // ESLint configuration object for this repository
 2 |     env: { // environments whose predefined globals are available to linted code
 3 |         browser: true,
 4 |         es2021: true
 5 |     },
 6 |     extends: 'standard', // base ruleset; the rules below are layered on top of it
 7 |     overrides: [
 8 |         { // override applied only to the ESLint config file itself
 9 |             env: {
10 |                 node: true
11 |             },
12 |             files: [
13 |                 '.eslintrc.{js,cjs}'
14 |             ],
15 |             parserOptions: {
16 |                 sourceType: 'script' // the config file uses module.exports, so it is parsed as a script
17 |             }
18 |         }
19 |     ],
20 |     parserOptions: {
21 |         ecmaVersion: 'latest',
22 |         sourceType: 'module' // everything not matched by the override is parsed as an ES module
23 |     },
24 |     rules: {
25 |         indent: ['error', 4], // 4-space indentation, reported as an error
26 |         'space-before-function-paren': ['error', 'never'], // `function()` rather than `function ()`
27 |         semi: ['error', 'always'] // statements must end with a semicolon
28 |     }
29 | };
30 | 


--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy to GitHub Pages
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - webgpu
 8 | 
 9 | jobs:
10 |   deploy:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |       # Checkout and Deploy Main Branch
15 |       - name: Checkout main branch
16 |         uses: actions/checkout@v3
17 |         with:
18 |           ref: main
19 | 
20 |       - name: Set up Node.js for main
21 |         uses: actions/setup-node@v3
22 |         with:
23 |           node-version: '22'
24 |           cache: 'npm'
25 | 
26 |       - name: Install dependencies for main
27 |         run: npm install
28 | 
29 |       # Build and deploy main branch
30 |       - name: Build and deploy main
31 |         run: |
32 |           npm run build
33 |           mkdir -p main_build
34 |           mv dist/* main_build/
35 |           echo "Deploying main branch..."
36 | 
37 |       - name: Deploy main to GitHub Pages
38 |         uses: peaceiris/actions-gh-pages@v3
39 |         with:
40 |           github_token: ${{ secrets.GITHUB_TOKEN }}
41 |           publish_dir: ./main_build # Deploy from your custom directory
42 | 
43 |       # Checkout and Deploy webgpu Branch
44 |       - name: Checkout webgpu branch
45 |         uses: actions/checkout@v3
46 |         with:
47 |           ref: webgpu
48 | 
49 |       - name: Set up Node.js for webgpu
50 |         uses: actions/setup-node@v3
51 |         with:
52 |           node-version: '22'
53 |           cache: 'npm' 
54 | 
55 |       - name: Install dependencies for webgpu
56 |         run: npm install
57 | 
58 |       # Build and deploy webgpu branch
59 |       - name: Build and deploy webgpu
60 |         run: |
61 |           npm run build
62 |           mkdir -p webgpu_build
63 |           mv dist/* webgpu_build/
64 |           echo "Deploying webgpu branch..."
65 | 
66 |       - name: Deploy webgpu to GitHub Pages subdirectory
67 |         uses: peaceiris/actions-gh-pages@v3
68 |         with:
69 |           github_token: ${{ secrets.GITHUB_TOKEN }}
70 |           publish_dir: ./webgpu_build 
71 |           destination_dir: webgpu 
72 |           keep_files: true
73 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .vscode/
3 | .idea/
4 | .DS_Store
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 do-me
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <a href="https://do-me.github.io/SemanticFinder/">
  3 |     <img src="https://github.com/do-me/SemanticFinder/assets/47481567/4522ab9d-08f4-4f4c-92db-dbf14ccb2b70" width="320" alt="SemanticFinder">
  4 |   </a>    
  5 | <h1 align="center">Frontend-only live semantic search and chat-with-your-documents built on transformers.js. Supports Wasm and WebGPU!</h1>
  6 | </p>
  7 | 
  8 | ![](/SemanticFinder.gif?)
  9 | 
 10 | ## [Try the web app](https://do-me.github.io/SemanticFinder/), [install the Chrome extension](#browser-extension) or read the [introduction blog post](https://geo.rocks/post/semanticfinder-semantic-search-frontend-only/).
 11 | 
 12 | ## 🔥 For best performance try the [WebGPU Version here!](https://do-me.github.io/SemanticFinder/webgpu/) 🔥
 13 | 
 14 | Semantic search right in your browser! Calculates the embeddings and cosine similarity client-side without server-side inferencing, using [transformers.js](https://xenova.github.io/transformers.js/) and latest SOTA embedding models from Huggingface.
 15 | 
 16 | ## Intro Video
 17 | [![SemanticFinder Introduction](https://github.com/user-attachments/assets/9febc0e7-f444-4039-8cf2-af39f3d7733f)](https://www.youtube.com/watch?v=FZsWH1J4MXo "Get started with semantic search in the browser")
 18 | 
 19 | ## Models
 20 | All transformers.js-compatible feature-extraction models are supported. Here is a sortable list you can go through: [daily updated list](https://do-me.github.io/trending-huggingface-models/). Download the compatible models table as xlsx, csv, json, parquet, or html here: https://github.com/do-me/trending-huggingface-models/.
 21 | Note that the wasm backend in transformers.js supports all mentioned models. If you want the best performance, make sure to use a WebGPU-compatible model.
 22 | 
 23 | ## Catalogue 
 24 | You can use super fast pre-indexed examples for *really* large books like the Bible or Les Misérables with hundreds of pages and search the content in less than 2 seconds 🚀. Try one of these and convince yourself:
 25 | 
 26 | | filesize | textTitle | textAuthor | textYear | textLanguage | URL | modelName | quantized | splitParam | splitType | characters | chunks | wordsToAvoidAll | wordsToCheckAll | wordsToAvoidAny | wordsToCheckAny | exportDecimals | lines | textNotes | textSourceURL | filename |
 27 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 28 | | 4.78 | Das Kapital | Karl Marx | 1867 | de | https://do-me.github.io/SemanticFinder/?hf=Das_Kapital_c1a84fba | Xenova/multilingual-e5-small | True | 80 | Words | 2003807 | 3164 |  |  |  |  | 5 | 28673 |  | https://ia601605.us.archive.org/13/items/KarlMarxDasKapitalpdf/KAPITAL1.pdf | Das_Kapital_c1a84fba.json.gz |
 29 | | 2.58 | Divina Commedia | Dante | 1321 | it | https://do-me.github.io/SemanticFinder/?hf=Divina_Commedia_d5a0fa67 | Xenova/multilingual-e5-base | True | 50 | Words | 383782 | 1179 |  |  |  |  | 5 | 6225 |  | http://www.letteratura-italiana.com/pdf/divina%20commedia/08%20Inferno%20in%20versione%20italiana.pdf | Divina_Commedia_d5a0fa67.json.gz |
 30 | | 11.92 | Don Quijote | Miguel de Cervantes | 1605 | es | https://do-me.github.io/SemanticFinder/?hf=Don_Quijote_14a0b44 | Xenova/multilingual-e5-base | True | 25 | Words | 1047150 | 7186 |  |  |  |  | 4 | 12005 |  | https://parnaseo.uv.es/lemir/revista/revista19/textos/quijote_1.pdf | Don_Quijote_14a0b44.json.gz |
 31 | | 0.06 | Hansel and Gretel | Brothers Grimm | 1812 | en | https://do-me.github.io/SemanticFinder/?hf=Hansel_and_Gretel_4de079eb | TaylorAI/gte-tiny | True | 100 | Chars | 5304 | 55 |  |  |  |  | 5 | 9 |  | https://www.grimmstories.com/en/grimm_fairy-tales/hansel_and_gretel | Hansel_and_Gretel_4de079eb.json.gz |
 32 | | 1.74 | IPCC Report 2023 | IPCC | 2023 | en | https://do-me.github.io/SemanticFinder/?hf=IPCC_Report_2023_2b260928 | Supabase/bge-small-en | True | 200 | Chars | 307811 | 1566 |  |  |  |  | 5 | 3230 | state of knowledge of climate change | https://report.ipcc.ch/ar6syr/pdf/IPCC_AR6_SYR_LongerReport.pdf | IPCC_Report_2023_2b260928.json.gz |
 33 | | 25.56 | King James Bible |  | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_24f6dc4c | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 |  |  |  |  | 5 | 80496 |  | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_24f6dc4c.json.gz |
 34 | | 11.45 | King James Bible |  | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_6434a78d | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 |  |  |  |  | 2 | 80496 |  | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_6434a78d.json.gz |
 35 | | 39.32 | Les Misérables | Victor Hugo | 1862 | fr | https://do-me.github.io/SemanticFinder/?hf=Les_Misérables_2239df51 | Xenova/multilingual-e5-base | True | 25 | Words | 3236941 | 19463 |  |  |  |  | 5 | 74491 | All five acts included | https://beq.ebooksgratuits.com/vents/Hugo-miserables-1.pdf | Les_Misérables_2239df51.json.gz |
 36 | | 0.46 | REGULATION (EU) 2023/138 | European Commission | 2022 | en | https://do-me.github.io/SemanticFinder/?hf=REGULATION_(EU)_2023_138_c00e7ff6 | Supabase/bge-small-en | True | 25 | Words | 76809 | 424 |  |  |  |  | 5 | 1323 |  | https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R0138&qid=1704492501351 | REGULATION_(EU)_2023_138_c00e7ff6.json.gz |
 37 | | 0.07 | Universal Declaration of Human Rights | United Nations | 1948 | en | https://do-me.github.io/SemanticFinder/?hf=Universal_Declaration_of_Human_Rights_0a7da79a | TaylorAI/gte-tiny | True | \nArticle  | Regex | 8623 | 63 |  |  |  |  | 5 | 109 | 30 articles | https://www.un.org/en/about-us/universal-declaration-of-human-rights | Universal_Declaration_of_Human_Rights_0a7da79a.json.gz |
 38 | 
 39 | ## Import & Export 
 40 | 
 41 | You can create indices yourself with one or two clicks and save them. If it's something private, keep it to yourself; if it's a classic book or something you think others might be interested in, consider a PR on the [Huggingface Repo](https://huggingface.co/datasets/do-me/SemanticFinder) or get in touch with us. Book requests are happily met if you provide us with a good source link that we can copy & paste from. Simply open an issue here with [Book Request] or similar, or contact us.
 42 | 
 43 | It goes without saying that no discriminating content will be tolerated.
 44 | 
 45 | ## Installation 
 46 | 
 47 | Clone the repository and install dependencies with 
 48 | 
 49 | `npm install`
 50 | 
 51 | Then run with
 52 | 
 53 | `npm run start`
 54 | 
 55 | If you want to build instead, run 
 56 | 
 57 | `npm run build`
 58 | 
 59 | Afterwards, you'll find the `index.html`, `main.css` and `bundle.js` in `dist`.
 60 | 
 61 | ## Browser extension 
 62 | Download the Chrome extension from [Chrome webstore](https://chrome.google.com/webstore/detail/semanticfinder/ddmgffoffelnhnonpoiblaoboaeieejl) and pin it. Right click the extension icon for `options`: 
 63 | - choose distiluse-base-multilingual-cased-v2 for multilingual usage (default is English-only)
 64 | - set a higher number for min characters to split by for larger texts
 65 | 
 66 | ![](SemanticFinder_Chrome_Extension.gif?)
 67 | 
 68 | ### Local build 
 69 | If you want to build the browser extension locally, clone the repo and cd in `extension` directory then run: 
 70 | - `npm install`
 71 | - `npm run build` for a static build or
 72 | - `npm run dev` for the auto-refreshing development version
 73 | - go to Chrome extension settings with `chrome://extensions`
 74 | - select `Load Unpacked` and choose the `build` folder
 75 | - pin the extension in Chrome so you can access it easily. If it doesn't work for you, feel free to open an issue.
 76 | 
 77 | ## Speed 
 78 | Tested on the entire book of [Moby Dick](https://archive.org/stream/mobydickorwhale01melvuoft/mobydickorwhale01melvuoft_djvu.txt) with 660.000 characters ~13.000 lines or ~111.000 words. 
 79 | Initial embedding generation takes **1-2 mins** on my old i7-8550U CPU with 1000 characters as segment size. Following queries take only ~2 seconds! 
 80 | If you want to query larger texts or keep an entire library of books indexed, use a [proper vector database instead](https://geo.rocks/post/qdrant-transformers-js-semantic-search/).
 81 | 
 82 | ## Features
 83 | 
 84 | You can customize everything!
 85 | 
 86 | - Input text & search term(s)
 87 | - Hybrid search (semantic search & full-text search)
 88 | - Segment length (the bigger the faster, the smaller the slower)
 89 | - Highlight colors (currently hard-coded)
 90 | - Number of highlights are based on the threshold value. The lower, the more results.
 91 | - Live updates
 92 | - Easy integration of other ML-models thanks to [transformers.js](https://xenova.github.io/transformers.js/)
 93 | - Data privacy-friendly - your input text data is not sent to a server, it stays in your browser!
 94 | 
 95 | ## Usage ideas
 96 | 
 97 | - Basic search through anything, like your personal notes (my initial motivation by the way, a huge notes.txt file I couldn't handle anymore)
 98 | - Remember poem analysis in school? Often you look for possible Leitmotifs or recurring categories like **food** in Hänsel & Gretel
 99 | 
100 | ## Future ideas
101 | 
102 | - One could package everything nicely and use it e.g. instead of JavaScript search engines such as [Lunr.js](https://lunrjs.com/) (also being used in [mkdocs-material](https://squidfunk.github.io/mkdocs-material/setup/setting-up-site-search/)).
103 | - Integration in mkdocs (mkdocs-material) **experimental**:
104 |     - when building the docs, slice all `.md`-files in chunks (length defined in `mkdocs.yaml`). Should be fairly large (>800 characters) for lower response time. It's also possible to build n indices with first a coarse index (maybe per document/ `.md`-file if the used model supports the length) and then a refined one for the document chunks
105 |     - build the index by calculating the embeddings for all docs/chunks 
106 |     - when a user queries the docs, a switch can toggle (fast) full-text standard search (atm with lunr.js) or experimental semantic search 
107 |     - if the latter is being toggled, the client loads the model (all-MiniLM-L6-v2 has ~30mb) 
108 |     - like in SemanticFinder, the embedding is created client-side and the cosine similarity calculated 
109 |     - the high-scored results are returned just like with lunr.js so the user shouldn't even notice a difference in the UI
110 | - Electron- or browser-based apps could be augmented with semantic search, e.g. VS Code, Atom or mobile apps. 
111 | - Integration in personal wikis such as Obsidian, tiddlywiki etc. would save you the tedious tagging/keywords/categorisation work or could at least improve your structure further
112 | - Search your own browser history (thanks [@Snapdeus](https://twitter.com/snapdeus/status/1646233904691413006))
113 | - Integration in chat apps
114 | - Allow PDF-uploads (conversion from PDF to text) 
115 | - Integrate with Speech-to-Text whisper model from transformers.js to allow audio uploads.
116 | - Thanks to [CodeMirror](https://codemirror.net/) one could even use syntax highlighting for programming languages such as Python, JavaScript etc. 
117 | 
118 | ## Logic 
119 | 
120 | [Transformers.js](https://xenova.github.io/transformers.js/) is doing all the heavy lifting of tokenizing the input and running the model. Without it, this demo would have been impossible. 
121 | 
122 | **Input**
123 | - Text, as much as your browser can handle! The demo uses a part of "Hänsel & Gretel" but it can handle hundreds of PDF pages
124 | - A search term or phrase
125 | - The number of characters the text should be segmented in
126 | - A similarity threshold value. Results with lower similarity score won't be displayed.
127 | 
128 | **Output**
129 | - <span style="background-color: rgb(0, 255, 81);">Three highlighted string segments</span>, the darker the higher the similarity score.
130 | 
131 | **Pipeline**
132 | 
133 | 0. All scripts are loaded. The model is loaded once from HuggingFace and afterwards cached in the browser.
134 | 1. A user inputs some text and a search term or phrase.
135 | 2. Depending on the approximate length to consider (unit=characters), the text is split into **segments**. Words themselves are never split, which is why the segment length is only approximate.
136 | 3. The search term embedding is created.
137 | 4. For each **segment** of the text, the embedding is created. 
138 | 5. Meanwhile, the cosine similarity is calculated between every **segment** embedding and the search term embedding. It's written to a dictionary with the segment as key and the score as value.
139 | 6. For every iteration, the progress bar and the highlighted sections are updated in real-time depending on the highest scores in the array.
140 | 7. The embeddings are cached in the dictionary so that subsequent queries are quite fast. The calculation of the cosine similarity is fairly speedy in comparison to the embedding generation. 
141 | 8. **Only if the user changes the segment length**, the embeddings must be recalculated.  
142 | 
143 | ## Collaboration 
144 | PRs welcome!
145 | 
146 | ## To Dos (no prioritization)
147 | - [x] similarity score cutoff/threshold
148 | - [x] add option for more highlights (e.g. all above certain score)
149 | - [x] add stop button 
150 | - [x] MaterialUI for input fields or proper labels
151 | - [x] create a demo without CDNs
152 | - [x] separate one html properly in html, js, css
153 | - [x] add npm installation 
154 | - [x] option for loading embeddings from file or generally allow sharing embeddings in some way
155 | - [x] simplify chunking function so the original text can be loaded without issues
156 | - [ ] improve the color range
157 | - [ ] rewrite the cosine similarity function in Rust, port to WASM and load as a module for possible speedup (experimental)
158 | - [ ] UI overhaul
159 | - [ ] polish code 
160 | - [x]   - jQuery/vanilla JS mixed
161 | - [ ]   - clean up functions 
162 | - [ ]   - add more comments
163 | - [ ] add possible use cases
164 | - [ ] package as a standalone application (maybe with custom model choice; to be downloaded once from HF hub, then saved locally)
165 | - [ ] possible integration as example in [transformers.js homepage](https://github.com/xenova/transformers.js/issues/84)
166 | 
167 | ## Star History
168 | 
169 | [![Star History Chart](https://api.star-history.com/svg?repos=do-me/SemanticFinder&type=Timeline)](https://star-history.com/#do-me/SemanticFinder&Timeline)
170 | 
171 | ## Gource Map 
172 | 
173 | ![image](SemanticFinder_gource.png)
174 | 
175 | Gource image created with: 
176 | 
177 | ```bash 
178 | gource -1280x720 --title "SemanticFinder" --seconds-per-day 0.03 --auto-skip-seconds 0.03 --bloom-intensity 0.5 --max-user-speed 500 --highlight-dirs --multi-sampling --highlight-colour 00FF00  
179 | ```
180 | 


--------------------------------------------------------------------------------
/SemanticFinder.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder.gif


--------------------------------------------------------------------------------
/SemanticFinder_Chrome_Extension.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension.gif


--------------------------------------------------------------------------------
/SemanticFinder_Chrome_Extension_en.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension_en.zip


--------------------------------------------------------------------------------
/SemanticFinder_gource.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_gource.png


--------------------------------------------------------------------------------
/extension/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/extension/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # SemanticFinder Browser Extension
 3 | 
 4 | 
 5 | ## Getting Started
 6 | 1. Install the necessary dependencies:
 7 |     ```bash
 8 |     npm install 
 9 |     ```
10 | 
11 | 2. Build the project:
12 |     ```bash
13 |     npm run build 
14 |     ```
15 |    or
16 |     ```bash
17 |     npm run dev 
18 |     ```
19 |    for auto-reload. 
20 | 
21 | 3. Add the extension to your browser. To do this, go to `chrome://extensions/`, enable developer mode (top right), and click "Load unpacked". Select the `build` directory from the dialog which appears and click "Select Folder".
22 | 
23 | 
24 | ----
25 | 
26 | A big thank you to Xenova, whose work on 🤗 Transformers.js makes this entire project possible. 
27 | 


--------------------------------------------------------------------------------
/extension/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "SemanticFinder",
 3 |   "version": "0.0.1",
 4 |   "description": "SemanticFinder | In-browser Semantic Search via Transformers.js",
 5 |   "scripts": {
 6 |     "build": "webpack",
 7 |     "dev": "webpack --watch"
 8 |   },
 9 |   "type": "module",
10 |   "author": "Varun Neal Srivastava",
11 |   "contributors": [
12 |     "Dominik Weckmüller",
13 |     "Xenova"
14 |   ],
15 |   "license": "MIT",
16 |   "devDependencies": {
17 |     "copy-webpack-plugin": "^11.0.0",
18 |     "css-loader": "^6.8.1",
19 |     "html-webpack-plugin": "^5.5.1",
20 |     "pdfjs-dist": "^3.9.179",
21 |     "style-loader": "^3.3.3",
22 |     "vue-loader": "^17.2.2",
23 |     "vue-template-compiler": "^2.7.14",
24 |     "vueify": "^9.4.1",
25 |     "webpack": "^5.79.0",
26 |     "webpack-cli": "^5.1.4"
27 |   },
28 |   "dependencies": {
29 |     "@mozilla/readability": "^0.4.4",
30 |     "@vue/compiler-sfc": "^3.3.4",
31 |     "@xenova/transformers": "^2.5.0",
32 |     "mark.js": "^8.11.1",
33 |     "node-polyfill-webpack-plugin": "^2.0.1",
34 |     "util": "^0.12.5",
35 |     "vue": "^3.3.4"
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/extension/public/icons/logo128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo128.png


--------------------------------------------------------------------------------
/extension/public/icons/logo48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo48.png


--------------------------------------------------------------------------------
/extension/public/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "manifest_version": 3,
 3 |   "name": "SemanticFinder",
 4 |   "description": "SemanticFinder | In-browser Semantic Search via Transformers.js",
 5 |   "version": "0.0.1",
 6 |   "host_permissions": ["http://*/*", "https://*/*"],
 7 |   "permissions": [
 8 |     "scripting",
 9 |     "activeTab",
10 |     "storage",
11 |     "unlimitedStorage"
12 |   ],
13 |   "options_ui": {
14 |     "page": "options.html",
15 |     "open_in_tab": true
16 |   },
17 |   "background": {
18 |     "service_worker": "background.js",
19 |     "type": "module"
20 |   },
21 |   "content_scripts": [
22 |     {
23 |       "matches": [
24 |         "<all_urls>"
25 |       ],
26 |       "js": [
27 |         "content.js", "pdf.js", "pdf.worker.js"
28 |       ],
29 |       "css": [
30 |         "content.css"
31 |       ]
32 |     }
33 |   ],
34 |   "minimum_chrome_version": "92",
35 |   "action": {
36 |     "default_icon": {
37 |       "16": "icons/logo48.png",
38 |       "24": "icons/logo48.png",
39 |       "32": "icons/logo128.png"
40 |     },
41 |     "default_title": "SemanticFinder",
42 |     "default_popup": "popup.html"
43 |   },
44 |   "content_security_policy": {
45 |     "extension_pages": "script-src 'self' 'wasm-unsafe-eval'"
46 |   },
47 |   "icons": {
48 |     "16": "icons/logo48.png",
49 |     "48": "icons/logo48.png",
50 |     "128": "icons/logo128.png"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/extension/src/content/content.css:
--------------------------------------------------------------------------------
1 | 
2 | .SemanticFinder-highlight {
3 |     background-color: #ffff33 !important;
4 |     color: black !important;
5 | }
6 | 


--------------------------------------------------------------------------------
/extension/src/content/content.js:
--------------------------------------------------------------------------------
  1 | // content.js
  2 | import {prettyLog, splitReadableContent} from '../utils/utils.js';
  3 | import {Readability} from '@mozilla/readability';
  4 | import Mark from 'mark.js';
  5 | import {getDocument, GlobalWorkerOptions} from 'pdfjs-dist';
  6 | 
  7 | 
  8 | async function fetchAndExtractPDFText(url) {
  9 |     GlobalWorkerOptions.workerSrc = chrome.runtime.getURL('../serviceworkers/pdf.worker.js');
 10 | 
 11 |     const pdf = await getDocument(url).promise;
 12 | 
 13 |     let totalPages = pdf.numPages;
 14 |     let texts = [];
 15 | 
 16 |     for (let i = 1; i <= totalPages; i++) {
 17 |         // console.log("page ", i);
 18 |         const page = await pdf.getPage(i);
 19 |         const textContent = await page.getTextContent();
 20 |         const pageText = textContent.items.map(item => item.str).join(' ');
 21 |         texts.push(pageText);
 22 |     }
 23 | 
 24 |     return texts.join(' ');
 25 | }
 26 | 
 27 | function getValueFromStorage(key, defaultValue) {
 28 |     return new Promise((resolve, reject) => {
 29 |         chrome.storage.sync.get(key, function(result) {
 30 |             if (chrome.runtime.lastError) {
 31 |                 reject(new Error(chrome.runtime.lastError));
 32 |             } else {
 33 |                 resolve(result[key] || defaultValue);
 34 |             }
 35 |         });
 36 |     });
 37 | }
 38 | 
 39 | async function fetchNumChars() {
 40 |     try {
 41 |         const defaultNumChars = 50; // You can set this to your desired default value
 42 |         const storedNumChars = await getValueFromStorage('num_chars', defaultNumChars);
 43 |         return storedNumChars;
 44 |     } catch (error) {
 45 |         console.error('Error fetching num_chars:', error);
 46 |         return null;
 47 |     }
 48 | }
 49 | 
 50 | chrome.runtime.onMessage.addListener(async function(request, sender) {
 51 |     try {
 52 |         let currentURL = window.location.href;
 53 |         if (request.type === "getText") {
 54 |             const numChars = await fetchNumChars();
 55 |             let texts = [];
 56 | 
 57 |       if (request.contentType == "application/pdf") {
 58 |                 let textContent = await fetchAndExtractPDFText(currentURL);
 59 |                 texts = splitReadableContent(textContent, numChars);
 60 | 
 61 |             } else {
 62 |                 let concatenatedContent = "";
 63 | 
 64 |                 const iframes = document.querySelectorAll('iframe');
 65 |                 console.dir(iframes);
 66 | 
 67 |                 iframes.forEach(function(iframe) {
 68 |                     try {
 69 |                         const iframeDocument = iframe.contentDocument;
 70 | 
 71 |                         if (iframeDocument) {
 72 | 
 73 |                             let { textContent } = new Readability(iframeDocument.cloneNode(true)).parse();
 74 |                             prettyLog("Iframe text content:", textContent, "orange");
 75 |                             concatenatedContent += textContent;
 76 |                         }
 77 |                     } catch (error) {
 78 |                         prettyLog("Skipped an iframe due to permissions issue:", error, "red");
 79 |                     }
 80 |                 });
 81 | 
 82 |                 const documentClone = document.cloneNode(true);
 83 |                 let { textContent } = new Readability(documentClone).parse();
 84 |                 concatenatedContent += textContent;
 85 |                 // prettyLog("Main document text content:", textContent);
 86 | 
 87 |                 texts = splitReadableContent(concatenatedContent, numChars);
 88 | 
 89 |             }
 90 |             chrome.runtime.sendMessage({type: "tabUpdated", text: texts, currentURL});
 91 |         } else if (request.type === 'highlightAndScroll') {
 92 |             // if (currentURL.endsWith('.pdf')) { return; }
 93 |             if (!highlightAndScrollToText(request.text)) {
 94 |                 chrome.runtime.sendMessage({type: "error", reason: "Cannot find and highlight selection."})
 95 |             }
 96 |         }
 97 |     } catch (error) {
 98 |         prettyLog("ERROR", error.message, "red", "red");
 99 |         if (error.message.includes('net::ERR_BLOCKED_BY_CLIENT')) {
100 |             chrome.runtime.sendMessage({type: "error", reason: "ERR_BLOCKED_BY_CLIENT"});
101 |         } else {
102 |             chrome.runtime.sendMessage({type: "error", reason: error.message});
103 |         }
104 |     }
105 | });
106 | 
107 | 
// Text of the most recent highlight request; used to decide whether an
// earlier highlight has to be removed first.
let currText;
// mark.js instance bound to the page body.
let instance = new Mark(document.querySelector("body"));

/**
 * Highlight `text` in the page and scroll its first occurrence into view.
 *
 * When the full text cannot be found, the search is retried with the longest
 * newline-separated segment of the text, at most `depth - 1` more times.
 *
 * NOTE(review): the result is read right after `mark()` returns, so this
 * assumes the `each` callbacks run synchronously. Confirm for the
 * `iframes: true` case.
 *
 * @param {string} text - Text to highlight.
 * @param {number} [depth=3] - Remaining attempts, counting this one.
 * @returns {boolean} `true` when at least one match was highlighted.
 */
function highlightAndScrollToText(text, depth= 3) {
    if (depth === 0) {
        return false;
    }
    // If there's a previous highlighted text, unmark it
    if (currText) {
        instance.unmark({"element": "span", "className": "SemanticFinder-highlight"});
    }

    currText = text;

    let textFound = false;

    instance.mark(text, {
        "element": "span",
        "separateWordSearch": false,
        "className": "SemanticFinder-highlight",
        "acrossElements": true,
        "wildcards": "enabled",
        "iframes": true,
        "each": function (node) {
            // Scroll only to the first marked node; scrolling on every match
            // would leave the viewport on the last one.
            if (!textFound) {
                node.scrollIntoView({
                    behavior: "smooth",
                    block: "center"
                });
                textFound = true;
            }
        }
    });

    if (textFound) {
        return true;
    }

    // can use "noMatch" in markjs instead
    // Keep the first of the longest segments without reordering anything.
    const longestSegment = text
        .split('\n')
        .reduce((longest, segment) => (segment.length > longest.length ? segment : longest), "");

    // Nothing left to try, or the retry would repeat the search that just failed.
    if (!longestSegment || longestSegment === text) {
        return false;
    }
    return highlightAndScrollToText(longestSegment, depth - 1);
}
153 | 
154 | 
155 | 


--------------------------------------------------------------------------------
/extension/src/options/options.css:
--------------------------------------------------------------------------------
/* Styles for the options page: a centred white settings card on a light grey page. */
body {
    font-family: 'Helvetica', sans-serif;
    padding: 20px;
    background-color: #f5f5f5;
}

/* Fixed-width, horizontally centred card. */
.container {
    background-color: #fff;
    padding: 20px;
    border-radius: 5px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    width: 300px; /* Adjust as needed */
    margin: 0 auto;
}

/* Each label sits on its own line above its control. */
label {
    display: block;
    margin-bottom: 10px;
}

/* Form controls fill the card width; border-box keeps padding inside that width. */
select,
input {
    width: 100%;
    padding: 8px;
    box-sizing: border-box;
    margin-bottom: 20px;
    border-radius: 5px;
    border: 1px solid #ccc;
}

/* Blue block-level buttons. */
.rectangular-button {
    display: block;
    background-color: #007bff;
    color: white;
    padding: 10px 20px;
    text-align: center;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    margin-bottom: 10px;
    transition: background-color 0.3s;
}

/* Darker blue on hover, animated by the transition above. */
.rectangular-button:hover {
    background-color: #0056b3;
}
47 | 


--------------------------------------------------------------------------------
/extension/src/options/options.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | 
 4 | <head>
 5 |     <meta charset="UTF-8">
 6 |     <title>SemanticFinder Settings</title>
 7 |     <link rel="stylesheet" href="options.css">
 8 | </head>
 9 | 
10 | <body>
11 | <div class="container">
12 |     <label for="modelSelector">Model Selector:</label>
13 |     <select id="modelSelector">
14 |         <option value="TaylorAI/gte-tiny" selected>gte-tiny (23 MB)</option>
15 |         <option value="Supabase/gte-small">gte-small (34 MB)</option>
16 |         <option value="Xenova/gte-base">gte-base (110 MB)</option>                               
17 |         <option value="Xenova/gte-large">gte-large (337 MB)</option>   
18 |         <option value="Xenova/all-MiniLM-L6-v2">all-MiniLM-L6-v2 (23 MB)</option>
19 |         <option value="Xenova/multilingual-e5-small">multilingual-e5-small (118 MB)</option>
20 |         <option value="Xenova/distiluse-base-multilingual-cased-v2">distiluse-base-multilingual-cased-v2 (135 MB)</option>
21 |         <option value="Xenova/e5-large-v2">e5-large-v2 (335 MB)</option>
22 |         <option value="Xenova/bge-small-en-v1.5">bge-small-en-v1.5 (34 MB)</option>
23 |         <option value="Xenova/bge-base-en-v1.5">bge-base-en-v1.5 (110 MB)</option>
24 |         <option value="Xenova/e5-base-v2">e5-base-v2 (110 MB)</option>
25 |         <option value="Xenova/kobert">kobert (92 MB)</option>
26 |     </select>
27 | 
28 |     <br><br>
29 | 
30 |     <label for="minCharsInput">Min # chars to split by:</label>
31 |     <input type="number" id="minCharsInput" min="2" value="50">
32 | 
33 |     <br><br>
34 |     <button id="restoreButton" class="rectangular-button">Restore Defaults</button>
35 | 
36 |     <button id="saveButton" class="rectangular-button">Save Settings</button>
37 | </div>
38 | 
39 | <script src="options.js"></script>
40 | </body>
41 | 
42 | </html>
43 | 


--------------------------------------------------------------------------------
/extension/src/options/options.js:
--------------------------------------------------------------------------------
 1 | 
 2 | document.addEventListener('DOMContentLoaded', function() {
 3 |     loadSettings();
 4 | 
 5 |     const b = document.getElementById('saveButton')
 6 |     if (b) {
 7 |         b.addEventListener('click', saveSettings);
 8 |     }
 9 | 
10 |     const r = document.getElementById('restoreButton')
11 |     if (r) {
12 |         r.addEventListener('click', restoreDefaults);
13 |     }
14 | });
15 | 
/**
 * Persist the options form (model name and minimum chunk size) to
 * `chrome.storage.sync`.
 *
 * The chunk size is stored as an integer: a non-numeric or empty field falls
 * back to 50, and values below the input's minimum of 2 are raised to 2. The
 * normalised value is written back into the field so the form shows what was
 * saved.
 *
 * @param {boolean|Event} [showAlert=true] - Whether to show an alert when the
 *   write finishes. When used directly as a click handler this receives the
 *   click event, which is truthy, so the alert is shown.
 */
function saveSettings(showAlert = true) {
    const defaultNumChars = 50;
    const minNumChars = 2;

    const modelName = document.getElementById('modelSelector').value;
    const numCharsInput = document.getElementById('minCharsInput');

    const parsedNumChars = Number.parseInt(numCharsInput.value, 10);
    const numChars = Number.isFinite(parsedNumChars)
        ? Math.max(minNumChars, parsedNumChars)
        : defaultNumChars;
    numCharsInput.value = numChars;

    chrome.storage.sync.set({
        'model_name': modelName,
        'num_chars': numChars
    }, function() {
        const lastError = chrome.runtime.lastError;
        if (lastError) {
            // Do not claim success when the write failed.
            console.error('Error saving settings:', lastError.message);
            if (showAlert) {
                alert('Failed to save settings.');
            }
            return;
        }
        if (showAlert) {
            alert('Settings saved.');
        }
    });
}
29 | 
/**
 * Reset both form fields to their default values and persist them right
 * away, without showing the confirmation alert.
 */
function restoreDefaults() {
    const setField = (elementId, value) => {
        document.getElementById(elementId).value = value;
    };

    setField('modelSelector', 'Supabase/gte-small'); // Default model
    setField('minCharsInput', 50); // Default number

    saveSettings(false);
}
36 | 
37 | 
/**
 * Fill the options form from `chrome.storage.sync`.
 *
 * A field is only touched when a truthy value is stored for it and the
 * matching element exists; otherwise it keeps whatever the markup defines.
 */
function loadSettings() {
    // storage key -> id of the form element that displays it
    const fieldIdByKey = {
        'model_name': 'modelSelector',
        'num_chars': 'minCharsInput',
    };

    chrome.storage.sync.get(Object.keys(fieldIdByKey), (items) => {
        for (const [storageKey, fieldId] of Object.entries(fieldIdByKey)) {
            if (!items[storageKey]) {
                continue;
            }
            const field = document.getElementById(fieldId);
            if (field) {
                field.value = items[storageKey];
            }
        }
    });
}
54 | 


--------------------------------------------------------------------------------
/extension/src/popup/AnimatedInput.vue:
--------------------------------------------------------------------------------
 1 | <template>
 2 |     <div class="input-container">
 3 |         <input v-model="inputText" type="text" class="search-bar" ref="searchBar" placeholder="Search">
 4 |     </div>
 5 | </template>
 6 | 
 7 | <script>
 8 | 
 9 | export default {
10 |     name: 'AnimatedInput',
11 |     data() {
12 |         return {
13 |             inputText: "",
14 |             debounceTimeout: null
15 |         };
16 |     },
17 | 
18 |     methods: {
19 |         debounce(func, args, wait) {
20 |             clearTimeout(this.debounceTimeout);
21 |             this.debounceTimeout = setTimeout(() => {
22 |                 func.apply(this, args);
23 |             }, wait);
24 |         },
25 |         async spawnProcess(type, text) {
26 |             await chrome.runtime.sendMessage({type: type, text: text}); // await?
27 |         },
28 |     },
29 | 
30 |     watch: {
31 |         inputText: function (newVal, oldVal) {
32 |             if (newVal !== oldVal) {
33 |                 this.debounce(this.spawnProcess, ["inputText", this.inputText], 250);
34 |             }
35 |             if (this.inputText === "") {
36 |                 this.$parent.results = []
37 |             }
38 |         }
39 |     }
40 | }
41 | 
42 | </script>
43 | <style scoped>
44 | 
45 | @import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&display=swap');
46 | 
47 | .input-container {
48 |     width: 93%;
49 |     height: 30px;
50 |     background-color: black;
51 |     display: flex;
52 |     align-items: center;
53 |     justify-content: center;
54 |     padding: 0 5px;
55 | }
56 | 
57 | /*
58 |  dark orange: FF793B
59 |  light orange: ff9d3b
60 |  dark yellow: FFBF3E
61 |  bright yellow: ffd23e
62 |  */
63 | .search-bar {
64 |     width: 100%;
65 |     height: 20px;
66 |     background-color: #000;
67 |     border: none;
68 |     outline: none;
69 |     color: white;
70 |     font-family: 'Space Mono', 'monospace'; /* Same font as loading text */
71 |     font-size: 12px;
72 |     font-weight: 400;
73 |     padding-left: 7.5px; /* Same padding as loading text */
74 | }
75 | 
76 | .search-bar:hover {
77 |     background-color: #525252;
78 | }
79 | 
80 | 
81 | .search-bar::placeholder {
82 |     color: white; /* Set placeholder text color to white */
83 |     font-weight: 700;
84 |     text-shadow:
85 |         -1px -1px 0 #000,
86 |         1px -1px 0 #000,
87 |         -1px 1px 0 #000,
88 |         1px 1px 0 #000; /* Apply thin black outline */
89 |     opacity: 1; /* Ensure the placeholder is fully opaque */
90 | }
91 | </style>
92 | 
93 | 


--------------------------------------------------------------------------------
/extension/src/popup/popup.css:
--------------------------------------------------------------------------------
 1 | 
/* Global reset: no default spacing, border-box sizing, Roboto everywhere. */
* {
    padding: 0;
    margin: 0;
    box-sizing: border-box;
    font-family: 'Roboto', sans-serif;
}

h1 {
    font-size: 40px;
    text-align: center;
    font-weight: 500;
}

h2 {
    font-size: 20px;
    text-align: center;
    font-weight: 400;
    margin-bottom: 16px;
}

.container {
    width: 450px;
}

/* Minimum size of the popup window. */
html,
body {
    min-width: 400px;
    min-height: 500px;
}

/* Centre the body's content both horizontally and vertically. */
body {
    display: flex;
    justify-content: center;
    align-items: center;
}

/* NOTE(review): no element with id "text" or "output" appears in popup.html;
   presumably these rules are leftovers. Verify before relying on them. */
#text {
    width: 100%;
    padding: 8px;
    font-size: 20px;
    margin-bottom: 8px;
}

#output {
    font-size: 20px;
    font-family: 'Roboto Mono', monospace;
    height: 100px;
}
50 | 


--------------------------------------------------------------------------------
/extension/src/popup/popup.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <meta charset="UTF-8">
 4 | <head>
 5 |     <title>Popup</title>
 6 | </head>
 7 | <body>
 8 | <div id="app"></div>
 9 | <script src="popup.js" defer></script>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/extension/src/popup/popup.js:
--------------------------------------------------------------------------------
1 | import { createApp } from 'vue';
2 | import Popup from './popup.vue';
3 | 
4 | 
// Create the popup application from its root component and attach it to the
// #app element.
const popupApp = createApp(Popup);
popupApp.mount("#app");
6 | 


--------------------------------------------------------------------------------
/extension/src/popup/popup.vue:
--------------------------------------------------------------------------------
<template>
    <div id="app" :class="popupClass">

        <div v-if="!isModelLoaded" class="progress-container">
            <!-- Shown until the model download reports completion; the bar width follows progressValue. -->
            <!--todo: make this its own vue component-->
            <div class="progress-background">
                <div class="progress-bar" :style="{ width: progressValue + `%`}"></div>
            </div>
            <div class="loading-text">Loading model</div>
        </div>

        <div v-else>
            <div v-if="error" class="error">
                <span class="dismiss" @click="dismissError">&#x2612;</span>
                ERROR: {{ error }}
            </div>
            <AnimatedInput ref="input"></AnimatedInput>

            <!-- Display results and progress only if popupClass is 'popup-expanded' -->
            <div v-if="popupClass === 'popup-expanded'">
                <div class="results-container">
                    <ResultItem
                            v-for="(result, index) in results"
                            :key="index"
                            :result="result.text"
                            :score="result.sim"
                            @click="handleResultClick(result)"
                    />
                </div>

                <div class="progress-container search-progress">
                    <!-- Search progress; the bar width follows searchProgress. -->
                    <div class="progress-bar" :style="{ width: searchProgress + `%` }"></div>
                </div>

            </div>
        </div>


    </div>
</template>
 41 | 
 42 | 
 43 | <script>
 44 | 
 45 | import ResultItem from './result.vue';
 46 | import AnimatedInput from './AnimatedInput.vue'
 47 | import {prettyLog} from "../utils/utils.js";
 48 | 
 49 | export default {
 50 |     components: {
 51 |         AnimatedInput,
 52 |         ResultItem,
 53 |     },
 54 |     data() {
 55 |         return {
 56 |             results: [],
 57 |             progressValue: 0,
 58 |             searchProgress: 0,
 59 |             isModelLoaded: false,
 60 |             error: undefined,
 61 |         };
 62 |     },
 63 |     computed: {
 64 |         popupClass() {
 65 |             return this.results.length > 0 ? 'popup-expanded' : 'popup-default';
 66 |         },
 67 |     },
 68 |     watch: {},
 69 |     methods: {
 70 |         async handleMessage(request, sender, sendResponse) {
 71 |             // console.dir(request);
 72 |             switch (request.type) {
 73 |                 case "results":
 74 |                     if ('text' in request) {
 75 |                         this.results = request.text;
 76 |                     }
 77 |                     this.searchProgress = request.progress;
 78 | 
 79 |                     break;
 80 |                 case "download":
 81 |                     if (request.data.file && request.data.file !== "onnx/model_quantized.onnx") {
 82 |                         break;
 83 |                     }
 84 |                     if (request.data.status === 'progress') {
 85 |                         this.progressValue = request.data.progress.toFixed(2);
 86 |                     } else if (request.data.status === 'complete') {
 87 |                         this.progressValue = 100;
 88 |                         this.isModelLoaded = true;
 89 |                     }
 90 |                     break;
 91 |                 case "error":
 92 |                     this.error = request.reason;
 93 |             }
 94 |         },
 95 | 
 96 |         // todo: move to result.vue
 97 |         handleResultClick(result) {
 98 |             chrome.tabs.query({active: true, currentWindow: true}, function (tabs) {
 99 |                 chrome.tabs.sendMessage(tabs[0].id, {type: 'highlightAndScroll', text: result.text});
100 |             });
101 |         },
102 |         dismissError() {
103 |             this.error = null;
104 |         },
105 |     },
106 | 
107 |     async mounted() {
108 |         chrome.runtime.onMessage.addListener(this.handleMessage);
109 |         const [tab] = await chrome.tabs.query({active: true, currentWindow: true});
110 | 
111 |     const [res] = await chrome.scripting.executeScript({
112 |       target: { tabId: tab.id },
113 |       func: () => {
114 |         return document.contentType;
115 |       },
116 |     });
117 | 
118 |     chrome.tabs.sendMessage(tab.id, {
119 |       type: "getText",
120 |       contentType: res.result,
121 |     });
122 |         chrome.runtime.sendMessage({type: "load"});
123 | 
124 |         this.results = [];
125 | 
126 |     },
127 |     beforeUnmount() {
128 |         this.results = [];
129 |         chrome.runtime.sendMessage({type: "pruneEmbeddings"});
130 |         chrome.runtime.onMessage.removeListener(this.handleMessage);
131 |     },
132 | };
133 | </script>
134 | 
<style scoped>
@import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@700&display=swap');


#app {
    padding: 0;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    flex-direction: column;
}

/* Narrow popup while there are no results. */
#app.popup-default {
    width: 150px;
    transition: width 0.5s ease;
}

/* Wider popup once results are shown. */
#app.popup-expanded {
    width: 300px;
    transition: width 0.5s ease;
}

.progress-container {
    position: relative;
}


/* "Loading model" label drawn on top of the progress bar. */
.loading-text {
    color: white;
    font-size: 12px;
    /* The generic `monospace` keyword must stay unquoted to act as a fallback. */
    font-family: 'Space Mono', monospace;
    font-weight: 700;
    position: absolute;
    z-index: 2;
    white-space: nowrap;
    overflow: hidden;

    top: 50%;
    transform: translateY(-50%);
    left: 7.5px;

    text-shadow: -1px -1px 0 #000,
    1px -1px 0 #000,
    -1px 1px 0 #000,
    1px 1px 0 #000;
}


.progress-background {
    width: 140px;
    height: 30px;
    background-color: black;
    display: flex;
    align-items: center;
    justify-content: flex-start;
    overflow: hidden;
    padding: 0 5px;
}


/* Bar with a slanted right edge (clip-path). */
.progress-bar {
    height: 22px;
    background-color: #525252;
    clip-path: polygon(0 0, 75% 0, 100% 100%, 0 100%);
    z-index: 2;
}


.results-container {
    max-height: 400px;
    width: 290px;
    overflow-y: auto;
    padding: 10px;
}

/* Thinner, unclipped bar for search progress. `:deep()` replaces the
   deprecated `>>>` combinator. */
.progress-container.search-progress :deep(.progress-bar) {
    height: 12px;
    background-color: #525252;
    color: #525252;
    clip-path: none;
}

/* Drop the slanted edge once the inline width contains "100%". */
.progress-bar[style*="100%"] {
    clip-path: none;
}

.error {
    font-family: 'Space Mono', monospace;
    background: red;
    color: white;
    z-index: 3;
    display: flex;
    flex-direction: column;
    justify-content: center;
    text-align: center;
    font-size: 11px;
    line-height: 1;
    padding: 15px 10px 10px 10px;  /* Adjust as needed */
}


/* Dismiss icon of the error box. */
.dismiss {
    position: absolute;
    top: 0px;
    left: 8px;
    cursor: pointer;
    font-weight: bold;
    font-size: 28px;
}

</style>
247 | 


--------------------------------------------------------------------------------
/extension/src/popup/result.vue:
--------------------------------------------------------------------------------
 1 | <!-- result.vue -->
 2 | <template>
 3 |     <div class="result-item" @click="handleClick">
 4 |         <div class="result-text">{{ result }}</div>
 5 |         <div class="score-box">{{ "0." + Math.round(100 * score) }}</div>
 6 |     </div>
 7 | </template>
 8 | 
 9 | <script>
10 | export default {
11 |     props: {
12 |         result: {
13 |             type: String,
14 |             required: true,
15 |         },
16 |         score: {
17 |             type: Number,
18 |             required: true,
19 |         },
20 |     },
21 |     methods: {
22 |         handleClick() {
23 |             this.$emit('click', this.result);
24 |         },
25 |     },
26 | };
27 | </script>
28 | 
29 | <style scoped>
30 | .result-item {
31 |     display: flex;
32 |     justify-content: space-between;
33 |     padding: 10px;
34 |     margin-bottom: 10px;
35 |     border-radius: 5px;
36 |     background-color: #f5f5f5;
37 |     cursor: pointer;
38 |     transition: background-color 0.3s ease;
39 |     font-family: "Helvetica", sans-serif;
40 |     font-size: 9pt;
41 | }
42 | .result-item:hover {
43 |     background-color: #e0e0e0;
44 | }
45 | .result-text {
46 |     flex-grow: 1;
47 | }
48 | .score-box {
49 |     align-self: flex-start;
50 |     padding: 2px 6px;
51 |     border-radius: 3px;
52 |     background-color: #ddd;
53 |     color: #333;
54 |     font-family: "Helvetica", sans-serif;
55 |     font-size: 0.8em;
56 | }
57 | </style>
58 | 


--------------------------------------------------------------------------------
/extension/src/serviceworkers/background.js:
--------------------------------------------------------------------------------
  1 | // background.js - Handles requests from the UI, runs the model, then sends back a response
  2 | 
  3 | import {prettyLog, getSiteID} from '../utils/utils.js';
  4 | import {similarity, storeEmbeddings, loadEmbeddings} from "./semantic.js";
  5 | 
  6 | ////////////////////// 1. Context Menus //////////////////////
  7 | //
  8 | // Add a listener to create the initial context menu items,
  9 | // context menu items only need to be created at runtime.onInstalled
 10 | // chrome.runtime.onInstalled.addListener(function () {
 11 | // Register a context menu item that will only show up for selection text.
 12 | // chrome.contextMenus.create({
 13 | //     id: 'classify-selection',
 14 | //     title: 'Classify "%s"',
 15 | //     contexts: ['selection'],
 16 | // });
 17 | // });
 18 | //
 19 | // Perform inference when the user clicks a context menu
 20 | // chrome.contextMenus.onClicked.addListener(async (info, tab) => {
 21 | // Ignore context menu clicks that are not for classifications (or when there is no input)
 22 | // if (info.menuItemId !== 'classify-selection' || !info.selectionText) return;
 23 | //
 24 | // Perform classification on the selected text
 25 | // let result = await classify(info.selectionText);
 26 | //
 27 | // Do something with the result
 28 | // chrome.scripting.executeScript(
 29 | //
 30 | //     {
 31 | //     target: { tabId: tab.id },    // Run in the tab that the user clicked in
 32 | //     args: [result],               // The arguments to pass to the function
 33 | //     function: (result) => {       // The function to run
 34 | //         // NOTE: This function is run in the context of the web page, meaning that `document` is available.
 35 | //         console.log('result', result)
 36 | //         console.log('document', document)
 37 | //     },
 38 | // }
 39 | // );
 40 | // });
 41 | //////////////////////////////////////////////////////////////
 42 | 
 43 | ////////////////////// 2. Message Events /////////////////////
 44 | //
 45 | // Listen for messages from the UI, process it, and send the result back.
 46 | 
 47 | // TODO: body text is not persistent
 48 | let bodyText = [];
 49 | let inputText = "";
 50 | 
 51 | let liveProcess = 0;
 52 | let currSite = "";
 53 | 
 54 | chrome.runtime.onMessage.addListener(async function (request, sender, sendResponse) {
 55 |     if (request.type === "tabUpdated") {
 56 |         if (request.text.length > 0) {
 57 |             bodyText = request.text;
 58 |             currSite = getSiteID(request.currentURL);
 59 |         }
 60 |     } else if (request.type === "inputText") {
 61 |         inputText = request.text;
 62 |     } else {return; }
 63 |     if (!bodyText || !inputText) {
 64 |         return;
 65 |     }
 66 | 
 67 |     liveProcess++;
 68 |     const processId = liveProcess;
 69 | 
 70 |     await processQuery(inputText, bodyText, processId);
 71 | });
 72 | 
 73 | 
 74 | async function processQuery(query, bodyText, processId) {
 75 |     if (bodyText.length === 0) {
 76 |         prettyLog("Error", "no content found. please reload this page if this is unexpected", "red");
 77 |         chrome.runtime.sendMessage({type: "error", reason: "No content detected. Reloading may help."});
 78 |         return;  // Exit early if no bodyText
 79 |     }
 80 | 
 81 |     await loadEmbeddings(currSite);
 82 |     prettyLog("starting process " + processId, bodyText.length + " items, input: " + query, "orange");
 83 | 
 84 |     let results = [];
 85 |     const k = 10;
 86 | 
 87 |     let i = 0;
 88 |     for (let text of bodyText) {
 89 |         if (processId !== liveProcess) { return;}
 90 |         let sim = await similarity(query, text);
 91 | 
 92 |         if (sim > 0.15) {
 93 |             results.push({sim: sim, text: text});
 94 |             results.sort((a, b) => b.sim - a.sim);
 95 |             results.length = Math.min(results.length, k);
 96 | 
 97 |             if (processId !== liveProcess) { return;}
 98 |             chrome.runtime.sendMessage({
 99 |                 type: "results", progress: 100 * (i / bodyText.length),
100 |                 text: results
101 |             });
102 |         }
103 |         i += 1;
104 |     }
105 |     chrome.runtime.sendMessage({type: "results", progress: 100});
106 |     await storeEmbeddings();
107 | }
108 | 
109 | //////////////////////////////////////////////////////////////
110 | 
111 | 


--------------------------------------------------------------------------------
/extension/src/serviceworkers/pdf.worker.entry.js:
--------------------------------------------------------------------------------
 1 | /* Copyright 2022 Mozilla Foundation
 2 |  *
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  */
15 | 
// Publish the bundled PDF.js worker module as `pdfjsWorker` on `window` when
// a window exists; otherwise the assignment lands on a throwaway object.
const pdfjsWorkerHost = typeof window !== "undefined" ? window : {};
pdfjsWorkerHost.pdfjsWorker = require("./pdf.worker.js");
20 | 


--------------------------------------------------------------------------------
/extension/src/serviceworkers/semantic.js:
--------------------------------------------------------------------------------
  1 | // Define caching parameters
  2 | import {CustomCache} from "../utils/cache.js";
  3 | import {pipeline, env} from '@xenova/transformers';
  4 | import {prettyLog} from "../utils/utils.js";
  5 | 
  6 | env.useBrowserCache = false;
  7 | env.useCustomCache = true;
  8 | env.customCache = new CustomCache('transformers-cache');
  9 | env.allowLocalModels = false;
 10 | 
 11 | 
 12 | // Due to a bug in onnxruntime-web, we must disable multithreading for now.
 13 | // See https://github.com/microsoft/onnxruntime/issues/14445 for more information.
 14 | env.backends.onnx.wasm.numThreads = 1;
 15 | 
 16 | // these should go in EmbedPipeline prob
 17 | let embeddingsDict = {};
 18 | let currID = "";
 19 | 
 20 | class EmbedPipeline {
 21 |     static task = 'feature-extraction';
 22 | 
 23 |     static model = 'Supabase/gte-small';
 24 |     static instance = null;
 25 | 
 26 |     static async getModelFromStorage() {
 27 |         return new Promise((resolve, reject) => {
 28 |             chrome.storage.sync.get('model_name', function(result) {
 29 |                 if (chrome.runtime.lastError) {
 30 |                     reject(new Error(chrome.runtime.lastError));
 31 |                 } else {
 32 |                     resolve(result.model_name);
 33 |                     return false;
 34 |                 }
 35 |             });
 36 |         });
 37 |     }
 38 |     static async updateModelName() {
 39 |         try {
 40 |             const storedModelName = await this.getModelFromStorage();
 41 |             if (storedModelName) {
 42 |                 this.model = storedModelName;
 43 |             }
 44 |         } catch (error) {}
 45 |     }
 46 | 
 47 |     static async getInstance() {
 48 |         if (this.instance === null) {
 49 |             await this.updateModelName();
 50 | 
 51 |             this.instance = await pipeline(this.task, this.model,
 52 |                 {
 53 |                     progress_callback: async data => {
 54 |                         await chrome.runtime.sendMessage({type: "download", data: data});
 55 |                     }
 56 |                 }
 57 |             );
 58 |         }
 59 |         await chrome.runtime.sendMessage({type: "download", data: {status: "complete"}})
 60 | 
 61 |         return this.instance;
 62 |     }
 63 | }
 64 | 
 65 | // Important: Return true to indicate that the response is asynchronous.
 66 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => {
 67 |     switch (request.type) {
 68 |         case "load":
 69 |             await load();
 70 |             break;
 71 |         case "clearLocalStorage":
 72 |             chrome.storage.local.clear(() => {
 73 |             });
 74 |             break;
 75 |         case "pruneEmbeddings":
 76 |             await pruneStoredEmbeddings(10);
 77 |             break;
 78 |     }
 79 | });
 80 | 
 81 | 
 82 | async function load() {
 83 |     await EmbedPipeline.getInstance();
 84 | }
 85 | 
 86 | async function embed(text, use_dict = true) {
 87 |     if (use_dict && text in embeddingsDict) {
 88 |         return embeddingsDict[text];
 89 |     }
 90 | 
 91 |     let embedder = await EmbedPipeline.getInstance();
 92 |     let e0 = await embedder(text, {pooling: 'mean', normalize: true});
 93 |     if (use_dict) {
 94 |         embeddingsDict[text] = e0["data"];
 95 |     }
 96 |     return e0["data"];
 97 | }
 98 | 
 99 | 
// do on clean-up / unmount
/**
 * Keep only the `k` embedding entries with the highest frecency score in
 * chrome.storage.local and remove all other embedding entries.
 *
 * @param {number} k - Number of embedding entries to keep.
 * @returns {Promise<void>} Resolves once the removal (if any) has finished.
 */
async function pruneStoredEmbeddings(k) {
    return new Promise((resolve) => {
        chrome.storage.local.get(null, (allData) => {
            const embeddingKeys = [];
            for (const key of Object.keys(allData)) {
                if (allData[key].is_embeddings === true) {
                    embeddingKeys.push(key);
                }
            }

            console.log("All embedding keys found:", embeddingKeys);  // This logs all the embedding keys

            // Highest frecency score first (sorted in place).
            const byScoreDesc = (a, b) => allData[b].frecency_score - allData[a].frecency_score;
            const sortedKeys = embeddingKeys.sort(byScoreDesc);

            const keptKeys = new Set(sortedKeys.slice(0, k));
            const keysToRemove = sortedKeys.filter((key) => !keptKeys.has(key));
            console.log(`Removing the following keys: ${keysToRemove}`);

            if (keysToRemove.length === 0) {
                resolve();
                return;
            }

            // Remove the non-top k embeddings from storage.
            chrome.storage.local.remove(keysToRemove, () => {
                console.log(`Successfully removed ${keysToRemove.length} keys.`);
                resolve();
            });
        });
    });
}
127 | 
128 | 
129 | 
/**
 * Serialises the in-memory `embeddingsDict` to JSON, encodes it as a data URL
 * and writes it to `chrome.storage.local` under the current ID, together with
 * its frecency score, an `is_embeddings` marker and the model name.
 * A failure while writing is reported with `console.warn` and not rethrown.
 *
 * @returns {Promise<void>}
 */
export async function storeEmbeddings() {
    const serialized = JSON.stringify(embeddingsDict);
    const buffer = new TextEncoder().encode(serialized);

    // FileReader is callback based, so adapt it to a promise.
    const toDataURL = (bytes) => new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = (event) => resolve(event.target.result);
        reader.onerror = (event) => reject(event.target.error);
        reader.readAsDataURL(new Blob([bytes], {type: 'application/json'}));
    });
    const body = await toDataURL(buffer);

    try {
        const record = {
            _body: body,
            frecency_score: computeFrecencyScore(currID),
            is_embeddings: true,
            model_name: EmbedPipeline.model
        };
        await chrome.storage.local.set({[currID]: record});

        const itemCount = Object.keys(embeddingsDict).length;
        prettyLog("stored " + currID, itemCount + " items");
    } catch (err) {
        console.warn('An error occurred while writing the embeddings to cache:', err)
    }
}
155 | 
/**
 * Debug helper: for every cached text, compares the cached embedding with a
 * freshly computed one and logs "load differs" when their cosine similarity
 * falls below 0.99.
 *
 * @returns {Promise<void>}
 */
async function verifyLoad() {
    for (const cachedText of Object.keys(embeddingsDict)) {
        const fromCache = await embed(cachedText, true);
        const recomputed = await embed(cachedText, false);
        const score = cosineSimilarity(fromCache, recomputed);

        if (!(score < 0.99)) {
            continue;
        }
        prettyLog("load differs", score, "red");
    }
}
166 | 
/**
 * Loads the embeddings stored under `ID` in `chrome.storage.local` into the
 * in-memory `embeddingsDict` and makes `ID` the current ID.
 *
 * Nothing is loaded when the cache is already populated for the same ID, when
 * no embeddings record exists for `ID`, or when the record was produced by a
 * different model than `EmbedPipeline.model`.
 *
 * NOTE(review): entries already in `embeddingsDict` are not cleared when the
 * ID changes — confirm whether carrying them over is intended.
 *
 * @param {string} ID - Storage key of the embeddings record.
 * @returns {Promise<void>}
 */
export async function loadEmbeddings(ID) {
    // Already holding embeddings for this ID: nothing to do.
    if (Object.keys(embeddingsDict).length !== 0 && ID === currID) {
        return;
    }
    currID = ID;

    const data = await chrome.storage.local.get([currID]);
    const record = data[ID];
    if (!record || !record.is_embeddings) {
        return;
    }

    prettyLog("attempting load", ID);
    // Embeddings from another model are not comparable, so skip them.
    if (!record.model_name || record.model_name !== EmbedPipeline.model) {
        return;
    }

    // `_body` is a data URL; everything after the first comma is base64 JSON.
    const base64 = record._body.split(',')[1];
    const bytes = Uint8Array.from(atob(base64), (char) => char.charCodeAt(0));
    const parsedData = JSON.parse(new TextDecoder().decode(bytes));

    // Convert the object-with-integer-keys representation into Float32Array.
    // Object.keys is used instead of `parsedData.hasOwnProperty(...)`, which
    // breaks when a stored text is itself the string "hasOwnProperty".
    for (const textKey of Object.keys(parsedData)) {
        embeddingsDict[textKey] = new Float32Array(Object.values(parsedData[textKey]));
    }

    prettyLog("loaded " + ID, Object.keys(embeddingsDict).length + " items");
    // For debugging cache integrity: await verifyLoad();
}
203 | 
204 | 
// TODO: implement a real frecency metric and move this helper to utils.
/**
 * Placeholder frecency score: every ID currently receives the same value.
 *
 * @param {string} ID - Identifier of the stored embeddings (unused for now).
 * @returns {number} Always 4.
 */
function computeFrecencyScore(ID) {
    const placeholderScore = 4; // lol!
    return placeholderScore;
}
209 | 
210 | 
/**
 * Embeds both texts, one after the other, and returns the cosine similarity
 * of the two resulting vectors.
 *
 * @param {string} text1
 * @param {string} text2
 * @returns {Promise<number>}
 */
export async function similarity(text1, text2) {
    const vectors = [];
    // Sequential on purpose: the second embedding starts after the first ends.
    for (const text of [text1, text2]) {
        vectors.push(await embed(text));
    }
    return cosineSimilarity(vectors[0], vectors[1]);
}
217 | 
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {ArrayLike<number>} v1
 * @param {ArrayLike<number>} v2
 * @returns {number} The cosine of the angle between the vectors; -1 when the
 *     lengths differ; 0 when either vector has zero magnitude (including the
 *     empty vector), where the quotient would otherwise be NaN.
 */
function cosineSimilarity(v1, v2) {
    if (v1.length !== v2.length) {
        return -1;
    }
    let dotProduct = 0;
    let v1_mag = 0;
    let v2_mag = 0;
    for (let i = 0; i < v1.length; i++) {
        dotProduct += v1[i] * v2[i];
        v1_mag += v1[i] ** 2;
        v2_mag += v2[i] ** 2;
    }
    const denominator = Math.sqrt(v1_mag) * Math.sqrt(v2_mag);
    // A zero vector has no direction; report "no similarity" rather than NaN,
    // which would silently fail every numeric comparison made by callers.
    if (denominator === 0) {
        return 0;
    }
    return dotProduct / denominator;
}
232 | 


--------------------------------------------------------------------------------
/extension/src/utils/cache.js:
--------------------------------------------------------------------------------
 1 | // Author: Xenova
 2 | // Design a caching API to be used by the extension which implements the same interface as
 3 | // the browser's native Cache API (https://developer.mozilla.org/en-US/docs/Web/API/Cache)
 4 | // but uses the browser's local storage API (https://developer.chrome.com/docs/extensions/reference/storage/).
 5 | //
 6 | // Since the local storage API requires all data to be stored as JSON (which doesn't allow some ASCII chars),
 7 | // one of the better approaches is to store the response body as a base64-encoded string. This is not ideal,
 8 | // as it increases the size of the response body by ~33%, but it's the best we can do with the local storage API.
 9 | // See https://stackoverflow.com/a/1443240/13989043 for more information about this.
10 | //
11 | // For serialization (arraybuffer -> string) and unserialization (string -> arraybuffer),
12 | // use the `FileReader` and `Blob` APIs. Although other options are also possible, this approach
13 | // is considered to be better for larger files (like models).
14 | //
15 | // Other references:
16 | //  - https://developer.chrome.com/docs/extensions/reference/storage/#property-local
17 | //  - https://stackoverflow.com/questions/6965107/converting-between-strings-and-arraybuffers
18 | 
/**
 * Resolves the storage key for a cache request: the URL of a `Request`
 * object, or the value itself when a plain string was given.
 * @param {Request|string} request
 * @returns {string}
 */
function toCacheKey(request) {
    if (request instanceof Request) {
        return request.url;
    }
    return request;
}

export class CustomCache {
    /**
     * Creates a cache handle. The name is only recorded on the instance;
     * entries are keyed by request URL in `chrome.storage.local`.
     * @param {string} cacheName
     */
    constructor(cacheName) {
        this.cacheName = cacheName;
    }

    /**
     * Looks the request up in `chrome.storage.local`.
     * @param {Request|string} request
     * @returns {Promise<Response | undefined>} The result of fetching the
     *     stored `_body` data URL, or `undefined` when nothing is stored.
     */
    async match(request) {
        const key = toCacheKey(request);
        const stored = await chrome.storage.local.get([key]);
        const entry = stored[key];

        if (!entry) {
            return undefined;
        }
        // The body was saved as a data URL, so fetch() turns it back into a Response.
        const restored = await fetch(entry._body);
        return restored;
    }

    /**
     * Stores the response under the request URL: the body as a base64 data
     * URL plus the response's status fields and headers. A failure while
     * writing is reported with `console.warn` and not rethrown.
     * @param {Request|string} request
     * @param {Response} response
     * @returns {Promise<void>}
     */
    async put(request, response) {
        const key = toCacheKey(request);
        const buffer = await response.arrayBuffer();

        // FileReader is callback based, so adapt it to a promise.
        const encodeAsDataURL = (bytes) => new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = (event) => resolve(event.target.result);
            reader.onerror = (event) => reject(event.target.error);
            reader.readAsDataURL(new Blob([bytes], { type: 'application/octet-stream' }));
        });
        const body = await encodeAsDataURL(buffer);

        try {
            const entry = {
                _body: body,

                // Save original response in case
                status: response.status,
                statusText: response.statusText,
                headers: Object.fromEntries(response.headers.entries()),
                url: response.url,
                redirected: response.redirected,
                type: response.type,
                ok: response.ok,
            };
            await chrome.storage.local.set({ [key]: entry });
        } catch (err) {
            console.warn('An error occurred while writing the file to cache:', err)
        }
    }
}
84 | 


--------------------------------------------------------------------------------
/extension/src/utils/utils.js:
--------------------------------------------------------------------------------
 1 | export function prettyLog(label, message, labelColor = 'blue', messageColor = 'black') {
 2 |     console.log("%c" + label + ": %c" + message,
 3 |         "font-weight: bold; color: " + labelColor + ";",
 4 |         "font-weight: normal; color: " + messageColor + ";");
 5 | }
 6 | 
 7 | 
 8 | /*  Looks for a sentence ending after numChars.  */
 9 | function splitByChars(text, numChars) {
10 |     let chunks = [];
11 |     let currChunk = '';
12 |     const sentenceEndings = ['.', '?', '!', ';', ':', '\n', '–'];
13 | 
14 |     for (let i = 0; i < text.length; i++) {
15 |         currChunk += text[i];
16 | 
17 |         let isEndingPunctuation = sentenceEndings.includes(text[i]);
18 | 
19 |         // Special case: if the punctuation is a period and the next character is a quote
20 |         if (text[i] === '.' && text[i + 1] === '"') {
21 |             currChunk += text[++i];
22 |             isEndingPunctuation = true;
23 |         }
24 | 
25 |         if (currChunk.trim().length >= numChars && isEndingPunctuation) {
26 |             chunks.push(currChunk.trim());
27 |             currChunk = '';
28 |         }
29 |     }
30 | 
31 |     if (currChunk.trim()) {
32 |         chunks.push(currChunk.trim());
33 |     }
34 | 
35 |     return chunks;
36 | }
37 | 
38 | 
/**
 * Builds a site identifier from a URL: its hostname followed by its path.
 * Scheme, port, query string and fragment are not part of the result.
 *
 * @param {string} url - Absolute URL; `new URL` throws when it cannot be parsed.
 * @returns {string}
 */
export function getSiteID(url) {
    const { hostname, pathname } = new URL(url);
    return hostname + pathname;
}
43 | 
44 | 
/**
 * Public entry point for chunking readable page text; delegates to
 * `splitByChars`.
 *
 * @param {string} readableContent - Text to split.
 * @param {number} [numChars=50] - Minimum chunk length passed to `splitByChars`.
 * @returns {string[]} Whatever `splitByChars` returns for these arguments.
 */
export function splitReadableContent(readableContent, numChars = 50) {
    const chunks = splitByChars(readableContent, numChars);
    return chunks;
}
48 | 
49 | 
/**
 * Walks a DOM subtree and collects the sentences of every `<p>` element.
 * A paragraph's text is split with `tokenizer.tokenize`; each sentence is
 * trimmed and blank ones are dropped. Non-paragraph nodes are descended into.
 *
 * NOTE(review): `tokenizer` is neither defined nor imported in this module,
 * so reaching a `<p>` element would throw a ReferenceError unless a global
 * of that name exists — confirm where it is expected to come from.
 *
 * @param {Node} element - Root of the subtree to scan.
 * @param {string[]} [texts=[]] - Accumulator; appended to in place.
 * @returns {string[]} The accumulator.
 */
function collectTextNodes(element, texts = []) {
    const isParagraph =
        element.nodeType === Node.ELEMENT_NODE && element.tagName.toLowerCase() === 'p';

    if (!isParagraph) {
        // Keep looking for paragraphs further down the tree.
        for (const child of element.childNodes) {
            collectTextNodes(child, texts);
        }
        return texts;
    }

    const sentences = tokenizer.tokenize(element.textContent);
    for (const rawSentence of sentences) {
        const cleaned = rawSentence.trim();
        if (cleaned !== "") {
            texts.push(cleaned);
        }
    }
    return texts;
}
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/extension/webpack.config.js:
--------------------------------------------------------------------------------
  1 | // webpack.config.js
  2 | import path from 'path';
  3 | import { fileURLToPath } from 'url';
  4 | 
  5 | import HtmlWebpackPlugin from 'html-webpack-plugin';
  6 | import CopyPlugin from 'copy-webpack-plugin';
  7 | import { VueLoaderPlugin } from 'vue-loader';
  8 | import NodePolyfillPlugin from 'node-polyfill-webpack-plugin';
  9 | import webpack from 'webpack';
 10 | import util from 'util';
 11 | 
// ES modules have no built-in __dirname, so derive it from this file's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Webpack configuration for the browser extension. Produces one bundle per
 * entry in `build/`, generates the popup and options HTML pages, and copies
 * the static assets listed under CopyPlugin.
 */
const config = {
    // Build in webpack's development mode with source maps embedded inline.
    mode: 'development',
    devtool: 'inline-source-map',
    // One output bundle per key; `background` combines both listed sources.
    entry: {
        background: ['./src/serviceworkers/background.js', './src/serviceworkers/semantic.js'],
        popup: './src/popup/popup.js',
        content: './src/content/content.js',
        options: './src/options/options.js'
    },
    resolve: {
        // `false` means: provide no fallback module for these Node core modules.
        // NOTE(review): NodePolyfillPlugin is also registered below — confirm
        // how the two settings are meant to interact.
        fallback: {
            "fs": false,
            "tls": false,
            "net": false,
            "path": false,
            "util": false,
        }
    },
    // Bundles are written to build/<entry name>.js.
    output: {
        path: path.resolve(__dirname, 'build'),
        filename: '[name].js',
    },
    module: {
        rules: [
            // Vue single-file components.
            {
                test: /\.vue$/,
                use: 'vue-loader'
            },
            // Imported CSS files go through css-loader and style-loader.
            {
                test: /\.css$/,
                use: [
                    'style-loader',
                    'css-loader',
                ],
            },
        ],
    },
    plugins: [
        new NodePolyfillPlugin(),
        // Constants replaced at build time (Vue feature flags).
        new webpack.DefinePlugin({
            __VUE_OPTIONS_API__: true,
            __VUE_PROD_DEVTOOLS__: false,
        }),
        new VueLoaderPlugin(),
        // Generate popup.html and options.html from their templates.
        // NOTE(review): no `chunks` option is set on either page — verify
        // which entry bundles end up injected into each one.
        new HtmlWebpackPlugin({
            template: './src/popup/popup.html',
            filename: 'popup.html',
        }),
        new HtmlWebpackPlugin({
            template: './src/options/options.html',
            filename: 'options.html',
        }),
        // Files copied as-is into the output folder.
        new CopyPlugin({
            patterns: [
                {
                    from: "public",
                    to: "." // Copies to build folder
                },
                {
                    from: "src/popup/popup.css",
                    to: "popup.css"
                },
                {
                    from: "src/content/content.css",
                    to: "content.css"
                },
                {
                    from: "src/serviceworkers/pdf.js",
                    to: "pdf.js"
                },
                {
                    from: "src/serviceworkers/pdf.worker.js",
                    to: "pdf.worker.js"
                },
                {
                    from: "src/options/options.css",
                    to: "options.css"
                },
            ],
        })
    ],
};

export default config;
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |     "compilerOptions": {
3 |       "checkJs": true,
4 |       "strict": true
5 |     },
6 |     "include": ["src/**/*"]
7 |   }


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/logo.png


--------------------------------------------------------------------------------
/misc/Generate_large_textfile_from_books.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Script for generating large text files \n",
  8 |     "\n",
  9 |     "Keeps the metadata concatted with \"|\" as first line"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 7,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "Pages: 28986 | Words: 15893741\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import pandas as pd \n",
 27 |     "\n",
 28 |     "# download first \"https://huggingface.co/datasets/storytracer/US-PD-Books/resolve/main/data/train-00000-of-00327.parquet?download=true\")\n",
 29 |     "df = pd.read_parquet(\"train-00000-of-00327.parquet\") \n",
 30 |     "\n",
 31 |     "# e.g. 100 books only \n",
 32 |     "books_number = 100\n",
 33 |     "df = df.iloc[:books_number]\n",
 34 |     "df[\"words\"] = df.full_text.apply(lambda x: len(x.split(\" \")))\n",
 35 |     "print(f\"Pages: {df.page_count.sum()} | Words: {df.words.sum()}\") #df.words.sum())"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 8,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Define the columns to concatenate, excluding 'full_text'\n",
 45 |     "metadata_columns = ['ocaid', 'title', 'author', 'year', 'page_count', 'openlibrary_edition', 'openlibrary_work']\n",
 46 |     "\n",
 47 |     "# Function to concatenate metadata and full_text\n",
 48 |     "def concatenate_row(row):\n",
 49 |     "    metadata = '|'.join(row[metadata_columns].astype(str)) # Convert to string and join with '|'\n",
 50 |     "    full_text = row['full_text']\n",
 51 |     "    return metadata + '|' + full_text\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 11,
 57 |    "metadata": {},
 58 |    "outputs": [
 59 |     {
 60 |      "data": {
 61 |       "text/html": [
 62 |        "<div>\n",
 63 |        "<style scoped>\n",
 64 |        "    .dataframe tbody tr th:only-of-type {\n",
 65 |        "        vertical-align: middle;\n",
 66 |        "    }\n",
 67 |        "\n",
 68 |        "    .dataframe tbody tr th {\n",
 69 |        "        vertical-align: top;\n",
 70 |        "    }\n",
 71 |        "\n",
 72 |        "    .dataframe thead th {\n",
 73 |        "        text-align: right;\n",
 74 |        "    }\n",
 75 |        "</style>\n",
 76 |        "<table border=\"1\" class=\"dataframe\">\n",
 77 |        "  <thead>\n",
 78 |        "    <tr style=\"text-align: right;\">\n",
 79 |        "      <th></th>\n",
 80 |        "      <th>ocaid</th>\n",
 81 |        "      <th>title</th>\n",
 82 |        "      <th>author</th>\n",
 83 |        "      <th>year</th>\n",
 84 |        "      <th>page_count</th>\n",
 85 |        "      <th>openlibrary_edition</th>\n",
 86 |        "      <th>openlibrary_work</th>\n",
 87 |        "      <th>full_text</th>\n",
 88 |        "      <th>words</th>\n",
 89 |        "    </tr>\n",
 90 |        "  </thead>\n",
 91 |        "  <tbody>\n",
 92 |        "    <tr>\n",
 93 |        "      <th>0</th>\n",
 94 |        "      <td>worksofcharlesle01leve</td>\n",
 95 |        "      <td>The works of Charles Lever</td>\n",
 96 |        "      <td>Lever, Charles James, 1806-1872</td>\n",
 97 |        "      <td>1872</td>\n",
 98 |        "      <td>692</td>\n",
 99 |        "      <td>OL13499428M</td>\n",
100 |        "      <td>OL3564322W</td>\n",
101 |        "      <td>&lt;8      ''         ^/^r \\n\\n\\nN      V       s...</td>\n",
102 |        "      <td>1045410</td>\n",
103 |        "    </tr>\n",
104 |        "    <tr>\n",
105 |        "      <th>1</th>\n",
106 |        "      <td>specimensofexpos00lamorich</td>\n",
107 |        "      <td>Specimens of exposition</td>\n",
108 |        "      <td>Lamont, Hammond, 1864-1909</td>\n",
109 |        "      <td>1894</td>\n",
110 |        "      <td>220</td>\n",
111 |        "      <td>OL7034373M</td>\n",
112 |        "      <td>OL202608W</td>\n",
113 |        "      <td>Ifteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\...</td>\n",
114 |        "      <td>109283</td>\n",
115 |        "    </tr>\n",
116 |        "    <tr>\n",
117 |        "      <th>2</th>\n",
118 |        "      <td>recollectionsand00greerich</td>\n",
119 |        "      <td>Recollections and reflections : an auto of hal...</td>\n",
120 |        "      <td>Green, Wharton J. (Wharton Jackson), 1831-1910</td>\n",
121 |        "      <td>1906</td>\n",
122 |        "      <td>400</td>\n",
123 |        "      <td>OL7098980M</td>\n",
124 |        "      <td>OL7710550W</td>\n",
125 |        "      <td>;     J.  GREEN \\n\\n\\nRECOLLECTIONS  AND  REFL...</td>\n",
126 |        "      <td>229753</td>\n",
127 |        "    </tr>\n",
128 |        "    <tr>\n",
129 |        "      <th>3</th>\n",
130 |        "      <td>puddnheadwilsont00twaiiala</td>\n",
131 |        "      <td>Pudd'nhead Wilson, and Those extraordinary twins</td>\n",
132 |        "      <td>Twain, Mark, 1835-1910</td>\n",
133 |        "      <td>1922</td>\n",
134 |        "      <td>322</td>\n",
135 |        "      <td>OL7095992M</td>\n",
136 |        "      <td>OL15269096W</td>\n",
137 |        "      <td>ROXY  HARVESTING  AMONG  THE   KITCHENS \\n\\n\\n...</td>\n",
138 |        "      <td>142528</td>\n",
139 |        "    </tr>\n",
140 |        "    <tr>\n",
141 |        "      <th>4</th>\n",
142 |        "      <td>hansbreitmann00lelarich</td>\n",
143 |        "      <td>Hans Breitmann in Germany;</td>\n",
144 |        "      <td>Leland, Charles Godfrey, 1824-1903</td>\n",
145 |        "      <td>1895</td>\n",
146 |        "      <td>184</td>\n",
147 |        "      <td>OL7202758M</td>\n",
148 |        "      <td>OL4108366W</td>\n",
149 |        "      <td>;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f  . \\n\\...</td>\n",
150 |        "      <td>58760</td>\n",
151 |        "    </tr>\n",
152 |        "  </tbody>\n",
153 |        "</table>\n",
154 |        "</div>"
155 |       ],
156 |       "text/plain": [
157 |        "                        ocaid   \n",
158 |        "0      worksofcharlesle01leve  \\\n",
159 |        "1  specimensofexpos00lamorich   \n",
160 |        "2  recollectionsand00greerich   \n",
161 |        "3  puddnheadwilsont00twaiiala   \n",
162 |        "4     hansbreitmann00lelarich   \n",
163 |        "\n",
164 |        "                                               title   \n",
165 |        "0                         The works of Charles Lever  \\\n",
166 |        "1                            Specimens of exposition   \n",
167 |        "2  Recollections and reflections : an auto of hal...   \n",
168 |        "3   Pudd'nhead Wilson, and Those extraordinary twins   \n",
169 |        "4                         Hans Breitmann in Germany;   \n",
170 |        "\n",
171 |        "                                           author  year  page_count   \n",
172 |        "0                 Lever, Charles James, 1806-1872  1872         692  \\\n",
173 |        "1                      Lamont, Hammond, 1864-1909  1894         220   \n",
174 |        "2  Green, Wharton J. (Wharton Jackson), 1831-1910  1906         400   \n",
175 |        "3                          Twain, Mark, 1835-1910  1922         322   \n",
176 |        "4              Leland, Charles Godfrey, 1824-1903  1895         184   \n",
177 |        "\n",
178 |        "  openlibrary_edition openlibrary_work   \n",
179 |        "0         OL13499428M       OL3564322W  \\\n",
180 |        "1          OL7034373M        OL202608W   \n",
181 |        "2          OL7098980M       OL7710550W   \n",
182 |        "3          OL7095992M      OL15269096W   \n",
183 |        "4          OL7202758M       OL4108366W   \n",
184 |        "\n",
185 |        "                                           full_text    words  \n",
186 |        "0  <8      ''         ^/^r \\n\\n\\nN      V       s...  1045410  \n",
187 |        "1  Ifteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\...   109283  \n",
188 |        "2  ;     J.  GREEN \\n\\n\\nRECOLLECTIONS  AND  REFL...   229753  \n",
189 |        "3  ROXY  HARVESTING  AMONG  THE   KITCHENS \\n\\n\\n...   142528  \n",
190 |        "4  ;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f  . \\n\\...    58760  "
191 |       ]
192 |      },
193 |      "execution_count": 11,
194 |      "metadata": {},
195 |      "output_type": "execute_result"
196 |     }
197 |    ],
198 |    "source": [
199 |     "df.head()"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 9,
205 |    "metadata": {},
206 |    "outputs": [
207 |     {
208 |      "name": "stdout",
209 |      "output_type": "stream",
210 |      "text": [
211 |       "The text file '100_books.txt' has been created.\n"
212 |      ]
213 |     }
214 |    ],
215 |    "source": [
216 |     "# Apply the function to each row and save to a list\n",
217 |     "concatenated_rows = df.iloc[:books_number].apply(concatenate_row, axis=1).tolist()\n",
218 |     "\n",
219 |     "# Write the concatenated rows to a text file\n",
220 |     "with open(f'{books_number}_books.txt', 'w', encoding='utf-8') as f:\n",
221 |     "    for row in concatenated_rows:\n",
222 |     "        f.write(row + '\\n')\n",
223 |     "\n",
224 |     "print(f\"The text file '{books_number}_books.txt' has been created.\")\n"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": []
233 |   }
234 |  ],
235 |  "metadata": {
236 |   "kernelspec": {
237 |    "display_name": "py3.11",
238 |    "language": "python",
239 |    "name": "python3"
240 |   },
241 |   "language_info": {
242 |    "codemirror_mode": {
243 |     "name": "ipython",
244 |     "version": 3
245 |    },
246 |    "file_extension": ".py",
247 |    "mimetype": "text/x-python",
248 |    "name": "python",
249 |    "nbconvert_exporter": "python",
250 |    "pygments_lexer": "ipython3",
251 |    "version": "3.11.0"
252 |   }
253 |  },
254 |  "nbformat": 4,
255 |  "nbformat_minor": 2
256 | }
257 | 


--------------------------------------------------------------------------------
/misc/README.md:
--------------------------------------------------------------------------------
1 | ## Various utilities 
2 | 
Miscellaneous scripts and notebooks used for testing and similar purposes live here.


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "semanticfinder",
 3 |   "version": "1.0.0",
 4 |   "description": "Update: just improved the UI - automatically scroll through the results!",
 5 |   "main": "src/js/index.js",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1",
 8 |     "start": "webpack serve --mode development",
 9 |     "build": "webpack --config webpack.config.js"
10 |   },
11 |   "author": "",
12 |   "license": "ISC",
13 |   "dependencies": {
14 |     "@xenova/transformers": "^2.17.2",
15 |     "bootstrap": "^5.3.2",
16 |     "codemirror": "^5.52.2",
17 |     "deck.gl": "^8.9.34",
18 |     "marked": "^12.0.0",
19 |     "ollama": "^0.4.9",
20 |     "pako": "^2.1.0",
21 |     "wasm-bhtsne": "^0.3.3"
22 |   },
23 |   "devDependencies": {
24 |     "copy-webpack-plugin": "^11.0.0",
25 |     "css-loader": "^6.8.1",
26 |     "favicons": "^7.1.4",
27 |     "favicons-webpack-plugin": "^6.0.1",
28 |     "html-webpack-plugin": "^5.5.3",
29 |     "mini-css-extract-plugin": "^2.7.6",
30 |     "pdfjs-dist": "^4.0.379",
31 |     "style-loader": "^3.3.3",
32 |     "webpack": "^5.88.1",
33 |     "webpack-cli": "^5.1.4",
34 |     "webpack-dev-server": "^4.15.1"
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/css/styles.css:
--------------------------------------------------------------------------------
  1 | #input-text {
  2 |   height: 50vh;
  3 |   width: 80vw;
  4 |   min-width: 80vw;
  5 |   text-align: left !important;
  6 | }
  7 | .CodeMirror {
  8 |   height: 55vh !important;
  9 | }
 10 | 
 11 | .CodeMirror.cm-s-default.CodeMirror-wrap {
 12 |   font-family: 'Open Sans', sans-serif;
 13 | }
 14 | 
 15 | #summary_text, #chat_text, #ollama_chat_text {
 16 |   margin: 10px;
 17 |   font-weight: 700;
 18 | }
 19 | 
 20 | #get_chat, #ollama_get_chat, #get_summary, #dimensionalityReduction{
 21 |  height: calc(3.5rem + calc(var(--bs-border-width) * 2));
 22 |  width: 110px;
 23 |  margin-right: 20px;
 24 | }
 25 | 
 26 | table {
 27 |   table-layout: auto;
 28 |   margin: 0 auto;
 29 | }
 30 | th,
 31 | td {
 32 |   word-wrap: break-word;
 33 |   max-width: 50%;
 34 |   text-align: center;
 35 | }
 36 | 
 37 | .table-bordered {
 38 |   border: none;
 39 | }
 40 | 
 41 | #output-table > tbody > tr:nth-of-type(odd) {
 42 |   background-color: #f9f9f9;
 43 | }
 44 | 
 45 | .table .table {
 46 |   background-color: transparent;
 47 | }
 48 | 
 49 | .highlight-first {
 50 |   background-color: rgb(0, 255, 81);
 51 | }
 52 | 
 53 | .highlight-second {
 54 |   background-color: rgb(135, 255, 153);
 55 | }
 56 | 
 57 | .highlight-third {
 58 |   background-color: rgb(190, 253, 190);
 59 | }
 60 | 
 61 | .highlight-select {
 62 |   background-color: orange;
 63 | }
 64 | 
 65 | #loading {
 66 |   display: inline-block;
 67 |   width: 1rem;
 68 |   height: 1rem;
 69 |   border: 3px solid rgba(255, 255, 255, 0.3);
 70 |   border-radius: 50%;
 71 |   border-top-color: #fff;
 72 |   animation: spin 1s ease-in-out infinite;
 73 |   -webkit-animation: spin 1s ease-in-out infinite;
 74 | }
 75 | 
 76 | @keyframes spin {
 77 |   to {
 78 |     -webkit-transform: rotate(360deg);
 79 |   }
 80 | }
 81 | @-webkit-keyframes spin {
 82 |   to {
 83 |     -webkit-transform: rotate(360deg);
 84 |   }
 85 | }
 86 | 
 87 | #progressBar {
 88 |   height: 25px;
 89 |   width: 100%;
 90 | }
 91 | 
 92 | #query-text {
 93 |   height: 75%;
 94 |   min-width: 80%;
 95 | }
 96 | 
 97 | 
 98 | .submit-button {
 99 |   height: 75%;
100 |   white-space:normal;
101 |   text-align: center; /* this seems to break when page size is too small */
102 | }
103 | 
104 | 
105 | #formGroupCenter {
106 |   width: 100%;
107 | }
108 | 
109 | .CodeMirror {
110 |   font-size: 15px;
111 | }
112 | 
113 | #results {
114 |   height: 70vh;
115 |   overflow-y: auto;
116 | }
117 | 
118 | .card {
119 |   width: 100%;
120 |   transition: background-color 0.3s ease;
121 | }
122 | 
123 | .card:hover {
124 |   background-color: #f8f9fa;
125 | }
126 | 
127 | /*.nav-button {*/
128 | /*  width: 60px; !* adjust this to the size you want *!*/
129 | /*  margin-right: 1px; !* adds space between buttons *!*/
130 | /*}*/
131 | 
132 | #submitGroup {
133 |   margin-top: 2vh; /* adjust this value as needed */
134 | }
135 | 
136 | /*
137 |   see: https://github.com/twbs/bootstrap/issues/33871
138 |  */
139 | .form-floating > label { z-index: 3; }
140 | 
141 | #advancedFeaturesHeader .accordion-button:hover {
142 |   text-decoration: underline;
143 | }
144 | 
145 | .accordion-button::after {
146 |   display: none;
147 | }
148 | 
149 | .card-title {
150 |   font-size: 0.9em;
151 | }
152 | .card-subtitle {
153 |   font-size: 0.8em;
154 | }
155 | 
156 | #SemanticFinderLogo{
157 |   display: block;
158 |   margin: 0 auto;
159 |   max-width: 250px;
160 | }
161 | 
162 | @media (min-width: 992px) {
163 |   #introContainer {
164 |     display: inline-flex;
165 |   }
166 |   #introContentDiv{
167 |     padding-left: 20px;
168 |   }
169 | }
170 | 
171 | @media (max-width: 992px) {
172 |   .col-sm-9  {
173 |     width: 100% !important;
174 |   }
175 |   #results{
176 |     height: unset !important;
177 |   }
178 |   .col-sm-3  {
179 |     width: 100% !important;
180 |   }
181 |   ul {
182 |     padding-left: 0 !important;
183 |   }
184 | }
185 | 
186 | .toast {
187 |   display: none;
188 |   position: fixed;
189 |   top: 16px;
190 |   left: 50%;
191 |   transform: translateX(-50%);
192 |   background-color: white !important;
193 |   color: #fff;
194 |   padding: 10px 20px;
195 |   border-radius: 5px;
196 |   z-index: 1000;
197 |   width: 250px !important;
198 |   font-size: 20px !important;
199 | }
200 | 
201 | 
202 | #closeToastButton{
203 |   cursor: pointer;
204 |   right: -47px;
205 |   position: relative;
206 | }
207 | 
208 | /* Style for the point labels 
209 | .point-label {
210 |   display: none;
211 |   position: absolute;
212 |   background-color: #ffffff;
213 |   padding: 8px;
214 |   border: 2px solid #4a90e2;
215 |   border-radius: 10px;
216 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
217 |   color: #333333;
218 |   font-family: 'Open Sans', sans-serif;
219 |   font-size: 14px;
220 |   pointer-events: none;
221 |   z-index: 100000000000000000000;
222 |   opacity: 1 !important;
223 | }
224 | 
225 | /* Style for the tooltip 
226 | .tooltip {
227 |   position: absolute;
228 |   text-align: center;
229 |   width: auto;
230 |   height: 36px;
231 |   padding: 6px;
232 |   font-family: 'Open Sans', sans-serif;
233 |   background: #4a90e2;
234 |   color: #ffffff;
235 |   border: 0px;
236 |   border-radius: 12px;
237 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
238 |   pointer-events: none;
239 |   z-index: 100000000000000000000;
240 |   opacity: 1 !important;
241 | }*/
242 | 
243 | #plot-container {
244 |   max-height: 700px;
245 |   height: 0;
246 | }
247 | 
248 | #deckgl {
249 |   position: relative !important;
250 | }
251 | 
/* Hover/click tooltip for scatterplot points; positioned from JS via left/top. */
#tooltip {
  position: absolute;
  opacity: 1; /* valid range is 0..1; the former value 1000 was clamped to 1 */
  font-size: 20px;
  border-radius: var(--bs-border-radius);
  background-color: #f5f8ffcf;
  outline: 2px solid #dfebff;
  padding: 8px;
  max-width: 500px;
}


--------------------------------------------------------------------------------
/src/js/semantic.js:
--------------------------------------------------------------------------------
  1 | import { env, cos_sim} from '@xenova/transformers';
  2 | import { loadScatterplot } from './utils.js';
  3 | 
  4 | // @ts-ignore
  5 | env.allowLocalModels = false;
  6 | 
  7 | /**
  8 |  * @type {Worker}
  9 |  */
 10 | const worker = new Worker(new URL('./worker.js', import.meta.url), {
 11 |     type: 'module'
 12 | });
 13 | 
 14 | window.semanticWorker = worker;
 15 | 
 16 | /**
 17 |  * @type {Array<number>}
 18 |  */
 19 | let queryEmbedding;
 20 | 
 21 | /**
 22 |  * @type {Object<string, Function>}
 23 |  */
 24 | const similarityResolveMap = {};
 25 | 
 26 | /**
 27 |  * @type {Object<string, Function>}
 28 |  */
 29 | const tokensResolveMap = {};
 30 | 
 31 | /**
 32 |  * @type Function
 33 |  */
 34 | let loadResolve;
 35 | 
 36 | /**
 37 |  * @type Function
 38 |  */
 39 | let queryResolve;
 40 | 
 41 | function downloadFile(data, filename, mimeType) {
 42 |     const blob = new Blob([data], { type: mimeType });
 43 | 
 44 |     const link = document.createElement('a');
 45 |     link.href = window.URL.createObjectURL(blob);
 46 |     link.download = filename;
 47 | 
 48 |     // Append the link to the body for programmatic click
 49 |     document.body.appendChild(link);
 50 |     link.click();
 51 | 
 52 |     // Remove the link from the DOM
 53 |     document.body.removeChild(link);
 54 | }
 55 | 
 56 | worker.onmessage = function (event) {
 57 |     const message = event.data;
 58 |     let resolve;
 59 | 
 60 |     switch (message.type) {
 61 |         case 'embeddingsDict':
 62 |             const gzippedData = message.data;
 63 |             //console.log("Embeddings data received.");
 64 |             // Download gzipped data as 'index.json.gz'
 65 |             downloadFile(gzippedData, message.filename, 'application/gzip');
 66 |             break;
 67 |         case "download":
 68 |             let downloadBar = document.getElementById('loading-progress');
 69 | 
 70 |             if (message.data.status === 'progress') {
 71 |                 if (message.data.file !== "onnx/model_quantized.onnx") { break; }
 72 |                 let progress = message.data.progress.toFixed(2);
 73 |                 downloadBar.style.width = progress + '%';
 74 |                 downloadBar.textContent = progress + "%";
 75 | 
 76 |                 downloadBar.setAttribute('aria-valuenow', progress);
 77 |             } else if (message.data.status === 'ready') {
 78 |                 downloadBar.style.width = '100%';
 79 |                 downloadBar.setAttribute('aria-valuenow', 100);
 80 |                 downloadBar.textContent = "";
 81 |                 loadResolve();
 82 |             }
 83 |             break;
 84 |         case "chat_download":
 85 |             let chatDownloadBar = document.getElementById('chat-progress');
 86 | 
 87 |             if (message.data.status === 'progress') {
 88 |                 if (message.data.file !== "onnx/decoder_model_merged_quantized.onnx") { break; }
 89 |                 let progress = message.data.progress.toFixed(2);
 90 |                 chatDownloadBar.style.width = progress + '%';
 91 |                 chatDownloadBar.textContent = Math.round(progress) + '%';
 92 |                 chatDownloadBar.setAttribute('aria-valuenow', progress);
 93 |             } else if (message.data.status === 'ready') {
 94 |                 chatDownloadBar.style.width = '100%';
 95 |                 chatDownloadBar.setAttribute('aria-valuenow', 100);
 96 |                 chatDownloadBar.textContent = "";
 97 |                 loadResolve();
 98 |             }
 99 |             break;
100 |         case "summary_download":
101 |             let summaryDownloadBar = document.getElementById('summary-progress');
102 | 
103 |             if (message.data.status === 'progress') {
104 |                 if (message.data.file !== "onnx/decoder_model_merged_quantized.onnx") { break; }
105 |                 let progress = message.data.progress.toFixed(2);
106 |                 summaryDownloadBar.style.width = progress + '%';
107 |                 summaryDownloadBar.textContent = Math.round(progress) + '%';
108 |                 summaryDownloadBar.setAttribute('aria-valuenow', progress);
109 |             } else if (message.data.status === 'ready') {
110 |                 summaryDownloadBar.style.width = '100%';
111 |                 summaryDownloadBar.setAttribute('aria-valuenow', 100);
112 |                 summaryDownloadBar.textContent = "";
113 |                 loadResolve();
114 |             }
115 |             break;
116 |         case 'chat':
117 |             //console.log(message.chat_text);
118 |             document.getElementById("chat_text").innerHTML = message.chat_text
119 |             queryResolve(message.chat_text);
120 |             break;
121 |         case 'summary':
122 |             //console.log(message.summary_text);
123 |             document.getElementById("summary_text").innerHTML = message.summary_text
124 |             queryResolve(message.summary_text);
125 |             break;
126 |         case 'query':
127 |             queryEmbedding = message.embedding;
128 |             queryResolve();
129 |             break;
130 |         case 'similarity':
131 |             resolve = similarityResolveMap[message.text];
132 |             resolve(cos_sim(message.embedding, queryEmbedding));
133 |             delete similarityResolveMap[message.text];
134 |             break;
135 |         case 'tokens':
136 |             resolve = tokensResolveMap[message.text];
137 |             resolve(message.tokens);
138 |             delete tokensResolveMap[message.text];
139 |             break;
140 |         case 'tsne':
141 |             console.log(message.plotDataArray)
142 |             loadScatterplot(message.plotDataArray);
143 | 
144 |             break
145 |         default:
146 |             console.error('Unknown message type: ' + message.type);
147 |     }
148 | };
149 | 
/**
 * Asks the worker to embed `text` and resolves with its cosine similarity to
 * the current query embedding once the matching 'similarity' response arrives.
 *
 * @param {string} text - Chunk to score; also used as the key of the pending request.
 * @returns {Promise<number>}
 */
export async function similarity(text) {
    const inferencingActive = document.getElementById("inferencingActive").checked;
    const request = { type: 'similarity', inferencingActive, text };
    worker.postMessage(request);

    // The worker's message handler looks this callback up by text and calls it
    return new Promise((settle) => {
        similarityResolveMap[text] = settle;
    });
}
164 | 
/**
 * Sends `text` to the worker for summarization.
 *
 * @param {string} text - Text to summarize.
 * @returns {Promise<string>} Settled by the worker's 'summary' response with the summary text.
 */
export async function summarizeText(text) {
    const request = { type: 'summary', text };
    worker.postMessage(request);

    return new Promise((settle) => {
        queryResolve = settle;
    });
}
179 | 
/**
 * Sends a chat prompt to the worker.
 *
 * @param {string} text - Prompt text.
 * @param {number} max_new_tokens - Forwarded to the worker unchanged.
 * @returns {Promise<string>} Settled by the worker's 'chat' response with the generated text.
 */
export async function chatText(text, max_new_tokens) {
    const request = { type: 'chat', max_new_tokens, text };
    worker.postMessage(request);

    return new Promise((settle) => {
        queryResolve = settle;
    });
}
196 | 
/**
 * Asks the worker to embed the search query.
 *
 * The embedding itself is stored by the worker message handler; the returned
 * promise only signals that the 'query' response has arrived.
 *
 * @param {string} text - Query text.
 * @returns {Promise<void>}
 */
export async function embedQuery(text) {
    const request = { type: 'query', text };
    worker.postMessage(request);

    return new Promise((settle) => {
        queryResolve = settle;
    });
}
211 | 
/**
 * Asks the worker to tokenize `text`.
 *
 * @param {string} text - Text to tokenize; also used as the key of the pending request.
 * @returns {Promise<*>} Settled by the worker's 'tokens' response with its `tokens` payload.
 */
export async function getTokens(text) {
    const request = { type: 'getTokens', text };
    worker.postMessage(request);

    // The worker's message handler looks this callback up by text and calls it
    return new Promise((settle) => {
        tokensResolveMap[text] = settle;
    });
}
226 | 
/**
 * Resets the embedding-model progress bar and asks the worker to load a model.
 *
 * @param {string} modelName - Model identifier forwarded to the worker.
 * @returns {Promise<void>} Settled when the worker reports the download as ready.
 */
export async function loadSemantic(modelName) {
    const useQuantized = document.getElementById("quantized").checked;

    const progressBar = document.getElementById('loading-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const request = {
        type: 'load',
        model_name: modelName,
        quantized: useQuantized
    };
    worker.postMessage(request);

    return new Promise((settle) => {
        loadResolve = settle;
    });
}
245 | 
/**
 * Resets the chat progress bar and asks the worker to load a chat model.
 *
 * Model names containing "Qwen" are loaded with the 'load_text-generation'
 * request; every other model uses 'load_text2text-generation'.
 *
 * @param {string} modelName - Model identifier forwarded to the worker.
 * @returns {Promise<void>} Settled when the worker reports the download as ready.
 */
export async function loadChat(modelName) {
    const progressBar = document.getElementById('chat-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const requestType = modelName.includes("Qwen")
        ? 'load_text-generation'
        : 'load_text2text-generation';

    worker.postMessage({
        type: requestType,
        model_name: modelName
    });

    return new Promise((settle) => {
        loadResolve = settle;
    });
}
271 | 
/**
 * Resets the summary progress bar and asks the worker to load a summary model.
 *
 * @param {string} modelName - Model identifier forwarded to the worker.
 * @returns {Promise<void>} Settled when the worker reports the download as ready.
 */
export async function loadSummary(modelName) {
    const progressBar = document.getElementById('summary-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const request = { type: 'load_summary', model_name: modelName };
    worker.postMessage(request);

    return new Promise((settle) => {
        loadResolve = settle;
    });
}
286 | 


--------------------------------------------------------------------------------
/src/js/utils.js:
--------------------------------------------------------------------------------
  1 | import { getTokens } from './semantic';
  2 | import { Deck } from '@deck.gl/core';
  3 | import { ScatterplotLayer, LineLayer } from '@deck.gl/layers';
  4 | import {setProgressBarValue } from './index.js';
  5 | 
  6 | import * as pdfjsLib from 'pdfjs-dist/webpack.mjs';
  7 | pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.js';
  8 | 
  9 | //import {ScatterplotLayer} from '@deck.gl/layers';
 10 | /**
 11 |  * @param {string} text
 12 |  * @param {string} splitType
 13 |  * @param {string} splitParam
 14 |  * @returns {Promise<Array<string> | null>}
 15 |  */
 16 | export async function splitText(text, splitType, splitParam) {
 17 |     switch (splitType) {
 18 |         case 'Regex':
 19 |             return splitByRegex(text, splitParam);
 20 |         case 'Sentence':
 21 |             return splitBySentences(text);
 22 |         case 'Words':
 23 |             return splitByWords(text, parseInt(splitParam));
 24 |         case 'Chars':
 25 |             return splitByChars(text, parseInt(splitParam));
 26 |         case 'Tokens':
 27 |             return await splitByTokens(text, parseInt(splitParam));
 28 |         case 'JinaAI':
 29 |             return await splitWithJinaAI(text, parseInt(splitParam));
 30 |         default:
 31 |             console.error('Invalid split type');
 32 |             return null;
 33 |     }
 34 | }
 35 | 
 36 | /**
 37 |  * @param {string} text
 38 |  * @param {number} numTokens
 39 |  * @returns {Promise<Array<string> | null>}
 40 |  */
 41 | async function splitByTokens(text, numTokens) {
 42 |     const words = text.split(' ');
 43 |     const chunks = [];
 44 | 
 45 |     for (let i = 0; i < words.length; i++) {
 46 |         const word = words[i];
 47 |         const tokens = await getTokens(word);
 48 | 
 49 |         // Check if there's no chunk or if the last chunk + the new word would exceed numTokens
 50 |         if (chunks.length === 0 || (await getTokens(chunks[chunks.length - 1])).length + tokens.length > numTokens) {
 51 |             chunks.push(word);
 52 |         } else {
 53 |             chunks[chunks.length - 1] += ' ' + word;
 54 |         }
 55 |     }
 56 |     //console.table(chunks);
 57 |     console.log("Number of chunks: " + chunks.length)
 58 |     return chunks;
 59 | }
 60 | 
 61 | /**
 62 |  * @param {string} text
 63 |  * @param {number} numWords
 64 |  * @returns {Array<string> | null}
 65 |  */
 66 | function splitByWords(text, numWords) {
 67 |     if (isNaN(numWords) || !Number.isInteger(numWords)) {
 68 |         console.error('numWords must be an integer.');
 69 |         return null;
 70 |     }
 71 | 
 72 |     const words = text.split(' ');
 73 |     let chunks = [];
 74 |     let currentChunk = [];
 75 | 
 76 |     for (let i = 0; i < words.length; i++) {
 77 |         currentChunk.push(words[i]);
 78 | 
 79 |         if (currentChunk.length === numWords) {
 80 |             chunks.push(currentChunk.join(' '));
 81 |             currentChunk = [];
 82 |         }
 83 |     }
 84 | 
 85 |     if (currentChunk.length > 0) {
 86 |         chunks.push(currentChunk.join(' '));
 87 |     }
 88 |     chunks = chunks.filter(chunk => chunk.trim().length > 0);
 89 | 
 90 |     //console.table(chunks);
 91 |     console.log("Number of chunks: " + chunks.length)
 92 | 
 93 |     return chunks;
 94 | }
 95 | 
 96 | /**
 97 |  * @param {string} text
 98 |  * @param {number} numChars
 99 |  * @returns {Array<string> | null}
100 |  */
101 | function splitByChars(text, numChars) {
102 |     const words = text.split(' ');
103 |     const chunks = [];
104 | 
105 |     for (let i = 0; i < words.length; i++) {
106 |         const word = words[i];
107 | 
108 |         if (chunks.length === 0 || chunks[chunks.length - 1].length + word.length + 1 > numChars) {
109 |             chunks.push(word);
110 |         } else {
111 |             chunks[chunks.length - 1] += ' ' + word;
112 |         }
113 |     }
114 |     // console.table(chunks);
115 |     console.log("Number of chunks: " + chunks.length)
116 |     return chunks;
117 | }
118 | 
/**
 * Splits text into sentences. A sentence is a run of characters other than
 * `.`, `!`, `?` followed by one or more of those terminators. Text after the
 * last terminator is returned as a final chunk instead of being dropped,
 * unless it is only whitespace.
 *
 * @param {string} text - Text to split.
 * @returns {Array<string>} The sentences; an empty array when nothing matches.
 */
function splitBySentences(text) {
    // match() returns null when nothing matches; fall back to an empty list
    const matches = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [];
    // Only a trailing unterminated fragment can be whitespace-only
    const chunks = matches.filter(chunk => chunk.trim().length > 0);

    console.log("Number of chunks: " + chunks.length)

    return chunks
}
129 | 
/**
 * Splits text at every match of a regular expression.
 *
 * @param {string} text - Text to split.
 * @param {string} r - Source of the separator pattern; compiled with the 'g' flag.
 * @returns {Array<string> | null} The pieces between separator matches.
 */
function splitByRegex(text, r) {
    const pieces = text.split(new RegExp(r, 'g'));

    console.log(`Number of chunks: ${pieces.length}`)

    return pieces
}
143 | 
/**
 * Splits text by POSTing it to the Jina AI segmenter endpoint.
 *
 * @param {string} text - Text to split; sent as `content`.
 * @param {number} numChars - Sent as `max_chunk_length`.
 * @returns {Promise<Array<string> | null>} The `chunks` property of the response
 *          (an empty array when absent), or null on an HTTP or fetch error.
 */
async function splitWithJinaAI(text, numChars) {
    const payload = {
        content: text,
        return_chunks: true,
        max_chunk_length: numChars
    };

    try {
        const requestOptions = {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(payload)
        };
        const response = await fetch('https://segment.jina.ai/', requestOptions);

        if (!response.ok) {
            console.error('HTTP error:', response.status, response.statusText);
            return null;
        }

        const responseData = await response.json();

        // Assumes the API returns the chunks in a property called 'chunks'
        const chunks = responseData.chunks || [];

        console.log(`Number of chunks: ${chunks.length}`);

        return chunks;
    } catch (error) {
        console.error('Fetch error:', error);
        return null;
    }
}
183 | 
// Example usage:
// splitWithJinaAI("Your text here", 1000).then(chunks => console.log(chunks));
186 | 
187 | 
188 | 
// Sorting algorithms: for inputs with 1000+ entries, heap-based selection is clearly faster and usually takes less than half the time of a normal sort.
// It might be worth using once indices grow beyond 100k entries, but for now sorting is not a bottleneck.
191 | 
// Baseline for the timing comparison: fully sorts all entries by value, descending.
// Only logs the elapsed time; the sorted result is not returned.
function normalSorting(inputTexts) {
    const startedAt = performance.now();

    const ranked = Object.entries(inputTexts);
    ranked.sort(([, scoreA], [, scoreB]) => scoreB - scoreA);

    const finishedAt = performance.now();
    console.log(`Original code took ${finishedAt - startedAt} milliseconds`);
}
200 | 
/**
 * Binary max-heap over `[key, value]` pairs, ordered by the value at index 1.
 * The input array is copied, so the caller's array is not reordered.
 */
class MaxHeap {
    /**
     * @param {Array<Array>} array - Entries of the form `[key, value]`.
     */
    constructor(array) {
        this.heap = [...array];
        this.buildHeap();
    }

    /** Establishes the heap property by sifting down every non-leaf node, last to first. */
    buildHeap() {
        const lastParent = Math.floor(this.heap.length / 2) - 1;
        for (let index = lastParent; index >= 0; index--) {
            this.heapifyDown(index);
        }
    }

    /**
     * Sifts the entry at index `i` down until neither child has a larger value.
     * @param {number} i
     */
    heapifyDown(i) {
        const size = this.heap.length;
        let current = i;

        while (true) {
            const leftChild = 2 * current + 1;
            const rightChild = 2 * current + 2;
            let top = current;

            if (leftChild < size && this.heap[leftChild][1] > this.heap[top][1]) {
                top = leftChild;
            }

            if (rightChild < size && this.heap[rightChild][1] > this.heap[top][1]) {
                top = rightChild;
            }

            if (top === current) {
                return;
            }

            this.swap(current, top);
            current = top;
        }
    }

    /**
     * Removes and returns the entry with the largest value.
     * @returns {Array | null} The `[key, value]` entry, or null when the heap is empty.
     */
    extractMax() {
        if (this.heap.length === 0) {
            return null;
        }

        const top = this.heap[0];
        const tail = this.heap.pop();

        // Move the former last entry to the root and restore the heap property
        if (this.heap.length > 0) {
            this.heap[0] = tail;
            this.heapifyDown(0);
        }

        return top;
    }

    /** Exchanges the entries at indices `i` and `j`. */
    swap(i, j) {
        const held = this.heap[i];
        this.heap[i] = this.heap[j];
        this.heap[j] = held;
    }
}
254 | 
/**
 * Returns the `n` entries of `inputTexts` with the largest values, largest first,
 * by building a MaxHeap over all entries and extracting the maximum `n` times.
 *
 * @param {Object<string, number>} inputTexts - Map of key to numeric score.
 * @param {number} n - How many entries to return; capped at the number of entries.
 * @returns {Array<Array>} `[key, value]` pairs in descending order of value.
 */
export function heapBasedSorting(inputTexts, n) {
    const entries = Object.entries(inputTexts);
    const heap = new MaxHeap(entries);

    const nLargest = [];
    let taken = 0;
    while (taken < n && taken < entries.length) {
        nLargest.push(heap.extractMax());
        taken++;
    }

    return nLargest
}
273 | 
274 | /*
275 | // Test objects
276 | function generateTestObject(size) {
277 |     const testObject = {};
278 |     for (let i = 0; i < size; i++) {
279 |         testObject[`key${i}`] = Math.random();
280 |     }
281 |     return testObject;
282 | }
283 | 
284 | //const obj100 = generateTestObject(100);
285 | //const obj10000 = generateTestObject(10000);
286 | //const obj100000 = generateTestObject(100000);
287 | 
288 | // Usage
289 | //const n = 5; // Change this to the desired number of largest values
290 | 
291 | //normalSorting(obj100);
292 | //heapBasedSorting(obj100, n);
293 | 
294 | //normalSorting(obj10000);
295 | //heapBasedSorting(obj10000, n);
296 | 
297 | //normalSorting(obj100000);
298 | //heapBasedSorting(obj100000, n);
299 | 
300 | Original code took 0.19999999925494194 milliseconds
301 | Heap-based solution took 0.10000000149011612 milliseconds
302 | 
303 | Original code took 19.5 milliseconds
304 | Heap-based solution took 9.299999997019768 milliseconds
305 | 
306 | Original code took 166.69999999925494 milliseconds
307 | Heap-based solution took 60.5 milliseconds
308 | 
309 | */
310 | 
311 | const toastMessage = document.getElementById("toastMessage");
312 | const toastText = document.getElementById("toastText");
313 | const closeToastButton = document.getElementById("closeToastButton");
314 | 
// Handle of the pending auto-hide timer, kept so a newer toast can cancel it.
let toastHideTimer = null;

/**
 * Shows a toast with the given message and hides it again after `timeout` ms.
 *
 * If a toast is already showing, its auto-hide timer is cancelled first;
 * otherwise the older timer would hide the new message too early.
 *
 * @param {string} message - Text to display.
 * @param {number} [timeout=2500] - Milliseconds until the toast is hidden.
 */
export function showToast(message, timeout=2500) {
    toastText.textContent = message;
    toastMessage.style.display = "block";

    clearTimeout(toastHideTimer);
    toastHideTimer = setTimeout(() => {
        toastHideTimer = null;
        hideToast();
    }, timeout);
}
323 | 
/** Hides the toast element. */
function hideToast() {
    toastMessage.style.display = "none";
}

// The close button dismisses the toast immediately
closeToastButton.addEventListener("click", hideToast);
331 | 
/**
 * Builds line segments for a square grid spanning [-gridSize, gridSize] on both
 * axes: first the vertical lines (one per x offset), then the horizontal lines.
 *
 * @param {number} [gridSize=20] - Half-width of the grid.
 * @returns {Array<{sourcePosition: number[], targetPosition: number[], color: number[]}>}
 */
function generateGridData(gridSize = 20) {
    const offsets = [];
    for (let offset = -gridSize; offset <= gridSize; offset++) {
        offsets.push(offset);
    }

    const verticalLines = offsets.map((x) => ({
        sourcePosition: [x, -gridSize],
        targetPosition: [x, gridSize],
        color: [169, 169, 169],
    }));

    const horizontalLines = offsets.map((y) => ({
        sourcePosition: [-gridSize, y],
        targetPosition: [gridSize, y],
        color: [169, 169, 169],
    }));

    return [...verticalLines, ...horizontalLines];
}
355 | 
356 | const plotContainer = document.getElementById("plot-container");
357 | let deckgl;
/**
 * Returns the smallest and largest number in `values` using a single loop.
 * (Math.min(...values) passes every element as a call argument and can exceed
 * the engine's argument limit on very large arrays.)
 */
function numericExtent(values) {
    let min = Infinity;
    let max = -Infinity;
    for (const value of values) {
        if (value < min) { min = value; }
        if (value > max) { max = value; }
    }
    return { min, max };
}

/**
 * Maps `value` from [min, max] to [0, 1]. When min equals max (a single point
 * or all values equal) the midpoint 0.5 is returned instead of dividing by zero.
 */
function rescaleToUnit(value, { min, max }) {
    const range = max - min;
    return range === 0 ? 0.5 : (value - min) / range;
}

/**
 * Shows the tooltip next to the picked scatterplot point, or hides it when the
 * pointer is not over a point. Shared by the hover and click handlers.
 */
function updateScatterplotTooltip(info) {
    const tooltip = document.getElementById('tooltip');

    if (!info.object) {
        tooltip.style.display = 'none';
        return;
    }

    const rect = document.getElementById('deckgl').getBoundingClientRect();

    // Offset from the canvas position plus page scroll, nudged right and up from the pointer
    const left = window.scrollX + info.x + rect.left + 30;
    const top = window.scrollY + info.y + rect.top + -50;

    // Labels come from document text, so insert them as text nodes rather than HTML
    tooltip.textContent = '';
    tooltip.append(
        `${info.object.label} `,
        document.createElement('br'),
        `Similarity: ${info.object.similarity.toFixed(2)}`
    );
    tooltip.style.left = `${left}px`;
    tooltip.style.top = `${top}px`;
    tooltip.style.display = 'block';
}

/**
 * Renders the given points as a deck.gl scatterplot, replacing any existing plot.
 *
 * x/y are rescaled to [0, 1]; similarity is rescaled to [0, 1] and used as the
 * opacity of each (blue) point. Nothing is drawn for an empty array.
 *
 * @param {Array<{x: number, y: number, similarity: number, label: string}>} data
 * @returns {Promise<void>}
 */
export async function loadScatterplot(data) {

    removeScatterplot();

    if (data.length === 0) {
        return;
    }

    const similarityExtent = numericExtent(data.map(item => item.similarity));
    const xExtent = numericExtent(data.map(item => item.x));
    const yExtent = numericExtent(data.map(item => item.y));

    const points = data.map(item => {
        // Normalized similarity doubles as alpha (opacity), clamped to [0, 1]
        const alpha = Math.min(1, Math.max(0, rescaleToUnit(item.similarity, similarityExtent)));

        return {
            coordinates: [rescaleToUnit(item.x, xExtent), rescaleToUnit(item.y, yExtent)],
            color: [0, 0, 255, Math.floor(alpha * 255)], // RGBA
            similarity: item.similarity,
            label: item.label,
        };
    });

    // Bounding box of the normalized points, used to center the initial view
    const bounds = points.reduce(
        (acc, point) => ({
            minX: Math.min(acc.minX, point.coordinates[0]),
            minY: Math.min(acc.minY, point.coordinates[1]),
            maxX: Math.max(acc.maxX, point.coordinates[0]),
            maxY: Math.max(acc.maxY, point.coordinates[1]),
        }),
        { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity }
    );

    deckgl = new Deck({
        canvas: 'deckgl',
        container: 'plot-container',
        initialViewState: {
            latitude: (bounds.minY + bounds.maxY) / 2,
            longitude: (bounds.minX + bounds.maxX) / 2,
            zoom: 9
        },
        controller: true,
        pickingRadius: 25,
        layers: [
            // Add a new LineLayer for the coordinate system
            /*new LineLayer({
                id: 'coordinate-system',
                data: generateGridData(20),
                getSourcePosition: d => d.sourcePosition,
                getTargetPosition: d => d.targetPosition,
                getColor: d => d.color,
                getWidth: 1,
                pickable: false
            }),
            */
            // ScatterplotLayer with all points added right away
            new ScatterplotLayer({
                id: 'scatterplot',
                data: points,
                getPosition: d => d.coordinates,
                getRadius: parseInt(document.getElementById("scatterplotRadius").value, 10),
                getFillColor: d => d.color,
                pickable: true, // Enable picking for hover/click interaction
                onHover: updateScatterplotTooltip,
                onClick: updateScatterplotTooltip
            })
        ]
    });

    plotContainer.style.height = "700px";
}
479 | 
/** Finalizes the current deck.gl instance, if there is one, and forgets it. */
export function removeScatterplot() {
    if (!deckgl) {
        return;
    }

    deckgl.finalize();
    deckgl = null;
}
486 | 
487 | // pdf loading logic for local and remote
488 | 
/**
 * Extracts the text of every page of a loaded PDF and hands the combined text
 * to `resolve`. Each page is prefixed with a "[Document: ..., Page: N]" header
 * and pages are separated by a blank line. Any failure is passed to `reject`.
 *
 * @param {Object} pdf - Loaded PDF document exposing `numPages` and `getPage(n)`.
 * @param {string} documentIdentifier - Name written into each page header.
 * @param {Function} resolve - Called with the full text.
 * @param {Function} reject - Called with the error when extraction fails.
 * @param {Function} updateProgress - Accepted but not used by this function.
 */
function processPdf(pdf, documentIdentifier, resolve, reject, updateProgress) {
    // Joins the text items of one page with single spaces
    const readPageText = (pageNumber) =>
        pdf.getPage(pageNumber)
            .then(page => page.getTextContent())
            .then(textContent => textContent.items.map(item => item.str).join(' '));

    const pageCount = pdf.numPages;
    // Pages are numbered from 1; all page requests are started up front
    const pendingPages = Array.from({ length: pageCount }, (_, index) => readPageText(index + 1));

    Promise.all(pendingPages)
        .then(pagesText => {
            const labelledPages = pagesText.map(
                (pageText, index) => `[Document: ${documentIdentifier}, Page: ${index + 1}]\n${pageText}`
            );
            resolve(labelledPages.join("\n\n"));
        })
        .catch(error => {
            reject(error);
        });
}
509 | 
/**
 * Loads a PDF and resolves with its text (see processPdf for the format).
 *
 * @param {File|string} fileOrDataUri - A local File, a `data:` URI, or a URL.
 *        Files are identified by their name, data URIs as "RemotePDF", URLs by the URL itself.
 * @param {Function} updateProgress - Forwarded to processPdf.
 * @returns {Promise<string>} Rejects with 'Invalid input type' for any other
 *          input, or with the error raised while loading or reading the PDF.
 */
function extractTextFromPDF(fileOrDataUri, updateProgress) {
    return new Promise((resolve, reject) => {
        let documentIdentifier;
        let pdfSource;
        // Set only for local files; must be revoked or the file stays referenced
        let objectUrl = null;

        if (fileOrDataUri instanceof File) {
            // For local files
            documentIdentifier = fileOrDataUri.name;
            objectUrl = URL.createObjectURL(fileOrDataUri);
            pdfSource = objectUrl;
        } else if (typeof fileOrDataUri === 'string') {
            if (fileOrDataUri.startsWith('data:')) {
                // For data URIs (remote PDFs)
                documentIdentifier = "RemotePDF";
                pdfSource = fileOrDataUri;
            } else {
                // Assume it's a URL
                documentIdentifier = fileOrDataUri;
                pdfSource = fileOrDataUri;
            }
        } else {
            reject(new Error('Invalid input type'));
            return;
        }

        // Released only after extraction has finished or failed, so the PDF
        // reader never loses access to the data while pages are still being read
        const releaseObjectUrl = () => {
            if (objectUrl !== null) {
                URL.revokeObjectURL(objectUrl);
                objectUrl = null;
            }
        };

        pdfjsLib.getDocument(pdfSource).promise.then(pdf => {
            processPdf(
                pdf,
                documentIdentifier,
                text => {
                    releaseObjectUrl();
                    resolve(text);
                },
                error => {
                    releaseObjectUrl();
                    reject(error);
                },
                updateProgress
            );
        }).catch(error => {
            releaseObjectUrl();
            reject(error); // Reject the promise if there's an error loading the PDF
        });
    });
}
541 | 
542 | 
/**
 * Extracts the text of every PDF selected in the `pdf-upload` file input and
 * joins the per-file texts with blank lines. The progress bar is advanced each
 * time a file finishes.
 *
 * @returns {Promise<string>} The combined text, or '' when no file is selected.
 */
export async function handlePdfFileUpload() {
    const { files } = document.getElementById('pdf-upload');

    if (!(files.length > 0)) {
        console.error('No files selected');
        return '';
    }

    const totalFiles = files.length;
    let processedFiles = 0;

    // Runs once per finished file: bump the counter and report the percentage
    const reportFileDone = (text) => {
        processedFiles++;
        const progressPercentage = (processedFiles / totalFiles) * 100;
        setProgressBarValue(progressPercentage.toFixed(0));
        console.log(progressPercentage);
        return text;
    };

    // All files are processed concurrently; results keep the selection order
    const pendingTexts = Array.from(files, (file) =>
        extractTextFromPDF(file, setProgressBarValue).then(reportFileDone)
    );

    const allFilesText = await Promise.all(pendingTexts);
    return allFilesText.join("\n\n");
}
571 | 
572 | 
573 | 
574 | 
575 | 
576 | ////////////////////////////////////////////////////
577 | 
/**
 * Download a remote file through the corsproxy.io CORS proxy and return it as
 * a data URI.
 *
 * @param {string} url - Address of the remote file.
 * @returns {Promise<string>} The file content encoded as a `data:` URI.
 * @throws {Error} When the proxy answers with a non-OK status; the promise also
 *     rejects with the FileReader error if the blob cannot be read.
 */
async function fetchPdfAsDataUri(url) {
    // A CORS proxy is unfortunately needed for remote files. The target is
    // percent-encoded so its own query string / fragment is passed to the proxy
    // as one value instead of being mixed into the proxy request.
    const proxyUrl = 'https://corsproxy.io/?' + encodeURIComponent(url);
    const response = await fetch(proxyUrl);
    if (!response.ok) {
        throw new Error('Network response was not ok');
    }
    const blob = await response.blob();
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => resolve(reader.result);
        // Reject with the actual read error rather than the raw progress event
        reader.onerror = () => reject(reader.error);
        reader.readAsDataURL(blob);
    });
}
592 | 
593 | 
/**
 * Import the text of every URL entered in the `#importPdfURL` input.
 *
 * Each URL is first treated as a PDF (downloaded via `fetchPdfAsDataUri`, then
 * parsed by `extractTextFromPDF`). If that fails, the URL is fetched directly
 * and the text of the parsed HTML body is used instead. URLs for which both
 * attempts fail are skipped (the error is logged).
 *
 * @returns {Promise<string>} Texts of all successfully imported URLs joined by newlines.
 */
export async function handleRemotePdfFileUpload() {
    // Split on any whitespace run and drop empty entries, so repeated spaces,
    // line breaks or an empty input do not produce bogus empty "URLs".
    const urls = document.getElementById("importPdfURL").value
        .split(/\s+/)
        .filter(Boolean);
    const texts = [];

    for (const url of urls) {
        console.log(url);

        try {
            const dataUri = await fetchPdfAsDataUri(url);
            const text = await extractTextFromPDF(dataUri, null);
            texts.push(text);
        } catch (error) {
            console.log('Not a pdf, trying to parse the web page');

            // Fallback: extract the text from a normal web page
            try {
                const response = await fetch(url);
                if (!response.ok) {
                    // Do not index the body of an HTTP error page as document text
                    throw new Error(`Request failed with status ${response.status}`);
                }
                const html = await response.text();
                const doc = new DOMParser().parseFromString(html, 'text/html');
                texts.push(doc.body.innerText);
            } catch (webpageError) {
                console.error('Error fetching or parsing webpage:', webpageError);
            }
        }
    }

    return texts.join("\n");
}
624 | 
625 | 
/**
 * Extract the text of each space-separated PDF URL in the `#importPdfURL` input.
 *
 * URLs are processed one after another. The result keeps one entry per URL in
 * input order; a URL that cannot be fetched or parsed contributes an empty string.
 *
 * @returns {Promise<string[]>} Extracted text per URL ('' on failure).
 */
export async function handleMultipleRemotePdfFileUploads() {
    const rawInput = document.getElementById("importPdfURL").value;
    const collected = [];

    for (const url of rawInput.split(" ")) {
        console.log(url);

        // Stays empty unless download and extraction both succeed
        let extracted = '';
        try {
            const dataUri = await fetchPdfAsDataUri(url);
            extracted = await extractTextFromPDF(dataUri, null);
        } catch (error) {
            console.error(`Error handling remote PDF file upload for URL ${url}:`, error);
        }
        collected.push(extracted);
    }

    return collected;
}
645 | 


--------------------------------------------------------------------------------
/src/js/worker.js:
--------------------------------------------------------------------------------
  1 | import { pipeline, AutoTokenizer } from '@xenova/transformers';
  2 | import pako from 'pako';
  3 | import init, { tSNE } from "wasm-bhtsne";
  4 | import { marked } from 'marked';
  5 | 
// Run the default export of wasm-bhtsne (imported above as `init`).
// NOTE(review): the returned value is not awaited here - confirm the module is ready before 'tsne' is used.
init();
// env.useBrowserCache = false; // for testing

/**
 * Cache of embeddings keyed by the embedded text.
 * @type {Object<string, EmbeddingVector>}
 */
let embeddingsDict = {};

// Embedding model: feature-extraction pipeline and its tokenizer (assigned by the 'load' message).
/**
 * @type {Pipeline}
 */
let embedder;
let tokenizer;

// Chat model: generator pipeline, tokenizer and model name
// (assigned by the 'load_text2text-generation' / 'load_text-generation' messages).
let chat_generator;
let chat_tokenizer;
let chat_model_name;

// Summary model: summarization pipeline and tokenizer (assigned by the 'load_summary' message).
let summary_generator;
let summary_tokenizer;

// Embedding of the most recent query; read by calculateCosineSimilarity.
let queryEmbedding;
// Placeholder vector (all entries 0.00001) returned by embed() for texts that are not actually embedded.
let currentNullVector = [];
 32 | 
 33 | function minimalEightCharHash(str) {
 34 |     let hash = 5381;
 35 | 
 36 |     for (let i = 0; i < str.length; i++) {
 37 |         hash = (hash * 33) ^ str.charCodeAt(i);
 38 |     }
 39 | 
 40 |     // Convert to 8-character hexadecimal string
 41 |     const hexHash = (hash >>> 0).toString(16);
 42 |     return hexHash.slice(0, 8).padStart(8, '0');
 43 | }
 44 | 
 45 | function minimalRandomEightCharHash() {
 46 |     const characters = '0123456789abcdef';
 47 |     let hash = '';
 48 | 
 49 |     for (let i = 0; i < 8; i++) {
 50 |         const randomIndex = Math.floor(Math.random() * characters.length);
 51 |         hash += characters[randomIndex];
 52 |     }
 53 | 
 54 |     return hash;
 55 | }
 56 | 
 57 | 
 58 | async function token_to_text(beams, tokenizer_type) {
 59 |     //let chatTokenizer = await AutoTokenizer.from_pretrained(chatModel);
 60 |     let decoded_text = tokenizer_type.decode(beams[0].output_token_ids, {
 61 |         skip_special_tokens: true
 62 |     });
 63 |     //console.log(decoded_text);
 64 |     return decoded_text
 65 | }
 66 | 
 67 | /**
 68 |  * @param {string} text
 69 |  * @returns {Promise<EmbeddingVector>}
 70 |  */
 71 | async function embed(text, embedNewText=true) {
 72 |         if (text in embeddingsDict) {
 73 |             return embeddingsDict[text];
 74 |         }
 75 | 
 76 |         if (embedNewText==false){
 77 |             if (currentNullVector != []){
 78 |                 embeddingsDict[text] = currentNullVector;
 79 |                 return currentNullVector
 80 |             }
 81 |             else {
 82 |                 const tempVec = await embedder("test", { pooling: 'mean', normalize: true });
 83 |                 currentNullVector = [...tempVec.data].fill(0.00001);
 84 |                 embeddingsDict[text] = currentNullVector;
 85 |                 return currentNullVector
 86 |             }
 87 |         }
 88 | 
 89 |         const e0 = await embedder(text, { pooling: 'mean', normalize: true });
 90 | 
 91 |         const roundDecimalsDown = (num) => parseFloat(num.toFixed(3));
 92 | 
 93 |         embeddingsDict[text] = e0.data.map(roundDecimalsDown);
 94 |         //console.log(embeddingsDict)
 95 |         return e0.data;
 96 | 
 97 | }
 98 | 
 99 | async function getTokens(text) {
100 |     return await tokenizer(text).input_ids.data;
101 | }
102 | 
/**
 * Generate a chat answer with the loaded chat model, streaming intermediate
 * text to the main thread as `{ type: 'chat', chat_text }` messages.
 *
 * Previously the promise resolved inside the first generation callback, i.e.
 * after the first streamed step; it now settles when generation has finished
 * and yields the last streamed text. Errors thrown anywhere (including before
 * generation starts) reject the promise instead of leaving it pending.
 *
 * @param {string} text - User prompt.
 * @param {number} [max_new_tokens=100] - Upper bound of generated tokens.
 * @returns {Promise<string>} The last streamed text ('' if nothing was streamed).
 */
async function chat(text, max_new_tokens = 100) {
    console.log(chat_model_name, max_new_tokens);

    // Most recent text sent to the main thread
    let latestText = '';
    const publish = (chat_text) => {
        latestText = chat_text;
        self.postMessage({
            type: 'chat',
            chat_text
        });
    };

    // Qwen needs its own branch: the prompt is built with the tokenizer's chat template
    if (chat_model_name.includes("Qwen")) {
        const messages = [
            { "role": "system", "content": "You are a helpful assistant." },
            { "role": "user", "content": text }
        ];

        const generatorText = chat_generator.tokenizer.apply_chat_template(messages, {
            tokenize: false,
            add_generation_prompt: false,
        });

        await chat_generator(generatorText, {
            max_new_tokens: max_new_tokens,
            do_sample: false,
            callback_function: (beams) => {
                const decoded = chat_generator.tokenizer.decode(beams[0].output_token_ids, { skip_special_tokens: false });

                // Segments after splitting: '', system turn, user turn, model output.
                // Skip this step while the model-output segment does not exist yet.
                const reply = decoded.split("<|im_start|>")[3];
                if (reply === undefined) {
                    return;
                }
                publish(marked(reply.replace("<|im_end|>", "")));
            },
        });
    } else {
        await chat_generator(text, {
            max_new_tokens: max_new_tokens,
            return_prompt: false,
            // Decoded synchronously so the last update is recorded before generation returns
            callback_function: (beams) => {
                publish(chat_tokenizer.decode(beams[0].output_token_ids, {
                    skip_special_tokens: true
                }));
            },
        });
    }

    return latestText;
}
168 | 
/**
 * Summarize `text` with the loaded summary model, streaming intermediate text
 * to the main thread as `{ type: 'summary', summary_text }` messages.
 *
 * Previously the promise resolved inside the first generation callback, i.e.
 * after the first streamed step; it now settles when generation has finished
 * and yields the last streamed text.
 *
 * @param {string} text - Text to summarize.
 * @param {number} [max_new_tokens=100] - Upper bound of generated tokens.
 * @returns {Promise<string>} The last streamed summary ('' if nothing was streamed).
 */
async function summary(text, max_new_tokens = 100) {
    // Most recent summary text sent to the main thread
    let latestText = '';

    await summary_generator(text, {
        max_new_tokens: max_new_tokens,
        return_prompt: false,
        // Decoded synchronously so the last update is recorded before generation returns
        callback_function: (beams) => {
            latestText = summary_tokenizer.decode(beams[0].output_token_ids, {
                skip_special_tokens: true
            });

            self.postMessage({
                type: 'summary',
                summary_text: latestText,
            });
        },
    });

    return latestText;
}
192 | 
/**
 * Component-wise mean of a collection of embedding vectors.
 *
 * @param {Object<string, number[]>|number[][]} embeddingsAsArray - Embeddings; only the values are used.
 * @returns {number[]|null} Mean vector sized like the first embedding, or null for an empty collection.
 */
const calculateAverageEmbedding = (embeddingsAsArray) => {
    const vectors = Object.values(embeddingsAsArray);
    const count = vectors.length;

    // Nothing to average
    if (count === 0) {
        return null;
    }

    // Accumulate per-component sums in input order
    const totals = new Array(vectors[0].length).fill(0);
    for (const vector of vectors) {
        for (let component = 0; component < totals.length; component++) {
            totals[component] = totals[component] + vector[component];
        }
    }

    return totals.map((total) => total / count);
};
209 | 
210 | /* 
211 | const calculateAverageEmbedding = (embeddingsAsArray) => {
212 |   const allEmbeddings = Object.values(embeddingsAsArray);
213 | 
214 |   if (allEmbeddings.length === 0) {
215 |     return null; // handle the case when the input object is empty
216 |   }
217 | 
218 |   const start = performance.now();
219 | 
220 |   const sumEmbeddings = allEmbeddings.reduce((acc, embedding) => {
221 |     return acc.map((value, index) => value + embedding[index]);
222 |   }, new Array(allEmbeddings[0].length).fill(0));
223 | 
224 |   const averageEmbedding = sumEmbeddings.map(value => value / allEmbeddings.length);
225 | 
226 |   const end = performance.now();
227 |   console.log('Execution time:', end - start, 'milliseconds');
228 | 
229 |   return averageEmbedding;
230 | };
231 | 
232 | // Generate random embeddings for testing
233 | const generateRandomEmbedding = (size) => {
234 |   return Array.from({ length: size }, () => Math.random());
235 | };
236 | 
237 | // Generate test data with 10,000 strings and embeddings of size 1000
238 | const generateTestEmbeddings = (numStrings, embeddingSize) => {
239 |   const testData = {};
240 |   for (let i = 1; i <= numStrings; i++) {
241 |     const key = `string${i}`;
242 |     const embedding = generateRandomEmbedding(embeddingSize);
243 |     testData[key] = embedding;
244 |   }
245 |   return testData;
246 | };
247 | 
248 | // Test the calculateAverageEmbedding function with generated data
249 | const testEmbeddingsAsArray = generateTestEmbeddings(100000, 1000);
250 | const averageEmbedding = calculateAverageEmbedding(testEmbeddingsAsArray);
251 | 
252 | console.log('Average Embedding:', averageEmbedding);
253 | */
254 | 
/**
 * Replace every whitespace character of a string with an underscore.
 *
 * @param {string} inputString - Text to convert.
 * @returns {string} The text with each whitespace character replaced by '_'.
 */
function convert_to_underscores(inputString) {
    // The original read an undefined `lowercaseString` variable and therefore
    // threw a ReferenceError on every call; operate on the parameter instead.
    return inputString.replace(/\s/g, '_');
}
/**
 * Create a `rows` x `columns` matrix filled with Math.random() values.
 *
 * @param {number} rows - Number of rows.
 * @param {number} columns - Number of entries per row.
 * @returns {number[][]} Matrix of random numbers, generated row by row.
 */
function createRandomMatrix(rows, columns) {
    const randomEntry = () => Math.random();
    const randomRow = () => Array.from({ length: columns }, randomEntry);

    return Array.from({ length: rows }, randomRow);
}
/**
 * Replace the whole embeddings cache and notify the main thread with an
 * 'updateEmbeddingsDict' message carrying the new cache.
 *
 * @param {Object} newData - New content of `embeddingsDict`.
 */
const updateEmbeddingsDict = (newData) => {
    embeddingsDict = newData;

    const notification = { type: 'updateEmbeddingsDict', data: embeddingsDict };
    postMessage(notification);
};
271 | 
/**
 * Convert each entry of a list (e.g. Float32Arrays) into a plain JavaScript array.
 *
 * @param {Array<ArrayLike<number>>} arrayOfFloat32Arrays - Vectors to convert.
 * @returns {number[][]} New list with one plain array per input vector.
 */
function convertFloat32ArraysToArrays(arrayOfFloat32Arrays) {
    const plainArrays = [];

    arrayOfFloat32Arrays.forEach((typedVector) => {
        plainArrays.push(Array.from(typedVector));
    });

    return plainArrays;
}
283 | 
/**
 * Cosine similarity between the current `queryEmbedding` and `embedding`.
 *
 * Only the first `queryEmbedding.length` components are considered. The result
 * is NaN when either vector has zero magnitude.
 *
 * @param {ArrayLike<number>} embedding - Vector to compare with the query embedding.
 * @returns {number} Dot product divided by the product of both magnitudes.
 */
function calculateCosineSimilarity(embedding) {
    const totals = { dot: 0, querySquares: 0, embeddingSquares: 0 };
    const dimensions = queryEmbedding.length;

    let position = 0;
    while (position < dimensions) {
        const queryValue = queryEmbedding[position];
        const otherValue = embedding[position];

        totals.dot += queryValue * otherValue;
        totals.querySquares += queryValue ** 2;
        totals.embeddingSquares += otherValue ** 2;

        position += 1;
    }

    const magnitudeProduct = Math.sqrt(totals.querySquares) * Math.sqrt(totals.embeddingSquares);
    return totals.dot / magnitudeProduct;
}
298 | 
299 | // Expose a function to manually update embeddingsDict
300 | self.updateEmbeddingsDictManually = updateEmbeddingsDict;
301 | 
/**
 * Message dispatcher of the worker.
 *
 * Every incoming message carries a `type` that selects one branch of the switch
 * below; most branches answer with `self.postMessage`. Unknown types are ignored.
 *
 * @param {MessageEvent} event - `event.data` is the message object sent by the main thread.
 */
self.onmessage = async (event) => {
    const message = event.data;
    //console.log(message)
    let roundDecimals;
    let embeddingsAsArray;
    let exportDict;
    let gzippedData;
    let text;
    let embedding;

    // Dispatch on the message type
    switch (message.type) {
        // Debug helper: print the embeddings cache
        case 'logEmbeddingsDict':
            console.log(embeddingsDict);
            break
        // Reduce all cached embeddings with Barnes-Hut t-SNE and post the points
        // whose similarity to the current query reaches the requested threshold
        case 'tsne':
            const start = performance.now();
            const valuesFloat32Array = Array.from(Object.values(embeddingsDict));
            let valuesArray = convertFloat32ArraysToArrays(valuesFloat32Array);
            const valuesArrayLength = valuesArray.length;
            //console.log(valuesArrayLength);
            // Check if the length is below 61 to set perplexity to a different value, needs slight refactoring to 
            // get rid of this workaround

            let compressed_vectors;
            if (valuesArrayLength < 61) {
                // NOTE(review): valuesArray[0] is undefined when the cache is empty - confirm callers never request 'tsne' then
                const vectorLength = valuesArray[0].length; // Assuming all vectors have the same length
                const vectorsToAdd = 61 - valuesArrayLength;

                console.log("added: ", vectorsToAdd)
                // Add random vectors to the array
                for (let i = 0; i < vectorsToAdd; i++) {
                    const randomVector = Array.from({ length: vectorLength }, () => Math.random());
                    valuesArray.push(randomVector);
                }

                const tsne_encoder = new tSNE(valuesArray);
                // Keep only the results for the real vectors; the random padding is dropped again
                compressed_vectors = tsne_encoder.barnes_hut(message.data.iterations).slice(0, valuesArrayLength);//,theta=0.1);
            }
            else {
                const tsne_encoder = new tSNE(valuesArray);
                compressed_vectors = tsne_encoder.barnes_hut(message.data.iterations);

            }

            //console.log("Compressed Vectors:", compressed_vectors);
            const end = performance.now();
            console.log('BHtSNE Execution time:', Math.round(end - start), 'ms');

            //text = message.text;
            //embedding = await embed(text);

            const originalKeys = Object.keys(embeddingsDict);
            const originalEmbeddings = Object.values(embeddingsDict)

            // Assuming compressed_vectors is now an array of arrays
            let plotDataArray = [];

            for (let i = 0; i < originalKeys.length; i++) {
                let thisVec = compressed_vectors[i];
                // Similarity is computed against the module-level queryEmbedding
                let similarity = calculateCosineSimilarity(originalEmbeddings[i]);

                if (similarity >= message.data.dimensionalityReductionSimilarityThreshold) {
                    plotDataArray.push({ "x": thisVec[0], "y": thisVec[1], "label": originalKeys[i], "similarity": similarity });
                }
            }

            console.log(plotDataArray)

            //console.log(plotDataArray);

            //loadScatterplot(plotDataArray);

            self.postMessage({
                type: 'tsne',
                plotDataArray
            });
            break

        // Replace the embeddings cache with the data sent by the main thread
        case 'importEmbeddingsDict':
            embeddingsDict = message.data;
            break
        // Serialize the cache (rounded to meta.exportDecimals) together with the
        // metadata, text and mean embedding, gzip it and post it back
        case 'exportEmbeddingsDict':
            roundDecimals = (num) => parseFloat(num.toFixed(parseInt(message.data.meta.exportDecimals)));

            embeddingsAsArray = Object.fromEntries(
                Object.entries(embeddingsDict).map(([key, values]) => [key, Object.values(values).map(roundDecimals)])
            );

            const meanEmbedding = calculateAverageEmbedding(embeddingsAsArray)
            // adding mean embedding so all indexed docs on HF could be ingested in a "proper" vector DB!
            exportDict = {
                "meta": message.data.meta, "text": message.data.text,
                "index": embeddingsAsArray,
                "mean_embedding": meanEmbedding
            }

            // Record the number of exported chunks (this writes into the incoming meta object)
            exportDict.meta.chunks = Object.keys(embeddingsAsArray).length;

            console.log("Document average embedding", meanEmbedding);
            console.log("Metadata", exportDict.meta);

            gzippedData = pako.gzip(JSON.stringify(exportDict), { to: 'string' });

            // File name: title with whitespace replaced by underscores plus a random 8-char suffix
            const tempFilename = `${message.data.meta.textTitle.replace(/\s/g, '_')}_${minimalRandomEightCharHash()}.json.gz`
            // Send the gzipped data as a response
            self.postMessage({ type: 'embeddingsDict', data: gzippedData, filename: tempFilename });
            break;

        // Load the embedding model; download progress is forwarded as 'download' messages
        case 'load':
            embeddingsDict = {}; // clear dict
            tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick
            embedder = await pipeline('feature-extraction', message.model_name,
                {
                    quantized: message.quantized,
                    progress_callback: data => {
                        self.postMessage({
                            type: 'download',
                            data
                        });
                    }

                });
            break;
        // Load the summarization model; progress is forwarded as 'summary_download' messages
        case 'load_summary':
            summary_tokenizer = await AutoTokenizer.from_pretrained(message.model_name)
            summary_generator = await pipeline('summarization', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'summary_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            break;
        // Load a text2text-generation chat model; progress is forwarded as 'chat_download' messages
        case 'load_text2text-generation':
            console.log("loading chat");
            chat_model_name = message.model_name;
            chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick
            chat_generator = await pipeline('text2text-generation', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'chat_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            break;
        // Load a text-generation chat model; progress is forwarded as 'chat_download' messages
        case 'load_text-generation':
            console.log("loading chat");
            chat_model_name = message.model_name;
            chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name) // no progress callbacks -- assume its quick
            chat_generator = await pipeline('text-generation', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'chat_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            console.log("chat loaded");
            break;
        // Embed the query, remember it for similarity calculations and post it back
        case 'query':
            text = message.text;
            embedding = await embed(text);
            queryEmbedding = embedding;
            // Rebuild the placeholder vector with the size of the first cached embedding
            currentNullVector = [...Object.values(embeddingsDict)[0]].fill(0.00001);
            self.postMessage({
                type: 'query',
                embedding
            });
            break;
        // Embed one text; with inferencingActive false, uncached text gets the placeholder vector
        case 'similarity':
            text = message.text;
            embedding = await embed(text, message.inferencingActive);
            self.postMessage({
                type: 'similarity',
                text,
                embedding
            });
            break;
        // Tokenize a text with the embedding model's tokenizer
        case 'getTokens':
            text = message.text;
            self.postMessage({
                type: 'tokens',
                text,
                tokens: await getTokens(text)
            });
            break;
        // Summarize a text; summary() also posts 'summary' messages itself while generating
        case 'summary':
            text = message.text;
            let summary_text = await summary(text, message.max_new_tokens);
            self.postMessage({
                type: 'summary',
                summary_text
            });
            break;
        // Answer a prompt; chat() also posts 'chat' messages itself while generating
        case 'chat':
            text = message.text;
            let chat_text = await chat(text, message.max_new_tokens);
            self.postMessage({
                type: 'chat',
                chat_text
            });
            break;

        // Unknown message types are ignored
        default:
    }
};
518 | 
519 | 


--------------------------------------------------------------------------------
/src/models/model_miner.js:
--------------------------------------------------------------------------------
// Model mining script - necessary because huggingface.co does not allow requests from other domains, e.g. github.io.
// Execute this script once per sorter while on https://huggingface.co/models;
// it downloads the collected model list as a JSON file.

// Holds the last mined result so it can be inspected after the run
let out_json = {}
const sorter = "modified" // one of: likes, downloads, trending, modified
const pipeline_tag = "feature-extraction" // pipeline tag to query (other tags are possible)
// Name of the downloaded file, derived from pipeline tag and sorter
const fileName = `${pipeline_tag}_${sorter}.json`;
 9 | 
/**
 * Offer `jsonData` as a JSON file download in the browser.
 *
 * @param {*} jsonData - Value serialized with JSON.stringify.
 * @param {string} fileName - Suggested name of the downloaded file.
 */
function downloadJsonToFile(jsonData, fileName) {
  // Wrap the serialized data in a Blob and expose it through a temporary object URL
  const serialized = JSON.stringify(jsonData);
  const objectUrl = URL.createObjectURL(new Blob([serialized], { type: "application/json" }));

  // A programmatic click on a link with the `download` attribute starts the download
  const anchor = document.createElement("a");
  Object.assign(anchor, { href: objectUrl, download: fileName });
  anchor.click();

  // Release the temporary URL again
  URL.revokeObjectURL(objectUrl);
}
28 | 
/**
 * Fetch the first pages of the Hugging Face model listing for the configured
 * `pipeline_tag` / `sorter`, merge them into one result object, store it in
 * `out_json` and download it as `fileName`.
 *
 * Pages that fail to load (network error, non-OK status, unexpected payload)
 * are logged and skipped.
 *
 * @returns {Promise<void>}
 */
async function fetchAllPages() {
  const baseUrl = "https://huggingface.co/models-json";
  const commonParams = `?pipeline_tag=${pipeline_tag}&library=transformers.js&sort=${sorter}`;
  const numPages = 3; // Change this if you need more or fewer pages

  const models = [];

  for (let pageIndex = 0; pageIndex < numPages; pageIndex++) {
    const url = `${baseUrl}${commonParams}&p=${pageIndex}`;

    try {
      const response = await fetch(url);
      if (!response.ok) {
        // Surface HTTP errors explicitly instead of as a confusing JSON parse failure
        throw new Error(`HTTP ${response.status}`);
      }
      const data = await response.json();
      models.push(...data.models);
    } catch (error) {
      console.error(`Error fetching page ${pageIndex}: ${error}`);
    }
  }

  const result = {
    activeFilters: {
      // Mirror the tag that was actually queried; this used to be hard-coded to
      // "feature-extraction" and was wrong as soon as another tag was mined
      pipeline_tag: [pipeline_tag],
      library: ["transformers.js"],
      dataset: [],
      language: [],
      license: [],
      other: [],
    },
    models,
    numItemsPerPage: 30,
    numTotalItems: models.length,
    pageIndex: 0,
  };

  out_json = result;

  downloadJsonToFile(result, fileName);
}
67 | 
68 | fetchAllPages();
69 | 
70 | 


--------------------------------------------------------------------------------
/src/models/model_miner_simple.js:
--------------------------------------------------------------------------------
 1 | // simplified script for just downloading all models from the current HF page
 2 | // set the filters on HF and run it in the browser console
 3 | // e.g. go to https://huggingface.co/models?pipeline_tag=text2text-generation&library=transformers.js&sort=trending
 4 | 
 5 | const h4Elements = document.querySelectorAll("h4");
 6 | const h4TextArray = [];
 7 | 
 8 | h4Elements.forEach(element => {
 9 |   h4TextArray.push(element.textContent);
10 | });
11 | 
12 | console.log(h4TextArray);
13 | 
14 | //[
15 | //    "Xenova/t5-small",
16 | //    "Xenova/flan-t5-small",
17 | //    "Xenova/LaMini-Flan-T5-783M",
18 | //    "Xenova/LaMini-Flan-T5-248M",
19 | //    "Xenova/LaMini-Flan-T5-77M",
20 | //    "Xenova/LaMini-T5-61M",
21 | //    "Xenova/LaMini-T5-738M",
22 | //    "Xenova/LaMini-T5-223M",
23 | //    "Xenova/mt5-small",
24 | //    "Xenova/mt5-base",
25 | //    "Xenova/t5-base",
26 | //    "Xenova/t5-v1_1-base",
27 | //    "Xenova/flan-t5-base",
28 | //    "Xenova/t5-v1_1-small",
29 | //    "Xenova/blenderbot-400M-distill",
30 | //    "Xenova/blenderbot_small-90M",
31 | //    "Xenova/long-t5-tglobal-base",
32 | //    "Xenova/long-t5-local-base",
33 | //    "Xenova/long-t5-tglobal-base-16384-book-summary"
34 | //]


--------------------------------------------------------------------------------
/src/models/model_size_miner.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 25,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import requests\n",
 10 |     "from bs4 import BeautifulSoup\n",
 11 |     "import json\n",
 12 |     "\n",
 13 |     "# Load URLs from the JSON file\n",
 14 |     "with open('feature-extraction_downloads.json', 'r') as json_file:\n",
 15 |     "    data = json.load(json_file)\n",
 16 |     "    # urls = data.get('urls', [])\n",
 17 |     "ids = [i[\"id\"] for i in data[\"models\"]]\n",
 18 |     "urls = [f\"https://huggingface.co/{i}/tree/main/onnx\" for i in ids]"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 26,
 24 |    "metadata": {},
 25 |    "outputs": [
 26 |     {
 27 |      "data": {
 28 |       "text/plain": [
 29 |        "{'author': 'TaylorAI',\n",
 30 |        " 'authorData': {'avatarUrl': 'https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/63917e16b6b839bb61483dbf/Utq89ebo7Glxfls0QZnxK.png?w=200&h=200&f=face',\n",
 31 |        "  'fullname': 'Taylor',\n",
 32 |        "  'name': 'TaylorAI',\n",
 33 |        "  'type': 'org',\n",
 34 |        "  'isHf': False},\n",
 35 |        " 'downloads': 1752,\n",
 36 |        " 'gated': False,\n",
 37 |        " 'id': 'TaylorAI/gte-tiny',\n",
 38 |        " 'lastModified': '2023-10-07T05:20:49.000Z',\n",
 39 |        " 'likes': 102,\n",
 40 |        " 'pipeline_tag': 'sentence-similarity',\n",
 41 |        " 'private': False,\n",
 42 |        " 'repoType': 'model',\n",
 43 |        " 'isLikedByUser': False}"
 44 |       ]
 45 |      },
 46 |      "execution_count": 26,
 47 |      "metadata": {},
 48 |      "output_type": "execute_result"
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "data[\"models\"][0]"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 27,
 58 |    "metadata": {},
 59 |    "outputs": [
 60 |     {
 61 |      "name": "stdout",
 62 |      "output_type": "stream",
 63 |      "text": [
 64 |       "https://huggingface.co/TaylorAI/gte-tiny/tree/main/onnx | 22.9\n",
 65 |       "https://huggingface.co/Supabase/gte-small/tree/main/onnx | 34\n",
 66 |       "https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx | 23\n",
 67 |       "https://huggingface.co/Xenova/bge-large-en-v1.5/tree/main/onnx | 337\n",
 68 |       "https://huggingface.co/Supabase/bge-small-en/tree/main/onnx | 34\n",
 69 |       "https://huggingface.co/Xenova/gte-small/tree/main/onnx | 34\n",
 70 |       "https://huggingface.co/Xenova/all-mpnet-base-v2/tree/main/onnx | 110\n",
 71 |       "https://huggingface.co/Xenova/paraphrase-mpnet-base-v2/tree/main/onnx | 110\n",
 72 |       "https://huggingface.co/Xenova/all-MiniLM-L12-v2/tree/main/onnx | 34\n",
 73 |       "https://huggingface.co/Xenova/multilingual-e5-small/tree/main/onnx | 118\n",
 74 |       "https://huggingface.co/Xenova/gte-large/tree/main/onnx | 337\n",
 75 |       "https://huggingface.co/Xenova/bge-base-en-v1.5/tree/main/onnx | 110\n",
 76 |       "https://huggingface.co/Xenova/all-roberta-large-v1/tree/main/onnx | 357\n",
 77 |       "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v2/tree/main/onnx | 135\n",
 78 |       "https://huggingface.co/Xenova/paraphrase-multilingual-mpnet-base-v2/tree/main/onnx | 279\n",
 79 |       "https://huggingface.co/Xenova/bge-large-zh/tree/main/onnx | 327\n",
 80 |       "https://huggingface.co/Xenova/multilingual-e5-base/tree/main/onnx | 279\n",
 81 |       "https://huggingface.co/Xenova/bge-small-en-v1.5/tree/main/onnx | 34\n",
 82 |       "https://huggingface.co/Xenova/paraphrase-albert-small-v2/tree/main/onnx | 39.7\n",
 83 |       "https://huggingface.co/Xenova/paraphrase-albert-base-v2/tree/main/onnx | 40\n",
 84 |       "https://huggingface.co/Xenova/squeezebert-uncased/tree/main/onnx | 51.2\n",
 85 |       "https://huggingface.co/Xenova/squeezebert-mnli/tree/main/onnx | 51.3\n",
 86 |       "https://huggingface.co/Xenova/vit-base-patch16-224-in21k/tree/main/onnx | 87.5\n",
 87 |       "https://huggingface.co/Xenova/all-distilroberta-v1/tree/main/onnx | 82.1\n",
 88 |       "https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2/tree/main/onnx | 118\n",
 89 |       "https://huggingface.co/Xenova/paraphrase-MiniLM-L6-v2/tree/main/onnx | 23\n",
 90 |       "https://huggingface.co/Xenova/bert-base-nli-mean-tokens/tree/main/onnx | 110\n",
 91 |       "https://huggingface.co/Xenova/distilbert-base-nli-mean-tokens/tree/main/onnx | 66.9\n",
 92 |       "https://huggingface.co/Xenova/distilbert-base-nli-stsb-mean-tokens/tree/main/onnx | 66.9\n",
 93 |       "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v1/tree/main/onnx | 135\n",
 94 |       "https://huggingface.co/Xenova/msmarco-distilbert-base-v4/tree/main/onnx | 66.9\n",
 95 |       "https://huggingface.co/Xenova/multi-qa-MiniLM-L6-cos-v1/tree/main/onnx | 23\n",
 96 |       "https://huggingface.co/Xenova/multi-qa-distilbert-cos-v1/tree/main/onnx | 66.9\n",
 97 |       "https://huggingface.co/Xenova/multi-qa-mpnet-base-cos-v1/tree/main/onnx | 110\n",
 98 |       "https://huggingface.co/Xenova/multi-qa-mpnet-base-dot-v1/tree/main/onnx | 110\n",
 99 |       "https://huggingface.co/Xenova/nli-mpnet-base-v2/tree/main/onnx | 110\n",
100 |       "https://huggingface.co/Xenova/paraphrase-MiniLM-L3-v2/tree/main/onnx | 17.5\n",
101 |       "https://huggingface.co/Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens/tree/main/onnx | 279\n",
102 |       "https://huggingface.co/Xenova/dino-vitb16/tree/main/onnx | 87.5\n",
103 |       "https://huggingface.co/Xenova/dino-vits8/tree/main/onnx | 23.4\n",
104 |       "https://huggingface.co/Xenova/dino-vitb8/tree/main/onnx | 88.8\n",
105 |       "https://huggingface.co/Xenova/dino-vits16/tree/main/onnx | 22.7\n",
106 |       "https://huggingface.co/Xenova/scibert_scivocab_uncased/tree/main/onnx | 111\n",
107 |       "https://huggingface.co/Xenova/spanbert-large-cased/tree/main/onnx | 335\n",
108 |       "https://huggingface.co/Xenova/spanbert-base-cased/tree/main/onnx | 109\n",
109 |       "https://huggingface.co/sdan/simple-embeddings/tree/main/onnx | 23\n",
110 |       "https://huggingface.co/Xenova/sentence_bert/tree/main/onnx | 110\n",
111 |       "https://huggingface.co/Xenova/e5-small-v2/tree/main/onnx | 34\n",
112 |       "https://huggingface.co/Xenova/SapBERT-from-PubMedBERT-fulltext/tree/main/onnx | 110\n",
113 |       "https://huggingface.co/Xenova/indobert-base-p1/tree/main/onnx | 125\n",
114 |       "https://huggingface.co/Xenova/UMLSBert_ENG/tree/main/onnx | 110\n",
115 |       "https://huggingface.co/Xenova/rubert-base-cased/tree/main/onnx | 178\n",
116 |       "https://huggingface.co/Xenova/kobert/tree/main/onnx | 92.8\n",
117 |       "https://huggingface.co/Xenova/e5-small/tree/main/onnx | 34\n",
118 |       "https://huggingface.co/Xenova/e5-large/tree/main/onnx | 337\n",
119 |       "https://huggingface.co/Xenova/e5-large-v2/tree/main/onnx | 337\n",
120 |       "https://huggingface.co/Xenova/e5-base/tree/main/onnx | 110\n",
121 |       "https://huggingface.co/Xenova/e5-base-v2/tree/main/onnx | 110\n",
122 |       "https://huggingface.co/Xenova/instructor-base/tree/main/onnx | 110\n",
123 |       "https://huggingface.co/Xenova/instructor-large/tree/main/onnx | 337\n",
124 |       "https://huggingface.co/Xenova/sentence-t5-large/tree/main/onnx | 337\n",
125 |       "https://huggingface.co/Xenova/multilingual-e5-large/tree/main/onnx | 562\n",
126 |       "https://huggingface.co/Xenova/mms-300m/tree/main/onnx | 318\n",
127 |       "https://huggingface.co/Xenova/mms-1b/tree/main/onnx | 969\n",
128 |       "https://huggingface.co/Supabase/e5-small-v2/tree/main/onnx | 34\n",
129 |       "https://huggingface.co/Supabase/all-MiniLM-L6-v2/tree/main/onnx | 23\n",
130 |       "https://huggingface.co/Xenova/gte-base/tree/main/onnx | 110\n",
131 |       "https://huggingface.co/Xenova/bge-small-en/tree/main/onnx | 34\n",
132 |       "https://huggingface.co/Xenova/bge-base-en/tree/main/onnx | 110\n",
133 |       "https://huggingface.co/Xenova/bge-large-en/tree/main/onnx | 337\n",
134 |       "https://huggingface.co/ggrn/bge-small-en/tree/main/onnx | 34\n",
135 |       "https://huggingface.co/Xenova/bge-base-zh/tree/main/onnx | 103\n",
136 |       "https://huggingface.co/Xenova/bge-large-zh-noinstruct/tree/main/onnx | 327\n",
137 |       "https://huggingface.co/Xenova/bge-small-zh/tree/main/onnx | 24\n",
138 |       "https://huggingface.co/Xenova/ClinicalBERT/tree/main/onnx | 229\n",
139 |       "https://huggingface.co/Xenova/LaBSE/tree/main/onnx | 472\n",
140 |       "https://huggingface.co/Xenova/wavlm-base/tree/main/onnx | 95.8\n",
141 |       "https://huggingface.co/Xenova/wavlm-base-plus/tree/main/onnx | 95.8\n",
142 |       "https://huggingface.co/Xenova/wavlm-large/tree/main/onnx | 319\n",
143 |       "https://huggingface.co/Xenova/sentence-camembert-large/tree/main/onnx | 339\n",
144 |       "https://huggingface.co/Xenova/herbert-base-cased/tree/main/onnx | 125\n",
145 |       "https://huggingface.co/Xenova/herbert-large-cased/tree/main/onnx | 357\n",
146 |       "https://huggingface.co/Xenova/bge-large-zh-v1.5/tree/main/onnx | 327\n",
147 |       "https://huggingface.co/Xenova/bge-base-zh-v1.5/tree/main/onnx | 103\n",
148 |       "https://huggingface.co/Xenova/bge-small-zh-v1.5/tree/main/onnx | 24\n",
149 |       "https://huggingface.co/leolee9086/text2vec-base-chinese/tree/main/onnx | 103\n",
150 |       "https://huggingface.co/Xenova/long-t5-encodec-tglobal-base/tree/main/onnx | 291\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "def extract_size_from_url(url):\n",
156 |     "    try:\n",
157 |     "        response = requests.get(url)\n",
158 |     "        if response.status_code == 200:\n",
159 |     "            soup = BeautifulSoup(response.text, 'html.parser')\n",
160 |     "            \n",
161 |     "            # Find the 'a' tag with the specified title attribute\n",
162 |     "            a_tag = soup.find('a', title=\"Download file\")\n",
163 |     "            if a_tag:\n",
164 |     "                size = a_tag.text.strip()  # Extract the size text\n",
165 |     "                return size\n",
166 |     "            else:\n",
167 |     "                return \"Size not found\"\n",
168 |     "        else:\n",
169 |     "            return \"Failed to retrieve the page\"\n",
170 |     "    except requests.exceptions.RequestException as e:\n",
171 |     "        return f\"Request error: {e}\"\n",
172 |     "\n",
173 |     "# Iterate through the URLs and extract values\n",
174 |     "sizes = []\n",
175 |     "\n",
176 |     "for url in urls:\n",
177 |     "    values = extract_values_from_url(url)[-1].split(\" MB\")[0]\n",
178 |     "    sizes.append([url.split(\"https://huggingface.co/\")[1].split(\"/tree/main/onnx\")[0],values])\n",
179 |     "    print(f\"{url} | {values}\")\n"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 28,
185 |    "metadata": {},
186 |    "outputs": [
187 |     {
188 |      "data": {
189 |       "text/plain": [
190 |        "[['TaylorAI/gte-tiny', '22.9'],\n",
191 |        " ['Supabase/gte-small', '34'],\n",
192 |        " ['Xenova/all-MiniLM-L6-v2', '23'],\n",
193 |        " ['Xenova/bge-large-en-v1.5', '337'],\n",
194 |        " ['Supabase/bge-small-en', '34'],\n",
195 |        " ['Xenova/gte-small', '34'],\n",
196 |        " ['Xenova/all-mpnet-base-v2', '110'],\n",
197 |        " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n",
198 |        " ['Xenova/all-MiniLM-L12-v2', '34'],\n",
199 |        " ['Xenova/multilingual-e5-small', '118'],\n",
200 |        " ['Xenova/gte-large', '337'],\n",
201 |        " ['Xenova/bge-base-en-v1.5', '110'],\n",
202 |        " ['Xenova/all-roberta-large-v1', '357'],\n",
203 |        " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n",
204 |        " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n",
205 |        " ['Xenova/bge-large-zh', '327'],\n",
206 |        " ['Xenova/multilingual-e5-base', '279'],\n",
207 |        " ['Xenova/bge-small-en-v1.5', '34'],\n",
208 |        " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n",
209 |        " ['Xenova/paraphrase-albert-base-v2', '40'],\n",
210 |        " ['Xenova/squeezebert-uncased', '51.2'],\n",
211 |        " ['Xenova/squeezebert-mnli', '51.3'],\n",
212 |        " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n",
213 |        " ['Xenova/all-distilroberta-v1', '82.1'],\n",
214 |        " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n",
215 |        " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n",
216 |        " ['Xenova/bert-base-nli-mean-tokens', '110'],\n",
217 |        " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n",
218 |        " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n",
219 |        " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n",
220 |        " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n",
221 |        " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n",
222 |        " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n",
223 |        " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n",
224 |        " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n",
225 |        " ['Xenova/nli-mpnet-base-v2', '110'],\n",
226 |        " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n",
227 |        " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n",
228 |        " ['Xenova/dino-vitb16', '87.5'],\n",
229 |        " ['Xenova/dino-vits8', '23.4'],\n",
230 |        " ['Xenova/dino-vitb8', '88.8'],\n",
231 |        " ['Xenova/dino-vits16', '22.7'],\n",
232 |        " ['Xenova/scibert_scivocab_uncased', '111'],\n",
233 |        " ['Xenova/spanbert-large-cased', '335'],\n",
234 |        " ['Xenova/spanbert-base-cased', '109'],\n",
235 |        " ['sdan/simple-embeddings', '23'],\n",
236 |        " ['Xenova/sentence_bert', '110'],\n",
237 |        " ['Xenova/e5-small-v2', '34'],\n",
238 |        " ['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n",
239 |        " ['Xenova/indobert-base-p1', '125'],\n",
240 |        " ['Xenova/UMLSBert_ENG', '110'],\n",
241 |        " ['Xenova/rubert-base-cased', '178'],\n",
242 |        " ['Xenova/kobert', '92.8'],\n",
243 |        " ['Xenova/e5-small', '34'],\n",
244 |        " ['Xenova/e5-large', '337'],\n",
245 |        " ['Xenova/e5-large-v2', '337'],\n",
246 |        " ['Xenova/e5-base', '110'],\n",
247 |        " ['Xenova/e5-base-v2', '110'],\n",
248 |        " ['Xenova/instructor-base', '110'],\n",
249 |        " ['Xenova/instructor-large', '337'],\n",
250 |        " ['Xenova/sentence-t5-large', '337'],\n",
251 |        " ['Xenova/multilingual-e5-large', '562'],\n",
252 |        " ['Xenova/mms-300m', '318'],\n",
253 |        " ['Xenova/mms-1b', '969'],\n",
254 |        " ['Supabase/e5-small-v2', '34'],\n",
255 |        " ['Supabase/all-MiniLM-L6-v2', '23'],\n",
256 |        " ['Xenova/gte-base', '110'],\n",
257 |        " ['Xenova/bge-small-en', '34'],\n",
258 |        " ['Xenova/bge-base-en', '110'],\n",
259 |        " ['Xenova/bge-large-en', '337'],\n",
260 |        " ['ggrn/bge-small-en', '34'],\n",
261 |        " ['Xenova/bge-base-zh', '103'],\n",
262 |        " ['Xenova/bge-large-zh-noinstruct', '327'],\n",
263 |        " ['Xenova/bge-small-zh', '24'],\n",
264 |        " ['Xenova/ClinicalBERT', '229'],\n",
265 |        " ['Xenova/LaBSE', '472'],\n",
266 |        " ['Xenova/wavlm-base', '95.8'],\n",
267 |        " ['Xenova/wavlm-base-plus', '95.8'],\n",
268 |        " ['Xenova/wavlm-large', '319'],\n",
269 |        " ['Xenova/sentence-camembert-large', '339'],\n",
270 |        " ['Xenova/herbert-base-cased', '125'],\n",
271 |        " ['Xenova/herbert-large-cased', '357'],\n",
272 |        " ['Xenova/bge-large-zh-v1.5', '327'],\n",
273 |        " ['Xenova/bge-base-zh-v1.5', '103'],\n",
274 |        " ['Xenova/bge-small-zh-v1.5', '24'],\n",
275 |        " ['leolee9086/text2vec-base-chinese', '103'],\n",
276 |        " ['Xenova/long-t5-encodec-tglobal-base', '291']]"
277 |       ]
278 |      },
279 |      "execution_count": 28,
280 |      "metadata": {},
281 |      "output_type": "execute_result"
282 |     }
283 |    ],
284 |    "source": [
285 |     "sizes"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 29,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "sizes_backup = [['TaylorAI/gte-tiny', '22.9'],\n",
295 |     " ['Supabase/gte-small', '34'],\n",
296 |     " ['Xenova/all-MiniLM-L6-v2', '23'],\n",
297 |     " ['Xenova/bge-large-en-v1.5', '337'],\n",
298 |     " ['Supabase/bge-small-en', '34'],\n",
299 |     " ['Xenova/gte-small', '34'],\n",
300 |     " ['Xenova/all-mpnet-base-v2', '110'],\n",
301 |     " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n",
302 |     " ['Xenova/all-MiniLM-L12-v2', '34'],\n",
303 |     " ['Xenova/multilingual-e5-small', '118'],\n",
304 |     " ['Xenova/gte-large', '337'],\n",
305 |     " ['Xenova/bge-base-en-v1.5', '110'],\n",
306 |     " ['Xenova/all-roberta-large-v1', '357'],\n",
307 |     " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n",
308 |     " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n",
309 |     " ['Xenova/bge-large-zh', '327'],\n",
310 |     " ['Xenova/multilingual-e5-base', '279'],\n",
311 |     " ['Xenova/bge-small-en-v1.5', '34'],\n",
312 |     " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n",
313 |     " ['Xenova/paraphrase-albert-base-v2', '40'],\n",
314 |     " ['Xenova/squeezebert-uncased', '51.2'],\n",
315 |     " ['Xenova/squeezebert-mnli', '51.3'],\n",
316 |     " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n",
317 |     " ['Xenova/all-distilroberta-v1', '82.1'],\n",
318 |     " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n",
319 |     " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n",
320 |     " ['Xenova/bert-base-nli-mean-tokens', '110'],\n",
321 |     " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n",
322 |     " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n",
323 |     " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n",
324 |     " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n",
325 |     " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n",
326 |     " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n",
327 |     " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n",
328 |     " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n",
329 |     " ['Xenova/nli-mpnet-base-v2', '110'],\n",
330 |     " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n",
331 |     " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n",
332 |     " ['Xenova/dino-vitb16', '87.5'],\n",
333 |     " ['Xenova/dino-vits8', '23.4'],\n",
334 |     " ['Xenova/dino-vitb8', '88.8'],\n",
335 |     " ['Xenova/dino-vits16', '22.7'],\n",
336 |     " ['Xenova/scibert_scivocab_uncased', '111'],\n",
337 |     " ['Xenova/spanbert-large-cased', '335'],\n",
338 |     " ['Xenova/spanbert-base-cased', '109'],\n",
339 |     " ['sdan/simple-embeddings', '23'],\n",
340 |     " ['Xenova/sentence_bert', '110'],\n",
341 |     " ['Xenova/e5-small-v2', '34'],\n",
342 |     " ['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n",
343 |     " ['Xenova/indobert-base-p1', '125'],\n",
344 |     " ['Xenova/UMLSBert_ENG', '110'],\n",
345 |     " ['Xenova/rubert-base-cased', '178'],\n",
346 |     " ['Xenova/kobert', '92.8'],\n",
347 |     " ['Xenova/e5-small', '34'],\n",
348 |     " ['Xenova/e5-large', '337'],\n",
349 |     " ['Xenova/e5-large-v2', '337'],\n",
350 |     " ['Xenova/e5-base', '110'],\n",
351 |     " ['Xenova/e5-base-v2', '110'],\n",
352 |     " ['Xenova/instructor-base', '110'],\n",
353 |     " ['Xenova/instructor-large', '337'],\n",
354 |     " ['Xenova/sentence-t5-large', '337'],\n",
355 |     " ['Xenova/multilingual-e5-large', '562'],\n",
356 |     " ['Xenova/mms-300m', '318'],\n",
357 |     " ['Xenova/mms-1b', '969'],\n",
358 |     " ['Supabase/e5-small-v2', '34'],\n",
359 |     " ['Supabase/all-MiniLM-L6-v2', '23'],\n",
360 |     " ['Xenova/gte-base', '110'],\n",
361 |     " ['Xenova/bge-small-en', '34'],\n",
362 |     " ['Xenova/bge-base-en', '110'],\n",
363 |     " ['Xenova/bge-large-en', '337'],\n",
364 |     " ['ggrn/bge-small-en', '34'],\n",
365 |     " ['Xenova/bge-base-zh', '103'],\n",
366 |     " ['Xenova/bge-large-zh-noinstruct', '327'],\n",
367 |     " ['Xenova/bge-small-zh', '24'],\n",
368 |     " ['Xenova/ClinicalBERT', '229'],\n",
369 |     " ['Xenova/LaBSE', '472'],\n",
370 |     " ['Xenova/wavlm-base', '95.8'],\n",
371 |     " ['Xenova/wavlm-base-plus', '95.8'],\n",
372 |     " ['Xenova/wavlm-large', '319'],\n",
373 |     " ['Xenova/sentence-camembert-large', '339'],\n",
374 |     " ['Xenova/herbert-base-cased', '125'],\n",
375 |     " ['Xenova/herbert-large-cased', '357'],\n",
376 |     " ['Xenova/bge-large-zh-v1.5', '327'],\n",
377 |     " ['Xenova/bge-base-zh-v1.5', '103'],\n",
378 |     " ['Xenova/bge-small-zh-v1.5', '24'],\n",
379 |     " ['leolee9086/text2vec-base-chinese', '103'],\n",
380 |     " ['Xenova/long-t5-encodec-tglobal-base', '291']]"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 32,
386 |    "metadata": {},
387 |    "outputs": [
388 |     {
389 |      "data": {
390 |       "text/plain": [
391 |        "{'TaylorAI/gte-tiny': '22.9',\n",
392 |        " 'Supabase/gte-small': '34',\n",
393 |        " 'Xenova/all-MiniLM-L6-v2': '23',\n",
394 |        " 'Xenova/bge-large-en-v1.5': '337',\n",
395 |        " 'Supabase/bge-small-en': '34',\n",
396 |        " 'Xenova/gte-small': '34',\n",
397 |        " 'Xenova/all-mpnet-base-v2': '110',\n",
398 |        " 'Xenova/paraphrase-mpnet-base-v2': '110',\n",
399 |        " 'Xenova/all-MiniLM-L12-v2': '34',\n",
400 |        " 'Xenova/multilingual-e5-small': '118',\n",
401 |        " 'Xenova/gte-large': '337',\n",
402 |        " 'Xenova/bge-base-en-v1.5': '110',\n",
403 |        " 'Xenova/all-roberta-large-v1': '357',\n",
404 |        " 'Xenova/distiluse-base-multilingual-cased-v2': '135',\n",
405 |        " 'Xenova/paraphrase-multilingual-mpnet-base-v2': '279',\n",
406 |        " 'Xenova/bge-large-zh': '327',\n",
407 |        " 'Xenova/multilingual-e5-base': '279',\n",
408 |        " 'Xenova/bge-small-en-v1.5': '34',\n",
409 |        " 'Xenova/paraphrase-albert-small-v2': '39.7',\n",
410 |        " 'Xenova/paraphrase-albert-base-v2': '40',\n",
411 |        " 'Xenova/squeezebert-uncased': '51.2',\n",
412 |        " 'Xenova/squeezebert-mnli': '51.3',\n",
413 |        " 'Xenova/vit-base-patch16-224-in21k': '87.5',\n",
414 |        " 'Xenova/all-distilroberta-v1': '82.1',\n",
415 |        " 'Xenova/paraphrase-multilingual-MiniLM-L12-v2': '118',\n",
416 |        " 'Xenova/paraphrase-MiniLM-L6-v2': '23',\n",
417 |        " 'Xenova/bert-base-nli-mean-tokens': '110',\n",
418 |        " 'Xenova/distilbert-base-nli-mean-tokens': '66.9',\n",
419 |        " 'Xenova/distilbert-base-nli-stsb-mean-tokens': '66.9',\n",
420 |        " 'Xenova/distiluse-base-multilingual-cased-v1': '135',\n",
421 |        " 'Xenova/msmarco-distilbert-base-v4': '66.9',\n",
422 |        " 'Xenova/multi-qa-MiniLM-L6-cos-v1': '23',\n",
423 |        " 'Xenova/multi-qa-distilbert-cos-v1': '66.9',\n",
424 |        " 'Xenova/multi-qa-mpnet-base-cos-v1': '110',\n",
425 |        " 'Xenova/multi-qa-mpnet-base-dot-v1': '110',\n",
426 |        " 'Xenova/nli-mpnet-base-v2': '110',\n",
427 |        " 'Xenova/paraphrase-MiniLM-L3-v2': '17.5',\n",
428 |        " 'Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens': '279',\n",
429 |        " 'Xenova/dino-vitb16': '87.5',\n",
430 |        " 'Xenova/dino-vits8': '23.4',\n",
431 |        " 'Xenova/dino-vitb8': '88.8',\n",
432 |        " 'Xenova/dino-vits16': '22.7',\n",
433 |        " 'Xenova/scibert_scivocab_uncased': '111',\n",
434 |        " 'Xenova/spanbert-large-cased': '335',\n",
435 |        " 'Xenova/spanbert-base-cased': '109',\n",
436 |        " 'sdan/simple-embeddings': '23',\n",
437 |        " 'Xenova/sentence_bert': '110',\n",
438 |        " 'Xenova/e5-small-v2': '34',\n",
439 |        " 'Xenova/SapBERT-from-PubMedBERT-fulltext': '110',\n",
440 |        " 'Xenova/indobert-base-p1': '125',\n",
441 |        " 'Xenova/UMLSBert_ENG': '110',\n",
442 |        " 'Xenova/rubert-base-cased': '178',\n",
443 |        " 'Xenova/kobert': '92.8',\n",
444 |        " 'Xenova/e5-small': '34',\n",
445 |        " 'Xenova/e5-large': '337',\n",
446 |        " 'Xenova/e5-large-v2': '337',\n",
447 |        " 'Xenova/e5-base': '110',\n",
448 |        " 'Xenova/e5-base-v2': '110',\n",
449 |        " 'Xenova/instructor-base': '110',\n",
450 |        " 'Xenova/instructor-large': '337',\n",
451 |        " 'Xenova/sentence-t5-large': '337',\n",
452 |        " 'Xenova/multilingual-e5-large': '562',\n",
453 |        " 'Xenova/mms-300m': '318',\n",
454 |        " 'Xenova/mms-1b': '969',\n",
455 |        " 'Supabase/e5-small-v2': '34',\n",
456 |        " 'Supabase/all-MiniLM-L6-v2': '23',\n",
457 |        " 'Xenova/gte-base': '110',\n",
458 |        " 'Xenova/bge-small-en': '34',\n",
459 |        " 'Xenova/bge-base-en': '110',\n",
460 |        " 'Xenova/bge-large-en': '337',\n",
461 |        " 'ggrn/bge-small-en': '34',\n",
462 |        " 'Xenova/bge-base-zh': '103',\n",
463 |        " 'Xenova/bge-large-zh-noinstruct': '327',\n",
464 |        " 'Xenova/bge-small-zh': '24',\n",
465 |        " 'Xenova/ClinicalBERT': '229',\n",
466 |        " 'Xenova/LaBSE': '472',\n",
467 |        " 'Xenova/wavlm-base': '95.8',\n",
468 |        " 'Xenova/wavlm-base-plus': '95.8',\n",
469 |        " 'Xenova/wavlm-large': '319',\n",
470 |        " 'Xenova/sentence-camembert-large': '339',\n",
471 |        " 'Xenova/herbert-base-cased': '125',\n",
472 |        " 'Xenova/herbert-large-cased': '357',\n",
473 |        " 'Xenova/bge-large-zh-v1.5': '327',\n",
474 |        " 'Xenova/bge-base-zh-v1.5': '103',\n",
475 |        " 'Xenova/bge-small-zh-v1.5': '24',\n",
476 |        " 'leolee9086/text2vec-base-chinese': '103',\n",
477 |        " 'Xenova/long-t5-encodec-tglobal-base': '291'}"
478 |       ]
479 |      },
480 |      "execution_count": 32,
481 |      "metadata": {},
482 |      "output_type": "execute_result"
483 |     }
484 |    ],
485 |    "source": [
486 |     "# Create a dictionary to easily look up sizes by id\n",
487 |     "size_dict = dict(sizes)\n",
488 |     "size_dict"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "code",
493 |    "execution_count": 36,
494 |    "metadata": {},
495 |    "outputs": [
496 |     {
497 |      "name": "stdout",
498 |      "output_type": "stream",
499 |      "text": [
500 |       "JSON updated and saved to 'your_output.json'\n"
501 |      ]
502 |     }
503 |    ],
504 |    "source": [
505 |     "import json\n",
506 |     "\n",
507 |     "this_file = \"feature-extraction_trending\"\n",
508 |     "\n",
509 |     "with open(f\"{this_file}.json\", 'r') as json_file:\n",
510 |     "    data = json.load(json_file)\n",
511 |     "\n",
512 |     "# Iterate over the \"models\" in your JSON\n",
513 |     "for model in data[\"models\"]:\n",
514 |     "    model_id = model[\"id\"]\n",
515 |     "    if model_id in size_dict:\n",
516 |     "        model[\"model_size\"] = size_dict[model_id]\n",
517 |     "\n",
518 |     "# Save the updated JSON to a file\n",
519 |     "with open(f'{this_file}_sizes.json', 'w') as file:\n",
520 |     "    json.dump(data, file, indent=4)\n"
521 |    ]
522 |   }
523 |  ],
524 |  "metadata": {
525 |   "kernelspec": {
526 |    "display_name": "py3.11",
527 |    "language": "python",
528 |    "name": "python3"
529 |   },
530 |   "language_info": {
531 |    "codemirror_mode": {
532 |     "name": "ipython",
533 |     "version": 3
534 |    },
535 |    "file_extension": ".py",
536 |    "mimetype": "text/x-python",
537 |    "name": "python",
538 |    "nbconvert_exporter": "python",
539 |    "pygments_lexer": "ipython3",
540 |    "version": "3.11.0"
541 |   }
542 |  },
543 |  "nbformat": 4,
544 |  "nbformat_minor": 2
545 | }
546 | 


--------------------------------------------------------------------------------
/src/models/text2text_downloads.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023
-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag
":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedBy
User":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"a
vatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
2 | 


--------------------------------------------------------------------------------
/src/models/text2text_likes.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.
000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","pr
ivate":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author
":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://
aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}


--------------------------------------------------------------------------------
/src/models/text2text_modified.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","l
astModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes"
:0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","p
rivate":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xeno
va","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}


--------------------------------------------------------------------------------
/src/models/text2text_trending.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z"
,"likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation",
"private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xen
ova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://
aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
2 | 


--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
 1 | const path = require('path');
 2 | const HtmlWebpackPlugin = require('html-webpack-plugin');
 3 | const MiniCssExtractPlugin = require('mini-css-extract-plugin'); // extracts CSS into its own file; used here to avoid a flash of unstyled content (FOUC)
 4 | const FaviconsWebpackPlugin = require('favicons-webpack-plugin');
 5 | const CopyWebpackPlugin = require('copy-webpack-plugin'); 
 6 | 
 7 | module.exports = { // Webpack configuration: bundles src/js/index.js into dist/bundle.js
 8 |   entry: './src/js/index.js',
 9 |   mode: 'development', // NOTE(review): mode is hard-coded here; presumably overridden on the CLI for production builds (confirm)
10 |   output: {
11 |     filename: 'bundle.js',
12 |     path: path.resolve(__dirname, 'dist'), // absolute path of the dist/ folder next to this config file
13 |     clean: true // empty the output directory before each build
14 |   },
15 |   module: {
16 |     rules: [
17 |       {
18 |         test: /\.css$/,
19 |         use: [MiniCssExtractPlugin.loader, 'css-loader'], // loaders run right to left: css-loader first, then the extract loader
20 | 
21 |       },
22 |       {
23 |         test: /\.svg$/,
24 |         type: 'asset/resource', // emit each SVG as a separate file
25 |         generator: {
26 |           filename: '[name][ext]' // keep the original file name and extension (no content hash)
27 |         }
28 |       },
29 |     ],
30 |   },
31 |   plugins: [
32 |     new HtmlWebpackPlugin({
33 |       template: './index.html', // generate the output HTML page from this template
34 |     }),
35 |     new MiniCssExtractPlugin(),
36 |     new FaviconsWebpackPlugin(), // no options passed, so plugin defaults apply; NOTE(review): presumably picks up ./logo.png (confirm)
37 |     new CopyWebpackPlugin({
38 |       patterns: [
39 |         {
40 |           from: 'src/models/**/*_sizes.json', // only the *_sizes.json files under src/models, at any depth
41 |           to: 'models/[name][ext]' // copied flat into models/ inside the output directory, keeping name and extension
42 |         },
43 |       ],
44 |     }),
45 |   ],
46 | };
47 | 


--------------------------------------------------------------------------------