├── .eslintrc.js ├── .github └── workflows │ └── deploy.yml ├── .gitignore ├── LICENSE ├── README.md ├── SemanticFinder.gif ├── SemanticFinder_Chrome_Extension.gif ├── SemanticFinder_Chrome_Extension_en.zip ├── SemanticFinder_gource.png ├── extension ├── .gitignore ├── README.md ├── package-lock.json ├── package.json ├── public │ ├── icons │ │ ├── logo128.png │ │ └── logo48.png │ └── manifest.json ├── src │ ├── content │ │ ├── content.css │ │ └── content.js │ ├── options │ │ ├── options.css │ │ ├── options.html │ │ └── options.js │ ├── popup │ │ ├── AnimatedInput.vue │ │ ├── popup.css │ │ ├── popup.html │ │ ├── popup.js │ │ ├── popup.vue │ │ └── result.vue │ ├── serviceworkers │ │ ├── background.js │ │ ├── pdf.js │ │ ├── pdf.sandbox.js │ │ ├── pdf.worker.entry.js │ │ ├── pdf.worker.js │ │ └── semantic.js │ └── utils │ │ ├── cache.js │ │ └── utils.js └── webpack.config.js ├── index.html ├── jsconfig.json ├── logo.png ├── misc ├── Generate_large_textfile_from_books.ipynb └── README.md ├── package-lock.json ├── package.json ├── src ├── css │ └── styles.css ├── js │ ├── SemanticFinder.svg │ ├── index.js │ ├── semantic.js │ ├── utils.js │ └── worker.js └── models │ ├── feature-extraction_downloads.json │ ├── feature-extraction_downloads_sizes.json │ ├── feature-extraction_likes.json │ ├── feature-extraction_likes_sizes.json │ ├── feature-extraction_modified.json │ ├── feature-extraction_modified_sizes.json │ ├── feature-extraction_trending.json │ ├── feature-extraction_trending_sizes.json │ ├── model_miner.js │ ├── model_miner_simple.js │ ├── model_size_miner.ipynb │ ├── text2text_downloads.json │ ├── text2text_likes.json │ ├── text2text_modified.json │ └── text2text_trending.json └── webpack.config.js /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | browser: true, 4 | es2021: true 5 | }, 6 | extends: 'standard', 7 | overrides: [ 8 | { 9 | env: { 10 | node: true 11 | }, 12 | files: [ 13 
| '.eslintrc.{js,cjs}' 14 | ], 15 | parserOptions: { 16 | sourceType: 'script' 17 | } 18 | } 19 | ], 20 | parserOptions: { 21 | ecmaVersion: 'latest', 22 | sourceType: 'module' 23 | }, 24 | rules: { 25 | indent: ['error', 4], 26 | 'space-before-function-paren': ['error', 'never'], 27 | semi: ['error', 'always'] 28 | } 29 | }; 30 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - webgpu 8 | 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | # Checkout and Deploy Main Branch 15 | - name: Checkout main branch 16 | uses: actions/checkout@v3 17 | with: 18 | ref: main 19 | 20 | - name: Set up Node.js for main 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: '22' 24 | cache: 'npm' 25 | 26 | - name: Install dependencies for main 27 | run: npm install 28 | 29 | # Build and deploy main branch 30 | - name: Build and deploy main 31 | run: | 32 | npm run build 33 | mkdir -p main_build 34 | mv dist/* main_build/ 35 | echo "Deploying main branch..." 36 | 37 | - name: Deploy main to GitHub Pages 38 | uses: peaceiris/actions-gh-pages@v3 39 | with: 40 | github_token: ${{ secrets.GITHUB_TOKEN }} 41 | publish_dir: ./main_build # Deploy from your custom directory 42 | 43 | # Checkout and Deploy webgpu Branch 44 | - name: Checkout webgpu branch 45 | uses: actions/checkout@v3 46 | with: 47 | ref: webgpu 48 | 49 | - name: Set up Node.js for webgpu 50 | uses: actions/setup-node@v3 51 | with: 52 | node-version: '22' 53 | cache: 'npm' 54 | 55 | - name: Install dependencies for webgpu 56 | run: npm install 57 | 58 | # Build and deploy webgpu branch 59 | - name: Build and deploy webgpu 60 | run: | 61 | npm run build 62 | mkdir -p webgpu_build 63 | mv dist/* webgpu_build/ 64 | echo "Deploying webgpu branch..." 
65 | 66 | - name: Deploy webgpu to GitHub Pages subdirectory 67 | uses: peaceiris/actions-gh-pages@v3 68 | with: 69 | github_token: ${{ secrets.GITHUB_TOKEN }} 70 | publish_dir: ./webgpu_build 71 | destination_dir: webgpu 72 | keep_files: true 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .vscode/ 3 | .idea/ 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 do-me 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | SemanticFinder 4 | 5 |

Frontend-only live semantic search and chat-with-your-documents built on transformers.js. Supports Wasm and WebGPU!

6 |

7 | 8 | ![](/SemanticFinder.gif?) 9 | 10 | ## [Try the web app](https://do-me.github.io/SemanticFinder/), [install the Chrome extension](#browser-extension) or read the [introduction blog post](https://geo.rocks/post/semanticfinder-semantic-search-frontend-only/). 11 | 12 | ## 🔥 For best performance try the [WebGPU Version here!](https://do-me.github.io/SemanticFinder/webgpu/) 🔥 13 | 14 | Semantic search right in your browser! Calculates the embeddings and cosine similarity client-side without server-side inferencing, using [transformers.js](https://xenova.github.io/transformers.js/) and latest SOTA embedding models from Huggingface. 15 | 16 | ## Intro Video 17 | [![SemanticFinder Introduction](https://github.com/user-attachments/assets/9febc0e7-f444-4039-8cf2-af39f3d7733f)](https://www.youtube.com/watch?v=FZsWH1J4MXo "Get started with semantic search in the browser") 18 | 19 | ## Models 20 | All transformers.js-compatible feature-extraction models are supported. Here is a sortable list you can go through: [daily updated list](https://do-me.github.io/trending-huggingface-models/). Download the compatible models table as xlsx, csv, json, parquet, or html here: https://github.com/do-me/trending-huggingface-models/. 21 | Note that the wasm backend in transformers.js supports all mentioned models. If you want the best performance, make sure to use a WebGPU-compatible model. 22 | 23 | ## Catalogue 24 | You can use super fast pre-indexed examples for *really* large books like the Bible or Les Misérables with hundreds of pages and search the content in less than 2 seconds 🚀. 
Try one of these and convince yourself: 25 | 26 | | filesize | textTitle | textAuthor | textYear | textLanguage | URL | modelName | quantized | splitParam | splitType | characters | chunks | wordsToAvoidAll | wordsToCheckAll | wordsToAvoidAny | wordsToCheckAny | exportDecimals | lines | textNotes | textSourceURL | filename | 27 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 28 | | 4.78 | Das Kapital | Karl Marx | 1867 | de | https://do-me.github.io/SemanticFinder/?hf=Das_Kapital_c1a84fba | Xenova/multilingual-e5-small | True | 80 | Words | 2003807 | 3164 | | | | | 5 | 28673 | | https://ia601605.us.archive.org/13/items/KarlMarxDasKapitalpdf/KAPITAL1.pdf | Das_Kapital_c1a84fba.json.gz | 29 | | 2.58 | Divina Commedia | Dante | 1321 | it | https://do-me.github.io/SemanticFinder/?hf=Divina_Commedia_d5a0fa67 | Xenova/multilingual-e5-base | True | 50 | Words | 383782 | 1179 | | | | | 5 | 6225 | | http://www.letteratura-italiana.com/pdf/divina%20commedia/08%20Inferno%20in%20versione%20italiana.pdf | Divina_Commedia_d5a0fa67.json.gz | 30 | | 11.92 | Don Quijote | Miguel de Cervantes | 1605 | es | https://do-me.github.io/SemanticFinder/?hf=Don_Quijote_14a0b44 | Xenova/multilingual-e5-base | True | 25 | Words | 1047150 | 7186 | | | | | 4 | 12005 | | https://parnaseo.uv.es/lemir/revista/revista19/textos/quijote_1.pdf | Don_Quijote_14a0b44.json.gz | 31 | | 0.06 | Hansel and Gretel | Brothers Grimm | 1812 | en | https://do-me.github.io/SemanticFinder/?hf=Hansel_and_Gretel_4de079eb | TaylorAI/gte-tiny | True | 100 | Chars | 5304 | 55 | | | | | 5 | 9 | | https://www.grimmstories.com/en/grimm_fairy-tales/hansel_and_gretel | Hansel_and_Gretel_4de079eb.json.gz | 32 | | 1.74 | IPCC Report 2023 | IPCC | 2023 | en | https://do-me.github.io/SemanticFinder/?hf=IPCC_Report_2023_2b260928 | Supabase/bge-small-en | True | 200 | Chars | 307811 | 1566 | | | | | 5 | 3230 | state of knowledge of climate change | 
https://report.ipcc.ch/ar6syr/pdf/IPCC_AR6_SYR_LongerReport.pdf | IPCC_Report_2023_2b260928.json.gz | 33 | | 25.56 | King James Bible | | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_24f6dc4c | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 | | | | | 5 | 80496 | | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_24f6dc4c.json.gz | 34 | | 11.45 | King James Bible | | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_6434a78d | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 | | | | | 2 | 80496 | | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_6434a78d.json.gz | 35 | | 39.32 | Les Misérables | Victor Hugo | 1862 | fr | https://do-me.github.io/SemanticFinder/?hf=Les_Misérables_2239df51 | Xenova/multilingual-e5-base | True | 25 | Words | 3236941 | 19463 | | | | | 5 | 74491 | All five acts included | https://beq.ebooksgratuits.com/vents/Hugo-miserables-1.pdf | Les_Misérables_2239df51.json.gz | 36 | | 0.46 | REGULATION (EU) 2023/138 | European Commission | 2022 | en | https://do-me.github.io/SemanticFinder/?hf=REGULATION_(EU)_2023_138_c00e7ff6 | Supabase/bge-small-en | True | 25 | Words | 76809 | 424 | | | | | 5 | 1323 | | https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R0138&qid=1704492501351 | REGULATION_(EU)_2023_138_c00e7ff6.json.gz | 37 | | 0.07 | Universal Declaration of Human Rights | United Nations | 1948 | en | https://do-me.github.io/SemanticFinder/?hf=Universal_Declaration_of_Human_Rights_0a7da79a | TaylorAI/gte-tiny | True | \nArticle | Regex | 8623 | 63 | | | | | 5 | 109 | 30 articles | https://www.un.org/en/about-us/universal-declaration-of-human-rights | Universal_Declaration_of_Human_Rights_0a7da79a.json.gz | 38 | 39 | ## Import & Export 40 | 41 | You can create indices yourself with one two clicks and save them. 
If it's something private, keep it for yourself; if it's a classic book or something you think others might be interested in, consider a PR on the [Huggingface Repo](https://huggingface.co/datasets/do-me/SemanticFinder) or get in touch with us. Book requests are happily met if you provide us a good source link where we can do copy & paste. Simply open an issue here with [Book Request] or similar or contact us. 42 | 43 | It goes without saying that no discriminatory content will be tolerated. 44 | 45 | ## Installation 46 | 47 | Clone the repository and install dependencies with 48 | 49 | `npm install` 50 | 51 | Then run with 52 | 53 | `npm run start` 54 | 55 | If you want to build instead, run 56 | 57 | `npm run build` 58 | 59 | Afterwards, you'll find the `index.html`, `main.css` and `bundle.js` in `dist`. 60 | 61 | ## Browser extension 62 | Download the Chrome extension from [Chrome webstore](https://chrome.google.com/webstore/detail/semanticfinder/ddmgffoffelnhnonpoiblaoboaeieejl) and pin it. Right click the extension icon for `options`: 63 | - choose distiluse-base-multilingual-cased-v2 for multilingual usage (default is English-only) 64 | - set a higher number for min characters to split by for larger texts 65 | 66 | ![](SemanticFinder_Chrome_Extension.gif?) 67 | 68 | ### Local build 69 | If you want to build the browser extension locally, clone the repo and cd into the `extension` directory then run: 70 | - `npm install` 71 | - `npm run build` for a static build or 72 | - `npm run dev` for the auto-refreshing development version 73 | - go to Chrome extension settings with `chrome://extensions` 74 | - select `Load Unpacked` and choose the `build` folder 75 | - pin the extension in Chrome so you can access it easily. If it doesn't work for you, feel free to open an issue. 
76 | 77 | ## Speed 78 | Tested on the entire book of [Moby Dick](https://archive.org/stream/mobydickorwhale01melvuoft/mobydickorwhale01melvuoft_djvu.txt) with 660.000 characters, ~13.000 lines or ~111.000 words. 79 | Initial embedding generation takes **1-2 mins** on my old i7-8550U CPU with 1000 characters as segment size. Following queries take only ~2 seconds! 80 | If you want to query larger texts or keep an entire library of books indexed, use a [proper vector database instead](https://geo.rocks/post/qdrant-transformers-js-semantic-search/). 81 | 82 | ## Features 83 | 84 | You can customize everything! 85 | 86 | - Input text & search term(s) 87 | - Hybrid search (semantic search & full-text search) 88 | - Segment length (the bigger the faster, the smaller the slower) 89 | - Highlight colors (currently hard-coded) 90 | - The number of highlights is based on the threshold value. The lower, the more results. 91 | - Live updates 92 | - Easy integration of other ML-models thanks to [transformers.js](https://xenova.github.io/transformers.js/) 93 | - Data privacy-friendly - your input text data is not sent to a server, it stays in your browser! 94 | 95 | ## Usage ideas 96 | 97 | - Basic search through anything, like your personal notes (my initial motivation by the way, a huge notes.txt file I couldn't handle anymore) 98 | - Remember poem analysis in school? Often you look for possible Leitmotifs or recurring categories like **food** in Hänsel & Gretel 99 | 100 | ## Future ideas 101 | 102 | - One could package everything nicely and use it e.g. instead of JavaScript search engines such as [Lunr.js](https://lunrjs.com/) (also being used in [mkdocs-material](https://squidfunk.github.io/mkdocs-material/setup/setting-up-site-search/)). 103 | - Integration in mkdocs (mkdocs-material) **experimental**: 104 | - when building the docs, slice all `.md`-files in chunks (length defined in `mkdocs.yaml`). Should be fairly large (>800 characters) for lower response time. 
It's also possible to build n indices with first a coarse index (maybe per document/ `.md`-file if the used model supports the length) and then a refined one for the document chunks 105 | - build the index by calculating the embeddings for all docs/chunks 106 | - when a user queries the docs, a switch can toggle (fast) full-text standard search (atm with lunr.js) or experimental semantic search 107 | - if the latter is being toggled, the client loads the model (all-MiniLM-L6-v2 has ~30mb) 108 | - like in SemanticFinder, the embedding is created client-side and the cosine similarity calculated 109 | - the high-scored results are returned just like with lunr.js so the user shouldn't even notice a difference in the UI 110 | - Electron- or browser-based apps could be augmented with semantic search, e.g. VS Code, Atom or mobile apps. 111 | - Integration in personal wikis such as Obsidian, tiddlywiki etc. would save you the tedious tagging/keywords/categorisation work or could at least improve your structure further 112 | - Search your own browser history (thanks [@Snapdeus](https://twitter.com/snapdeus/status/1646233904691413006)) 113 | - Integration in chat apps 114 | - Allow PDF-uploads (conversion from PDF to text) 115 | - Integrate with Speech-to-Text whisper model from transformers.js to allow audio uploads. 116 | - Thanks to [CodeMirror](https://codemirror.net/) one could even use syntax highlighting for programming languages such as Python, JavaScript etc. 117 | 118 | ## Logic 119 | 120 | [Transformers.js](https://xenova.github.io/transformers.js/) is doing all the heavy lifting of tokenizing the input and running the model. Without it, this demo would have been impossible. 121 | 122 | **Input** 123 | - Text, as much as your browser can handle! The demo uses a part of "Hänsel & Gretel" but it can handle hundreds of PDF pages 124 | - A search term or phrase 125 | - The number of characters the text should be segmented in 126 | - A similarity threshold value. 
Results with a lower similarity score won't be displayed. 127 | 128 | **Output** 129 | - Three highlighted string segments, the darker the higher the similarity score. 130 | 131 | **Pipeline** 132 | 133 | 0. All scripts are loaded. The model is loaded once from HuggingFace and cached in the browser afterwards. 134 | 1. A user inputs some text and a search term or phrase. 135 | 2. Depending on the approximate length to consider (unit=characters), the text is split into **segments**. Words themselves are never split, that's why it's approximate. 136 | 3. The search term embedding is created. 137 | 4. For each **segment** of the text, the embedding is created. 138 | 5. Meanwhile, the cosine similarity is calculated between every **segment** embedding and the search term embedding. It's written to a dictionary with the segment as key and the score as value. 139 | 6. For every iteration, the progress bar and the highlighted sections are updated in real-time depending on the highest scores in the array. 140 | 7. The embeddings are cached in the dictionary so that subsequent queries are quite fast. The calculation of the cosine similarity is fairly speedy in comparison to the embedding generation. 141 | 8. **Only if the user changes the segment length**, the embeddings must be recalculated. 142 | 143 | ## Collaboration 144 | PRs welcome! 145 | 146 | ## To Dos (no prioritization) 147 | - [x] similarity score cutoff/threshold 148 | - [x] add option for more highlights (e.g. 
all above certain score) 149 | - [x] add stop button 150 | - [x] MaterialUI for input fields or proper labels 151 | - [x] create a demo without CDNs 152 | - [x] separate one html properly in html, js, css 153 | - [x] add npm installation 154 | - [x] option for loading embeddings from file or generally allow sharing embeddings in some way 155 | - [x] simplify chunking function so the original text can be loaded without issues 156 | - [ ] improve the color range 157 | - [ ] rewrite the cosine similarity function in Rust, port to WASM and load as a module for possible speedup (experimental) 158 | - [ ] UI overhaul 159 | - [ ] polish code 160 | - [x] - jQuery/vanilla JS mixed 161 | - [ ] - clean up functions 162 | - [ ] - add more comments 163 | - [ ] add possible use cases 164 | - [ ] package as a standalone application (maybe with custom model choice; to be downloaded once from HF hub, then saved locally) 165 | - [ ] possible integration as example in [transformers.js homepage](https://github.com/xenova/transformers.js/issues/84) 166 | 167 | ## Star History 168 | 169 | [![Star History Chart](https://api.star-history.com/svg?repos=do-me/SemanticFinder&type=Timeline)](https://star-history.com/#do-me/SemanticFinder&Timeline) 170 | 171 | ## Gource Map 172 | 173 | ![image](SemanticFinder_gource.png) 174 | 175 | Gource image created with: 176 | 177 | ```bash 178 | gource -1280x720 --title "SemanticFinder" --seconds-per-day 0.03 --auto-skip-seconds 0.03 --bloom-intensity 0.5 --max-user-speed 500 --highlight-dirs --multi-sampling --highlight-colour 00FF00 179 | ``` 180 | -------------------------------------------------------------------------------- /SemanticFinder.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder.gif -------------------------------------------------------------------------------- 
/SemanticFinder_Chrome_Extension.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension.gif -------------------------------------------------------------------------------- /SemanticFinder_Chrome_Extension_en.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension_en.zip -------------------------------------------------------------------------------- /SemanticFinder_gource.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_gource.png -------------------------------------------------------------------------------- /extension/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /extension/README.md: -------------------------------------------------------------------------------- 1 | 2 | # SemanticFinder Browser Extension 3 | 4 | 5 | ## Getting Started 6 | 1. Install the necessary dependencies: 7 | ```bash 8 | npm install 9 | ``` 10 | 11 | 2. Build the project: 12 | ```bash 13 | npm run build 14 | ``` 15 | or 16 | ```bash 17 | npm run dev 18 | ``` 19 | for auto-reload. 20 | 21 | 3. Add the extension to your browser. To do this, go to `chrome://extensions/`, enable developer mode (top right), and click "Load unpacked". Select the `build` directory from the dialog which appears and click "Select Folder". 22 | 23 | 24 | ---- 25 | 26 | A big thank you to Xenova, whose work on 🤗 Transformers.js makes this entire project possible. 
27 | -------------------------------------------------------------------------------- /extension/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "SemanticFinder", 3 | "version": "0.0.1", 4 | "description": "SemanticFinder | In-browser Semantic Search via Transformers.js", 5 | "scripts": { 6 | "build": "webpack", 7 | "dev": "webpack --watch" 8 | }, 9 | "type": "module", 10 | "author": "Varun Neal Srivastava", 11 | "contributors": [ 12 | "Dominik Weckmüller", 13 | "Xenova" 14 | ], 15 | "license": "MIT", 16 | "devDependencies": { 17 | "copy-webpack-plugin": "^11.0.0", 18 | "css-loader": "^6.8.1", 19 | "html-webpack-plugin": "^5.5.1", 20 | "pdfjs-dist": "^3.9.179", 21 | "style-loader": "^3.3.3", 22 | "vue-loader": "^17.2.2", 23 | "vue-template-compiler": "^2.7.14", 24 | "vueify": "^9.4.1", 25 | "webpack": "^5.79.0", 26 | "webpack-cli": "^5.1.4" 27 | }, 28 | "dependencies": { 29 | "@mozilla/readability": "^0.4.4", 30 | "@vue/compiler-sfc": "^3.3.4", 31 | "@xenova/transformers": "^2.5.0", 32 | "mark.js": "^8.11.1", 33 | "node-polyfill-webpack-plugin": "^2.0.1", 34 | "util": "^0.12.5", 35 | "vue": "^3.3.4" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /extension/public/icons/logo128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo128.png -------------------------------------------------------------------------------- /extension/public/icons/logo48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo48.png -------------------------------------------------------------------------------- /extension/public/manifest.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | "name": "SemanticFinder", 4 | "description": "SemanticFinder | In-browser Semantic Search via Transformers.js", 5 | "version": "0.0.1", 6 | "host_permissions": ["http://*/*", "https://*/*"], 7 | "permissions": [ 8 | "scripting", 9 | "activeTab", 10 | "storage", 11 | "unlimitedStorage" 12 | ], 13 | "options_ui": { 14 | "page": "options.html", 15 | "open_in_tab": true 16 | }, 17 | "background": { 18 | "service_worker": "background.js", 19 | "type": "module" 20 | }, 21 | "content_scripts": [ 22 | { 23 | "matches": [ 24 | "" 25 | ], 26 | "js": [ 27 | "content.js", "pdf.js", "pdf.worker.js" 28 | ], 29 | "css": [ 30 | "content.css" 31 | ] 32 | } 33 | ], 34 | "minimum_chrome_version": "92", 35 | "action": { 36 | "default_icon": { 37 | "16": "icons/logo48.png", 38 | "24": "icons/logo48.png", 39 | "32": "icons/logo128.png" 40 | }, 41 | "default_title": "SemanticFinder", 42 | "default_popup": "popup.html" 43 | }, 44 | "content_security_policy": { 45 | "extension_pages": "script-src 'self' 'wasm-unsafe-eval'" 46 | }, 47 | "icons": { 48 | "16": "icons/logo48.png", 49 | "48": "icons/logo48.png", 50 | "128": "icons/logo128.png" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /extension/src/content/content.css: -------------------------------------------------------------------------------- 1 | 2 | .SemanticFinder-highlight { 3 | background-color: #ffff33 !important; 4 | color: black !important; 5 | } 6 | -------------------------------------------------------------------------------- /extension/src/content/content.js: -------------------------------------------------------------------------------- 1 | // content.js 2 | import {prettyLog, splitReadableContent} from '../utils/utils.js'; 3 | import {Readability} from '@mozilla/readability'; 4 | import Mark from 'mark.js'; 5 | import {getDocument, 
GlobalWorkerOptions} from 'pdfjs-dist'; 6 | 7 | 8 | async function fetchAndExtractPDFText(url) { 9 | GlobalWorkerOptions.workerSrc = chrome.runtime.getURL('../serviceworkers/pdf.worker.js'); 10 | 11 | const pdf = await getDocument(url).promise; 12 | 13 | let totalPages = pdf.numPages; 14 | let texts = []; 15 | 16 | for (let i = 1; i <= totalPages; i++) { 17 | // console.log("page ", i); 18 | const page = await pdf.getPage(i); 19 | const textContent = await page.getTextContent(); 20 | const pageText = textContent.items.map(item => item.str).join(' '); 21 | texts.push(pageText); 22 | } 23 | 24 | return texts.join(' '); 25 | } 26 | 27 | function getValueFromStorage(key, defaultValue) { 28 | return new Promise((resolve, reject) => { 29 | chrome.storage.sync.get(key, function(result) { 30 | if (chrome.runtime.lastError) { 31 | reject(new Error(chrome.runtime.lastError)); 32 | } else { 33 | resolve(result[key] || defaultValue); 34 | } 35 | }); 36 | }); 37 | } 38 | 39 | async function fetchNumChars() { 40 | try { 41 | const defaultNumChars = 50; // You can set this to your desired default value 42 | const storedNumChars = await getValueFromStorage('num_chars', defaultNumChars); 43 | return storedNumChars; 44 | } catch (error) { 45 | console.error('Error fetching num_chars:', error); 46 | return null; 47 | } 48 | } 49 | 50 | chrome.runtime.onMessage.addListener(async function(request, sender) { 51 | try { 52 | let currentURL = window.location.href; 53 | if (request.type === "getText") { 54 | const numChars = await fetchNumChars(); 55 | let texts = []; 56 | 57 | if (request.contentType == "application/pdf") { 58 | let textContent = await fetchAndExtractPDFText(currentURL); 59 | texts = splitReadableContent(textContent, numChars); 60 | 61 | } else { 62 | let concatenatedContent = ""; 63 | 64 | const iframes = document.querySelectorAll('iframe'); 65 | console.dir(iframes); 66 | 67 | iframes.forEach(function(iframe) { 68 | try { 69 | const iframeDocument = 
iframe.contentDocument; 70 | 71 | if (iframeDocument) { 72 | 73 | let { textContent } = new Readability(iframeDocument.cloneNode(true)).parse(); 74 | prettyLog("Iframe text content:", textContent, "orange"); 75 | concatenatedContent += textContent; 76 | } 77 | } catch (error) { 78 | prettyLog("Skipped an iframe due to permissions issue:", error, "red"); 79 | } 80 | }); 81 | 82 | const documentClone = document.cloneNode(true); 83 | let { textContent } = new Readability(documentClone).parse(); 84 | concatenatedContent += textContent; 85 | // prettyLog("Main document text content:", textContent); 86 | 87 | texts = splitReadableContent(concatenatedContent, numChars); 88 | 89 | } 90 | chrome.runtime.sendMessage({type: "tabUpdated", text: texts, currentURL}); 91 | } else if (request.type === 'highlightAndScroll') { 92 | // if (currentURL.endsWith('.pdf')) { return; } 93 | if (!highlightAndScrollToText(request.text)) { 94 | chrome.runtime.sendMessage({type: "error", reason: "Cannot find and highlight selection."}) 95 | } 96 | } 97 | } catch (error) { 98 | prettyLog("ERROR", error.message, "red", "red"); 99 | if (error.message.includes('net::ERR_BLOCKED_BY_CLIENT')) { 100 | chrome.runtime.sendMessage({type: "error", reason: "ERR_BLOCKED_BY_CLIENT"}); 101 | } else { 102 | chrome.runtime.sendMessage({type: "error", reason: error.message}); 103 | } 104 | } 105 | }); 106 | 107 | 108 | let currText; 109 | let instance = new Mark(document.querySelector("body")); 110 | 111 | function highlightAndScrollToText(text, depth= 3) { 112 | if (depth === 0) { 113 | return false; 114 | } 115 | // If there's a previous highlighted text, unmark it 116 | if (currText) { 117 | instance.unmark({"element": "span", "className": "SemanticFinder-highlight"}); 118 | } 119 | 120 | currText = text; 121 | 122 | let textFound = false; 123 | 124 | instance.mark(text, { 125 | "element": "span", 126 | "separateWordSearch": false, 127 | "className": "SemanticFinder-highlight", 128 | "acrossElements": true, 
129 | "wildcards": "enabled", 130 | "iframes": true, 131 | "each": function (node) { 132 | // Scroll to the first instance of it 133 | node.scrollIntoView({ 134 | behavior: "smooth", 135 | block: "center" 136 | }); 137 | textFound = true; 138 | } 139 | }); 140 | 141 | 142 | // can use "noMatch" in markjs instead 143 | if (!textFound) { 144 | let segments = text.split('\n'); 145 | let longestSegment = segments.sort((a, b) => b.length - a.length)[0]; 146 | if (longestSegment) { 147 | return highlightAndScrollToText(longestSegment, depth - 1); 148 | } 149 | } else { 150 | return true; 151 | } 152 | } 153 | 154 | 155 | -------------------------------------------------------------------------------- /extension/src/options/options.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Helvetica', sans-serif; 3 | padding: 20px; 4 | background-color: #f5f5f5; 5 | } 6 | 7 | .container { 8 | background-color: #fff; 9 | padding: 20px; 10 | border-radius: 5px; 11 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 12 | width: 300px; /* Adjust as needed */ 13 | margin: 0 auto; 14 | } 15 | 16 | label { 17 | display: block; 18 | margin-bottom: 10px; 19 | } 20 | 21 | select, 22 | input { 23 | width: 100%; 24 | padding: 8px; 25 | box-sizing: border-box; 26 | margin-bottom: 20px; 27 | border-radius: 5px; 28 | border: 1px solid #ccc; 29 | } 30 | 31 | .rectangular-button { 32 | display: block; 33 | background-color: #007bff; 34 | color: white; 35 | padding: 10px 20px; 36 | text-align: center; 37 | border: none; 38 | border-radius: 5px; 39 | cursor: pointer; 40 | margin-bottom: 10px; 41 | transition: background-color 0.3s; 42 | } 43 | 44 | .rectangular-button:hover { 45 | background-color: #0056b3; 46 | } 47 | -------------------------------------------------------------------------------- /extension/src/options/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
SemanticFinder Settings 7 | 8 | 9 | 10 | 11 |
12 | 13 | 27 | 28 |

29 | 30 | 31 | 32 | 33 |

34 | 35 | 36 | 37 |
38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /extension/src/options/options.js: -------------------------------------------------------------------------------- 1 | 2 | document.addEventListener('DOMContentLoaded', function() { 3 | loadSettings(); 4 | 5 | const b = document.getElementById('saveButton') 6 | if (b) { 7 | b.addEventListener('click', saveSettings); 8 | } 9 | 10 | const r = document.getElementById('restoreButton') 11 | if (r) { 12 | r.addEventListener('click', restoreDefaults); 13 | } 14 | }); 15 | 16 | function saveSettings(showAlert = true) { 17 | const modelName = document.getElementById('modelSelector').value; 18 | const numChars = document.getElementById('minCharsInput').value; 19 | 20 | chrome.storage.sync.set({ 21 | 'model_name': modelName, 22 | 'num_chars': numChars 23 | }, function() { 24 | if (showAlert) { 25 | alert('Settings saved.'); 26 | } 27 | }); 28 | } 29 | 30 | function restoreDefaults() { 31 | document.getElementById('modelSelector').value = 'Supabase/gte-small'; // Default model 32 | document.getElementById('minCharsInput').value = 50; // Default number 33 | 34 | saveSettings(false); 35 | } 36 | 37 | 38 | function loadSettings() { 39 | chrome.storage.sync.get(['model_name', 'num_chars'], function(items) { 40 | if (items['model_name']) { 41 | const s = document.getElementById('modelSelector') 42 | if (s) { 43 | s.value = items['model_name']; 44 | } 45 | } 46 | if (items['num_chars']) { 47 | const m = document.getElementById('minCharsInput') 48 | if (m) { 49 | m.value = items['num_chars']; 50 | } 51 | } 52 | }); 53 | } 54 | -------------------------------------------------------------------------------- /extension/src/popup/AnimatedInput.vue: -------------------------------------------------------------------------------- 1 | 6 | 7 | 43 | 92 | 93 | -------------------------------------------------------------------------------- /extension/src/popup/popup.css: 
-------------------------------------------------------------------------------- 1 | 2 | * { 3 | padding: 0; 4 | margin: 0; 5 | box-sizing: border-box; 6 | font-family: 'Roboto', sans-serif; 7 | } 8 | 9 | h1 { 10 | font-size: 40px; 11 | text-align: center; 12 | font-weight: 500; 13 | } 14 | 15 | h2 { 16 | font-size: 20px; 17 | text-align: center; 18 | font-weight: 400; 19 | margin-bottom: 16px; 20 | } 21 | 22 | .container { 23 | width: 450px; 24 | } 25 | 26 | html, 27 | body { 28 | min-width: 400px; 29 | min-height: 500px; 30 | } 31 | 32 | body { 33 | display: flex; 34 | justify-content: center; 35 | align-items: center; 36 | } 37 | 38 | #text { 39 | width: 100%; 40 | padding: 8px; 41 | font-size: 20px; 42 | margin-bottom: 8px; 43 | } 44 | 45 | #output { 46 | font-size: 20px; 47 | font-family: 'Roboto Mono', monospace; 48 | height: 100px; 49 | } 50 | -------------------------------------------------------------------------------- /extension/src/popup/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Popup 6 | 7 | 8 |
9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /extension/src/popup/popup.js: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue'; 2 | import Popup from './popup.vue'; 3 | 4 | 5 | createApp(Popup).mount("#app"); 6 | -------------------------------------------------------------------------------- /extension/src/popup/popup.vue: -------------------------------------------------------------------------------- 1 | 41 | 42 | 43 | 134 | 135 | 247 | -------------------------------------------------------------------------------- /extension/src/popup/result.vue: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 28 | 29 | 58 | -------------------------------------------------------------------------------- /extension/src/serviceworkers/background.js: -------------------------------------------------------------------------------- 1 | // background.js - Handles requests from the UI, runs the model, then sends back a response 2 | 3 | import {prettyLog, getSiteID} from '../utils/utils.js'; 4 | import {similarity, storeEmbeddings, loadEmbeddings} from "./semantic.js"; 5 | 6 | ////////////////////// 1. Context Menus ////////////////////// 7 | // 8 | // Add a listener to create the initial context menu items, 9 | // context menu items only need to be created at runtime.onInstalled 10 | // chrome.runtime.onInstalled.addListener(function () { 11 | // Register a context menu item that will only show up for selection text. 
12 | // chrome.contextMenus.create({ 13 | // id: 'classify-selection', 14 | // title: 'Classify "%s"', 15 | // contexts: ['selection'], 16 | // }); 17 | // }); 18 | // 19 | // Perform inference when the user clicks a context menu 20 | // chrome.contextMenus.onClicked.addListener(async (info, tab) => { 21 | // Ignore context menu clicks that are not for classifications (or when there is no input) 22 | // if (info.menuItemId !== 'classify-selection' || !info.selectionText) return; 23 | // 24 | // Perform classification on the selected text 25 | // let result = await classify(info.selectionText); 26 | // 27 | // Do something with the result 28 | // chrome.scripting.executeScript( 29 | // 30 | // { 31 | // target: { tabId: tab.id }, // Run in the tab that the user clicked in 32 | // args: [result], // The arguments to pass to the function 33 | // function: (result) => { // The function to run 34 | // // NOTE: This function is run in the context of the web page, meaning that `document` is available. 35 | // console.log('result', result) 36 | // console.log('document', document) 37 | // }, 38 | // } 39 | // ); 40 | // }); 41 | ////////////////////////////////////////////////////////////// 42 | 43 | ////////////////////// 2. Message Events ///////////////////// 44 | // 45 | // Listen for messages from the UI, process it, and send the result back. 
// TODO: body text is not persistent
// Text chunks of the current page, as last delivered by a "tabUpdated" message.
let bodyText = [];
// Most recent query, as last delivered by an "inputText" message.
let inputText = "";

// Id of the newest query run. Every run captures its own id and stops as soon
// as this counter has moved on, so only the latest query keeps emitting results.
let liveProcess = 0;
// Site identifier (see getSiteID) of the page that bodyText belongs to.
let currSite = "";

/**
 * Send a message to the other extension contexts on a best-effort basis.
 *
 * chrome.runtime.sendMessage rejects when no other context is listening
 * (for example while the popup is closed). Results simply have nowhere to go
 * in that case, so the rejection is dropped instead of surfacing as an
 * unhandled promise rejection.
 *
 * @param {object} message - JSON-serialisable message to broadcast.
 */
function notifyUI(message) {
    Promise.resolve(chrome.runtime.sendMessage(message)).catch(() => {
        // No receiver: nothing useful can be done with this message.
    });
}

/**
 * Message entry point of the background worker.
 *
 * Handles two message types and ignores everything else:
 *  - "tabUpdated": { text, currentURL } - new page chunks; only stored when non-empty.
 *  - "inputText":  { text }             - new query.
 * Once both a page and a query are known, a new query run is started, which
 * supersedes any run still in flight.
 */
chrome.runtime.onMessage.addListener(async function (request, sender, sendResponse) {
    if (request.type === "tabUpdated") {
        if (request.text.length > 0) {
            bodyText = request.text;
            currSite = getSiteID(request.currentURL);
        }
    } else if (request.type === "inputText") {
        inputText = request.text;
    } else {
        return;
    }
    if (!bodyText || !inputText) {
        return;
    }

    liveProcess++;
    const processId = liveProcess;

    try {
        await processQuery(inputText, bodyText, processId);
    } catch (error) {
        // Without this, a failure (model download, embedding, storage) would only
        // show up as an unhandled rejection and the UI would wait forever.
        const reason = error instanceof Error ? error.message : String(error);
        prettyLog("ERROR", reason, "red", "red");
        if (processId === liveProcess) {
            // A superseded run must not overwrite the state of the newer one.
            notifyUI({type: "error", reason: reason});
        }
    }
});


/**
 * Rank the page chunks against the query and stream the running top results.
 *
 * Chunks are scored one at a time. Whenever a chunk scores above the
 * similarity threshold, the current top-k list is sent as a "results"
 * message together with the progress in percent. A final "results" message
 * with progress 100 (and no text) marks completion, after which the
 * embeddings are persisted. The run stops silently as soon as a newer run
 * has been started.
 *
 * @param {string} query - Search text.
 * @param {string[]} bodyText - Page chunks to score.
 * @param {number} processId - Id of this run; compared against liveProcess.
 * @returns {Promise<void>}
 */
async function processQuery(query, bodyText, processId) {
    if (bodyText.length === 0) {
        prettyLog("Error", "no content found. please reload this page if this is unexpected", "red");
        notifyUI({type: "error", reason: "No content detected. Reloading may help."});
        return; // Exit early if no bodyText
    }

    await loadEmbeddings(currSite);
    prettyLog("starting process " + processId, bodyText.length + " items, input: " + query, "orange");

    const results = [];
    const k = 10;

    let i = 0;
    for (const text of bodyText) {
        if (processId !== liveProcess) { return; }
        const sim = await similarity(query, text);

        if (sim > 0.15) {
            // Keep the list sorted by descending similarity and capped at k entries.
            results.push({sim: sim, text: text});
            results.sort((a, b) => b.sim - a.sim);
            results.length = Math.min(results.length, k);

            // A newer run may have started while the similarity was being computed.
            if (processId !== liveProcess) { return; }
            notifyUI({
                type: "results", progress: 100 * (i / bodyText.length),
                text: results
            });
        }
        i += 1;
    }
    notifyUI({type: "results", progress: 100});
    await storeEmbeddings();
}

//////////////////////////////////////////////////////////////

-------------------------------------------------------------------------------- /extension/src/serviceworkers/pdf.worker.entry.js: -------------------------------------------------------------------------------- 1 | /* Copyright 2022 Mozilla Foundation 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | (typeof window !== "undefined" 17 | ? window 18 | : {} 19 | ).pdfjsWorker = require("./pdf.worker.js"); 20 | -------------------------------------------------------------------------------- /extension/src/serviceworkers/semantic.js: -------------------------------------------------------------------------------- 1 | // Define caching parameters 2 | import {CustomCache} from "../utils/cache.js"; 3 | import {pipeline, env} from '@xenova/transformers'; 4 | import {prettyLog} from "../utils/utils.js"; 5 | 6 | env.useBrowserCache = false; 7 | env.useCustomCache = true; 8 | env.customCache = new CustomCache('transformers-cache'); 9 | env.allowLocalModels = false; 10 | 11 | 12 | // Due to a bug in onnxruntime-web, we must disable multithreading for now. 13 | // See https://github.com/microsoft/onnxruntime/issues/14445 for more information. 
env.backends.onnx.wasm.numThreads = 1;

// these should go in EmbedPipeline prob
// In-memory cache: text -> embedding vector, for the site identified by currID.
let embeddingsDict = {};
// Storage key (site id) the cache above is loaded from / stored under.
let currID = "";

/**
 * Lazily created, shared feature-extraction pipeline.
 *
 * The model name defaults to `model` and is overridden by the `model_name`
 * value in chrome.storage.sync the first time the pipeline is created.
 */
class EmbedPipeline {
    static task = 'feature-extraction';

    static model = 'Supabase/gte-small';
    static instance = null;
    // Promise of the pipeline while it is being created, so that concurrent
    // callers share one load instead of each starting their own.
    static pending = null;

    /**
     * Read the configured model name from chrome.storage.sync.
     *
     * @returns {Promise<string|undefined>} The stored name, or undefined when unset.
     * @throws {Error} When the storage API reports an error.
     */
    static async getModelFromStorage() {
        return new Promise((resolve, reject) => {
            chrome.storage.sync.get('model_name', function (result) {
                const lastError = chrome.runtime.lastError;
                if (lastError) {
                    // lastError is an object; passing it directly to Error() would
                    // produce the message "[object Object]".
                    reject(new Error(lastError.message));
                } else {
                    resolve(result.model_name);
                }
            });
        });
    }

    /**
     * Replace `model` with the stored model name, if there is one.
     * Failing to read the setting is not fatal: the current name is kept.
     */
    static async updateModelName() {
        try {
            const storedModelName = await this.getModelFromStorage();
            if (storedModelName) {
                this.model = storedModelName;
            }
        } catch (error) {
            console.warn('Could not read the model name from storage, keeping ' + this.model + ':', error);
        }
    }

    /**
     * Broadcast a "download" status message on a best-effort basis.
     *
     * chrome.runtime.sendMessage rejects when no other extension context is
     * listening (e.g. the popup is closed); that must not fail the model load.
     *
     * @param {object} data - Progress payload forwarded as `data`.
     */
    static async notifyDownload(data) {
        try {
            await chrome.runtime.sendMessage({type: "download", data: data});
        } catch (error) {
            // No receiver: progress has nowhere to go.
        }
    }

    /**
     * Return the shared pipeline, creating it on first use.
     * A "download" message with status "complete" is sent on every call.
     *
     * @returns {Promise<Function>} The pipeline created by `pipeline(task, model)`.
     */
    static async getInstance() {
        if (this.instance === null) {
            if (this.pending === null) {
                this.pending = (async () => {
                    await this.updateModelName();
                    return pipeline(this.task, this.model,
                        {
                            progress_callback: data => this.notifyDownload(data)
                        }
                    );
                })();
            }
            const pending = this.pending;
            try {
                this.instance = await pending;
            } finally {
                // Clear the marker (also after a failure, so a later call can
                // retry) unless a newer load has already replaced it.
                if (this.pending === pending) {
                    this.pending = null;
                }
            }
        }
        await this.notifyDownload({status: "complete"});

        return this.instance;
    }
}

// Important: Return true to indicate that the response is asynchronous.
66 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => { 67 | switch (request.type) { 68 | case "load": 69 | await load(); 70 | break; 71 | case "clearLocalStorage": 72 | chrome.storage.local.clear(() => { 73 | }); 74 | break; 75 | case "pruneEmbeddings": 76 | await pruneStoredEmbeddings(10); 77 | break; 78 | } 79 | }); 80 | 81 | 82 | async function load() { 83 | await EmbedPipeline.getInstance(); 84 | } 85 | 86 | async function embed(text, use_dict = true) { 87 | if (use_dict && text in embeddingsDict) { 88 | return embeddingsDict[text]; 89 | } 90 | 91 | let embedder = await EmbedPipeline.getInstance(); 92 | let e0 = await embedder(text, {pooling: 'mean', normalize: true}); 93 | if (use_dict) { 94 | embeddingsDict[text] = e0["data"]; 95 | } 96 | return e0["data"]; 97 | } 98 | 99 | 100 | // do on clean-up / unmount 101 | async function pruneStoredEmbeddings(k) { 102 | return new Promise((resolve) => { 103 | chrome.storage.local.get(null, function (allData) { 104 | let embeddingKeys = Object.keys(allData).filter(key => allData[key].is_embeddings === true); 105 | 106 | console.log("All embedding keys found:", embeddingKeys); // This logs all the embedding keys 107 | 108 | // Sort these embedding keys based on frecency scores 109 | let sortedKeys = embeddingKeys.sort((a, b) => allData[b].frecency_score - allData[a].frecency_score); 110 | 111 | let topKKeys = sortedKeys.slice(0, k); 112 | let keysToRemove = sortedKeys.filter(key => !topKKeys.includes(key)); 113 | console.log(`Removing the following keys: ${keysToRemove}`); 114 | 115 | // Remove the non-top k embeddings from storage. 
116 | if (keysToRemove.length > 0) { 117 | chrome.storage.local.remove(keysToRemove, () => { 118 | console.log(`Successfully removed ${keysToRemove.length} keys.`); 119 | resolve(); 120 | }); 121 | } else { 122 | resolve(); 123 | } 124 | }); 125 | }); 126 | } 127 | 128 | 129 | 130 | export async function storeEmbeddings() { 131 | const buffer = new TextEncoder().encode(JSON.stringify(embeddingsDict)); 132 | 133 | const body = await new Promise((resolve, reject) => { 134 | const reader = new FileReader(); 135 | reader.onload = e => resolve(e.target.result); 136 | reader.onerror = e => reject(e.target.error); 137 | reader.readAsDataURL(new Blob([buffer], {type: 'application/json'})); 138 | }); 139 | 140 | try { 141 | await chrome.storage.local.set({ 142 | [currID]: { 143 | _body: body, 144 | frecency_score: computeFrecencyScore(currID), 145 | is_embeddings: true, 146 | model_name: EmbedPipeline.model 147 | } 148 | }); 149 | prettyLog("stored " + currID, Object.keys(embeddingsDict).length + " items"); 150 | 151 | } catch (err) { 152 | console.warn('An error occurred while writing the embeddings to cache:', err) 153 | } 154 | } 155 | 156 | async function verifyLoad() { 157 | for (let text in embeddingsDict) { 158 | let e0 = await embed(text, true); 159 | let e1 = await embed(text, false); 160 | let sim = cosineSimilarity(e0, e1); 161 | if (sim < 0.99) { 162 | prettyLog("load differs", sim, "red"); 163 | } 164 | } 165 | } 166 | 167 | export async function loadEmbeddings(ID) { 168 | if (Object.keys(embeddingsDict).length !== 0 && ID === currID) { 169 | return; 170 | } 171 | currID = ID; 172 | const data = await chrome.storage.local.get([currID]); 173 | if (data[ID] && data[ID].is_embeddings) { 174 | prettyLog("attempting load", ID); 175 | if (!data[ID].model_name || data[ID].model_name !== EmbedPipeline.model) { return; } 176 | const body = data[ID]._body; 177 | 178 | const jsonString = await new Promise((resolve, reject) => { 179 | const byteCharacters = 
atob(body.split(',')[1]); 180 | const byteNumbers = Array.from(byteCharacters).map(char => char.charCodeAt(0)); 181 | const byteArray = new Uint8Array(byteNumbers); 182 | const blob = new Blob([byteArray], {type: 'application/json'}); 183 | const reader = new FileReader(); 184 | reader.onload = (event) => resolve(event.target.result); 185 | reader.onerror = (error) => reject(error); 186 | reader.readAsText(blob); 187 | }); 188 | 189 | const parsedData = JSON.parse(jsonString); 190 | 191 | // Convert the object-with-integer-keys representation into Float32Array 192 | for (let textKey in parsedData) { 193 | if (parsedData.hasOwnProperty(textKey)) { 194 | let arrayData = Object.values(parsedData[textKey]); 195 | embeddingsDict[textKey] = new Float32Array(arrayData); 196 | } 197 | } 198 | 199 | prettyLog("loaded " + ID, Object.keys(embeddingsDict).length + " items"); 200 | // await verifyLoad(); 201 | } 202 | } 203 | 204 | 205 | // todo: implement & move to utils 206 | function computeFrecencyScore(ID) { 207 | return 4; // lol! 
208 | } 209 | 210 | 211 | export async function similarity(text1, text2) { 212 | let e0 = await embed(text1); 213 | let e1 = await embed(text2); 214 | 215 | return cosineSimilarity(e0, e1); 216 | } 217 | 218 | function cosineSimilarity(v1, v2) { 219 | if (v1.length !== v2.length) { 220 | return -1; 221 | } 222 | let dotProduct = 0; 223 | let v1_mag = 0; 224 | let v2_mag = 0; 225 | for (let i = 0; i < v1.length; i++) { 226 | dotProduct += v1[i] * v2[i]; 227 | v1_mag += v1[i] ** 2; 228 | v2_mag += v2[i] ** 2; 229 | } 230 | return dotProduct / (Math.sqrt(v1_mag) * Math.sqrt(v2_mag)); 231 | } 232 | -------------------------------------------------------------------------------- /extension/src/utils/cache.js: -------------------------------------------------------------------------------- 1 | // Author: Xenova 2 | // Design a caching API to be used by the extension which implements the same interface as 3 | // the browser's native Cache API (https://developer.mozilla.org/en-US/docs/Web/API/Cache) 4 | // but uses the browser's local storage API (https://developer.chrome.com/docs/extensions/reference/storage/). 5 | // 6 | // Since the local storage API requires all data to be stored as JSON (which doesn't allow some ASCII chars), 7 | // one of the better approaches is to store the response body as a base64-encoded string. This is not ideal, 8 | // as it increases the size of the response body by ~33%, but it's the best we can do with the local storage API. 9 | // See https://stackoverflow.com/a/1443240/13989043 for more information about this. 10 | // 11 | // For serialization (arraybuffer -> string) and unserialization (string -> arraybuffer), 12 | // use the `FileReader` and `Blob` APIs. Although other options are also possible, this approach 13 | // is considered to be better for larger files (like models). 
14 | // 15 | // Other references: 16 | // - https://developer.chrome.com/docs/extensions/reference/storage/#property-local 17 | // - https://stackoverflow.com/questions/6965107/converting-between-strings-and-arraybuffers 18 | 19 | export class CustomCache { 20 | /** 21 | * Instantiate a `CustomCache` object. 22 | * @param {string} path 23 | */ 24 | constructor(cacheName) { 25 | this.cacheName = cacheName; 26 | } 27 | 28 | /** 29 | * Checks whether the given request is in the cache. 30 | * @param {Request|string} request 31 | * @returns {Promise} 32 | */ 33 | async match(request) { 34 | const url = request instanceof Request ? request.url : request; 35 | const cached = await chrome.storage.local.get([url]); 36 | 37 | if (cached[url]) { 38 | let model = await fetch(cached[url]._body); 39 | // console.log("model: ", model); 40 | return model; 41 | } else { 42 | return undefined; 43 | } 44 | } 45 | 46 | /** 47 | * Adds the given response to the cache. 48 | * @param {Request|string} request 49 | * @param {Response} response 50 | * @returns {Promise} 51 | */ 52 | async put(request, response) { 53 | const url = request instanceof Request ? 
request.url : request; 54 | const buffer = await response.arrayBuffer(); 55 | 56 | const body = await new Promise((resolve, reject) => { 57 | const reader = new FileReader(); 58 | reader.onload = e => resolve(e.target.result); 59 | reader.onerror = e => reject(e.target.error); 60 | reader.readAsDataURL(new Blob([buffer], { type: 'application/octet-stream' })); 61 | }); 62 | 63 | try { 64 | await chrome.storage.local.set({ 65 | [url]: { 66 | _body: body, 67 | 68 | // Save original response in case 69 | status: response.status, 70 | statusText: response.statusText, 71 | headers: Object.fromEntries(response.headers.entries()), 72 | url: response.url, 73 | redirected: response.redirected, 74 | type: response.type, 75 | ok: response.ok, 76 | } 77 | }); 78 | 79 | } catch (err) { 80 | console.warn('An error occurred while writing the file to cache:', err) 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /extension/src/utils/utils.js: -------------------------------------------------------------------------------- 1 | export function prettyLog(label, message, labelColor = 'blue', messageColor = 'black') { 2 | console.log("%c" + label + ": %c" + message, 3 | "font-weight: bold; color: " + labelColor + ";", 4 | "font-weight: normal; color: " + messageColor + ";"); 5 | } 6 | 7 | 8 | /* Looks for a sentence ending after numChars. */ 9 | function splitByChars(text, numChars) { 10 | let chunks = []; 11 | let currChunk = ''; 12 | const sentenceEndings = ['.', '?', '!', ';', ':', '\n', '–']; 13 | 14 | for (let i = 0; i < text.length; i++) { 15 | currChunk += text[i]; 16 | 17 | let isEndingPunctuation = sentenceEndings.includes(text[i]); 18 | 19 | // Special case: if the punctuation is a period and the next character is a quote 20 | if (text[i] === '.' 
&& text[i + 1] === '"') { 21 | currChunk += text[++i]; 22 | isEndingPunctuation = true; 23 | } 24 | 25 | if (currChunk.trim().length >= numChars && isEndingPunctuation) { 26 | chunks.push(currChunk.trim()); 27 | currChunk = ''; 28 | } 29 | } 30 | 31 | if (currChunk.trim()) { 32 | chunks.push(currChunk.trim()); 33 | } 34 | 35 | return chunks; 36 | } 37 | 38 | 39 | export function getSiteID(url) { 40 | let urlObj = new URL(url); 41 | return urlObj.hostname + urlObj.pathname; 42 | } 43 | 44 | 45 | export function splitReadableContent(readableContent, numChars = 50) { 46 | return splitByChars(readableContent, numChars); 47 | } 48 | 49 | 50 | function collectTextNodes(element, texts = []) { 51 | if (element.nodeType === Node.ELEMENT_NODE && element.tagName.toLowerCase() === 'p') { 52 | let sentences = tokenizer.tokenize(element.textContent); // Tokenize the text content into sentences 53 | for (let sentence of sentences) { 54 | sentence = sentence.trim(); // Remove leading/trailing white spaces 55 | if (sentence !== "") { 56 | texts.push(sentence); 57 | } 58 | } 59 | } else { 60 | for (let child of element.childNodes) { 61 | collectTextNodes(child, texts); 62 | } 63 | } 64 | return texts; 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /extension/webpack.config.js: -------------------------------------------------------------------------------- 1 | // webpack.config.js 2 | import path from 'path'; 3 | import { fileURLToPath } from 'url'; 4 | 5 | import HtmlWebpackPlugin from 'html-webpack-plugin'; 6 | import CopyPlugin from 'copy-webpack-plugin'; 7 | import { VueLoaderPlugin } from 'vue-loader'; 8 | import NodePolyfillPlugin from 'node-polyfill-webpack-plugin'; 9 | import webpack from 'webpack'; 10 | import util from 'util'; 11 | 12 | const __dirname = path.dirname(fileURLToPath(import.meta.url)); 13 | 14 | const config = { 15 | mode: 'development', 16 | devtool: 'inline-source-map', 17 | entry: { 18 | background: 
['./src/serviceworkers/background.js', './src/serviceworkers/semantic.js'], 19 | popup: './src/popup/popup.js', 20 | content: './src/content/content.js', 21 | options: './src/options/options.js' 22 | }, 23 | resolve: { 24 | fallback: { 25 | "fs": false, 26 | "tls": false, 27 | "net": false, 28 | "path": false, 29 | "util": false, 30 | } 31 | }, 32 | output: { 33 | path: path.resolve(__dirname, 'build'), 34 | filename: '[name].js', 35 | }, 36 | module: { 37 | rules: [ 38 | { 39 | test: /\.vue$/, 40 | use: 'vue-loader' 41 | }, 42 | { 43 | test: /\.css$/, 44 | use: [ 45 | 'style-loader', 46 | 'css-loader', 47 | ], 48 | }, 49 | ], 50 | }, 51 | plugins: [ 52 | new NodePolyfillPlugin(), 53 | new webpack.DefinePlugin({ 54 | __VUE_OPTIONS_API__: true, 55 | __VUE_PROD_DEVTOOLS__: false, 56 | }), 57 | new VueLoaderPlugin(), 58 | new HtmlWebpackPlugin({ 59 | template: './src/popup/popup.html', 60 | filename: 'popup.html', 61 | }), 62 | new HtmlWebpackPlugin({ 63 | template: './src/options/options.html', 64 | filename: 'options.html', 65 | }), 66 | new CopyPlugin({ 67 | patterns: [ 68 | { 69 | from: "public", 70 | to: "." 
// Copies to build folder 71 | }, 72 | { 73 | from: "src/popup/popup.css", 74 | to: "popup.css" 75 | }, 76 | { 77 | from: "src/content/content.css", 78 | to: "content.css" 79 | }, 80 | { 81 | from: "src/serviceworkers/pdf.js", 82 | to: "pdf.js" 83 | }, 84 | { 85 | from: "src/serviceworkers/pdf.worker.js", 86 | to: "pdf.worker.js" 87 | }, 88 | { 89 | from: "src/options/options.css", 90 | to: "options.css" 91 | }, 92 | ], 93 | }) 94 | ], 95 | }; 96 | 97 | export default config; 98 | 99 | 100 | -------------------------------------------------------------------------------- /jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "checkJs": true, 4 | "strict": true 5 | }, 6 | "include": ["src/**/*"] 7 | } -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/logo.png -------------------------------------------------------------------------------- /misc/Generate_large_textfile_from_books.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Script for generating large text files \n", 8 | "\n", 9 | "Keeps the metadata concatted with \"|\" as first line" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 7, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Pages: 28986 | Words: 15893741\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import pandas as pd \n", 27 | "\n", 28 | "# download first \"https://huggingface.co/datasets/storytracer/US-PD-Books/resolve/main/data/train-00000-of-00327.parquet?download=true\")\n", 29 | "df = 
pd.read_parquet(\"train-00000-of-00327.parquet\") \n", 30 | "\n", 31 | "# e.g. 100 books only \n", 32 | "books_number = 100\n", 33 | "df = df.iloc[:books_number]\n", 34 | "df[\"words\"] = df.full_text.apply(lambda x: len(x.split(\" \")))\n", 35 | "print(f\"Pages: {df.page_count.sum()} | Words: {df.words.sum()}\") #df.words.sum())" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 8, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Define the columns to concatenate, excluding 'full_text'\n", 45 | "metadata_columns = ['ocaid', 'title', 'author', 'year', 'page_count', 'openlibrary_edition', 'openlibrary_work']\n", 46 | "\n", 47 | "# Function to concatenate metadata and full_text\n", 48 | "def concatenate_row(row):\n", 49 | " metadata = '|'.join(row[metadata_columns].astype(str)) # Convert to string and join with '|'\n", 50 | " full_text = row['full_text']\n", 51 | " return metadata + '|' + full_text\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 11, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
\n", 63 | "\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | "
ocaidtitleauthoryearpage_countopenlibrary_editionopenlibrary_workfull_textwords
0worksofcharlesle01leveThe works of Charles LeverLever, Charles James, 1806-18721872692OL13499428MOL3564322W<8 '' ^/^r \\n\\n\\nN V s...1045410
1specimensofexpos00lamorichSpecimens of expositionLamont, Hammond, 1864-19091894220OL7034373MOL202608WIfteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\...109283
2recollectionsand00greerichRecollections and reflections : an auto of hal...Green, Wharton J. (Wharton Jackson), 1831-19101906400OL7098980MOL7710550W; J. GREEN \\n\\n\\nRECOLLECTIONS AND REFL...229753
3puddnheadwilsont00twaiialaPudd'nhead Wilson, and Those extraordinary twinsTwain, Mark, 1835-19101922322OL7095992MOL15269096WROXY HARVESTING AMONG THE KITCHENS \\n\\n\\n...142528
4hansbreitmann00lelarichHans Breitmann in Germany;Leland, Charles Godfrey, 1824-19031895184OL7202758MOL4108366W;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f . \\n\\...58760
\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " ocaid \n", 158 | "0 worksofcharlesle01leve \\\n", 159 | "1 specimensofexpos00lamorich \n", 160 | "2 recollectionsand00greerich \n", 161 | "3 puddnheadwilsont00twaiiala \n", 162 | "4 hansbreitmann00lelarich \n", 163 | "\n", 164 | " title \n", 165 | "0 The works of Charles Lever \\\n", 166 | "1 Specimens of exposition \n", 167 | "2 Recollections and reflections : an auto of hal... \n", 168 | "3 Pudd'nhead Wilson, and Those extraordinary twins \n", 169 | "4 Hans Breitmann in Germany; \n", 170 | "\n", 171 | " author year page_count \n", 172 | "0 Lever, Charles James, 1806-1872 1872 692 \\\n", 173 | "1 Lamont, Hammond, 1864-1909 1894 220 \n", 174 | "2 Green, Wharton J. (Wharton Jackson), 1831-1910 1906 400 \n", 175 | "3 Twain, Mark, 1835-1910 1922 322 \n", 176 | "4 Leland, Charles Godfrey, 1824-1903 1895 184 \n", 177 | "\n", 178 | " openlibrary_edition openlibrary_work \n", 179 | "0 OL13499428M OL3564322W \\\n", 180 | "1 OL7034373M OL202608W \n", 181 | "2 OL7098980M OL7710550W \n", 182 | "3 OL7095992M OL15269096W \n", 183 | "4 OL7202758M OL4108366W \n", 184 | "\n", 185 | " full_text words \n", 186 | "0 <8 '' ^/^r \\n\\n\\nN V s... 1045410 \n", 187 | "1 Ifteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\... 109283 \n", 188 | "2 ; J. GREEN \\n\\n\\nRECOLLECTIONS AND REFL... 229753 \n", 189 | "3 ROXY HARVESTING AMONG THE KITCHENS \\n\\n\\n... 142528 \n", 190 | "4 ;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f . \\n\\... 
58760 " 191 | ] 192 | }, 193 | "execution_count": 11, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "df.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "The text file '100_books.txt' has been created.\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "# Apply the function to each row and save to a list\n", 217 | "concatenated_rows = df.iloc[:books_number].apply(concatenate_row, axis=1).tolist()\n", 218 | "\n", 219 | "# Write the concatenated rows to a text file\n", 220 | "with open(f'{books_number}_books.txt', 'w', encoding='utf-8') as f:\n", 221 | " for row in concatenated_rows:\n", 222 | " f.write(row + '\\n')\n", 223 | "\n", 224 | "print(f\"The text file '{books_number}_books.txt' has been created.\")\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "py3.11", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.11.0" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 2 256 | } 257 | -------------------------------------------------------------------------------- /misc/README.md: -------------------------------------------------------------------------------- 1 | ## Various utilities 2 | 3 | Here goes anything for testing or similar. 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "semanticfinder", 3 | "version": "1.0.0", 4 | "description": "Update: just improved the UI - automatically scroll through the results!", 5 | "main": "src/js/index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "webpack serve --mode development", 9 | "build": "webpack --config webpack.config.js" 10 | }, 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@xenova/transformers": "^2.17.2", 15 | "bootstrap": "^5.3.2", 16 | "codemirror": "^5.52.2", 17 | "deck.gl": "^8.9.34", 18 | "marked": "^12.0.0", 19 | "ollama": "^0.4.9", 20 | "pako": "^2.1.0", 21 | "wasm-bhtsne": "^0.3.3" 22 | }, 23 | "devDependencies": { 24 | "copy-webpack-plugin": "^11.0.0", 25 | "css-loader": "^6.8.1", 26 | "favicons": "^7.1.4", 27 | "favicons-webpack-plugin": "^6.0.1", 28 | "html-webpack-plugin": "^5.5.3", 29 | "mini-css-extract-plugin": "^2.7.6", 30 | "pdfjs-dist": "^4.0.379", 31 | "style-loader": "^3.3.3", 32 | "webpack": "^5.88.1", 33 | "webpack-cli": "^5.1.4", 34 | "webpack-dev-server": "^4.15.1" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/css/styles.css: -------------------------------------------------------------------------------- 1 | #input-text { 2 | height: 50vh; 3 | width: 80vw; 4 | min-width: 80vw; 5 | text-align: left !important; 6 | } 7 | .CodeMirror { 8 | height: 55vh !important; 9 | } 10 | 11 | .CodeMirror.cm-s-default.CodeMirror-wrap { 12 | font-family: 'Open Sans', sans-serif; 13 | } 14 | 15 | #summary_text, #chat_text, #ollama_chat_text { 16 | margin: 10px; 17 | font-weight: 700; 18 | } 19 | 20 | #get_chat, #ollama_get_chat, #get_summary, #dimensionalityReduction{ 21 | height: calc(3.5rem + calc(var(--bs-border-width) * 2)); 22 | width: 110px; 23 | 
margin-right: 20px; 24 | } 25 | 26 | table { 27 | table-layout: auto; 28 | margin: 0 auto; 29 | } 30 | th, 31 | td { 32 | word-wrap: break-word; 33 | max-width: 50%; 34 | text-align: center; 35 | } 36 | 37 | .table-bordered { 38 | border: none; 39 | } 40 | 41 | #output-table > tbody > tr:nth-of-type(odd) { 42 | background-color: #f9f9f9; 43 | } 44 | 45 | .table .table { 46 | background-color: transparent; 47 | } 48 | 49 | .highlight-first { 50 | background-color: rgb(0, 255, 81); 51 | } 52 | 53 | .highlight-second { 54 | background-color: rgb(135, 255, 153); 55 | } 56 | 57 | .highlight-third { 58 | background-color: rgb(190, 253, 190); 59 | } 60 | 61 | .highlight-select { 62 | background-color: orange; 63 | } 64 | 65 | #loading { 66 | display: inline-block; 67 | width: 1rem; 68 | height: 1rem; 69 | border: 3px solid rgba(255, 255, 255, 0.3); 70 | border-radius: 50%; 71 | border-top-color: #fff; 72 | animation: spin 1s ease-in-out infinite; 73 | -webkit-animation: spin 1s ease-in-out infinite; 74 | } 75 | 76 | @keyframes spin { 77 | to { 78 | -webkit-transform: rotate(360deg); 79 | } 80 | } 81 | @-webkit-keyframes spin { 82 | to { 83 | -webkit-transform: rotate(360deg); 84 | } 85 | } 86 | 87 | #progressBar { 88 | height: 25px; 89 | width: 100%; 90 | } 91 | 92 | #query-text { 93 | height: 75%; 94 | min-width: 80%; 95 | } 96 | 97 | 98 | .submit-button { 99 | height: 75%; 100 | white-space:normal; 101 | text-align: center; /* this seems to break when page size is too small */ 102 | } 103 | 104 | 105 | #formGroupCenter { 106 | width: 100%; 107 | } 108 | 109 | .CodeMirror { 110 | font-size: 15px; 111 | } 112 | 113 | #results { 114 | height: 70vh; 115 | overflow-y: auto; 116 | } 117 | 118 | .card { 119 | width: 100%; 120 | transition: background-color 0.3s ease; 121 | } 122 | 123 | .card:hover { 124 | background-color: #f8f9fa; 125 | } 126 | 127 | /*.nav-button {*/ 128 | /* width: 60px; !* adjust this to the size you want *!*/ 129 | /* margin-right: 1px; !* adds space 
between buttons *!*/ 130 | /*}*/ 131 | 132 | #submitGroup { 133 | margin-top: 2vh; /* adjust this value as needed */ 134 | } 135 | 136 | /* 137 | see: https://github.com/twbs/bootstrap/issues/33871 138 | */ 139 | .form-floating > label { z-index: 3; } 140 | 141 | #advancedFeaturesHeader .accordion-button:hover { 142 | text-decoration: underline; 143 | } 144 | 145 | .accordion-button::after { 146 | display: none; 147 | } 148 | 149 | .card-title { 150 | font-size: 0.9em; 151 | } 152 | .card-subtitle { 153 | font-size: 0.8em; 154 | } 155 | 156 | #SemanticFinderLogo{ 157 | display: block; 158 | margin: 0 auto; 159 | max-width: 250px; 160 | } 161 | 162 | @media (min-width: 992px) { 163 | #introContainer { 164 | display: inline-flex; 165 | } 166 | #introContentDiv{ 167 | padding-left: 20px; 168 | } 169 | } 170 | 171 | @media (max-width: 992px) { 172 | .col-sm-9 { 173 | width: 100% !important; 174 | } 175 | #results{ 176 | height: unset !important; 177 | } 178 | .col-sm-3 { 179 | width: 100% !important; 180 | } 181 | ul { 182 | padding-left: 0 !important; 183 | } 184 | } 185 | 186 | .toast { 187 | display: none; 188 | position: fixed; 189 | top: 16px; 190 | left: 50%; 191 | transform: translateX(-50%); 192 | background-color: white !important; 193 | color: #fff; 194 | padding: 10px 20px; 195 | border-radius: 5px; 196 | z-index: 1000; 197 | width: 250px !important; 198 | font-size: 20px !important; 199 | } 200 | 201 | 202 | #closeToastButton{ 203 | cursor: pointer; 204 | right: -47px; 205 | position: relative; 206 | } 207 | 208 | /* Style for the point labels 209 | .point-label { 210 | display: none; 211 | position: absolute; 212 | background-color: #ffffff; 213 | padding: 8px; 214 | border: 2px solid #4a90e2; 215 | border-radius: 10px; 216 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 217 | color: #333333; 218 | font-family: 'Open Sans', sans-serif; 219 | font-size: 14px; 220 | pointer-events: none; 221 | z-index: 100000000000000000000; 222 | opacity: 1 !important; 223 | } 
224 | 225 | /* Style for the tooltip 226 | .tooltip { 227 | position: absolute; 228 | text-align: center; 229 | width: auto; 230 | height: 36px; 231 | padding: 6px; 232 | font-family: 'Open Sans', sans-serif; 233 | background: #4a90e2; 234 | color: #ffffff; 235 | border: 0px; 236 | border-radius: 12px; 237 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 238 | pointer-events: none; 239 | z-index: 100000000000000000000; 240 | opacity: 1 !important; 241 | }*/ 242 | 243 | #plot-container { 244 | max-height: 700px; 245 | height: 0; 246 | } 247 | 248 | #deckgl { 249 | position: relative !important; 250 | } 251 | 252 | #tooltip { 253 | position: absolute; 254 | opacity: 1000; 255 | font-size: 20px; 256 | border-radius: var(--bs-border-radius); 257 | background-color: #f5f8ffcf; 258 | outline: 2px solid #dfebff; 259 | padding: 8px; 260 | max-width: 500px; 261 | } -------------------------------------------------------------------------------- /src/js/semantic.js: -------------------------------------------------------------------------------- 1 | import { env, cos_sim} from '@xenova/transformers'; 2 | import { loadScatterplot } from './utils.js'; 3 | 4 | // @ts-ignore 5 | env.allowLocalModels = false; 6 | 7 | /** 8 | * @type {Worker} 9 | */ 10 | const worker = new Worker(new URL('./worker.js', import.meta.url), { 11 | type: 'module' 12 | }); 13 | 14 | window.semanticWorker = worker; 15 | 16 | /** 17 | * @type {Array} 18 | */ 19 | let queryEmbedding; 20 | 21 | /** 22 | * @type {Object} 23 | */ 24 | const similarityResolveMap = {}; 25 | 26 | /** 27 | * @type {Object} 28 | */ 29 | const tokensResolveMap = {}; 30 | 31 | /** 32 | * @type Function 33 | */ 34 | let loadResolve; 35 | 36 | /** 37 | * @type Function 38 | */ 39 | let queryResolve; 40 | 41 | function downloadFile(data, filename, mimeType) { 42 | const blob = new Blob([data], { type: mimeType }); 43 | 44 | const link = document.createElement('a'); 45 | link.href = window.URL.createObjectURL(blob); 46 | link.download = 
/**
 * Mirrors a model-download status message onto a progress bar.
 *
 * Only `progress` events for `trackedFile` move the bar; a `ready` event
 * fills the bar, clears its label and settles the pending load promise.
 *
 * @param {string} barId - id of the progress-bar element
 * @param {string} trackedFile - the file whose download progress is displayed
 * @param {{status: string, file?: string, progress?: number}} data - payload sent by the worker
 * @param {(progress: string) => string} toLabel - builds the text shown inside the bar
 */
function updateDownloadBar(barId, trackedFile, data, toLabel) {
    const bar = document.getElementById(barId);

    if (data.status === 'progress') {
        if (data.file !== trackedFile) { return; }
        const progress = data.progress.toFixed(2);
        bar.style.width = progress + '%';
        bar.textContent = toLabel(progress);
        bar.setAttribute('aria-valuenow', progress);
    } else if (data.status === 'ready') {
        bar.style.width = '100%';
        bar.setAttribute('aria-valuenow', 100);
        bar.textContent = "";
        // A 'ready' message may arrive when no load promise is pending.
        if (typeof loadResolve === 'function') {
            loadResolve();
        }
    }
}

/**
 * Settles the promise registered under `key` in `pendingMap` (if any) and
 * removes the entry. A missing entry is reported instead of throwing, which
 * happens when the same text was requested twice and already answered.
 *
 * @param {Object} pendingMap - map from request text to promise resolver
 * @param {string} key - request text echoed back by the worker
 * @param {() => *} computeValue - produces the resolution value lazily
 * @param {string} label - message type, used in the warning only
 */
function settlePending(pendingMap, key, computeValue, label) {
    if (!Object.prototype.hasOwnProperty.call(pendingMap, key)) {
        console.warn('No pending ' + label + ' request for received message');
        return;
    }
    const resolve = pendingMap[key];
    delete pendingMap[key];
    resolve(computeValue());
}

/**
 * Settles the single pending query/chat/summary promise, if there is one.
 *
 * @param {*} [value] - value handed to the awaiting caller
 */
function settleQuery(value) {
    if (typeof queryResolve === 'function') {
        queryResolve(value);
    }
}

/**
 * Dispatches every message coming back from the semantic worker to the UI
 * and to the promise resolvers registered by the request functions.
 */
worker.onmessage = function (event) {
    const message = event.data;

    switch (message.type) {
        case 'embeddingsDict':
            // Gzipped index produced by the worker; offered as a file download.
            downloadFile(message.data, message.filename, 'application/gzip');
            break;
        case "download":
            updateDownloadBar('loading-progress', "onnx/model_quantized.onnx", message.data,
                (progress) => progress + "%");
            break;
        case "chat_download":
            updateDownloadBar('chat-progress', "onnx/decoder_model_merged_quantized.onnx", message.data,
                (progress) => Math.round(progress) + '%');
            break;
        case "summary_download":
            updateDownloadBar('summary-progress', "onnx/decoder_model_merged_quantized.onnx", message.data,
                (progress) => Math.round(progress) + '%');
            break;
        case 'chat':
            // NOTE(review): chat_text is inserted as HTML without sanitizing;
            // presumably it is rendered markdown from the worker - confirm.
            document.getElementById("chat_text").innerHTML = message.chat_text;
            settleQuery(message.chat_text);
            break;
        case 'summary':
            // NOTE(review): same unsanitized HTML insertion as for 'chat'.
            document.getElementById("summary_text").innerHTML = message.summary_text;
            settleQuery(message.summary_text);
            break;
        case 'query':
            queryEmbedding = message.embedding;
            settleQuery();
            break;
        case 'similarity':
            settlePending(similarityResolveMap, message.text,
                () => cos_sim(message.embedding, queryEmbedding), 'similarity');
            break;
        case 'tokens':
            settlePending(tokensResolveMap, message.text, () => message.tokens, 'tokens');
            break;
        case 'tsne':
            loadScatterplot(message.plotDataArray);
            break;
        default:
            console.error('Unknown message type: ' + message.type);
    }
};
159 | }); 160 | return new Promise((resolve) => { 161 | similarityResolveMap[text] = resolve; 162 | }); 163 | } 164 | 165 | /** 166 | * 167 | * @param {string} text 168 | * @returns 169 | */ 170 | export async function summarizeText(text) { 171 | worker.postMessage({ 172 | type: 'summary', 173 | text 174 | }); 175 | return new Promise((resolve) => { 176 | queryResolve = resolve; 177 | }); 178 | } 179 | 180 | /** 181 | * 182 | * @param {string} text 183 | * @param {number} max_new_tokens 184 | * @returns 185 | */ 186 | export async function chatText(text, max_new_tokens) { 187 | worker.postMessage({ 188 | type: 'chat', 189 | max_new_tokens: max_new_tokens, 190 | text 191 | }); 192 | return new Promise((resolve) => { 193 | queryResolve = resolve; 194 | }); 195 | } 196 | 197 | /** 198 | * 199 | * @param {string} text 200 | * @returns 201 | */ 202 | export async function embedQuery(text) { 203 | worker.postMessage({ 204 | type: 'query', 205 | text 206 | }); 207 | return new Promise((resolve) => { 208 | queryResolve = resolve; 209 | }); 210 | } 211 | 212 | /** 213 | * 214 | * @param {string} text 215 | * @returns 216 | */ 217 | export async function getTokens(text) { 218 | worker.postMessage({ 219 | type: 'getTokens', 220 | text 221 | }); 222 | return new Promise((resolve) => { 223 | tokensResolveMap[text] = resolve; 224 | }); 225 | } 226 | 227 | /** 228 | * @param {string} modelName 229 | * @returns 230 | */ 231 | export async function loadSemantic(modelName) { 232 | const quantized = document.getElementById("quantized").checked; 233 | const downloadBar = document.getElementById('loading-progress'); 234 | downloadBar.style.width = '0%'; 235 | downloadBar.textContent = 'Loading model...'; 236 | worker.postMessage({ 237 | type: 'load', 238 | model_name: modelName, 239 | quantized: quantized 240 | }); 241 | return new Promise((resolve) => { 242 | loadResolve = resolve; 243 | }); 244 | } 245 | 246 | export async function loadChat(modelName) { 247 | //const quantized = 
document.getElementById("quantized").checked; 248 | let downloadBar = document.getElementById('chat-progress'); 249 | downloadBar.style.width = '0%'; 250 | downloadBar.textContent = 'Loading model...'; 251 | 252 | if (modelName.includes("Qwen")) { 253 | worker.postMessage({ 254 | type: 'load_text-generation', 255 | model_name: modelName 256 | //quantized: quantized 257 | }); 258 | } 259 | 260 | else { 261 | worker.postMessage({ 262 | type: 'load_text2text-generation', 263 | model_name: modelName 264 | //quantized: quantized 265 | }); 266 | } 267 | return new Promise((resolve) => { 268 | loadResolve = resolve; 269 | }); 270 | } 271 | 272 | export async function loadSummary(modelName) { 273 | //const quantized = document.getElementById("quantized").checked; 274 | let downloadBar = document.getElementById('summary-progress'); 275 | downloadBar.style.width = '0%'; 276 | downloadBar.textContent = 'Loading model...'; 277 | worker.postMessage({ 278 | type: 'load_summary', 279 | model_name: modelName 280 | //quantized: quantized 281 | }); 282 | return new Promise((resolve) => { 283 | loadResolve = resolve; 284 | }); 285 | } 286 | -------------------------------------------------------------------------------- /src/js/utils.js: -------------------------------------------------------------------------------- 1 | import { getTokens } from './semantic'; 2 | import { Deck } from '@deck.gl/core'; 3 | import { ScatterplotLayer, LineLayer } from '@deck.gl/layers'; 4 | import {setProgressBarValue } from './index.js'; 5 | 6 | import * as pdfjsLib from 'pdfjs-dist/webpack.mjs'; 7 | pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.js'; 8 | 9 | //import {ScatterplotLayer} from '@deck.gl/layers'; 10 | /** 11 | * @param {string} text 12 | * @param {string} splitType 13 | * @param {string} splitParam 14 | * @returns {Promise | null>} 15 | */ 16 | export async function splitText(text, splitType, splitParam) { 17 | switch (splitType) { 18 | case 'Regex': 19 | return 
/**
 * Greedily packs space-separated words into chunks of at most `numTokens`
 * tokens, measured as (tokens of current chunk) + (tokens of next word).
 *
 * @param {string} text
 * @param {number} numTokens
 * @returns {Promise<Array<string> | null>} chunks, or null when numTokens is not an integer
 */
async function splitByTokens(text, numTokens) {
    // Without this guard a NaN limit makes every comparison false and the
    // whole text silently ends up in a single chunk.
    if (!Number.isInteger(numTokens)) {
        console.error('numTokens must be an integer.');
        return null;
    }

    const chunks = [];

    for (const word of text.split(' ')) {
        const wordTokens = await getTokens(word);
        const lastIndex = chunks.length - 1;

        // Start a new chunk when there is none yet or the word would not fit.
        if (lastIndex < 0 || (await getTokens(chunks[lastIndex])).length + wordTokens.length > numTokens) {
            chunks.push(word);
        } else {
            chunks[lastIndex] += ' ' + word;
        }
    }
    console.log("Number of chunks: " + chunks.length)
    return chunks;
}

/**
 * Splits the text into chunks of `numWords` space-separated words each
 * (the last chunk may be shorter). Blank chunks are dropped.
 *
 * @param {string} text
 * @param {number} numWords
 * @returns {Array<string> | null} chunks, or null when numWords is not an integer
 */
function splitByWords(text, numWords) {
    if (!Number.isInteger(numWords)) {
        console.error('numWords must be an integer.');
        return null;
    }

    const chunks = [];
    let currentChunk = [];

    for (const word of text.split(' ')) {
        currentChunk.push(word);

        if (currentChunk.length === numWords) {
            chunks.push(currentChunk.join(' '));
            currentChunk = [];
        }
    }

    if (currentChunk.length > 0) {
        chunks.push(currentChunk.join(' '));
    }

    const nonBlank = chunks.filter(chunk => chunk.trim().length > 0);
    console.log("Number of chunks: " + nonBlank.length)

    return nonBlank;
}

/**
 * Greedily packs space-separated words into chunks of at most `numChars`
 * characters (a single longer word still forms its own chunk).
 * Blank chunks are dropped.
 *
 * @param {string} text
 * @param {number} numChars
 * @returns {Array<string> | null} chunks, or null when numChars is not an integer
 */
function splitByChars(text, numChars) {
    // Same validation as splitByWords: a NaN limit would otherwise put the
    // whole text into one chunk without any warning.
    if (!Number.isInteger(numChars)) {
        console.error('numChars must be an integer.');
        return null;
    }

    const packed = [];

    for (const word of text.split(' ')) {
        const lastIndex = packed.length - 1;

        // +1 accounts for the joining space.
        if (lastIndex < 0 || packed[lastIndex].length + word.length + 1 > numChars) {
            packed.push(word);
        } else {
            packed[lastIndex] += ' ' + word;
        }
    }

    // Empty input or runs of spaces can produce chunks with no content.
    const chunks = packed.filter(chunk => chunk.trim().length > 0);
    console.log("Number of chunks: " + chunks.length)
    return chunks;
}

/**
 * Splits the text into sentences ending in '.', '!' or '?'. Trailing text
 * without closing punctuation is kept as a final chunk.
 *
 * @param {string} text
 * @returns {Array<string>} sentences; empty when the text has no content
 */
function splitBySentences(text) {
    // First alternative: a terminated sentence. Second alternative: the
    // unterminated remainder at the very end, which would otherwise be lost.
    // String.prototype.match returns null (not []) when nothing matches.
    const matches = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [];
    const chunks = matches.filter(chunk => chunk.trim().length > 0);
    console.log("Number of chunks: " + chunks.length)

    return chunks
}

/**
 * Splits the text at every match of the user-supplied regular expression.
 * Blank pieces are dropped.
 *
 * @param {string} text
 * @param {string} r - regular expression source
 * @returns {Array<string>}
 * @throws {SyntaxError} when `r` is not a valid regular expression
 */
function splitByRegex(text, r) {
    const regex = new RegExp(r, 'g');
    // split() inserts `undefined` for capture groups that did not take part
    // in a match and '' between adjacent separators; neither is a usable chunk.
    const chunks = text.split(regex)
        .filter(chunk => typeof chunk === 'string' && chunk.trim().length > 0);

    console.log("Number of chunks: " + chunks.length)

    return chunks
}
/**
 * Reference implementation: sorts all entries of a text -> score object by
 * descending score with Array.prototype.sort and logs how long it took.
 *
 * @param {Object<string, number>} inputTexts
 * @returns {Array<[string, number]>} all entries, highest score first
 */
function normalSorting(inputTexts) {
    const startTime = performance.now();
    const sortedResults = Object.entries(inputTexts).sort((a, b) => b[1] - a[1]);
    const endTime = performance.now();
    console.log(`Original code took ${endTime - startTime} milliseconds`);
    // Previously the result was computed and thrown away.
    return sortedResults;
}

/**
 * Binary max-heap over `[key, score]` pairs, ordered by the score at index 1.
 */
class MaxHeap {
    /**
     * @param {Array<[string, number]>} array - entries to heapify (copied, not mutated)
     */
    constructor(array) {
        this.heap = [...array];
        this.buildHeap();
    }

    /** Establishes the heap property bottom-up, starting at the last parent. */
    buildHeap() {
        const n = this.heap.length;
        for (let i = Math.floor(n / 2) - 1; i >= 0; i--) {
            this.heapifyDown(i);
        }
    }

    /**
     * Sinks the element at index `i` until both children are not larger.
     *
     * @param {number} i
     */
    heapifyDown(i) {
        let current = i;

        for (;;) {
            const left = 2 * current + 1;
            const right = 2 * current + 2;
            let largest = current;

            if (left < this.heap.length && this.heap[left][1] > this.heap[largest][1]) {
                largest = left;
            }

            if (right < this.heap.length && this.heap[right][1] > this.heap[largest][1]) {
                largest = right;
            }

            if (largest === current) {
                return;
            }

            this.swap(current, largest);
            current = largest;
        }
    }

    /**
     * Removes and returns the entry with the highest score.
     *
     * @returns {[string, number] | null} null when the heap is empty
     */
    extractMax() {
        if (this.heap.length === 0) {
            return null;
        }

        const max = this.heap[0];
        const last = this.heap.pop();

        // When the popped element was the root itself there is nothing to re-seat.
        if (this.heap.length > 0) {
            this.heap[0] = last;
            this.heapifyDown(0);
        }

        return max;
    }

    /** Exchanges the entries at indices `i` and `j`. */
    swap(i, j) {
        [this.heap[i], this.heap[j]] = [this.heap[j], this.heap[i]];
    }
}

/**
 * Returns the `n` highest-scoring entries of a text -> score object in
 * descending order, using a heap so that only `n` extractions are paid for
 * instead of a full sort.
 *
 * @param {Object<string, number>} inputTexts
 * @param {number} [n] - how many entries to return; defaults to all of them
 * @returns {Array<[string, number]>}
 */
export function heapBasedSorting(inputTexts, n = Infinity) {
    const entries = Object.entries(inputTexts);
    const maxHeap = new MaxHeap(entries);

    const nLargest = [];
    for (let i = 0; i < n && i < entries.length; i++) {
        nLargest.push(maxHeap.extractMax());
    }
    return nLargest
}
const toastMessage = document.getElementById("toastMessage");
const toastText = document.getElementById("toastText");
const closeToastButton = document.getElementById("closeToastButton");

// Handle of the pending auto-hide timer, so that a newer toast can cancel it.
let toastHideTimer = null;

/**
 * Shows a toast with the given message and hides it again after `timeout` ms.
 * Showing a new toast restarts the countdown; otherwise the timer of an
 * earlier toast would hide the newer one too early.
 *
 * @param {string} message
 * @param {number} [timeout=2500] - milliseconds until the toast is hidden
 */
export function showToast(message, timeout=2500) {
    toastText.textContent = message;
    toastMessage.style.display = "block";

    clearTimeout(toastHideTimer);
    toastHideTimer = setTimeout(() => {
        hideToast();
    }, timeout);
}

/** Hides the toast and cancels any pending auto-hide timer. */
function hideToast() {
    clearTimeout(toastHideTimer);
    toastHideTimer = null;
    toastMessage.style.display = "none";
}

closeToastButton.addEventListener("click", () => {
    hideToast();
});

/**
 * Builds line segments for a square grid spanning [-gridSize, gridSize] on
 * both axes: first all vertical lines, then all horizontal lines.
 *
 * @param {number} [gridSize=20]
 * @returns {Array<{sourcePosition: number[], targetPosition: number[], color: number[]}>}
 */
function generateGridData(gridSize = 20) {
    const gridData = [];

    // Vertical lines
    for (let i = -gridSize; i <= gridSize; i++) {
        gridData.push({
            sourcePosition: [i, -gridSize],
            targetPosition: [i, gridSize],
            color: [169, 169, 169],
        });
    }

    // Horizontal lines
    for (let j = -gridSize; j <= gridSize; j++) {
        gridData.push({
            sourcePosition: [-gridSize, j],
            targetPosition: [gridSize, j],
            color: [169, 169, 169],
        });
    }

    return gridData;
}

const plotContainer = document.getElementById("plot-container");
let deckgl;
Math.min(...data.map(item => item.y)); 369 | const maxY = Math.max(...data.map(item => item.y)); 370 | 371 | data = data.map(item => { 372 | // Normalize similarity values to the range [0, 1] 373 | const normalizedSimilarity = (item.similarity - minSimilarity) / (maxSimilarity - minSimilarity); 374 | 375 | // Normalize x and y coordinates to the range [0, 1] 376 | const normalizedX = (item.x - minX) / (maxX - minX); 377 | const normalizedY = (item.y - minY) / (maxY - minY); 378 | 379 | // Use the normalized similarity value as alpha (opacity) 380 | const alpha = Math.min(1, Math.max(0, normalizedSimilarity)); 381 | 382 | // Map the alpha value to the entire opacity spectrum 383 | const color = [0, 0, 255, Math.floor(alpha * 255)]; // RGBA format with alpha value 384 | 385 | return { 386 | coordinates: [normalizedX, normalizedY], 387 | color: color, 388 | similarity: item.similarity, 389 | label: item.label, 390 | }; 391 | }); 392 | 393 | // Calculate the bounding box of the data 394 | const bounds = data.reduce( 395 | (acc, point) => ({ 396 | minX: Math.min(acc.minX, point.coordinates[0]), 397 | minY: Math.min(acc.minY, point.coordinates[1]), 398 | maxX: Math.max(acc.maxX, point.coordinates[0]), 399 | maxY: Math.max(acc.maxY, point.coordinates[1]), 400 | }), 401 | { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity } 402 | ); 403 | 404 | deckgl = new Deck({ 405 | canvas: 'deckgl', 406 | container: 'plot-container', 407 | initialViewState: { 408 | latitude: (bounds.minY + bounds.maxY) / 2, 409 | longitude: (bounds.minX + bounds.maxX) / 2, 410 | zoom: 9 411 | }, 412 | controller: true, 413 | pickingRadius: 25, 414 | layers: [ 415 | // Add a new LineLayer for the coordinate system 416 | /*new LineLayer({ 417 | id: 'coordinate-system', 418 | data: generateGridData(20), 419 | getSourcePosition: d => d.sourcePosition, 420 | getTargetPosition: d => d.targetPosition, 421 | getColor: d => d.color, 422 | getWidth: 1, 423 | pickable: false 424 | }), 425 | */ 
426 | // ScatterplotLayer with all points added right away 427 | new ScatterplotLayer({ 428 | id: 'scatterplot', 429 | data: data, 430 | getPosition: d => d.coordinates, 431 | getRadius: parseInt(document.getElementById("scatterplotRadius").value), // Adjust the radius to fit the new range 432 | getFillColor: d => d.color, 433 | pickable: true, // Enable picking for on-hover interaction 434 | onHover: info => { 435 | const tooltip = document.getElementById('tooltip'); 436 | 437 | if (info.object) { 438 | const canvas = document.getElementById('deckgl'); 439 | const rect = canvas.getBoundingClientRect(); 440 | 441 | // Calculate the correct position by subtracting the canvas offset and adding the scroll position 442 | const left = window.scrollX + info.x + rect.left + 30; 443 | const top = window.scrollY + info.y + rect.top + -50; 444 | 445 | tooltip.innerHTML = `${info.object.label}
Similarity: ${info.object.similarity.toFixed(2)}`; 446 | tooltip.style.left = `${left}px`; 447 | tooltip.style.top = `${top}px`; 448 | tooltip.style.display = 'block'; 449 | } else { 450 | tooltip.style.display = 'none'; 451 | } 452 | }, 453 | onClick: info => { 454 | const tooltip = document.getElementById('tooltip'); 455 | 456 | if (info.object) { 457 | const canvas = document.getElementById('deckgl'); 458 | const rect = canvas.getBoundingClientRect(); 459 | 460 | // Calculate the correct position by subtracting the canvas offset and adding the scroll position 461 | const left = window.scrollX + info.x + rect.left + 30; 462 | const top = window.scrollY + info.y + rect.top + -50; 463 | 464 | tooltip.innerHTML = `${info.object.label}
Similarity: ${info.object.similarity.toFixed(2)}`; 465 | tooltip.style.left = `${left}px`; 466 | tooltip.style.top = `${top}px`; 467 | tooltip.style.display = 'block'; 468 | } else { 469 | tooltip.style.display = 'none'; 470 | } 471 | } 472 | 473 | }) 474 | ] 475 | }); 476 | 477 | plotContainer.style.height = "700px"; 478 | } 479 | 480 | export function removeScatterplot() { 481 | if (deckgl) { 482 | deckgl.finalize(); 483 | deckgl = null; 484 | } 485 | } 486 | 487 | // pdf loading logic for local and remote 488 | 489 | function processPdf(pdf, documentIdentifier, resolve, reject, updateProgress) { 490 | let numPages = pdf.numPages; 491 | let pageTextPromises = []; 492 | for (let i = 1; i <= numPages; i++) { 493 | pageTextPromises.push(pdf.getPage(i).then(page => { 494 | return page.getTextContent().then(textContent => { 495 | return textContent.items.map(item => item.str).join(' '); 496 | }); 497 | })); 498 | } 499 | Promise.all(pageTextPromises).then(pagesText => { 500 | // Concatenate text from all pages with metadata including page number 501 | let fullText = pagesText.map((pageText, index) => 502 | `[Document: ${documentIdentifier}, Page: ${index + 1}]\n${pageText}` 503 | ).join("\n\n"); 504 | resolve(fullText); // Resolve the promise with the full text including metadata 505 | }).catch(error => { 506 | reject(error); // Reject the promise if there's an error 507 | }); 508 | } 509 | 510 | function extractTextFromPDF(fileOrDataUri, updateProgress) { 511 | return new Promise((resolve, reject) => { 512 | let documentIdentifier; 513 | let pdfSource; 514 | 515 | if (fileOrDataUri instanceof File) { 516 | // For local files 517 | documentIdentifier = fileOrDataUri.name; 518 | pdfSource = URL.createObjectURL(fileOrDataUri); 519 | } else if (typeof fileOrDataUri === 'string') { 520 | if (fileOrDataUri.startsWith('data:')) { 521 | // For data URIs (remote PDFs) 522 | documentIdentifier = "RemotePDF"; 523 | pdfSource = fileOrDataUri; 524 | } else { 525 | // Assume it's 
a URL 526 | documentIdentifier = fileOrDataUri; 527 | pdfSource = fileOrDataUri; 528 | } 529 | } else { 530 | reject(new Error('Invalid input type')); 531 | return; 532 | } 533 | 534 | pdfjsLib.getDocument(pdfSource).promise.then(pdf => { 535 | processPdf(pdf, documentIdentifier, resolve, reject, updateProgress); 536 | }).catch(error => { 537 | reject(error); // Reject the promise if there's an error loading the PDF 538 | }); 539 | }); 540 | } 541 | 542 | 543 | export async function handlePdfFileUpload() { 544 | const fileInput = document.getElementById('pdf-upload'); 545 | const files = fileInput.files; // Get all selected files 546 | if (files.length > 0) { 547 | const totalFiles = files.length; 548 | let processedFiles = 0; 549 | 550 | // Map each file to a promise that resolves with its text content 551 | const filePromises = Array.from(files).map(file => { 552 | return extractTextFromPDF(file, setProgressBarValue).then(text => { 553 | processedFiles++; 554 | const progressPercentage = (processedFiles / totalFiles) * 100; 555 | setProgressBarValue(progressPercentage.toFixed(0)); 556 | console.log(progressPercentage); 557 | return text; 558 | }); 559 | }); 560 | 561 | // Wait for all files to be processed 562 | const allFilesText = await Promise.all(filePromises); 563 | // Concatenate text from all files 564 | const fullText = allFilesText.join("\n\n"); 565 | return fullText; // Return the full text 566 | } else { 567 | console.error('No files selected'); 568 | return ''; // Return an empty string or handle the error as needed 569 | } 570 | } 571 | 572 | 573 | 574 | 575 | 576 | //////////////////////////////////////////////////// 577 | 578 | async function fetchPdfAsDataUri(url) { 579 | const proxyUrl = 'https://corsproxy.io/?' 
+ url; // cors proxy unfortunately needed for remote files :/ 580 | const response = await fetch(proxyUrl); 581 | if (!response.ok) { 582 | throw new Error('Network response was not ok'); 583 | } 584 | const blob = await response.blob(); 585 | return new Promise((resolve, reject) => { 586 | const reader = new FileReader(); 587 | reader.onloadend = () => resolve(reader.result); 588 | reader.onerror = reject; 589 | reader.readAsDataURL(blob); 590 | }); 591 | } 592 | 593 | // Collects text from every URL entered in #importPdfURL: each URL is tried as a PDF first, then as a plain web page; results are joined with newlines. 594 | export async function handleRemotePdfFileUpload() { 595 | const urls = document.getElementById("importPdfURL").value.trim().split(/\s+/).filter(Boolean); // drop empty tokens from blank input or repeated/trailing whitespace so no empty URL is fetched 596 | let texts = []; 597 | 598 | for (const url of urls) { 599 | console.log(url); 600 | 601 | try { 602 | const dataUri = await fetchPdfAsDataUri(url); 603 | const text = await extractTextFromPDF(dataUri, null); 604 | texts.push(text); 605 | } catch (error) { 606 | console.log('Not a pdf, trying to parse the web page'); 607 | 608 | // Fallback to extracting text from a normal webpage 609 | try { 610 | const response = await fetch(url); 611 | const html = await response.text(); 612 | const parser = new DOMParser(); 613 | const doc = parser.parseFromString(html, 'text/html'); 614 | const pageText = doc.body.innerText; 615 | texts.push(pageText); 616 | } catch (webpageError) { 617 | console.error('Error fetching or parsing webpage:', webpageError); 618 | } 619 | } 620 | } 621 | 622 | return texts.join("\n"); 623 | } 624 | 625 | 626 | export async function handleMultipleRemotePdfFileUploads() { 627 | const urls = document.getElementById("importPdfURL").value.split(" ") 628 | const results = []; 629 | 630 | for (const url of urls) { 631 | console.log(url); 632 | 633 | try { 634 | const dataUri = await fetchPdfAsDataUri(url); 635 | const text = await extractTextFromPDF(dataUri, null); 636 | results.push(text); 637 | } catch (error) { 638 | console.error(`Error handling remote PDF file upload for URL ${url}:`, error); 639 | results.push(''); 640 | } 641 | } 642 | 643 |
return results; 644 | } 645 | -------------------------------------------------------------------------------- /src/js/worker.js: -------------------------------------------------------------------------------- 1 | import { pipeline, AutoTokenizer } from '@xenova/transformers'; 2 | import pako from 'pako'; 3 | import init, { tSNE } from "wasm-bhtsne"; 4 | import { marked } from 'marked'; 5 | 6 | init(); 7 | // env.useBrowserCache = false; // for testing 8 | 9 | /** 10 | * @type {Object} 11 | */ 12 | let embeddingsDict = {}; 13 | 14 | /** 15 | * @type {Pipeline} 16 | */ 17 | // embedding models 18 | let embedder; 19 | let tokenizer; 20 | 21 | // chat model 22 | let chat_generator; 23 | let chat_tokenizer; 24 | let chat_model_name; 25 | 26 | // summary model 27 | let summary_generator; 28 | let summary_tokenizer; 29 | 30 | let queryEmbedding; 31 | let currentNullVector = []; 32 | 33 | function minimalEightCharHash(str) { 34 | let hash = 5381; 35 | 36 | for (let i = 0; i < str.length; i++) { 37 | hash = (hash * 33) ^ str.charCodeAt(i); 38 | } 39 | 40 | // Convert to 8-character hexadecimal string 41 | const hexHash = (hash >>> 0).toString(16); 42 | return hexHash.slice(0, 8).padStart(8, '0'); 43 | } 44 | 45 | function minimalRandomEightCharHash() { 46 | const characters = '0123456789abcdef'; 47 | let hash = ''; 48 | 49 | for (let i = 0; i < 8; i++) { 50 | const randomIndex = Math.floor(Math.random() * characters.length); 51 | hash += characters[randomIndex]; 52 | } 53 | 54 | return hash; 55 | } 56 | 57 | 58 | async function token_to_text(beams, tokenizer_type) { 59 | //let chatTokenizer = await AutoTokenizer.from_pretrained(chatModel); 60 | let decoded_text = tokenizer_type.decode(beams[0].output_token_ids, { 61 | skip_special_tokens: true 62 | }); 63 | //console.log(decoded_text); 64 | return decoded_text 65 | } 66 | 67 | /** 68 | * @param {string} text 69 | * @returns {Promise} 70 | */ 71 | async function embed(text, embedNewText=true) { 72 | if (text in 
embeddingsDict) { 73 | return embeddingsDict[text]; 74 | } 75 | 76 | if (embedNewText==false){ 77 | if (currentNullVector != []){ 78 | embeddingsDict[text] = currentNullVector; 79 | return currentNullVector 80 | } 81 | else { 82 | const tempVec = await embedder("test", { pooling: 'mean', normalize: true }); 83 | currentNullVector = [...tempVec.data].fill(0.00001); 84 | embeddingsDict[text] = currentNullVector; 85 | return currentNullVector 86 | } 87 | } 88 | 89 | const e0 = await embedder(text, { pooling: 'mean', normalize: true }); 90 | 91 | const roundDecimalsDown = (num) => parseFloat(num.toFixed(3)); 92 | 93 | embeddingsDict[text] = e0.data.map(roundDecimalsDown); 94 | //console.log(embeddingsDict) 95 | return e0.data; 96 | 97 | } 98 | 99 | async function getTokens(text) { 100 | return await tokenizer(text).input_ids.data; 101 | } 102 | 103 | async function chat(text, max_new_tokens = 100) { 104 | return new Promise(async (resolve, reject) => { 105 | // hier Weiche einbauen für Qwen da tokenizer anders 106 | console.log(chat_model_name, max_new_tokens); 107 | 108 | if (chat_model_name.includes("Qwen")) { 109 | try { 110 | 111 | // Define the prompt and list of messages 112 | const messages = [ 113 | { "role": "system", "content": "You are a helpful assistant." 
}, 114 | { "role": "user", "content": text } 115 | ] 116 | 117 | const generatorText = chat_generator.tokenizer.apply_chat_template(messages, { 118 | tokenize: false, 119 | add_generation_prompt: false, 120 | }); 121 | 122 | const thisChat = await chat_generator(generatorText, { 123 | max_new_tokens: max_new_tokens, 124 | do_sample: false, 125 | callback_function: async function (beams) { 126 | //const decodedText = await token_to_text(beams, chat_generator.tokenizer); 127 | let decodedText = chat_generator.tokenizer.decode(beams[0].output_token_ids, { skip_special_tokens: false }) 128 | 129 | decodedText = decodedText.split("<|im_start|>")[3].replace("<|im_end|>","") // just return the model's output 130 | decodedText = marked(decodedText) 131 | 132 | self.postMessage({ 133 | type: 'chat', 134 | chat_text: decodedText 135 | }); 136 | 137 | resolve(decodedText); // Resolve the main promise with chat text 138 | }, 139 | }); 140 | } catch (error) { 141 | reject(error); 142 | } 143 | } 144 | 145 | else { 146 | try { 147 | const thisChat = await chat_generator(text, { 148 | max_new_tokens: max_new_tokens, 149 | return_prompt: false, 150 | callback_function: async function (beams) { 151 | const decodedText = await token_to_text(beams, chat_tokenizer); 152 | //console.log(decodedText); 153 | 154 | self.postMessage({ 155 | type: 'chat', 156 | chat_text: decodedText, 157 | }); 158 | 159 | resolve(decodedText); // Resolve the main promise with chat text 160 | }, 161 | }); 162 | } catch (error) { 163 | reject(error); 164 | } 165 | } 166 | }); 167 | } 168 | 169 | async function summary(text, max_new_tokens = 100) { 170 | return new Promise(async (resolve, reject) => { 171 | try { 172 | const thisSummary = await summary_generator(text, { 173 | max_new_tokens: max_new_tokens, 174 | return_prompt: false, 175 | callback_function: async function (beams) { 176 | const decodedText = await token_to_text(beams, summary_tokenizer); 177 | //console.log(beams) 178 | 179 | 
self.postMessage({ 180 | type: 'summary', 181 | summary_text: decodedText, 182 | }); 183 | 184 | resolve(decodedText); // Resolve the main promise with chat text 185 | }, 186 | }); 187 | } catch (error) { 188 | reject(error); 189 | } 190 | }); 191 | } 192 | 193 | // tested, trivial calculation takes 200ms for 100k embeddings of size 384 or 700 ms with size 1000 194 | const calculateAverageEmbedding = (embeddingsAsArray) => { 195 | const allEmbeddings = Object.values(embeddingsAsArray); 196 | 197 | if (allEmbeddings.length === 0) { 198 | return null; // handle the case when the input object is empty 199 | } 200 | 201 | const sumEmbeddings = allEmbeddings.reduce((acc, embedding) => { 202 | return acc.map((value, index) => value + embedding[index]); 203 | }, new Array(allEmbeddings[0].length).fill(0)); 204 | 205 | const averageEmbedding = sumEmbeddings.map(value => value / allEmbeddings.length); 206 | 207 | return averageEmbedding; 208 | }; 209 | 210 | /* 211 | const calculateAverageEmbedding = (embeddingsAsArray) => { 212 | const allEmbeddings = Object.values(embeddingsAsArray); 213 | 214 | if (allEmbeddings.length === 0) { 215 | return null; // handle the case when the input object is empty 216 | } 217 | 218 | const start = performance.now(); 219 | 220 | const sumEmbeddings = allEmbeddings.reduce((acc, embedding) => { 221 | return acc.map((value, index) => value + embedding[index]); 222 | }, new Array(allEmbeddings[0].length).fill(0)); 223 | 224 | const averageEmbedding = sumEmbeddings.map(value => value / allEmbeddings.length); 225 | 226 | const end = performance.now(); 227 | console.log('Execution time:', end - start, 'milliseconds'); 228 | 229 | return averageEmbedding; 230 | }; 231 | 232 | // Generate random embeddings for testing 233 | const generateRandomEmbedding = (size) => { 234 | return Array.from({ length: size }, () => Math.random()); 235 | }; 236 | 237 | // Generate test data with 10,000 strings and embeddings of size 1000 238 | const 
generateTestEmbeddings = (numStrings, embeddingSize) => { 239 | const testData = {}; 240 | for (let i = 1; i <= numStrings; i++) { 241 | const key = `string${i}`; 242 | const embedding = generateRandomEmbedding(embeddingSize); 243 | testData[key] = embedding; 244 | } 245 | return testData; 246 | }; 247 | 248 | // Test the calculateAverageEmbedding function with generated data 249 | const testEmbeddingsAsArray = generateTestEmbeddings(100000, 1000); 250 | const averageEmbedding = calculateAverageEmbedding(testEmbeddingsAsArray); 251 | 252 | console.log('Average Embedding:', averageEmbedding); 253 | */ 254 | 255 | function convert_to_underscores(inputString) { 256 | // Replace spaces with underscores 257 | var stringWithUnderscores = lowercaseString.replace(/\s/g, '_'); 258 | 259 | return stringWithUnderscores; 260 | } 261 | function createRandomMatrix(rows, columns) { 262 | return Array.from({ length: rows }, () => 263 | Array.from({ length: columns }, () => Math.random()) 264 | ); 265 | } 266 | // Function to update embeddingsDict 267 | const updateEmbeddingsDict = (newData) => { 268 | embeddingsDict = newData; 269 | postMessage({ type: 'updateEmbeddingsDict', data: embeddingsDict }); 270 | }; 271 | 272 | function convertFloat32ArraysToArrays(arrayOfFloat32Arrays) { 273 | return arrayOfFloat32Arrays.reduce((accumulator, currentFloat32Array) => { 274 | // Convert Float32Array to a regular JavaScript array using Array.from 275 | const jsArray = Array.from(currentFloat32Array); 276 | 277 | // Add the converted array to the accumulator 278 | accumulator.push(jsArray); 279 | 280 | return accumulator; 281 | }, []); 282 | } 283 | 284 | function calculateCosineSimilarity(embedding) { 285 | let dotProduct = 0; 286 | let queryMagnitude = 0; 287 | let embeddingMagnitude = 0; 288 | const queryEmbeddingLength = queryEmbedding.length; 289 | 290 | for (let i = 0; i < queryEmbeddingLength; i++) { 291 | dotProduct += queryEmbedding[i] * embedding[i]; 292 | queryMagnitude += 
queryEmbedding[i] ** 2; 293 | embeddingMagnitude += embedding[i] ** 2; 294 | } 295 | 296 | return dotProduct / (Math.sqrt(queryMagnitude) * Math.sqrt(embeddingMagnitude)); 297 | } 298 | 299 | // Expose a function to manually update embeddingsDict 300 | self.updateEmbeddingsDictManually = updateEmbeddingsDict; 301 | 302 | self.onmessage = async (event) => { 303 | const message = event.data; 304 | //console.log(message) 305 | let roundDecimals; 306 | let embeddingsAsArray; 307 | let exportDict; 308 | let gzippedData; 309 | let text; 310 | let embedding; 311 | 312 | // Other cases in your existing switch statement 313 | switch (message.type) { 314 | case 'logEmbeddingsDict': 315 | console.log(embeddingsDict); 316 | break 317 | case 'tsne': 318 | const start = performance.now(); 319 | const valuesFloat32Array = Array.from(Object.values(embeddingsDict)); 320 | let valuesArray = convertFloat32ArraysToArrays(valuesFloat32Array); 321 | const valuesArrayLength = valuesArray.length; 322 | //console.log(valuesArrayLength); 323 | // Check if the length is below 61 to set perplexity to a different value, needs slight refactoring to 324 | // get rid of this workaround 325 | 326 | let compressed_vectors; 327 | if (valuesArrayLength < 61) { 328 | const vectorLength = valuesArray[0].length; // Assuming all vectors have the same length 329 | const vectorsToAdd = 61 - valuesArrayLength; 330 | 331 | console.log("added: ", vectorsToAdd) 332 | // Add random vectors to the array 333 | for (let i = 0; i < vectorsToAdd; i++) { 334 | const randomVector = Array.from({ length: vectorLength }, () => Math.random()); 335 | valuesArray.push(randomVector); 336 | } 337 | 338 | const tsne_encoder = new tSNE(valuesArray); 339 | compressed_vectors = tsne_encoder.barnes_hut(message.data.iterations).slice(0, valuesArrayLength);//,theta=0.1); 340 | } 341 | else { 342 | const tsne_encoder = new tSNE(valuesArray); 343 | compressed_vectors = tsne_encoder.barnes_hut(message.data.iterations); 344 | 345 | } 
346 | 347 | //console.log("Compressed Vectors:", compressed_vectors); 348 | const end = performance.now(); 349 | console.log('BHtSNE Execution time:', Math.round(end - start), 'ms'); 350 | 351 | //text = message.text; 352 | //embedding = await embed(text); 353 | 354 | const originalKeys = Object.keys(embeddingsDict); 355 | const originalEmbeddings = Object.values(embeddingsDict) 356 | 357 | // Assuming compressed_vectors is now an array of arrays 358 | let plotDataArray = []; 359 | 360 | for (let i = 0; i < originalKeys.length; i++) { 361 | let thisVec = compressed_vectors[i]; 362 | let similarity = calculateCosineSimilarity(originalEmbeddings[i]); 363 | 364 | if (similarity >= message.data.dimensionalityReductionSimilarityThreshold) { 365 | plotDataArray.push({ "x": thisVec[0], "y": thisVec[1], "label": originalKeys[i], "similarity": similarity }); 366 | } 367 | } 368 | 369 | console.log(plotDataArray) 370 | 371 | // Now reconstructedDict will have the original format 372 | //console.log(plotDataArray); 373 | 374 | //loadScatterplot(plotDataArray); 375 | 376 | self.postMessage({ 377 | type: 'tsne', 378 | plotDataArray 379 | }); 380 | break 381 | 382 | case 'importEmbeddingsDict': 383 | embeddingsDict = message.data; 384 | break 385 | case 'exportEmbeddingsDict': 386 | roundDecimals = (num) => parseFloat(num.toFixed(parseInt(message.data.meta.exportDecimals))); 387 | 388 | embeddingsAsArray = Object.fromEntries( 389 | Object.entries(embeddingsDict).map(([key, values]) => [key, Object.values(values).map(roundDecimals)]) 390 | ); 391 | 392 | const meanEmbedding = calculateAverageEmbedding(embeddingsAsArray) 393 | // adding mean embedding so all indexed docs on HF could be ingested in a "proper" vector DB! 
394 | exportDict = { 395 | "meta": message.data.meta, "text": message.data.text, 396 | "index": embeddingsAsArray, 397 | "mean_embedding": meanEmbedding 398 | } 399 | 400 | exportDict.meta.chunks = Object.keys(embeddingsAsArray).length; 401 | 402 | console.log("Document average embedding", meanEmbedding); 403 | console.log("Metadata", exportDict.meta); 404 | 405 | gzippedData = pako.gzip(JSON.stringify(exportDict), { to: 'string' }); 406 | 407 | const tempFilename = `${message.data.meta.textTitle.replace(/\s/g, '_')}_${minimalRandomEightCharHash()}.json.gz` 408 | // Send the gzipped data as a response 409 | self.postMessage({ type: 'embeddingsDict', data: gzippedData, filename: tempFilename }); 410 | break; 411 | 412 | case 'load': 413 | embeddingsDict = {}; // clear dict 414 | tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick 415 | embedder = await pipeline('feature-extraction', message.model_name, 416 | { 417 | quantized: message.quantized, 418 | progress_callback: data => { 419 | self.postMessage({ 420 | type: 'download', 421 | data 422 | }); 423 | } 424 | 425 | }); 426 | break; 427 | case 'load_summary': 428 | summary_tokenizer = await AutoTokenizer.from_pretrained(message.model_name) 429 | summary_generator = await pipeline('summarization', message.model_name, 430 | { 431 | progress_callback: data => { 432 | self.postMessage({ 433 | type: 'summary_download', 434 | data 435 | }); 436 | } 437 | //quantized: message.quantized // currently not possible, models unquantized way too large! 
438 | }); 439 | break; 440 | case 'load_text2text-generation': 441 | console.log("loading chat"); 442 | chat_model_name = message.model_name; 443 | chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick 444 | chat_generator = await pipeline('text2text-generation', message.model_name, 445 | { 446 | progress_callback: data => { 447 | self.postMessage({ 448 | type: 'chat_download', 449 | data 450 | }); 451 | } 452 | //quantized: message.quantized // currently not possible, models unquantized way too large! 453 | }); 454 | break; 455 | case 'load_text-generation': 456 | console.log("loading chat"); 457 | chat_model_name = message.model_name; 458 | chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name) // no progress callbacks -- assume its quick 459 | chat_generator = await pipeline('text-generation', message.model_name, 460 | { 461 | progress_callback: data => { 462 | self.postMessage({ 463 | type: 'chat_download', 464 | data 465 | }); 466 | } 467 | //quantized: message.quantized // currently not possible, models unquantized way too large! 
468 | }); 469 | console.log("chat loaded"); 470 | break; 471 | case 'query': 472 | text = message.text; 473 | embedding = await embed(text); 474 | queryEmbedding = embedding; 475 | currentNullVector = [...Object.values(embeddingsDict)[0]].fill(0.00001); 476 | self.postMessage({ 477 | type: 'query', 478 | embedding 479 | }); 480 | break; 481 | case 'similarity': 482 | text = message.text; 483 | embedding = await embed(text, message.inferencingActive); 484 | self.postMessage({ 485 | type: 'similarity', 486 | text, 487 | embedding 488 | }); 489 | break; 490 | case 'getTokens': 491 | text = message.text; 492 | self.postMessage({ 493 | type: 'tokens', 494 | text, 495 | tokens: await getTokens(text) 496 | }); 497 | break; 498 | case 'summary': 499 | text = message.text; 500 | let summary_text = await summary(text, message.max_new_tokens); 501 | self.postMessage({ 502 | type: 'summary', 503 | summary_text 504 | }); 505 | break; 506 | case 'chat': 507 | text = message.text; 508 | let chat_text = await chat(text, message.max_new_tokens); 509 | self.postMessage({ 510 | type: 'chat', 511 | chat_text 512 | }); 513 | break; 514 | 515 | default: 516 | } 517 | }; 518 | 519 | -------------------------------------------------------------------------------- /src/models/model_miner.js: -------------------------------------------------------------------------------- 1 | // model mining script - necessary as huggingface.co doesn not allow requests from other domains e.g. github.io 2 | // execute this script for each sorter while on https://huggingface.co/models 3 | // downloads the json file 4 | 5 | let out_json = {} 6 | const sorter = "modified" // // likes, downloads, trending, modified 7 | const pipeline_tag = "feature-extraction" // text2text2 etc. 
8 | const fileName = `${pipeline_tag}_${sorter}.json`; 9 | 10 | function downloadJsonToFile(jsonData, fileName) { 11 | // Create a Blob object from the JSON data 12 | const blob = new Blob([JSON.stringify(jsonData)], { type: "application/json" }); 13 | 14 | // Create a URL for the Blob 15 | const url = URL.createObjectURL(blob); 16 | 17 | // Create a link element for the download 18 | const a = document.createElement("a"); 19 | a.href = url; 20 | a.download = fileName; 21 | 22 | // Trigger a click event on the link to initiate the download 23 | a.click(); 24 | 25 | // Clean up by revoking the URL 26 | URL.revokeObjectURL(url); 27 | } 28 | 29 | async function fetchAllPages() { 30 | const baseUrl = "https://huggingface.co/models-json"; 31 | const commonParams = `?pipeline_tag=${pipeline_tag}&library=transformers.js&sort=${sorter}`; 32 | const numPages = 3; // Change this if you need more or fewer pages 33 | 34 | const models = []; 35 | 36 | for (let pageIndex = 0; pageIndex < numPages; pageIndex++) { 37 | const url = `${baseUrl}${commonParams}&p=${pageIndex}`; 38 | 39 | try { 40 | const response = await fetch(url); 41 | const data = await response.json(); 42 | models.push(...data.models); 43 | } catch (error) { 44 | console.error(`Error fetching page ${pageIndex}: ${error}`); 45 | } 46 | } 47 | 48 | const result = { 49 | activeFilters: { 50 | pipeline_tag: ["feature-extraction"], 51 | library: ["transformers.js"], 52 | dataset: [], 53 | language: [], 54 | license: [], 55 | other: [], 56 | }, 57 | models, 58 | numItemsPerPage: 30, 59 | numTotalItems: models.length, 60 | pageIndex: 0, 61 | }; 62 | 63 | out_json = result; 64 | 65 | downloadJsonToFile(result, fileName); 66 | } 67 | 68 | fetchAllPages(); 69 | 70 | -------------------------------------------------------------------------------- /src/models/model_miner_simple.js: -------------------------------------------------------------------------------- 1 | // simplified script for just downloading all models from the 
current HF page 2 | // set the filters on HF and run it in the browser console 3 | // e.g. go to https://huggingface.co/models?pipeline_tag=text2text-generation&library=transformers.js&sort=trending 4 | 5 | const h4Elements = document.querySelectorAll("h4"); 6 | const h4TextArray = []; 7 | 8 | h4Elements.forEach(element => { 9 | h4TextArray.push(element.textContent); 10 | }); 11 | 12 | console.log(h4TextArray); 13 | 14 | //[ 15 | // "Xenova/t5-small", 16 | // "Xenova/flan-t5-small", 17 | // "Xenova/LaMini-Flan-T5-783M", 18 | // "Xenova/LaMini-Flan-T5-248M", 19 | // "Xenova/LaMini-Flan-T5-77M", 20 | // "Xenova/LaMini-T5-61M", 21 | // "Xenova/LaMini-T5-738M", 22 | // "Xenova/LaMini-T5-223M", 23 | // "Xenova/mt5-small", 24 | // "Xenova/mt5-base", 25 | // "Xenova/t5-base", 26 | // "Xenova/t5-v1_1-base", 27 | // "Xenova/flan-t5-base", 28 | // "Xenova/t5-v1_1-small", 29 | // "Xenova/blenderbot-400M-distill", 30 | // "Xenova/blenderbot_small-90M", 31 | // "Xenova/long-t5-tglobal-base", 32 | // "Xenova/long-t5-local-base", 33 | // "Xenova/long-t5-tglobal-base-16384-book-summary" 34 | //] -------------------------------------------------------------------------------- /src/models/model_size_miner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 25, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests\n", 10 | "from bs4 import BeautifulSoup\n", 11 | "import json\n", 12 | "\n", 13 | "# Load URLs from the JSON file\n", 14 | "with open('feature-extraction_downloads.json', 'r') as json_file:\n", 15 | " data = json.load(json_file)\n", 16 | " # urls = data.get('urls', [])\n", 17 | "ids = [i[\"id\"] for i in data[\"models\"]]\n", 18 | "urls = [f\"https://huggingface.co/{i}/tree/main/onnx\" for i in ids]" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 26, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | 
"text/plain": [ 29 | "{'author': 'TaylorAI',\n", 30 | " 'authorData': {'avatarUrl': 'https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/63917e16b6b839bb61483dbf/Utq89ebo7Glxfls0QZnxK.png?w=200&h=200&f=face',\n", 31 | " 'fullname': 'Taylor',\n", 32 | " 'name': 'TaylorAI',\n", 33 | " 'type': 'org',\n", 34 | " 'isHf': False},\n", 35 | " 'downloads': 1752,\n", 36 | " 'gated': False,\n", 37 | " 'id': 'TaylorAI/gte-tiny',\n", 38 | " 'lastModified': '2023-10-07T05:20:49.000Z',\n", 39 | " 'likes': 102,\n", 40 | " 'pipeline_tag': 'sentence-similarity',\n", 41 | " 'private': False,\n", 42 | " 'repoType': 'model',\n", 43 | " 'isLikedByUser': False}" 44 | ] 45 | }, 46 | "execution_count": 26, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "data[\"models\"][0]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 27, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "https://huggingface.co/TaylorAI/gte-tiny/tree/main/onnx | 22.9\n", 65 | "https://huggingface.co/Supabase/gte-small/tree/main/onnx | 34\n", 66 | "https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx | 23\n", 67 | "https://huggingface.co/Xenova/bge-large-en-v1.5/tree/main/onnx | 337\n", 68 | "https://huggingface.co/Supabase/bge-small-en/tree/main/onnx | 34\n", 69 | "https://huggingface.co/Xenova/gte-small/tree/main/onnx | 34\n", 70 | "https://huggingface.co/Xenova/all-mpnet-base-v2/tree/main/onnx | 110\n", 71 | "https://huggingface.co/Xenova/paraphrase-mpnet-base-v2/tree/main/onnx | 110\n", 72 | "https://huggingface.co/Xenova/all-MiniLM-L12-v2/tree/main/onnx | 34\n", 73 | "https://huggingface.co/Xenova/multilingual-e5-small/tree/main/onnx | 118\n", 74 | "https://huggingface.co/Xenova/gte-large/tree/main/onnx | 337\n", 75 | "https://huggingface.co/Xenova/bge-base-en-v1.5/tree/main/onnx | 110\n", 76 | 
"https://huggingface.co/Xenova/all-roberta-large-v1/tree/main/onnx | 357\n", 77 | "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v2/tree/main/onnx | 135\n", 78 | "https://huggingface.co/Xenova/paraphrase-multilingual-mpnet-base-v2/tree/main/onnx | 279\n", 79 | "https://huggingface.co/Xenova/bge-large-zh/tree/main/onnx | 327\n", 80 | "https://huggingface.co/Xenova/multilingual-e5-base/tree/main/onnx | 279\n", 81 | "https://huggingface.co/Xenova/bge-small-en-v1.5/tree/main/onnx | 34\n", 82 | "https://huggingface.co/Xenova/paraphrase-albert-small-v2/tree/main/onnx | 39.7\n", 83 | "https://huggingface.co/Xenova/paraphrase-albert-base-v2/tree/main/onnx | 40\n", 84 | "https://huggingface.co/Xenova/squeezebert-uncased/tree/main/onnx | 51.2\n", 85 | "https://huggingface.co/Xenova/squeezebert-mnli/tree/main/onnx | 51.3\n", 86 | "https://huggingface.co/Xenova/vit-base-patch16-224-in21k/tree/main/onnx | 87.5\n", 87 | "https://huggingface.co/Xenova/all-distilroberta-v1/tree/main/onnx | 82.1\n", 88 | "https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2/tree/main/onnx | 118\n", 89 | "https://huggingface.co/Xenova/paraphrase-MiniLM-L6-v2/tree/main/onnx | 23\n", 90 | "https://huggingface.co/Xenova/bert-base-nli-mean-tokens/tree/main/onnx | 110\n", 91 | "https://huggingface.co/Xenova/distilbert-base-nli-mean-tokens/tree/main/onnx | 66.9\n", 92 | "https://huggingface.co/Xenova/distilbert-base-nli-stsb-mean-tokens/tree/main/onnx | 66.9\n", 93 | "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v1/tree/main/onnx | 135\n", 94 | "https://huggingface.co/Xenova/msmarco-distilbert-base-v4/tree/main/onnx | 66.9\n", 95 | "https://huggingface.co/Xenova/multi-qa-MiniLM-L6-cos-v1/tree/main/onnx | 23\n", 96 | "https://huggingface.co/Xenova/multi-qa-distilbert-cos-v1/tree/main/onnx | 66.9\n", 97 | "https://huggingface.co/Xenova/multi-qa-mpnet-base-cos-v1/tree/main/onnx | 110\n", 98 | 
"https://huggingface.co/Xenova/multi-qa-mpnet-base-dot-v1/tree/main/onnx | 110\n", 99 | "https://huggingface.co/Xenova/nli-mpnet-base-v2/tree/main/onnx | 110\n", 100 | "https://huggingface.co/Xenova/paraphrase-MiniLM-L3-v2/tree/main/onnx | 17.5\n", 101 | "https://huggingface.co/Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens/tree/main/onnx | 279\n", 102 | "https://huggingface.co/Xenova/dino-vitb16/tree/main/onnx | 87.5\n", 103 | "https://huggingface.co/Xenova/dino-vits8/tree/main/onnx | 23.4\n", 104 | "https://huggingface.co/Xenova/dino-vitb8/tree/main/onnx | 88.8\n", 105 | "https://huggingface.co/Xenova/dino-vits16/tree/main/onnx | 22.7\n", 106 | "https://huggingface.co/Xenova/scibert_scivocab_uncased/tree/main/onnx | 111\n", 107 | "https://huggingface.co/Xenova/spanbert-large-cased/tree/main/onnx | 335\n", 108 | "https://huggingface.co/Xenova/spanbert-base-cased/tree/main/onnx | 109\n", 109 | "https://huggingface.co/sdan/simple-embeddings/tree/main/onnx | 23\n", 110 | "https://huggingface.co/Xenova/sentence_bert/tree/main/onnx | 110\n", 111 | "https://huggingface.co/Xenova/e5-small-v2/tree/main/onnx | 34\n", 112 | "https://huggingface.co/Xenova/SapBERT-from-PubMedBERT-fulltext/tree/main/onnx | 110\n", 113 | "https://huggingface.co/Xenova/indobert-base-p1/tree/main/onnx | 125\n", 114 | "https://huggingface.co/Xenova/UMLSBert_ENG/tree/main/onnx | 110\n", 115 | "https://huggingface.co/Xenova/rubert-base-cased/tree/main/onnx | 178\n", 116 | "https://huggingface.co/Xenova/kobert/tree/main/onnx | 92.8\n", 117 | "https://huggingface.co/Xenova/e5-small/tree/main/onnx | 34\n", 118 | "https://huggingface.co/Xenova/e5-large/tree/main/onnx | 337\n", 119 | "https://huggingface.co/Xenova/e5-large-v2/tree/main/onnx | 337\n", 120 | "https://huggingface.co/Xenova/e5-base/tree/main/onnx | 110\n", 121 | "https://huggingface.co/Xenova/e5-base-v2/tree/main/onnx | 110\n", 122 | "https://huggingface.co/Xenova/instructor-base/tree/main/onnx | 110\n", 123 | 
"https://huggingface.co/Xenova/instructor-large/tree/main/onnx | 337\n", 124 | "https://huggingface.co/Xenova/sentence-t5-large/tree/main/onnx | 337\n", 125 | "https://huggingface.co/Xenova/multilingual-e5-large/tree/main/onnx | 562\n", 126 | "https://huggingface.co/Xenova/mms-300m/tree/main/onnx | 318\n", 127 | "https://huggingface.co/Xenova/mms-1b/tree/main/onnx | 969\n", 128 | "https://huggingface.co/Supabase/e5-small-v2/tree/main/onnx | 34\n", 129 | "https://huggingface.co/Supabase/all-MiniLM-L6-v2/tree/main/onnx | 23\n", 130 | "https://huggingface.co/Xenova/gte-base/tree/main/onnx | 110\n", 131 | "https://huggingface.co/Xenova/bge-small-en/tree/main/onnx | 34\n", 132 | "https://huggingface.co/Xenova/bge-base-en/tree/main/onnx | 110\n", 133 | "https://huggingface.co/Xenova/bge-large-en/tree/main/onnx | 337\n", 134 | "https://huggingface.co/ggrn/bge-small-en/tree/main/onnx | 34\n", 135 | "https://huggingface.co/Xenova/bge-base-zh/tree/main/onnx | 103\n", 136 | "https://huggingface.co/Xenova/bge-large-zh-noinstruct/tree/main/onnx | 327\n", 137 | "https://huggingface.co/Xenova/bge-small-zh/tree/main/onnx | 24\n", 138 | "https://huggingface.co/Xenova/ClinicalBERT/tree/main/onnx | 229\n", 139 | "https://huggingface.co/Xenova/LaBSE/tree/main/onnx | 472\n", 140 | "https://huggingface.co/Xenova/wavlm-base/tree/main/onnx | 95.8\n", 141 | "https://huggingface.co/Xenova/wavlm-base-plus/tree/main/onnx | 95.8\n", 142 | "https://huggingface.co/Xenova/wavlm-large/tree/main/onnx | 319\n", 143 | "https://huggingface.co/Xenova/sentence-camembert-large/tree/main/onnx | 339\n", 144 | "https://huggingface.co/Xenova/herbert-base-cased/tree/main/onnx | 125\n", 145 | "https://huggingface.co/Xenova/herbert-large-cased/tree/main/onnx | 357\n", 146 | "https://huggingface.co/Xenova/bge-large-zh-v1.5/tree/main/onnx | 327\n", 147 | "https://huggingface.co/Xenova/bge-base-zh-v1.5/tree/main/onnx | 103\n", 148 | "https://huggingface.co/Xenova/bge-small-zh-v1.5/tree/main/onnx | 24\n", 149 | 
"https://huggingface.co/leolee9086/text2vec-base-chinese/tree/main/onnx | 103\n", 150 | "https://huggingface.co/Xenova/long-t5-encodec-tglobal-base/tree/main/onnx | 291\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "def extract_size_from_url(url):\n", 156 | " try:\n", 157 | " response = requests.get(url)\n", 158 | " if response.status_code == 200:\n", 159 | " soup = BeautifulSoup(response.text, 'html.parser')\n", 160 | " \n", 161 | " # Find the 'a' tag with the specified title attribute\n", 162 | " a_tag = soup.find('a', title=\"Download file\")\n", 163 | " if a_tag:\n", 164 | " size = a_tag.text.strip() # Extract the size text\n", 165 | " return size\n", 166 | " else:\n", 167 | " return \"Size not found\"\n", 168 | " else:\n", 169 | " return \"Failed to retrieve the page\"\n", 170 | " except requests.exceptions.RequestException as e:\n", 171 | " return f\"Request error: {e}\"\n", 172 | "\n", 173 | "# Iterate through the URLs and extract values\n", 174 | "sizes = []\n", 175 | "\n", 176 | "for url in urls:\n", 177 | " values = extract_values_from_url(url)[-1].split(\" MB\")[0]\n", 178 | " sizes.append([url.split(\"https://huggingface.co/\")[1].split(\"/tree/main/onnx\")[0],values])\n", 179 | " print(f\"{url} | {values}\")\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 28, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "[['TaylorAI/gte-tiny', '22.9'],\n", 191 | " ['Supabase/gte-small', '34'],\n", 192 | " ['Xenova/all-MiniLM-L6-v2', '23'],\n", 193 | " ['Xenova/bge-large-en-v1.5', '337'],\n", 194 | " ['Supabase/bge-small-en', '34'],\n", 195 | " ['Xenova/gte-small', '34'],\n", 196 | " ['Xenova/all-mpnet-base-v2', '110'],\n", 197 | " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n", 198 | " ['Xenova/all-MiniLM-L12-v2', '34'],\n", 199 | " ['Xenova/multilingual-e5-small', '118'],\n", 200 | " ['Xenova/gte-large', '337'],\n", 201 | " ['Xenova/bge-base-en-v1.5', '110'],\n", 202 | " 
['Xenova/all-roberta-large-v1', '357'],\n", 203 | " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n", 204 | " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n", 205 | " ['Xenova/bge-large-zh', '327'],\n", 206 | " ['Xenova/multilingual-e5-base', '279'],\n", 207 | " ['Xenova/bge-small-en-v1.5', '34'],\n", 208 | " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n", 209 | " ['Xenova/paraphrase-albert-base-v2', '40'],\n", 210 | " ['Xenova/squeezebert-uncased', '51.2'],\n", 211 | " ['Xenova/squeezebert-mnli', '51.3'],\n", 212 | " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n", 213 | " ['Xenova/all-distilroberta-v1', '82.1'],\n", 214 | " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n", 215 | " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n", 216 | " ['Xenova/bert-base-nli-mean-tokens', '110'],\n", 217 | " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n", 218 | " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n", 219 | " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n", 220 | " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n", 221 | " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n", 222 | " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n", 223 | " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n", 224 | " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n", 225 | " ['Xenova/nli-mpnet-base-v2', '110'],\n", 226 | " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n", 227 | " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n", 228 | " ['Xenova/dino-vitb16', '87.5'],\n", 229 | " ['Xenova/dino-vits8', '23.4'],\n", 230 | " ['Xenova/dino-vitb8', '88.8'],\n", 231 | " ['Xenova/dino-vits16', '22.7'],\n", 232 | " ['Xenova/scibert_scivocab_uncased', '111'],\n", 233 | " ['Xenova/spanbert-large-cased', '335'],\n", 234 | " ['Xenova/spanbert-base-cased', '109'],\n", 235 | " ['sdan/simple-embeddings', '23'],\n", 236 | " ['Xenova/sentence_bert', '110'],\n", 237 | " ['Xenova/e5-small-v2', '34'],\n", 238 | " 
['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n", 239 | " ['Xenova/indobert-base-p1', '125'],\n", 240 | " ['Xenova/UMLSBert_ENG', '110'],\n", 241 | " ['Xenova/rubert-base-cased', '178'],\n", 242 | " ['Xenova/kobert', '92.8'],\n", 243 | " ['Xenova/e5-small', '34'],\n", 244 | " ['Xenova/e5-large', '337'],\n", 245 | " ['Xenova/e5-large-v2', '337'],\n", 246 | " ['Xenova/e5-base', '110'],\n", 247 | " ['Xenova/e5-base-v2', '110'],\n", 248 | " ['Xenova/instructor-base', '110'],\n", 249 | " ['Xenova/instructor-large', '337'],\n", 250 | " ['Xenova/sentence-t5-large', '337'],\n", 251 | " ['Xenova/multilingual-e5-large', '562'],\n", 252 | " ['Xenova/mms-300m', '318'],\n", 253 | " ['Xenova/mms-1b', '969'],\n", 254 | " ['Supabase/e5-small-v2', '34'],\n", 255 | " ['Supabase/all-MiniLM-L6-v2', '23'],\n", 256 | " ['Xenova/gte-base', '110'],\n", 257 | " ['Xenova/bge-small-en', '34'],\n", 258 | " ['Xenova/bge-base-en', '110'],\n", 259 | " ['Xenova/bge-large-en', '337'],\n", 260 | " ['ggrn/bge-small-en', '34'],\n", 261 | " ['Xenova/bge-base-zh', '103'],\n", 262 | " ['Xenova/bge-large-zh-noinstruct', '327'],\n", 263 | " ['Xenova/bge-small-zh', '24'],\n", 264 | " ['Xenova/ClinicalBERT', '229'],\n", 265 | " ['Xenova/LaBSE', '472'],\n", 266 | " ['Xenova/wavlm-base', '95.8'],\n", 267 | " ['Xenova/wavlm-base-plus', '95.8'],\n", 268 | " ['Xenova/wavlm-large', '319'],\n", 269 | " ['Xenova/sentence-camembert-large', '339'],\n", 270 | " ['Xenova/herbert-base-cased', '125'],\n", 271 | " ['Xenova/herbert-large-cased', '357'],\n", 272 | " ['Xenova/bge-large-zh-v1.5', '327'],\n", 273 | " ['Xenova/bge-base-zh-v1.5', '103'],\n", 274 | " ['Xenova/bge-small-zh-v1.5', '24'],\n", 275 | " ['leolee9086/text2vec-base-chinese', '103'],\n", 276 | " ['Xenova/long-t5-encodec-tglobal-base', '291']]" 277 | ] 278 | }, 279 | "execution_count": 28, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "sizes" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 
290 | "execution_count": 29, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "sizes_backup = [['TaylorAI/gte-tiny', '22.9'],\n", 295 | " ['Supabase/gte-small', '34'],\n", 296 | " ['Xenova/all-MiniLM-L6-v2', '23'],\n", 297 | " ['Xenova/bge-large-en-v1.5', '337'],\n", 298 | " ['Supabase/bge-small-en', '34'],\n", 299 | " ['Xenova/gte-small', '34'],\n", 300 | " ['Xenova/all-mpnet-base-v2', '110'],\n", 301 | " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n", 302 | " ['Xenova/all-MiniLM-L12-v2', '34'],\n", 303 | " ['Xenova/multilingual-e5-small', '118'],\n", 304 | " ['Xenova/gte-large', '337'],\n", 305 | " ['Xenova/bge-base-en-v1.5', '110'],\n", 306 | " ['Xenova/all-roberta-large-v1', '357'],\n", 307 | " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n", 308 | " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n", 309 | " ['Xenova/bge-large-zh', '327'],\n", 310 | " ['Xenova/multilingual-e5-base', '279'],\n", 311 | " ['Xenova/bge-small-en-v1.5', '34'],\n", 312 | " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n", 313 | " ['Xenova/paraphrase-albert-base-v2', '40'],\n", 314 | " ['Xenova/squeezebert-uncased', '51.2'],\n", 315 | " ['Xenova/squeezebert-mnli', '51.3'],\n", 316 | " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n", 317 | " ['Xenova/all-distilroberta-v1', '82.1'],\n", 318 | " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n", 319 | " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n", 320 | " ['Xenova/bert-base-nli-mean-tokens', '110'],\n", 321 | " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n", 322 | " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n", 323 | " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n", 324 | " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n", 325 | " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n", 326 | " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n", 327 | " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n", 328 | " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n", 
329 | " ['Xenova/nli-mpnet-base-v2', '110'],\n", 330 | " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n", 331 | " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n", 332 | " ['Xenova/dino-vitb16', '87.5'],\n", 333 | " ['Xenova/dino-vits8', '23.4'],\n", 334 | " ['Xenova/dino-vitb8', '88.8'],\n", 335 | " ['Xenova/dino-vits16', '22.7'],\n", 336 | " ['Xenova/scibert_scivocab_uncased', '111'],\n", 337 | " ['Xenova/spanbert-large-cased', '335'],\n", 338 | " ['Xenova/spanbert-base-cased', '109'],\n", 339 | " ['sdan/simple-embeddings', '23'],\n", 340 | " ['Xenova/sentence_bert', '110'],\n", 341 | " ['Xenova/e5-small-v2', '34'],\n", 342 | " ['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n", 343 | " ['Xenova/indobert-base-p1', '125'],\n", 344 | " ['Xenova/UMLSBert_ENG', '110'],\n", 345 | " ['Xenova/rubert-base-cased', '178'],\n", 346 | " ['Xenova/kobert', '92.8'],\n", 347 | " ['Xenova/e5-small', '34'],\n", 348 | " ['Xenova/e5-large', '337'],\n", 349 | " ['Xenova/e5-large-v2', '337'],\n", 350 | " ['Xenova/e5-base', '110'],\n", 351 | " ['Xenova/e5-base-v2', '110'],\n", 352 | " ['Xenova/instructor-base', '110'],\n", 353 | " ['Xenova/instructor-large', '337'],\n", 354 | " ['Xenova/sentence-t5-large', '337'],\n", 355 | " ['Xenova/multilingual-e5-large', '562'],\n", 356 | " ['Xenova/mms-300m', '318'],\n", 357 | " ['Xenova/mms-1b', '969'],\n", 358 | " ['Supabase/e5-small-v2', '34'],\n", 359 | " ['Supabase/all-MiniLM-L6-v2', '23'],\n", 360 | " ['Xenova/gte-base', '110'],\n", 361 | " ['Xenova/bge-small-en', '34'],\n", 362 | " ['Xenova/bge-base-en', '110'],\n", 363 | " ['Xenova/bge-large-en', '337'],\n", 364 | " ['ggrn/bge-small-en', '34'],\n", 365 | " ['Xenova/bge-base-zh', '103'],\n", 366 | " ['Xenova/bge-large-zh-noinstruct', '327'],\n", 367 | " ['Xenova/bge-small-zh', '24'],\n", 368 | " ['Xenova/ClinicalBERT', '229'],\n", 369 | " ['Xenova/LaBSE', '472'],\n", 370 | " ['Xenova/wavlm-base', '95.8'],\n", 371 | " ['Xenova/wavlm-base-plus', '95.8'],\n", 372 | " 
['Xenova/wavlm-large', '319'],\n", 373 | " ['Xenova/sentence-camembert-large', '339'],\n", 374 | " ['Xenova/herbert-base-cased', '125'],\n", 375 | " ['Xenova/herbert-large-cased', '357'],\n", 376 | " ['Xenova/bge-large-zh-v1.5', '327'],\n", 377 | " ['Xenova/bge-base-zh-v1.5', '103'],\n", 378 | " ['Xenova/bge-small-zh-v1.5', '24'],\n", 379 | " ['leolee9086/text2vec-base-chinese', '103'],\n", 380 | " ['Xenova/long-t5-encodec-tglobal-base', '291']]" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 32, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/plain": [ 391 | "{'TaylorAI/gte-tiny': '22.9',\n", 392 | " 'Supabase/gte-small': '34',\n", 393 | " 'Xenova/all-MiniLM-L6-v2': '23',\n", 394 | " 'Xenova/bge-large-en-v1.5': '337',\n", 395 | " 'Supabase/bge-small-en': '34',\n", 396 | " 'Xenova/gte-small': '34',\n", 397 | " 'Xenova/all-mpnet-base-v2': '110',\n", 398 | " 'Xenova/paraphrase-mpnet-base-v2': '110',\n", 399 | " 'Xenova/all-MiniLM-L12-v2': '34',\n", 400 | " 'Xenova/multilingual-e5-small': '118',\n", 401 | " 'Xenova/gte-large': '337',\n", 402 | " 'Xenova/bge-base-en-v1.5': '110',\n", 403 | " 'Xenova/all-roberta-large-v1': '357',\n", 404 | " 'Xenova/distiluse-base-multilingual-cased-v2': '135',\n", 405 | " 'Xenova/paraphrase-multilingual-mpnet-base-v2': '279',\n", 406 | " 'Xenova/bge-large-zh': '327',\n", 407 | " 'Xenova/multilingual-e5-base': '279',\n", 408 | " 'Xenova/bge-small-en-v1.5': '34',\n", 409 | " 'Xenova/paraphrase-albert-small-v2': '39.7',\n", 410 | " 'Xenova/paraphrase-albert-base-v2': '40',\n", 411 | " 'Xenova/squeezebert-uncased': '51.2',\n", 412 | " 'Xenova/squeezebert-mnli': '51.3',\n", 413 | " 'Xenova/vit-base-patch16-224-in21k': '87.5',\n", 414 | " 'Xenova/all-distilroberta-v1': '82.1',\n", 415 | " 'Xenova/paraphrase-multilingual-MiniLM-L12-v2': '118',\n", 416 | " 'Xenova/paraphrase-MiniLM-L6-v2': '23',\n", 417 | " 'Xenova/bert-base-nli-mean-tokens': '110',\n", 418 | " 
'Xenova/distilbert-base-nli-mean-tokens': '66.9',\n", 419 | " 'Xenova/distilbert-base-nli-stsb-mean-tokens': '66.9',\n", 420 | " 'Xenova/distiluse-base-multilingual-cased-v1': '135',\n", 421 | " 'Xenova/msmarco-distilbert-base-v4': '66.9',\n", 422 | " 'Xenova/multi-qa-MiniLM-L6-cos-v1': '23',\n", 423 | " 'Xenova/multi-qa-distilbert-cos-v1': '66.9',\n", 424 | " 'Xenova/multi-qa-mpnet-base-cos-v1': '110',\n", 425 | " 'Xenova/multi-qa-mpnet-base-dot-v1': '110',\n", 426 | " 'Xenova/nli-mpnet-base-v2': '110',\n", 427 | " 'Xenova/paraphrase-MiniLM-L3-v2': '17.5',\n", 428 | " 'Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens': '279',\n", 429 | " 'Xenova/dino-vitb16': '87.5',\n", 430 | " 'Xenova/dino-vits8': '23.4',\n", 431 | " 'Xenova/dino-vitb8': '88.8',\n", 432 | " 'Xenova/dino-vits16': '22.7',\n", 433 | " 'Xenova/scibert_scivocab_uncased': '111',\n", 434 | " 'Xenova/spanbert-large-cased': '335',\n", 435 | " 'Xenova/spanbert-base-cased': '109',\n", 436 | " 'sdan/simple-embeddings': '23',\n", 437 | " 'Xenova/sentence_bert': '110',\n", 438 | " 'Xenova/e5-small-v2': '34',\n", 439 | " 'Xenova/SapBERT-from-PubMedBERT-fulltext': '110',\n", 440 | " 'Xenova/indobert-base-p1': '125',\n", 441 | " 'Xenova/UMLSBert_ENG': '110',\n", 442 | " 'Xenova/rubert-base-cased': '178',\n", 443 | " 'Xenova/kobert': '92.8',\n", 444 | " 'Xenova/e5-small': '34',\n", 445 | " 'Xenova/e5-large': '337',\n", 446 | " 'Xenova/e5-large-v2': '337',\n", 447 | " 'Xenova/e5-base': '110',\n", 448 | " 'Xenova/e5-base-v2': '110',\n", 449 | " 'Xenova/instructor-base': '110',\n", 450 | " 'Xenova/instructor-large': '337',\n", 451 | " 'Xenova/sentence-t5-large': '337',\n", 452 | " 'Xenova/multilingual-e5-large': '562',\n", 453 | " 'Xenova/mms-300m': '318',\n", 454 | " 'Xenova/mms-1b': '969',\n", 455 | " 'Supabase/e5-small-v2': '34',\n", 456 | " 'Supabase/all-MiniLM-L6-v2': '23',\n", 457 | " 'Xenova/gte-base': '110',\n", 458 | " 'Xenova/bge-small-en': '34',\n", 459 | " 'Xenova/bge-base-en': '110',\n", 460 | " 
'Xenova/bge-large-en': '337',\n", 461 | " 'ggrn/bge-small-en': '34',\n", 462 | " 'Xenova/bge-base-zh': '103',\n", 463 | " 'Xenova/bge-large-zh-noinstruct': '327',\n", 464 | " 'Xenova/bge-small-zh': '24',\n", 465 | " 'Xenova/ClinicalBERT': '229',\n", 466 | " 'Xenova/LaBSE': '472',\n", 467 | " 'Xenova/wavlm-base': '95.8',\n", 468 | " 'Xenova/wavlm-base-plus': '95.8',\n", 469 | " 'Xenova/wavlm-large': '319',\n", 470 | " 'Xenova/sentence-camembert-large': '339',\n", 471 | " 'Xenova/herbert-base-cased': '125',\n", 472 | " 'Xenova/herbert-large-cased': '357',\n", 473 | " 'Xenova/bge-large-zh-v1.5': '327',\n", 474 | " 'Xenova/bge-base-zh-v1.5': '103',\n", 475 | " 'Xenova/bge-small-zh-v1.5': '24',\n", 476 | " 'leolee9086/text2vec-base-chinese': '103',\n", 477 | " 'Xenova/long-t5-encodec-tglobal-base': '291'}" 478 | ] 479 | }, 480 | "execution_count": 32, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "# Create a dictionary to easily look up sizes by id\n", 487 | "size_dict = dict(sizes)\n", 488 | "size_dict" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 36, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "name": "stdout", 498 | "output_type": "stream", 499 | "text": [ 500 | "JSON updated and saved to 'your_output.json'\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "import json\n", 506 | "\n", 507 | "this_file = \"feature-extraction_trending\"\n", 508 | "\n", 509 | "with open(f\"{this_file}.json\", 'r') as json_file:\n", 510 | " data = json.load(json_file)\n", 511 | "\n", 512 | "# Iterate over the \"models\" in your JSON\n", 513 | "for model in data[\"models\"]:\n", 514 | " model_id = model[\"id\"]\n", 515 | " if model_id in size_dict:\n", 516 | " model[\"model_size\"] = size_dict[model_id]\n", 517 | "\n", 518 | "# Save the updated JSON to a file\n", 519 | "with open(f'{this_file}_sizes.json', 'w') as file:\n", 520 | " json.dump(data, file, indent=4)\n" 521 | ] 522 | } 523 | ], 
524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "py3.11", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.11.0" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 2 545 | } 546 | -------------------------------------------------------------------------------- /src/models/text2text_downloads.json: -------------------------------------------------------------------------------- 1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0
c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w
=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova"
,"type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false
,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base
","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0} 2 | -------------------------------------------------------------------------------- /src/models/text2text_likes.json: -------------------------------------------------------------------------------- 1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"mod
el","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","autho
rData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/htt
ps://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b
7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0} -------------------------------------------------------------------------------- 
/src/models/text2text_modified.json: -------------------------------------------------------------------------------- 1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":
"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,
"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":fa
lse,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","li
kes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0} -------------------------------------------------------------------------------- /src/models/text2text_trending.json: -------------------------------------------------------------------------------- 1 | 
{"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","li
kes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","pri
vate":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova"
,"authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeil
juispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0} 2 | -------------------------------------------------------------------------------- /webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const HtmlWebpackPlugin = require('html-webpack-plugin'); 3 | const MiniCssExtractPlugin = require('mini-css-extract-plugin'); // FOUC-correction 4 | const FaviconsWebpackPlugin = require('favicons-webpack-plugin'); 5 | const CopyWebpackPlugin = require('copy-webpack-plugin'); 6 | 
7 | module.exports = { 8 | entry: './src/js/index.js', 9 | mode: 'development', 10 | output: { 11 | filename: 'bundle.js', 12 | path: path.resolve(__dirname, 'dist'), 13 | clean: true 14 | }, 15 | module: { 16 | rules: [ 17 | { 18 | test: /\.css$/, 19 | use: [MiniCssExtractPlugin.loader, 'css-loader'], 20 | 21 | }, 22 | { 23 | test: /\.svg$/, 24 | type: 'asset/resource', 25 | generator: { 26 | filename: '[name][ext]' 27 | } 28 | }, 29 | ], 30 | }, 31 | plugins: [ 32 | new HtmlWebpackPlugin({ 33 | template: './index.html', 34 | }), 35 | new MiniCssExtractPlugin(), 36 | new FaviconsWebpackPlugin(), 37 | new CopyWebpackPlugin({ 38 | patterns: [ 39 | { 40 | from: 'src/models/**/*_sizes.json', // Source directory of JSON files 41 | to: 'models/[name][ext]' 42 | }, 43 | ], 44 | }), 45 | ], 46 | }; 47 | --------------------------------------------------------------------------------