├── .eslintrc.js
├── .github
└── workflows
│ └── deploy.yml
├── .gitignore
├── LICENSE
├── README.md
├── SemanticFinder.gif
├── SemanticFinder_Chrome_Extension.gif
├── SemanticFinder_Chrome_Extension_en.zip
├── SemanticFinder_gource.png
├── extension
├── .gitignore
├── README.md
├── package-lock.json
├── package.json
├── public
│ ├── icons
│ │ ├── logo128.png
│ │ └── logo48.png
│ └── manifest.json
├── src
│ ├── content
│ │ ├── content.css
│ │ └── content.js
│ ├── options
│ │ ├── options.css
│ │ ├── options.html
│ │ └── options.js
│ ├── popup
│ │ ├── AnimatedInput.vue
│ │ ├── popup.css
│ │ ├── popup.html
│ │ ├── popup.js
│ │ ├── popup.vue
│ │ └── result.vue
│ ├── serviceworkers
│ │ ├── background.js
│ │ ├── pdf.js
│ │ ├── pdf.sandbox.js
│ │ ├── pdf.worker.entry.js
│ │ ├── pdf.worker.js
│ │ └── semantic.js
│ └── utils
│ │ ├── cache.js
│ │ └── utils.js
└── webpack.config.js
├── index.html
├── jsconfig.json
├── logo.png
├── misc
├── Generate_large_textfile_from_books.ipynb
└── README.md
├── package-lock.json
├── package.json
├── src
├── css
│ └── styles.css
├── js
│ ├── SemanticFinder.svg
│ ├── index.js
│ ├── semantic.js
│ ├── utils.js
│ └── worker.js
└── models
│ ├── feature-extraction_downloads.json
│ ├── feature-extraction_downloads_sizes.json
│ ├── feature-extraction_likes.json
│ ├── feature-extraction_likes_sizes.json
│ ├── feature-extraction_modified.json
│ ├── feature-extraction_modified_sizes.json
│ ├── feature-extraction_trending.json
│ ├── feature-extraction_trending_sizes.json
│ ├── model_miner.js
│ ├── model_miner_simple.js
│ ├── model_size_miner.ipynb
│ ├── text2text_downloads.json
│ ├── text2text_likes.json
│ ├── text2text_modified.json
│ └── text2text_trending.json
└── webpack.config.js
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | env: {
3 | browser: true,
4 | es2021: true
5 | },
6 | extends: 'standard',
7 | overrides: [
8 | {
9 | env: {
10 | node: true
11 | },
12 | files: [
13 | '.eslintrc.{js,cjs}'
14 | ],
15 | parserOptions: {
16 | sourceType: 'script'
17 | }
18 | }
19 | ],
20 | parserOptions: {
21 | ecmaVersion: 'latest',
22 | sourceType: 'module'
23 | },
24 | rules: {
25 | indent: ['error', 4],
26 | 'space-before-function-paren': ['error', 'never'],
27 | semi: ['error', 'always']
28 | }
29 | };
30 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy to GitHub Pages
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - webgpu
8 |
9 | jobs:
10 | deploy:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | # Checkout and Deploy Main Branch
15 | - name: Checkout main branch
16 | uses: actions/checkout@v3
17 | with:
18 | ref: main
19 |
20 | - name: Set up Node.js for main
21 | uses: actions/setup-node@v3
22 | with:
23 | node-version: '22'
24 | cache: 'npm'
25 |
26 | - name: Install dependencies for main
27 | run: npm install
28 |
29 | # Build and deploy main branch
30 | - name: Build and deploy main
31 | run: |
32 | npm run build
33 | mkdir -p main_build
34 | mv dist/* main_build/
35 | echo "Deploying main branch..."
36 |
37 | - name: Deploy main to GitHub Pages
38 | uses: peaceiris/actions-gh-pages@v3
39 | with:
40 | github_token: ${{ secrets.GITHUB_TOKEN }}
41 | publish_dir: ./main_build # Deploy from your custom directory
42 |
43 | # Checkout and Deploy webgpu Branch
44 | - name: Checkout webgpu branch
45 | uses: actions/checkout@v3
46 | with:
47 | ref: webgpu
48 |
49 | - name: Set up Node.js for webgpu
50 | uses: actions/setup-node@v3
51 | with:
52 | node-version: '22'
53 | cache: 'npm'
54 |
55 | - name: Install dependencies for webgpu
56 | run: npm install
57 |
58 | # Build and deploy webgpu branch
59 | - name: Build and deploy webgpu
60 | run: |
61 | npm run build
62 | mkdir -p webgpu_build
63 | mv dist/* webgpu_build/
64 | echo "Deploying webgpu branch..."
65 |
66 | - name: Deploy webgpu to GitHub Pages subdirectory
67 | uses: peaceiris/actions-gh-pages@v3
68 | with:
69 | github_token: ${{ secrets.GITHUB_TOKEN }}
70 | publish_dir: ./webgpu_build
71 | destination_dir: webgpu
72 | keep_files: true
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .vscode/
3 | .idea/
4 | .DS_Store
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 do-me
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
Frontend-only live semantic search and chat-with-your-documents built on transformers.js. Supports Wasm and WebGPU!
6 |
7 |
8 | 
9 |
10 | ## [Try the web app](https://do-me.github.io/SemanticFinder/), [install the Chrome extension](#browser-extension) or read the [introduction blog post](https://geo.rocks/post/semanticfinder-semantic-search-frontend-only/).
11 |
12 | ## 🔥 For best performance try the [WebGPU Version here!](https://do-me.github.io/SemanticFinder/webgpu/) 🔥
13 |
14 | Semantic search right in your browser! Calculates the embeddings and cosine similarity client-side without server-side inferencing, using [transformers.js](https://xenova.github.io/transformers.js/) and latest SOTA embedding models from Huggingface.
15 |
16 | ## Intro Video
17 | [Watch the intro video](https://www.youtube.com/watch?v=FZsWH1J4MXo "Get started with semantic search in the browser")
18 |
19 | ## Models
20 | All transformers.js-compatible feature-extraction models are supported. Here is a sortable list you can go through: [daily updated list](https://do-me.github.io/trending-huggingface-models/). Download the compatible models table as xlsx, csv, json, parquet, or html here: https://github.com/do-me/trending-huggingface-models/.
21 | Note that the wasm backend in transformers.js supports all mentioned models. If you want the best performance, make sure to use a WebGPU-compatible model.
22 |
23 | ## Catalogue
24 | You can use super fast pre-indexed examples for *really* large books like the Bible or Les Misérables with hundreds of pages and search the content in less than 2 seconds 🚀. Try one of these and convince yourself:
25 |
26 | | filesize | textTitle | textAuthor | textYear | textLanguage | URL | modelName | quantized | splitParam | splitType | characters | chunks | wordsToAvoidAll | wordsToCheckAll | wordsToAvoidAny | wordsToCheckAny | exportDecimals | lines | textNotes | textSourceURL | filename |
27 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
28 | | 4.78 | Das Kapital | Karl Marx | 1867 | de | https://do-me.github.io/SemanticFinder/?hf=Das_Kapital_c1a84fba | Xenova/multilingual-e5-small | True | 80 | Words | 2003807 | 3164 | | | | | 5 | 28673 | | https://ia601605.us.archive.org/13/items/KarlMarxDasKapitalpdf/KAPITAL1.pdf | Das_Kapital_c1a84fba.json.gz |
29 | | 2.58 | Divina Commedia | Dante | 1321 | it | https://do-me.github.io/SemanticFinder/?hf=Divina_Commedia_d5a0fa67 | Xenova/multilingual-e5-base | True | 50 | Words | 383782 | 1179 | | | | | 5 | 6225 | | http://www.letteratura-italiana.com/pdf/divina%20commedia/08%20Inferno%20in%20versione%20italiana.pdf | Divina_Commedia_d5a0fa67.json.gz |
30 | | 11.92 | Don Quijote | Miguel de Cervantes | 1605 | es | https://do-me.github.io/SemanticFinder/?hf=Don_Quijote_14a0b44 | Xenova/multilingual-e5-base | True | 25 | Words | 1047150 | 7186 | | | | | 4 | 12005 | | https://parnaseo.uv.es/lemir/revista/revista19/textos/quijote_1.pdf | Don_Quijote_14a0b44.json.gz |
31 | | 0.06 | Hansel and Gretel | Brothers Grimm | 1812 | en | https://do-me.github.io/SemanticFinder/?hf=Hansel_and_Gretel_4de079eb | TaylorAI/gte-tiny | True | 100 | Chars | 5304 | 55 | | | | | 5 | 9 | | https://www.grimmstories.com/en/grimm_fairy-tales/hansel_and_gretel | Hansel_and_Gretel_4de079eb.json.gz |
32 | | 1.74 | IPCC Report 2023 | IPCC | 2023 | en | https://do-me.github.io/SemanticFinder/?hf=IPCC_Report_2023_2b260928 | Supabase/bge-small-en | True | 200 | Chars | 307811 | 1566 | | | | | 5 | 3230 | state of knowledge of climate change | https://report.ipcc.ch/ar6syr/pdf/IPCC_AR6_SYR_LongerReport.pdf | IPCC_Report_2023_2b260928.json.gz |
33 | | 25.56 | King James Bible | | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_24f6dc4c | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 | | | | | 5 | 80496 | | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_24f6dc4c.json.gz |
34 | | 11.45 | King James Bible | | None | en | https://do-me.github.io/SemanticFinder/?hf=King_James_Bible_6434a78d | TaylorAI/gte-tiny | True | 200 | Chars | 4556163 | 23056 | | | | | 2 | 80496 | | https://www.holybooks.com/wp-content/uploads/2010/05/The-Holy-Bible-King-James-Version.pdf | King_James_Bible_6434a78d.json.gz |
35 | | 39.32 | Les Misérables | Victor Hugo | 1862 | fr | https://do-me.github.io/SemanticFinder/?hf=Les_Misérables_2239df51 | Xenova/multilingual-e5-base | True | 25 | Words | 3236941 | 19463 | | | | | 5 | 74491 | All five acts included | https://beq.ebooksgratuits.com/vents/Hugo-miserables-1.pdf | Les_Misérables_2239df51.json.gz |
36 | | 0.46 | REGULATION (EU) 2023/138 | European Commission | 2022 | en | https://do-me.github.io/SemanticFinder/?hf=REGULATION_(EU)_2023_138_c00e7ff6 | Supabase/bge-small-en | True | 25 | Words | 76809 | 424 | | | | | 5 | 1323 | | https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R0138&qid=1704492501351 | REGULATION_(EU)_2023_138_c00e7ff6.json.gz |
37 | | 0.07 | Universal Declaration of Human Rights | United Nations | 1948 | en | https://do-me.github.io/SemanticFinder/?hf=Universal_Declaration_of_Human_Rights_0a7da79a | TaylorAI/gte-tiny | True | \nArticle | Regex | 8623 | 63 | | | | | 5 | 109 | 30 articles | https://www.un.org/en/about-us/universal-declaration-of-human-rights | Universal_Declaration_of_Human_Rights_0a7da79a.json.gz |
38 |
39 | ## Import & Export
40 |
41 | You can create indices yourself with one or two clicks and save them. If it's something private, keep it to yourself; if it's a classic book or something you think others might be interested in, consider a PR on the [Huggingface Repo](https://huggingface.co/datasets/do-me/SemanticFinder) or get in touch with us. Book requests are happily met if you provide us with a good source link from which we can copy & paste. Simply open an issue here with [Book Request] or similar, or contact us.
42 |
43 | It goes without saying that no discriminating content will be tolerated.
44 |
45 | ## Installation
46 |
47 | Clone the repository and install dependencies with
48 |
49 | `npm install`
50 |
51 | Then run with
52 |
53 | `npm run start`
54 |
55 | If you want to build instead, run
56 |
57 | `npm run build`
58 |
59 | Afterwards, you'll find the `index.html`, `main.css` and `bundle.js` in `dist`.
60 |
61 | ## Browser extension
62 | Download the Chrome extension from [Chrome webstore](https://chrome.google.com/webstore/detail/semanticfinder/ddmgffoffelnhnonpoiblaoboaeieejl) and pin it. Right click the extension icon for `options`:
63 | - choose distiluse-base-multilingual-cased-v2 for multilingual usage (default is English-only)
64 | - set a higher number for min characters to split by for larger texts
65 |
66 | 
67 |
68 | ### Local build
69 | If you want to build the browser extension locally, clone the repo, cd into the `extension` directory, then run:
70 | - `npm install`
71 | - `npm run build` for a static build or
72 | - `npm run dev` for the auto-refreshing development version
73 | - go to Chrome extension settings with `chrome://extensions`
74 | - select `Load Unpacked` and choose the `build` folder
75 | - pin the extension in Chrome so you can access it easily. If it doesn't work for you, feel free to open an issue.
76 |
77 | ## Speed
78 | Tested on the entire book of [Moby Dick](https://archive.org/stream/mobydickorwhale01melvuoft/mobydickorwhale01melvuoft_djvu.txt) with 660.000 characters ~13.000 lines or ~111.000 words.
79 | Initial embedding generation takes **1-2 mins** on my old i7-8550U CPU with 1000 characters as segment size. Following queries take only ~2 seconds!
80 | If you want to query larger text instead or keep an entire library of books indexed use a [proper vector database instead](https://geo.rocks/post/qdrant-transformers-js-semantic-search/).
81 |
82 | ## Features
83 |
84 | You can customize everything!
85 |
86 | - Input text & search term(s)
87 | - Hybrid search (semantic search & full-text search)
88 | - Segment length (the bigger the faster, the smaller the slower)
89 | - Highlight colors (currently hard-coded)
90 | - Number of highlights are based on the threshold value. The lower, the more results.
91 | - Live updates
92 | - Easy integration of other ML-models thanks to [transformers.js](https://xenova.github.io/transformers.js/)
93 | - Data privacy-friendly - your input text data is not sent to a server, it stays in your browser!
94 |
95 | ## Usage ideas
96 |
97 | - Basic search through anything, like your personal notes (my initial motivation by the way, a huge notes.txt file I couldn't handle anymore)
98 | - Remember poem analysis in school? Often you look for possible leitmotifs or recurring categories like **food** in Hänsel & Gretel
99 |
100 | ## Future ideas
101 |
102 | - One could package everything nicely and use it e.g. instead of JavaScript search engines such as [Lunr.js](https://lunrjs.com/) (also being used in [mkdocs-material](https://squidfunk.github.io/mkdocs-material/setup/setting-up-site-search/)).
103 | - Integration in mkdocs (mkdocs-material) **experimental**:
104 | - when building the docs, slice all `.md`-files into chunks (length defined in `mkdocs.yaml`). The chunks should be fairly large (>800 characters) for lower response time. It's also possible to build n indices, with first a coarse index (maybe per document/`.md`-file if the used model supports the length) and then a refined one for the document chunks
105 | - build the index by calculating the embeddings for all docs/chunks
106 | - when a user queries the docs, a switch can toggle (fast) full-text standard search (atm with lunr.js) or experimental semantic search
107 | - if the latter is being toggled, the client loads the model (all-MiniLM-L6-v2 has ~30mb)
108 | - like in SemanticFinder, the embedding is created client-side and the cosine similarity calculated
109 | - the high-scored results are returned just like with lunr.js so the user shouldn't even notice a difference in the UI
110 | - Electron- or browser-based apps could be augmented with semantic search, e.g. VS Code, Atom or mobile apps.
111 | - Integration in personal wikis such as Obsidian, tiddlywiki etc. would save you the tedious tagging/keywords/categorisation work or could at least improve your structure further
112 | - Search your own browser history (thanks [@Snapdeus](https://twitter.com/snapdeus/status/1646233904691413006))
113 | - Integration in chat apps
114 | - Allow PDF-uploads (conversion from PDF to text)
115 | - Integrate with Speech-to-Text whisper model from transformers.js to allow audio uploads.
116 | - Thanks to [CodeMirror](https://codemirror.net/) one could even use syntax highlighting for programming languages such as Python, JavaScript etc.
117 |
118 | ## Logic
119 |
120 | [Transformers.js](https://xenova.github.io/transformers.js/) is doing all the heavy lifting of tokenizing the input and running the model. Without it, this demo would have been impossible.
121 |
122 | **Input**
123 | - Text, as much as your browser can handle! The demo uses a part of "Hänsel & Gretel" but it can handle hundreds of PDF pages
124 | - A search term or phrase
125 | - The number of characters the text should be segmented in
126 | - A similarity threshold value. Results with lower similarity score won't be displayed.
127 |
128 | **Output**
129 | - Three highlighted string segments, the darker the higher the similarity score.
130 |
131 | **Pipeline**
132 |
133 | 0. All scripts are loaded. The model is loaded once from HuggingFace and is cached in the browser afterwards.
134 | 1. A user inputs some text and a search term or phrase.
135 | 2. Depending on the approximate length to consider (unit=characters), the text is split into **segments**. Words themselves are never split, which is why the length is only approximate.
136 | 3. The search term embedding is created.
137 | 4. For each **segment** of the text, the embedding is created.
138 | 5. Meanwhile, the cosine similarity is calculated between every **segment** embedding and the search term embedding. It's written to a dictionary with the segment as key and the score as value.
139 | 6. For every iteration, the progress bar and the highlighted sections are updated in real-time depending on the highest scores in the array.
140 | 7. The embeddings are cached in the dictionary so that subsequent queries are quite fast. The calculation of the cosine similarity is fairly speedy in comparison to the embedding generation.
141 | 8. **Only if the user changes the segment length**, the embeddings must be recalculated.
142 |
143 | ## Collaboration
144 | PRs welcome!
145 |
146 | ## To Dos (no prioritization)
147 | - [x] similarity score cutoff/threshold
148 | - [x] add option for more highlights (e.g. all above certain score)
149 | - [x] add stop button
150 | - [x] MaterialUI for input fields or proper labels
151 | - [x] create a demo without CDNs
152 | - [x] separate one html properly in html, js, css
153 | - [x] add npm installation
154 | - [x] option for loading embeddings from file or generally allow sharing embeddings in some way
155 | - [x] simplify chunking function so the original text can be loaded without issues
156 | - [ ] improve the color range
157 | - [ ] rewrite the cosine similarity function in Rust, port to WASM and load as a module for possible speedup (experimental)
158 | - [ ] UI overhaul
159 | - [ ] polish code
160 | - [x] - jQuery/vanilla JS mixed
161 | - [ ] - clean up functions
162 | - [ ] - add more comments
163 | - [ ] add possible use cases
164 | - [ ] package as a standalone application (maybe with custom model choice; to be downloaded once from HF hub, then saved locally)
165 | - [ ] possible integration as example in [transformers.js homepage](https://github.com/xenova/transformers.js/issues/84)
166 |
167 | ## Star History
168 |
169 | [Star History Chart](https://star-history.com/#do-me/SemanticFinder&Timeline)
170 |
171 | ## Gource Map
172 |
173 | 
174 |
175 | Gource image created with:
176 |
177 | ```bash
178 | gource -1280x720 --title "SemanticFinder" --seconds-per-day 0.03 --auto-skip-seconds 0.03 --bloom-intensity 0.5 --max-user-speed 500 --highlight-dirs --multi-sampling --highlight-colour 00FF00
179 | ```
180 |
--------------------------------------------------------------------------------
/SemanticFinder.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder.gif
--------------------------------------------------------------------------------
/SemanticFinder_Chrome_Extension.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension.gif
--------------------------------------------------------------------------------
/SemanticFinder_Chrome_Extension_en.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_Chrome_Extension_en.zip
--------------------------------------------------------------------------------
/SemanticFinder_gource.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/SemanticFinder_gource.png
--------------------------------------------------------------------------------
/extension/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/extension/README.md:
--------------------------------------------------------------------------------
1 |
2 | # SemanticFinder Browser Extension
3 |
4 |
5 | ## Getting Started
6 | 1. Install the necessary dependencies:
7 | ```bash
8 | npm install
9 | ```
10 |
11 | 2. Build the project:
12 | ```bash
13 | npm run build
14 | ```
15 | or
16 | ```bash
17 | npm run dev
18 | ```
19 | for auto-reload.
20 |
21 | 3. Add the extension to your browser. To do this, go to `chrome://extensions/`, enable developer mode (top right), and click "Load unpacked". Select the `build` directory from the dialog which appears and click "Select Folder".
22 |
23 |
24 | ----
25 |
26 | A big thank you to Xenova, whose work on 🤗 Transformers.js makes this entire project possible.
27 |
--------------------------------------------------------------------------------
/extension/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "SemanticFinder",
3 | "version": "0.0.1",
4 | "description": "SemanticFinder | In-browser Semantic Search via Transformers.js",
5 | "scripts": {
6 | "build": "webpack",
7 | "dev": "webpack --watch"
8 | },
9 | "type": "module",
10 | "author": "Varun Neal Srivastava",
11 | "contributors": [
12 | "Dominik Weckmüller",
13 | "Xenova"
14 | ],
15 | "license": "MIT",
16 | "devDependencies": {
17 | "copy-webpack-plugin": "^11.0.0",
18 | "css-loader": "^6.8.1",
19 | "html-webpack-plugin": "^5.5.1",
20 | "pdfjs-dist": "^3.9.179",
21 | "style-loader": "^3.3.3",
22 | "vue-loader": "^17.2.2",
23 | "vue-template-compiler": "^2.7.14",
24 | "vueify": "^9.4.1",
25 | "webpack": "^5.79.0",
26 | "webpack-cli": "^5.1.4"
27 | },
28 | "dependencies": {
29 | "@mozilla/readability": "^0.4.4",
30 | "@vue/compiler-sfc": "^3.3.4",
31 | "@xenova/transformers": "^2.5.0",
32 | "mark.js": "^8.11.1",
33 | "node-polyfill-webpack-plugin": "^2.0.1",
34 | "util": "^0.12.5",
35 | "vue": "^3.3.4"
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/extension/public/icons/logo128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo128.png
--------------------------------------------------------------------------------
/extension/public/icons/logo48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/extension/public/icons/logo48.png
--------------------------------------------------------------------------------
/extension/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "manifest_version": 3,
3 | "name": "SemanticFinder",
4 | "description": "SemanticFinder | In-browser Semantic Search via Transformers.js",
5 | "version": "0.0.1",
6 | "host_permissions": ["http://*/*", "https://*/*"],
7 | "permissions": [
8 | "scripting",
9 | "activeTab",
10 | "storage",
11 | "unlimitedStorage"
12 | ],
13 | "options_ui": {
14 | "page": "options.html",
15 | "open_in_tab": true
16 | },
17 | "background": {
18 | "service_worker": "background.js",
19 | "type": "module"
20 | },
21 | "content_scripts": [
22 | {
23 | "matches": [
24 | "<all_urls>"
25 | ],
26 | "js": [
27 | "content.js", "pdf.js", "pdf.worker.js"
28 | ],
29 | "css": [
30 | "content.css"
31 | ]
32 | }
33 | ],
34 | "minimum_chrome_version": "92",
35 | "action": {
36 | "default_icon": {
37 | "16": "icons/logo48.png",
38 | "24": "icons/logo48.png",
39 | "32": "icons/logo128.png"
40 | },
41 | "default_title": "SemanticFinder",
42 | "default_popup": "popup.html"
43 | },
44 | "content_security_policy": {
45 | "extension_pages": "script-src 'self' 'wasm-unsafe-eval'"
46 | },
47 | "icons": {
48 | "16": "icons/logo48.png",
49 | "48": "icons/logo48.png",
50 | "128": "icons/logo128.png"
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/extension/src/content/content.css:
--------------------------------------------------------------------------------
1 |
2 | .SemanticFinder-highlight {
3 | background-color: #ffff33 !important;
4 | color: black !important;
5 | }
6 |
--------------------------------------------------------------------------------
/extension/src/content/content.js:
--------------------------------------------------------------------------------
1 | // content.js
2 | import {prettyLog, splitReadableContent} from '../utils/utils.js';
3 | import {Readability} from '@mozilla/readability';
4 | import Mark from 'mark.js';
5 | import {getDocument, GlobalWorkerOptions} from 'pdfjs-dist';
6 |
7 |
/**
 * Loads the PDF at `url` through pdf.js and returns its text as one string.
 *
 * The text items of each page are joined with a single space, and the pages
 * themselves are joined with a single space as well.
 *
 * @param {string} url - Location of the PDF document to read.
 * @returns {Promise<string>} The concatenated text of all pages.
 */
async function fetchAndExtractPDFText(url) {
    // Tell pdf.js where its worker script is before opening a document.
    GlobalWorkerOptions.workerSrc = chrome.runtime.getURL('../serviceworkers/pdf.worker.js');

    const pdfDocument = await getDocument(url).promise;
    const pageTexts = [];

    // Pages are requested by 1-based number and read strictly one after another.
    for (let pageNumber = 1, pageCount = pdfDocument.numPages; pageNumber <= pageCount; pageNumber += 1) {
        const page = await pdfDocument.getPage(pageNumber);
        const { items } = await page.getTextContent();
        pageTexts.push(items.map(({ str }) => str).join(' '));
    }

    return pageTexts.join(' ');
}
26 |
/**
 * Promise wrapper around `chrome.storage.sync.get` for a single key.
 *
 * @param {string} key - Storage key to read.
 * @param {*} defaultValue - Value to resolve with when the stored value is
 *     missing or falsy (an empty string or 0 also falls back, as before).
 * @returns {Promise<*>} Resolves with the stored value or `defaultValue`.
 *     Rejects with an `Error` carrying the message of
 *     `chrome.runtime.lastError` when the read fails.
 */
function getValueFromStorage(key, defaultValue) {
    return new Promise((resolve, reject) => {
        chrome.storage.sync.get(key, (result) => {
            const { lastError } = chrome.runtime;
            if (lastError) {
                // lastError is an object; passing it straight to Error() would
                // produce the useless message "[object Object]".
                reject(new Error(lastError.message));
                return;
            }
            resolve(result[key] || defaultValue);
        });
    });
}
38 |
/**
 * Reads the `num_chars` setting from synced storage.
 *
 * @returns {Promise<*>} The stored `num_chars` value, or the default of 50
 *     when nothing is stored or the storage read fails. A failed read is
 *     logged and answered with the default rather than `null`, so callers
 *     always receive a usable chunk size.
 */
async function fetchNumChars() {
    const defaultNumChars = 50; // You can set this to your desired default value
    try {
        return await getValueFromStorage('num_chars', defaultNumChars);
    } catch (error) {
        console.error('Error fetching num_chars:', error);
        return defaultNumChars;
    }
}
49 |
/**
 * Handles messages sent to the content script.
 *
 * - `getText`: extracts the page text (via pdf.js when `request.contentType`
 *   is "application/pdf", otherwise via Readability on the document and its
 *   accessible iframes), splits it with `splitReadableContent`, and sends the
 *   result back as a `tabUpdated` message.
 * - `highlightAndScroll`: highlights `request.text` on the page and reports
 *   an `error` message when nothing could be highlighted.
 *
 * Any exception is logged and forwarded as an `error` message.
 */
chrome.runtime.onMessage.addListener(async function(request, sender) {
    try {
        const currentURL = window.location.href;

        if (request.type === "getText") {
            const numChars = await fetchNumChars();
            let texts = [];

            if (request.contentType === "application/pdf") {
                const pdfText = await fetchAndExtractPDFText(currentURL);
                texts = splitReadableContent(pdfText, numChars);
            } else {
                let concatenatedContent = "";

                const iframes = document.querySelectorAll('iframe');
                console.dir(iframes);

                iframes.forEach(function(iframe) {
                    try {
                        const iframeDocument = iframe.contentDocument;
                        if (!iframeDocument) {
                            return;
                        }

                        // parse() yields null when Readability finds no article;
                        // such iframes simply contribute no text.
                        const iframeArticle = new Readability(iframeDocument.cloneNode(true)).parse();
                        if (iframeArticle) {
                            prettyLog("Iframe text content:", iframeArticle.textContent, "orange");
                            concatenatedContent += iframeArticle.textContent;
                        }
                    } catch (error) {
                        prettyLog("Skipped an iframe due to permissions issue:", error, "red");
                    }
                });

                // Readability mutates the document it parses, so work on a clone.
                const documentClone = document.cloneNode(true);
                const mainArticle = new Readability(documentClone).parse();
                if (mainArticle) {
                    // A null result must not discard the iframe text gathered above.
                    concatenatedContent += mainArticle.textContent;
                }

                texts = splitReadableContent(concatenatedContent, numChars);
            }

            chrome.runtime.sendMessage({type: "tabUpdated", text: texts, currentURL});
        } else if (request.type === 'highlightAndScroll') {
            if (!highlightAndScrollToText(request.text)) {
                chrome.runtime.sendMessage({type: "error", reason: "Cannot find and highlight selection."});
            }
        }
    } catch (error) {
        // Thrown values are not guaranteed to be Error instances.
        const message = error instanceof Error ? error.message : String(error);
        prettyLog("ERROR", message, "red", "red");
        if (message.includes('net::ERR_BLOCKED_BY_CLIENT')) {
            chrome.runtime.sendMessage({type: "error", reason: "ERR_BLOCKED_BY_CLIENT"});
        } else {
            chrome.runtime.sendMessage({type: "error", reason: message});
        }
    }
});
106 |
107 |
// Text of the most recent highlight request; undefined until the first one.
let currText;
// mark.js instance bound to the page's <body>, shared by every mark/unmark call.
const instance = new Mark(document.querySelector("body"));
110 |
/**
 * Highlights `text` on the page with mark.js and scrolls the first match
 * into view.
 *
 * If the full text is not found, the search is retried with the longest
 * newline-separated segment of the text, up to `depth` attempts in total.
 *
 * @param {string} text - Text to highlight.
 * @param {number} [depth=3] - Maximum number of search attempts.
 * @returns {boolean} `true` when at least one node was marked, else `false`.
 */
function highlightAndScrollToText(text, depth= 3) {
    if (depth === 0) {
        return false;
    }

    // If there's a previous highlighted text, unmark it
    if (currText) {
        instance.unmark({"element": "span", "className": "SemanticFinder-highlight"});
    }

    currText = text;

    let textFound = false;

    instance.mark(text, {
        "element": "span",
        "separateWordSearch": false,
        "className": "SemanticFinder-highlight",
        "acrossElements": true,
        "wildcards": "enabled",
        "iframes": true,
        "each": function (node) {
            // `each` runs for every marked node; scroll only to the first one
            // so later nodes do not pull the viewport away from it.
            if (!textFound) {
                node.scrollIntoView({
                    behavior: "smooth",
                    block: "center"
                });
            }
            textFound = true;
        }
    });

    if (textFound) {
        return true;
    }

    // Fall back to the longest line of the text (first one wins on ties).
    const longestSegment = text
        .split('\n')
        .reduce((longest, segment) => (segment.length > longest.length ? segment : longest), '');

    // Nothing left to try, or the fallback is the very text that just failed.
    if (!longestSegment || longestSegment === text) {
        return false;
    }

    return highlightAndScrollToText(longestSegment, depth - 1);
}
153 |
154 |
155 |
--------------------------------------------------------------------------------
/extension/src/options/options.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: 'Helvetica', sans-serif;
3 | padding: 20px;
4 | background-color: #f5f5f5;
5 | }
6 |
7 | .container {
8 | background-color: #fff;
9 | padding: 20px;
10 | border-radius: 5px;
11 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
12 | width: 300px; /* Adjust as needed */
13 | margin: 0 auto;
14 | }
15 |
16 | label {
17 | display: block;
18 | margin-bottom: 10px;
19 | }
20 |
21 | select,
22 | input {
23 | width: 100%;
24 | padding: 8px;
25 | box-sizing: border-box;
26 | margin-bottom: 20px;
27 | border-radius: 5px;
28 | border: 1px solid #ccc;
29 | }
30 |
31 | .rectangular-button {
32 | display: block;
33 | background-color: #007bff;
34 | color: white;
35 | padding: 10px 20px;
36 | text-align: center;
37 | border: none;
38 | border-radius: 5px;
39 | cursor: pointer;
40 | margin-bottom: 10px;
41 | transition: background-color 0.3s;
42 | }
43 |
44 | .rectangular-button:hover {
45 | background-color: #0056b3;
46 | }
47 |
--------------------------------------------------------------------------------
/extension/src/options/options.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | SemanticFinder Settings
7 |
8 |
9 |
10 |
11 |
12 |
13 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/extension/src/options/options.js:
--------------------------------------------------------------------------------
1 |
// Once the options page has been parsed, fill the form from storage and
// attach the click handlers of the two buttons (when they are present).
document.addEventListener('DOMContentLoaded', () => {
    loadSettings();

    const clickHandlers = [
        ['saveButton', saveSettings],
        ['restoreButton', restoreDefaults],
    ];

    for (const [buttonId, handler] of clickHandlers) {
        const button = document.getElementById(buttonId);
        if (button) {
            button.addEventListener('click', handler);
        }
    }
});
15 |
/**
 * Writes the current values of the `modelSelector` and `minCharsInput`
 * fields to `chrome.storage.sync` under `model_name` and `num_chars`.
 *
 * The outcome is only reported once the storage callback runs: a failed write
 * (signalled through `chrome.runtime.lastError`) is logged and, when alerts
 * are enabled, shown to the user instead of the success message.
 *
 * @param {boolean|Event} [showAlert=true] - Whether to show an alert with the
 *   outcome. When this function is registered directly as a click listener it
 *   receives the click event here, which is truthy and therefore shows alerts.
 */
function saveSettings(showAlert = true) {
    const modelName = document.getElementById('modelSelector').value;
    const numChars = document.getElementById('minCharsInput').value;

    chrome.storage.sync.set({
        'model_name': modelName,
        'num_chars': numChars
    }, function() {
        // lastError is only populated while this callback is running.
        const failure = chrome.runtime.lastError;
        if (failure) {
            console.error('Could not save settings:', failure.message);
            if (showAlert) {
                alert('Settings could not be saved: ' + failure.message);
            }
            return;
        }
        if (showAlert) {
            alert('Settings saved.');
        }
    });
}
29 |
/**
 * Puts the default model name and the default minimum-characters value back
 * into the form, then saves them without showing the confirmation alert.
 */
function restoreDefaults() {
    const defaults = [
        ['modelSelector', 'Supabase/gte-small'], // Default model
        ['minCharsInput', 50], // Default number
    ];

    for (const [fieldId, defaultValue] of defaults) {
        document.getElementById(fieldId).value = defaultValue;
    }

    saveSettings(false);
}
36 |
37 |
/**
 * Reads `model_name` and `num_chars` from `chrome.storage.sync` and copies
 * every truthy stored value into its form field, skipping fields that are
 * not present in the document.
 */
function loadSettings() {
    const fieldIdByStorageKey = {
        model_name: 'modelSelector',
        num_chars: 'minCharsInput',
    };

    chrome.storage.sync.get(['model_name', 'num_chars'], (items) => {
        for (const [storageKey, fieldId] of Object.entries(fieldIdByStorageKey)) {
            const storedValue = items[storageKey];
            if (!storedValue) {
                continue;
            }
            const field = document.getElementById(fieldId);
            if (field) {
                field.value = storedValue;
            }
        }
    });
}
54 |
--------------------------------------------------------------------------------
/extension/src/popup/AnimatedInput.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
43 |
92 |
93 |
--------------------------------------------------------------------------------
/extension/src/popup/popup.css:
--------------------------------------------------------------------------------
1 |
2 | * {
3 | padding: 0;
4 | margin: 0;
5 | box-sizing: border-box;
6 | font-family: 'Roboto', sans-serif;
7 | }
8 |
9 | h1 {
10 | font-size: 40px;
11 | text-align: center;
12 | font-weight: 500;
13 | }
14 |
15 | h2 {
16 | font-size: 20px;
17 | text-align: center;
18 | font-weight: 400;
19 | margin-bottom: 16px;
20 | }
21 |
22 | .container {
23 | width: 450px;
24 | }
25 |
26 | html,
27 | body {
28 | min-width: 400px;
29 | min-height: 500px;
30 | }
31 |
32 | body {
33 | display: flex;
34 | justify-content: center;
35 | align-items: center;
36 | }
37 |
38 | #text {
39 | width: 100%;
40 | padding: 8px;
41 | font-size: 20px;
42 | margin-bottom: 8px;
43 | }
44 |
45 | #output {
46 | font-size: 20px;
47 | font-family: 'Roboto Mono', monospace;
48 | height: 100px;
49 | }
50 |
--------------------------------------------------------------------------------
/extension/src/popup/popup.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Popup
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/extension/src/popup/popup.js:
--------------------------------------------------------------------------------
1 | import { createApp } from 'vue';
2 | import Popup from './popup.vue';
3 |
4 |
5 | createApp(Popup).mount("#app");
6 |
--------------------------------------------------------------------------------
/extension/src/popup/popup.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
9 |
Loading model
10 |
11 |
12 |
13 |
14 | ☒
15 | ERROR: {{ error }}
16 |
17 |
18 |
19 |
20 |
21 |
22 |
29 |
30 |
31 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
134 |
135 |
247 |
--------------------------------------------------------------------------------
/extension/src/popup/result.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
{{ result }}
5 |
{{ "0." + Math.round(100 * score) }}
6 |
7 |
8 |
9 |
28 |
29 |
58 |
--------------------------------------------------------------------------------
/extension/src/serviceworkers/background.js:
--------------------------------------------------------------------------------
1 | // background.js - Handles requests from the UI, runs the model, then sends back a response
2 |
3 | import {prettyLog, getSiteID} from '../utils/utils.js';
4 | import {similarity, storeEmbeddings, loadEmbeddings} from "./semantic.js";
5 |
6 | ////////////////////// 1. Context Menus //////////////////////
7 | //
8 | // Add a listener to create the initial context menu items,
9 | // context menu items only need to be created at runtime.onInstalled
10 | // chrome.runtime.onInstalled.addListener(function () {
11 | // Register a context menu item that will only show up for selection text.
12 | // chrome.contextMenus.create({
13 | // id: 'classify-selection',
14 | // title: 'Classify "%s"',
15 | // contexts: ['selection'],
16 | // });
17 | // });
18 | //
19 | // Perform inference when the user clicks a context menu
20 | // chrome.contextMenus.onClicked.addListener(async (info, tab) => {
21 | // Ignore context menu clicks that are not for classifications (or when there is no input)
22 | // if (info.menuItemId !== 'classify-selection' || !info.selectionText) return;
23 | //
24 | // Perform classification on the selected text
25 | // let result = await classify(info.selectionText);
26 | //
27 | // Do something with the result
28 | // chrome.scripting.executeScript(
29 | //
30 | // {
31 | // target: { tabId: tab.id }, // Run in the tab that the user clicked in
32 | // args: [result], // The arguments to pass to the function
33 | // function: (result) => { // The function to run
34 | // // NOTE: This function is run in the context of the web page, meaning that `document` is available.
35 | // console.log('result', result)
36 | // console.log('document', document)
37 | // },
38 | // }
39 | // );
40 | // });
41 | //////////////////////////////////////////////////////////////
42 |
43 | ////////////////////// 2. Message Events /////////////////////
44 | //
45 | // Listen for messages from the UI, process it, and send the result back.
46 |
// TODO: body text is not persistent
// Text of the current page, replaced whenever a non-empty "tabUpdated" message arrives.
let bodyText = [];
// Most recent query text, taken from "inputText" messages.
let inputText = "";

// Id of the newest query run; it is incremented for every run and an older run
// stops as soon as its own id no longer equals this value.
let liveProcess = 0;
// Site identifier (getSiteID of the tab URL) passed to loadEmbeddings for the current page.
let currSite = "";
53 |
// Reacts to the two message types that can trigger a search:
//   - "tabUpdated": carries the page text (`request.text`) and `request.currentURL`
//   - "inputText":  carries the query (`request.text`)
// Any other message type is ignored. Once both a page text and a query are
// known, a new query run is started; bumping `liveProcess` makes every older
// run stop at its next check.
chrome.runtime.onMessage.addListener(async function (request, sender, sendResponse) {
    if (request.type === "tabUpdated") {
        // Tolerate messages without text instead of throwing on `.length`.
        if (request.text && request.text.length > 0) {
            // Resolve the site id first so a malformed URL cannot leave
            // bodyText and currSite describing two different pages.
            const siteId = getSiteID(request.currentURL);
            bodyText = request.text;
            currSite = siteId;
        }
    } else if (request.type === "inputText") {
        inputText = request.text;
    } else {
        return;
    }
    if (!bodyText || !inputText) {
        return;
    }

    liveProcess++;
    const processId = liveProcess;

    try {
        await processQuery(inputText, bodyText, processId);
    } catch (error) {
        // Nothing awaits the promise returned by this listener, so a failure
        // has to be reported here or it surfaces as an unhandled rejection.
        const reason = error && error.message ? error.message : error;
        prettyLog("Error", "process " + processId + " failed: " + reason, "red");
    }
});
72 |
73 |
/**
 * Sends a message to the rest of the extension without waiting for delivery.
 * A failed delivery (for example when no page is listening) is logged and
 * never propagates to the caller.
 * @param {object} message - The message to send.
 */
function notifyUi(message) {
    const reportFailure = (error) => {
        console.debug("Could not deliver message to the UI:", error);
    };
    try {
        const delivery = chrome.runtime.sendMessage(message);
        if (delivery && typeof delivery.catch === "function") {
            delivery.catch(reportFailure);
        }
    } catch (error) {
        reportFailure(error);
    }
}

/**
 * Scores every text of `bodyText` against `query` and streams the best
 * matches to the UI.
 *
 * For each text whose similarity exceeds 0.15 the running top-10 list (sorted
 * by descending similarity) is sent as a "results" message together with the
 * progress in percent. A final "results" message with progress 100 marks the
 * end, after which the embeddings are persisted via storeEmbeddings().
 * The run stops silently as soon as `processId` is no longer the live process.
 *
 * @param {string} query - The search text.
 * @param {Array<string>} bodyText - The page texts to compare against.
 * @param {number} processId - Id of this run, compared with `liveProcess`.
 * @returns {Promise<void>}
 */
async function processQuery(query, bodyText, processId) {
    if (bodyText.length === 0) {
        prettyLog("Error", "no content found. please reload this page if this is unexpected", "red");
        notifyUi({type: "error", reason: "No content detected. Reloading may help."});
        return; // Exit early if no bodyText
    }

    await loadEmbeddings(currSite);
    prettyLog("starting process " + processId, bodyText.length + " items, input: " + query, "orange");

    let results = [];
    const k = 10;

    let i = 0;
    for (let text of bodyText) {
        if (processId !== liveProcess) { return; }
        let sim = await similarity(query, text);

        if (sim > 0.15) {
            results.push({sim: sim, text: text});
            results.sort((a, b) => b.sim - a.sim);
            results.length = Math.min(results.length, k);

            // A newer run may have started while the similarity was computed.
            if (processId !== liveProcess) { return; }
            notifyUi({
                type: "results", progress: 100 * (i / bodyText.length),
                text: results
            });
        }
        i += 1;
    }
    notifyUi({type: "results", progress: 100});
    await storeEmbeddings();
}
108 |
109 | //////////////////////////////////////////////////////////////
110 |
111 |
--------------------------------------------------------------------------------
/extension/src/serviceworkers/pdf.worker.entry.js:
--------------------------------------------------------------------------------
1 | /* Copyright 2022 Mozilla Foundation
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License");
4 | * you may not use this file except in compliance with the License.
5 | * You may obtain a copy of the License at
6 | *
7 | * http://www.apache.org/licenses/LICENSE-2.0
8 | *
9 | * Unless required by applicable law or agreed to in writing, software
10 | * distributed under the License is distributed on an "AS IS" BASIS,
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | * See the License for the specific language governing permissions and
13 | * limitations under the License.
14 | */
15 |
16 | (typeof window !== "undefined"
17 | ? window
18 | : {}
19 | ).pdfjsWorker = require("./pdf.worker.js");
20 |
--------------------------------------------------------------------------------
/extension/src/serviceworkers/semantic.js:
--------------------------------------------------------------------------------
1 | // Define caching parameters
2 | import {CustomCache} from "../utils/cache.js";
3 | import {pipeline, env} from '@xenova/transformers';
4 | import {prettyLog} from "../utils/utils.js";
5 |
// Cache configuration for @xenova/transformers: the browser cache is switched
// off and the CustomCache from ../utils/cache.js is registered as the custom cache.
env.useBrowserCache = false;
env.useCustomCache = true;
env.customCache = new CustomCache('transformers-cache');
// NOTE(review): presumably prevents the library from looking for models on a
// local path — confirm against the Transformers.js `env` documentation.
env.allowLocalModels = false;


// Due to a bug in onnxruntime-web, we must disable multithreading for now.
// See https://github.com/microsoft/onnxruntime/issues/14445 for more information.
env.backends.onnx.wasm.numThreads = 1;

// these should go in EmbedPipeline prob
// In-memory cache mapping a text to its embedding data; filled by embed() and loadEmbeddings().
let embeddingsDict = {};
// Key under which embeddingsDict is stored in chrome.storage.local; set by loadEmbeddings().
let currID = "";
19 |
/**
 * Lazily created, shared feature-extraction pipeline.
 *
 * The model name defaults to 'Supabase/gte-small' and is replaced by the
 * `model_name` value from `chrome.storage.sync` (when one is stored) right
 * before the pipeline is built.
 */
class EmbedPipeline {
    static task = 'feature-extraction';

    static model = 'Supabase/gte-small';
    static instance = null;
    // Promise of the pipeline currently being built, so that concurrent
    // getInstance() calls share one build instead of starting their own.
    static loading = null;

    /**
     * Reads the stored model name.
     * @returns {Promise<string|undefined>} The `model_name` value, if any.
     * @throws {Error} (as a rejection) carrying the storage error message.
     */
    static async getModelFromStorage() {
        return new Promise((resolve, reject) => {
            chrome.storage.sync.get('model_name', function(result) {
                const failure = chrome.runtime.lastError;
                if (failure) {
                    // lastError is an object; use its message so the Error is readable.
                    reject(new Error(failure.message || String(failure)));
                } else {
                    resolve(result.model_name);
                }
            });
        });
    }

    /**
     * Replaces `model` with the stored model name when one exists. A storage
     * failure is logged and the current model name is kept.
     */
    static async updateModelName() {
        try {
            const storedModelName = await this.getModelFromStorage();
            if (storedModelName) {
                this.model = storedModelName;
            }
        } catch (error) {
            console.warn('Could not read the model name from storage, keeping ' + this.model + ':', error);
        }
    }

    /**
     * Sends a "download" message with the given data. Delivery failures (for
     * example when no page is listening) are logged and otherwise ignored so
     * they cannot abort model loading.
     * @param {object} data - Payload of the "download" message.
     */
    static async notifyDownload(data) {
        try {
            await chrome.runtime.sendMessage({type: "download", data: data});
        } catch (error) {
            console.debug('Could not deliver download status:', error);
        }
    }

    /**
     * Returns the shared pipeline, building it on first use. A
     * `{status: "complete"}` download message is sent on every call.
     * @returns {Promise<*>} The pipeline instance.
     */
    static async getInstance() {
        if (this.instance === null) {
            if (this.loading === null) {
                this.loading = (async () => {
                    await this.updateModelName();
                    return pipeline(this.task, this.model,
                        {
                            progress_callback: async data => {
                                await this.notifyDownload(data);
                            }
                        }
                    );
                })();
            }
            try {
                this.instance = await this.loading;
            } finally {
                // Reset even on failure so that a later call can retry the build.
                this.loading = null;
            }
        }
        await this.notifyDownload({status: "complete"});

        return this.instance;
    }
}
64 |
// Handles maintenance requests: "load" builds the embedding pipeline,
// "clearLocalStorage" empties chrome.storage.local and "pruneEmbeddings"
// keeps only the 10 best-scored stored embedding entries.
// Note: this async listener returns a Promise (not `true`) and never calls
// sendResponse, so senders must not wait for a reply.
// NOTE(review): "clearLocalStorage" leaves the in-memory embeddingsDict
// untouched — confirm whether it should be reset as well.
chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => {
    try {
        switch (request.type) {
            case "load":
                await load();
                break;
            case "clearLocalStorage":
                chrome.storage.local.clear(() => {
                });
                break;
            case "pruneEmbeddings":
                await pruneStoredEmbeddings(10);
                break;
        }
    } catch (error) {
        // Nothing awaits this listener's promise; report the failure here
        // instead of leaving an unhandled rejection.
        console.warn('Handling the "' + request.type + '" message failed:', error);
    }
});
80 |
81 |
/**
 * Makes sure the embedding pipeline exists by requesting the shared instance.
 * @returns {Promise<void>}
 */
async function load() {
    const pipelineReady = EmbedPipeline.getInstance();
    await pipelineReady;
}
85 |
/**
 * Returns the embedding data for `text`, computed with mean pooling and
 * normalization.
 *
 * @param {string} text - The text to embed.
 * @param {boolean} [use_dict=true] - When true, a cached embedding from
 *   embeddingsDict is returned if present and a newly computed one is stored
 *   there. When false the cache is neither read nor written.
 * @returns {Promise<*>} The `data` of the pipeline output.
 */
async function embed(text, use_dict = true) {
    // Own-property check: `in` would also match inherited keys such as
    // "constructor" or "toString" and hand back something that is no embedding.
    if (use_dict && Object.prototype.hasOwnProperty.call(embeddingsDict, text)) {
        return embeddingsDict[text];
    }

    const embedder = await EmbedPipeline.getInstance();
    const output = await embedder(text, {pooling: 'mean', normalize: true});
    const vector = output["data"];
    if (use_dict) {
        embeddingsDict[text] = vector;
    }
    return vector;
}
98 |
99 |
// do on clean-up / unmount
/**
 * Removes stored embedding entries from chrome.storage.local, keeping only the
 * `k` entries with the highest `frecency_score`. Entries that are not marked
 * with `is_embeddings === true` are never touched.
 *
 * @param {number} k - Number of embedding entries to keep.
 * @returns {Promise<void>} Resolves once the removal (if any) has completed.
 */
async function pruneStoredEmbeddings(k) {
    return new Promise((resolve) => {
        chrome.storage.local.get(null, function (allData) {
            const embeddingKeys = Object.keys(allData).filter((key) => {
                const entry = allData[key];
                // Guard against null/undefined values, which would throw on property access.
                return entry !== null && entry !== undefined && entry.is_embeddings === true;
            });

            console.log("All embedding keys found:", embeddingKeys); // This logs all the embedding keys

            // Sort these embedding keys based on frecency scores
            const sortedKeys = embeddingKeys.sort((a, b) => allData[b].frecency_score - allData[a].frecency_score);

            // Keys are unique, so everything after the kept prefix is what has to go.
            const topKKeys = sortedKeys.slice(0, k);
            const keysToRemove = sortedKeys.slice(topKKeys.length);
            console.log(`Removing the following keys: ${keysToRemove}`);

            // Remove the non-top k embeddings from storage.
            if (keysToRemove.length > 0) {
                chrome.storage.local.remove(keysToRemove, () => {
                    console.log(`Successfully removed ${keysToRemove.length} keys.`);
                    resolve();
                });
            } else {
                resolve();
            }
        });
    });
}
127 |
128 |
129 |
/**
 * Persists the in-memory embeddingsDict in chrome.storage.local under the
 * current site key (currID).
 *
 * The dictionary is serialized to JSON, turned into a base64 data URL and
 * stored together with its frecency score, the `is_embeddings` marker and the
 * name of the model in use. A failing write is only reported as a warning.
 *
 * @returns {Promise<void>}
 */
export async function storeEmbeddings() {
    const serialized = JSON.stringify(embeddingsDict);
    const encoded = new TextEncoder().encode(serialized);

    const dataUrl = await new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onerror = (event) => reject(event.target.error);
        reader.onload = (event) => resolve(event.target.result);
        reader.readAsDataURL(new Blob([encoded], {type: 'application/json'}));
    });

    try {
        const record = {
            _body: dataUrl,
            frecency_score: computeFrecencyScore(currID),
            is_embeddings: true,
            model_name: EmbedPipeline.model
        };
        await chrome.storage.local.set({[currID]: record});

        const itemCount = Object.keys(embeddingsDict).length;
        prettyLog("stored " + currID, itemCount + " items");
    } catch (err) {
        console.warn('An error occurred while writing the embeddings to cache:', err)
    }
}
155 |
/**
 * Debug helper: for every cached text, compares the cached embedding with a
 * freshly computed one and logs a red "load differs" entry whenever their
 * cosine similarity is below 0.99.
 *
 * @returns {Promise<void>}
 */
async function verifyLoad() {
    for (const cachedText in embeddingsDict) {
        const fromCache = await embed(cachedText, true);
        const recomputed = await embed(cachedText, false);
        const agreement = cosineSimilarity(fromCache, recomputed);

        if (agreement < 0.99) {
            prettyLog("load differs", agreement, "red");
        }
    }
}
166 |
/**
 * Makes `ID` the current site and loads its stored embeddings from
 * chrome.storage.local into embeddingsDict.
 *
 * Nothing happens when `ID` is already current and embeddings are in memory.
 * When the site changes, the in-memory embeddings of the previous site are
 * dropped first, so that they are not persisted under the new site's key by a
 * later storeEmbeddings() call. Stored data is ignored unless it is marked as
 * embeddings and was produced with the model currently set on EmbedPipeline.
 *
 * @param {string} ID - Site identifier used as the storage key.
 * @returns {Promise<void>}
 */
export async function loadEmbeddings(ID) {
    if (ID === currID && Object.keys(embeddingsDict).length !== 0) {
        return;
    }
    if (ID !== currID) {
        embeddingsDict = {};
    }
    currID = ID;

    const data = await chrome.storage.local.get([ID]);
    // Another load may have switched the site while storage was being read.
    if (currID !== ID) {
        return;
    }

    const stored = data[ID];
    if (!stored || !stored.is_embeddings) {
        return;
    }
    prettyLog("attempting load", ID);
    if (!stored.model_name || stored.model_name !== EmbedPipeline.model) { return; }

    // `_body` is a base64 data URL; decode its payload back into the JSON text.
    const base64 = stored._body.split(',')[1];
    const bytes = Uint8Array.from(atob(base64), (char) => char.charCodeAt(0));
    const parsedData = JSON.parse(new TextDecoder().decode(bytes));

    // Convert the object-with-integer-keys representation into Float32Array.
    // Object.entries only yields own keys and does not rely on a
    // `hasOwnProperty` method, which a stored text key could shadow.
    for (const [textKey, values] of Object.entries(parsedData)) {
        embeddingsDict[textKey] = new Float32Array(Object.values(values));
    }

    prettyLog("loaded " + ID, Object.keys(embeddingsDict).length + " items");
    // await verifyLoad();
}
203 |
204 |
// todo: implement & move to utils
/**
 * Placeholder frecency score: every site currently gets the same value.
 * @param {string} ID - Site identifier (not used yet).
 * @returns {number} Always 4.
 */
function computeFrecencyScore(ID) {
    const placeholderScore = 4; // lol!
    return placeholderScore;
}
209 |
210 |
/**
 * Embeds both texts (one after the other) and returns the cosine similarity
 * of their embeddings.
 *
 * @param {string} text1 - First text.
 * @param {string} text2 - Second text.
 * @returns {Promise<number>} Result of cosineSimilarity for the two embeddings.
 */
export async function similarity(text1, text2) {
    const vectors = [];
    for (const text of [text1, text2]) {
        vectors.push(await embed(text));
    }
    return cosineSimilarity(vectors[0], vectors[1]);
}
217 |
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param {ArrayLike<number>} v1 - First vector.
 * @param {ArrayLike<number>} v2 - Second vector.
 * @returns {number} The cosine similarity; -1 when the vectors differ in
 *   length; 0 when either vector has zero magnitude (including empty
 *   vectors), where the quotient would otherwise be NaN.
 */
function cosineSimilarity(v1, v2) {
    if (v1.length !== v2.length) {
        return -1;
    }
    let dotProduct = 0;
    let v1_mag = 0;
    let v2_mag = 0;
    for (let i = 0; i < v1.length; i++) {
        dotProduct += v1[i] * v2[i];
        v1_mag += v1[i] ** 2;
        v2_mag += v2[i] ** 2;
    }
    const denominator = Math.sqrt(v1_mag) * Math.sqrt(v2_mag);
    if (denominator === 0) {
        // The angle to a zero vector is undefined; report "no similarity".
        return 0;
    }
    return dotProduct / denominator;
}
232 |
--------------------------------------------------------------------------------
/extension/src/utils/cache.js:
--------------------------------------------------------------------------------
1 | // Author: Xenova
2 | // Design a caching API to be used by the extension which implements the same interface as
3 | // the browser's native Cache API (https://developer.mozilla.org/en-US/docs/Web/API/Cache)
4 | // but uses the browser's local storage API (https://developer.chrome.com/docs/extensions/reference/storage/).
5 | //
6 | // Since the local storage API requires all data to be stored as JSON (which doesn't allow some ASCII chars),
7 | // one of the better approaches is to store the response body as a base64-encoded string. This is not ideal,
8 | // as it increases the size of the response body by ~33%, but it's the best we can do with the local storage API.
9 | // See https://stackoverflow.com/a/1443240/13989043 for more information about this.
10 | //
11 | // For serialization (arraybuffer -> string) and unserialization (string -> arraybuffer),
12 | // use the `FileReader` and `Blob` APIs. Although other options are also possible, this approach
13 | // is considered to be better for larger files (like models).
14 | //
15 | // Other references:
16 | // - https://developer.chrome.com/docs/extensions/reference/storage/#property-local
17 | // - https://stackoverflow.com/questions/6965107/converting-between-strings-and-arraybuffers
18 |
export class CustomCache {
    /**
     * Creates a cache handle.
     * @param {string} cacheName - Name of the cache; only stored on the instance.
     */
    constructor(cacheName) {
        this.cacheName = cacheName;
    }

    /**
     * Looks up a request in chrome.storage.local.
     * @param {Request|string} request - The request, or its URL.
     * @returns {Promise<Response|undefined>} The result of fetching the stored
     *   data URL, or undefined when nothing is stored for that URL.
     */
    async match(request) {
        const key = request instanceof Request ? request.url : request;
        const lookup = await chrome.storage.local.get([key]);
        const entry = lookup[key];

        if (!entry) {
            return undefined;
        }
        return fetch(entry._body);
    }

    /**
     * Stores a response in chrome.storage.local, keyed by the request URL.
     * The body is kept as a base64 data URL next to the response metadata.
     * A failing write is only reported as a warning.
     * @param {Request|string} request - The request, or its URL.
     * @param {Response} response - The response to store.
     * @returns {Promise<void>}
     */
    async put(request, response) {
        const key = request instanceof Request ? request.url : request;
        const bytes = await response.arrayBuffer();

        const dataUrl = await new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onerror = (event) => reject(event.target.error);
            reader.onload = (event) => resolve(event.target.result);
            reader.readAsDataURL(new Blob([bytes], { type: 'application/octet-stream' }));
        });

        try {
            const record = {
                _body: dataUrl,

                // Save original response in case
                status: response.status,
                statusText: response.statusText,
                headers: Object.fromEntries(response.headers.entries()),
                url: response.url,
                redirected: response.redirected,
                type: response.type,
                ok: response.ok,
            };
            await chrome.storage.local.set({ [key]: record });
        } catch (err) {
            console.warn('An error occurred while writing the file to cache:', err)
        }
    }
}
84 |
--------------------------------------------------------------------------------
/extension/src/utils/utils.js:
--------------------------------------------------------------------------------
/**
 * Logs "label: message" to the console with a bold, colored label and a
 * normal-weight, colored message (via `%c` styling).
 *
 * @param {*} label - Text shown in bold before the colon.
 * @param {*} message - Text shown after the colon.
 * @param {string} [labelColor='blue'] - CSS color of the label.
 * @param {string} [messageColor='black'] - CSS color of the message.
 */
export function prettyLog(label, message, labelColor = 'blue', messageColor = 'black') {
    const format = "%c" + label + ": %c" + message;
    const labelStyle = "font-weight: bold; color: " + labelColor + ";";
    const messageStyle = "font-weight: normal; color: " + messageColor + ";";

    console.log(format, labelStyle, messageStyle);
}
6 |
7 |
/**
 * Splits `text` into trimmed chunks. A chunk is closed at the first sentence
 * ending ('.', '?', '!', ';', ':', newline or '–') reached once the trimmed
 * chunk is at least `numChars` long; a '.' directly followed by '"' keeps the
 * quote in the same chunk. Whatever remains at the end becomes a last chunk.
 *
 * @param {string} text - The text to split.
 * @param {number} numChars - Minimum trimmed length of a chunk before it may end.
 * @returns {string[]} The chunks, in order, without empty entries.
 */
function splitByChars(text, numChars) {
    const terminators = new Set(['.', '?', '!', ';', ':', '\n', '–']);
    const pieces = [];
    let pending = '';
    let pos = 0;

    while (pos < text.length) {
        const ch = text[pos];
        pending += ch;

        let endsSentence = terminators.has(ch);

        // A period followed by a closing quote: take the quote along.
        if (ch === '.' && text[pos + 1] === '"') {
            pos += 1;
            pending += text[pos];
            endsSentence = true;
        }

        if (endsSentence) {
            const candidate = pending.trim();
            if (candidate.length >= numChars) {
                pieces.push(candidate);
                pending = '';
            }
        }

        pos += 1;
    }

    const leftover = pending.trim();
    if (leftover) {
        pieces.push(leftover);
    }

    return pieces;
}
37 |
38 |
/**
 * Builds the site identifier of a URL: its hostname followed by its path
 * (query string and fragment are not part of it).
 *
 * @param {string} url - An absolute URL.
 * @returns {string} `hostname + pathname`.
 */
export function getSiteID(url) {
    const { hostname, pathname } = new URL(url);
    return hostname + pathname;
}
43 |
44 |
/**
 * Splits readable page content into chunks using splitByChars.
 *
 * @param {string} readableContent - The text to split.
 * @param {number} [numChars=50] - Minimum chunk length passed to splitByChars.
 * @returns {string[]} The chunks returned by splitByChars.
 */
export function splitReadableContent(readableContent, numChars = 50) {
    const chunks = splitByChars(readableContent, numChars);
    return chunks;
}
48 |
49 |
/**
 * Walks the tree below `element` and collects the trimmed, non-empty sentences
 * of every <p> element into `texts`. Children of a <p> element are not
 * visited separately; all other nodes are searched recursively.
 *
 * NOTE(review): `tokenizer` is neither defined nor imported in the visible
 * part of this file, and this function is not exported — confirm whether it
 * is still in use before relying on it.
 *
 * @param {Node} element - Root node of the search.
 * @param {string[]} [texts=[]] - Accumulator the sentences are appended to.
 * @returns {string[]} The accumulator.
 */
function collectTextNodes(element, texts = []) {
    const isParagraph = element.nodeType === Node.ELEMENT_NODE
        && element.tagName.toLowerCase() === 'p';

    if (!isParagraph) {
        for (const child of element.childNodes) {
            collectTextNodes(child, texts);
        }
        return texts;
    }

    // Tokenize the paragraph text into sentences and keep the non-blank ones.
    for (const rawSentence of tokenizer.tokenize(element.textContent)) {
        const sentence = rawSentence.trim();
        if (sentence !== "") {
            texts.push(sentence);
        }
    }
    return texts;
}
66 |
67 |
68 |
--------------------------------------------------------------------------------
/extension/webpack.config.js:
--------------------------------------------------------------------------------
1 | // webpack.config.js
2 | import path from 'path';
3 | import { fileURLToPath } from 'url';
4 |
5 | import HtmlWebpackPlugin from 'html-webpack-plugin';
6 | import CopyPlugin from 'copy-webpack-plugin';
7 | import { VueLoaderPlugin } from 'vue-loader';
8 | import NodePolyfillPlugin from 'node-polyfill-webpack-plugin';
9 | import webpack from 'webpack';
10 | import util from 'util';
11 |
// ES modules have no __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Webpack configuration of the browser extension: one bundle per entry below,
// written to ./build as "<entry name>.js".
const config = {
    mode: 'development',
    devtool: 'inline-source-map',
    entry: {
        // background.js and semantic.js are combined into the single "background" bundle.
        background: ['./src/serviceworkers/background.js', './src/serviceworkers/semantic.js'],
        popup: './src/popup/popup.js',
        content: './src/content/content.js',
        options: './src/options/options.js'
    },
    resolve: {
        // `false` fallbacks: these Node core modules are not resolved to a polyfill.
        fallback: {
            "fs": false,
            "tls": false,
            "net": false,
            "path": false,
            "util": false,
        }
    },
    output: {
        path: path.resolve(__dirname, 'build'),
        filename: '[name].js',
    },
    module: {
        rules: [
            {
                // Vue single-file components
                test: /\.vue$/,
                use: 'vue-loader'
            },
            {
                // Stylesheets imported from JavaScript
                test: /\.css$/,
                use: [
                    'style-loader',
                    'css-loader',
                ],
            },
        ],
    },
    plugins: [
        new NodePolyfillPlugin(),
        // Vue compile-time feature flags
        new webpack.DefinePlugin({
            __VUE_OPTIONS_API__: true,
            __VUE_PROD_DEVTOOLS__: false,
        }),
        new VueLoaderPlugin(),
        // One generated HTML page each for the popup and the options page
        new HtmlWebpackPlugin({
            template: './src/popup/popup.html',
            filename: 'popup.html',
        }),
        new HtmlWebpackPlugin({
            template: './src/options/options.html',
            filename: 'options.html',
        }),
        // Files copied into the build folder as they are
        new CopyPlugin({
            patterns: [
                {
                    from: "public",
                    to: "." // Copies to build folder
                },
                {
                    from: "src/popup/popup.css",
                    to: "popup.css"
                },
                {
                    from: "src/content/content.css",
                    to: "content.css"
                },
                {
                    from: "src/serviceworkers/pdf.js",
                    to: "pdf.js"
                },
                {
                    from: "src/serviceworkers/pdf.worker.js",
                    to: "pdf.worker.js"
                },
                {
                    from: "src/options/options.css",
                    to: "options.css"
                },
            ],
        })
    ],
};

export default config;
98 |
99 |
100 |
--------------------------------------------------------------------------------
/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "checkJs": true,
4 | "strict": true
5 | },
6 | "include": ["src/**/*"]
7 | }
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/do-me/SemanticFinder/a287e14bad6a42b560bab674fab0d95a65da623e/logo.png
--------------------------------------------------------------------------------
/misc/Generate_large_textfile_from_books.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Script for generating large text files \n",
8 | "\n",
9 | "Keeps the metadata concatted with \"|\" as first line"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 7,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Pages: 28986 | Words: 15893741\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import pandas as pd \n",
27 | "\n",
28 | "# download first \"https://huggingface.co/datasets/storytracer/US-PD-Books/resolve/main/data/train-00000-of-00327.parquet?download=true\")\n",
29 | "df = pd.read_parquet(\"train-00000-of-00327.parquet\") \n",
30 | "\n",
31 | "# e.g. 100 books only \n",
32 | "books_number = 100\n",
33 | "df = df.iloc[:books_number]\n",
34 | "df[\"words\"] = df.full_text.apply(lambda x: len(x.split(\" \")))\n",
35 | "print(f\"Pages: {df.page_count.sum()} | Words: {df.words.sum()}\") #df.words.sum())"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 8,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# Define the columns to concatenate, excluding 'full_text'\n",
45 | "metadata_columns = ['ocaid', 'title', 'author', 'year', 'page_count', 'openlibrary_edition', 'openlibrary_work']\n",
46 | "\n",
47 | "# Function to concatenate metadata and full_text\n",
48 | "def concatenate_row(row):\n",
49 | " metadata = '|'.join(row[metadata_columns].astype(str)) # Convert to string and join with '|'\n",
50 | " full_text = row['full_text']\n",
51 | " return metadata + '|' + full_text\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 11,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/html": [
62 | "\n",
63 | "\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | " | \n",
80 | " ocaid | \n",
81 | " title | \n",
82 | " author | \n",
83 | " year | \n",
84 | " page_count | \n",
85 | " openlibrary_edition | \n",
86 | " openlibrary_work | \n",
87 | " full_text | \n",
88 | " words | \n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " 0 | \n",
94 | " worksofcharlesle01leve | \n",
95 | " The works of Charles Lever | \n",
96 | " Lever, Charles James, 1806-1872 | \n",
97 | " 1872 | \n",
98 | " 692 | \n",
99 | " OL13499428M | \n",
100 | " OL3564322W | \n",
101 | " <8 '' ^/^r \\n\\n\\nN V s... | \n",
102 | " 1045410 | \n",
103 | "
\n",
104 | " \n",
105 | " 1 | \n",
106 | " specimensofexpos00lamorich | \n",
107 | " Specimens of exposition | \n",
108 | " Lamont, Hammond, 1864-1909 | \n",
109 | " 1894 | \n",
110 | " 220 | \n",
111 | " OL7034373M | \n",
112 | " OL202608W | \n",
113 | " Ifteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\... | \n",
114 | " 109283 | \n",
115 | "
\n",
116 | " \n",
117 | " 2 | \n",
118 | " recollectionsand00greerich | \n",
119 | " Recollections and reflections : an auto of hal... | \n",
120 | " Green, Wharton J. (Wharton Jackson), 1831-1910 | \n",
121 | " 1906 | \n",
122 | " 400 | \n",
123 | " OL7098980M | \n",
124 | " OL7710550W | \n",
125 | " ; J. GREEN \\n\\n\\nRECOLLECTIONS AND REFL... | \n",
126 | " 229753 | \n",
127 | "
\n",
128 | " \n",
129 | " 3 | \n",
130 | " puddnheadwilsont00twaiiala | \n",
131 | " Pudd'nhead Wilson, and Those extraordinary twins | \n",
132 | " Twain, Mark, 1835-1910 | \n",
133 | " 1922 | \n",
134 | " 322 | \n",
135 | " OL7095992M | \n",
136 | " OL15269096W | \n",
137 | " ROXY HARVESTING AMONG THE KITCHENS \\n\\n\\n... | \n",
138 | " 142528 | \n",
139 | "
\n",
140 | " \n",
141 | " 4 | \n",
142 | " hansbreitmann00lelarich | \n",
143 | " Hans Breitmann in Germany; | \n",
144 | " Leland, Charles Godfrey, 1824-1903 | \n",
145 | " 1895 | \n",
146 | " 184 | \n",
147 | " OL7202758M | \n",
148 | " OL4108366W | \n",
149 | " ;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f . \\n\\... | \n",
150 | " 58760 | \n",
151 | "
\n",
152 | " \n",
153 | "
\n",
154 | "
"
155 | ],
156 | "text/plain": [
157 | " ocaid \n",
158 | "0 worksofcharlesle01leve \\\n",
159 | "1 specimensofexpos00lamorich \n",
160 | "2 recollectionsand00greerich \n",
161 | "3 puddnheadwilsont00twaiiala \n",
162 | "4 hansbreitmann00lelarich \n",
163 | "\n",
164 | " title \n",
165 | "0 The works of Charles Lever \\\n",
166 | "1 Specimens of exposition \n",
167 | "2 Recollections and reflections : an auto of hal... \n",
168 | "3 Pudd'nhead Wilson, and Those extraordinary twins \n",
169 | "4 Hans Breitmann in Germany; \n",
170 | "\n",
171 | " author year page_count \n",
172 | "0 Lever, Charles James, 1806-1872 1872 692 \\\n",
173 | "1 Lamont, Hammond, 1864-1909 1894 220 \n",
174 | "2 Green, Wharton J. (Wharton Jackson), 1831-1910 1906 400 \n",
175 | "3 Twain, Mark, 1835-1910 1922 322 \n",
176 | "4 Leland, Charles Godfrey, 1824-1903 1895 184 \n",
177 | "\n",
178 | " openlibrary_edition openlibrary_work \n",
179 | "0 OL13499428M OL3564322W \\\n",
180 | "1 OL7034373M OL202608W \n",
181 | "2 OL7098980M OL7710550W \n",
182 | "3 OL7095992M OL15269096W \n",
183 | "4 OL7202758M OL4108366W \n",
184 | "\n",
185 | " full_text words \n",
186 | "0 <8 '' ^/^r \\n\\n\\nN V s... 1045410 \n",
187 | "1 Ifteafeirtgs \\n\\n\\nUC-NRLF \\n\\n\\nSPECIMENS \\n\\... 109283 \n",
188 | "2 ; J. GREEN \\n\\n\\nRECOLLECTIONS AND REFL... 229753 \n",
189 | "3 ROXY HARVESTING AMONG THE KITCHENS \\n\\n\\n... 142528 \n",
190 | "4 ;'HP- \\n\\n\\nn \\n\\n\\n\"* \\n\\nr.l»* \\n\\n'f . \\n\\... 58760 "
191 | ]
192 | },
193 | "execution_count": 11,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "df.head()"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 9,
205 | "metadata": {},
206 | "outputs": [
207 | {
208 | "name": "stdout",
209 | "output_type": "stream",
210 | "text": [
211 | "The text file '100_books.txt' has been created.\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "# Apply the function to each row and save to a list\n",
217 | "concatenated_rows = df.iloc[:books_number].apply(concatenate_row, axis=1).tolist()\n",
218 | "\n",
219 | "# Write the concatenated rows to a text file\n",
220 | "with open(f'{books_number}_books.txt', 'w', encoding='utf-8') as f:\n",
221 | " for row in concatenated_rows:\n",
222 | " f.write(row + '\\n')\n",
223 | "\n",
224 | "print(f\"The text file '{books_number}_books.txt' has been created.\")\n"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": []
233 | }
234 | ],
235 | "metadata": {
236 | "kernelspec": {
237 | "display_name": "py3.11",
238 | "language": "python",
239 | "name": "python3"
240 | },
241 | "language_info": {
242 | "codemirror_mode": {
243 | "name": "ipython",
244 | "version": 3
245 | },
246 | "file_extension": ".py",
247 | "mimetype": "text/x-python",
248 | "name": "python",
249 | "nbconvert_exporter": "python",
250 | "pygments_lexer": "ipython3",
251 | "version": "3.11.0"
252 | }
253 | },
254 | "nbformat": 4,
255 | "nbformat_minor": 2
256 | }
257 |
--------------------------------------------------------------------------------
/misc/README.md:
--------------------------------------------------------------------------------
1 | ## Various utilities
2 |
3 | This directory holds miscellaneous utilities, such as notebooks and scripts used for testing.
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "semanticfinder",
3 | "version": "1.0.0",
4 | "description": "Update: just improved the UI - automatically scroll through the results!",
5 | "main": "src/js/index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1",
8 | "start": "webpack serve --mode development",
9 | "build": "webpack --config webpack.config.js"
10 | },
11 | "author": "",
12 | "license": "ISC",
13 | "dependencies": {
14 | "@xenova/transformers": "^2.17.2",
15 | "bootstrap": "^5.3.2",
16 | "codemirror": "^5.52.2",
17 | "deck.gl": "^8.9.34",
18 | "marked": "^12.0.0",
19 | "ollama": "^0.4.9",
20 | "pako": "^2.1.0",
21 | "wasm-bhtsne": "^0.3.3"
22 | },
23 | "devDependencies": {
24 | "copy-webpack-plugin": "^11.0.0",
25 | "css-loader": "^6.8.1",
26 | "favicons": "^7.1.4",
27 | "favicons-webpack-plugin": "^6.0.1",
28 | "html-webpack-plugin": "^5.5.3",
29 | "mini-css-extract-plugin": "^2.7.6",
30 | "pdfjs-dist": "^4.0.379",
31 | "style-loader": "^3.3.3",
32 | "webpack": "^5.88.1",
33 | "webpack-cli": "^5.1.4",
34 | "webpack-dev-server": "^4.15.1"
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/css/styles.css:
--------------------------------------------------------------------------------
1 | #input-text {
2 | height: 50vh;
3 | width: 80vw;
4 | min-width: 80vw;
5 | text-align: left !important;
6 | }
7 | .CodeMirror {
8 | height: 55vh !important;
9 | }
10 |
11 | .CodeMirror.cm-s-default.CodeMirror-wrap {
12 | font-family: 'Open Sans', sans-serif;
13 | }
14 |
15 | #summary_text, #chat_text, #ollama_chat_text {
16 | margin: 10px;
17 | font-weight: 700;
18 | }
19 |
20 | #get_chat, #ollama_get_chat, #get_summary, #dimensionalityReduction{
21 | height: calc(3.5rem + calc(var(--bs-border-width) * 2));
22 | width: 110px;
23 | margin-right: 20px;
24 | }
25 |
26 | table {
27 | table-layout: auto;
28 | margin: 0 auto;
29 | }
30 | th,
31 | td {
32 | word-wrap: break-word;
33 | max-width: 50%;
34 | text-align: center;
35 | }
36 |
37 | .table-bordered {
38 | border: none;
39 | }
40 |
41 | #output-table > tbody > tr:nth-of-type(odd) {
42 | background-color: #f9f9f9;
43 | }
44 |
45 | .table .table {
46 | background-color: transparent;
47 | }
48 |
49 | .highlight-first {
50 | background-color: rgb(0, 255, 81);
51 | }
52 |
53 | .highlight-second {
54 | background-color: rgb(135, 255, 153);
55 | }
56 |
57 | .highlight-third {
58 | background-color: rgb(190, 253, 190);
59 | }
60 |
61 | .highlight-select {
62 | background-color: orange;
63 | }
64 |
65 | #loading {
66 | display: inline-block;
67 | width: 1rem;
68 | height: 1rem;
69 | border: 3px solid rgba(255, 255, 255, 0.3);
70 | border-radius: 50%;
71 | border-top-color: #fff;
72 | animation: spin 1s ease-in-out infinite;
73 | -webkit-animation: spin 1s ease-in-out infinite;
74 | }
75 |
/* Spinner animation: one full clockwise turn per cycle.
   The standard rule sets the unprefixed `transform` as well, so it does not
   depend on the legacy -webkit- property alias. */
@keyframes spin {
  to {
    -webkit-transform: rotate(360deg);
    transform: rotate(360deg);
  }
}
/* Legacy WebKit variant of the same animation. */
@-webkit-keyframes spin {
  to {
    -webkit-transform: rotate(360deg);
  }
}
86 |
87 | #progressBar {
88 | height: 25px;
89 | width: 100%;
90 | }
91 |
92 | #query-text {
93 | height: 75%;
94 | min-width: 80%;
95 | }
96 |
97 |
98 | .submit-button {
99 | height: 75%;
100 | white-space:normal;
101 | text-align: center; /* this seems to break when page size is too small */
102 | }
103 |
104 |
105 | #formGroupCenter {
106 | width: 100%;
107 | }
108 |
109 | .CodeMirror {
110 | font-size: 15px;
111 | }
112 |
113 | #results {
114 | height: 70vh;
115 | overflow-y: auto;
116 | }
117 |
118 | .card {
119 | width: 100%;
120 | transition: background-color 0.3s ease;
121 | }
122 |
123 | .card:hover {
124 | background-color: #f8f9fa;
125 | }
126 |
127 | /*.nav-button {*/
128 | /* width: 60px; !* adjust this to the size you want *!*/
129 | /* margin-right: 1px; !* adds space between buttons *!*/
130 | /*}*/
131 |
132 | #submitGroup {
133 | margin-top: 2vh; /* adjust this value as needed */
134 | }
135 |
136 | /*
137 | see: https://github.com/twbs/bootstrap/issues/33871
138 | */
139 | .form-floating > label { z-index: 3; }
140 |
141 | #advancedFeaturesHeader .accordion-button:hover {
142 | text-decoration: underline;
143 | }
144 |
145 | .accordion-button::after {
146 | display: none;
147 | }
148 |
149 | .card-title {
150 | font-size: 0.9em;
151 | }
152 | .card-subtitle {
153 | font-size: 0.8em;
154 | }
155 |
156 | #SemanticFinderLogo{
157 | display: block;
158 | margin: 0 auto;
159 | max-width: 250px;
160 | }
161 |
162 | @media (min-width: 992px) {
163 | #introContainer {
164 | display: inline-flex;
165 | }
166 | #introContentDiv{
167 | padding-left: 20px;
168 | }
169 | }
170 |
171 | @media (max-width: 992px) {
172 | .col-sm-9 {
173 | width: 100% !important;
174 | }
175 | #results{
176 | height: unset !important;
177 | }
178 | .col-sm-3 {
179 | width: 100% !important;
180 | }
181 | ul {
182 | padding-left: 0 !important;
183 | }
184 | }
185 |
186 | .toast {
187 | display: none;
188 | position: fixed;
189 | top: 16px;
190 | left: 50%;
191 | transform: translateX(-50%);
192 | background-color: white !important;
193 | color: #fff;
194 | padding: 10px 20px;
195 | border-radius: 5px;
196 | z-index: 1000;
197 | width: 250px !important;
198 | font-size: 20px !important;
199 | }
200 |
201 |
202 | #closeToastButton{
203 | cursor: pointer;
204 | right: -47px;
205 | position: relative;
206 | }
207 |
208 | /* Style for the point labels
209 | .point-label {
210 | display: none;
211 | position: absolute;
212 | background-color: #ffffff;
213 | padding: 8px;
214 | border: 2px solid #4a90e2;
215 | border-radius: 10px;
216 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
217 | color: #333333;
218 | font-family: 'Open Sans', sans-serif;
219 | font-size: 14px;
220 | pointer-events: none;
221 | z-index: 100000000000000000000;
222 | opacity: 1 !important;
223 | }
224 |
225 | /* Style for the tooltip
226 | .tooltip {
227 | position: absolute;
228 | text-align: center;
229 | width: auto;
230 | height: 36px;
231 | padding: 6px;
232 | font-family: 'Open Sans', sans-serif;
233 | background: #4a90e2;
234 | color: #ffffff;
235 | border: 0px;
236 | border-radius: 12px;
237 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
238 | pointer-events: none;
239 | z-index: 100000000000000000000;
240 | opacity: 1 !important;
241 | }*/
242 |
243 | #plot-container {
244 | max-height: 700px;
245 | height: 0;
246 | }
247 |
248 | #deckgl {
249 | position: relative !important;
250 | }
251 |
/* Tooltip shown next to a hovered/clicked scatterplot point. */
#tooltip {
  position: absolute;
  /* opacity is limited to the range 0-1; the former value 1000 was clamped to 1 */
  opacity: 1;
  font-size: 20px;
  border-radius: var(--bs-border-radius);
  background-color: #f5f8ffcf;
  outline: 2px solid #dfebff;
  padding: 8px;
  max-width: 500px;
}
--------------------------------------------------------------------------------
/src/js/semantic.js:
--------------------------------------------------------------------------------
1 | import { env, cos_sim} from '@xenova/transformers';
2 | import { loadScatterplot } from './utils.js';
3 |
4 | // @ts-ignore
5 | env.allowLocalModels = false;
6 |
7 | /**
8 | * @type {Worker}
9 | */
10 | const worker = new Worker(new URL('./worker.js', import.meta.url), {
11 | type: 'module'
12 | });
13 |
14 | window.semanticWorker = worker;
15 |
16 | /**
17 | * @type {Array}
18 | */
19 | let queryEmbedding;
20 |
21 | /**
22 | * @type {Object}
23 | */
24 | const similarityResolveMap = {};
25 |
26 | /**
27 | * @type {Object}
28 | */
29 | const tokensResolveMap = {};
30 |
31 | /**
32 | * @type Function
33 | */
34 | let loadResolve;
35 |
36 | /**
37 | * @type Function
38 | */
39 | let queryResolve;
40 |
/**
 * Start a browser download of in-memory data.
 *
 * Wraps `data` in a Blob, points a temporary `<a download>` element at an
 * object URL for that Blob and clicks the link programmatically.
 *
 * @param {BlobPart} data - Content of the file to download.
 * @param {string} filename - File name suggested to the browser.
 * @param {string} mimeType - MIME type assigned to the Blob.
 */
function downloadFile(data, filename, mimeType) {
    const blob = new Blob([data], { type: mimeType });
    const objectUrl = window.URL.createObjectURL(blob);

    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = filename;

    // The link is attached to the body for the programmatic click
    document.body.appendChild(link);
    try {
        link.click();
    } finally {
        document.body.removeChild(link);
        // Release the object URL so the Blob can be garbage-collected; deferred
        // to a later task so the click above has been processed first.
        setTimeout(() => window.URL.revokeObjectURL(objectUrl), 0);
    }
}
55 |
/**
 * Refresh one of the model-download progress bars from a worker progress event.
 *
 * On a 'progress' event for one of `trackedFiles` the bar width, label and
 * `aria-valuenow` are updated; on 'ready' the bar is filled, its label is
 * cleared and the pending load promise (if any) is resolved. Other statuses
 * are ignored.
 *
 * @param {string} barId - Element id of the progress bar.
 * @param {string[]} trackedFiles - Only progress events for these files move the bar.
 * @param {Object} data - Event payload; `status`, `file` and `progress` are read.
 * @param {function(string): string} formatLabel - Builds the bar text from the two-decimal progress string.
 */
function updateDownloadBar(barId, trackedFiles, data, formatLabel) {
    const bar = document.getElementById(barId);

    if (data.status === 'progress') {
        if (!trackedFiles.includes(data.file)) { return; }
        const progress = data.progress.toFixed(2);
        bar.style.width = progress + '%';
        bar.textContent = formatLabel(progress);
        bar.setAttribute('aria-valuenow', progress);
    } else if (data.status === 'ready') {
        bar.style.width = '100%';
        bar.setAttribute('aria-valuenow', 100);
        bar.textContent = "";
        // Guarded: a 'ready' event may arrive when no load promise is pending
        loadResolve?.();
    }
}

/**
 * Resolve and forget the pending promise stored in `pendingMap` under `key`.
 *
 * @param {Object} pendingMap - Map from request text to promise resolver.
 * @param {string} key - Text the worker echoed back in its reply.
 * @param {function(): *} makeValue - Produces the value to resolve with; only called when a resolver exists.
 */
function settlePending(pendingMap, key, makeValue) {
    // Own-property check so a text such as "constructor" cannot pick up an inherited member
    const resolve = Object.prototype.hasOwnProperty.call(pendingMap, key) ? pendingMap[key] : undefined;
    if (typeof resolve !== 'function') {
        console.error('Received a worker reply without a pending request.');
        return;
    }
    delete pendingMap[key];
    resolve(makeValue());
}

/**
 * Dispatch messages coming back from the semantic worker, keyed by `message.type`.
 *
 * @param {MessageEvent} event - Worker message; `event.data.type` selects the branch.
 */
worker.onmessage = function (event) {
    const message = event.data;

    switch (message.type) {
        case 'embeddingsDict':
            // Offer the gzipped data under the file name chosen by the worker
            downloadFile(message.data, message.filename, 'application/gzip');
            break;
        case "download":
            // NOTE(review): 'onnx/model.onnx' is assumed to be the file name reported when the
            // "quantized" option is off, so progress is shown in that case too - confirm.
            updateDownloadBar(
                'loading-progress',
                ["onnx/model_quantized.onnx", "onnx/model.onnx"],
                message.data,
                (progress) => progress + "%"
            );
            break;
        case "chat_download":
            updateDownloadBar(
                'chat-progress',
                ["onnx/decoder_model_merged_quantized.onnx"],
                message.data,
                (progress) => Math.round(progress) + '%'
            );
            break;
        case "summary_download":
            updateDownloadBar(
                'summary-progress',
                ["onnx/decoder_model_merged_quantized.onnx"],
                message.data,
                (progress) => Math.round(progress) + '%'
            );
            break;
        case 'chat':
            // NOTE(review): model output is inserted as HTML; if it is meant to be plain
            // text, `textContent` would be the safer choice - confirm intent.
            document.getElementById("chat_text").innerHTML = message.chat_text;
            queryResolve?.(message.chat_text);
            break;
        case 'summary':
            // NOTE(review): same HTML-insertion caveat as for 'chat'.
            document.getElementById("summary_text").innerHTML = message.summary_text;
            queryResolve?.(message.summary_text);
            break;
        case 'query':
            // Keep the query embedding for the similarity computations below
            queryEmbedding = message.embedding;
            queryResolve?.();
            break;
        case 'similarity':
            settlePending(
                similarityResolveMap,
                message.text,
                () => cos_sim(message.embedding, queryEmbedding)
            );
            break;
        case 'tokens':
            settlePending(tokensResolveMap, message.text, () => message.tokens);
            break;
        case 'tsne':
            console.log(message.plotDataArray);
            loadScatterplot(message.plotDataArray);
            break;
        default:
            console.error('Unknown message type: ' + message.type);
    }
};
149 |
/**
 * Request the similarity between `text` and the current query from the worker.
 *
 * Sends a 'similarity' message (including the state of the `inferencingActive`
 * checkbox) and stores the promise's resolver in `similarityResolveMap` under
 * `text`; the worker message handler resolves it with the `cos_sim` result.
 *
 * @param {string} text - Text to compare against the query.
 * @returns {Promise} Resolves with the similarity value once the worker replies.
 */
export async function similarity(text) {
    const inferencingActive = document.getElementById("inferencingActive").checked;
    const request = { type: 'similarity', inferencingActive, text };
    worker.postMessage(request);

    return new Promise((resolve) => {
        similarityResolveMap[text] = resolve;
    });
}
164 |
/**
 * Ask the worker to summarize `text`.
 *
 * The resolver is kept in the shared `queryResolve` slot; the worker message
 * handler calls it with the summary text when the 'summary' reply arrives.
 *
 * @param {string} text - Text to summarize.
 * @returns {Promise} Resolves with the summary text.
 */
export async function summarizeText(text) {
    const request = { type: 'summary', text };
    worker.postMessage(request);

    return new Promise((resolve) => {
        queryResolve = resolve;
    });
}
179 |
/**
 * Send a chat prompt to the worker.
 *
 * The resolver is kept in the shared `queryResolve` slot; the worker message
 * handler calls it with the chat text when the 'chat' reply arrives.
 *
 * @param {string} text - Prompt text.
 * @param {number} max_new_tokens - Forwarded to the worker as `max_new_tokens`.
 * @returns {Promise} Resolves with the chat text.
 */
export async function chatText(text, max_new_tokens) {
    const request = { type: 'chat', max_new_tokens, text };
    worker.postMessage(request);

    return new Promise((resolve) => {
        queryResolve = resolve;
    });
}
196 |
/**
 * Ask the worker to embed the query text.
 *
 * The resolver is kept in the shared `queryResolve` slot. When the 'query'
 * reply arrives the message handler stores the embedding in `queryEmbedding`
 * and resolves this promise without a value.
 *
 * @param {string} text - Query text to embed.
 * @returns {Promise} Resolves (with no value) once the embedding is stored.
 */
export async function embedQuery(text) {
    const request = { type: 'query', text };
    worker.postMessage(request);

    return new Promise((resolve) => {
        queryResolve = resolve;
    });
}
211 |
/**
 * Ask the worker to tokenize `text`.
 *
 * Sends a 'getTokens' message and stores the resolver in `tokensResolveMap`
 * under `text`; the message handler resolves it with the tokens from the
 * worker's 'tokens' reply.
 *
 * @param {string} text - Text to tokenize.
 * @returns {Promise} Resolves with the tokens reported by the worker.
 */
export async function getTokens(text) {
    const request = { type: 'getTokens', text };
    worker.postMessage(request);

    return new Promise((resolve) => {
        tokensResolveMap[text] = resolve;
    });
}
226 |
/**
 * Tell the worker to load the embedding model and reset its progress bar.
 *
 * Reads the `quantized` checkbox, resets the 'loading-progress' bar and posts
 * a 'load' message. The resolver is kept in `loadResolve`, which the message
 * handler calls when the download reports 'ready'.
 *
 * @param {string} modelName - Model identifier forwarded as `model_name`.
 * @returns {Promise} Resolves once the worker reports the model as ready.
 */
export async function loadSemantic(modelName) {
    const useQuantized = document.getElementById("quantized").checked;

    const progressBar = document.getElementById('loading-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const request = { type: 'load', model_name: modelName, quantized: useQuantized };
    worker.postMessage(request);

    return new Promise((resolve) => {
        loadResolve = resolve;
    });
}
245 |
/**
 * Tell the worker to load a chat model and reset the chat progress bar.
 *
 * Model names containing "Qwen" are requested as 'load_text-generation', all
 * others as 'load_text2text-generation'. The resolver is kept in
 * `loadResolve`, which the message handler calls on 'ready'.
 *
 * @param {string} modelName - Model identifier forwarded as `model_name`.
 * @returns {Promise} Resolves once the worker reports the model as ready.
 */
export async function loadChat(modelName) {
    const progressBar = document.getElementById('chat-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const requestType = modelName.includes("Qwen")
        ? 'load_text-generation'
        : 'load_text2text-generation';
    worker.postMessage({ type: requestType, model_name: modelName });

    return new Promise((resolve) => {
        loadResolve = resolve;
    });
}
271 |
/**
 * Tell the worker to load a summarization model and reset its progress bar.
 *
 * Posts a 'load_summary' message; the resolver is kept in `loadResolve`,
 * which the message handler calls on 'ready'.
 *
 * @param {string} modelName - Model identifier forwarded as `model_name`.
 * @returns {Promise} Resolves once the worker reports the model as ready.
 */
export async function loadSummary(modelName) {
    const progressBar = document.getElementById('summary-progress');
    progressBar.style.width = '0%';
    progressBar.textContent = 'Loading model...';

    const request = { type: 'load_summary', model_name: modelName };
    worker.postMessage(request);

    return new Promise((resolve) => {
        loadResolve = resolve;
    });
}
286 |
--------------------------------------------------------------------------------
/src/js/utils.js:
--------------------------------------------------------------------------------
1 | import { getTokens } from './semantic';
2 | import { Deck } from '@deck.gl/core';
3 | import { ScatterplotLayer, LineLayer } from '@deck.gl/layers';
4 | import {setProgressBarValue } from './index.js';
5 |
6 | import * as pdfjsLib from 'pdfjs-dist/webpack.mjs';
7 | pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.js';
8 |
9 | //import {ScatterplotLayer} from '@deck.gl/layers';
/**
 * Split `text` into chunks using the strategy named by `splitType`.
 *
 * @param {string} text - Text to split.
 * @param {string} splitType - 'Regex', 'Sentence', 'Words', 'Chars', 'Tokens' or 'JinaAI'.
 * @param {string} splitParam - The pattern for 'Regex'; otherwise a size that is
 *   parsed as a base-10 integer (unused for 'Sentence').
 * @returns {Promise<Array<string> | null>} The chunks, or null for an unknown
 *   `splitType` (the individual splitters may also return null).
 */
export async function splitText(text, splitType, splitParam) {
    switch (splitType) {
        case 'Regex':
            return splitByRegex(text, splitParam);
        case 'Sentence':
            return splitBySentences(text);
        // Explicit radix: sizes are decimal, so e.g. "0x10" is not read as hexadecimal 16
        case 'Words':
            return splitByWords(text, Number.parseInt(splitParam, 10));
        case 'Chars':
            return splitByChars(text, Number.parseInt(splitParam, 10));
        case 'Tokens':
            return await splitByTokens(text, Number.parseInt(splitParam, 10));
        case 'JinaAI':
            return await splitWithJinaAI(text, Number.parseInt(splitParam, 10));
        default:
            console.error('Invalid split type');
            return null;
    }
}
35 |
/**
 * Split `text` into chunks of whitespace-separated words, starting a new chunk
 * whenever the token count of the current chunk plus that of the next word
 * would exceed `numTokens`. Token counts come from `getTokens`.
 *
 * @param {string} text - Text to split (on single spaces).
 * @param {number} numTokens - Token budget per chunk; must be an integer.
 * @returns {Promise<Array<string> | null>} The chunks, or null if `numTokens`
 *   is not an integer.
 */
async function splitByTokens(text, numTokens) {
    // Same validation as splitByWords: with NaN the size comparison below is
    // never true and the whole text would silently end up in a single chunk.
    if (!Number.isInteger(numTokens)) {
        console.error('numTokens must be an integer.');
        return null;
    }

    const chunks = [];

    for (const word of text.split(' ')) {
        const wordTokens = await getTokens(word);
        const lastIndex = chunks.length - 1;

        if (lastIndex < 0) {
            chunks.push(word);
            continue;
        }

        // The current chunk is re-tokenized because it changes with every appended word
        const chunkTokens = await getTokens(chunks[lastIndex]);
        if (chunkTokens.length + wordTokens.length > numTokens) {
            chunks.push(word);
        } else {
            chunks[lastIndex] += ' ' + word;
        }
    }

    console.log("Number of chunks: " + chunks.length);
    return chunks;
}
60 |
/**
 * Group the space-separated words of `text` into chunks of `numWords` words.
 * The last chunk may be shorter; chunks consisting only of whitespace are dropped.
 *
 * @param {string} text - Text to split (on single spaces).
 * @param {number} numWords - Words per chunk; must be an integer.
 * @returns {Array<string> | null} The chunks, or null if `numWords` is not an integer.
 */
function splitByWords(text, numWords) {
    if (isNaN(numWords) || !Number.isInteger(numWords)) {
        console.error('numWords must be an integer.');
        return null;
    }

    const grouped = [];
    let pending = [];

    for (const word of text.split(' ')) {
        pending.push(word);
        if (pending.length !== numWords) {
            continue;
        }
        grouped.push(pending.join(' '));
        pending = [];
    }

    // Flush the words left over after the last full group
    if (pending.length > 0) {
        grouped.push(pending.join(' '));
    }

    const chunks = grouped.filter((chunk) => chunk.trim().length > 0);
    console.log(`Number of chunks: ${chunks.length}`);
    return chunks;
}
95 |
/**
 * Split `text` into chunks of whole words, starting a new chunk whenever
 * appending the next word (plus the joining space) would make the current
 * chunk longer than `numChars` characters. A single word longer than
 * `numChars` still forms its own chunk.
 *
 * @param {string} text - Text to split (on single spaces).
 * @param {number} numChars - Maximum chunk length in characters; must be an integer.
 * @returns {Array<string> | null} The chunks, or null if `numChars` is not an integer.
 */
function splitByChars(text, numChars) {
    // Same validation as splitByWords: with NaN the size comparison below is
    // never true and the whole text would silently end up in a single chunk.
    if (!Number.isInteger(numChars)) {
        console.error('numChars must be an integer.');
        return null;
    }

    const chunks = [];

    for (const word of text.split(' ')) {
        const lastIndex = chunks.length - 1;

        // "+ 1" accounts for the space that joins the word to the chunk
        if (lastIndex < 0 || chunks[lastIndex].length + word.length + 1 > numChars) {
            chunks.push(word);
        } else {
            chunks[lastIndex] += ' ' + word;
        }
    }

    console.log("Number of chunks: " + chunks.length);
    return chunks;
}
118 |
/**
 * Split `text` into sentences, i.e. runs of text ending in one or more of
 * `.`, `!` or `?`. Text after the last terminator is kept as a final chunk
 * unless it is whitespace only.
 *
 * @param {string} text - Text to split.
 * @returns {Array<string>} The sentences; an empty array if `text` has no content to split.
 */
function splitBySentences(text) {
    // First alternative: a terminated sentence. Second alternative: the
    // unterminated remainder at the very end, which would otherwise be dropped.
    // `match` returns null when nothing matches, hence the fallback to [].
    const matches = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) ?? [];

    // Only the trailing remainder can be whitespace-only; discard it in that case
    const chunks = matches.filter((chunk) => chunk.trim().length > 0);

    console.log("Number of chunks: " + chunks.length);
    return chunks;
}
129 |
/**
 * Split `text` at every match of the regular expression source `r`.
 *
 * @param {string} text - Text to split.
 * @param {string} r - Pattern source, compiled with the global flag.
 * @returns {Array<string>} The pieces returned by `String.prototype.split`.
 */
function splitByRegex(text, r) {
    const separator = new RegExp(r, 'g');
    const pieces = text.split(separator);

    console.log(`Number of chunks: ${pieces.length}`);
    return pieces;
}
143 |
/**
 * Split `text` by POSTing it to the Jina AI segmenter endpoint.
 *
 * @param {string} text - Text to split; sent as `content`.
 * @param {number} numChars - Sent as `max_chunk_length`.
 * @returns {Promise<Array<string> | null>} The `chunks` property of the JSON
 *   response (an empty array if it is missing/falsy), or null on an HTTP or
 *   fetch error.
 */
async function splitWithJinaAI(text, numChars) {
    const payload = {
        content: text,
        return_chunks: true,
        max_chunk_length: numChars
    };

    try {
        const requestInit = {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(payload)
        };
        const response = await fetch('https://segment.jina.ai/', requestInit);

        if (!response.ok) {
            console.error('HTTP error:', response.status, response.statusText);
            return null;
        }

        const body = await response.json();
        // The API is expected to return the chunks in a property called 'chunks'
        const chunks = body.chunks || [];

        console.log(`Number of chunks: ${chunks.length}`);
        return chunks;
    } catch (error) {
        console.error('Fetch error:', error);
        return null;
    }
}
183 |
184 | // Example usage:
185 | // splitWithJinaAI("Your text here", 1000).then(chunks => console.log(chunks));
186 |
187 |
188 |
189 | // Sorting algorithms: heap-based sorting is quite superior for 1000+ and usually less than half of the time of normal sorting
190 | // might be interesting to use it once indices become larger than 100k but for now not a bottleneck
191 |
/**
 * Baseline for the sorting benchmark: sorts all entries of `inputTexts` by
 * value in descending order and logs how long that took. The sorted result is
 * not returned.
 *
 * @param {Object} inputTexts - Object whose values are numeric scores.
 */
function normalSorting(inputTexts) {
    const startedAt = performance.now();

    const entries = Object.entries(inputTexts);
    entries.sort(([, scoreA], [, scoreB]) => scoreB - scoreA);

    const elapsed = performance.now() - startedAt;
    console.log(`Original code took ${elapsed} milliseconds`);
}
200 |
/**
 * Binary max-heap over `[key, value]` pairs, ordered by the value at index 1.
 * The array passed to the constructor is copied, not modified.
 */
class MaxHeap {
    /**
     * @param {Array<Array>} array - Entries whose element at index 1 is the priority.
     */
    constructor(array) {
        this.heap = [...array];
        this.buildHeap();
    }

    /** Establish the heap property by sifting down every non-leaf node, last one first. */
    buildHeap() {
        let index = Math.floor(this.heap.length / 2) - 1;
        while (index >= 0) {
            this.heapifyDown(index);
            index -= 1;
        }
    }

    /**
     * Sift the entry at index `i` down until neither child has a larger value.
     * @param {number} i - Index to start from.
     */
    heapifyDown(i) {
        let current = i;

        for (;;) {
            const size = this.heap.length;
            let largest = current;

            // Left child is checked before the right one; ties keep the earlier candidate
            for (const child of [2 * current + 1, 2 * current + 2]) {
                if (child < size && this.heap[child][1] > this.heap[largest][1]) {
                    largest = child;
                }
            }

            if (largest === current) {
                return;
            }

            this.swap(current, largest);
            current = largest;
        }
    }

    /**
     * Remove and return the entry with the largest value.
     * @returns {Array | null} The removed entry, or null if the heap is empty.
     */
    extractMax() {
        const { heap } = this;
        if (heap.length === 0) {
            return null;
        }

        const top = heap[0];
        const tail = heap.pop();

        // Unless the heap is now empty, move the former last entry to the root and restore order
        if (heap.length > 0) {
            heap[0] = tail;
            this.heapifyDown(0);
        }

        return top;
    }

    /**
     * Exchange the entries at indices `i` and `j`.
     * @param {number} i
     * @param {number} j
     */
    swap(i, j) {
        const held = this.heap[i];
        this.heap[i] = this.heap[j];
        this.heap[j] = held;
    }
}
254 |
/**
 * Return the `n` entries of `inputTexts` with the largest values, in
 * descending order, using a MaxHeap instead of sorting every entry.
 *
 * @param {Object} inputTexts - Object whose values are numeric scores.
 * @param {number} n - Number of entries wanted; capped at the number of entries.
 * @returns {Array<Array>} Up to `n` `[key, value]` pairs, largest value first.
 */
export function heapBasedSorting(inputTexts, n) {
    const entries = Object.entries(inputTexts);
    const heap = new MaxHeap(entries);

    const wanted = Math.min(n, entries.length);
    const largest = [];
    while (largest.length < wanted) {
        largest.push(heap.extractMax());
    }

    return largest;
}
273 |
274 | /*
275 | // Test objects
276 | function generateTestObject(size) {
277 | const testObject = {};
278 | for (let i = 0; i < size; i++) {
279 | testObject[`key${i}`] = Math.random();
280 | }
281 | return testObject;
282 | }
283 |
284 | //const obj100 = generateTestObject(100);
285 | //const obj10000 = generateTestObject(10000);
286 | //const obj100000 = generateTestObject(100000);
287 |
288 | // Usage
289 | //const n = 5; // Change this to the desired number of largest values
290 |
291 | //normalSorting(obj100);
292 | //heapBasedSorting(obj100, n);
293 |
294 | //normalSorting(obj10000);
295 | //heapBasedSorting(obj10000, n);
296 |
297 | //normalSorting(obj100000);
298 | //heapBasedSorting(obj100000, n);
299 |
300 | Original code took 0.19999999925494194 milliseconds
301 | Heap-based solution took 0.10000000149011612 milliseconds
302 |
303 | Original code took 19.5 milliseconds
304 | Heap-based solution took 9.299999997019768 milliseconds
305 |
306 | Original code took 166.69999999925494 milliseconds
307 | Heap-based solution took 60.5 milliseconds
308 |
309 | */
310 |
const toastMessage = document.getElementById("toastMessage");
const toastText = document.getElementById("toastText");
const closeToastButton = document.getElementById("closeToastButton");

// Handle of the pending auto-hide timer, or null when none is scheduled
let toastHideTimer = null;

/**
 * Show `message` in the toast element and hide it again after `timeout` ms.
 *
 * @param {string} message - Text to display.
 * @param {number} [timeout=2500] - Delay in milliseconds before the toast is hidden.
 */
export function showToast(message, timeout=2500) {
    toastText.textContent = message;
    toastMessage.style.display = "block";

    // Restart the countdown so the timer of an earlier toast cannot hide this one too early
    clearTimeout(toastHideTimer);
    toastHideTimer = setTimeout(hideToast, timeout);
}

/** Hide the toast and cancel any pending auto-hide timer. */
function hideToast() {
    clearTimeout(toastHideTimer);
    toastHideTimer = null;
    toastMessage.style.display = "none";
}

closeToastButton.addEventListener("click", () => {
    hideToast();
});
331 |
/**
 * Build line segments for a square grid spanning -gridSize..gridSize on both
 * axes: first the vertical lines (one per integer x), then the horizontal
 * lines (one per integer y). Every segment carries the same grey color.
 *
 * @param {number} [gridSize=20] - Half-width of the grid.
 * @returns {Array<{sourcePosition: number[], targetPosition: number[], color: number[]}>}
 */
function generateGridData(gridSize = 20) {
    const segments = [];
    const addSegment = (sourcePosition, targetPosition) => {
        segments.push({ sourcePosition, targetPosition, color: [169, 169, 169] });
    };

    // Vertical lines
    for (let x = -gridSize; x <= gridSize; x++) {
        addSegment([x, -gridSize], [x, gridSize]);
    }

    // Horizontal lines
    for (let y = -gridSize; y <= gridSize; y++) {
        addSegment([-gridSize, y], [gridSize, y]);
    }

    return segments;
}
355 |
const plotContainer = document.getElementById("plot-container");
let deckgl;

/**
 * Smallest and largest value of `pick(item)` over `items`.
 * A plain loop is used instead of `Math.min(...values)`: spreading passes every
 * element as a call argument, which can exceed the engine's limit on very large arrays.
 *
 * @param {Array} items
 * @param {function(*): number} pick
 * @returns {number[]} `[min, max]`; `[Infinity, -Infinity]` for an empty array.
 */
function scatterExtent(items, pick) {
    let min = Infinity;
    let max = -Infinity;
    for (const item of items) {
        const value = pick(item);
        if (value < min) { min = value; }
        if (value > max) { max = value; }
    }
    return [min, max];
}

/**
 * Map `value` from [min, max] to [0, 1]. When the range is empty (all values
 * equal) `whenFlat` is returned instead of the NaN a 0/0 division would give.
 */
function scatterRescale(value, min, max, whenFlat) {
    const span = max - min;
    return span > 0 ? (value - min) / span : whenFlat;
}

/**
 * Show the tooltip for a picked scatterplot point, or hide it when nothing is picked.
 * Used for both hover and click events.
 *
 * @param {Object} info - Picking info passed by the layer; `object`, `x` and `y` are read.
 */
function showScatterTooltip(info) {
    const tooltip = document.getElementById('tooltip');

    if (!info.object) {
        tooltip.style.display = 'none';
        return;
    }

    const rect = document.getElementById('deckgl').getBoundingClientRect();

    // Pick position plus canvas offset and scroll position, shifted by (+30, -50) px
    const left = window.scrollX + info.x + rect.left + 30;
    const top = window.scrollY + info.y + rect.top + -50;

    // Label and similarity on separate lines. Text nodes are used so that markup
    // characters in the label (taken from the user's document) are shown literally.
    tooltip.replaceChildren(
        document.createTextNode(String(info.object.label)),
        document.createElement('br'),
        document.createTextNode(`Similarity: ${info.object.similarity.toFixed(2)}`)
    );
    tooltip.style.left = `${left}px`;
    tooltip.style.top = `${top}px`;
    tooltip.style.display = 'block';
}

/**
 * Replace the current scatterplot (if any) with one drawn from `data`.
 *
 * Coordinates are normalized to [0, 1] per axis and the similarity is mapped to
 * the opacity of each (blue) point. The plot container is expanded to 700px.
 *
 * @param {Array<{x: number, y: number, similarity: number, label: string}>} data - Points to plot.
 */
export async function loadScatterplot(data) {
    removeScatterplot();

    const [minSimilarity, maxSimilarity] = scatterExtent(data, item => item.similarity);
    const [minX, maxX] = scatterExtent(data, item => item.x);
    const [minY, maxY] = scatterExtent(data, item => item.y);

    const points = data.map(item => {
        // Normalized similarity, clamped to [0, 1], is used as opacity;
        // if all similarities are equal every point is drawn fully opaque.
        const normalizedSimilarity = scatterRescale(item.similarity, minSimilarity, maxSimilarity, 1);
        const alpha = Math.min(1, Math.max(0, normalizedSimilarity));

        return {
            // A degenerate axis (all values equal) is placed at the center
            coordinates: [
                scatterRescale(item.x, minX, maxX, 0.5),
                scatterRescale(item.y, minY, maxY, 0.5)
            ],
            color: [0, 0, 255, Math.floor(alpha * 255)], // RGBA
            similarity: item.similarity,
            label: item.label,
        };
    });

    // Center the view on the bounding box of the normalized points
    const [boundsMinX, boundsMaxX] = scatterExtent(points, point => point.coordinates[0]);
    const [boundsMinY, boundsMaxY] = scatterExtent(points, point => point.coordinates[1]);
    const centerX = (boundsMinX + boundsMaxX) / 2;
    const centerY = (boundsMinY + boundsMaxY) / 2;

    deckgl = new Deck({
        canvas: 'deckgl',
        container: 'plot-container',
        initialViewState: {
            // Fall back to the middle of the unit square when there are no points
            latitude: Number.isFinite(centerY) ? centerY : 0.5,
            longitude: Number.isFinite(centerX) ? centerX : 0.5,
            zoom: 9
        },
        controller: true,
        pickingRadius: 25,
        layers: [
            // (A LineLayer grid built from generateGridData() used to be sketched here; it is disabled.)
            // ScatterplotLayer with all points added right away
            new ScatterplotLayer({
                id: 'scatterplot',
                data: points,
                getPosition: d => d.coordinates,
                getRadius: Number.parseInt(document.getElementById("scatterplotRadius").value, 10),
                getFillColor: d => d.color,
                pickable: true, // Enable picking for hover/click interaction
                onHover: showScatterTooltip,
                onClick: showScatterTooltip
            })
        ]
    });

    plotContainer.style.height = "700px";
}
479 |
/**
 * Dispose of the current deck.gl instance, if one exists, and clear the
 * module-level reference to it.
 */
export function removeScatterplot() {
    if (!deckgl) {
        return;
    }

    deckgl.finalize();
    deckgl = null;
}
486 |
487 | // pdf loading logic for local and remote
488 |
/**
 * Extract the text of every page of a loaded PDF and hand the combined text
 * to `resolve`.
 *
 * All pages are requested up front; the text items of a page are joined with
 * spaces. Each page is prefixed with a `[Document: ..., Page: N]` header and
 * pages are separated by a blank line.
 *
 * @param {Object} pdf - Loaded PDF document; `numPages` and `getPage` are used.
 * @param {string} documentIdentifier - Name written into each page header.
 * @param {function(string): void} resolve - Called with the full text.
 * @param {function(*): void} reject - Called with the error if any page fails.
 * @param {Function} updateProgress - Not used by this function.
 */
function processPdf(pdf, documentIdentifier, resolve, reject, updateProgress) {
    const pageNumbers = Array.from({ length: pdf.numPages }, (_, index) => index + 1);

    const pageTexts = pageNumbers.map(pageNumber =>
        pdf.getPage(pageNumber)
            .then(page => page.getTextContent())
            .then(textContent => textContent.items.map(item => item.str).join(' '))
    );

    Promise.all(pageTexts)
        .then(texts => {
            // Prefix every page with its document name and 1-based page number
            const sections = texts.map((pageText, index) =>
                `[Document: ${documentIdentifier}, Page: ${index + 1}]\n${pageText}`
            );
            resolve(sections.join("\n\n"));
        })
        .catch(error => {
            reject(error);
        });
}
509 |
/**
 * Loads a PDF through pdf.js and resolves with its text (see processPdf for the format).
 *
 * @param {File|string} fileOrDataUri - a local File, a `data:` URI, or a plain URL.
 *   Files are labelled with their name, data URIs with "RemotePDF", URLs with the URL itself.
 * @param {Function|null} updateProgress - forwarded unchanged to processPdf.
 * @returns {Promise<string>} the extracted text; rejects with `Error('Invalid input type')`
 *   for any other input, or with the pdf.js error if loading/parsing fails.
 */
function extractTextFromPDF(fileOrDataUri, updateProgress) {
    // Only set for local files; remembered so the blob URL can be released afterwards
    let objectUrl = null;

    const extraction = new Promise((resolve, reject) => {
        let documentIdentifier;
        let pdfSource;

        if (fileOrDataUri instanceof File) {
            // For local files
            documentIdentifier = fileOrDataUri.name;
            objectUrl = URL.createObjectURL(fileOrDataUri);
            pdfSource = objectUrl;
        } else if (typeof fileOrDataUri === 'string') {
            // Data URIs (remote PDFs) carry no name, anything else is assumed to be a URL
            documentIdentifier = fileOrDataUri.startsWith('data:') ? "RemotePDF" : fileOrDataUri;
            pdfSource = fileOrDataUri;
        } else {
            reject(new Error('Invalid input type'));
            return;
        }

        pdfjsLib.getDocument(pdfSource).promise
            .then((pdf) => {
                processPdf(pdf, documentIdentifier, resolve, reject, updateProgress);
            })
            .catch(reject); // error while loading the PDF
    });

    // Release the blob URL only once extraction has settled: pdf.js may still be
    // reading from it while pages are processed.
    return extraction.finally(() => {
        if (objectUrl !== null) {
            URL.revokeObjectURL(objectUrl);
        }
    });
}
541 |
542 |
/**
 * Reads every PDF selected in the `#pdf-upload` input and returns their combined text.
 *
 * Files are extracted concurrently; after each one finishes the progress bar is set to
 * the percentage of completed files. The per-file texts are joined by a blank line in
 * selection order.
 *
 * @returns {Promise<string>} the combined text, or '' when no file is selected.
 */
export async function handlePdfFileUpload() {
    const { files } = document.getElementById('pdf-upload');

    if (!(files.length > 0)) {
        console.error('No files selected');
        return '';
    }

    const totalFiles = files.length;
    let completed = 0;

    // Extracts one file and reports overall progress once it is done
    const readOne = async (file) => {
        const text = await extractTextFromPDF(file, setProgressBarValue);
        completed += 1;
        const percent = (completed / totalFiles) * 100;
        setProgressBarValue(percent.toFixed(0));
        console.log(percent);
        return text;
    };

    const perFileTexts = await Promise.all(Array.from(files).map((file) => readOne(file)));
    return perFileTexts.join("\n\n");
}
571 |
572 |
573 |
574 |
575 |
576 | ////////////////////////////////////////////////////
577 |
/**
 * Downloads a remote file through the corsproxy.io CORS proxy and returns it as a data URI.
 *
 * @param {string} url - address of the remote file.
 * @returns {Promise<string>} the file content as a `data:` URI.
 * @throws {Error} 'Network response was not ok' when the proxy answers with a non-2xx status.
 */
async function fetchPdfAsDataUri(url) {
    // cors proxy unfortunately needed for remote files :/
    // The target is percent-encoded so its own query string / special characters
    // cannot be mistaken for part of the proxy URL.
    const proxyUrl = `https://corsproxy.io/?${encodeURIComponent(url)}`;
    const response = await fetch(proxyUrl);
    if (!response.ok) {
        throw new Error('Network response was not ok');
    }

    const blob = await response.blob();
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        // `error` fires before `loadend`, so a failed read rejects before resolve can run
        reader.onloadend = () => resolve(reader.result);
        reader.onerror = reject;
        reader.readAsDataURL(blob);
    });
}
592 |
593 |
/**
 * Imports every URL listed in the `#importPdfURL` field and returns the combined text.
 *
 * Each URL is first treated as a PDF (fetched through the CORS proxy). If that fails,
 * the URL is fetched directly and the text of the HTML body is used instead. URLs that
 * fail both ways are logged and skipped.
 *
 * @returns {Promise<string>} texts of all successfully imported URLs joined by "\n"
 *   ('' when the field holds no URL).
 */
export async function handleRemotePdfFileUpload() {
    // Split on any whitespace run and drop empties so blank input, repeated spaces or
    // newlines do not produce bogus empty "URLs" that would be sent to the proxy.
    const urls = document.getElementById("importPdfURL").value.split(/\s+/).filter(Boolean);
    const texts = [];

    for (const url of urls) {
        console.log(url);

        try {
            const dataUri = await fetchPdfAsDataUri(url);
            const text = await extractTextFromPDF(dataUri, null);
            texts.push(text);
        } catch (pdfError) {
            // Keep the original cause visible instead of discarding it
            console.log('Not a pdf, trying to parse the web page', pdfError);

            // Fallback to extracting text from a normal webpage
            try {
                const response = await fetch(url);
                if (!response.ok) {
                    // Do not index the body of an HTTP error page as document text
                    throw new Error(`Request failed with status ${response.status}`);
                }
                const html = await response.text();
                const doc = new DOMParser().parseFromString(html, 'text/html');
                texts.push(doc.body.innerText);
            } catch (webpageError) {
                console.error('Error fetching or parsing webpage:', webpageError);
            }
        }
    }

    return texts.join("\n");
}
624 |
625 |
/**
 * Imports each space-separated URL from the `#importPdfURL` field as a PDF.
 *
 * Unlike handleRemotePdfFileUpload there is no webpage fallback: the returned array has
 * one entry per URL, in order, and holds '' for every URL that could not be imported.
 *
 * @returns {Promise<string[]>} extracted text (or '') per URL.
 */
export async function handleMultipleRemotePdfFileUploads() {
    const urlField = document.getElementById("importPdfURL");
    const results = [];

    for (const url of urlField.value.split(" ")) {
        console.log(url);

        // Stays empty when the download or the extraction fails
        let extracted = '';
        try {
            const dataUri = await fetchPdfAsDataUri(url);
            extracted = await extractTextFromPDF(dataUri, null);
        } catch (error) {
            console.error(`Error handling remote PDF file upload for URL ${url}:`, error);
        }
        results.push(extracted);
    }

    return results;
}
645 |
--------------------------------------------------------------------------------
/src/js/worker.js:
--------------------------------------------------------------------------------
1 | import { pipeline, AutoTokenizer } from '@xenova/transformers';
2 | import pako from 'pako';
3 | import init, { tSNE } from "wasm-bhtsne";
4 | import { marked } from 'marked';
5 |
6 | init();
7 | // env.useBrowserCache = false; // for testing
8 |
/**
 * Cache of computed embeddings, keyed by the embedded text (filled by `embed`,
 * replaced wholesale on import and cleared when a new embedding model is loaded).
 * @type {Object}
 */
let embeddingsDict = {};

// embedding models
/**
 * Feature-extraction pipeline assigned by the 'load' message.
 * @type {Pipeline}
 */
let embedder;
// tokenizer belonging to the embedding model, assigned by the 'load' message
let tokenizer;

// chat model (assigned by the 'load_text2text-generation' / 'load_text-generation' messages)
let chat_generator;
let chat_tokenizer;
let chat_model_name;

// summary model (assigned by the 'load_summary' message)
let summary_generator;
let summary_tokenizer;

// embedding of the most recent query; used as reference by calculateCosineSimilarity
let queryEmbedding;
// placeholder vector (every component 0.00001) handed out for texts that are not embedded
let currentNullVector = [];
32 |
/**
 * Deterministic 8-character hexadecimal hash of a string (multiply-by-33 / XOR scheme
 * seeded with 5381, folded to an unsigned 32-bit value).
 *
 * @param {string} str - text to hash.
 * @returns {string} lowercase hex digest, left-padded with zeros to 8 characters.
 */
function minimalEightCharHash(str) {
    // Work on UTF-16 code units, exactly what charCodeAt yields
    const codeUnits = Array.from({ length: str.length }, (_, idx) => str.charCodeAt(idx));
    const mixed = codeUnits.reduce((acc, unit) => (acc * 33) ^ unit, 5381);

    // >>> 0 reinterprets the signed 32-bit result as unsigned before hex conversion
    const asHex = (mixed >>> 0).toString(16);
    return asHex.slice(0, 8).padStart(8, '0');
}
44 |
/**
 * Builds a random 8-character lowercase hexadecimal string using Math.random
 * (one draw per character).
 *
 * @returns {string} eight characters from 0-9a-f.
 */
function minimalRandomEightCharHash() {
    const hexDigits = '0123456789abcdef';
    const pickDigit = () => hexDigits[Math.floor(Math.random() * hexDigits.length)];

    return Array.from({ length: 8 }, pickDigit).join('');
}
56 |
57 |
/**
 * Decodes the token ids of the first beam into text, dropping special tokens.
 *
 * @param {Array} beams - generation beams; only `beams[0].output_token_ids` is read.
 * @param {Object} tokenizer_type - tokenizer exposing `decode(ids, options)`.
 * @returns {Promise<string>} the decoded text.
 */
async function token_to_text(beams, tokenizer_type) {
    const firstBeamIds = beams[0].output_token_ids;
    const decodeOptions = { skip_special_tokens: true };

    return tokenizer_type.decode(firstBeamIds, decodeOptions);
}
66 |
/**
 * Returns the embedding for `text`, computing and caching it when necessary.
 *
 * - Cached texts are answered straight from `embeddingsDict`.
 * - With `embedNewText === false` no model call is made for the text itself: the shared
 *   placeholder vector (every component 0.00001) is cached and returned instead. If no
 *   placeholder exists yet, one is derived from a probe embedding so it has the model's
 *   dimensionality.
 * - Otherwise the text is embedded (mean pooling, normalized). The cache keeps a copy
 *   rounded to 3 decimals, while this first call returns the unrounded model output.
 *
 * @param {string} text - text to embed.
 * @param {boolean} [embedNewText=true] - pass false to skip inference for uncached text.
 * @returns {Promise<Float32Array|number[]>} the embedding (or the placeholder vector).
 */
async function embed(text, embedNewText = true) {
    // Own-property lookup: with `in`, texts such as "constructor" or "toString" would be
    // "found" on Object.prototype and a function would be returned as the embedding.
    if (Object.prototype.hasOwnProperty.call(embeddingsDict, text)) {
        return embeddingsDict[text];
    }

    if (embedNewText === false) {
        // An array never compares equal to a fresh `[]` literal, so emptiness has to be
        // checked via length; otherwise an empty placeholder would be cached and returned.
        if (currentNullVector.length === 0) {
            const probe = await embedder("test", { pooling: 'mean', normalize: true });
            currentNullVector = [...probe.data].fill(0.00001);
        }
        embeddingsDict[text] = currentNullVector;
        return currentNullVector;
    }

    const e0 = await embedder(text, { pooling: 'mean', normalize: true });

    const roundDecimalsDown = (num) => parseFloat(num.toFixed(3));

    embeddingsDict[text] = e0.data.map(roundDecimalsDown);
    return e0.data;
}
98 |
/**
 * Tokenizes `text` with the embedding model's tokenizer.
 *
 * @param {string} text - text to tokenize.
 * @returns {Promise<*>} the `input_ids.data` of the tokenizer output.
 */
async function getTokens(text) {
    const encoded = tokenizer(text);
    const tokenIds = await encoded.input_ids.data;

    return tokenIds;
}
102 |
/**
 * Runs the loaded chat model on `text` and streams the output to the main thread.
 *
 * Every invocation of `callback_function` posts a `{ type: 'chat', chat_text }` message.
 * The returned promise is resolved from inside that callback, so it settles with the
 * text of the FIRST callback invocation (later `resolve` calls have no effect); it is
 * rejected only if the generator call itself throws.
 * NOTE(review): if the generator never invokes the callback the promise stays pending.
 *
 * @param {string} text - prompt for the model.
 * @param {number} [max_new_tokens=100] - generation limit passed to the generator.
 * @returns {Promise<string>} decoded chat text (HTML produced by `marked` for Qwen models).
 */
async function chat(text, max_new_tokens = 100) {
    return new Promise(async (resolve, reject) => {
        // branch here for Qwen models because their tokenizer is handled differently
        console.log(chat_model_name, max_new_tokens);

        if (chat_model_name.includes("Qwen")) {
            try {

                // Define the prompt and list of messages
                const messages = [
                    { "role": "system", "content": "You are a helpful assistant." },
                    { "role": "user", "content": text }
                ]

                // Render the messages into one prompt string using the model's chat template
                const generatorText = chat_generator.tokenizer.apply_chat_template(messages, {
                    tokenize: false,
                    add_generation_prompt: false,
                });

                const thisChat = await chat_generator(generatorText, {
                    max_new_tokens: max_new_tokens,
                    do_sample: false,
                    callback_function: async function (beams) {
                        //const decodedText = await token_to_text(beams, chat_generator.tokenizer);
                        // special tokens are kept so the <|im_start|> markers can be used for splitting
                        let decodedText = chat_generator.tokenizer.decode(beams[0].output_token_ids, { skip_special_tokens: false })

                        // presumably segment 3 is the assistant turn (after system and user) — TODO confirm;
                        // this throws if the decoded text holds fewer than three <|im_start|> markers
                        decodedText = decodedText.split("<|im_start|>")[3].replace("<|im_end|>","") // just return the model's output
                        decodedText = marked(decodedText)

                        self.postMessage({
                            type: 'chat',
                            chat_text: decodedText
                        });

                        resolve(decodedText); // Resolve the main promise with chat text
                    },
                });
            } catch (error) {
                reject(error);
            }
        }

        else {
            try {
                const thisChat = await chat_generator(text, {
                    max_new_tokens: max_new_tokens,
                    return_prompt: false,
                    callback_function: async function (beams) {
                        const decodedText = await token_to_text(beams, chat_tokenizer);
                        //console.log(decodedText);

                        self.postMessage({
                            type: 'chat',
                            chat_text: decodedText,
                        });

                        resolve(decodedText); // Resolve the main promise with chat text
                    },
                });
            } catch (error) {
                reject(error);
            }
        }
    });
}
168 |
/**
 * Runs the loaded summarization model on `text` and streams the output to the main thread.
 *
 * Each callback invocation posts `{ type: 'summary', summary_text }`. The returned promise
 * is resolved from inside the callback (so it carries the text of the first invocation)
 * and is rejected if the generator call throws.
 *
 * @param {string} text - text to summarize.
 * @param {number} [max_new_tokens=100] - generation limit passed to the generator.
 * @returns {Promise<string>} decoded summary text.
 */
async function summary(text, max_new_tokens = 100) {
    return new Promise(async (resolve, reject) => {
        // Decodes the current beams, forwards them to the main thread and settles the promise
        const onBeams = async function (beams) {
            const partialSummary = await token_to_text(beams, summary_tokenizer);

            self.postMessage({ type: 'summary', summary_text: partialSummary });

            resolve(partialSummary);
        };

        const generationOptions = {
            max_new_tokens: max_new_tokens,
            return_prompt: false,
            callback_function: onBeams,
        };

        try {
            await summary_generator(text, generationOptions);
        } catch (error) {
            reject(error);
        }
    });
}
192 |
// tested, trivial calculation takes 200ms for 100k embeddings of size 384 or 700 ms with size 1000
/**
 * Component-wise mean of all embeddings in a collection.
 *
 * @param {Object|Array} embeddingsAsArray - embeddings as the values of an object (or array);
 *   the dimensionality is taken from the first embedding.
 * @returns {number[]|null} the mean vector, or null when the collection is empty.
 */
const calculateAverageEmbedding = (embeddingsAsArray) => {
    const vectors = Object.values(embeddingsAsArray);
    const count = vectors.length;

    if (count === 0) {
        return null; // nothing to average
    }

    // Running per-dimension totals, accumulated in collection order
    const totals = new Array(vectors[0].length).fill(0);
    for (const vector of vectors) {
        for (let dim = 0; dim < totals.length; dim++) {
            totals[dim] = totals[dim] + vector[dim];
        }
    }

    return totals.map((total) => total / count);
};
209 |
210 | /*
211 | const calculateAverageEmbedding = (embeddingsAsArray) => {
212 | const allEmbeddings = Object.values(embeddingsAsArray);
213 |
214 | if (allEmbeddings.length === 0) {
215 | return null; // handle the case when the input object is empty
216 | }
217 |
218 | const start = performance.now();
219 |
220 | const sumEmbeddings = allEmbeddings.reduce((acc, embedding) => {
221 | return acc.map((value, index) => value + embedding[index]);
222 | }, new Array(allEmbeddings[0].length).fill(0));
223 |
224 | const averageEmbedding = sumEmbeddings.map(value => value / allEmbeddings.length);
225 |
226 | const end = performance.now();
227 | console.log('Execution time:', end - start, 'milliseconds');
228 |
229 | return averageEmbedding;
230 | };
231 |
232 | // Generate random embeddings for testing
233 | const generateRandomEmbedding = (size) => {
234 | return Array.from({ length: size }, () => Math.random());
235 | };
236 |
237 | // Generate test data with 10,000 strings and embeddings of size 1000
238 | const generateTestEmbeddings = (numStrings, embeddingSize) => {
239 | const testData = {};
240 | for (let i = 1; i <= numStrings; i++) {
241 | const key = `string${i}`;
242 | const embedding = generateRandomEmbedding(embeddingSize);
243 | testData[key] = embedding;
244 | }
245 | return testData;
246 | };
247 |
248 | // Test the calculateAverageEmbedding function with generated data
249 | const testEmbeddingsAsArray = generateTestEmbeddings(100000, 1000);
250 | const averageEmbedding = calculateAverageEmbedding(testEmbeddingsAsArray);
251 |
252 | console.log('Average Embedding:', averageEmbedding);
253 | */
254 |
/**
 * Replaces every whitespace character of a string with an underscore.
 *
 * @param {string} inputString - text to convert.
 * @returns {string} the text with each whitespace character replaced by `_`.
 */
function convert_to_underscores(inputString) {
    // Operate on the parameter itself: the previous body read an undefined variable
    // (`lowercaseString`) and therefore threw a ReferenceError on every call.
    return inputString.replace(/\s/g, '_');
}
/**
 * Creates a `rows` x `columns` matrix of Math.random() values, filled row by row.
 *
 * @param {number} rows - number of rows.
 * @param {number} columns - number of values per row.
 * @returns {number[][]} the random matrix.
 */
function createRandomMatrix(rows, columns) {
    const randomValue = () => Math.random();
    const randomRow = () => Array.from({ length: columns }, randomValue);

    return Array.from({ length: rows }, randomRow);
}
/**
 * Replaces the whole embeddings cache with `newData` and announces the new cache
 * with an 'updateEmbeddingsDict' message.
 *
 * @param {Object} newData - the new cache content.
 */
const updateEmbeddingsDict = (newData) => {
    embeddingsDict = newData;

    const notification = { type: 'updateEmbeddingsDict', data: newData };
    postMessage(notification);
};
271 |
/**
 * Converts a list of typed arrays (or other iterables) into plain JavaScript arrays.
 *
 * @param {Array} arrayOfFloat32Arrays - the vectors to convert.
 * @returns {number[][]} a new list holding one plain array per input vector.
 */
function convertFloat32ArraysToArrays(arrayOfFloat32Arrays) {
    const plainArrays = [];

    arrayOfFloat32Arrays.forEach((typedArray) => {
        plainArrays.push(Array.from(typedArray));
    });

    return plainArrays;
}
283 |
/**
 * Cosine similarity between the current query embedding (`queryEmbedding`) and `embedding`.
 *
 * Only the first `queryEmbedding.length` components of `embedding` are read.
 *
 * @param {ArrayLike<number>} embedding - vector to compare with the query.
 * @returns {number} the cosine similarity (NaN when either vector has zero magnitude).
 */
function calculateCosineSimilarity(embedding) {
    const dimensions = queryEmbedding.length;
    let dot = 0;
    let querySquares = 0;
    let otherSquares = 0;

    let i = 0;
    while (i < dimensions) {
        const queryValue = queryEmbedding[i];
        const otherValue = embedding[i];

        dot += queryValue * otherValue;
        querySquares += queryValue ** 2;
        otherSquares += otherValue ** 2;
        i += 1;
    }

    const denominator = Math.sqrt(querySquares) * Math.sqrt(otherSquares);
    return dot / denominator;
}
298 |
299 | // Expose a function to manually update embeddingsDict
300 | self.updateEmbeddingsDictManually = updateEmbeddingsDict;
301 |
/**
 * Worker entry point: dispatches on `message.type`.
 *
 * Handled types and the replies they post:
 * - 'logEmbeddingsDict'          – logs the cache, no reply
 * - 'tsne'                       – reduces all cached embeddings to 2D, replies { type: 'tsne', plotDataArray }
 * - 'importEmbeddingsDict'       – replaces the cache with `message.data`, no reply
 * - 'exportEmbeddingsDict'       – replies { type: 'embeddingsDict', data (gzipped JSON), filename }
 * - 'load' / 'load_summary' / 'load_text2text-generation' / 'load_text-generation'
 *                                – load models, forwarding download progress messages
 * - 'query'                      – embeds the query, replies { type: 'query', embedding }
 * - 'similarity'                 – replies { type: 'similarity', text, embedding }
 * - 'getTokens'                  – replies { type: 'tokens', text, tokens }
 * - 'summary' / 'chat'           – reply with the generated text
 * Unknown types are ignored.
 */
self.onmessage = async (event) => {
    const message = event.data;
    //console.log(message)

    // Each case body is wrapped in its own block so its const/let declarations
    // stay local to that case.
    switch (message.type) {
        case 'logEmbeddingsDict':
            console.log(embeddingsDict);
            break;

        case 'tsne': {
            const start = performance.now();
            const originalKeys = Object.keys(embeddingsDict);
            const originalEmbeddings = Object.values(embeddingsDict);
            const valuesArray = convertFloat32ArraysToArrays(originalEmbeddings);
            const valuesArrayLength = valuesArray.length;

            // Nothing indexed yet: reply with an empty plot instead of failing on valuesArray[0]
            if (valuesArrayLength === 0) {
                self.postMessage({
                    type: 'tsne',
                    plotDataArray: []
                });
                break;
            }

            // Workaround: with fewer than 61 vectors the input is padded with random vectors
            // (perplexity constraint); the padding rows are cut off again after the reduction.
            const needsPadding = valuesArrayLength < 61;
            if (needsPadding) {
                const vectorLength = valuesArray[0].length; // Assuming all vectors have the same length
                const vectorsToAdd = 61 - valuesArrayLength;

                console.log("added: ", vectorsToAdd)
                for (let i = 0; i < vectorsToAdd; i++) {
                    const randomVector = Array.from({ length: vectorLength }, () => Math.random());
                    valuesArray.push(randomVector);
                }
            }

            const tsne_encoder = new tSNE(valuesArray);
            let compressed_vectors = tsne_encoder.barnes_hut(message.data.iterations);
            if (needsPadding) {
                compressed_vectors = compressed_vectors.slice(0, valuesArrayLength);
            }

            const end = performance.now();
            console.log('BHtSNE Execution time:', Math.round(end - start), 'ms');

            // Keep only the points whose similarity to the current query reaches the threshold
            const plotDataArray = [];
            for (let i = 0; i < originalKeys.length; i++) {
                const thisVec = compressed_vectors[i];
                const similarity = calculateCosineSimilarity(originalEmbeddings[i]);

                if (similarity >= message.data.dimensionalityReductionSimilarityThreshold) {
                    plotDataArray.push({ "x": thisVec[0], "y": thisVec[1], "label": originalKeys[i], "similarity": similarity });
                }
            }

            console.log(plotDataArray)

            self.postMessage({
                type: 'tsne',
                plotDataArray
            });
            break;
        }

        case 'importEmbeddingsDict':
            embeddingsDict = message.data;
            break;

        case 'exportEmbeddingsDict': {
            const exportDecimals = parseInt(message.data.meta.exportDecimals, 10);
            const roundDecimals = (num) => parseFloat(num.toFixed(exportDecimals));

            const embeddingsAsArray = Object.fromEntries(
                Object.entries(embeddingsDict).map(([key, values]) => [key, Object.values(values).map(roundDecimals)])
            );

            const meanEmbedding = calculateAverageEmbedding(embeddingsAsArray)
            // adding mean embedding so all indexed docs on HF could be ingested in a "proper" vector DB!
            const exportDict = {
                "meta": message.data.meta, "text": message.data.text,
                "index": embeddingsAsArray,
                "mean_embedding": meanEmbedding
            }

            exportDict.meta.chunks = Object.keys(embeddingsAsArray).length;

            console.log("Document average embedding", meanEmbedding);
            console.log("Metadata", exportDict.meta);

            const gzippedData = pako.gzip(JSON.stringify(exportDict), { to: 'string' });

            const tempFilename = `${message.data.meta.textTitle.replace(/\s/g, '_')}_${minimalRandomEightCharHash()}.json.gz`
            // Send the gzipped data as a response
            self.postMessage({ type: 'embeddingsDict', data: gzippedData, filename: tempFilename });
            break;
        }

        case 'load':
            embeddingsDict = {}; // clear dict
            tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick
            embedder = await pipeline('feature-extraction', message.model_name,
                {
                    quantized: message.quantized,
                    progress_callback: data => {
                        self.postMessage({
                            type: 'download',
                            data
                        });
                    }

                });
            break;

        case 'load_summary':
            summary_tokenizer = await AutoTokenizer.from_pretrained(message.model_name)
            summary_generator = await pipeline('summarization', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'summary_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            break;

        case 'load_text2text-generation':
            console.log("loading chat");
            chat_model_name = message.model_name;
            chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name); // no progress callbacks -- assume its quick
            chat_generator = await pipeline('text2text-generation', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'chat_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            break;

        case 'load_text-generation':
            console.log("loading chat");
            chat_model_name = message.model_name;
            chat_tokenizer = await AutoTokenizer.from_pretrained(message.model_name) // no progress callbacks -- assume its quick
            chat_generator = await pipeline('text-generation', message.model_name,
                {
                    progress_callback: data => {
                        self.postMessage({
                            type: 'chat_download',
                            data
                        });
                    }
                    //quantized: message.quantized // currently not possible, models unquantized way too large!
                });
            console.log("chat loaded");
            break;

        case 'query': {
            const embedding = await embed(message.text);
            queryEmbedding = embedding;
            // Refresh the placeholder vector so it matches the dimensionality of the cached embeddings
            currentNullVector = [...Object.values(embeddingsDict)[0]].fill(0.00001);
            self.postMessage({
                type: 'query',
                embedding
            });
            break;
        }

        case 'similarity': {
            const text = message.text;
            const embedding = await embed(text, message.inferencingActive);
            self.postMessage({
                type: 'similarity',
                text,
                embedding
            });
            break;
        }

        case 'getTokens': {
            const text = message.text;
            self.postMessage({
                type: 'tokens',
                text,
                tokens: await getTokens(text)
            });
            break;
        }

        case 'summary': {
            const summary_text = await summary(message.text, message.max_new_tokens);
            self.postMessage({
                type: 'summary',
                summary_text
            });
            break;
        }

        case 'chat': {
            const chat_text = await chat(message.text, message.max_new_tokens);
            self.postMessage({
                type: 'chat',
                chat_text
            });
            break;
        }

        default:
            break;
    }
};
518 |
519 |
--------------------------------------------------------------------------------
/src/models/model_miner.js:
--------------------------------------------------------------------------------
1 | // model mining script - necessary as huggingface.co does not allow requests from other domains, e.g. github.io
2 | // execute this script for each sorter while on https://huggingface.co/models
3 | // downloads the json file
4 |
5 | let out_json = {}
6 | const sorter = "modified" // // likes, downloads, trending, modified
7 | const pipeline_tag = "feature-extraction" // text2text2 etc.
8 | const fileName = `${pipeline_tag}_${sorter}.json`;
9 |
/**
 * Serializes `jsonData` and triggers a browser download of it under `fileName`.
 *
 * @param {*} jsonData - value to serialize with JSON.stringify.
 * @param {string} fileName - name suggested for the downloaded file.
 */
function downloadJsonToFile(jsonData, fileName) {
    // Wrap the serialized JSON in a Blob and expose it through a temporary object URL
    const payload = JSON.stringify(jsonData);
    const objectUrl = URL.createObjectURL(new Blob([payload], { type: "application/json" }));

    // A detached <a download> element is enough to start the download
    const anchor = document.createElement("a");
    Object.assign(anchor, { href: objectUrl, download: fileName });
    anchor.click();

    // The object URL is released right after the click
    URL.revokeObjectURL(objectUrl);
}
28 |
/**
 * Downloads the first pages of the Hugging Face model listing for the configured
 * `pipeline_tag` / `sorter`, merges their models and saves the result as `fileName`.
 *
 * Pages that fail to load are logged and skipped; the result is also kept in `out_json`.
 *
 * @returns {Promise<void>}
 */
async function fetchAllPages() {
    const baseUrl = "https://huggingface.co/models-json";
    const commonParams = `?pipeline_tag=${pipeline_tag}&library=transformers.js&sort=${sorter}`;
    const numPages = 3; // Change this if you need more or fewer pages

    const models = [];

    for (let pageIndex = 0; pageIndex < numPages; pageIndex++) {
        const url = `${baseUrl}${commonParams}&p=${pageIndex}`;

        try {
            const response = await fetch(url);
            if (!response.ok) {
                // Report the HTTP failure itself rather than a confusing JSON/spread error
                throw new Error(`HTTP ${response.status}`);
            }
            const data = await response.json();
            models.push(...data.models);
        } catch (error) {
            console.error(`Error fetching page ${pageIndex}: ${error}`);
        }
    }

    const result = {
        activeFilters: {
            // Mirror the tag that was actually queried instead of a fixed "feature-extraction"
            pipeline_tag: [pipeline_tag],
            library: ["transformers.js"],
            dataset: [],
            language: [],
            license: [],
            other: [],
        },
        models,
        numItemsPerPage: 30,
        numTotalItems: models.length,
        pageIndex: 0,
    };

    out_json = result;

    downloadJsonToFile(result, fileName);
}
67 |
68 | fetchAllPages();
69 |
70 |
--------------------------------------------------------------------------------
/src/models/model_miner_simple.js:
--------------------------------------------------------------------------------
1 | // simplified script for just downloading all models from the current HF page
2 | // set the filters on HF and run it in the browser console
3 | // e.g. go to https://huggingface.co/models?pipeline_tag=text2text-generation&library=transformers.js&sort=trending
4 |
// Collect the text of every <h4> heading on the current page and print the resulting list
const h4Elements = document.querySelectorAll("h4");
const h4TextArray = Array.from(h4Elements, (heading) => heading.textContent);

console.log(h4TextArray);
13 |
14 | //[
15 | // "Xenova/t5-small",
16 | // "Xenova/flan-t5-small",
17 | // "Xenova/LaMini-Flan-T5-783M",
18 | // "Xenova/LaMini-Flan-T5-248M",
19 | // "Xenova/LaMini-Flan-T5-77M",
20 | // "Xenova/LaMini-T5-61M",
21 | // "Xenova/LaMini-T5-738M",
22 | // "Xenova/LaMini-T5-223M",
23 | // "Xenova/mt5-small",
24 | // "Xenova/mt5-base",
25 | // "Xenova/t5-base",
26 | // "Xenova/t5-v1_1-base",
27 | // "Xenova/flan-t5-base",
28 | // "Xenova/t5-v1_1-small",
29 | // "Xenova/blenderbot-400M-distill",
30 | // "Xenova/blenderbot_small-90M",
31 | // "Xenova/long-t5-tglobal-base",
32 | // "Xenova/long-t5-local-base",
33 | // "Xenova/long-t5-tglobal-base-16384-book-summary"
34 | //]
--------------------------------------------------------------------------------
/src/models/model_size_miner.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 25,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import requests\n",
10 | "from bs4 import BeautifulSoup\n",
11 | "import json\n",
12 | "\n",
13 | "# Load URLs from the JSON file\n",
14 | "with open('feature-extraction_downloads.json', 'r') as json_file:\n",
15 | " data = json.load(json_file)\n",
16 | " # urls = data.get('urls', [])\n",
17 | "ids = [i[\"id\"] for i in data[\"models\"]]\n",
18 | "urls = [f\"https://huggingface.co/{i}/tree/main/onnx\" for i in ids]"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 26,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/plain": [
29 | "{'author': 'TaylorAI',\n",
30 | " 'authorData': {'avatarUrl': 'https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/63917e16b6b839bb61483dbf/Utq89ebo7Glxfls0QZnxK.png?w=200&h=200&f=face',\n",
31 | " 'fullname': 'Taylor',\n",
32 | " 'name': 'TaylorAI',\n",
33 | " 'type': 'org',\n",
34 | " 'isHf': False},\n",
35 | " 'downloads': 1752,\n",
36 | " 'gated': False,\n",
37 | " 'id': 'TaylorAI/gte-tiny',\n",
38 | " 'lastModified': '2023-10-07T05:20:49.000Z',\n",
39 | " 'likes': 102,\n",
40 | " 'pipeline_tag': 'sentence-similarity',\n",
41 | " 'private': False,\n",
42 | " 'repoType': 'model',\n",
43 | " 'isLikedByUser': False}"
44 | ]
45 | },
46 | "execution_count": 26,
47 | "metadata": {},
48 | "output_type": "execute_result"
49 | }
50 | ],
51 | "source": [
52 | "data[\"models\"][0]"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 27,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "https://huggingface.co/TaylorAI/gte-tiny/tree/main/onnx | 22.9\n",
65 | "https://huggingface.co/Supabase/gte-small/tree/main/onnx | 34\n",
66 | "https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx | 23\n",
67 | "https://huggingface.co/Xenova/bge-large-en-v1.5/tree/main/onnx | 337\n",
68 | "https://huggingface.co/Supabase/bge-small-en/tree/main/onnx | 34\n",
69 | "https://huggingface.co/Xenova/gte-small/tree/main/onnx | 34\n",
70 | "https://huggingface.co/Xenova/all-mpnet-base-v2/tree/main/onnx | 110\n",
71 | "https://huggingface.co/Xenova/paraphrase-mpnet-base-v2/tree/main/onnx | 110\n",
72 | "https://huggingface.co/Xenova/all-MiniLM-L12-v2/tree/main/onnx | 34\n",
73 | "https://huggingface.co/Xenova/multilingual-e5-small/tree/main/onnx | 118\n",
74 | "https://huggingface.co/Xenova/gte-large/tree/main/onnx | 337\n",
75 | "https://huggingface.co/Xenova/bge-base-en-v1.5/tree/main/onnx | 110\n",
76 | "https://huggingface.co/Xenova/all-roberta-large-v1/tree/main/onnx | 357\n",
77 | "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v2/tree/main/onnx | 135\n",
78 | "https://huggingface.co/Xenova/paraphrase-multilingual-mpnet-base-v2/tree/main/onnx | 279\n",
79 | "https://huggingface.co/Xenova/bge-large-zh/tree/main/onnx | 327\n",
80 | "https://huggingface.co/Xenova/multilingual-e5-base/tree/main/onnx | 279\n",
81 | "https://huggingface.co/Xenova/bge-small-en-v1.5/tree/main/onnx | 34\n",
82 | "https://huggingface.co/Xenova/paraphrase-albert-small-v2/tree/main/onnx | 39.7\n",
83 | "https://huggingface.co/Xenova/paraphrase-albert-base-v2/tree/main/onnx | 40\n",
84 | "https://huggingface.co/Xenova/squeezebert-uncased/tree/main/onnx | 51.2\n",
85 | "https://huggingface.co/Xenova/squeezebert-mnli/tree/main/onnx | 51.3\n",
86 | "https://huggingface.co/Xenova/vit-base-patch16-224-in21k/tree/main/onnx | 87.5\n",
87 | "https://huggingface.co/Xenova/all-distilroberta-v1/tree/main/onnx | 82.1\n",
88 | "https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2/tree/main/onnx | 118\n",
89 | "https://huggingface.co/Xenova/paraphrase-MiniLM-L6-v2/tree/main/onnx | 23\n",
90 | "https://huggingface.co/Xenova/bert-base-nli-mean-tokens/tree/main/onnx | 110\n",
91 | "https://huggingface.co/Xenova/distilbert-base-nli-mean-tokens/tree/main/onnx | 66.9\n",
92 | "https://huggingface.co/Xenova/distilbert-base-nli-stsb-mean-tokens/tree/main/onnx | 66.9\n",
93 | "https://huggingface.co/Xenova/distiluse-base-multilingual-cased-v1/tree/main/onnx | 135\n",
94 | "https://huggingface.co/Xenova/msmarco-distilbert-base-v4/tree/main/onnx | 66.9\n",
95 | "https://huggingface.co/Xenova/multi-qa-MiniLM-L6-cos-v1/tree/main/onnx | 23\n",
96 | "https://huggingface.co/Xenova/multi-qa-distilbert-cos-v1/tree/main/onnx | 66.9\n",
97 | "https://huggingface.co/Xenova/multi-qa-mpnet-base-cos-v1/tree/main/onnx | 110\n",
98 | "https://huggingface.co/Xenova/multi-qa-mpnet-base-dot-v1/tree/main/onnx | 110\n",
99 | "https://huggingface.co/Xenova/nli-mpnet-base-v2/tree/main/onnx | 110\n",
100 | "https://huggingface.co/Xenova/paraphrase-MiniLM-L3-v2/tree/main/onnx | 17.5\n",
101 | "https://huggingface.co/Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens/tree/main/onnx | 279\n",
102 | "https://huggingface.co/Xenova/dino-vitb16/tree/main/onnx | 87.5\n",
103 | "https://huggingface.co/Xenova/dino-vits8/tree/main/onnx | 23.4\n",
104 | "https://huggingface.co/Xenova/dino-vitb8/tree/main/onnx | 88.8\n",
105 | "https://huggingface.co/Xenova/dino-vits16/tree/main/onnx | 22.7\n",
106 | "https://huggingface.co/Xenova/scibert_scivocab_uncased/tree/main/onnx | 111\n",
107 | "https://huggingface.co/Xenova/spanbert-large-cased/tree/main/onnx | 335\n",
108 | "https://huggingface.co/Xenova/spanbert-base-cased/tree/main/onnx | 109\n",
109 | "https://huggingface.co/sdan/simple-embeddings/tree/main/onnx | 23\n",
110 | "https://huggingface.co/Xenova/sentence_bert/tree/main/onnx | 110\n",
111 | "https://huggingface.co/Xenova/e5-small-v2/tree/main/onnx | 34\n",
112 | "https://huggingface.co/Xenova/SapBERT-from-PubMedBERT-fulltext/tree/main/onnx | 110\n",
113 | "https://huggingface.co/Xenova/indobert-base-p1/tree/main/onnx | 125\n",
114 | "https://huggingface.co/Xenova/UMLSBert_ENG/tree/main/onnx | 110\n",
115 | "https://huggingface.co/Xenova/rubert-base-cased/tree/main/onnx | 178\n",
116 | "https://huggingface.co/Xenova/kobert/tree/main/onnx | 92.8\n",
117 | "https://huggingface.co/Xenova/e5-small/tree/main/onnx | 34\n",
118 | "https://huggingface.co/Xenova/e5-large/tree/main/onnx | 337\n",
119 | "https://huggingface.co/Xenova/e5-large-v2/tree/main/onnx | 337\n",
120 | "https://huggingface.co/Xenova/e5-base/tree/main/onnx | 110\n",
121 | "https://huggingface.co/Xenova/e5-base-v2/tree/main/onnx | 110\n",
122 | "https://huggingface.co/Xenova/instructor-base/tree/main/onnx | 110\n",
123 | "https://huggingface.co/Xenova/instructor-large/tree/main/onnx | 337\n",
124 | "https://huggingface.co/Xenova/sentence-t5-large/tree/main/onnx | 337\n",
125 | "https://huggingface.co/Xenova/multilingual-e5-large/tree/main/onnx | 562\n",
126 | "https://huggingface.co/Xenova/mms-300m/tree/main/onnx | 318\n",
127 | "https://huggingface.co/Xenova/mms-1b/tree/main/onnx | 969\n",
128 | "https://huggingface.co/Supabase/e5-small-v2/tree/main/onnx | 34\n",
129 | "https://huggingface.co/Supabase/all-MiniLM-L6-v2/tree/main/onnx | 23\n",
130 | "https://huggingface.co/Xenova/gte-base/tree/main/onnx | 110\n",
131 | "https://huggingface.co/Xenova/bge-small-en/tree/main/onnx | 34\n",
132 | "https://huggingface.co/Xenova/bge-base-en/tree/main/onnx | 110\n",
133 | "https://huggingface.co/Xenova/bge-large-en/tree/main/onnx | 337\n",
134 | "https://huggingface.co/ggrn/bge-small-en/tree/main/onnx | 34\n",
135 | "https://huggingface.co/Xenova/bge-base-zh/tree/main/onnx | 103\n",
136 | "https://huggingface.co/Xenova/bge-large-zh-noinstruct/tree/main/onnx | 327\n",
137 | "https://huggingface.co/Xenova/bge-small-zh/tree/main/onnx | 24\n",
138 | "https://huggingface.co/Xenova/ClinicalBERT/tree/main/onnx | 229\n",
139 | "https://huggingface.co/Xenova/LaBSE/tree/main/onnx | 472\n",
140 | "https://huggingface.co/Xenova/wavlm-base/tree/main/onnx | 95.8\n",
141 | "https://huggingface.co/Xenova/wavlm-base-plus/tree/main/onnx | 95.8\n",
142 | "https://huggingface.co/Xenova/wavlm-large/tree/main/onnx | 319\n",
143 | "https://huggingface.co/Xenova/sentence-camembert-large/tree/main/onnx | 339\n",
144 | "https://huggingface.co/Xenova/herbert-base-cased/tree/main/onnx | 125\n",
145 | "https://huggingface.co/Xenova/herbert-large-cased/tree/main/onnx | 357\n",
146 | "https://huggingface.co/Xenova/bge-large-zh-v1.5/tree/main/onnx | 327\n",
147 | "https://huggingface.co/Xenova/bge-base-zh-v1.5/tree/main/onnx | 103\n",
148 | "https://huggingface.co/Xenova/bge-small-zh-v1.5/tree/main/onnx | 24\n",
149 | "https://huggingface.co/leolee9086/text2vec-base-chinese/tree/main/onnx | 103\n",
150 | "https://huggingface.co/Xenova/long-t5-encodec-tglobal-base/tree/main/onnx | 291\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "def extract_size_from_url(url):\n",
156 | " try:\n",
157 | " response = requests.get(url)\n",
158 | " if response.status_code == 200:\n",
159 | " soup = BeautifulSoup(response.text, 'html.parser')\n",
160 | " \n",
161 | " # Find the 'a' tag with the specified title attribute\n",
162 | " a_tag = soup.find('a', title=\"Download file\")\n",
163 | " if a_tag:\n",
164 | " size = a_tag.text.strip() # Extract the size text\n",
165 | " return size\n",
166 | " else:\n",
167 | " return \"Size not found\"\n",
168 | " else:\n",
169 | " return \"Failed to retrieve the page\"\n",
170 | " except requests.exceptions.RequestException as e:\n",
171 | " return f\"Request error: {e}\"\n",
172 | "\n",
173 | "# Iterate through the URLs and extract values\n",
174 | "sizes = []\n",
175 | "\n",
176 | "for url in urls:\n",
177 | " values = extract_values_from_url(url)[-1].split(\" MB\")[0]\n",
178 | " sizes.append([url.split(\"https://huggingface.co/\")[1].split(\"/tree/main/onnx\")[0],values])\n",
179 | " print(f\"{url} | {values}\")\n"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 28,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "[['TaylorAI/gte-tiny', '22.9'],\n",
191 | " ['Supabase/gte-small', '34'],\n",
192 | " ['Xenova/all-MiniLM-L6-v2', '23'],\n",
193 | " ['Xenova/bge-large-en-v1.5', '337'],\n",
194 | " ['Supabase/bge-small-en', '34'],\n",
195 | " ['Xenova/gte-small', '34'],\n",
196 | " ['Xenova/all-mpnet-base-v2', '110'],\n",
197 | " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n",
198 | " ['Xenova/all-MiniLM-L12-v2', '34'],\n",
199 | " ['Xenova/multilingual-e5-small', '118'],\n",
200 | " ['Xenova/gte-large', '337'],\n",
201 | " ['Xenova/bge-base-en-v1.5', '110'],\n",
202 | " ['Xenova/all-roberta-large-v1', '357'],\n",
203 | " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n",
204 | " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n",
205 | " ['Xenova/bge-large-zh', '327'],\n",
206 | " ['Xenova/multilingual-e5-base', '279'],\n",
207 | " ['Xenova/bge-small-en-v1.5', '34'],\n",
208 | " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n",
209 | " ['Xenova/paraphrase-albert-base-v2', '40'],\n",
210 | " ['Xenova/squeezebert-uncased', '51.2'],\n",
211 | " ['Xenova/squeezebert-mnli', '51.3'],\n",
212 | " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n",
213 | " ['Xenova/all-distilroberta-v1', '82.1'],\n",
214 | " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n",
215 | " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n",
216 | " ['Xenova/bert-base-nli-mean-tokens', '110'],\n",
217 | " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n",
218 | " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n",
219 | " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n",
220 | " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n",
221 | " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n",
222 | " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n",
223 | " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n",
224 | " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n",
225 | " ['Xenova/nli-mpnet-base-v2', '110'],\n",
226 | " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n",
227 | " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n",
228 | " ['Xenova/dino-vitb16', '87.5'],\n",
229 | " ['Xenova/dino-vits8', '23.4'],\n",
230 | " ['Xenova/dino-vitb8', '88.8'],\n",
231 | " ['Xenova/dino-vits16', '22.7'],\n",
232 | " ['Xenova/scibert_scivocab_uncased', '111'],\n",
233 | " ['Xenova/spanbert-large-cased', '335'],\n",
234 | " ['Xenova/spanbert-base-cased', '109'],\n",
235 | " ['sdan/simple-embeddings', '23'],\n",
236 | " ['Xenova/sentence_bert', '110'],\n",
237 | " ['Xenova/e5-small-v2', '34'],\n",
238 | " ['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n",
239 | " ['Xenova/indobert-base-p1', '125'],\n",
240 | " ['Xenova/UMLSBert_ENG', '110'],\n",
241 | " ['Xenova/rubert-base-cased', '178'],\n",
242 | " ['Xenova/kobert', '92.8'],\n",
243 | " ['Xenova/e5-small', '34'],\n",
244 | " ['Xenova/e5-large', '337'],\n",
245 | " ['Xenova/e5-large-v2', '337'],\n",
246 | " ['Xenova/e5-base', '110'],\n",
247 | " ['Xenova/e5-base-v2', '110'],\n",
248 | " ['Xenova/instructor-base', '110'],\n",
249 | " ['Xenova/instructor-large', '337'],\n",
250 | " ['Xenova/sentence-t5-large', '337'],\n",
251 | " ['Xenova/multilingual-e5-large', '562'],\n",
252 | " ['Xenova/mms-300m', '318'],\n",
253 | " ['Xenova/mms-1b', '969'],\n",
254 | " ['Supabase/e5-small-v2', '34'],\n",
255 | " ['Supabase/all-MiniLM-L6-v2', '23'],\n",
256 | " ['Xenova/gte-base', '110'],\n",
257 | " ['Xenova/bge-small-en', '34'],\n",
258 | " ['Xenova/bge-base-en', '110'],\n",
259 | " ['Xenova/bge-large-en', '337'],\n",
260 | " ['ggrn/bge-small-en', '34'],\n",
261 | " ['Xenova/bge-base-zh', '103'],\n",
262 | " ['Xenova/bge-large-zh-noinstruct', '327'],\n",
263 | " ['Xenova/bge-small-zh', '24'],\n",
264 | " ['Xenova/ClinicalBERT', '229'],\n",
265 | " ['Xenova/LaBSE', '472'],\n",
266 | " ['Xenova/wavlm-base', '95.8'],\n",
267 | " ['Xenova/wavlm-base-plus', '95.8'],\n",
268 | " ['Xenova/wavlm-large', '319'],\n",
269 | " ['Xenova/sentence-camembert-large', '339'],\n",
270 | " ['Xenova/herbert-base-cased', '125'],\n",
271 | " ['Xenova/herbert-large-cased', '357'],\n",
272 | " ['Xenova/bge-large-zh-v1.5', '327'],\n",
273 | " ['Xenova/bge-base-zh-v1.5', '103'],\n",
274 | " ['Xenova/bge-small-zh-v1.5', '24'],\n",
275 | " ['leolee9086/text2vec-base-chinese', '103'],\n",
276 | " ['Xenova/long-t5-encodec-tglobal-base', '291']]"
277 | ]
278 | },
279 | "execution_count": 28,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "sizes"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 29,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "sizes_backup = [['TaylorAI/gte-tiny', '22.9'],\n",
295 | " ['Supabase/gte-small', '34'],\n",
296 | " ['Xenova/all-MiniLM-L6-v2', '23'],\n",
297 | " ['Xenova/bge-large-en-v1.5', '337'],\n",
298 | " ['Supabase/bge-small-en', '34'],\n",
299 | " ['Xenova/gte-small', '34'],\n",
300 | " ['Xenova/all-mpnet-base-v2', '110'],\n",
301 | " ['Xenova/paraphrase-mpnet-base-v2', '110'],\n",
302 | " ['Xenova/all-MiniLM-L12-v2', '34'],\n",
303 | " ['Xenova/multilingual-e5-small', '118'],\n",
304 | " ['Xenova/gte-large', '337'],\n",
305 | " ['Xenova/bge-base-en-v1.5', '110'],\n",
306 | " ['Xenova/all-roberta-large-v1', '357'],\n",
307 | " ['Xenova/distiluse-base-multilingual-cased-v2', '135'],\n",
308 | " ['Xenova/paraphrase-multilingual-mpnet-base-v2', '279'],\n",
309 | " ['Xenova/bge-large-zh', '327'],\n",
310 | " ['Xenova/multilingual-e5-base', '279'],\n",
311 | " ['Xenova/bge-small-en-v1.5', '34'],\n",
312 | " ['Xenova/paraphrase-albert-small-v2', '39.7'],\n",
313 | " ['Xenova/paraphrase-albert-base-v2', '40'],\n",
314 | " ['Xenova/squeezebert-uncased', '51.2'],\n",
315 | " ['Xenova/squeezebert-mnli', '51.3'],\n",
316 | " ['Xenova/vit-base-patch16-224-in21k', '87.5'],\n",
317 | " ['Xenova/all-distilroberta-v1', '82.1'],\n",
318 | " ['Xenova/paraphrase-multilingual-MiniLM-L12-v2', '118'],\n",
319 | " ['Xenova/paraphrase-MiniLM-L6-v2', '23'],\n",
320 | " ['Xenova/bert-base-nli-mean-tokens', '110'],\n",
321 | " ['Xenova/distilbert-base-nli-mean-tokens', '66.9'],\n",
322 | " ['Xenova/distilbert-base-nli-stsb-mean-tokens', '66.9'],\n",
323 | " ['Xenova/distiluse-base-multilingual-cased-v1', '135'],\n",
324 | " ['Xenova/msmarco-distilbert-base-v4', '66.9'],\n",
325 | " ['Xenova/multi-qa-MiniLM-L6-cos-v1', '23'],\n",
326 | " ['Xenova/multi-qa-distilbert-cos-v1', '66.9'],\n",
327 | " ['Xenova/multi-qa-mpnet-base-cos-v1', '110'],\n",
328 | " ['Xenova/multi-qa-mpnet-base-dot-v1', '110'],\n",
329 | " ['Xenova/nli-mpnet-base-v2', '110'],\n",
330 | " ['Xenova/paraphrase-MiniLM-L3-v2', '17.5'],\n",
331 | " ['Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens', '279'],\n",
332 | " ['Xenova/dino-vitb16', '87.5'],\n",
333 | " ['Xenova/dino-vits8', '23.4'],\n",
334 | " ['Xenova/dino-vitb8', '88.8'],\n",
335 | " ['Xenova/dino-vits16', '22.7'],\n",
336 | " ['Xenova/scibert_scivocab_uncased', '111'],\n",
337 | " ['Xenova/spanbert-large-cased', '335'],\n",
338 | " ['Xenova/spanbert-base-cased', '109'],\n",
339 | " ['sdan/simple-embeddings', '23'],\n",
340 | " ['Xenova/sentence_bert', '110'],\n",
341 | " ['Xenova/e5-small-v2', '34'],\n",
342 | " ['Xenova/SapBERT-from-PubMedBERT-fulltext', '110'],\n",
343 | " ['Xenova/indobert-base-p1', '125'],\n",
344 | " ['Xenova/UMLSBert_ENG', '110'],\n",
345 | " ['Xenova/rubert-base-cased', '178'],\n",
346 | " ['Xenova/kobert', '92.8'],\n",
347 | " ['Xenova/e5-small', '34'],\n",
348 | " ['Xenova/e5-large', '337'],\n",
349 | " ['Xenova/e5-large-v2', '337'],\n",
350 | " ['Xenova/e5-base', '110'],\n",
351 | " ['Xenova/e5-base-v2', '110'],\n",
352 | " ['Xenova/instructor-base', '110'],\n",
353 | " ['Xenova/instructor-large', '337'],\n",
354 | " ['Xenova/sentence-t5-large', '337'],\n",
355 | " ['Xenova/multilingual-e5-large', '562'],\n",
356 | " ['Xenova/mms-300m', '318'],\n",
357 | " ['Xenova/mms-1b', '969'],\n",
358 | " ['Supabase/e5-small-v2', '34'],\n",
359 | " ['Supabase/all-MiniLM-L6-v2', '23'],\n",
360 | " ['Xenova/gte-base', '110'],\n",
361 | " ['Xenova/bge-small-en', '34'],\n",
362 | " ['Xenova/bge-base-en', '110'],\n",
363 | " ['Xenova/bge-large-en', '337'],\n",
364 | " ['ggrn/bge-small-en', '34'],\n",
365 | " ['Xenova/bge-base-zh', '103'],\n",
366 | " ['Xenova/bge-large-zh-noinstruct', '327'],\n",
367 | " ['Xenova/bge-small-zh', '24'],\n",
368 | " ['Xenova/ClinicalBERT', '229'],\n",
369 | " ['Xenova/LaBSE', '472'],\n",
370 | " ['Xenova/wavlm-base', '95.8'],\n",
371 | " ['Xenova/wavlm-base-plus', '95.8'],\n",
372 | " ['Xenova/wavlm-large', '319'],\n",
373 | " ['Xenova/sentence-camembert-large', '339'],\n",
374 | " ['Xenova/herbert-base-cased', '125'],\n",
375 | " ['Xenova/herbert-large-cased', '357'],\n",
376 | " ['Xenova/bge-large-zh-v1.5', '327'],\n",
377 | " ['Xenova/bge-base-zh-v1.5', '103'],\n",
378 | " ['Xenova/bge-small-zh-v1.5', '24'],\n",
379 | " ['leolee9086/text2vec-base-chinese', '103'],\n",
380 | " ['Xenova/long-t5-encodec-tglobal-base', '291']]"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 32,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/plain": [
391 | "{'TaylorAI/gte-tiny': '22.9',\n",
392 | " 'Supabase/gte-small': '34',\n",
393 | " 'Xenova/all-MiniLM-L6-v2': '23',\n",
394 | " 'Xenova/bge-large-en-v1.5': '337',\n",
395 | " 'Supabase/bge-small-en': '34',\n",
396 | " 'Xenova/gte-small': '34',\n",
397 | " 'Xenova/all-mpnet-base-v2': '110',\n",
398 | " 'Xenova/paraphrase-mpnet-base-v2': '110',\n",
399 | " 'Xenova/all-MiniLM-L12-v2': '34',\n",
400 | " 'Xenova/multilingual-e5-small': '118',\n",
401 | " 'Xenova/gte-large': '337',\n",
402 | " 'Xenova/bge-base-en-v1.5': '110',\n",
403 | " 'Xenova/all-roberta-large-v1': '357',\n",
404 | " 'Xenova/distiluse-base-multilingual-cased-v2': '135',\n",
405 | " 'Xenova/paraphrase-multilingual-mpnet-base-v2': '279',\n",
406 | " 'Xenova/bge-large-zh': '327',\n",
407 | " 'Xenova/multilingual-e5-base': '279',\n",
408 | " 'Xenova/bge-small-en-v1.5': '34',\n",
409 | " 'Xenova/paraphrase-albert-small-v2': '39.7',\n",
410 | " 'Xenova/paraphrase-albert-base-v2': '40',\n",
411 | " 'Xenova/squeezebert-uncased': '51.2',\n",
412 | " 'Xenova/squeezebert-mnli': '51.3',\n",
413 | " 'Xenova/vit-base-patch16-224-in21k': '87.5',\n",
414 | " 'Xenova/all-distilroberta-v1': '82.1',\n",
415 | " 'Xenova/paraphrase-multilingual-MiniLM-L12-v2': '118',\n",
416 | " 'Xenova/paraphrase-MiniLM-L6-v2': '23',\n",
417 | " 'Xenova/bert-base-nli-mean-tokens': '110',\n",
418 | " 'Xenova/distilbert-base-nli-mean-tokens': '66.9',\n",
419 | " 'Xenova/distilbert-base-nli-stsb-mean-tokens': '66.9',\n",
420 | " 'Xenova/distiluse-base-multilingual-cased-v1': '135',\n",
421 | " 'Xenova/msmarco-distilbert-base-v4': '66.9',\n",
422 | " 'Xenova/multi-qa-MiniLM-L6-cos-v1': '23',\n",
423 | " 'Xenova/multi-qa-distilbert-cos-v1': '66.9',\n",
424 | " 'Xenova/multi-qa-mpnet-base-cos-v1': '110',\n",
425 | " 'Xenova/multi-qa-mpnet-base-dot-v1': '110',\n",
426 | " 'Xenova/nli-mpnet-base-v2': '110',\n",
427 | " 'Xenova/paraphrase-MiniLM-L3-v2': '17.5',\n",
428 | " 'Xenova/xlm-r-100langs-bert-base-nli-stsb-mean-tokens': '279',\n",
429 | " 'Xenova/dino-vitb16': '87.5',\n",
430 | " 'Xenova/dino-vits8': '23.4',\n",
431 | " 'Xenova/dino-vitb8': '88.8',\n",
432 | " 'Xenova/dino-vits16': '22.7',\n",
433 | " 'Xenova/scibert_scivocab_uncased': '111',\n",
434 | " 'Xenova/spanbert-large-cased': '335',\n",
435 | " 'Xenova/spanbert-base-cased': '109',\n",
436 | " 'sdan/simple-embeddings': '23',\n",
437 | " 'Xenova/sentence_bert': '110',\n",
438 | " 'Xenova/e5-small-v2': '34',\n",
439 | " 'Xenova/SapBERT-from-PubMedBERT-fulltext': '110',\n",
440 | " 'Xenova/indobert-base-p1': '125',\n",
441 | " 'Xenova/UMLSBert_ENG': '110',\n",
442 | " 'Xenova/rubert-base-cased': '178',\n",
443 | " 'Xenova/kobert': '92.8',\n",
444 | " 'Xenova/e5-small': '34',\n",
445 | " 'Xenova/e5-large': '337',\n",
446 | " 'Xenova/e5-large-v2': '337',\n",
447 | " 'Xenova/e5-base': '110',\n",
448 | " 'Xenova/e5-base-v2': '110',\n",
449 | " 'Xenova/instructor-base': '110',\n",
450 | " 'Xenova/instructor-large': '337',\n",
451 | " 'Xenova/sentence-t5-large': '337',\n",
452 | " 'Xenova/multilingual-e5-large': '562',\n",
453 | " 'Xenova/mms-300m': '318',\n",
454 | " 'Xenova/mms-1b': '969',\n",
455 | " 'Supabase/e5-small-v2': '34',\n",
456 | " 'Supabase/all-MiniLM-L6-v2': '23',\n",
457 | " 'Xenova/gte-base': '110',\n",
458 | " 'Xenova/bge-small-en': '34',\n",
459 | " 'Xenova/bge-base-en': '110',\n",
460 | " 'Xenova/bge-large-en': '337',\n",
461 | " 'ggrn/bge-small-en': '34',\n",
462 | " 'Xenova/bge-base-zh': '103',\n",
463 | " 'Xenova/bge-large-zh-noinstruct': '327',\n",
464 | " 'Xenova/bge-small-zh': '24',\n",
465 | " 'Xenova/ClinicalBERT': '229',\n",
466 | " 'Xenova/LaBSE': '472',\n",
467 | " 'Xenova/wavlm-base': '95.8',\n",
468 | " 'Xenova/wavlm-base-plus': '95.8',\n",
469 | " 'Xenova/wavlm-large': '319',\n",
470 | " 'Xenova/sentence-camembert-large': '339',\n",
471 | " 'Xenova/herbert-base-cased': '125',\n",
472 | " 'Xenova/herbert-large-cased': '357',\n",
473 | " 'Xenova/bge-large-zh-v1.5': '327',\n",
474 | " 'Xenova/bge-base-zh-v1.5': '103',\n",
475 | " 'Xenova/bge-small-zh-v1.5': '24',\n",
476 | " 'leolee9086/text2vec-base-chinese': '103',\n",
477 | " 'Xenova/long-t5-encodec-tglobal-base': '291'}"
478 | ]
479 | },
480 | "execution_count": 32,
481 | "metadata": {},
482 | "output_type": "execute_result"
483 | }
484 | ],
485 | "source": [
486 | "# Create a dictionary to easily look up sizes by id\n",
487 | "size_dict = dict(sizes)\n",
488 | "size_dict"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 36,
494 | "metadata": {},
495 | "outputs": [
496 | {
497 | "name": "stdout",
498 | "output_type": "stream",
499 | "text": [
500 | "JSON updated and saved to 'your_output.json'\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "import json\n",
506 | "\n",
507 | "this_file = \"feature-extraction_trending\"\n",
508 | "\n",
509 | "with open(f\"{this_file}.json\", 'r') as json_file:\n",
510 | " data = json.load(json_file)\n",
511 | "\n",
512 | "# Iterate over the \"models\" in your JSON\n",
513 | "for model in data[\"models\"]:\n",
514 | " model_id = model[\"id\"]\n",
515 | " if model_id in size_dict:\n",
516 | " model[\"model_size\"] = size_dict[model_id]\n",
517 | "\n",
518 | "# Save the updated JSON to a file\n",
519 | "with open(f'{this_file}_sizes.json', 'w') as file:\n",
520 | " json.dump(data, file, indent=4)\n"
521 | ]
522 | }
523 | ],
524 | "metadata": {
525 | "kernelspec": {
526 | "display_name": "py3.11",
527 | "language": "python",
528 | "name": "python3"
529 | },
530 | "language_info": {
531 | "codemirror_mode": {
532 | "name": "ipython",
533 | "version": 3
534 | },
535 | "file_extension": ".py",
536 | "mimetype": "text/x-python",
537 | "name": "python",
538 | "nbconvert_exporter": "python",
539 | "pygments_lexer": "ipython3",
540 | "version": "3.11.0"
541 | }
542 | },
543 | "nbformat": 4,
544 | "nbformat_minor": 2
545 | }
546 |
--------------------------------------------------------------------------------
/src/models/text2text_downloads.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023
-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag
":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedBy
User":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"a
vatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
2 |
--------------------------------------------------------------------------------
/src/models/text2text_likes.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.
000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","pr
ivate":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author
":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://
aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
--------------------------------------------------------------------------------
/src/models/text2text_modified.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","l
astModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes"
:0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","p
rivate":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xeno
va","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
--------------------------------------------------------------------------------
/src/models/text2text_trending.json:
--------------------------------------------------------------------------------
1 | {"activeFilters":{"pipeline_tag":["text2text-generation"],"library":["transformers.js"],"dataset":[],"language":[],"license":[],"other":[]},"models":[{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-small","lastModified":"2023-09-05T14:57:45.000Z","likes":2,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-small","lastModified":"2023-09-04T15:50:15.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":59,"gated":false,"id":"Xenova/LaMini-Flan-T5-783M","lastModified":"2023-09-05T20:23:56.000Z","likes":19,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-248M","lastModified":"2023-09-05T20:20:11.000Z"
,"likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-Flan-T5-77M","lastModified":"2023-09-05T20:18:44.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-61M","lastModified":"2023-09-05T20:24:26.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":3,"gated":false,"id":"Xenova/LaMini-T5-738M","lastModified":"2023-09-05T20:29:00.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/LaMini-T5-223M","lastModified":"2023-09-05T20:25:40.000Z","likes":0,"pipeline_tag":"text2text-generation",
"private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-small","lastModified":"2023-09-05T02:42:51.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/mt5-base","lastModified":"2023-09-05T02:47:48.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-base","lastModified":"2023-09-05T14:58:50.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-base","lastModified":"2023-09-04T16:09:48.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xen
ova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/flan-t5-base","lastModified":"2023-09-04T15:50:57.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/t5-v1_1-small","lastModified":"2023-09-04T16:09:07.000Z","likes":1,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot-400M-distill","lastModified":"2023-09-11T22:42:58.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/blenderbot_small-90M","lastModified":"2023-09-11T02:13:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://
aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-tglobal-base","lastModified":"2023-09-18T21:57:30.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":0,"gated":false,"id":"Xenova/long-t5-local-base","lastModified":"2023-09-18T21:58:22.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false},{"author":"Xenova","authorData":{"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/hwiQ0uvz3t-L5a-NtBIO6.png?w=200&h=200&f=face","fullname":"Joshua","name":"Xenova","type":"user","isPro":false,"isHf":true},"downloads":2,"gated":false,"id":"Xenova/long-t5-tglobal-base-16384-book-summary","lastModified":"2023-09-18T22:19:34.000Z","likes":0,"pipeline_tag":"text2text-generation","private":false,"repoType":"model","isLikedByUser":false}],"numItemsPerPage":30,"numTotalItems":19,"pageIndex":0}
2 |
--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require('path');
2 | const HtmlWebpackPlugin = require('html-webpack-plugin');
3 | const MiniCssExtractPlugin = require('mini-css-extract-plugin'); // FOUC-correction
4 | const FaviconsWebpackPlugin = require('favicons-webpack-plugin');
5 | const CopyWebpackPlugin = require('copy-webpack-plugin');
6 |
7 | module.exports = {
8 | entry: './src/js/index.js',
9 | mode: 'development',
10 | output: {
11 | filename: 'bundle.js',
12 | path: path.resolve(__dirname, 'dist'),
13 | clean: true
14 | },
15 | module: {
16 | rules: [
17 | {
18 | test: /\.css$/,
19 | use: [MiniCssExtractPlugin.loader, 'css-loader'],
20 |
21 | },
22 | {
23 | test: /\.svg$/,
24 | type: 'asset/resource',
25 | generator: {
26 | filename: '[name][ext]'
27 | }
28 | },
29 | ],
30 | },
31 | plugins: [
32 | new HtmlWebpackPlugin({
33 | template: './index.html',
34 | }),
35 | new MiniCssExtractPlugin(),
36 | new FaviconsWebpackPlugin(),
37 | new CopyWebpackPlugin({
38 | patterns: [
39 | {
40 | from: 'src/models/**/*_sizes.json', // Source directory of JSON files
41 | to: 'models/[name][ext]'
42 | },
43 | ],
44 | }),
45 | ],
46 | };
47 |
--------------------------------------------------------------------------------