├── .eslintrc.cjs
├── .github
└── workflows
│ └── static.yml
├── .gitignore
├── .npmignore
├── LICENSE
├── README.md
├── demos
├── HorizontalLinkList.jsx
├── paragraphs_as_options
│ ├── App.jsx
│ ├── context.jsx
│ ├── data.json
│ └── main.jsx
└── simple_autocomplete
│ ├── App.jsx
│ └── main.jsx
├── gif-20240430-032634.gif
├── index.html
├── package-lock.json
├── package.json
├── src
├── SemanticAutocomplete.jsx
└── worker.js
└── vite.config.js
/.eslintrc.cjs:
--------------------------------------------------------------------------------
// ESLint configuration (CommonJS, hence the .cjs extension) for a Vite + React project.
module.exports = {
  // Stop ESLint from searching parent directories for further config files.
  root: true,
  env: { browser: true, es2020: true },
  extends: [
    'eslint:recommended',
    'plugin:react/recommended',
    // Disables rules made redundant by the automatic JSX runtime
    // (no need to `import React` just for JSX).
    'plugin:react/jsx-runtime',
    'plugin:react-hooks/recommended',
  ],
  // Skip build output and this config file itself (it is CommonJS, not a module).
  ignorePatterns: ['dist', '.eslintrc.cjs'],
  parserOptions: { ecmaVersion: 'latest', sourceType: 'module' },
  // Pin the React version used by plugin:react rules instead of auto-detecting.
  settings: { react: { version: '18.2' } },
  plugins: ['react-refresh'],
  rules: {
    'react/jsx-no-target-blank': 'off',
    // Warn (not error) on exports that would break Vite's Fast Refresh;
    // constant exports alongside components are explicitly allowed.
    'react-refresh/only-export-components': [
      'warn',
      { allowConstantExport: true },
    ],
  },
}
--------------------------------------------------------------------------------
/.github/workflows/static.yml:
--------------------------------------------------------------------------------
1 | # Simple workflow for deploying static content to GitHub Pages
2 | name: Deploy static content to Pages
3 |
4 | on:
5 | # Runs on pushes targeting the default branch
6 | push:
7 | branches: ["gh-pages"]
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 | contents: read
15 | pages: write
16 | id-token: write
17 |
18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20 | concurrency:
21 | group: "pages"
22 | cancel-in-progress: false
23 |
24 | jobs:
25 | # Single deploy job since we're just deploying
26 | deploy:
27 | environment:
28 | name: github-pages
29 | url: ${{ steps.deployment.outputs.page_url }}
30 | runs-on: ubuntu-latest
31 | steps:
32 | - name: Checkout
33 | uses: actions/checkout@v4
34 | - name: Setup Pages
35 | uses: actions/configure-pages@v5
36 | - name: Upload artifact
37 | uses: actions/upload-pages-artifact@v3
38 | with:
39 | # Upload entire repository
40 | path: '.'
41 | - name: Deploy to GitHub Pages
42 | id: deployment
43 | uses: actions/deploy-pages@v4
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | pnpm-debug.log*
8 | lerna-debug.log*
9 |
10 | node_modules
11 | dist
12 | dist-ssr
13 | *.local
14 |
15 | # Editor directories and files
16 | .vscode/*
17 | !.vscode/extensions.json
18 | .idea
19 | .DS_Store
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | pnpm-debug.log*
8 | lerna-debug.log*
9 |
10 | node_modules
11 | dist
12 | dist-ssr
13 | *.local
14 |
15 | # Editor directories and files
16 | .vscode/*
17 | !.vscode/extensions.json
18 | .idea
19 | .DS_Store
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 |
26 | public
27 | index.html
28 | .eslintrc.cjs
29 | .gitignore
30 | src/App.jsx
31 | src/main.jsx
32 | package-lock.json
33 | vite.config.js
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2024 Mihai Chirculescu
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # semantic-autocomplete
2 |
3 | semantic-autocomplete is a React component that extends [v6 MUI's autocomplete](https://v6.mui.com/material-ui/react-autocomplete/) and performs **semantic similarity search** using a small, quantized machine learning (ML) model that runs on the client side. The model is downloaded once and then served from the browser's cache. The full functionality is provided within this React component!
4 |
5 | ## Demo
6 |
7 | **Sort paragraphs of a webpage by meaning:**
8 |
9 | https://mihaiii.github.io/semantic-autocomplete/
10 |
11 | 
12 |
13 | ## v5 MUI support
14 | This component works with both v5 and v6 MUI. It was not tested by the author on lower MUI versions.
15 |
16 | ## How to install
17 | Install:
18 |
19 | `npm install --save semantic-autocomplete`
20 |
21 | Then import:
22 |
23 | `import SemanticAutocomplete from "semantic-autocomplete";`
24 |
25 | ## Run on local from source code
26 |
27 | ```
28 | npm install
29 | npm run dev
30 | ```
31 |
32 | ## Usage
33 |
34 | Since semantic-autocomplete extends [MUI's autocomplete](https://v6.mui.com/material-ui/react-autocomplete/), the entire [v6 MUI's autocomplete API](https://v6.mui.com/material-ui/api/autocomplete/) will also work on semantic-autocomplete. The only exception is the [filterOptions property](https://mui.com/material-ui/react-autocomplete/#custom-filter).
35 |
36 | **If you're already using `autocomplete` in your project, just replace the tag name and you're done.** 🙌
37 |
38 | You can see the component being used in code [here](https://github.com/Mihaiii/semantic-autocomplete/blob/6d312a6264b7c3b79d053e23d3cdb4cf226196a1/demos/paragraphs_as_options/App.jsx#L26-L34) and [here](https://github.com/Mihaiii/semantic-autocomplete/blob/6d312a6264b7c3b79d053e23d3cdb4cf226196a1/demos/simple_autocomplete/App.jsx#L107-L112).
39 |
40 |
41 | [See this page for how you can use MUI's autocomplete and therefore semantic-autocomplete too](https://v6.mui.com/material-ui/react-autocomplete/).
42 |
43 | Besides the MUI's autocomplete API, the following props are provided:
44 |
45 | - `threshold`: if it has a value, then the component will filter out options below this cosine similarity value. Defaults to no value (meaning no filtering, only sorting). [Click for code example](https://github.com/Mihaiii/semantic-autocomplete/blob/6d312a6264b7c3b79d053e23d3cdb4cf226196a1/demos/simple_autocomplete/App.jsx#L110).
46 |
47 | - `onResult`: callback function once the sorting/filtering of the options is done, using the resulted options array as first param. [Click for code example](https://github.com/Mihaiii/semantic-autocomplete/blob/6d312a6264b7c3b79d053e23d3cdb4cf226196a1/demos/paragraphs_as_options/App.jsx#L29).
48 |
49 | - `model`: the name of the Huggingface ML model repo. It has to have the ONNX embeddings model. The folder structure of the repo has to be the standard one used by transformers.js. If you're interested in changing the default used model, you might find [this filter](https://huggingface.co/models?pipeline_tag=sentence-similarity&library=onnx&sort=trending) useful. [I made a bunch of small models for this component. Try them out and see what works best for your use case](https://huggingface.co/collections/Mihaiii/pokemons-662ce912d64b8a3bee518b7f). Default value: `Mihaiii/Venusaur` (pointing to [this repo](https://huggingface.co/Mihaiii/Venusaur)), which loads the ONNX quantized model having **~15 MB**. [Click here for code example](https://github.com/Mihaiii/semantic-autocomplete/blob/b16115492466eb1502107cf4581a804cb1dcbbe4/demos/simple_autocomplete/App.jsx#L115)
50 |
51 | - `pipelineParams`: the params to be passed to [transformer.js](https://github.com/xenova/transformers.js) when loading the model. Default value: `{ pooling: "mean", normalize: true }`. For more info, please [see this page](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline).
52 |
53 | ## Thanks / credit
54 | - [xenova](https://x.com/xenovacom?t=Mw1h_1joKgfrUXR_wl9Wrg&s=09) for building [transformers.js](https://github.com/xenova/transformers.js), providing clear and detailed documentation, always being willing to help out, and for having [lots of demos](https://github.com/xenova/transformers.js/tree/main/examples) on [his HF account](https://huggingface.co/Xenova). The work for this component is based on his tutorial on [how to build a React component using transformers.js](https://huggingface.co/docs/transformers.js/en/tutorials/react).
55 | - [andersonbcdefg](https://x.com/andersonbcdefg?t=0Nkr_SRk-fMUrU_Kp0Wm5w&s=09) for building many small models like [gte-tiny](https://huggingface.co/TaylorAI/gte-tiny) or [bge-micro-v2](https://huggingface.co/TaylorAI/bge-micro-v2) and for providing some guidelines to me prior to making [Venusaur](https://huggingface.co/Mihaiii/Venusaur).
56 |
--------------------------------------------------------------------------------
/demos/HorizontalLinkList.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { List, ListItem, Link } from '@mui/material';
3 |
4 | const links = [
5 | { href: 'https://github.com/Mihaiii/semantic-autocomplete', title: 'GitHub' },
6 | { href: 'https://huggingface.co/Mihaiii/Venusaur', title: 'Model' },
7 | { href: 'https://www.npmjs.com/package/semantic-autocomplete', title: 'npm' },
8 | { href: 'https://mihaiii.github.io/semantic-autocomplete/', title: 'Demo' },
9 | ];
10 |
11 | const HorizontalLinkList = () => {
12 | const listStyle = {
13 | display: 'flex',
14 | flexDirection: 'row',
15 | padding: 0
16 | };
17 |
18 | return (
19 |
20 | {links.map((link, index) => (
21 |
22 |
23 |
24 | {link.title}
25 |
26 |
27 |
28 | ))}
29 |
30 | );
31 | };
32 |
33 | export default HorizontalLinkList;
--------------------------------------------------------------------------------
/demos/paragraphs_as_options/App.jsx:
--------------------------------------------------------------------------------
1 | import SemanticAutocomplete from "../../src/SemanticAutocomplete";
2 | import { TextField, ListItem, ListItemText, List } from "@mui/material";
3 | import React, { useContext, useMemo } from 'react'
4 | const SemanticAutocompleteMemoized = React.memo(SemanticAutocomplete)
5 | import { SortedOptionsContext } from './context.jsx'
6 | import jsonData from './data.json';
7 | import HorizontalLinkList from '../HorizontalLinkList.jsx'
8 |
9 | function App() {
10 | const options = useMemo(() => jsonData, []);
11 | const { sortedOptions, setSortedOptions } = useContext(SortedOptionsContext);
12 |
13 | const ResultsList = () => {
14 | return (
15 |
16 | {sortedOptions.map(op => (
17 |
18 |
19 |
20 | ))}
21 |
22 | );
23 | }
24 |
25 | return (
26 |
38 | );
39 | }
40 |
41 | export default App;
42 |
--------------------------------------------------------------------------------
/demos/paragraphs_as_options/context.jsx:
--------------------------------------------------------------------------------
1 | import React, { createContext, useState } from 'react';
2 | import jsonData from './data.json';
3 |
4 | const SortedOptionsContext = createContext();
5 |
6 | export const SortedOptionsProvider = ({ children }) => {
7 | const [sortedOptions, setSortedOptions] = useState(jsonData);
8 |
9 | return (
10 |
11 | {children}
12 |
13 | );
14 | };
15 |
16 | export { SortedOptionsContext };
17 |
--------------------------------------------------------------------------------
/demos/paragraphs_as_options/data.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "label": "Word embeddings are a type of word representation that allows words to be represented as vectors in a continuous vector space. The primary goal is to capture the semantic meaning of words so that words with similar meanings are located close to each other in this space. This is achieved by transforming sparse, high-dimensional word vectors into lower-dimensional spaces while preserving semantic relationships.",
4 | "value": 1
5 | },
6 | {
7 | "label": "Embeddings are used extensively across various NLP tasks. Some common applications include text classification, sentiment analysis, language modeling, and machine translation. They are also integral to more complex tasks like question-answering systems, chatbots, and content recommendation systems. Beyond NLP, embeddings find applications in image and video analysis, where they help in tasks like image classification and facial recognition.",
8 | "value": 2
9 | },
10 | {
11 | "label": "Embeddings are used because they provide a dense and efficient representation of words, capturing complex patterns in language that are not apparent at the surface level. Unlike one-hot encoding, which treats words as isolated units without any notion of similarity, embeddings map words into a vector space based on their usage and context. This allows models to understand synonyms, analogies, and the overall semantics of text, leading to more nuanced and intelligent processing.",
12 | "value": 3
13 | },
14 | {
15 | "label": "Embeddings are typically created using models like Word2Vec, GloVe, or FastText, which learn representations by analyzing word co-occurrences and relationships in large corpora of text. These models apply algorithms to adjust the position of each word in the vector space, such that the distance between vectors captures semantic relationships between words. For example, similar words are placed closer together, whereas unrelated words are positioned farther apart.",
16 | "value": 4
17 | },
18 | {
19 | "label": "While embeddings are powerful, they also present challenges. One major concern is bias, as embeddings can perpetuate and amplify biases present in the training data. This requires careful consideration and mitigation strategies during model development and deployment. Additionally, creating and fine-tuning embeddings for specific domains or languages with limited resources can be challenging, necessitating innovative approaches to leverage embeddings effectively across diverse contexts.",
20 | "value": 5
21 | },
22 | {
23 | "label": "Traditional word embeddings, like Word2Vec and GloVe, generate a single representation for each word, regardless of its context. This means that words with multiple meanings are represented by the same vector across different uses. Contextual embeddings, introduced by models such as BERT and ELMo, represent words as vectors that vary depending on the word's context within a sentence. This allows these models to capture the nuances of language more effectively, distinguishing between different meanings of a word based on its usage.",
24 | "value": 6
25 | },
26 | {
27 | "label": "While primarily designed to capture semantic relationships between words, embeddings can also encode aspects of syntax and grammar to a certain extent. For example, embeddings can reflect syntactic categories like part of speech, and models trained on sentence-level tasks can learn representations that implicitly encode grammatical structures. However, explicit modeling of syntax and grammar often requires architectures designed specifically for these aspects, such as syntactic parsing models.",
28 | "value": 7
29 | },
30 | {
31 | "label": "Embeddings are a cornerstone of transfer learning in NLP. Pre-trained embeddings, generated from large-scale language models on extensive corpora, can be used as the starting point for training on specific tasks. This approach allows models to leverage general linguistic knowledge learned from the broader language use, significantly improving performance on tasks with limited training data. Transfer learning with embeddings accelerates model development and enhances capabilities in domain-specific applications.",
32 | "value": 8
33 | },
34 | {
35 | "label": "Evaluating the quality of embeddings involves assessing how well they capture semantic and syntactic relationships. This is often done through intrinsic methods, like analogy solving (e.g., \"king\" is to \"man\" as \"queen\" is to \"woman\") and similarity assessments, or through extrinsic methods, where embeddings are evaluated based on their performance in downstream tasks like text classification or sentiment analysis. Both approaches provide insights into the effectiveness of embeddings in encoding linguistic properties.",
36 | "value": 9
37 | },
38 | {
39 | "label": "Significant efforts are underway to develop and refine embeddings for a wide range of languages beyond English. This includes both multilingual models, which learn embeddings capable of representing multiple languages in a single vector space, and language-specific models that cater to the unique characteristics of individual languages. Challenges in this area include dealing with low-resource languages and adapting models to capture linguistic features unique to each language.",
40 | "value": 10
41 | },
42 | {
43 | "label": "Future developments in embeddings may focus on several areas, including improving the handling of polysemy and context, reducing biases in embeddings, and enhancing the efficiency and scalability of embedding models for large-scale applications. Additionally, there's a growing interest in cross-modal embeddings, which can represent data from different modalities (e.g., text and images) in a unified vector space, opening up new possibilities for multimodal applications and AI systems.",
44 | "value": 11
45 | },
46 | {
47 | "label": "Graph embeddings aim to represent nodes, edges, and possibly whole subgraphs of a graph in a continuous vector space. These embeddings capture the structure of the graph as well as node-level and edge-level properties. Applications of graph embeddings include social network analysis, where they can predict connections or recommend content; knowledge graph completion, where they can infer missing relations; and in bioinformatics, for example, to predict protein interactions.",
48 | "value": 12
49 | },
50 | {
51 | "label": "Embeddings can be adapted for time-series data by creating representations that capture temporal dynamics in addition to the underlying patterns. This involves training embeddings not just on the static features of data points but also on their changes over time, enabling models to understand periodic trends, anomalies, and long-term shifts in data. Applications include financial market analysis, weather forecasting, and predictive maintenance, where understanding the temporal dimension is crucial.",
52 | "value": 13
53 | },
54 | {
55 | "label": "Scaling embedding models presents several challenges, including computational demands, memory requirements, and maintaining the quality of embeddings as the size of the data and the model increases. Solutions to these challenges include more efficient model architectures, quantization techniques to reduce the size of embeddings, and distributed computing strategies. Addressing these issues is key to enabling the application of embeddings to ever-larger datasets and more complex problems.",
56 | "value": 14
57 | }
58 | ]
--------------------------------------------------------------------------------
/demos/paragraphs_as_options/main.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import ReactDOM from 'react-dom/client'
3 | import App from './App.jsx'
4 | import { SortedOptionsProvider } from './context.jsx';
5 |
6 | ReactDOM.createRoot(document.getElementById('root')).render(
7 |
8 |
9 |
10 |
11 |
12 | )
--------------------------------------------------------------------------------
/demos/simple_autocomplete/App.jsx:
--------------------------------------------------------------------------------
1 | import SemanticAutocomplete from "../../src/SemanticAutocomplete";
2 | import { TextField } from "@mui/material";
3 | import HorizontalLinkList from '../HorizontalLinkList.jsx'
4 | import React from 'react'
5 |
6 | function App() {
7 | const options = [
8 | { label: "Spoon", value: 1 },
9 | { label: "Fork", value: 2 },
10 | { label: "Knife", value: 3 },
11 | { label: "Plate", value: 4 },
12 | { label: "Cup", value: 5 },
13 | { label: "Mug", value: 6 },
14 | { label: "Bowl", value: 7 },
15 | { label: "Teapot", value: 8 },
16 | { label: "Frying Pan", value: 9 },
17 | { label: "Saucepan", value: 10 },
18 | { label: "Spatula", value: 11 },
19 | { label: "Whisk", value: 12 },
20 | { label: "Oven Mitt", value: 13 },
21 | { label: "Cutting Board", value: 14 },
22 | { label: "Measuring Cup", value: 15 },
23 | { label: "Blender", value: 16 },
24 | { label: "Toaster", value: 17 },
25 | { label: "Microwave", value: 18 },
26 | { label: "Refrigerator", value: 19 },
27 | { label: "Dishwasher", value: 20 },
28 | { label: "Table", value: 21 },
29 | { label: "Chair", value: 22 },
30 | { label: "Sofa", value: 23 },
31 | { label: "Lamp", value: 24 },
32 | { label: "Bookshelf", value: 25 },
33 | { label: "Bed", value: 26 },
34 | { label: "Mattress", value: 27 },
35 | { label: "Pillow", value: 28 },
36 | { label: "Blanket", value: 29 },
37 | { label: "Dresser", value: 30 },
38 | { label: "Mirror", value: 31 },
39 | { label: "Alarm Clock", value: 32 },
40 | { label: "Curtains", value: 33 },
41 | { label: "Rug", value: 34 },
42 | { label: "Trash Can", value: 35 },
43 | { label: "Laundry Basket", value: 36 },
44 | { label: "Washing Machine", value: 37 },
45 | { label: "Dryer", value: 38 },
46 | { label: "Iron", value: 39 },
47 | { label: "Vacuum Cleaner", value: 40 },
48 | { label: "Broom", value: 41 },
49 | { label: "Mop", value: 42 },
50 | { label: "Bucket", value: 43 },
51 | { label: "Garden Hose", value: 44 },
52 | { label: "Rake", value: 45 },
53 | { label: "Shovel", value: 46 },
54 | { label: "Lawn Mower", value: 47 },
55 | { label: "Hammer", value: 48 },
56 | { label: "Screwdriver", value: 49 },
57 | { label: "Wrench", value: 50 },
58 | { label: "Drill", value: 51 },
59 | { label: "Saw", value: 52 },
60 | { label: "Nails", value: 53 },
61 | { label: "Screws", value: 54 },
62 | { label: "Bolts", value: 55 },
63 | { label: "Paint Brush", value: 56 },
64 | { label: "Roller", value: 57 },
65 | { label: "Paint", value: 58 },
66 | { label: "Vase", value: 59 },
67 | { label: "Picture Frame", value: 60 },
68 | { label: "Candle", value: 61 },
69 | { label: "Book", value: 62 },
70 | { label: "Magazine", value: 63 },
71 | { label: "Remote Control", value: 64 },
72 | { label: "TV", value: 65 },
73 | { label: "Speaker", value: 66 },
74 | { label: "Laptop", value: 67 },
75 | { label: "Phone", value: 68 },
76 | { label: "Charger", value: 69 },
77 | { label: "Flashlight", value: 70 },
78 | { label: "Bicycle", value: 71 },
79 | { label: "Skateboard", value: 72 },
80 | { label: "Helmet", value: 73 },
81 | { label: "Ball", value: 74 },
82 | { label: "Gloves", value: 75 },
83 | { label: "Scarf", value: 76 },
84 | { label: "Umbrella", value: 77 },
85 | { label: "Backpack", value: 78 },
86 | { label: "Wallet", value: 79 },
87 | { label: "Keys", value: 80 },
88 | { label: "Sunglasses", value: 81 },
89 | { label: "Watch", value: 82 },
90 | { label: "Fitness Tracker", value: 83 },
91 | { label: "Yoga Mat", value: 84 },
92 | { label: "Treadmill", value: 85 },
93 | { label: "Weights", value: 86 },
94 | { label: "Swimsuit", value: 87 },
95 | { label: "Towel", value: 88 },
96 | { label: "Shampoo", value: 89 },
97 | { label: "Soap", value: 90 },
98 | { label: "Toothbrush", value: 91 },
99 | { label: "Toothpaste", value: 92 },
100 | { label: "Floss", value: 93 },
101 | { label: "Razor", value: 94 },
102 | { label: "Deodorant", value: 95 },
103 | { label: "Perfume", value: 96 },
104 | { label: "Makeup", value: 97 },
105 | { label: "Hairbrush", value: 98 },
106 | ];
107 |
108 | return (
109 |