├── .gitattributes ├── .gitignore ├── LICENSE ├── Readme.md ├── config.json ├── demo └── image.png ├── download-model.sh ├── go.mod ├── go.sum ├── install.sh ├── model_processing_utils ├── Readme.md ├── cluster.go ├── fasttext-to-bin.darwin.amd64 ├── fasttext-to-bin.go ├── fasttext-to-bin.linux.amd64 ├── fasttext-to-bin.windows.amd64.exe ├── reduce-model-size │ ├── PCA-dimension-reduction.go │ ├── Readme.md │ ├── go.mod │ ├── go.sum │ └── reduce-pca └── synonym-finder.go ├── models ├── fasttext │ ├── ATTRIBUTION.md │ └── LICENSE ├── glove │ ├── ATTRIBUTION.md │ ├── LICENSE.html │ └── glove.6B.300d.bin └── googlenews-slim │ ├── ATTRIBUTION.md │ ├── GoogleNews-vectors-negative300-SLIM.bin │ └── LICENSE ├── modules ├── config │ └── config.go ├── model │ └── model.go ├── processor │ └── processor.go ├── similarity │ └── similarity.go └── utils │ └── utils.go └── w2vgrep.go /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bin filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Models 15 | # models/* 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | 23 | # Binary output directory 24 | bin/ 25 | 26 | # IDE-specific files 27 | .idea/ 28 | .vscode/ 29 | *.swp 30 | *.swo 31 | 32 | # OS-specific files 33 | .DS_Store 34 | Thumbs.db 35 | 36 | # Log files 37 | *.log 38 | 39 | # Environment variables file 40 | .env 41 | 42 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 43 | *.o 44 | *.a 45 | 46 | # Debug files 47 | debug 48 | 49 | # Project-specific build output 50 | sgrep 51 | 52 | # Temporary files 53 | *.tmp 54 | *~ 55 | 56 | # Configuration files (uncomment if you don't want to track these) 57 | # *.json 58 | # *.yaml 59 | # *.yml 60 | 61 | # Ignore all local history of files 62 | .history/ 63 | 64 | # Ignore files related to API keys 65 | *.pem 66 | 67 | # Ignore Go module cache 68 | pkg/mod/ 69 | 70 | # Ignore Go build cache 71 | .cache/ 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 arunsupe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# w2vgrep - Semantic Grep

w2vgrep is a command-line tool that performs semantic search on text input using word embeddings. It is designed to find matches that are semantically similar to the query, going beyond simple string matching, and it supports multiple languages. The experience is designed to be similar to grep.


## Example Usage

Search for words similar to "death" in Hemingway's "The Old Man and the Sea" with context and line numbers:

```bash
curl -s 'https://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-t.txt' \
| w2vgrep -C 2 -n --threshold=0.55 death
```

Output:
![w2vgrep demo output](demo/image.png)

This command:

- Fetches the text of "The Old Man and the Sea" from Project Gutenberg Canada
- Pipes the text to w2vgrep
- Searches for words semantically similar to "death"
- Uses a similarity threshold of 0.55 (--threshold=0.55)
- Displays 2 lines of context before and after each match (-C 2)
- Shows line numbers (-n)

The output will show matches with their similarity scores, highlighted words, context, and line numbers.

## Features

- Semantic search using word embeddings
- Configurable similarity threshold
- Context display (before and after matching lines)
- Color-coded output
- Support for multiple languages
- Read from files or stdin
- Configurable via JSON file and command-line arguments

## Installation

Two files are absolutely needed:
1. the w2vgrep binary
2. the vector embedding model file

Optionally, a config.json file tells w2vgrep where the embedding model is.

**Using install script**:

```bash
# clone
git clone https://github.com/arunsupe/semantic-grep.git
cd semantic-grep

# run install:
# compiles using the local go compiler, installs in /usr/bin,
# downloads the model to $HOME/.config/semantic-grep
# makes config.json
bash install.sh
```
**Binary**:

1. Download the latest binary release
2. Download a vector embedding model (see below)
3. Optionally, download the config.json to configure the model location there (or do this from the command line)

**From source (linux/osx)**:

```bash
# clone
git clone https://github.com/arunsupe/semantic-grep.git
cd semantic-grep

# build
go build -o w2vgrep

# download a word2vec model using this helper script (see "Word Embedding Model" below)
bash download-model.sh
```

## Usage

Basic usage:

./w2vgrep [options] [file]

If no file is specified, w2vgrep reads from standard input.

### Command-line Options
```
-m, --model_path= Path to the Word2Vec model file. Overrides config file
-t, --threshold= Similarity threshold for matching (default: 0.7)
-A, --before-context= Number of lines before matching line
-B, --after-context= Number of lines after matching line
-C, --context= Number of lines before and after matching line
-n, --line-number Print line numbers
-i, --ignore-case Ignore case.
-o, --only-matching Output only matching words
-l, --only-lines Output only matched lines without similarity scores
-f, --file= Match patterns from file, one pattern per line. Like grep -f.
```

## Configuration

`w2vgrep` can be configured using a JSON file. By default, it looks for `config.json` in the current directory, `$HOME/.config/semantic-grep/config.json`, and `/etc/semantic-grep/config.json`.


## Word Embedding Model

### Quick start:
`w2vgrep` requires a word embedding model in __binary__ format. The default model loader uses the model file's extension to determine the type (.bin, .8bit.int). A few compatible model files are provided in this repo ([models/](models/)). Download one of the .bin files from the `models/` directory and update the path in config.json.

Note: `git clone` will not download the large binary model files unless git-lfs is installed on your machine. If you do not want to install git-lfs, manually download the model .bin file and place it in the correct folder.


### Support for multiple languages:
Facebook's fasttext group has published word vectors in [157 languages](https://fasttext.cc/docs/en/crawl-vectors.html) - an amazing resource. I would like to host these files on my github account, but they are too big and too expensive to host. Therefore, I have provided a small go program, [fasttext-to-bin](model_processing_utils/), that can make `w2vgrep`-compatible binary models from them. (Note: use the text files with the "__.vec.gz__" extension, not the binary ".bin.gz" files.)

```bash
# e.g., for a French model:
curl -s 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz' | gunzip -c | ./fasttext-to-bin -input - -output models/fasttext/cc.fr.300.bin

# use it like so:
# curl -s 'https://www.gutenberg.org/cache/epub/17989/pg17989.txt' \
# | w2vgrep -C 2 -n -t 0.55 \
# -model_path model_processing_utils/cc.fr.300.bin 'château'
```

### Roll your own:
Alternatively, you can use pre-trained models (like Google's Word2Vec) or train your own using tools like gensim. Note, though, that there does not seem to be a standardized binary format (Google's is different from Facebook's fasttext and from gensim's default _save()_). For `w2vgrep`, because efficiently loading the large model is key to performance, I have elected to keep the simplest format.


### Testing the model by finding synonyms
To help troubleshoot the model, I added a `synonym-finder.go` to `./model_processing_utils/`. This program will find all words in the model whose similarity to the query word is above a given threshold.

```bash
# build
cd model_processing_utils
go build synonym-finder.go

#run
synonym-finder -model_path path/to/cc.zh.300.bin -threshold 0.6 合理性

# Output
Words similar to '合理性' with similarity >= 0.60:
科学性 0.6304
合理性 1.0000
正当性 0.6018
公允性 0.6152
不合理性 0.6094
合法性 0.6219
有效性 0.6374
必要性 0.6499
```


## Decreasing the size of the model files
The model files are large (gigabytes). Each word is typically represented by a 300-dimension, 32-bit floating point vector. Reducing the dimensionality to 100 or 150 dimensions can produce smaller, more memory-efficient, faster models with minimal (maybe even no) loss of accuracy. In `model_processing_utils/reduce-model-size`, I have written a program to reduce model dimensions. It can be used to reduce the size of any word2vec binary model used by w2vgrep. Use it like so:

```bash
# build
cd model_processing_utils/reduce-model-size
go build .

# run on large GoogleNews-vectors-negative300-SLIM.bin model (346MB) to make smaller
# GoogleNews-vectors-negative100-SLIM.bin model (117MB)
./reduce-pca -input ../../models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin -output ../../models/googlenews-slim/GoogleNews-vectors-negative100-SLIM.bin

# use this smaller model in w2vgrep like so
curl -s 'https://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-t.txt' | bin/w2vgrep.linux.amd64 -n -t 0.5 -m models/googlenews-slim/GoogleNews-vectors-negative100-SLIM.bin --line-number death

```


## A word about performance of the different embedding models
Different models define "similarity" differently ([explanation](https://machinelearninginterview.com/topics/natural-language-processing/what-is-the-difference-between-word2vec-and-glove/)). However, for practical purposes, they seem equivalent enough.


## Contributing
Contributions are welcome! Please feel free to submit a Pull Request.


## License and attribution:
The code in this project is licensed under the MIT [License](LICENSE).

**go-flags package:**

The go-flags package, used by the code in this project, is distributed under the BSD-3-Clause license. Please see the license information at https://github.com/jessevdk/go-flags.

**Word2Vec Model**:

This project uses a mirrored version of the word2vec-slim model, which is stored in the `models/googlenews-slim` directory. This model is distributed under the Apache License 2.0. For more information about the model, its original authors, and the license, please see the `models/googlenews-slim/ATTRIBUTION.md` file.

**GloVe word vectors**:

This project uses a processed version of the GloVe word vectors, which is stored in the `models/glove` directory. This work is distributed under the Public Domain Dedication and License v1.0. For more information about the model, its original authors, and the license, please see the `models/glove/ATTRIBUTION.md` file.

**Fasttext word vectors**:

This project uses a processed version of the fasttext word vectors, which is stored in the `models/fasttext` directory. This work is distributed under the Creative Commons Attribution-Share-Alike License 3.0. For more information about the model, its original authors, and the license, please see the `models/fasttext/ATTRIBUTION.md` file.


## Sources of models on the web
- Google's Word2Vec: from https://github.com/mmihaltz/word2vec-GoogleNews-vectors
- A slim version of the above: GoogleNews-vectors-negative300-SLIM.bin.gz model from https://github.com/eyaler/word2vec-slim/
- Stanford NLP group's Global Vectors for Word Representation (glove) model [source](https://nlp.stanford.edu/projects/glove/): a binary version is mirrored in [models/glove/](models/glove/).
- Facebook fasttext vectors: https://fasttext.cc/docs/en/crawl-vectors.html
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
    "model_path": "models/glove/glove.6B.300d.bin"
}
--------------------------------------------------------------------------------
/demo/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arunsupe/semantic-grep/ded610ca3243346dba9de33e07a7db775be536f7/demo/image.png
--------------------------------------------------------------------------------
/download-model.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Model URL and save path (modify these if needed)
model_url="https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz"
save_path="models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin"

# Create directory (if it doesn't exist)
mkdir -p "$(dirname "$save_path")"

# Download the model
echo "Downloading..."
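# Use wget if it is available; otherwise fall back to curl.
# Either way, the compressed model is saved to "$save_path.gz" and decompressed below.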
12 | 13 | if command -v wget > /dev/null; then 14 | wget -O "$save_path.gz" -q --show-progress "$model_url" 15 | else 16 | curl -L -s -o "$save_path.gz" "$model_url" 17 | fi 18 | 19 | # Decompress the file 20 | gunzip "$save_path.gz" 21 | 22 | echo "Model downloaded and saved to: $save_path" 23 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/arunsupe/semantic-grep 2 | 3 | go 1.22.5 4 | 5 | require ( 6 | github.com/clipperhouse/uax29 v1.13.0 7 | github.com/jessevdk/go-flags v1.6.1 8 | ) 9 | 10 | require ( 11 | golang.org/x/sys v0.21.0 // indirect 12 | golang.org/x/text v0.16.0 // indirect 13 | ) 14 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/clipperhouse/uax29 v1.13.0 h1:5q58IRS9gBATd+NtnPXAmul5PLTGFeQ3lv0C51zhIEk= 2 | github.com/clipperhouse/uax29 v1.13.0/go.mod h1:paNABhygWmmjkg0ROxKQoenJAX4dM9AS8biVkXmAK0c= 3 | github.com/jessevdk/go-flags v1.6.1 h1:Cvu5U8UGrLay1rZfv/zP7iLpSHGUZ/Ou68T0iX1bBK4= 4 | github.com/jessevdk/go-flags v1.6.1/go.mod h1:Mk8T1hIAWpOiJiHa9rJASDK2UGWji0EuPGBnNLMooyc= 5 | golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= 6 | golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 7 | golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= 8 | golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= 9 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # golang check for version specified in go.mod 4 | echo "[*] Checking golang" 5 | go version 6 | if [ $? -ne 0 ]; then 7 | echo "Failed to check golang version: do you have go installed?" 8 | exit 1 9 | fi 10 | echo "[OK] golang installed" 11 | 12 | # Building and installing w2vgrep 13 | echo "[*] Building and installing w2vgrep" 14 | go build -o w2vgrep 15 | if [ $? -ne 0 ]; then 16 | echo "Failed to build w2vgrep" 17 | exit 1 18 | fi 19 | echo "[OK] w2vgrep built" 20 | 21 | # Prompt to either move to /usr/bin/w2vgrep, adding local path to $PATH or doing nothing 22 | echo "[*] Where would you like to install w2vgrep?" 23 | echo "1. /usr/bin/w2vgrep (will require sudo)" 24 | echo "2. Add local path to \$PATH" 25 | echo "3. Do nothing" 26 | read -p "Enter your choice: " choice 27 | 28 | INSTALL_PATH="$(pwd)/w2vgrep" 29 | if [ $choice -eq 1 ]; then 30 | echo "[*] Installing w2vgrep in /usr/bin/w2vgrep, please enter your password as sudo is required." 31 | sudo cp w2vgrep /usr/bin/w2vgrep 32 | if [ $? -ne 0 ]; then 33 | echo "Failed to install w2vgrep" 34 | exit 1 35 | fi 36 | INSTALL_PATH="/usr/bin/w2vgrep" 37 | echo "[OK] w2vgrep installed in /usr/bin/w2vgrep" 38 | elif [ $choice -eq 2 ]; then 39 | echo "[*] Adding local path to \$PATH" 40 | export PATH=$PATH:$(pwd) 41 | echo "[OK] Local path added to \$PATH: to make this permanent, add the following line to your shell configuration file (e.g. 
~/.bashrc or ~/.zshrc):"
    echo "export PATH=\$PATH:$(pwd)"
elif [ $choice -eq 3 ]; then
    echo "[*] Skipping installation"
    echo "[OK] Skipped installation"
else
    echo "Invalid choice"
    echo "[*] Skipping installation"
    echo "[OK] Skipped installation"
fi

# Setting configuration path

CONFIG_PATH="./config.json"
# ask user if they want to install the configuration file
echo "[*] Do you want to install the configuration to $HOME/.config/semantic-grep/?"
echo "1. Yes"
echo "2. No"
read -p "Enter your choice: " choice

if [ $choice -eq 1 ]; then
    echo "[*] Setting configuration path in $HOME/.config/semantic-grep/"
    mkdir -p "$HOME/.config/semantic-grep/"
    CONFIG_PATH="$HOME/.config/semantic-grep/config.json"
else
    echo "[*] Skipping configuration installation"
    echo "[OK] Skipped configuration installation"
fi

# Downloading the model
echo "[*] Downloading the model"
if [ -f "models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin" ]; then
    echo "[OK] Model already downloaded"
else
    bash download-model.sh
    if [ $? -ne 0 ]; then
        echo "Failed to download the model"
        exit 1
    fi
fi

echo "[OK] Model downloaded"
# User prompt to move the model to $HOME/.config/semantic-grep/
MODEL_PATH="./models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin"
echo "[*] Do you want to move the model to $HOME/.config/semantic-grep/?"
echo "1. Yes"
echo "2. No"
read -p "Enter your choice: " choice
if [ $choice -eq 1 ]; then
    cp -r models "$HOME/.config/semantic-grep/"
    echo "[OK] Model moved to $HOME/.config/semantic-grep/"
    MODEL_PATH="$HOME/.config/semantic-grep/models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin"
else
    echo "[*] Skipping model installation"
    echo "[OK] Skipped model installation"
fi

# Define the model path and write it to the configuration file
echo "[*] Setting model path"
CONFIG_STRING="{\"model_path\":\"$MODEL_PATH\"}"
echo $CONFIG_STRING > "$CONFIG_PATH"
if [ $? -ne 0 ]; then
    echo "Failed to set model path"
    exit 1
fi
echo "[OK] Model path set:"
cat "$CONFIG_PATH"

# Testing
echo "[*] Testing"
$INSTALL_PATH -h
if [ $? -ne 0 ]; then
    echo "Failed to test w2vgrep"
    exit 1
fi
echo "[OK] w2vgrep tested and working. Installation complete."
--------------------------------------------------------------------------------
/model_processing_utils/Readme.md:
--------------------------------------------------------------------------------
# Interesting/helpful utilities to manage word embedding models

`cluster.go`
A program to collect similar words into a text file, with one cluster per line. It optionally takes the number of clusters to find as input (see the example at the end of this file).

`synonym-finder.go`
A program to find all words in the model whose similarity to the query word is above a threshold. Essentially, it finds synonyms.

`fasttext-to-bin.go`
A utility to convert FastText text model files to Word2Vec binary format for use with w2vgrep.
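Below is a rough sketch of the plain-grep workflow that `cluster.go` enables. The model path, cluster count, and file names are illustrative examples only; any w2vgrep-compatible .bin model can be substituted.

```bash
# build the clustering utility (go build produces a binary named "cluster")
go build cluster.go

# write clusters to a text file, one pipe-separated cluster per line
./cluster -model ../models/glove/glove.6B.300d.bin -k 500 -output clusters.txt

# pull out the cluster containing "death" and turn it into one pattern per line
grep -w 'death' clusters.txt | head -n 1 | tr '|' '\n' > death-cluster.txt

# ordinary grep can now match any word from that cluster
grep -F -w -f death-cluster.txt some-text.txt
```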
11 | 12 | -------------------------------------------------------------------------------- /model_processing_utils/cluster.go: -------------------------------------------------------------------------------- 1 | /* 2 | Description: 3 | This program clusters words into synonyms, printing the clusters to a file, 4 | one cluster per line. It's output can be used by standard grep to find synonyms 5 | in texts. Some of the clusters are too big for grep to handle; grep -f may work. 6 | Not sure if this tool is useful. But an INTERESTING EXPERIMENT. 7 | 8 | This script is used to cluster the words in the word2vec model using 9 | mini-batch k-means clustering. It works for any language as long as the model is 10 | in that language. The output is a text file where each line contains a cluster of 11 | synonyms separated by a pipe (|) character. I want to use it in the 12 | standard unix text tools. 13 | 14 | The script takes the path to the word2vec binary model, the number of clusters (k), 15 | the batch size for mini-batch k-means, the maximum number of iterations, and 16 | the output file path as input. 17 | The script performs mini-batch k-means clustering on the word vectors and 18 | writes the clusters to the output file. 19 | 20 | Usage: cluster.go -model path/to/model.bin \ 21 | -k 100 -batch-size 100 \ 22 | -iterations 100 \ 23 | -output clusters.txt 24 | */ 25 | 26 | package main 27 | 28 | import ( 29 | "bufio" 30 | "encoding/binary" 31 | "flag" 32 | "fmt" 33 | "io" 34 | "log" 35 | "math" 36 | "math/rand" 37 | "os" 38 | "sort" 39 | "strings" 40 | "time" 41 | ) 42 | 43 | // VectorModel interface defines the methods that all vector models must implement 44 | type VectorModel interface { 45 | LoadModel(filename string) error 46 | GetEmbedding(token string) interface{} 47 | } 48 | 49 | // VecModel32bit represents a 32-bit floating point Word2Vec model 50 | type VecModel32bit struct { 51 | Vectors map[string][]float32 52 | Size int 53 | } 54 | 55 | // LoadModel loads a 32-bit floating point Word2Vec model from a file 56 | func (m *VecModel32bit) LoadModel(filename string) error { 57 | file, err := os.Open(filename) 58 | if err != nil { 59 | return fmt.Errorf("failed to open file: %v", err) 60 | } 61 | defer file.Close() 62 | 63 | reader := bufio.NewReader(file) 64 | 65 | // Read header 66 | var vocabSize, vectorSize int 67 | _, err = fmt.Fscanf(reader, "%d %d\n", &vocabSize, &vectorSize) 68 | if err != nil { 69 | return fmt.Errorf("failed to read header: %v\nCheck that you have a valid model file", err) 70 | } 71 | 72 | // Validate header 73 | if vocabSize <= 0 || vectorSize <= 0 { 74 | return fmt.Errorf("invalid header: vocabSize=%d, vectorSize=%d\nCheck that you have a valid model file", vocabSize, vectorSize) 75 | } 76 | 77 | m.Vectors = make(map[string][]float32, vocabSize) 78 | m.Size = vectorSize 79 | 80 | for i := 0; i < vocabSize; i++ { 81 | word, err := reader.ReadString(' ') 82 | if err != nil { 83 | return fmt.Errorf("failed to read word: %v", err) 84 | } 85 | word = strings.TrimSpace(word) 86 | 87 | vector := make([]float32, vectorSize) 88 | for j := 0; j < vectorSize; j++ { 89 | err := binary.Read(reader, binary.LittleEndian, &vector[j]) 90 | if err != nil { 91 | return fmt.Errorf("failed to read vector: %v", err) 92 | } 93 | } 94 | 95 | // Check if we've reached the end of the record 96 | nextByte, err := reader.Peek(1) 97 | if err != nil && err != io.EOF { 98 | return fmt.Errorf("unexpected error reading next byte: %v", err) 99 | } 100 | if len(nextByte) > 0 && nextByte[0] == 
'\n' { 101 | reader.ReadByte() // consume the newline 102 | } 103 | 104 | m.Vectors[word] = vector 105 | } 106 | 107 | // Check if we've reached the end of the file 108 | _, err = reader.ReadByte() 109 | if err != io.EOF { 110 | return fmt.Errorf("unexpected data at end of file.\nCheck that you have a valid model file") 111 | } 112 | 113 | return nil 114 | } 115 | 116 | // GetEmbedding returns the vector embedding of a token for the 32-bit model 117 | func (m *VecModel32bit) GetEmbedding(token string) interface{} { 118 | vec, ok := m.Vectors[token] 119 | if !ok { 120 | return make([]float32, m.Size) 121 | } 122 | return vec 123 | } 124 | 125 | // LoadVectorModel loads either a 32-bit or 8-bit model based on the file extension 126 | func LoadVectorModel(filename string) (VectorModel, error) { 127 | var model VectorModel 128 | 129 | if strings.HasSuffix(filename, ".bin") { 130 | model = &VecModel32bit{} 131 | } else { 132 | return nil, fmt.Errorf("unsupported file format") 133 | } 134 | 135 | err := model.LoadModel(filename) 136 | if err != nil { 137 | return nil, err 138 | } 139 | 140 | return model, nil 141 | } 142 | 143 | // func euclideanDistance(vec1, vec2 []float32) float64 { 144 | // sum := float64(0) 145 | // for i := range vec1 { 146 | // diff := float64(vec1[i] - vec2[i]) 147 | // sum += diff * diff 148 | // } 149 | // return math.Sqrt(sum) 150 | // } 151 | 152 | // cosineSimilarity calculates the cosine similarity between two vectors 153 | func cosineSimilarity(vec1, vec2 []float32) float64 { 154 | dotProduct := float64(0) 155 | normVec1 := float64(0) 156 | normVec2 := float64(0) 157 | for i := range vec1 { 158 | dotProduct += float64(vec1[i] * vec2[i]) 159 | normVec1 += float64(vec1[i] * vec1[i]) 160 | normVec2 += float64(vec2[i] * vec2[i]) 161 | } 162 | if normVec1 == 0 || normVec2 == 0 { 163 | return 0 // To handle zero vectors 164 | } 165 | return dotProduct / (math.Sqrt(normVec1) * math.Sqrt(normVec2)) 166 | } 167 | 168 | // cosineDistance converts cosine similarity to a distance metric 169 | func cosineDistance(vec1, vec2 []float32) float64 { 170 | return 1 - cosineSimilarity(vec1, vec2) 171 | } 172 | 173 | func calculateCentroid(vectors [][]float32) []float32 { 174 | dim := len(vectors[0]) 175 | centroid := make([]float32, dim) 176 | for _, vec := range vectors { 177 | for i := range vec { 178 | centroid[i] += vec[i] 179 | } 180 | } 181 | for i := range centroid { 182 | centroid[i] /= float32(len(vectors)) 183 | } 184 | return centroid 185 | } 186 | 187 | // Use cosineDistance 188 | func miniBatchKMeans(vectors [][]float32, words []string, k, batchSize, maxIterations int) [][]string { 189 | rand.Seed(time.Now().UnixNano()) 190 | dim := len(vectors[0]) 191 | 192 | // Initialize k random centroids 193 | centroids := make([][]float32, k) 194 | for i := range centroids { 195 | centroids[i] = vectors[rand.Intn(len(vectors))] 196 | } 197 | 198 | for iteration := 0; iteration < maxIterations; iteration++ { 199 | // Sample a random batch of data points 200 | batchIndices := rand.Perm(len(vectors))[:batchSize] 201 | batch := make([][]float32, batchSize) 202 | for i, idx := range batchIndices { 203 | batch[i] = vectors[idx] 204 | } 205 | 206 | // Assign points in the batch to the nearest centroid 207 | clusterAssignments := make([]int, batchSize) 208 | for i, vec := range batch { 209 | bestCluster := 0 210 | // bestDistance := euclideanDistance(vec, centroids[0]) 211 | bestDistance := cosineDistance(vec, centroids[0]) 212 | for j := 1; j < k; j++ { 213 | // distance := 
euclideanDistance(vec, centroids[j]) 214 | distance := cosineDistance(vec, centroids[j]) 215 | if distance < bestDistance { 216 | bestDistance = distance 217 | bestCluster = j 218 | } 219 | } 220 | clusterAssignments[i] = bestCluster 221 | } 222 | 223 | // Update centroids based on the batch 224 | clusterSums := make([][]float32, k) 225 | clusterCounts := make([]int, k) 226 | for i := range clusterSums { 227 | clusterSums[i] = make([]float32, dim) 228 | } 229 | for i, vec := range batch { 230 | cluster := clusterAssignments[i] 231 | for j := range vec { 232 | clusterSums[cluster][j] += vec[j] 233 | } 234 | clusterCounts[cluster]++ 235 | } 236 | for i := range centroids { 237 | if clusterCounts[i] > 0 { 238 | for j := range centroids[i] { 239 | centroids[i][j] = clusterSums[i][j] / float32(clusterCounts[i]) 240 | } 241 | } 242 | } 243 | } 244 | 245 | // Assign all points to the nearest centroid 246 | clusters := make([][]string, k) 247 | for i, vec := range vectors { 248 | bestCluster := 0 249 | bestDistance := cosineDistance(vec, centroids[0]) 250 | for j := 1; j < k; j++ { 251 | distance := cosineDistance(vec, centroids[j]) 252 | if distance < bestDistance { 253 | bestDistance = distance 254 | bestCluster = j 255 | } 256 | } 257 | clusters[bestCluster] = append(clusters[bestCluster], words[i]) 258 | } 259 | 260 | return clusters 261 | } 262 | 263 | func main() { 264 | modelPath := flag.String("model", "", "Path to word2vec binary model") 265 | k := flag.Int("k", 100, "Number of clusters") 266 | batchSize := flag.Int("batch-size", 100, "Batch size for mini-batch k-means") 267 | maxIterations := flag.Int("iterations", 100, "Maximum number of iterations for mini-batch k-means") 268 | outputPath := flag.String("output", "clusters.txt", "Output file path") 269 | flag.Parse() 270 | 271 | if *modelPath == "" { 272 | log.Fatal("Please provide a path to the word2vec binary model") 273 | } 274 | 275 | // Load the word2vec model 276 | model, err := LoadVectorModel(*modelPath) 277 | if err != nil { 278 | log.Fatalf("Failed to load model: %v", err) 279 | } 280 | 281 | // Get all words and vectors 282 | var words []string 283 | var vectors [][]float32 284 | for word, vec := range model.(*VecModel32bit).Vectors { 285 | words = append(words, word) 286 | vectors = append(vectors, vec) 287 | } 288 | 289 | // Perform mini-batch k-means clustering 290 | clusters := miniBatchKMeans(vectors, words, *k, *batchSize, *maxIterations) 291 | 292 | // Sort clusters by size (largest first) 293 | sort.Slice(clusters, func(i, j int) bool { 294 | return len(clusters[i]) > len(clusters[j]) 295 | }) 296 | 297 | // Write clusters to file 298 | file, err := os.Create(*outputPath) 299 | if err != nil { 300 | log.Fatalf("Failed to create output file: %v", err) 301 | } 302 | defer file.Close() 303 | 304 | writer := bufio.NewWriter(file) 305 | for _, cluster := range clusters { 306 | _, err := writer.WriteString(strings.Join(cluster, "|") + "\n") 307 | if err != nil { 308 | log.Fatalf("Failed to write to file: %v", err) 309 | } 310 | } 311 | writer.Flush() 312 | 313 | fmt.Printf("Clustering complete. 
%d clusters written to %s\n", len(clusters), *outputPath) 314 | } 315 | -------------------------------------------------------------------------------- /model_processing_utils/fasttext-to-bin.darwin.amd64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunsupe/semantic-grep/ded610ca3243346dba9de33e07a7db775be536f7/model_processing_utils/fasttext-to-bin.darwin.amd64 -------------------------------------------------------------------------------- /model_processing_utils/fasttext-to-bin.go: -------------------------------------------------------------------------------- 1 | /* 2 | A small utility to convert FastText models to Word2Vec format. 3 | The input file should be a FastText model in text format. 4 | The output file will be a Word2Vec binary model. 5 | 6 | Usage: 7 | fasttext-to-bin -input -output 8 | 9 | Example: 10 | fasttext-to-bin -input model.bin -output model.bin 11 | 12 | Or stream from stdin: 13 | curl -s 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz' \ 14 | | gunzip -c \ 15 | | fasttext-to-bin -input - -output models/fasttext/cc.fr.300.bin 16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "bufio" 22 | "encoding/binary" 23 | "flag" 24 | "fmt" 25 | "io" 26 | "os" 27 | "strconv" 28 | "strings" 29 | ) 30 | 31 | func convertFastTextToWord2Vec(input io.Reader, outputFile string) error { 32 | // Open output file 33 | out, err := os.Create(outputFile) 34 | if err != nil { 35 | return fmt.Errorf("error creating output file: %v", err) 36 | } 37 | defer out.Close() 38 | 39 | writer := bufio.NewWriter(out) 40 | defer writer.Flush() 41 | 42 | scanner := bufio.NewScanner(input) 43 | 44 | // Read header 45 | if !scanner.Scan() { 46 | return fmt.Errorf("error reading header: %v", scanner.Err()) 47 | } 48 | header := strings.Fields(scanner.Text()) 49 | if len(header) != 2 { 50 | return fmt.Errorf("invalid header format") 51 | } 52 | 53 | vocabSize, err := strconv.Atoi(header[0]) 54 | if err != nil { 55 | return fmt.Errorf("invalid vocabulary size: %v", err) 56 | } 57 | 58 | vectorSize, err := strconv.Atoi(header[1]) 59 | if err != nil { 60 | return fmt.Errorf("invalid vector size: %v", err) 61 | } 62 | 63 | // Write text header 64 | if _, err := fmt.Fprintf(writer, "%d %d\n", vocabSize, vectorSize); err != nil { 65 | return fmt.Errorf("error writing text header: %v", err) 66 | } 67 | 68 | // Process each line 69 | for scanner.Scan() { 70 | line := scanner.Text() 71 | parts := strings.Fields(line) 72 | if len(parts) != vectorSize+1 { 73 | return fmt.Errorf("invalid line format: expected %d fields, got %d", vectorSize+1, len(parts)) 74 | } 75 | 76 | word := parts[0] 77 | if _, err := writer.WriteString(word); err != nil { 78 | return fmt.Errorf("error writing word: %v", err) 79 | } 80 | if err := writer.WriteByte(' '); err != nil { 81 | return fmt.Errorf("error writing space: %v", err) 82 | } 83 | 84 | vector := make([]float32, vectorSize) 85 | for i := 0; i < vectorSize; i++ { 86 | value, err := strconv.ParseFloat(parts[i+1], 32) 87 | if err != nil { 88 | return fmt.Errorf("error parsing float: %v", err) 89 | } 90 | vector[i] = float32(value) 91 | } 92 | 93 | if err := binary.Write(writer, binary.LittleEndian, vector); err != nil { 94 | return fmt.Errorf("error writing vector: %v", err) 95 | } 96 | } 97 | 98 | if err := scanner.Err(); err != nil { 99 | return fmt.Errorf("error scanning input: %v", err) 100 | } 101 | 102 | return nil 103 | } 104 | 105 | func main() { 106 | // Define command-line 
flags 107 | inputFileFlag := flag.String("input", "", "Input FastText file (use '-' for stdin)") 108 | outputFileFlag := flag.String("output", "", "Output Word2Vec file. End in .bin") 109 | flag.Parse() 110 | 111 | // Validate flags 112 | if *inputFileFlag == "" || *outputFileFlag == "" { 113 | fmt.Println("Usage: fasttext-to-bin -input -output ") 114 | os.Exit(1) 115 | } 116 | 117 | var input io.Reader 118 | 119 | // Check if input is from stdin or a file 120 | if *inputFileFlag == "-" { 121 | input = os.Stdin 122 | } else { 123 | file, err := os.Open(*inputFileFlag) 124 | if err != nil { 125 | fmt.Printf("Error opening input file: %v\n", err) 126 | os.Exit(1) 127 | } 128 | defer file.Close() 129 | input = file 130 | } 131 | 132 | // Convert FastText to Word2Vec 133 | err := convertFastTextToWord2Vec(input, *outputFileFlag) 134 | if err != nil { 135 | fmt.Printf("Error during conversion: %v\n", err) 136 | os.Exit(1) 137 | } 138 | 139 | fmt.Printf("Conversion complete. Word2Vec binary model saved as %s\n", *outputFileFlag) 140 | } 141 | -------------------------------------------------------------------------------- /model_processing_utils/fasttext-to-bin.linux.amd64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunsupe/semantic-grep/ded610ca3243346dba9de33e07a7db775be536f7/model_processing_utils/fasttext-to-bin.linux.amd64 -------------------------------------------------------------------------------- /model_processing_utils/fasttext-to-bin.windows.amd64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunsupe/semantic-grep/ded610ca3243346dba9de33e07a7db775be536f7/model_processing_utils/fasttext-to-bin.windows.amd64.exe -------------------------------------------------------------------------------- /model_processing_utils/reduce-model-size/PCA-dimension-reduction.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "os" 9 | "strings" 10 | "flag" 11 | 12 | "gonum.org/v1/gonum/mat" 13 | "gonum.org/v1/gonum/stat" 14 | ) 15 | 16 | type VecModel32bit struct { 17 | Vectors map[string][]float32 18 | VectorsReduced map[string][]float32 19 | Size int 20 | } 21 | 22 | // LoadModel loads a 32-bit floating point Word2Vec model from a file 23 | func (m *VecModel32bit) LoadModel(filename string) error { 24 | file, err := os.Open(filename) 25 | if err != nil { 26 | return fmt.Errorf("failed to open file: %v", err) 27 | } 28 | defer file.Close() 29 | 30 | reader := bufio.NewReader(file) 31 | 32 | // Read header 33 | var vocabSize, vectorSize int 34 | _, err = fmt.Fscanf(reader, "%d %d\n", &vocabSize, &vectorSize) 35 | if err != nil { 36 | return fmt.Errorf("failed to read header: %v\nCheck that you have a valid model file", err) 37 | } 38 | 39 | // Validate header 40 | if vocabSize <= 0 || vectorSize <= 0 { 41 | return fmt.Errorf("invalid header: vocabSize=%d, vectorSize=%d\nCheck that you have a valid model file", vocabSize, vectorSize) 42 | } 43 | 44 | m.Vectors = make(map[string][]float32, vocabSize) 45 | m.Size = vectorSize 46 | 47 | for i := 0; i < vocabSize; i++ { 48 | word, err := reader.ReadString(' ') 49 | if err != nil { 50 | return fmt.Errorf("failed to read word: %v", err) 51 | } 52 | word = strings.TrimSpace(word) 53 | 54 | vector := make([]float32, vectorSize) 55 | for j := 0; j < vectorSize; j++ { 56 | err := 
binary.Read(reader, binary.LittleEndian, &vector[j]) 57 | if err != nil { 58 | return fmt.Errorf("failed to read vector: %v", err) 59 | } 60 | } 61 | 62 | // Check if we've reached the end of the record 63 | nextByte, err := reader.Peek(1) 64 | if err != nil && err != io.EOF { 65 | return fmt.Errorf("unexpected error reading next byte: %v", err) 66 | } 67 | if len(nextByte) > 0 && nextByte[0] == '\n' { 68 | reader.ReadByte() // consume the newline 69 | } 70 | 71 | m.Vectors[word] = vector 72 | } 73 | 74 | // Check if we've reached the end of the file 75 | _, err = reader.ReadByte() 76 | if err != io.EOF { 77 | return fmt.Errorf("unexpected data at end of file.\nCheck that you have a valid model file") 78 | } 79 | 80 | return nil 81 | } 82 | 83 | // ReduceDimensions reduces the dimensions of the vectors to 100 using PCA 84 | func (m *VecModel32bit) ReduceDimensions(targetDim int) error { 85 | vocabSize := len(m.Vectors) 86 | if vocabSize == 0 || m.Size <= 100 { 87 | return fmt.Errorf("no vectors to reduce or vector size is already 100 or less") 88 | } 89 | 90 | // Convert map to matrix 91 | data := make([]float64, 0, vocabSize*m.Size) 92 | words := make([]string, 0, vocabSize) 93 | for word, vector := range m.Vectors { 94 | words = append(words, word) 95 | for _, v := range vector { 96 | data = append(data, float64(v)) 97 | } 98 | } 99 | 100 | originalMatrix := mat.NewDense(vocabSize, m.Size, data) 101 | 102 | // Perform PCA 103 | var pc stat.PC 104 | ok := pc.PrincipalComponents(originalMatrix, nil) 105 | if !ok { 106 | return fmt.Errorf("PCA computation failed") 107 | } 108 | 109 | // Get the principal component direction vectors 110 | var vec mat.Dense 111 | pc.VectorsTo(&vec) 112 | 113 | // Select the first targetDim columns of the principal components 114 | proj := mat.NewDense(vocabSize, targetDim, nil) 115 | proj.Mul(originalMatrix, vec.Slice(0, m.Size, 0, targetDim)) 116 | 117 | // Convert reduced matrix back to map 118 | m.VectorsReduced = make(map[string][]float32, vocabSize) 119 | for i, word := range words { 120 | reducedVector := make([]float32, targetDim) 121 | for j := 0; j < targetDim; j++ { 122 | reducedVector[j] = float32(proj.At(i, j)) 123 | } 124 | m.VectorsReduced[word] = reducedVector 125 | } 126 | 127 | return nil 128 | } 129 | 130 | // SaveReducedModel saves the reduced model to a file in the same binary format 131 | func (m *VecModel32bit) SaveReducedModel(filename string) error { 132 | file, err := os.Create(filename) 133 | if err != nil { 134 | return fmt.Errorf("failed to create file: %v", err) 135 | } 136 | defer file.Close() 137 | 138 | writer := bufio.NewWriter(file) 139 | 140 | // Get the reduced vector size from the first vector in the map 141 | var reducedSize int 142 | for _, vector := range m.VectorsReduced { 143 | reducedSize = len(vector) 144 | break 145 | } 146 | 147 | // Write header: vocabSize and reduced vector size 148 | vocabSize := len(m.VectorsReduced) 149 | _, err = fmt.Fprintf(writer, "%d %d\n", vocabSize, reducedSize) 150 | if err != nil { 151 | return fmt.Errorf("failed to write header: %v", err) 152 | } 153 | 154 | // Write each word and its reduced vector 155 | for word, vector := range m.VectorsReduced { 156 | _, err := writer.WriteString(word + " ") 157 | if err != nil { 158 | return fmt.Errorf("failed to write word: %v", err) 159 | } 160 | 161 | for _, value := range vector { 162 | err := binary.Write(writer, binary.LittleEndian, value) 163 | if err != nil { 164 | return fmt.Errorf("failed to write vector value: %v", err) 165 | } 166 | 
} 167 | 168 | // Write a newline character after each vector 169 | err = writer.WriteByte('\n') 170 | if err != nil { 171 | return fmt.Errorf("failed to write newline: %v", err) 172 | } 173 | } 174 | 175 | // Flush the buffer to ensure all data is written to the file 176 | err = writer.Flush() 177 | if err != nil { 178 | return fmt.Errorf("failed to flush writer: %v", err) 179 | } 180 | 181 | return nil 182 | } 183 | 184 | func main() { 185 | // Define command-line flags for input and output file paths 186 | inputFile := flag.String("input", "", "Path to the input Word2Vec model file") 187 | outputFile := flag.String("output", "", "Path to the output reduced model file") 188 | targetDim := flag.Int("dim", 100, "Target dimension for PCA reduction") 189 | flag.Parse() 190 | 191 | // Check if input and output paths are provided 192 | if *inputFile == "" || *outputFile == "" { 193 | fmt.Println("Please provide both input and output file paths using -input and -output flags.") 194 | return 195 | } 196 | 197 | // Load the model 198 | model := VecModel32bit{} 199 | err := model.LoadModel(*inputFile) 200 | if err != nil { 201 | fmt.Println("Error loading model:", err) 202 | return 203 | } 204 | 205 | // Reduce dimensions 206 | err = model.ReduceDimensions(*targetDim) 207 | if err != nil { 208 | fmt.Println("Error reducing dimensions:", err) 209 | return 210 | } 211 | 212 | // Save the reduced model 213 | err = model.SaveReducedModel(*outputFile) 214 | if err != nil { 215 | fmt.Println("Error saving reduced model:", err) 216 | return 217 | } 218 | 219 | fmt.Println("Reduced model saved successfully!") 220 | } -------------------------------------------------------------------------------- /model_processing_utils/reduce-model-size/Readme.md: -------------------------------------------------------------------------------- 1 | # Decreasing the size of the model files 2 | 3 | The model files are large (Gigabytes). Each word is typically represented using a 300 dimension, 32 bit floating point vector. While highly accurate, these lareg models are memory intensive and slow. Reducing dimensionality, to 100 or 150 dimensions, can produce smaller, memory efficient, faster, more performant models with minimal (maybe even better) accuracy. 4 | 5 | In `model_processing_utils/reduce-model-size/PCA-dimension-reduction.go`, I have written a small utility to reduce model's dimensions. This will take as input the model file, the output path and optionally, vector dimensions and reduce the model's size using Principal Component Analysis. In my testing, the optimal vector dimensions are somewhere between 100-150. Smaller than that, and accuracy may be compromised. Note: thresholds will be different with the new, smaller, model compared to the large models. Optimize through trial and error. 6 | 7 | This can be used to reduce the size of any word2vec binary model used by w2vgrep. Use this like so: 8 | 9 | ```bash 10 | # build 11 | cd model_processing_utils/reduce-model-size 12 | go build . 
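# note: "go build ." names the output binary after the module in go.mod, i.e. ./reduce-pca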
13 | 14 | # run on large GoogleNews-vectors-negative300-SLIM.bin model (346MB) to make smaller 15 | # GoogleNews-vectors-negative100-SLIM.bin model (117MB) 16 | ./reduce-pca -input ../../models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin -output ../../models/googlenews-slim/GoogleNews-vectors-negative100-SLIM.bin -dim 100 17 | 18 | # use this smaller model in w2vgrep like so 19 | curl -s 'https://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-t.txt' | bin/w2vgrep.linux.amd64 -n -t 0.5 -m models/googlenews-slim/GoogleNews-vectors-negative100-SLIM.bin --line-number death 20 | ``` 21 | 22 | Please try this if performance is a bottle neck. -------------------------------------------------------------------------------- /model_processing_utils/reduce-model-size/go.mod: -------------------------------------------------------------------------------- 1 | module reduce-pca 2 | 3 | go 1.22.5 4 | 5 | require gonum.org/v1/gonum v0.15.1 // indirect 6 | -------------------------------------------------------------------------------- /model_processing_utils/reduce-model-size/go.sum: -------------------------------------------------------------------------------- 1 | gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= 2 | gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= 3 | -------------------------------------------------------------------------------- /model_processing_utils/reduce-model-size/reduce-pca: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunsupe/semantic-grep/ded610ca3243346dba9de33e07a7db775be536f7/model_processing_utils/reduce-model-size/reduce-pca -------------------------------------------------------------------------------- /model_processing_utils/synonym-finder.go: -------------------------------------------------------------------------------- 1 | // A program to find synonyms for a given word in a Word2Vec model. 2 | // It finds words in the model with similarity scores above a given threshold. 3 | // 4 | // Usage: synonym-finder [OPTIONS] QUERY 5 | // QUERY is the word to find similar words for (required) 6 | // Options: 7 | // -model_path string 8 | // Path to the Word2Vec model file (required) 9 | // -threshold float 10 | // Similarity threshold for matching (required) (default 0.7) 11 | // -ignore-case 12 | // Ignore case. Note: word2vec is case-sensitive. 
Ignoring case may lead to unexpected results 13 | // -f string 14 | // File containing patterns, one per line 15 | // -o Print only matching tokens 16 | // 17 | // Example: 18 | // synonym-finder -model_path ../models/glove/glove.6B.300d.bin -threshold 0.5 angry 19 | 20 | package main 21 | 22 | import ( 23 | "bufio" 24 | "encoding/binary" 25 | "flag" 26 | "fmt" 27 | "io" 28 | "math" 29 | "os" 30 | "strings" 31 | ) 32 | 33 | // Options defines the command-line options 34 | type Options struct { 35 | ModelPath string 36 | SimilarityThreshold float64 37 | IgnoreCase bool 38 | PatternFile string 39 | OnlyMatching bool // New field for -o flag 40 | } 41 | 42 | // VectorModel interface defines the methods that all vector models must implement 43 | type VectorModel interface { 44 | LoadModel(filename string) error 45 | GetEmbedding(token string) interface{} 46 | } 47 | 48 | // VecModel32bit represents a 32-bit floating point Word2Vec model 49 | type VecModel32bit struct { 50 | Vectors map[string][]float32 51 | Size int 52 | } 53 | 54 | // LoadModel loads a 32-bit floating point Word2Vec model from a file 55 | func (m *VecModel32bit) LoadModel(filename string) error { 56 | file, err := os.Open(filename) 57 | if err != nil { 58 | return fmt.Errorf("failed to open file: %v", err) 59 | } 60 | defer file.Close() 61 | 62 | reader := bufio.NewReader(file) 63 | 64 | // Read header 65 | var vocabSize, vectorSize int 66 | _, err = fmt.Fscanf(reader, "%d %d\n", &vocabSize, &vectorSize) 67 | if err != nil { 68 | return fmt.Errorf("failed to read header: %v\nCheck that you have a valid model file", err) 69 | } 70 | 71 | // Validate header 72 | if vocabSize <= 0 || vectorSize <= 0 { 73 | return fmt.Errorf("invalid header: vocabSize=%d, vectorSize=%d\nCheck that you have a valid model file", vocabSize, vectorSize) 74 | } 75 | 76 | m.Vectors = make(map[string][]float32, vocabSize) 77 | m.Size = vectorSize 78 | 79 | for i := 0; i < vocabSize; i++ { 80 | word, err := reader.ReadString(' ') 81 | if err != nil { 82 | return fmt.Errorf("failed to read word: %v", err) 83 | } 84 | word = strings.TrimSpace(word) 85 | 86 | vector := make([]float32, vectorSize) 87 | for j := 0; j < vectorSize; j++ { 88 | err := binary.Read(reader, binary.LittleEndian, &vector[j]) 89 | if err != nil { 90 | return fmt.Errorf("failed to read vector: %v", err) 91 | } 92 | } 93 | 94 | // Check if we've reached the end of the record 95 | nextByte, err := reader.Peek(1) 96 | if err != nil && err != io.EOF { 97 | return fmt.Errorf("unexpected error reading next byte: %v", err) 98 | } 99 | if len(nextByte) > 0 && nextByte[0] == '\n' { 100 | reader.ReadByte() // consume the newline 101 | } 102 | 103 | m.Vectors[word] = vector 104 | } 105 | 106 | // Check if we've reached the end of the file 107 | _, err = reader.ReadByte() 108 | if err != io.EOF { 109 | return fmt.Errorf("unexpected data at end of file.\nCheck that you have a valid model file") 110 | } 111 | 112 | return nil 113 | } 114 | 115 | // GetEmbedding returns the vector embedding of a token for the 32-bit model 116 | func (m *VecModel32bit) GetEmbedding(token string) interface{} { 117 | vec, ok := m.Vectors[token] 118 | if !ok { 119 | return make([]float32, m.Size) 120 | } 121 | return vec 122 | } 123 | 124 | // LoadVectorModel loads either a 32-bit or 8-bit model based on the file extension 125 | func LoadVectorModel(filename string) (VectorModel, error) { 126 | var model VectorModel 127 | 128 | if strings.HasSuffix(filename, ".bin") { 129 | model = &VecModel32bit{} 130 | } else { 131 | 
return nil, fmt.Errorf("unsupported file format") 132 | } 133 | 134 | err := model.LoadModel(filename) 135 | if err != nil { 136 | return nil, err 137 | } 138 | 139 | return model, nil 140 | } 141 | 142 | // calculateSimilarity calculates the cosine similarity between two vectors 143 | func calculateSimilarity32bit(vec1, vec2 []float32) float64 { 144 | dotProduct := float64(0) 145 | norm1 := float64(0) 146 | norm2 := float64(0) 147 | for i := range vec1 { 148 | dotProduct += float64(vec1[i] * vec2[i]) 149 | norm1 += float64(vec1[i] * vec1[i]) 150 | norm2 += float64(vec2[i] * vec2[i]) 151 | } 152 | return dotProduct / (math.Sqrt(norm1) * math.Sqrt(norm2)) 153 | } 154 | 155 | // findSimilarWords finds words in the model that are similar to the query word above the given threshold 156 | func findSimilarWords(model VectorModel, query string, threshold float64, onlyMatching bool) error { 157 | queryEmbedding := model.GetEmbedding(query).([]float32) 158 | if len(queryEmbedding) == 0 { 159 | return fmt.Errorf("query word not found in model") 160 | } 161 | 162 | if onlyMatching { 163 | fmt.Println(query) // Print the bare query 164 | } else { 165 | fmt.Printf("Words similar to '%s' with similarity >= %.2f:\n", query, threshold) 166 | } 167 | 168 | for word, embedding := range model.(*VecModel32bit).Vectors { 169 | similarity := calculateSimilarity32bit(queryEmbedding, embedding) 170 | if similarity >= threshold && similarity < 1.0 { 171 | if onlyMatching { 172 | fmt.Println(word) 173 | } else { 174 | fmt.Printf("%s %.4f\n", word, similarity) 175 | } 176 | } 177 | } 178 | 179 | return nil 180 | } 181 | 182 | // findSimilarWordsForPatterns finds similar words for each pattern in the given file 183 | func findSimilarWordsForPatterns(model VectorModel, patternFile string, threshold float64, onlyMatching bool) error { 184 | file, err := os.Open(patternFile) 185 | if err != nil { 186 | return fmt.Errorf("failed to open pattern file: %v", err) 187 | } 188 | defer file.Close() 189 | 190 | scanner := bufio.NewScanner(file) 191 | for scanner.Scan() { 192 | pattern := strings.TrimSpace(scanner.Text()) 193 | if pattern == "" { 194 | continue 195 | } 196 | 197 | err := findSimilarWords(model, pattern, threshold, onlyMatching) 198 | if err != nil { 199 | fmt.Printf("Warning: %v\n", err) 200 | } 201 | } 202 | 203 | if err := scanner.Err(); err != nil { 204 | return fmt.Errorf("error reading pattern file: %v", err) 205 | } 206 | 207 | return nil 208 | } 209 | 210 | func main() { 211 | var opts Options 212 | 213 | flag.StringVar(&opts.ModelPath, "model_path", "", "Path to the Word2Vec model file (required)") 214 | flag.Float64Var(&opts.SimilarityThreshold, "threshold", 0.7, "Similarity threshold for matching (default 0.7)") 215 | flag.BoolVar(&opts.IgnoreCase, "ignore-case", false, "Ignore case. Note: word2vec is case-sensitive. 
Ignoring case may lead to unexpected results") 216 | flag.StringVar(&opts.PatternFile, "f", "", "File containing patterns, one per line") 217 | flag.BoolVar(&opts.OnlyMatching, "o", false, "Print only matching tokens") 218 | 219 | // Custom usage message 220 | flag.Usage = func() { 221 | fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] [QUERY]\n\n", os.Args[0]) 222 | fmt.Fprintf(os.Stderr, "QUERY is the word to find similar words for (required if -f is not used)\n\n") 223 | fmt.Fprintf(os.Stderr, "Options:\n") 224 | flag.PrintDefaults() 225 | fmt.Fprintf(os.Stderr, "\nExamples:\n") 226 | fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 cat\n", os.Args[0]) 227 | fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 -f patterns.txt\n", os.Args[0]) 228 | fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 -o cat\n", os.Args[0]) 229 | } 230 | 231 | flag.Parse() 232 | 233 | if opts.ModelPath == "" { 234 | fmt.Fprintln(os.Stderr, "Error: Model path is required. Please provide it via -model_path flag.") 235 | flag.Usage() 236 | os.Exit(1) 237 | } 238 | 239 | if flag.Lookup("threshold").Value.String() == "0.7" { 240 | fmt.Fprintln(os.Stderr, "Error: Threshold is required. Please provide it via -threshold flag.") 241 | flag.Usage() 242 | os.Exit(1) 243 | } 244 | 245 | model, err := LoadVectorModel(opts.ModelPath) 246 | if err != nil { 247 | fmt.Fprintf(os.Stderr, "Error loading model: %v\n", err) 248 | os.Exit(1) 249 | } 250 | 251 | if opts.PatternFile != "" { 252 | err = findSimilarWordsForPatterns(model, opts.PatternFile, opts.SimilarityThreshold, opts.OnlyMatching) 253 | if err != nil { 254 | fmt.Fprintf(os.Stderr, "Error processing pattern file: %v\n", err) 255 | os.Exit(1) 256 | } 257 | } else { 258 | args := flag.Args() 259 | if len(args) != 1 { 260 | fmt.Fprintln(os.Stderr, "Error: Exactly one query word is required when not using -f") 261 | flag.Usage() 262 | os.Exit(1) 263 | } 264 | 265 | query := args[0] 266 | err = findSimilarWords(model, query, opts.SimilarityThreshold, opts.OnlyMatching) 267 | if err != nil { 268 | fmt.Fprintf(os.Stderr, "Error finding similar words: %v\n", err) 269 | os.Exit(1) 270 | } 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /models/fasttext/ATTRIBUTION.md: -------------------------------------------------------------------------------- 1 | # FastText Word Vectors Attribution 2 | 3 | This directory contains pre-trained word vectors from FastText, trained on Wikipedia and the News dataset. 4 | 5 | Original source: https://fasttext.cc/docs/en/english-vectors.html 6 | 7 | ## Citation 8 | 9 | If you use these word vectors, please cite the following paper: 10 | 11 | 12 | @inproceedings{grave2018learning, 13 | title={Learning Word Vectors for 157 Languages}, 14 | author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas}, 15 | booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)}, 16 | year={2018} 17 | } 18 | text 19 | 20 | ## License 21 | 22 | These word vectors are distributed under the Creative Commons Attribution-Share-Alike License 3.0. See the LICENSE file in this directory for the full license text. 23 | 24 | ## Changes Made 25 | 26 | The word vectors have been processed to fit the specific needs of this project. The modifications include converting them to binary format. 
27 | -------------------------------------------------------------------------------- /models/fasttext/LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | Attribution-ShareAlike 3.0 Unported 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR 10 | DAMAGES RESULTING FROM ITS USE. 11 | 12 | License 13 | 14 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE 15 | COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY 16 | COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS 17 | AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 18 | 19 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE 20 | TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY 21 | BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS 22 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND 23 | CONDITIONS. 24 | 25 | 1. Definitions 26 | 27 | a. "Adaptation" means a work based upon the Work, or upon the Work and 28 | other pre-existing works, such as a translation, adaptation, 29 | derivative work, arrangement of music or other alterations of a 30 | literary or artistic work, or phonogram or performance and includes 31 | cinematographic adaptations or any other form in which the Work may be 32 | recast, transformed, or adapted including in any form recognizably 33 | derived from the original, except that a work that constitutes a 34 | Collection will not be considered an Adaptation for the purpose of 35 | this License. For the avoidance of doubt, where the Work is a musical 36 | work, performance or phonogram, the synchronization of the Work in 37 | timed-relation with a moving image ("synching") will be considered an 38 | Adaptation for the purpose of this License. 39 | b. "Collection" means a collection of literary or artistic works, such as 40 | encyclopedias and anthologies, or performances, phonograms or 41 | broadcasts, or other works or subject matter other than works listed 42 | in Section 1(f) below, which, by reason of the selection and 43 | arrangement of their contents, constitute intellectual creations, in 44 | which the Work is included in its entirety in unmodified form along 45 | with one or more other contributions, each constituting separate and 46 | independent works in themselves, which together are assembled into a 47 | collective whole. A work that constitutes a Collection will not be 48 | considered an Adaptation (as defined below) for the purposes of this 49 | License. 50 | c. "Creative Commons Compatible License" means a license that is listed 51 | at https://creativecommons.org/compatiblelicenses that has been 52 | approved by Creative Commons as being essentially equivalent to this 53 | License, including, at a minimum, because that license: (i) contains 54 | terms that have the same purpose, meaning and effect as the License 55 | Elements of this License; and, (ii) explicitly permits the relicensing 56 | of adaptations of works made available under that license under this 57 | License or a Creative Commons jurisdiction license with the same 58 | License Elements as this License. 59 | d. 
"Distribute" means to make available to the public the original and 60 | copies of the Work or Adaptation, as appropriate, through sale or 61 | other transfer of ownership. 62 | e. "License Elements" means the following high-level license attributes 63 | as selected by Licensor and indicated in the title of this License: 64 | Attribution, ShareAlike. 65 | f. "Licensor" means the individual, individuals, entity or entities that 66 | offer(s) the Work under the terms of this License. 67 | g. "Original Author" means, in the case of a literary or artistic work, 68 | the individual, individuals, entity or entities who created the Work 69 | or if no individual or entity can be identified, the publisher; and in 70 | addition (i) in the case of a performance the actors, singers, 71 | musicians, dancers, and other persons who act, sing, deliver, declaim, 72 | play in, interpret or otherwise perform literary or artistic works or 73 | expressions of folklore; (ii) in the case of a phonogram the producer 74 | being the person or legal entity who first fixes the sounds of a 75 | performance or other sounds; and, (iii) in the case of broadcasts, the 76 | organization that transmits the broadcast. 77 | h. "Work" means the literary and/or artistic work offered under the terms 78 | of this License including without limitation any production in the 79 | literary, scientific and artistic domain, whatever may be the mode or 80 | form of its expression including digital form, such as a book, 81 | pamphlet and other writing; a lecture, address, sermon or other work 82 | of the same nature; a dramatic or dramatico-musical work; a 83 | choreographic work or entertainment in dumb show; a musical 84 | composition with or without words; a cinematographic work to which are 85 | assimilated works expressed by a process analogous to cinematography; 86 | a work of drawing, painting, architecture, sculpture, engraving or 87 | lithography; a photographic work to which are assimilated works 88 | expressed by a process analogous to photography; a work of applied 89 | art; an illustration, map, plan, sketch or three-dimensional work 90 | relative to geography, topography, architecture or science; a 91 | performance; a broadcast; a phonogram; a compilation of data to the 92 | extent it is protected as a copyrightable work; or a work performed by 93 | a variety or circus performer to the extent it is not otherwise 94 | considered a literary or artistic work. 95 | i. "You" means an individual or entity exercising rights under this 96 | License who has not previously violated the terms of this License with 97 | respect to the Work, or who has received express permission from the 98 | Licensor to exercise rights under this License despite a previous 99 | violation. 100 | j. "Publicly Perform" means to perform public recitations of the Work and 101 | to communicate to the public those public recitations, by any means or 102 | process, including by wire or wireless means or public digital 103 | performances; to make available to the public Works in such a way that 104 | members of the public may access these Works from a place and at a 105 | place individually chosen by them; to perform the Work to the public 106 | by any means or process and the communication to the public of the 107 | performances of the Work, including by public digital performance; to 108 | broadcast and rebroadcast the Work by any means including signs, 109 | sounds or images. 110 | k. 
"Reproduce" means to make copies of the Work by any means including 111 | without limitation by sound or visual recordings and the right of 112 | fixation and reproducing fixations of the Work, including storage of a 113 | protected performance or phonogram in digital form or other electronic 114 | medium. 115 | 116 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, 117 | limit, or restrict any uses free from copyright or rights arising from 118 | limitations or exceptions that are provided for in connection with the 119 | copyright protection under copyright law or other applicable laws. 120 | 121 | 3. License Grant. Subject to the terms and conditions of this License, 122 | Licensor hereby grants You a worldwide, royalty-free, non-exclusive, 123 | perpetual (for the duration of the applicable copyright) license to 124 | exercise the rights in the Work as stated below: 125 | 126 | a. to Reproduce the Work, to incorporate the Work into one or more 127 | Collections, and to Reproduce the Work as incorporated in the 128 | Collections; 129 | b. to create and Reproduce Adaptations provided that any such Adaptation, 130 | including any translation in any medium, takes reasonable steps to 131 | clearly label, demarcate or otherwise identify that changes were made 132 | to the original Work. For example, a translation could be marked "The 133 | original work was translated from English to Spanish," or a 134 | modification could indicate "The original work has been modified."; 135 | c. to Distribute and Publicly Perform the Work including as incorporated 136 | in Collections; and, 137 | d. to Distribute and Publicly Perform Adaptations. 138 | e. For the avoidance of doubt: 139 | 140 | i. Non-waivable Compulsory License Schemes. In those jurisdictions in 141 | which the right to collect royalties through any statutory or 142 | compulsory licensing scheme cannot be waived, the Licensor 143 | reserves the exclusive right to collect such royalties for any 144 | exercise by You of the rights granted under this License; 145 | ii. Waivable Compulsory License Schemes. In those jurisdictions in 146 | which the right to collect royalties through any statutory or 147 | compulsory licensing scheme can be waived, the Licensor waives the 148 | exclusive right to collect such royalties for any exercise by You 149 | of the rights granted under this License; and, 150 | iii. Voluntary License Schemes. The Licensor waives the right to 151 | collect royalties, whether individually or, in the event that the 152 | Licensor is a member of a collecting society that administers 153 | voluntary licensing schemes, via that society, from any exercise 154 | by You of the rights granted under this License. 155 | 156 | The above rights may be exercised in all media and formats whether now 157 | known or hereafter devised. The above rights include the right to make 158 | such modifications as are technically necessary to exercise the rights in 159 | other media and formats. Subject to Section 8(f), all rights not expressly 160 | granted by Licensor are hereby reserved. 161 | 162 | 4. Restrictions. The license granted in Section 3 above is expressly made 163 | subject to and limited by the following restrictions: 164 | 165 | a. You may Distribute or Publicly Perform the Work only under the terms 166 | of this License. You must include a copy of, or the Uniform Resource 167 | Identifier (URI) for, this License with every copy of the Work You 168 | Distribute or Publicly Perform. 
You may not offer or impose any terms 169 | on the Work that restrict the terms of this License or the ability of 170 | the recipient of the Work to exercise the rights granted to that 171 | recipient under the terms of the License. You may not sublicense the 172 | Work. You must keep intact all notices that refer to this License and 173 | to the disclaimer of warranties with every copy of the Work You 174 | Distribute or Publicly Perform. When You Distribute or Publicly 175 | Perform the Work, You may not impose any effective technological 176 | measures on the Work that restrict the ability of a recipient of the 177 | Work from You to exercise the rights granted to that recipient under 178 | the terms of the License. This Section 4(a) applies to the Work as 179 | incorporated in a Collection, but this does not require the Collection 180 | apart from the Work itself to be made subject to the terms of this 181 | License. If You create a Collection, upon notice from any Licensor You 182 | must, to the extent practicable, remove from the Collection any credit 183 | as required by Section 4(c), as requested. If You create an 184 | Adaptation, upon notice from any Licensor You must, to the extent 185 | practicable, remove from the Adaptation any credit as required by 186 | Section 4(c), as requested. 187 | b. You may Distribute or Publicly Perform an Adaptation only under the 188 | terms of: (i) this License; (ii) a later version of this License with 189 | the same License Elements as this License; (iii) a Creative Commons 190 | jurisdiction license (either this or a later license version) that 191 | contains the same License Elements as this License (e.g., 192 | Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible 193 | License. If you license the Adaptation under one of the licenses 194 | mentioned in (iv), you must comply with the terms of that license. If 195 | you license the Adaptation under the terms of any of the licenses 196 | mentioned in (i), (ii) or (iii) (the "Applicable License"), you must 197 | comply with the terms of the Applicable License generally and the 198 | following provisions: (I) You must include a copy of, or the URI for, 199 | the Applicable License with every copy of each Adaptation You 200 | Distribute or Publicly Perform; (II) You may not offer or impose any 201 | terms on the Adaptation that restrict the terms of the Applicable 202 | License or the ability of the recipient of the Adaptation to exercise 203 | the rights granted to that recipient under the terms of the Applicable 204 | License; (III) You must keep intact all notices that refer to the 205 | Applicable License and to the disclaimer of warranties with every copy 206 | of the Work as included in the Adaptation You Distribute or Publicly 207 | Perform; (IV) when You Distribute or Publicly Perform the Adaptation, 208 | You may not impose any effective technological measures on the 209 | Adaptation that restrict the ability of a recipient of the Adaptation 210 | from You to exercise the rights granted to that recipient under the 211 | terms of the Applicable License. This Section 4(b) applies to the 212 | Adaptation as incorporated in a Collection, but this does not require 213 | the Collection apart from the Adaptation itself to be made subject to 214 | the terms of the Applicable License. 215 | c. 
If You Distribute, or Publicly Perform the Work or any Adaptations or 216 | Collections, You must, unless a request has been made pursuant to 217 | Section 4(a), keep intact all copyright notices for the Work and 218 | provide, reasonable to the medium or means You are utilizing: (i) the 219 | name of the Original Author (or pseudonym, if applicable) if supplied, 220 | and/or if the Original Author and/or Licensor designate another party 221 | or parties (e.g., a sponsor institute, publishing entity, journal) for 222 | attribution ("Attribution Parties") in Licensor's copyright notice, 223 | terms of service or by other reasonable means, the name of such party 224 | or parties; (ii) the title of the Work if supplied; (iii) to the 225 | extent reasonably practicable, the URI, if any, that Licensor 226 | specifies to be associated with the Work, unless such URI does not 227 | refer to the copyright notice or licensing information for the Work; 228 | and (iv) , consistent with Ssection 3(b), in the case of an 229 | Adaptation, a credit identifying the use of the Work in the Adaptation 230 | (e.g., "French translation of the Work by Original Author," or 231 | "Screenplay based on original Work by Original Author"). The credit 232 | required by this Section 4(c) may be implemented in any reasonable 233 | manner; provided, however, that in the case of a Adaptation or 234 | Collection, at a minimum such credit will appear, if a credit for all 235 | contributing authors of the Adaptation or Collection appears, then as 236 | part of these credits and in a manner at least as prominent as the 237 | credits for the other contributing authors. For the avoidance of 238 | doubt, You may only use the credit required by this Section for the 239 | purpose of attribution in the manner set out above and, by exercising 240 | Your rights under this License, You may not implicitly or explicitly 241 | assert or imply any connection with, sponsorship or endorsement by the 242 | Original Author, Licensor and/or Attribution Parties, as appropriate, 243 | of You or Your use of the Work, without the separate, express prior 244 | written permission of the Original Author, Licensor and/or Attribution 245 | Parties. 246 | d. Except as otherwise agreed in writing by the Licensor or as may be 247 | otherwise permitted by applicable law, if You Reproduce, Distribute or 248 | Publicly Perform the Work either by itself or as part of any 249 | Adaptations or Collections, You must not distort, mutilate, modify or 250 | take other derogatory action in relation to the Work which would be 251 | prejudicial to the Original Author's honor or reputation. Licensor 252 | agrees that in those jurisdictions (e.g. Japan), in which any exercise 253 | of the right granted in Section 3(b) of this License (the right to 254 | make Adaptations) would be deemed to be a distortion, mutilation, 255 | modification or other derogatory action prejudicial to the Original 256 | Author's honor and reputation, the Licensor will waive or not assert, 257 | as appropriate, this Section, to the fullest extent permitted by the 258 | applicable national law, to enable You to reasonably exercise Your 259 | right under Section 3(b) of this License (right to make Adaptations) 260 | but not otherwise. 261 | 262 | 5. 
Representations, Warranties and Disclaimer 263 | 264 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR 265 | OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY 266 | KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, 267 | INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, 268 | FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF 269 | LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, 270 | WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION 271 | OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 272 | 273 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE 274 | LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR 275 | ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES 276 | ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS 277 | BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 278 | 279 | 7. Termination 280 | 281 | a. This License and the rights granted hereunder will terminate 282 | automatically upon any breach by You of the terms of this License. 283 | Individuals or entities who have received Adaptations or Collections 284 | from You under this License, however, will not have their licenses 285 | terminated provided such individuals or entities remain in full 286 | compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will 287 | survive any termination of this License. 288 | b. Subject to the above terms and conditions, the license granted here is 289 | perpetual (for the duration of the applicable copyright in the Work). 290 | Notwithstanding the above, Licensor reserves the right to release the 291 | Work under different license terms or to stop distributing the Work at 292 | any time; provided, however that any such election will not serve to 293 | withdraw this License (or any other license that has been, or is 294 | required to be, granted under the terms of this License), and this 295 | License will continue in full force and effect unless terminated as 296 | stated above. 297 | 298 | 8. Miscellaneous 299 | 300 | a. Each time You Distribute or Publicly Perform the Work or a Collection, 301 | the Licensor offers to the recipient a license to the Work on the same 302 | terms and conditions as the license granted to You under this License. 303 | b. Each time You Distribute or Publicly Perform an Adaptation, Licensor 304 | offers to the recipient a license to the original Work on the same 305 | terms and conditions as the license granted to You under this License. 306 | c. If any provision of this License is invalid or unenforceable under 307 | applicable law, it shall not affect the validity or enforceability of 308 | the remainder of the terms of this License, and without further action 309 | by the parties to this agreement, such provision shall be reformed to 310 | the minimum extent necessary to make such provision valid and 311 | enforceable. 312 | d. No term or provision of this License shall be deemed waived and no 313 | breach consented to unless such waiver or consent shall be in writing 314 | and signed by the party to be charged with such waiver or consent. 315 | e. This License constitutes the entire agreement between the parties with 316 | respect to the Work licensed here. There are no understandings, 317 | agreements or representations with respect to the Work not specified 318 | here. 
Licensor shall not be bound by any additional provisions that 319 | may appear in any communication from You. This License may not be 320 | modified without the mutual written agreement of the Licensor and You. 321 | f. The rights granted under, and the subject matter referenced, in this 322 | License were drafted utilizing the terminology of the Berne Convention 323 | for the Protection of Literary and Artistic Works (as amended on 324 | September 28, 1979), the Rome Convention of 1961, the WIPO Copyright 325 | Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 326 | and the Universal Copyright Convention (as revised on July 24, 1971). 327 | These rights and subject matter take effect in the relevant 328 | jurisdiction in which the License terms are sought to be enforced 329 | according to the corresponding provisions of the implementation of 330 | those treaty provisions in the applicable national law. If the 331 | standard suite of rights granted under applicable copyright law 332 | includes additional rights not granted under this License, such 333 | additional rights are deemed to be included in the License; this 334 | License is not intended to restrict the license of any rights under 335 | applicable law. 336 | 337 | 338 | Creative Commons Notice 339 | 340 | Creative Commons is not a party to this License, and makes no warranty 341 | whatsoever in connection with the Work. Creative Commons will not be 342 | liable to You or any party on any legal theory for any damages 343 | whatsoever, including without limitation any general, special, 344 | incidental or consequential damages arising in connection to this 345 | license. Notwithstanding the foregoing two (2) sentences, if Creative 346 | Commons has expressly identified itself as the Licensor hereunder, it 347 | shall have all rights and obligations of Licensor. 348 | 349 | Except for the limited purpose of indicating to the public that the 350 | Work is licensed under the CCPL, Creative Commons does not authorize 351 | the use by either party of the trademark "Creative Commons" or any 352 | related trademark or logo of Creative Commons without the prior 353 | written consent of Creative Commons. Any permitted use will be in 354 | compliance with Creative Commons' then-current trademark usage 355 | guidelines, as may be published on its website or otherwise made 356 | available upon request from time to time. For the avoidance of doubt, 357 | this trademark restriction does not form part of the License. 358 | 359 | Creative Commons may be contacted at https://creativecommons.org/. 360 | -------------------------------------------------------------------------------- /models/glove/ATTRIBUTION.md: -------------------------------------------------------------------------------- 1 | # FastText Word Vectors Attribution 2 | 3 | This directory contains pre-trained word vectors using Wikipedia 2014 + Gigaword 5 from Stanford NLP. 
4 | 5 | Original source: https://nlp.stanford.edu/projects/glove/ 6 | 7 | ## Citation 8 | 9 | If you use these word vectors, please cite the following paper: 10 | 11 | 12 | @inproceedings{pennington2014glove, 13 | title={GloVe: Global Vectors for Word Representation}, 14 | author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.}, 15 | booktitle={Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 16 | year={2014} 17 | } 18 | text 19 | 20 | ## License 21 | 22 | These word vectors are distributed under the Public Domain Dedication and License v1.0. See the LICENSE file in this directory for the full license text. 23 | 24 | ## Changes Made 25 | 26 | The word vectors have been processed to fit the specific needs of this project. The modifications include converting them to binary format. 27 | -------------------------------------------------------------------------------- /models/glove/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Open Data Commons Public Domain Dedication and License (PDDL) v1.0 — Open Data Commons: legal tools for open data 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 |
21 | 22 | Contents 23 | 24 | 258 |
259 |
260 | 261 |

Open Data Commons Public Domain Dedication and License (PDDL) v1.0

262 |

Disclaimer

Open Data Commons is not a law firm and does not provide legal services of any kind.

263 |

Open Data Commons has no formal relationship with you. Your receipt of this document does not create any kind of agent-client relationship. Please seek the advice of a suitably qualified legal professional licensed to practice in your jurisdiction before using this document.

264 |

No warranties and disclaimer of any damages.

265 |

This information is provided ‘as is‘, and this site makes no warranties on the information provided. Any damages resulting from its use are disclaimed.

266 |

Read the full disclaimer. A plain language summary of the Public Domain Dedication and License is available as well as a plain text version.

267 |
268 |

Public Domain Dedication and License (PDDL)

Preamble

The Open Data Commons – Public Domain Dedication and Licence is a document intended to allow you to freely share, modify, and use this work for any purpose and without any restrictions. This licence is intended for use on databases or their contents (“data”), either together or individually.

269 |

Many databases are covered by copyright. Some jurisdictions, mainly in Europe, have specific special rights that cover databases called the “sui generis” database right. Both of these sets of rights, as well as other legal rights used to protect databases and data, can create uncertainty or practical difficulty for those wishing to share databases and their underlying data but retain a limited amount of rights under a “some rights reserved” approach to licensing as outlined in the Science Commons Protocol for Implementing Open Access Data. As a result, this waiver and licence tries to the fullest extent possible to eliminate or fully license any rights that cover this database and data. Any Community Norms or similar statements of use of the database or data do not form a part of this document, and do not act as a contract for access or other terms of use for the database or data.

270 |

The position of the recipient of the work

Because this document places the database and its contents in or as close as possible within the public domain, there are no restrictions or requirements placed on the recipient by this document. Recipients may use this work commercially, use technical protection measures, combine this data or database with other databases or data, and share their changes and additions or keep them secret. It is not a requirement that recipients provide further users with a copy of this licence or attribute the original creator of the data or database as a source. The goal is to eliminate restrictions held by the original creator of the data and database on the use of it by others.

271 |

The position of the dedicator of the work

Copyright law, as with most other law under the banner of “intellectual property”, is inherently national law. This means that there exists several differences in how copyright and other IP rights can be relinquished, waived or licensed in the many legal jurisdictions of the world. This is despite much harmonisation of minimum levels of protection. The internet and other communication technologies span these many disparate legal jurisdictions and thus pose special difficulties for a document relinquishing and waiving intellectual property rights, including copyright and database rights, for use by the global community. Because of this feature of intellectual property law, this document first relinquishes the rights and waives the relevant rights and claims. It then goes on to license these same rights for jurisdictions or areas of law that may make it difficult to relinquish or waive rights or claims.

272 |

The purpose of this document is to enable rightsholders to place their work into the public domain. Unlike licences for free and open source software, free cultural works, or open content licences, rightsholders will not be able to “dual license” their work by releasing the same work under different licences. This is because they have allowed anyone to use the work in whatever way they choose. Rightsholders therefore can’t re-license it under copyright or database rights on different terms because they have nothing left to license. Doing so creates truly accessible data to build rich applications and advance the progress of science and the arts.

273 |

This document can cover either or both of the database and its contents (the data). Because databases can have a wide variety of content – not just factual data – rightsholders should use the Open Data Commons – Public Domain Dedication & Licence for an entire database and its contents only if everything can be placed under the terms of this document. Because even factual data can sometimes have intellectual property rights, rightsholders should use this licence to cover both the database and its factual data when making material available under this document; even if it is likely that the data would not be covered by copyright or database rights.

274 |

Rightsholders can also use this document to cover any copyright or database rights claims over only a database, and leave the contents to be covered by other licences or documents. They can do this because this document refers to the “Work”, which can be either – or both – the database and its contents. As a result, rightsholders need to clearly state what they are dedicating under this document when they dedicate it.

275 |

Just like any licence or other document dealing with intellectual property, rightsholders should be aware that one can only license what one owns. Please ensure that the rights have been cleared to make this material available under this document.

276 |

This document permanently and irrevocably makes the Work available to the public for any use of any kind, and it should not be used unless the rightsholder is prepared for this to happen.

277 |

Part I: Introduction

The Rightsholder (the Person holding rights or claims over the Work) agrees as follows:

278 |

1.0 Definitions of Capitalised Words

“Copyright” – Includes rights under copyright and under neighbouring rights and similarly related sets of rights under the law of the relevant jurisdiction under Section 6.4.

279 |

“Data” – The contents of the Database, which includes the information, independent works, or other material collected into the Database offered under the terms of this Document.

280 |

“Database” – A collection of Data arranged in a systematic or methodical way and individually accessible by electronic or other means offered under the terms of this Document.

281 |

“Database Right” – Means rights over Data resulting from the Chapter III (“sui generis”) rights in the Database Directive (Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases) and any future updates as well as any similar rights available in the relevant jurisdiction under Section 6.4.

282 |

“Document” – means this relinquishment and waiver of rights and claims and back up licence agreement.

283 |

“Person” – Means a natural or legal person or a body of persons corporate or incorporate.

284 |

“Use” – As a verb, means doing any act that is restricted by Copyright or Database Rights whether in the original medium or any other; and includes modifying the Work as may be technically necessary to use it in a different mode or format. This includes the right to sublicense the Work.

285 |

“Work” – Means either or both of the Database and Data offered under the terms of this Document.

286 |

“You” – the Person acquiring rights under the licence elements of this Document.

287 |

Words in the singular include the plural and vice versa.

288 |

2.0 What this document covers

2.1. Legal effect of this Document. This Document is:

289 |

a. A dedication to the public domain and waiver of Copyright and Database Rights over the Work; and

290 |

b. A licence of Copyright and Database Rights over the Work in jurisdictions that do not allow for relinquishment or waiver.

291 |

2.2. Legal rights covered.

292 |

a. Copyright. Any copyright or neighbouring rights in the Work. Copyright law varies between jurisdictions, but is likely to cover: the Database model or schema, which is the structure, arrangement, and organisation of the Database, and can also include the Database tables and table indexes; the data entry and output sheets; and the Field names of Data stored in the Database. Copyright may also cover the Data depending on the jurisdiction and type of Data; and

293 |

b. Database Rights. Database Rights only extend to the extraction and re-utilisation of the whole or a substantial part of the Data. Database Rights can apply even when there is no copyright over the Database. Database Rights can also apply when the Data is removed from the Database and is selected and arranged in a way that would not infringe any applicable copyright.

294 |

2.2 Rights not covered.

295 |

a. This Document does not apply to computer programs used in the making or operation of the Database;

296 |

b. This Document does not cover any patents over the Data or the Database. Please see Section 4.2 later in this Document for further details; and

297 |

c. This Document does not cover any trade marks associated with the Database. Please see Section 4.3 later in this Document for further details.

298 |

Users of this Database are cautioned that they may have to clear other rights or consult other licences.

299 |

2.3 Facts are free. The Rightsholder takes the position that factual information is not covered by Copyright. This Document however covers the Work in jurisdictions that may protect the factual information in the Work by Copyright, and to cover any information protected by Copyright that is contained in the Work.

300 |

Part II: Dedication to the public domain

3.0 Dedication, waiver, and licence of Copyright and Database Rights

301 |

3.1 Dedication of Copyright and Database Rights to the public domain. The Rightsholder by using this Document, dedicates the Work to the public domain for the benefit of the public and relinquishes all rights in Copyright and Database Rights over the Work.

302 |

a. The Rightsholder realises that once these rights are relinquished, that the Rightsholder has no further rights in Copyright and Database Rights over the Work, and that the Work is free and open for others to Use.

303 |

b. The Rightsholder intends for their relinquishment to cover all present and future rights in the Work under Copyright and Database Rights, whether they are vested or contingent rights, and that this relinquishment of rights covers all their heirs and successors.

304 |

The above relinquishment of rights applies worldwide and includes media and formats now known or created in the future.

305 |

3.2 Waiver of rights and claims in Copyright and Database Rights when Section 3.1 dedication inapplicable. If the dedication in Section 3.1 does not apply in the relevant jurisdiction under Section 6.4, the Rightsholder waives any rights and claims that the Rightsholder may have or acquire in the future over the Work in:

306 |

a. Copyright; and

307 |

b. Database Rights.

308 |

To the extent possible in the relevant jurisdiction, the above waiver of rights and claims applies worldwide and includes media and formats now known or created in the future. The Rightsholder agrees not to assert the above rights and waives the right to enforce them over the Work.

309 |

3.3 Licence of Copyright and Database Rights when Sections 3.1 and 3.2 inapplicable. If the dedication and waiver in Sections 3.1 and 3.2 does not apply in the relevant jurisdiction under Section 6.4, the Rightsholder and You agree as follows:

310 |

a. The Licensor grants to You a worldwide, royalty-free, non-exclusive, licence to Use the Work for the duration of any applicable Copyright and Database Rights. These rights explicitly include commercial use, and do not exclude any field of endeavour. To the extent possible in the relevant jurisdiction, these rights may be exercised in all media and formats whether now known or created in the future.

311 |

3.4 Moral rights. This section covers moral rights, including the right to be identified as the author of the Work or to object to treatment that would otherwise prejudice the author’s honour and reputation, or any other derogatory treatment:

312 |

a. For jurisdictions allowing waiver of moral rights, Licensor waives all moral rights that Licensor may have in the Work to the fullest extent possible by the law of the relevant jurisdiction under Section 6.4;

313 |

b. If waiver of moral rights under Section 3.4 a in the relevant jurisdiction is not possible, Licensor agrees not to assert any moral rights over the Work and waives all claims in moral rights to the fullest extent possible by the law of the relevant jurisdiction under Section 6.4; and

314 |

c. For jurisdictions not allowing waiver or an agreement not to assert moral rights under Section 3.4 a and b, the author may retain their moral rights over the copyrighted aspects of the Work.

315 |

Please note that some jurisdictions do not allow for the waiver of moral rights, and so moral rights may still subsist over the work in some jurisdictions.

316 |

4.0 Relationship to other rights

4.1 No other contractual conditions. The Rightsholder makes this Work available to You without any other contractual obligations, either express or implied. Any Community Norms statement associated with the Work is not a contract and does not form part of this Document.

317 |

4.2 Relationship to patents. This Document does not grant You a licence for any patents that the Rightsholder may own. Users of this Database are cautioned that they may have to clear other rights or consult other licences.

318 |

4.3 Relationship to trade marks. This Document does not grant You a licence for any trade marks that the Rightsholder may own or that the Rightsholder may use to cover the Work. Users of this Database are cautioned that they may have to clear other rights or consult other licences.

319 |

Part III: General provisions

5.0 Warranties, disclaimer, and limitation of liability

5.1 The Work is provided by the Rightsholder “as is” and without any warranty of any kind, either express or implied, whether of title, of accuracy or completeness, of the presence of absence of errors, of fitness for purpose, or otherwise. Some jurisdictions do not allow the exclusion of implied warranties, so this exclusion may not apply to You.

320 |

5.2 Subject to any liability that may not be excluded or limited by law, the Rightsholder is not liable for, and expressly excludes, all liability for loss or damage however and whenever caused to anyone by any use under this Document, whether by You or by anyone else, and whether caused by any fault on the part of the Rightsholder or not. This exclusion of liability includes, but is not limited to, any special, incidental, consequential, punitive, or exemplary damages. This exclusion applies even if the Rightsholder has been advised of the possibility of such damages.

321 |

5.3 If liability may not be excluded by law, it is limited to actual and direct financial loss to the extent it is caused by proved negligence on the part of the Rightsholder.

322 |

6.0 General

6.1 If any provision of this Document is held to be invalid or unenforceable, that must not affect the validity or enforceability of the remainder of the terms of this Document.

323 |

6.2 This Document is the entire agreement between the parties with respect to the Work covered here. It replaces any earlier understandings, agreements or representations with respect to the Work not specified here.

324 |

6.3 This Document does not affect any rights that You or anyone else may independently have under any applicable law to make any use of this Work, including (for jurisdictions where this Document is a licence) fair dealing, fair use, database exceptions, or any other legally recognised limitation or exception to infringement of copyright or other applicable laws.

325 |

6.4 This Document takes effect in the relevant jurisdiction in which the Document terms are sought to be enforced. If the rights waived or granted under applicable law in the relevant jurisdiction includes additional rights not waived or granted under this Document, these additional rights are included in this Document in order to meet the intent of this Document.

326 | 327 | 328 |
329 |
330 | 384 |
385 | 435 | 436 | 437 | 438 | 447 | 448 | 449 | 450 | -------------------------------------------------------------------------------- /models/glove/glove.6B.300d.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f21541ff9116ab6732ea04a978d74984ffb5a9e5cb4f26e833d11cd640f30222 3 | size 483356485 4 | -------------------------------------------------------------------------------- /models/googlenews-slim/ATTRIBUTION.md: -------------------------------------------------------------------------------- 1 | # Word2Vec Slim Model Attribution 2 | 3 | This directory contains a mirrored version of the word2vec-slim model, which is derived from the Google News dataset (about 100 billion words). 4 | 5 | Original source: https://github.com/eyaler/word2vec-slim/ 6 | 7 | The word2vec model used here is a slimmed-down version of the original Google News model, created by Eyal Gruss. 8 | 9 | ## Original Attribution 10 | 11 | Pre-trained vectors trained on part of Google News dataset (about 100 billion words). 12 | Model contains 300-dimensional vectors for 3 million words and phrases. 13 | 14 | The original model was created by Mikolov et al. and is available here: 15 | https://code.google.com/archive/p/word2vec/ 16 | 17 | ## License 18 | 19 | This model is distributed under the Apache License 2.0. See the LICENSE file in this directory for the full license text. 20 | 21 | ## Changes Made 22 | 23 | Quantized 8 bit int models are derived from the original 32 bit float model. -------------------------------------------------------------------------------- /models/googlenews-slim/GoogleNews-vectors-negative300-SLIM.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:046e0921bcb665f50d646b0963fcef8c5abb5f830d0daba8f686e1dffd6ad832 3 | size 362017275 4 | -------------------------------------------------------------------------------- /models/googlenews-slim/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /modules/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package config provides a simple configuration loader for the semantic-grep tool. 3 | Contains a FindConfigFile function that searches for a configuration file 4 | in a few standard locations, and a LoadConfig function that reads the 5 | configuration file and returns a Config struct. 6 | */ 7 | 8 | package config 9 | 10 | import ( 11 | "encoding/json" 12 | "fmt" 13 | "os" 14 | "path/filepath" 15 | ) 16 | 17 | const DefaultConfigPath = "config.json" 18 | 19 | type Config struct { 20 | ModelPath string `json:"model_path"` 21 | } 22 | 23 | func FindConfigFile() string { 24 | cwd, err := os.Getwd() 25 | if err != nil { 26 | fmt.Fprintf(os.Stderr, "Warning: Unable to determine current directory: %v\n", err) 27 | cwd = "." 28 | } 29 | 30 | locations := []string{ 31 | filepath.Join(cwd, "config.json"), 32 | DefaultConfigPath, 33 | os.ExpandEnv("$HOME/.config/semantic-grep/config.json"), 34 | "/etc/semantic-grep/config.json", 35 | } 36 | 37 | for _, location := range locations { 38 | if _, err := os.Stat(location); err == nil { 39 | return location 40 | } 41 | } 42 | 43 | return "" 44 | } 45 | 46 | func LoadConfig(configPath string) (*Config, error) { 47 | file, err := os.Open(configPath) 48 | if err != nil { 49 | return nil, err 50 | } 51 | defer file.Close() 52 | 53 | var config Config 54 | decoder := json.NewDecoder(file) 55 | if err := decoder.Decode(&config); err != nil { 56 | return nil, err 57 | } 58 | 59 | return &config, nil 60 | } 61 | -------------------------------------------------------------------------------- /modules/model/model.go: -------------------------------------------------------------------------------- 1 | /* VectorModel interface 2 | 32 bit and 8 bit model structs 3 | LoadModel and GetEmbedding methods for both structs 4 | LoadVectorModel function to load either 32 bit or 8 bit model based on file extension 5 | */ 6 | 7 | package model 8 | 9 | import ( 10 | "bufio" 11 | "encoding/binary" 12 | "fmt" 13 | "io" 14 | "os" 15 | "strings" 16 | ) 17 | 18 | // VectorModel interface defines the methods that all vector models must implement 19 | type VectorModel interface { 20 | LoadModel(filename string) error 21 | GetEmbedding(token string) (interface{}, error) 22 | } 23 | 24 | // VecModel32bit represents a 32-bit floating point Word2Vec model 25 | type VecModel32bit struct { 26 | Vectors map[string][]float32 27 | Size int 28 | } 29 | 30 | // LoadModel loads a 32-bit floating point Word2Vec model from a file 31 | // Attempt to validate the header and check for unexpected data 32 | // at the end of each record and at the end of the file 33 | func (m *VecModel32bit) LoadModel(filename string) error { 34 | file, err := os.Open(filename) 35 | if err != nil { 36 | return fmt.Errorf("failed to open file: %v", err) 37 | } 38 | defer file.Close() 39 | 40 | reader := bufio.NewReader(file) 41 | 42 | // Read header 43 | var vocabSize, vectorSize int 44 | _, err = fmt.Fscanf(reader, "%d %d\n", &vocabSize, &vectorSize) 45 | if err != nil { 46 | return fmt.Errorf("failed to read header: %v\nCheck that you have a valid model file", err) 47 | } 48 | 49 | // Validate header 50 | if vocabSize <= 0 || vectorSize <= 0 { 51 | return fmt.Errorf("invalid header: vocabSize=%d, vectorSize=%d\nCheck that you have a valid model file", vocabSize, vectorSize) 52 | } 53 | 54 | m.Vectors = 
make(map[string][]float32, vocabSize) 55 | m.Size = vectorSize 56 | 57 | for i := 0; i < vocabSize; i++ { 58 | word, err := reader.ReadString(' ') 59 | if err != nil { 60 | return fmt.Errorf("failed to read word: %v", err) 61 | } 62 | word = strings.TrimSpace(word) 63 | 64 | vector := make([]float32, vectorSize) 65 | for j := 0; j < vectorSize; j++ { 66 | err := binary.Read(reader, binary.LittleEndian, &vector[j]) 67 | if err != nil { 68 | return fmt.Errorf("failed to read vector: %v", err) 69 | } 70 | } 71 | 72 | // Check if we've reached the end of the record 73 | nextByte, err := reader.Peek(1) 74 | if err != nil && err != io.EOF { 75 | return fmt.Errorf("unexpected error reading next byte: %v", err) 76 | } 77 | if len(nextByte) > 0 && nextByte[0] == '\n' { 78 | reader.ReadByte() // consume the newline 79 | } 80 | 81 | m.Vectors[word] = vector 82 | } 83 | 84 | // Check if we've reached the end of the file 85 | _, err = reader.ReadByte() 86 | if err != io.EOF { 87 | return fmt.Errorf("unexpected data at end of file.\nCheck that you have a valid model file") 88 | } 89 | 90 | return nil 91 | } 92 | 93 | // GetEmbedding returns the vector embedding of a token for the 32-bit model 94 | func (m *VecModel32bit) GetEmbedding(token string) (interface{}, error) { 95 | vec, ok := m.Vectors[token] 96 | if !ok { 97 | return nil, fmt.Errorf("word not found in model: %s", token) 98 | } 99 | return vec, nil 100 | } 101 | 102 | // VecModel8bit represents an 8-bit integer quantized Word2Vec model 103 | type VecModel8bit struct { 104 | Vectors map[string][]int8 105 | Min float32 106 | Max float32 107 | Size int 108 | } 109 | 110 | // LoadModel loads an 8-bit integer quantized Word2Vec model from a file 111 | func (m *VecModel8bit) LoadModel(filename string) error { 112 | file, err := os.Open(filename) 113 | if err != nil { 114 | return fmt.Errorf("failed to open file: %v", err) 115 | } 116 | defer file.Close() 117 | 118 | var vocabSize, vectorSize int32 119 | if err := binary.Read(file, binary.LittleEndian, &vocabSize); err != nil { 120 | return fmt.Errorf("failed to read vocab size: %v", err) 121 | } 122 | if err := binary.Read(file, binary.LittleEndian, &vectorSize); err != nil { 123 | return fmt.Errorf("failed to read vector size: %v", err) 124 | } 125 | m.Size = int(vectorSize) 126 | 127 | if err := binary.Read(file, binary.LittleEndian, &m.Min); err != nil { 128 | return fmt.Errorf("failed to read min value: %v", err) 129 | } 130 | if err := binary.Read(file, binary.LittleEndian, &m.Max); err != nil { 131 | return fmt.Errorf("failed to read max value: %v", err) 132 | } 133 | 134 | m.Vectors = make(map[string][]int8, vocabSize) 135 | 136 | for i := 0; i < int(vocabSize); i++ { 137 | word, err := readNullTerminatedString(file) 138 | if err != nil { 139 | return fmt.Errorf("failed to read word: %v", err) 140 | } 141 | 142 | vector := make([]int8, vectorSize) 143 | if err := binary.Read(file, binary.LittleEndian, &vector); err != nil { 144 | return fmt.Errorf("failed to read vector: %v", err) 145 | } 146 | 147 | m.Vectors[word] = vector 148 | } 149 | 150 | return nil 151 | } 152 | 153 | // GetEmbedding returns the vector embedding of a token for the 8-bit quantized model 154 | func (m *VecModel8bit) GetEmbedding(token string) (interface{}, error) { 155 | vec, ok := m.Vectors[token] 156 | if !ok { 157 | return nil, fmt.Errorf("word not found in model: %s", token) 158 | } 159 | return vec, nil 160 | } 161 | 162 | // Helper function to read null-terminated strings 163 | func readNullTerminatedString(reader 
io.Reader) (string, error) { 164 | var bytes []byte 165 | for { 166 | var b [1]byte 167 | _, err := reader.Read(b[:]) 168 | if err != nil { 169 | return "", err 170 | } 171 | if b[0] == 0 { 172 | break 173 | } 174 | bytes = append(bytes, b[0]) 175 | } 176 | return string(bytes), nil 177 | } 178 | 179 | // LoadVectorModel loads either a 32-bit or 8-bit model based on the file extension 180 | func LoadVectorModel(filename string) (VectorModel, error) { 181 | var model VectorModel 182 | 183 | if strings.HasSuffix(filename, ".8int.bin") { // check the more specific suffix first, since ".8int.bin" also ends in ".bin" 184 | model = &VecModel8bit{} 185 | } else if strings.HasSuffix(filename, ".bin") { 186 | model = &VecModel32bit{} 187 | } else { 188 | return nil, fmt.Errorf("unsupported file format") 189 | } 190 | 191 | err := model.LoadModel(filename) 192 | if err != nil { 193 | return nil, err 194 | } 195 | 196 | return model, nil 197 | } 198 | -------------------------------------------------------------------------------- /modules/processor/processor.go: -------------------------------------------------------------------------------- 1 | // Package processor provides functions to process text line by line, 2 | // performing semantic searches using Word2Vec models. 3 | package processor 4 | 5 | import ( 6 | "bufio" 7 | "fmt" 8 | "os" 9 | "strings" 10 | 11 | "github.com/arunsupe/semantic-grep/modules/model" 12 | "github.com/arunsupe/semantic-grep/modules/similarity" 13 | "github.com/arunsupe/semantic-grep/modules/utils" 14 | 15 | "github.com/clipperhouse/uax29/words" 16 | ) 17 | 18 | // ProcessLineByLine processes an input file line by line, performing semantic searches 19 | // based on the provided queries and Word2Vec model. It supports various options for 20 | // context lines, case sensitivity, and output formatting. 21 | // 22 | // queries: List of query words to search for. 23 | // w2vModel: The Word2Vec model used for semantic matching. 24 | // similarityCache: Cache for storing similarity calculations. 25 | // similarityThreshold: Threshold above which a match is considered similar. 26 | // contextBefore: Number of lines to include before a matching line. 27 | // contextAfter: Number of lines to include after a matching line. 28 | // input: The input file to process. 29 | // printLineNumbers: Whether to print line numbers in the output. 30 | // ignoreCase: Whether to ignore case when matching words. 31 | // outputOnlyMatching: Whether to output only the matching words. 32 | // outputOnlyLines: Whether to output only the lines that contain matches.
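//
// Example: a minimal usage sketch (not a prescribed workflow; the model path,
// query word, and threshold are placeholder values and error handling is
// abbreviated):
//
//	m, err := model.LoadVectorModel("path/to/model.bin")
//	if err != nil {
//		panic(err)
//	}
//	cache := similarity.NewSimilarityCache()
//	// Search stdin for words similar to "death" at a 0.55 threshold, with two
//	// lines of context on each side and printed line numbers.
//	ProcessLineByLine([]string{"death"}, m, cache, 0.55, 2, 2, os.Stdin,
//		true, false, false, false)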
33 | func ProcessLineByLine(queries []string, w2vModel model.VectorModel, similarityCache similarity.SimilarityCache, 34 | similarityThreshold float64, contextBefore, contextAfter int, input *os.File, 35 | printLineNumbers, ignoreCase, outputOnlyMatching, outputOnlyLines bool) { 36 | 37 | // Prepare query vectors 38 | queryVectors := make(map[string]interface{}) 39 | queryInModel := make(map[string]bool) 40 | 41 | for _, query := range queries { 42 | var queryTokenToCheck string 43 | if ignoreCase { 44 | queryTokenToCheck = strings.ToLower(query) 45 | } else { 46 | queryTokenToCheck = query 47 | } 48 | 49 | queryVector, err := w2vModel.GetEmbedding(queryTokenToCheck) 50 | if err != nil { 51 | fmt.Fprintf(os.Stderr, "Warning: %v\n", err) 52 | queryInModel[queryTokenToCheck] = false 53 | } else { 54 | switch queryVector.(type) { 55 | case []float32, []int8: 56 | queryVectors[queryTokenToCheck] = queryVector 57 | queryInModel[queryTokenToCheck] = true 58 | default: 59 | fmt.Fprintf(os.Stderr, "Warning: Unsupported vector type for query: %s\n", queryTokenToCheck) 60 | queryInModel[queryTokenToCheck] = false 61 | } 62 | } 63 | } 64 | 65 | scanner := bufio.NewScanner(input) 66 | lineNumber := 0 67 | var contextBuffer []string 68 | var contextLineNumbers []int 69 | 70 | // Process each line 71 | for scanner.Scan() { 72 | line := scanner.Text() 73 | lineNumber++ 74 | matched := false 75 | var highlightedLine string 76 | var similarityScore float64 77 | var matchSimilarityScore float64 78 | 79 | // Tokenize and check each token 80 | tokens := words.NewSegmenter(scanner.Bytes()) 81 | for tokens.Next() { 82 | token := tokens.Text() 83 | var tokenToCheck string 84 | if ignoreCase { 85 | tokenToCheck = strings.ToLower(token) 86 | } else { 87 | tokenToCheck = token 88 | } 89 | 90 | for queryTokenToCheck, queryVector := range queryVectors { 91 | // Check if tokenToCheck is exactly equal to queryTokenToCheck 92 | if tokenToCheck == queryTokenToCheck { 93 | similarityScore = 1.0 94 | matchSimilarityScore = similarityScore 95 | matched = true 96 | highlightedLine = strings.Replace(line, token, utils.ColorText(token, "red"), -1) 97 | } else if queryInModel[queryTokenToCheck] { 98 | // Only perform similarity check if query is in the model 99 | tokenVector, err := w2vModel.GetEmbedding(tokenToCheck) 100 | if err == nil { 101 | // Calculate similarity and check threshold only if token is in model 102 | similarityScore = similarityCache.MemoizedCalculateSimilarity(queryTokenToCheck, tokenToCheck, queryVector, tokenVector) 103 | if similarityScore > similarityThreshold { 104 | matched = true 105 | highlightedLine = strings.Replace(line, token, utils.ColorText(token, "red"), -1) 106 | matchSimilarityScore = similarityScore 107 | } 108 | } 109 | } 110 | 111 | if matched && outputOnlyMatching { 112 | fmt.Println(token) 113 | matched = false // Stop after first match if -o is set 114 | } 115 | } 116 | } 117 | 118 | // Handle matched line 119 | if matched { 120 | if outputOnlyMatching { 121 | // Already printed in the loop above 122 | } else if outputOnlyLines { 123 | utils.PrintLine(highlightedLine, lineNumber, printLineNumbers) 124 | } else { 125 | fmt.Printf("Similarity: %.4f\n", matchSimilarityScore) 126 | // Print the context lines before the match 127 | for i, ctxLine := range contextBuffer { 128 | utils.PrintLine(ctxLine, contextLineNumbers[i], printLineNumbers) 129 | } 130 | 131 | // Print the matched line with highlighted token 132 | utils.PrintLine(highlightedLine, lineNumber, printLineNumbers) 133 | 134 | 
// Print the context lines after the match 135 | for i := 0; i < contextAfter && scanner.Scan(); i++ { 136 | lineNumber++ 137 | utils.PrintLine(scanner.Text(), lineNumber, printLineNumbers) 138 | } 139 | 140 | fmt.Println("--") 141 | } 142 | 143 | // Clear the context buffer after printing 144 | contextBuffer = nil 145 | contextLineNumbers = nil 146 | } else { 147 | // Update the context buffer with the current line if no match is found 148 | if contextBefore > 0 && !outputOnlyMatching && !outputOnlyLines { 149 | contextBuffer = append(contextBuffer, line) 150 | contextLineNumbers = append(contextLineNumbers, lineNumber) 151 | // Ensure the context buffer does not exceed the specified number of lines 152 | if len(contextBuffer) > contextBefore { 153 | contextBuffer = contextBuffer[1:] 154 | contextLineNumbers = contextLineNumbers[1:] 155 | } 156 | } 157 | } 158 | } 159 | 160 | // Check for scanner errors 161 | if err := scanner.Err(); err != nil { 162 | fmt.Fprintf(os.Stderr, "Error reading input: %v\n", err) 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /modules/similarity/similarity.go: -------------------------------------------------------------------------------- 1 | // Package similarity provides functions and types for calculating and caching 2 | // the similarity between word vectors using cosine similarity. 3 | package similarity 4 | 5 | import ( 6 | "math" 7 | ) 8 | 9 | // SimilarityCache is an interface for caching and calculating the similarity 10 | // between word vectors. 11 | type SimilarityCache interface { 12 | // MemoizedCalculateSimilarity calculates the similarity between two word vectors 13 | // and caches the result to avoid redundant calculations. 14 | MemoizedCalculateSimilarity(queryToken, token string, queryVector, tokenVector interface{}) float64 15 | } 16 | 17 | // Cache implements the SimilarityCache interface and provides a simple in-memory cache. 18 | type Cache struct { 19 | cache map[string]float64 20 | } 21 | 22 | // NewSimilarityCache creates a new Cache instance for storing similarity calculations. 23 | func NewSimilarityCache() *Cache { 24 | return &Cache{ 25 | cache: make(map[string]float64), 26 | } 27 | } 28 | 29 | // MemoizedCalculateSimilarity calculates the similarity between two word vectors 30 | // and caches the result. It supports both []float32 and []int8 vector types. 
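//
// Example: a minimal sketch ("night" and "day" are placeholder tokens, and m is
// assumed to be a model.VectorModel loaded elsewhere):
//
//	cache := NewSimilarityCache()
//	qv, _ := m.GetEmbedding("night")
//	tv, _ := m.GetEmbedding("day")
//	score := cache.MemoizedCalculateSimilarity("night", "day", qv, tv)
//	// score is the cosine similarity of the two embeddings; calling again with
//	// the same query/token pair returns the cached value.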
31 | func (c *Cache) MemoizedCalculateSimilarity(queryToken, token string, queryVector, tokenVector interface{}) float64 { 32 | key := queryToken + "|" + token // key on both the query and the token so results for different queries do not collide 33 | 34 | if cachedValue, exists := c.cache[key]; exists { 35 | return cachedValue 36 | } 37 | 38 | var similarity float64 39 | switch qv := queryVector.(type) { 40 | case []float32: 41 | similarity = calculateSimilarity32bit(qv, tokenVector.([]float32)) 42 | case []int8: 43 | similarity = calculateSimilarity8bit(qv, tokenVector.([]int8)) 44 | default: 45 | panic("Unsupported vector type") 46 | } 47 | 48 | c.cache[key] = similarity 49 | return similarity 50 | } 51 | 52 | // calculateSimilarity32bit calculates the cosine similarity between two []float32 vectors 53 | func calculateSimilarity32bit(vec1, vec2 []float32) float64 { 54 | dotProduct := float64(0) 55 | norm1 := float64(0) 56 | norm2 := float64(0) 57 | for i := range vec1 { 58 | dotProduct += float64(vec1[i] * vec2[i]) 59 | norm1 += float64(vec1[i] * vec1[i]) 60 | norm2 += float64(vec2[i] * vec2[i]) 61 | } 62 | return dotProduct / (math.Sqrt(norm1) * math.Sqrt(norm2)) 63 | } 64 | 65 | // calculateSimilarity8bit calculates the cosine similarity between two []int8 vectors 66 | func calculateSimilarity8bit(vec1, vec2 []int8) float64 { 67 | var dotProduct int32 68 | var norm1, norm2 int32 69 | 70 | for i := range vec1 { 71 | dotProduct += int32(vec1[i]) * int32(vec2[i]) 72 | norm1 += int32(vec1[i]) * int32(vec1[i]) 73 | norm2 += int32(vec2[i]) * int32(vec2[i]) 74 | } 75 | 76 | return float64(dotProduct) / (math.Sqrt(float64(norm1)) * math.Sqrt(float64(norm2))) 77 | } 78 | -------------------------------------------------------------------------------- /modules/utils/utils.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | // ColorText colors the given text with the specified color. 8 | func ColorText(text, color string) string { 9 | colors := map[string]string{ 10 | "red": "\033[31m", 11 | "green": "\033[32m", 12 | "yellow": "\033[33m", 13 | "blue": "\033[34m", 14 | "magenta": "\033[35m", 15 | "cyan": "\033[36m", 16 | "reset": "\033[0m", 17 | } 18 | return colors[color] + text + colors["reset"] 19 | } 20 | 21 | // PrintLine prints a line with an optional line number. 22 | func PrintLine(line string, lineNumber int, printLineNumbers bool) { 23 | if printLineNumbers { 24 | lineNumberStr := ColorText(fmt.Sprintf("%d:", lineNumber), "blue") 25 | fmt.Printf("%s %s\n", lineNumberStr, line) 26 | } else { 27 | fmt.Println(line) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /w2vgrep.go: -------------------------------------------------------------------------------- 1 | // Package main provides a command-line tool for performing semantic searches 2 | // on text files using Word2Vec models. 3 | 4 | package main 5 | 6 | import ( 7 | "bufio" 8 | "fmt" 9 | "os" 10 | 11 | "github.com/arunsupe/semantic-grep/modules/config" 12 | "github.com/arunsupe/semantic-grep/modules/model" 13 | "github.com/arunsupe/semantic-grep/modules/processor" 14 | "github.com/arunsupe/semantic-grep/modules/similarity" 15 | 16 | "github.com/jessevdk/go-flags" 17 | ) 18 | 19 | // Options defines the command-line options for the semantic-grep tool.
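//
// Example invocations (illustrative only; the model path, query words, and
// file names are placeholders):
//
//	w2vgrep -m path/to/model.bin -C 2 -n --threshold=0.55 death book.txt
//	cat book.txt | w2vgrep -m path/to/model.bin -t 0.6 -o grief
//	w2vgrep -m path/to/model.bin -f patterns.txt grief book.txt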
20 | type Options struct { 21 | ModelPath string `short:"m" long:"model_path" description:"Path to the Word2Vec model file"` 22 | SimilarityThreshold float64 `short:"t" long:"threshold" default:"0.7" description:"Similarity threshold for matching"` 23 | ContextBefore int `short:"B" long:"before-context" description:"Number of lines before matching line"` 24 | ContextAfter int `short:"A" long:"after-context" description:"Number of lines after matching line"` // short flags follow grep: -A = after-context, -B = before-context 25 | ContextBoth int `short:"C" long:"context" description:"Number of lines before and after matching line"` 26 | PrintLineNumbers bool `short:"n" long:"line-number" description:"Print line numbers"` 27 | IgnoreCase bool `short:"i" long:"ignore-case" description:"Ignore case. Note: word2vec is case-sensitive. Ignoring case may lead to unexpected results"` 28 | OutputOnlyMatching bool `short:"o" long:"only-matching" description:"Output only matching words"` 29 | OutputOnlyLines bool `short:"l" long:"only-lines" description:"Output only matched lines without similarity scores"` 30 | PatternFile string `short:"f" long:"file" description:"File with patterns to match"` 31 | } 32 | 33 | // main is the entry point for the semantic-grep tool. It parses command-line 34 | // options, loads the Word2Vec model, and processes the input text file or 35 | // standard input for semantic matches. 36 | func main() { 37 | var opts Options 38 | var parser = flags.NewParser(&opts, flags.Default) 39 | parser.Usage = "[OPTIONS] QUERY [FILE]" 40 | 41 | args, err := parser.Parse() 42 | if err != nil { 43 | if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp { 44 | os.Exit(0) 45 | } else { 46 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 47 | parser.WriteHelp(os.Stderr) 48 | os.Exit(1) 49 | } 50 | } 51 | 52 | if len(args) < 1 && opts.PatternFile == "" { 53 | fmt.Fprintln(os.Stderr, "Error: query or pattern file is required") 54 | parser.WriteHelp(os.Stderr) 55 | os.Exit(1) 56 | } 57 | 58 | if opts.ContextBoth > 0 { 59 | opts.ContextBefore = opts.ContextBoth 60 | opts.ContextAfter = opts.ContextBoth 61 | } 62 | 63 | var patterns []string 64 | if opts.PatternFile != "" { 65 | file, err := os.Open(opts.PatternFile) 66 | if err != nil { 67 | fmt.Fprintf(os.Stderr, "Error opening pattern file: %v\n", err) 68 | os.Exit(1) 69 | } 70 | defer file.Close() 71 | 72 | scanner := bufio.NewScanner(file) 73 | for scanner.Scan() { 74 | patterns = append(patterns, scanner.Text()) 75 | } 76 | if err := scanner.Err(); err != nil { 77 | fmt.Fprintf(os.Stderr, "Error reading pattern file: %v\n", err) 78 | os.Exit(1) 79 | } 80 | } 81 | 82 | query := "" 83 | if len(args) > 0 { 84 | query = args[0] 85 | } 86 | 87 | var input *os.File 88 | if len(args) > 1 { 89 | input, err = os.Open(args[1]) 90 | if err != nil { 91 | fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err) 92 | os.Exit(1) 93 | } 94 | defer input.Close() 95 | } else { 96 | input = os.Stdin 97 | } 98 | 99 | configPath := config.FindConfigFile() 100 | if configPath != "" { 101 | conf, err := config.LoadConfig(configPath) 102 | if err != nil { 103 | fmt.Fprintf(os.Stderr, "Error loading config from %s: %v\n", configPath, err) 104 | os.Exit(1) 105 | } 106 | fmt.Fprintf(os.Stderr, "Using configuration file: %s\n", configPath) 107 | 108 | if opts.ModelPath == "" { 109 | opts.ModelPath = conf.ModelPath 110 | } 111 | } 112 | 113 | if opts.ModelPath == "" { 114 | fmt.Fprintln(os.Stderr, "Error: Model path is required. 
Please provide it via config file or -m/--model_path flag.") 115 | parser.WriteHelp(os.Stderr) 116 | os.Exit(1) 117 | } 118 | 119 | var w2vModel model.VectorModel 120 | var similarityCache similarity.SimilarityCache 121 | 122 | w2vModel, err = model.LoadVectorModel(opts.ModelPath) 123 | if err != nil { 124 | fmt.Fprintf(os.Stderr, "Error loading model: %v\n", err) 125 | os.Exit(1) 126 | } 127 | similarityCache = similarity.NewSimilarityCache() 128 | 129 | if opts.PatternFile != "" { 130 | if query != "" { patterns = append(patterns, query) } // add the positional query, if any, to the patterns from -f 131 | processor.ProcessLineByLine(patterns, w2vModel, similarityCache, opts.SimilarityThreshold, 132 | opts.ContextBefore, opts.ContextAfter, input, opts.PrintLineNumbers, opts.IgnoreCase, 133 | opts.OutputOnlyMatching, opts.OutputOnlyLines) 134 | } else { 135 | processor.ProcessLineByLine([]string{query}, w2vModel, similarityCache, opts.SimilarityThreshold, 136 | opts.ContextBefore, opts.ContextAfter, input, opts.PrintLineNumbers, opts.IgnoreCase, 137 | opts.OutputOnlyMatching, opts.OutputOnlyLines) 138 | } 139 | } 140 | --------------------------------------------------------------------------------
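For reference, a minimal config.json might look like the sketch below (illustrative only, not the repository's own config.json; the model path is a placeholder for whichever embedding model is installed). w2vgrep searches for config.json in the current directory, in $HOME/.config/semantic-grep/, and in /etc/semantic-grep/, and a path given with -m/--model_path takes precedence over the file:

{
    "model_path": "/path/to/model.bin"
}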