├── rust ├── .gitignore ├── .dockerignore ├── Cargo.toml ├── Dockerfile ├── src │ └── main.rs └── Cargo.lock ├── go ├── Dockerfile ├── .vscode │ └── launch.json ├── .gitignore ├── .dockerignore ├── go.mod ├── go.sum └── main.go ├── compose.yaml ├── LICENSE ├── test ├── index.js └── papaparse.min.js └── README.md /rust/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /rust/.dockerignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # custom 4 | .vscode 5 | .git 6 | .gitignore 7 | Dockerfile -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "text-similarity-rust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | serde = { version = "1.0", features = ["derive"] } 8 | serde_json = "1" 9 | actix-web = "4" 10 | regex = "1.11.1" 11 | once_cell = "1.20.2" 12 | 13 | -------------------------------------------------------------------------------- /rust/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:latest as builder 2 | 3 | WORKDIR /text-similarity-rust 4 | 5 | COPY . . 6 | RUN cargo build --release 7 | 8 | FROM gcr.io/distroless/cc-debian12 9 | 10 | WORKDIR /app 11 | COPY --from=builder /text-similarity-rust/target/release/text-similarity-rust . 12 | 13 | CMD ["./text-similarity-rust"] -------------------------------------------------------------------------------- /go/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23-alpine AS builder 2 | 3 | WORKDIR /app 4 | 5 | COPY go.mod go.sum ./ 6 | RUN go mod download 7 | 8 | COPY . . 
9 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/app main.go 10 | 11 | FROM alpine 12 | 13 | WORKDIR /out 14 | COPY --from=builder /out/app /out/app 15 | 16 | CMD ["./app"] -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | name: text-similarity 2 | 3 | services: 4 | rust: 5 | build: 6 | context: ./rust 7 | dockerfile: Dockerfile 8 | environment: 9 | ADDR: 0.0.0.0 10 | PORT: 8081 11 | ports: 12 | - 8081:8081 13 | 14 | go: 15 | build: 16 | context: ./go 17 | dockerfile: Dockerfile 18 | ports: 19 | - 8082:8082 20 | -------------------------------------------------------------------------------- /go/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Launch Package", 9 | "type": "go", 10 | "request": "launch", 11 | "mode": "auto", 12 | "program": "${fileDirname}" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /go/.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | 
go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env -------------------------------------------------------------------------------- /go/.dockerignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | 27 | # custom 28 | .vscode 29 | .git 30 | .gitignore 31 | Dockerfile -------------------------------------------------------------------------------- /go/go.mod: -------------------------------------------------------------------------------- 1 | module text-similarity-go 2 | 3 | go 1.23.2 4 | 5 | require ( 6 | github.com/gofiber/fiber/v2 v2.52.5 7 | golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f 8 | ) 9 | 10 | require ( 11 | github.com/andybalholm/brotli v1.0.5 // indirect 12 | github.com/google/uuid v1.5.0 // indirect 13 | github.com/klauspost/compress v1.17.0 // indirect 14 | github.com/mattn/go-colorable v0.1.13 // indirect 15 | github.com/mattn/go-isatty v0.0.20 // indirect 16 | github.com/mattn/go-runewidth v0.0.15 // indirect 17 | github.com/rivo/uniseg v0.2.0 // indirect 18 | github.com/valyala/bytebufferpool v1.0.0 // indirect 19 | github.com/valyala/fasthttp v1.51.0 // indirect 20 | github.com/valyala/tcplisten v1.0.0 // indirect 21 | golang.org/x/sys v0.15.0 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dmytro Misik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
import { check } from 'k6';
import { SharedArray } from 'k6/data';
import http from 'k6/http';
import papaparse from './papaparse.min.js';

// Rows of ./data.csv; every row is expected to expose a `Text` column.
const data = new SharedArray('Test texts', function () {
  // skipEmptyLines drops the blank record that a trailing newline would
  // otherwise produce, so every remaining row is a valid sample.
  return papaparse.parse(open('./data.csv'), {
    header: true,
    skipEmptyLines: true,
  }).data;
});

// Target endpoint and metric tag. Both default to the Go service and can be
// overridden without editing the script, e.g.
//   k6 run -e SERVICE_URL=http://localhost:8081/similarity -e SERVICE_NAME=Rust index.js
const serviceUrl = __ENV.SERVICE_URL || 'http://localhost:8082/similarity';
const serviceName = __ENV.SERVICE_NAME || 'Go';

export const options = {
  stages: [
    { duration: '30s', target: 100 },
    { duration: '1m', target: 100 },
    { duration: '30s', target: 200 },
    { duration: '1m', target: 200 },
    { duration: '30s', target: 400 },
    { duration: '1m', target: 400 },
    { duration: '30s', target: 800 },
    { duration: '1m', target: 800 },
    { duration: '2m', target: 0 },
  ],
  thresholds: {
    http_req_failed: ['rate<0.01'],
    http_req_duration: ['p(95)<1500'],
  },
};

/**
 * Picks the `Text` column of a uniformly random row.
 *
 * @returns {string} Text of the selected row.
 * @throws {Error} When the CSV contains no rows.
 */
function pickRandomText() {
  if (data.length === 0) {
    throw new Error('data.csv contains no rows');
  }
  // Math.random() is in [0, 1), so multiplying by the full length already
  // yields indices 0..length-1; subtracting 1 would never select the last row.
  return data[Math.floor(Math.random() * data.length)].Text;
}

/**
 * Reports whether the response body is JSON carrying a numeric `similarity`.
 *
 * @param {object} response - k6 HTTP response.
 * @returns {boolean} True only for a parseable body with a numeric field.
 */
function hasNumericSimilarity(response) {
  try {
    return typeof JSON.parse(response.body).similarity === 'number';
  } catch (err) {
    // A missing or non-JSON body (failed or timed-out request) is reported as
    // a failed check instead of an exception thrown from inside check().
    return false;
  }
}

/**
 * One load-test iteration: POST two random texts and validate the response.
 */
export default function () {
  const payload = JSON.stringify({
    text1: pickRandomText(),
    text2: pickRandomText(),
  });

  const response = http.post(serviceUrl, payload, {
    headers: { 'Content-Type': 'application/json' },
    tags: { service: serviceName },
  });

  check(response, {
    OK: (r) => r.status === 200,
    'Similarity Returned': hasNumericSimilarity,
  });
}
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= 3 | github.com/gofiber/fiber/v2 v2.52.5 h1:tWoP1MJQjGEe4GB5TUGOi7P2E0ZMMRx5ZTG4rT+yGMo= 4 | github.com/gofiber/fiber/v2 v2.52.5/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ= 5 | github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= 6 | github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 7 | github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM= 8 | github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= 9 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 10 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 11 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 12 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 13 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 14 | github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= 15 | github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= 16 | github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= 17 | github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 18 | github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= 19 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= 20 | github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA= 21 | github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g= 22 | github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8= 23 | github.com/valyala/tcplisten v1.0.0/go.mod 
h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc= 24 | golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= 25 | golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= 26 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 27 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 28 | golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= 29 | golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rust vs. Go 2 | 3 | Comparison of Rust and Go in terms of performance. 4 | 5 | ## Rust 6 | 7 | ```bash 8 | ✗ OK 9 | ↳ 99% — ✓ 1865605 / ✗ 13 10 | ✗ Similarity Returned 11 | ↳ 99% — ✓ 1865605 / ✗ 13 12 | 13 | checks.........................: 99.99% 3731210 out of 3731236 14 | data_received..................: 310 MB 645 kB/s 15 | data_sent......................: 38 GB 78 MB/s 16 | http_req_blocked...............: avg=5.28µs min=1.38µs med=4.22µs max=15.33ms p(90)=5.68µs p(95)=6.31µs 17 | http_req_connecting............: avg=191ns min=0s med=0s max=12.04ms p(90)=0s p(95)=0s 18 | ✓ http_req_duration..............: avg=90.77ms min=972.39µs med=74.79ms max=1m0s p(90)=161.72ms p(95)=219.01ms 19 | { expected_response:true }...: avg=90.35ms min=972.39µs med=74.79ms max=59.9s p(90)=161.71ms p(95)=219ms 20 | ✓ http_req_failed................: 0.00% 13 out of 1865618 21 | http_req_receiving.............: avg=59.68µs min=0s med=34.8µs max=22.07ms p(90)=48.32µs p(95)=68.42µs 22 | http_req_sending...............: avg=98.37µs min=29.7µs med=76.18µs max=20.57ms p(90)=101.4µs p(95)=142.3µs 23 | http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s 24 | 
http_req_waiting...............: avg=90.61ms min=900.59µs med=74.64ms max=1m0s p(90)=161.55ms p(95)=218.79ms 25 | http_reqs......................: 1865618 3886.688951/s 26 | iteration_duration.............: avg=91.7ms min=1.33ms med=75.69ms max=1m0s p(90)=162.66ms p(95)=219.97ms 27 | iterations.....................: 1865618 3886.688951/s 28 | vus............................: 1 min=1 max=800 29 | vus_max........................: 800 min=800 max=800 30 | 31 | 32 | running (8m00.0s), 000/800 VUs, 1865618 complete and 0 interrupted iterations 33 | default ✓ [======================================] 000/800 VUs 8m0s 34 | ``` 35 | 36 | ## Go 37 | 38 | ```bash 39 | ✓ OK 40 | ✓ Similarity Returned 41 | 42 | checks.........................: 100.00% 1920966 out of 1920966 43 | data_received..................: 159 MB 332 kB/s 44 | data_sent......................: 19 GB 40 MB/s 45 | http_req_blocked...............: avg=5.7µs min=1.49µs med=4.12µs max=19.21ms p(90)=5.68µs p(95)=6.33µs 46 | http_req_connecting............: avg=478ns min=0s med=0s max=10.48ms p(90)=0s p(95)=0s 47 | ✓ http_req_duration..............: avg=177.35ms min=1.93ms med=22.28ms max=5.25s p(90)=651.66ms p(95)=905.77ms 48 | { expected_response:true }...: avg=177.35ms min=1.93ms med=22.28ms max=5.25s p(90)=651.66ms p(95)=905.77ms 49 | ✓ http_req_failed................: 0.00% 0 out of 960483 50 | http_req_receiving.............: avg=65.69µs min=12.43µs med=35.92µs max=31.08ms p(90)=51.72µs p(95)=69.33µs 51 | http_req_sending...............: avg=111.55µs min=32.45µs med=75.79µs max=27.21ms p(90)=102.84µs p(95)=164.76µs 52 | http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s 53 | http_req_waiting...............: avg=177.17ms min=899.7µs med=22.08ms max=5.25s p(90)=651.49ms p(95)=905.59ms 54 | http_reqs......................: 960483 2000.998964/s 55 | iteration_duration.............: avg=178.31ms min=2.38ms med=23.32ms max=5.25s p(90)=652.74ms p(95)=906.96ms 56 | 
iterations.....................: 960483 2000.998964/s 57 | vus............................: 1 min=1 max=800 58 | vus_max........................: 800 min=800 max=800 59 | 60 | 61 | running (8m00.0s), 000/800 VUs, 960483 complete and 0 interrupted iterations 62 | default ✓ [======================================] 000/800 VUs 8m0s 63 | ``` 64 | -------------------------------------------------------------------------------- /go/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math" 5 | "regexp" 6 | "strings" 7 | 8 | "github.com/gofiber/fiber/v2" 9 | "golang.org/x/exp/maps" 10 | ) 11 | 12 | type SimilarityRequest struct { 13 | Text1 string `json:"text1"` 14 | Text2 string `json:"text2"` 15 | } 16 | 17 | type InterpretationResult string 18 | 19 | const ( 20 | InterpretationResultDissimilar InterpretationResult = "Dissimilar" 21 | InterpretationResultSlightly InterpretationResult = "Slightly Similar" 22 | InterpretationResultModerately InterpretationResult = "Moderately Similar" 23 | InterpretationResultQuite InterpretationResult = "Quite Similar" 24 | InterpretationResultHighly InterpretationResult = "Highly Similar" 25 | InterpretationResultUnknown InterpretationResult = "Unknown" 26 | ) 27 | 28 | type SimilarityResponse struct { 29 | Similarity float64 `json:"similarity"` 30 | Interpretation string `json:"interpretation"` 31 | } 32 | 33 | var ( 34 | punctuationRegex = regexp.MustCompile(`[^\w\s]`) 35 | whitespaceRegex = regexp.MustCompile(`\s+`) 36 | ) 37 | 38 | func similarityHandler(c *fiber.Ctx) error { 39 | var req SimilarityRequest 40 | if err := c.BodyParser(&req); err != nil { 41 | return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{ 42 | "error": "Bad Request", 43 | }) 44 | } 45 | 46 | text1 := normalizeText(req.Text1) 47 | text2 := normalizeText(req.Text2) 48 | 49 | words1 := strings.Split(text1, " ") 50 | words2 := strings.Split(text2, " ") 51 | 52 | fm1 := 
generateFrequencyMap(words1) 53 | fm2 := generateFrequencyMap(words2) 54 | 55 | uw := make(map[string]any, 0) 56 | for word := range fm1 { 57 | uw[word] = struct{}{} 58 | } 59 | for word := range fm2 { 60 | uw[word] = struct{}{} 61 | } 62 | 63 | uniqueWords := maps.Keys(uw) 64 | 65 | total1 := len(words1) 66 | total2 := len(words2) 67 | 68 | tf1 := calculateTF(uniqueWords, fm1, total1) 69 | tf2 := calculateTF(uniqueWords, fm2, total2) 70 | 71 | idf := calculateIDF(uniqueWords, fm1, fm2) 72 | 73 | tfidf1 := calculateTFIDF(tf1, idf) 74 | tfidf2 := calculateTFIDF(tf2, idf) 75 | 76 | similarity := calculateSimilarity(tfidf1, tfidf2) 77 | 78 | similarity = math.Round(similarity*1000) / 1000 79 | interpretation := interpretSimilarity(similarity) 80 | 81 | return c.JSON(SimilarityResponse{ 82 | Similarity: similarity, 83 | Interpretation: string(interpretation), 84 | }) 85 | } 86 | 87 | func main() { 88 | app := fiber.New() 89 | app.Post("/similarity", similarityHandler) 90 | app.Listen(":8082") 91 | } 92 | 93 | func normalizeText(text string) string { 94 | lower := strings.ToLower(text) 95 | noPunctuation := punctuationRegex.ReplaceAllString(lower, "") 96 | cleanText := whitespaceRegex.ReplaceAllString(noPunctuation, " ") 97 | 98 | return strings.Trim(cleanText, " ") 99 | } 100 | 101 | func generateFrequencyMap(words []string) map[string]int { 102 | frequencyMap := make(map[string]int) 103 | for _, word := range words { 104 | frequencyMap[word]++ 105 | } 106 | 107 | return frequencyMap 108 | } 109 | 110 | func calculateTF(uniqueWords []string, frequencyMap map[string]int, total int) []float64 { 111 | tf := make([]float64, len(uniqueWords)) 112 | for i, word := range uniqueWords { 113 | tf[i] = float64(frequencyMap[word]) / float64(total) 114 | } 115 | 116 | return tf 117 | } 118 | 119 | func calculateIDF(uniqueWords []string, fm1, fm2 map[string]int) []float64 { 120 | docFreq := make(map[string]int) 121 | for _, word := range uniqueWords { 122 | count1, count2 := 0, 0 
123 | if _, ok := fm1[word]; ok { 124 | count1 = 1 125 | } 126 | 127 | if _, ok := fm2[word]; ok { 128 | count2 = 1 129 | } 130 | 131 | docFreq[word] = count1 + count2 132 | } 133 | 134 | idf := make([]float64, len(uniqueWords)) 135 | for i, word := range uniqueWords { 136 | idf[i] = math.Log(1.0 + 2.0/(float64(docFreq[word])+1.0)) 137 | } 138 | 139 | return idf 140 | } 141 | 142 | func calculateTFIDF(tf, idf []float64) []float64 { 143 | tfidf := make([]float64, len(tf)) 144 | for i := range len(tf) { 145 | tfidf[i] = tf[i] * idf[i] 146 | } 147 | 148 | return tfidf 149 | } 150 | 151 | func calculateSimilarity(tfidf1, tfidf2 []float64) float64 { 152 | dotProduct := 0.0 153 | for i := range len(tfidf1) { 154 | dotProduct += tfidf1[i] * tfidf2[i] 155 | } 156 | 157 | magnitude1 := 0.0 158 | for _, val := range tfidf1 { 159 | magnitude1 += val * val 160 | } 161 | magnitude1 = math.Sqrt(magnitude1) 162 | 163 | magnitude2 := 0.0 164 | for _, val := range tfidf2 { 165 | magnitude2 += val * val 166 | } 167 | magnitude2 = math.Sqrt(magnitude2) 168 | 169 | if magnitude1 <= 1e-9 || magnitude2 <= 1e-9 { 170 | return 0.0 171 | } 172 | 173 | return dotProduct / (magnitude1 * magnitude2) 174 | } 175 | 176 | func interpretSimilarity(similarity float64) InterpretationResult { 177 | if similarity <= 0.2 { 178 | return InterpretationResultDissimilar 179 | } else if similarity <= 0.4 { 180 | return InterpretationResultSlightly 181 | } else if similarity <= 0.6 { 182 | return InterpretationResultModerately 183 | } else if similarity <= 0.8 { 184 | return InterpretationResultQuite 185 | } else if similarity <= 1 { 186 | return InterpretationResultHighly 187 | } 188 | 189 | return InterpretationResultUnknown 190 | } 191 | -------------------------------------------------------------------------------- /rust/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::{HashMap, HashSet}, env}; 2 | 3 | use actix_web::{post, web, 
App, HttpResponse, HttpServer, Responder}; 4 | use once_cell::sync::Lazy; 5 | use regex::Regex; 6 | use serde::{Deserialize, Serialize}; 7 | 8 | #[actix_web::main] 9 | async fn main() -> std::io::Result<()> { 10 | let addr = env::var("ADDR").unwrap_or("127.0.0.1".to_string()); 11 | let port = env::var("PORT").unwrap_or("8081".to_string()).parse().unwrap(); 12 | 13 | HttpServer::new(|| App::new().service(similarity)) 14 | .bind((addr.as_ref(), port))? 15 | .run() 16 | .await 17 | } 18 | 19 | /// Request data for the similarity endpoint. 20 | #[derive(Deserialize)] 21 | struct SimilarityRequest { 22 | text1: String, 23 | text2: String, 24 | } 25 | 26 | /// Response data for the similarity endpoint. 27 | #[derive(Serialize)] 28 | struct SimilarityResponse { 29 | similarity: f64, 30 | interpretation: String, 31 | } 32 | 33 | /// Calculate the similarity between two texts. 34 | #[post("/similarity")] 35 | pub async fn similarity(data: web::Json) -> impl Responder { 36 | let normalized1 = normalize_text(&data.text1); 37 | let normalized2 = normalize_text(&data.text2); 38 | 39 | let words1: Vec<&str> = normalized1.split_whitespace().collect(); 40 | let words2: Vec<&str> = normalized2.split_whitespace().collect(); 41 | 42 | // Generate frequency maps for both texts 43 | let freq_map1 = generate_frequency_map(&words1); 44 | let freq_map2 = generate_frequency_map(&words2); 45 | 46 | // Create a vector of unique words 47 | let uniq: Vec<&str> = freq_map1 48 | .keys() 49 | .chain(freq_map2.keys()) 50 | .cloned() 51 | .collect::>() 52 | .into_iter() 53 | .collect(); 54 | 55 | // Calculate term frequency for both texts 56 | let total1 = words1.len(); 57 | let total2 = words2.len(); 58 | 59 | let tf1 = calculate_tf(&uniq, &freq_map1, total1); 60 | let tf2 = calculate_tf(&uniq, &freq_map2, total2); 61 | 62 | // Calculate inverse document frequency 63 | let idf = calculate_idf(&uniq, &freq_map1, &freq_map2); 64 | 65 | // Calculate tf-idf for both texts 66 | let tf_idf1 = 
calculate_tf_idf(&tf1, &idf); 67 | let tf_idf2 = calculate_tf_idf(&tf2, &idf); 68 | 69 | // Calculate cosine similarity 70 | let similarity = calculate_similarity(&tf_idf1, &tf_idf2); 71 | 72 | // Round similarity to 3 decimal places 73 | let similarity = (similarity * 1000.0).round() / 1000.0; 74 | let interpretation = interpret_similarity(similarity); 75 | 76 | // Return the similarity as JSON 77 | HttpResponse::Ok().json(SimilarityResponse { 78 | similarity, 79 | interpretation, 80 | }) 81 | } 82 | 83 | /// Normalize text by converting to lowercase, removing punctuation, and collapsing whitespace. 84 | fn normalize_text(text: &str) -> String { 85 | static RE_PUNCT: Lazy = Lazy::new(|| Regex::new(r"[^\w\s]").unwrap()); 86 | static RE_WHITESPACE: Lazy = Lazy::new(|| Regex::new(r"\s+").unwrap()); 87 | 88 | let lower = text.to_lowercase(); 89 | let no_punct = RE_PUNCT.replace_all(&lower, ""); 90 | let clean_text = RE_WHITESPACE.replace_all(&no_punct, " "); 91 | 92 | clean_text.trim().to_string() 93 | } 94 | 95 | /// Generate a frequency map for a list of words. 96 | fn generate_frequency_map<'a>(words: &[&'a str]) -> HashMap<&'a str, usize> { 97 | let mut freq_map = HashMap::new(); 98 | for word in words { 99 | *freq_map.entry(*word).or_insert(0) += 1; 100 | } 101 | freq_map 102 | } 103 | 104 | /// Calculate term frequency (TF) for a list of unique words and a frequency map. 105 | fn calculate_tf(uniq: &[&str], fm: &HashMap<&str, usize>, total: usize) -> Vec { 106 | // Compute TF using the frequency map 107 | uniq.iter() 108 | .map(|word| *fm.get(word).unwrap_or(&0) as f64 / total as f64) 109 | .collect() 110 | } 111 | 112 | /// Calculate inverse document frequency (IDF) for a list of unique words and two frequency maps. 
/// Calculate inverse document frequency (IDF) for a list of unique words and two frequency maps.
///
/// For each word the document frequency `df` is the number of the two maps
/// (0, 1 or 2) that contain it, and the IDF is `ln(1 + 2 / (df + 1))`.
/// The result has one entry per element of `uniq`, in the same order.
fn calculate_idf(
    uniq: &[&str],
    fm1: &HashMap<&str, usize>,
    fm2: &HashMap<&str, usize>,
) -> Vec<f64> {
    uniq.iter()
        .map(|word| {
            let doc_freq =
                usize::from(fm1.contains_key(word)) + usize::from(fm2.contains_key(word));
            (1.0 + 2.0 / (doc_freq as f64 + 1.0)).ln()
        })
        .collect()
}

/// Calculate the TF-IDF for a list of term frequencies and a list of inverse document frequencies.
///
/// Values are multiplied pairwise; the result is as long as the shorter input.
fn calculate_tf_idf(tf: &[f64], idf: &[f64]) -> Vec<f64> {
    tf.iter().zip(idf.iter()).map(|(a, b)| a * b).collect()
}

/// Calculate the cosine similarity between two vectors.
///
/// Returns `0.0` when either vector has a norm below `f64::EPSILON` or a norm
/// that is NaN, so the result is never NaN for degenerate input.
fn calculate_similarity(tf_idf1: &[f64], tf_idf2: &[f64]) -> f64 {
    let dot_product: f64 = tf_idf1.iter().zip(tf_idf2.iter()).map(|(a, b)| a * b).sum();

    let norm1 = tf_idf1.iter().map(|a| a * a).sum::<f64>().sqrt();
    let norm2 = tf_idf2.iter().map(|a| a * a).sum::<f64>().sqrt();

    // A plain `<` comparison is false for NaN, so NaN is checked explicitly.
    let is_degenerate = |norm: f64| norm.is_nan() || norm < f64::EPSILON;
    if is_degenerate(norm1) || is_degenerate(norm2) {
        return 0.0;
    }

    dot_product / (norm1 * norm2)
}
/// Translate a similarity score into a descriptive label.
///
/// Scores in `[0, 1]` are bucketed with inclusive upper bounds of 0.2, 0.4,
/// 0.6, 0.8 and 1.0; anything outside that interval (including NaN) is
/// labelled "Unknown".
fn interpret_similarity(s: f64) -> String {
    // `contains` is false for NaN, so NaN lands in the "Unknown" branch.
    let label = if !(0.0..=1.0).contains(&s) {
        "Unknown"
    } else if s <= 0.2 {
        "Dissimilar"
    } else if s <= 0.4 {
        "Slightly Similar"
    } else if s <= 0.6 {
        "Moderately Similar"
    } else if s <= 0.8 {
        "Quite Similar"
    } else {
        "Highly Similar"
    };

    label.to_string()
}
i=!1,_=!0,g=",",v="\r\n",n='"',s=n+n,r=!1,a=null;!function(){if("object"!=typeof t)return;"string"!=typeof t.delimiter||b.BAD_DELIMITERS.filter(function(e){return-1!==t.delimiter.indexOf(e)}).length||(g=t.delimiter);("boolean"==typeof t.quotes||Array.isArray(t.quotes))&&(i=t.quotes);"boolean"!=typeof t.skipEmptyLines&&"string"!=typeof t.skipEmptyLines||(r=t.skipEmptyLines);"string"==typeof t.newline&&(v=t.newline);"string"==typeof t.quoteChar&&(n=t.quoteChar);"boolean"==typeof t.header&&(_=t.header);if(Array.isArray(t.columns)){if(0===t.columns.length)throw new Error("Option columns is empty");a=t.columns}void 0!==t.escapeChar&&(s=t.escapeChar+n)}();var o=new RegExp(U(n),"g");"string"==typeof e&&(e=JSON.parse(e));if(Array.isArray(e)){if(!e.length||Array.isArray(e[0]))return u(null,e,r);if("object"==typeof e[0])return u(a||h(e[0]),e,r)}else if("object"==typeof e)return"string"==typeof e.data&&(e.data=JSON.parse(e.data)),Array.isArray(e.data)&&(e.fields||(e.fields=e.meta&&e.meta.fields),e.fields||(e.fields=Array.isArray(e.data[0])?e.fields:h(e.data[0])),Array.isArray(e.data[0])||"object"==typeof e.data[0]||(e.data=[e.data])),u(e.fields||[],e.data||[],r);throw new Error("Unable to serialize unrecognized input");function h(e){if("object"!=typeof e)return[];var t=[];for(var r in e)t.push(r);return t}function u(e,t,r){var i="";"string"==typeof e&&(e=JSON.parse(e)),"string"==typeof t&&(t=JSON.parse(t));var n=Array.isArray(e)&&0=this._config.preview;if(o)f.postMessage({results:n,workerId:b.WORKER_ID,finished:a});else if(q(this._config.chunk)&&!t){if(this._config.chunk(n,this._handle),this._handle.paused()||this._handle.aborted())return void(this._halted=!0);n=void 0,this._completeResults=void 0}return 
this._config.step||this._config.chunk||(this._completeResults.data=this._completeResults.data.concat(n.data),this._completeResults.errors=this._completeResults.errors.concat(n.errors),this._completeResults.meta=n.meta),this._completed||!a||!q(this._config.complete)||n&&n.meta.aborted||(this._config.complete(this._completeResults,this._input),this._completed=!0),a||n&&n.meta.paused||this._nextChunk(),n}this._halted=!0},this._sendError=function(e){q(this._config.error)?this._config.error(e):o&&this._config.error&&f.postMessage({workerId:b.WORKER_ID,error:e,finished:!1})}}function l(e){var i;(e=e||{}).chunkSize||(e.chunkSize=b.RemoteChunkSize),u.call(this,e),this._nextChunk=n?function(){this._readChunk(),this._chunkLoaded()}:function(){this._readChunk()},this.stream=function(e){this._input=e,this._nextChunk()},this._readChunk=function(){if(this._finished)this._chunkLoaded();else{if(i=new XMLHttpRequest,this._config.withCredentials&&(i.withCredentials=this._config.withCredentials),n||(i.onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)),i.open("GET",this._input,!n),this._config.downloadRequestHeaders){var e=this._config.downloadRequestHeaders;for(var t in e)i.setRequestHeader(t,e[t])}if(this._config.chunkSize){var r=this._start+this._config.chunkSize-1;i.setRequestHeader("Range","bytes="+this._start+"-"+r)}try{i.send()}catch(e){this._chunkError(e.message)}n&&0===i.status?this._chunkError():this._start+=this._config.chunkSize}},this._chunkLoaded=function(){4===i.readyState&&(i.status<200||400<=i.status?this._chunkError():(this._finished=!this._config.chunkSize||this._start>function(e){var t=e.getResponseHeader("Content-Range");if(null===t)return-1;return parseInt(t.substr(t.lastIndexOf("/")+1))}(i),this.parseChunk(i.responseText)))},this._chunkError=function(e){var t=i.statusText||e;this._sendError(new Error(t))}}function c(e){var i,n;(e=e||{}).chunkSize||(e.chunkSize=b.LocalChunkSize),u.call(this,e);var s="undefined"!=typeof 
FileReader;this.stream=function(e){this._input=e,n=e.slice||e.webkitSlice||e.mozSlice,s?((i=new FileReader).onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)):i=new FileReaderSync,this._nextChunk()},this._nextChunk=function(){this._finished||this._config.preview&&!(this._rowCount=this._input.size,this.parseChunk(e.target.result)},this._chunkError=function(){this._sendError(i.error)}}function p(e){var r;u.call(this,e=e||{}),this.stream=function(e){return r=e,this._nextChunk()},this._nextChunk=function(){if(!this._finished){var e=this._config.chunkSize,t=e?r.substr(0,e):r;return r=e?r.substr(e):"",this._finished=!r,this.parseChunk(t)}}}function m(e){u.call(this,e=e||{});var t=[],r=!0,i=!1;this.pause=function(){u.prototype.pause.apply(this,arguments),this._input.pause()},this.resume=function(){u.prototype.resume.apply(this,arguments),this._input.resume()},this.stream=function(e){this._input=e,this._input.on("data",this._streamData),this._input.on("end",this._streamEnd),this._input.on("error",this._streamError)},this._checkIsFinished=function(){i&&1===t.length&&(this._finished=!0)},this._nextChunk=function(){this._checkIsFinished(),t.length?this.parseChunk(t.shift()):r=!0},this._streamData=y(function(e){try{t.push("string"==typeof e?e:e.toString(this._config.encoding)),r&&(r=!1,this._checkIsFinished(),this.parseChunk(t.shift()))}catch(e){this._streamError(e)}},this),this._streamError=y(function(e){this._streamCleanUp(),this._sendError(e)},this),this._streamEnd=y(function(){this._streamCleanUp(),i=!0,this._streamData("")},this),this._streamCleanUp=y(function(){this._input.removeListener("data",this._streamData),this._input.removeListener("end",this._streamEnd),this._input.removeListener("error",this._streamError)},this)}function r(g){var 
a,o,h,i=Math.pow(2,53),n=-i,s=/^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i,u=/(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))/,t=this,r=0,f=0,d=!1,e=!1,l=[],c={data:[],errors:[],meta:{}};if(q(g.step)){var p=g.step;g.step=function(e){if(c=e,_())m();else{if(m(),0===c.data.length)return;r+=e.data.length,g.preview&&r>g.preview?o.abort():p(c,t)}}}function v(e){return"greedy"===g.skipEmptyLines?""===e.join("").trim():1===e.length&&0===e[0].length}function m(){if(c&&h&&(k("Delimiter","UndetectableDelimiter","Unable to auto-detect delimiting character; defaulted to '"+b.DefaultDelimiter+"'"),h=!1),g.skipEmptyLines)for(var e=0;e=l.length?"__parsed_extra":l[r]),g.transform&&(s=g.transform(s,n)),s=y(n,s),"__parsed_extra"===n?(i[n]=i[n]||[],i[n].push(s)):i[n]=s}return g.header&&(r>l.length?k("FieldMismatch","TooManyFields","Too many fields: expected "+l.length+" fields but parsed "+r,f+t):r=i.length/2?"\r\n":"\r"}(e,i)),h=!1,g.delimiter)q(g.delimiter)&&(g.delimiter=g.delimiter(e),c.meta.delimiter=g.delimiter);else{var n=function(e,t,r,i,n){var s,a,o,h;n=n||[",","\t","|",";",b.RECORD_SEP,b.UNIT_SEP];for(var u=0;u=L)return R(!0)}else for(g=M,M++;;){if(-1===(g=a.indexOf(O,g+1)))return t||u.push({type:"Quotes",code:"MissingQuotes",message:"Quoted field unterminated",row:h.length,index:M}),w();if(g===i-1)return w(a.substring(M,g).replace(_,O));if(O!==z||a[g+1]!==z){if(O===z||0===g||a[g-1]!==z){var y=E(-1===m?p:Math.min(p,m));if(a[g+1+y]===D){f.push(a.substring(M,g).replace(_,O)),a[M=g+1+y+e]!==O&&(g=a.indexOf(O,M)),p=a.indexOf(D,M),m=a.indexOf(I,M);break}var k=E(m);if(a.substr(g+1+k,n)===I){if(f.push(a.substring(M,g).replace(_,O)),C(g+1+k+n),p=a.indexOf(D,M),g=a.indexOf(O,M),o&&(S(),j))return R();if(L&&h.length>=L)return R(!0);break}u.push({type:"Quotes",code:"InvalidQuotes",message:"Trailing quote on quoted field is 
malformed",row:h.length,index:M}),g++}}else g++}return w();function b(e){h.push(e),d=M}function E(e){var t=0;if(-1!==e){var r=a.substring(g+1,e);r&&""===r.trim()&&(t=r.length)}return t}function w(e){return t||(void 0===e&&(e=a.substr(M)),f.push(e),M=i,b(f),o&&S()),R()}function C(e){M=e,b(f),f=[],m=a.indexOf(I,M)}function R(e,t){return{data:t||!1?h[0]:h,errors:u,meta:{delimiter:D,linebreak:I,aborted:j,truncated:!!e,cursor:d+(r||0)}}}function S(){A(R(void 0,!0)),h=[],u=[]}function x(e,t,r){var i={nextDelim:void 0,quoteSearch:void 0},n=a.indexOf(O,t+1);if(t