├── .dockerignore ├── .github ├── CODEOWNERS └── workflows │ ├── create-release.yaml │ └── tests.yaml ├── .gitignore ├── Dockerfile.full ├── Dockerfile.local-vectordb ├── Dockerfile.minimal ├── LICENSE ├── README.md ├── adapters └── repos │ └── extensions_weaviate_module.go ├── build.sh ├── client └── client.go ├── compoundsplitting ├── dictionary.go ├── noop_splitter.go ├── splitter.go └── splitter_test.go ├── contextionary ├── contextionary.pb.go ├── contextionary.proto ├── core │ ├── annoyindex │ │ ├── annoy_test.go │ │ ├── annoygomodule.h │ │ ├── annoygomodule_wrap.cxx │ │ ├── annoyindex.go │ │ ├── annoylib.h │ │ └── kissrandom.h │ ├── centroid.go │ ├── centroid_test.go │ ├── certainty.go │ ├── combined.go │ ├── combined_simple_test.go │ ├── component_test.go │ ├── contextionary.go │ ├── generator │ │ ├── cmd │ │ │ └── generator.go │ │ └── generator.go │ ├── indices_test.go │ ├── memory_index.go │ ├── mmapped.go │ ├── similar_words.go │ ├── similar_words_test.go │ ├── stopwords │ │ └── detector.go │ ├── vector.go │ └── wordlist.go └── schema │ ├── contextionary.go │ ├── schema_search.go │ ├── schema_search_params.go │ ├── schema_search_params_test.go │ └── schema_search_test.go ├── errors └── errors.go ├── extensions ├── extension.go ├── looker_upper.go ├── looker_upper_test.go ├── storer.go └── storer_test.go ├── gen_proto_code.sh ├── go.mod ├── go.sum ├── logparser └── parse.go ├── main └── splitter_preprocessor.go ├── prepare_docker_buildx.sh ├── preprocessing ├── dictionary_pre_processing.go ├── dictionary_pre_processing_test.go ├── hunspell.go └── hunspell_test.go ├── server ├── api.go ├── config │ └── config.go ├── contextionary.go ├── corpus_vectorizer.go ├── corpus_vectorizer_test.go ├── grpc_error.go ├── server.go ├── splitter.go ├── splitter_test.go ├── weight_manipulator.go └── weight_manipulator_test.go ├── test ├── compoundsplitting │ ├── contextionary.idx │ ├── nl_NL.aff │ ├── nl_NL.dic │ └── pre_processed_splitter_dict.csv ├── journey.sh └── 
journey │ ├── Dockerfile │ ├── docker-compose.yml │ ├── go.mod │ ├── go.sum │ └── journey_test.go └── tools ├── dev ├── .gitignore ├── contextionary-playground │ ├── .gitignore │ ├── class_vectors │ │ ├── elastic.go │ │ ├── main.go │ │ ├── search.go │ │ ├── stopwords.go │ │ └── texts.go │ ├── comparison │ │ └── main.go │ ├── main.go │ └── schema │ │ └── main.go ├── en_test-vectors-small.txt.bz2 ├── gen_simple_contextionary.sh ├── run.sh └── stopwords.json ├── download_contextionary.sh ├── native_build_contextionary.sh ├── preprocess_splitter_dict.sh └── preprocess_splitter_dict_native_build.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Ci related folders 2 | /.github/ @weaviate/core 3 | build.sh @weaviate/core 4 | prepare_docker_buildx.sh @weaviate/core 5 | -------------------------------------------------------------------------------- /.github/workflows/create-release.yaml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '**' 7 | 8 | jobs: 9 | create-release: 10 | name: Create Release 11 | if: startsWith(github.ref, 'refs/tags') 12 | runs-on: ubuntu-latest-4-cores 13 | strategy: 14 | matrix: 15 | include: 16 | - language: en 17 | model_version: 0.16.0 18 | - language: nl 19 | model_version: 0.16.0 20 | - language: en 21 | model_version: 0.14.0 22 | - language: nl 23 | model_version: 0.14.0 24 | - language: de 25 | model_version: 0.14.0 26 | - language: cs 27 | model_version: 0.14.0 28 | - language: it 29 | model_version: 0.14.0 30 | env: 31 | DOCKER_ORG: semitechnologies 32 | DOCKER_REPO: contextionary 33 | LANGUAGE: ${{matrix.language}} 34 | MODEL_VERSION: ${{matrix.model_version}} 35 | steps: 36 | - uses: 
actions/checkout@v3 37 | - name: Login to Docker Hub 38 | uses: docker/login-action@v2 39 | if: ${{ !github.event.pull_request.head.repo.fork }} 40 | with: 41 | username: ${{secrets.DOCKER_USERNAME}} 42 | password: ${{secrets.DOCKER_PASSWORD}} 43 | - name: Set up Go 44 | uses: actions/setup-go@v3 45 | with: 46 | go-version: 1.19 47 | cache: true 48 | - name: Build and release 49 | run: | 50 | export SOFTWARE_VERSION=${GITHUB_REF##*/} 51 | set -e 52 | ./prepare_docker_buildx.sh 53 | PUSH_MULTIARCH=1 ./build.sh 54 | echo "Success" 55 | gh-release: 56 | name: Create a GitHub Release 57 | if: startsWith(github.ref, 'refs/tags') 58 | runs-on: ubuntu-latest 59 | needs: create-release 60 | steps: 61 | - name: Checkout 62 | uses: actions/checkout@v3 63 | - name: Release 64 | uses: softprops/action-gh-release@v1 65 | with: 66 | generate_release_notes: true 67 | draft: true 68 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '**' 9 | paths-ignore: 10 | - LICENSE 11 | - README.md 12 | pull_request: 13 | 14 | jobs: 15 | tests: 16 | name: Tests 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | include: 21 | - model_version: 0.16.0 22 | dimensions: 300 23 | - model_version: 0.14.0 24 | dimensions: 600 25 | env: 26 | DOCKER_ORG: semitechnologies 27 | DOCKER_REPO: contextionary 28 | LANGUAGE: en 29 | SOFTWARE_VERSION: localtest 30 | MODEL_VERSION: ${{matrix.model_version}} 31 | DIMENSIONS: ${{matrix.dimensions}} 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Login to Docker Hub 35 | uses: docker/login-action@v2 36 | if: ${{ !github.event.pull_request.head.repo.fork }} 37 | with: 38 | username: ${{secrets.DOCKER_USERNAME}} 39 | password: ${{secrets.DOCKER_PASSWORD}} 40 | - name: Set up Go 41 | uses: actions/setup-go@v3 42 | with: 43 | 
go-version: 1.19 44 | cache: true 45 | - name: Build and run journey tests 46 | run: | 47 | set -e 48 | docker buildx version 49 | ./build.sh 50 | ./test/journey.sh 51 | echo "Success" 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | local-vectordb/ 3 | 4 | .idea 5 | 6 | vendor/ -------------------------------------------------------------------------------- /Dockerfile.full: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | 4 | FROM golang:1.13 as builder 5 | WORKDIR /app 6 | 7 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git 8 | 9 | COPY ./tools/download_contextionary.sh ./ 10 | ARG LANGUAGE 11 | ARG MODEL_VERSION 12 | RUN ./download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION" 13 | 14 | COPY go.mod go.sum ./ 15 | RUN go mod download 16 | 17 | COPY . . 
18 | ARG VERSION 19 | ARG TARGETARCH 20 | 21 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=$TARGETARCH go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 22 | 23 | RUN tools/dev/gen_simple_contextionary.sh 24 | RUN mkdir -p ./data 25 | 26 | COPY ./tools/preprocess_splitter_dict.sh ./ 27 | RUN /bin/bash preprocess_splitter_dict.sh "$LANGUAGE" "/app/data/contextionary.idx" 28 | 29 | 30 | FROM alpine 31 | 32 | COPY --from=builder /app/data/contextionary.idx /app/data/contextionary.knn /app/data/stopwords.json /app/data/splitter_dict.csv / 33 | COPY --from=builder /app/contextionary-server / 34 | 35 | ENV KNN_FILE=/contextionary.knn 36 | ENV IDX_FILE=/contextionary.idx 37 | ENV STOPWORDS_FILE=/stopwords.json 38 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv 39 | 40 | ENTRYPOINT [ "/contextionary-server" ] 41 | -------------------------------------------------------------------------------- /Dockerfile.local-vectordb: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | 4 | FROM golang:1.13 as builder 5 | WORKDIR /app 6 | 7 | RUN apt-get update && apt-get install -y bzip2 jq 8 | 9 | COPY go.mod go.sum ./ 10 | RUN go mod download 11 | 12 | COPY . . 
13 | ARG VERSION 14 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 15 | 16 | RUN tools/dev/gen_simple_contextionary.sh 17 | RUN mkdir -p ./data 18 | 19 | 20 | FROM alpine 21 | 22 | COPY local-vectordb/contextionary.idx local-vectordb/contextionary.knn local-vectordb/stopwords.json / 23 | COPY --from=builder /app/contextionary-server / 24 | 25 | ENV KNN_FILE=/contextionary.knn 26 | ENV IDX_FILE=/contextionary.idx 27 | ENV STOPWORDS_FILE=/stopwords.json 28 | 29 | ENTRYPOINT [ "/contextionary-server" ] 30 | -------------------------------------------------------------------------------- /Dockerfile.minimal: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | FROM golang:1.13 as builder 4 | WORKDIR /app 5 | 6 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git 7 | 8 | COPY go.mod go.sum ./ 9 | RUN go mod download 10 | 11 | COPY . . 
12 | ARG VERSION 13 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 14 | 15 | RUN tools/dev/gen_simple_contextionary.sh 16 | RUN mkdir -p ./data 17 | 18 | COPY ./tools/preprocess_splitter_dict.sh ./ 19 | RUN /bin/bash preprocess_splitter_dict.sh "en" "/app/tools/dev/example.idx" 20 | 21 | FROM scratch 22 | 23 | COPY --from=builder /app/tools/dev/example.idx /app/tools/dev/example.knn /app/tools/dev/stopwords.json /app/data/splitter_dict.csv / 24 | COPY --from=builder /app/contextionary-server / 25 | 26 | ENV KNN_FILE=/example.knn 27 | ENV IDX_FILE=/example.idx 28 | ENV STOPWORDS_FILE=/stopwords.json 29 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv 30 | 31 | ENTRYPOINT [ "/contextionary-server" ] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, Weaviate B.V. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weaviate Contextionary Weaviate logo 2 | 3 | > The contextionary powers the semantic, context-based searches in Weaviate. 4 | 5 | Not intended for stand-alone use. Used by [Weaviate - the ML-first vector 6 | search engine](https://github.com/weaviate/weaviate). 7 | 8 | ## Versioning 9 | 10 | The version tag is `-v`. So for 11 | example the app version `0.1.0` deployed with the [contextionary vector db 12 | version](https://c11y.semi.technology/contextionary.json) `0.6.0` of the 13 | English language will have the version `en0.6.0-v0.1.0`. This also 14 | corresponds to the Docker tag. 15 | 16 | ## Languages 17 | 18 | Currently available languages include: 19 | * `en` 20 | * `de` 21 | * `nl` 22 | * `cs` 23 | * `it` 24 | 25 | Other languages coming soon. 26 | 27 | ## Docker Requirements 28 | 29 | The build pipeline makes use of Docker's `buildx` for multi-arch builds. Make 30 | sure you run a Docker version which supports `buildx` and have run `docker 31 | buildx create --use` at least once. 
32 | 33 | ## How to build and test project 34 | 35 | 1. Regenerate schema: 36 | 37 | ```bash 38 | ./gen_proto_code.sh 39 | ``` 40 | 41 | 2. Build image: 42 | 43 | ```bash 44 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh 45 | ``` 46 | 47 | 3. Run journey tests: 48 | 49 | ```bash 50 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh && DIMENSIONS=300 ./test/journey.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /adapters/repos/extensions_weaviate_module.go: -------------------------------------------------------------------------------- 1 | package repos 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "context" 7 | "encoding/json" 8 | "fmt" 9 | "net/http" 10 | "time" 11 | 12 | "github.com/sirupsen/logrus" 13 | "github.com/weaviate/contextionary/extensions" 14 | "github.com/weaviate/contextionary/server/config" 15 | ) 16 | 17 | type ModuleExtensionRepo struct { 18 | client *http.Client 19 | logger logrus.FieldLogger 20 | origin string 21 | watchInterval time.Duration 22 | } 23 | 24 | func NewExtensionsRepo(logger logrus.FieldLogger, 25 | config *config.Config, watchInterval time.Duration) *ModuleExtensionRepo { 26 | client := &http.Client{} 27 | return &ModuleExtensionRepo{ 28 | client: client, 29 | logger: logger, 30 | origin: config.ExtensionsStorageOrigin, 31 | watchInterval: watchInterval, 32 | } 33 | } 34 | 35 | func (r *ModuleExtensionRepo) WatchAll() chan extensions.WatchResponse { 36 | returnCh := make(chan extensions.WatchResponse) 37 | 38 | go func() { 39 | t := time.Tick(r.watchInterval) 40 | for { 41 | r.updateConsumers(returnCh) 42 | <-t 43 | } 44 | }() 45 | 46 | return returnCh 47 | } 48 | 49 | func (f *ModuleExtensionRepo) uri(path string) string { 50 | return fmt.Sprintf("%s%s", f.origin, path) 51 | } 52 | 53 | func (r *ModuleExtensionRepo) updateConsumers(returnCh chan extensions.WatchResponse) { 54 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 55 | defer cancel() 56 | 57 | req, 
err := http.NewRequestWithContext(ctx, "GET", 58 | r.uri("/v1/modules/text2vec-contextionary/extensions-storage/"), nil) 59 | if err != nil { 60 | r.logger.WithField("action", "extensions_retrieve_all"). 61 | WithError(err).Error() 62 | return 63 | } 64 | 65 | res, err := r.client.Do(req) 66 | if err != nil { 67 | r.logger.WithField("action", "extensions_retrieve_all"). 68 | WithError(err).Error() 69 | return 70 | } 71 | 72 | defer res.Body.Close() 73 | if res.StatusCode > 399 { 74 | r.logger.WithField("action", "extensions_retrieve_all"). 75 | WithError(fmt.Errorf("expected status < 399, got %d", res.StatusCode)). 76 | Error() 77 | return 78 | } 79 | 80 | var exts []extensions.Extension 81 | scanner := bufio.NewScanner(res.Body) 82 | for scanner.Scan() { 83 | if err := scanner.Err(); err != nil { 84 | r.logger.WithField("action", "extensions_retrieve_all"). 85 | WithError(err).Error() 86 | return 87 | } 88 | 89 | rawExt := scanner.Bytes() 90 | var ext extensions.Extension 91 | err := json.Unmarshal(rawExt, &ext) 92 | if err != nil { 93 | r.logger.WithField("action", "extensions_retrieve_all"). 
94 | WithError(err).Error() 95 | return 96 | } 97 | 98 | exts = append(exts, ext) 99 | } 100 | 101 | returnCh <- exts 102 | } 103 | 104 | func (r *ModuleExtensionRepo) Put(ctx context.Context, ext extensions.Extension) error { 105 | extBytes, err := json.Marshal(ext) 106 | if err != nil { 107 | return fmt.Errorf("marshal extension to json: %v", err) 108 | } 109 | 110 | req, err := http.NewRequestWithContext(ctx, "PUT", r.uri(fmt.Sprintf( 111 | "/v1/modules/text2vec-contextionary/extensions-storage/%s", ext.Concept)), bytes.NewReader(extBytes)) 112 | 113 | res, err := r.client.Do(req) 114 | if err != nil { 115 | return fmt.Errorf("put: %v", err) 116 | } 117 | 118 | defer res.Body.Close() 119 | if res.StatusCode > 399 { 120 | return fmt.Errorf("expected status < 399, got %d", res.StatusCode) 121 | } 122 | 123 | return nil 124 | } 125 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # set some defaults so we can also run locally 6 | if [ -z "$DOCKER_ORG" ] 7 | then 8 | DOCKER_ORG=semitechnologies 9 | fi 10 | 11 | if [ -z "$DOCKER_REPO" ] 12 | then 13 | DOCKER_REPO=contextionary 14 | fi 15 | 16 | if [ -z "$SOFTWARE_VERSION" ] 17 | then 18 | SOFTWARE_VERSION=local 19 | fi 20 | 21 | if [ -z "$MODEL_VERSION" ] 22 | then 23 | MODEL_VERSION=0.16.0 24 | fi 25 | 26 | if [ -z "$LANGUAGE" ] 27 | then 28 | LANGUAGE=en 29 | fi 30 | 31 | VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}" 32 | 33 | if [ -z "$FULL_VERSION_DOCKERFILE" ] 34 | then 35 | FULL_VERSION_DOCKERFILE=Dockerfile.full 36 | fi 37 | 38 | if [ "$PUSH_MULTIARCH" = "1" ]; then 39 | echo "Build and push multi-arch full version" 40 | echo "Build $LANGUAGE:" 41 | full_version="${LANGUAGE}${VERSION}" 42 | docker buildx build --platform=linux/amd64,linux/arm64 \ 43 | --push \ 44 | -f "$FULL_VERSION_DOCKERFILE" \ 45 | --build-arg VERSION="$full_version" \ 46 | 
--build-arg MODEL_VERSION="$MODEL_VERSION" \ 47 | --build-arg LANGUAGE="$LANGUAGE" \ 48 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" . 49 | else 50 | echo "Build minimal version (english only)" 51 | docker build -f Dockerfile.minimal --build-arg VERSION="$VERSION-minimal" -t "$DOCKER_ORG/$DOCKER_REPO:en$VERSION-minimal" . 52 | 53 | echo "Build single-arch full version" 54 | echo "Build $LANGUAGE:" 55 | full_version="${LANGUAGE}${VERSION}" 56 | docker build \ 57 | -f "$FULL_VERSION_DOCKERFILE" \ 58 | --build-arg VERSION="$full_version" \ 59 | --build-arg MODEL_VERSION="$MODEL_VERSION" \ 60 | --build-arg LANGUAGE="$LANGUAGE" \ 61 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" . 62 | fi 63 | 64 | 65 | -------------------------------------------------------------------------------- /client/client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "strconv" 8 | "strings" 9 | 10 | pb "github.com/weaviate/contextionary/contextionary" 11 | grpc "google.golang.org/grpc" 12 | ) 13 | 14 | func help() { 15 | fmt.Println("the following commands are supported:") 16 | fmt.Printf("\n") 17 | fmt.Printf("\t%-15s%s\n", "meta", "Display meta info, such as versions") 18 | fmt.Printf("\t %s\n", "Usage: client meta") 19 | fmt.Printf("\n") 20 | fmt.Printf("\t%-15s%s\n", "word-present", "Check if the word is present in the db or as an extension") 21 | fmt.Printf("\t %s\n", "Usage: client word-present word") 22 | fmt.Printf("\n") 23 | fmt.Printf("\t%-15s%s\n", "word-stopword", "Check if the word is considered a stopword") 24 | fmt.Printf("\t %s\n", "Usage: client word-stopword word") 25 | fmt.Printf("\n") 26 | fmt.Printf("\t%-15s%s\n", "search", "Search for word or property") 27 | fmt.Printf("\t %s\n", "For usage run client search and see instructions from there") 28 | fmt.Printf("\n") 29 | fmt.Printf("\t%-15s%s\n", "similar-words", "Search for similar words within the specified 
certainty") 30 | fmt.Printf("\t %s\n", "Usage: client similar-words word certainty") 31 | fmt.Printf("\n") 32 | fmt.Printf("\t%-15s%s\n", "extend", "Extend the contextionary with custom concepts") 33 | fmt.Printf("\t %s\n", "Usage: client extend newconcept \"definition of the new concept\"") 34 | fmt.Printf("\n") 35 | fmt.Printf("\t%-15s%s\n", "vectorize", "Vectorize any string") 36 | fmt.Printf("\t %s\n", "Usage: client vectorize \"input string to vectorize\"") 37 | fmt.Printf("\t%-15s%s\n", "multi-vector-for-word", "Vectorize multiple strings") 38 | fmt.Printf("\t %s\n", "Usage: client multi-vector-for-word \"word1 word2 word3 ... wordN\"") 39 | } 40 | 41 | func main() { 42 | conn, err := grpc.Dial("localhost:9999", grpc.WithInsecure()) 43 | if err != nil { 44 | fmt.Fprintf(os.Stderr, "couldn't connect: %s", err) 45 | os.Exit(1) 46 | } 47 | defer conn.Close() 48 | 49 | client := pb.NewContextionaryClient(conn) 50 | 51 | args := os.Args[1:] 52 | if len(args) == 0 { 53 | fmt.Fprintf(os.Stderr, "no command provided, try 'word-present'\n") 54 | os.Exit(1) 55 | } 56 | 57 | cmd := args[0] 58 | switch cmd { 59 | case "help": 60 | help() 61 | case "meta", "version": 62 | meta(client, args[1:]) 63 | case "word-present": 64 | wordPresent(client, args[1:]) 65 | case "word-stopword": 66 | wordStopword(client, args[1:]) 67 | case "search": 68 | search(client, args[1:]) 69 | case "similar-words": 70 | similarWords(client, args[1:]) 71 | case "extend": 72 | extend(client, args[1:]) 73 | case "vectorize": 74 | vectorize(client, args[1:]) 75 | case "multi-vector-for-word": 76 | multiVecForWord(client, args[1:]) 77 | 78 | default: 79 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd) 80 | os.Exit(1) 81 | } 82 | } 83 | func meta(client pb.ContextionaryClient, args []string) { 84 | ctx := context.Background() 85 | 86 | res, err := client.Meta(ctx, &pb.MetaParams{}) 87 | if err != nil { 88 | fmt.Fprintf(os.Stderr, "ERROR: couldn't display meta: %s", err) 89 | os.Exit(1) 90 | } 
91 | 92 | fmt.Printf("%#v\n", res) 93 | } 94 | 95 | func wordPresent(client pb.ContextionaryClient, args []string) { 96 | if len(args) == 0 { 97 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n") 98 | os.Exit(1) 99 | } 100 | 101 | ctx := context.Background() 102 | 103 | for _, word := range args { 104 | res, err := client.IsWordPresent(ctx, &pb.Word{Word: word}) 105 | if err != nil { 106 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err) 107 | os.Exit(1) 108 | } 109 | if res.Present { 110 | fmt.Printf("word '%s' is present in the contextionary\n", word) 111 | } else { 112 | fmt.Printf("word '%s' is NOT present in the contextionary\n", word) 113 | } 114 | } 115 | } 116 | 117 | func similarWords(client pb.ContextionaryClient, args []string) { 118 | var word string 119 | var certainty float32 120 | 121 | if len(args) == 0 { 122 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to find similarities to\n") 123 | os.Exit(1) 124 | } 125 | word = args[0] 126 | 127 | if len(args) == 1 { 128 | fmt.Fprintf(os.Stderr, "need at least one other argument: the minimum required certainty\n") 129 | os.Exit(1) 130 | } 131 | 132 | c, err := strconv.ParseFloat(args[1], 32) 133 | if err != nil { 134 | fmt.Fprintf(os.Stderr, "couldnt parse certainty: %v\n", err) 135 | os.Exit(1) 136 | } 137 | certainty = float32(c) 138 | 139 | res, err := client.SafeGetSimilarWordsWithCertainty(context.Background(), &pb.SimilarWordsParams{ 140 | Certainty: certainty, 141 | Word: word, 142 | }) 143 | if err != nil { 144 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get similar words: %s", err) 145 | os.Exit(1) 146 | } 147 | 148 | for _, word := range res.Words { 149 | fmt.Printf("🥳 %s\n", word.Word) 150 | } 151 | } 152 | func extend(client pb.ContextionaryClient, args []string) { 153 | if len(args) != 2 { 154 | fmt.Fprintf(os.Stderr, "need two arguments, the concept to add/extend and its definition\n") 155 | os.Exit(1) 156 | } 157 
| concept := args[0] 158 | definition := strings.ToLower(args[1]) 159 | 160 | _, err := client.AddExtension(context.Background(), &pb.ExtensionInput{ 161 | Concept: concept, 162 | Definition: definition, 163 | Weight: 1, 164 | }) 165 | if err != nil { 166 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 167 | os.Exit(1) 168 | } else { 169 | fmt.Fprintf(os.Stdout, "Success!") 170 | os.Exit(0) 171 | } 172 | } 173 | 174 | func vectorize(client pb.ContextionaryClient, args []string) { 175 | if len(args) != 1 { 176 | fmt.Fprintf(os.Stderr, "need one argument: the input string to vectorize") 177 | os.Exit(1) 178 | } 179 | input := args[0] 180 | 181 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{ 182 | Corpi: []string{input}, 183 | }) 184 | if err != nil { 185 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 186 | os.Exit(1) 187 | } else { 188 | fmt.Fprintf(os.Stdout, "Success: %v\n", res.Entries) 189 | fmt.Fprintf(os.Stdout, "Source: %v\n", res.Source) 190 | os.Exit(0) 191 | } 192 | } 193 | 194 | func multiVecForWord(client pb.ContextionaryClient, args []string) { 195 | if len(args) < 1 { 196 | fmt.Fprintf(os.Stderr, "need at least one argument: the input word to vectorize") 197 | os.Exit(1) 198 | } 199 | 200 | words := make([]*pb.Word, len(args)) 201 | for i, word := range args { 202 | words[i] = &pb.Word{Word: word} 203 | } 204 | 205 | res, err := client.MultiVectorForWord(context.Background(), &pb.WordList{ 206 | Words: words, 207 | }) 208 | if err != nil { 209 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 210 | os.Exit(1) 211 | } else { 212 | fmt.Fprintf(os.Stdout, "Success: %v", res.Vectors) 213 | os.Exit(0) 214 | } 215 | } 216 | 217 | func wordStopword(client pb.ContextionaryClient, args []string) { 218 | if len(args) == 0 { 219 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n") 220 | os.Exit(1) 221 | } 222 | 223 | ctx := context.Background() 224 | 225 | for _, word := range args { 226 | res, err := 
client.IsWordStopword(ctx, &pb.Word{Word: word}) 227 | if err != nil { 228 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err) 229 | os.Exit(1) 230 | } 231 | if res.Stopword { 232 | fmt.Printf("word '%s' is a stopword\n", word) 233 | } else { 234 | fmt.Printf("word '%s' is not a stopword\n", word) 235 | } 236 | } 237 | } 238 | 239 | func search(client pb.ContextionaryClient, args []string) { 240 | if len(args) == 0 { 241 | fmt.Fprintf(os.Stderr, "need at least one other argument: either 'class' or 'property' \n") 242 | os.Exit(1) 243 | } 244 | 245 | cmd := args[0] 246 | switch cmd { 247 | case "class": 248 | searchClass(client, args[1:]) 249 | default: 250 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd) 251 | os.Exit(1) 252 | } 253 | } 254 | 255 | func searchClass(client pb.ContextionaryClient, args []string) { 256 | if len(args) == 0 { 257 | fmt.Fprintf(os.Stderr, "need at least one other argument the search term\n") 258 | os.Exit(1) 259 | } 260 | 261 | if len(args) == 1 { 262 | fmt.Fprintf(os.Stderr, "need at least one other argument the desired certainty\n") 263 | os.Exit(1) 264 | } 265 | 266 | searchTerm := args[0] 267 | certainty, err := strconv.ParseFloat(args[1], 32) 268 | if err != nil { 269 | fmt.Fprintf(os.Stderr, "cannot parse certainty '%s'\n", args[1]) 270 | os.Exit(1) 271 | } 272 | 273 | params := &pb.SchemaSearchParams{ 274 | Certainty: float32(certainty), 275 | Name: searchTerm, 276 | } 277 | 278 | ctx := context.Background() 279 | res, err := client.SchemaSearch(ctx, params) 280 | if err != nil { 281 | fmt.Fprintf(os.Stderr, "schema search failed: %s", err) 282 | os.Exit(1) 283 | } 284 | 285 | if len(res.Results) == 0 { 286 | fmt.Println("😵 nothing found") 287 | } 288 | 289 | for _, class := range res.Results { 290 | fmt.Printf("🥳 %s (Certainty: %f)\n", class.Name, class.Certainty) 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /compoundsplitting/dictionary.go: 
-------------------------------------------------------------------------------- 1 | package compoundsplitting 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | 11 | // Dictionary filter for the splitting algorithm 12 | // based on the words in the contextionary 13 | type ContextionaryDict struct { 14 | dict map[string]int // storing the word and its occurrence 15 | } 16 | 17 | // NewContextionaryDict 18 | // uses a dictionary file that was created using the preprocessing procedures 19 | func NewContextionaryDict(contextionaryDictFile string) (*ContextionaryDict, error) { 20 | file, err := os.Open(contextionaryDictFile) 21 | if err != nil { 22 | return nil, err 23 | } 24 | defer file.Close() 25 | 26 | dict := &ContextionaryDict{ 27 | dict: make(map[string]int, 400000), 28 | } 29 | 30 | scanner := bufio.NewScanner(file) 31 | for scanner.Scan() { 32 | line := scanner.Text() 33 | split := strings.Split(line, ",") 34 | occurrence, err := strconv.Atoi(split[1]) 35 | if err != nil { 36 | return nil, err 37 | } 38 | dict.dict[split[0]] = occurrence 39 | } 40 | 41 | return dict, nil 42 | } 43 | 44 | // Contains true if word is in contextionary 45 | func (cd *ContextionaryDict) Contains(word string) bool { 46 | _, exists := cd.dict[word] 47 | return exists 48 | } 49 | 50 | //Score prefers long and few words 51 | func (cd *ContextionaryDict) Score(phrase []string) float64 { 52 | // Prefer longer words as scoring 53 | // Assumption is that the compound words are on average more similar to splittings that 54 | // share most of the characters with the compound. 
// NoopSplitter is a compound splitter that performs no splitting at all.
// It satisfies the splitter contract while always reporting that a word
// cannot be decomposed into compounds.
type NoopSplitter struct{}

// NewNoopSplitter constructs a no-op splitter.
func NewNoopSplitter() NoopSplitter {
	var s NoopSplitter
	return s
}

// Split always yields an empty (non-nil) result and never fails.
func (n NoopSplitter) Split(words string) ([]string, error) {
	empty := []string{}
	return empty, nil
}
phrase is. 23 | // If a compound word can be splitted into multiple phrases it will choose the one with the highest score. 24 | Score(phrase []string) float64 25 | // Contains is true if the word is in the dictionary 26 | Contains(word string) bool 27 | } 28 | 29 | // Splitter builds a tree of compound splits and selects 30 | // the best option based on a scoring mechanism 31 | type Splitter struct { 32 | dict Dictionary 33 | cancelAfter time.Duration 34 | } 35 | 36 | // New Splitter recognizing words given by dict and 37 | // selecting split phrases based on scoring 38 | func NewSplitter(dict Dictionary) *Splitter { 39 | return &Splitter{ 40 | dict: dict, 41 | cancelAfter: cancelSplittingAfter, 42 | } 43 | } 44 | 45 | type CompoundSplit struct { 46 | // Combinations of compound combinations in a phrase 47 | combinations []*Node 48 | } 49 | 50 | // Split a compound word into its compounds 51 | func (sp *Splitter) Split(word string) ([]string, error) { 52 | 53 | if len(word) > maxWordLength { 54 | return []string{}, nil 55 | } 56 | 57 | compoundSplit := CompoundSplit{} 58 | 59 | // spawn a new context that cancels the recursion if we are spending too much 60 | // time on it 61 | ctx, cancel := context.WithTimeout(context.Background(), sp.cancelAfter) 62 | defer cancel() 63 | 64 | err := sp.findAllWordCombinations(ctx, word, &compoundSplit) 65 | if err != nil { 66 | return nil, err 67 | } 68 | combinations := compoundSplit.getAllWordCombinations(ctx) 69 | maxScore := 0.0 70 | maxPhrase := []string{} 71 | for _, combination := range combinations { 72 | currentScore := sp.dict.Score(combination) 73 | if len(maxPhrase) == 0 { 74 | // Initialize if score is negative 75 | maxScore = currentScore 76 | maxPhrase = combination 77 | } 78 | if currentScore > maxScore { 79 | maxScore = currentScore 80 | maxPhrase = combination 81 | } 82 | } 83 | return maxPhrase, nil 84 | } 85 | 86 | func (cs *CompoundSplit) insertCompound(ctx context.Context, word string, 87 | startIndex int) 
error { 88 | compound := NewNode(word, startIndex) 89 | appended := false 90 | for _, combination := range cs.combinations { 91 | // For all possible combinations 92 | 93 | leaves := combination.RecursivelyFindLeavesBeforeIndex(ctx, startIndex) 94 | for _, leave := range leaves { 95 | // Append the new compound to the leaves 96 | 97 | appended = true 98 | err := leave.AddChild(compound) 99 | if err != nil { 100 | return err 101 | } 102 | } 103 | } 104 | if !appended { 105 | // if compound was not added to any leave add it to combinations 106 | cs.combinations = append(cs.combinations, compound) 107 | } 108 | return nil 109 | } 110 | 111 | func (sp *Splitter) findAllWordCombinations(ctx context.Context, str string, compoundSplit *CompoundSplit) error { 112 | compoundsUsed := 0 113 | for offset, _ := range str { 114 | // go from left to right and choose offsetted substring 115 | offsetted := str[offset:] 116 | 117 | for i := 1; i <= len(offsetted); i++ { 118 | // go from left to right to find a word 119 | word := offsetted[:i] 120 | if len(word) < minCompoundWordLength { 121 | continue 122 | } 123 | 124 | if sp.dict.Contains(word) { 125 | compoundsUsed += 1 126 | if compoundsUsed == maxNumberTreeNodes { 127 | // Tree is getting out of bounds stopping for performance 128 | return nil 129 | } 130 | err := compoundSplit.insertCompound(ctx, word, offset) 131 | if err != nil { 132 | return err 133 | } 134 | } 135 | } 136 | } 137 | return nil 138 | } 139 | 140 | func (cs *CompoundSplit) getAllWordCombinations(ctx context.Context) [][]string { 141 | wordCombinations := [][]string{} 142 | 143 | for _, combination := range cs.combinations { 144 | wordCombinations = append(wordCombinations, 145 | combination.RecursivelyBuildNames(ctx)...) 
146 | } 147 | 148 | return wordCombinations 149 | } 150 | 151 | // Node for of the word tree 152 | type Node struct { 153 | name string 154 | children []*Node 155 | startIndex int // inclusiv 156 | endIndex int // exclusive 157 | } 158 | 159 | // NewNode from node name and in compoundword index 160 | func NewNode(name string, startIndex int) *Node { 161 | return &Node{ 162 | name: name, 163 | children: []*Node{}, 164 | startIndex: startIndex, 165 | endIndex: startIndex + len(name), 166 | } 167 | } 168 | 169 | // AddChild node to node 170 | func (node *Node) AddChild(newChildNode *Node) error { 171 | if newChildNode.startIndex < node.endIndex { 172 | return fmt.Errorf("Child starts at %v but this node ends at %v can't add as child", newChildNode.startIndex, node.endIndex) 173 | } 174 | node.children = append(node.children, newChildNode) 175 | return nil 176 | } 177 | 178 | func (node *Node) findChildNodesBeforeIndex(index int) []*Node { 179 | childrensThatEndBeforeIndex := []*Node{} 180 | 181 | for _, child := range node.children { 182 | if child.endIndex <= index { 183 | childrensThatEndBeforeIndex = append(childrensThatEndBeforeIndex, child) 184 | } 185 | } 186 | 187 | return childrensThatEndBeforeIndex 188 | } 189 | 190 | // RecursivelyBuildNames of compounds 191 | func (node *Node) RecursivelyBuildNames(ctx context.Context) [][]string { 192 | compoundName := [][]string{} 193 | if ctx.Err() != nil { 194 | // we've been going recursively too long, abort! 195 | compoundName = append(compoundName, []string{node.name}) 196 | return compoundName 197 | } 198 | 199 | for _, child := range node.children { 200 | childNames := child.RecursivelyBuildNames(ctx) 201 | 202 | for _, childName := range childNames { 203 | // Add the name of this node first 204 | fullName := []string{node.name} 205 | fullName = append(fullName, childName...) 
206 | compoundName = append(compoundName, fullName) 207 | } 208 | } 209 | if len(compoundName) == 0 { 210 | // This is a leave node 211 | compoundName = append(compoundName, []string{node.name}) 212 | } 213 | 214 | return compoundName 215 | } 216 | 217 | // RecursivelyFindLeavesBeforeIndex where to add a new node 218 | func (node *Node) RecursivelyFindLeavesBeforeIndex(ctx context.Context, index int) []*Node { 219 | foundLeaves := []*Node{} 220 | if ctx.Err() != nil { 221 | // we've been going recursively too long, abort! 222 | return foundLeaves 223 | } 224 | 225 | children := node.findChildNodesBeforeIndex(index) 226 | for _, child := range children { 227 | leaves := child.RecursivelyFindLeavesBeforeIndex(ctx, index) 228 | if len(leaves) == 0 { 229 | // There are no leaves this means the child node is already a leave 230 | foundLeaves = append(foundLeaves, child) 231 | } else { 232 | // Found leaves use them instead of direct child 233 | foundLeaves = append(foundLeaves, leaves...) 234 | } 235 | } 236 | 237 | if len(foundLeaves) == 0 && node.endIndex <= index { 238 | // This node is the leave 239 | foundLeaves = append(foundLeaves, node) 240 | } 241 | 242 | return foundLeaves 243 | } 244 | 245 | // NewEmptyTestSplitter creates a splitter, 246 | // that does not know any words and 247 | // thus is not able to split any words 248 | func NewEmptyTestSplitter() *Splitter { 249 | dictMock := &DictMock{ 250 | scores: map[string]float64{}, 251 | } 252 | return &Splitter{ 253 | dict: dictMock, 254 | } 255 | } 256 | 257 | func NewTestSplitter(wordScoreMapping map[string]float64) *Splitter { 258 | dict := &DictMock{ 259 | scores: wordScoreMapping, 260 | } 261 | return &Splitter{ 262 | dict: dict, 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /compoundsplitting/splitter_test.go: -------------------------------------------------------------------------------- 1 | package compoundsplitting 2 | 3 | import ( 4 | "context" 
5 | "fmt" 6 | "testing" 7 | "time" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestSplitTreeSplitter(t *testing.T) { 14 | dictMock := &DictMock{ 15 | scores: map[string]float64{ 16 | "drie": 2.0, 17 | "hoek": 2.0, 18 | "brood": 4.0, 19 | "driehoek": 5.0, 20 | "broodje": 5.0, 21 | }, 22 | } 23 | 24 | ts := Splitter{ 25 | dict: dictMock, 26 | cancelAfter: 500 * time.Millisecond, 27 | } 28 | 29 | // drie hoek brood 30 | // broodje 31 | // driehoek brood 32 | // broodje 33 | 34 | 35 | cs := CompoundSplit{} 36 | 37 | ts.findAllWordCombinations(context.Background(), "driehoeksbroodje", &cs) 38 | 39 | combinations := cs.getAllWordCombinations(context.Background()) 40 | assert.Equal(t, 4, len(combinations)) 41 | for _, combination := range combinations { 42 | fmt.Printf("%v\n", combination) 43 | } 44 | 45 | splited, err := ts.Split("driehoeksbroodje") 46 | assert.Nil(t, err) 47 | require.Equal(t, 2, len(splited)) 48 | assert.Equal(t, "driehoek", splited[0]) 49 | assert.Equal(t, "broodje", splited[1]) 50 | 51 | // Test no result 52 | splited, err = ts.Split("raupenprozessionsspinner") 53 | assert.Nil(t, err) 54 | assert.Equal(t, 0, len(splited), "Expected no result since no substring is in the dict") 55 | } 56 | 57 | func TestNegativeScore(t *testing.T) { 58 | dictMock := &DictMock{ 59 | scores: map[string]float64{ 60 | "drie": -10.0, 61 | "hoek": -10.0, 62 | "brood": -8.0, 63 | "driehoek": -2.0, 64 | "broodje": -2.0, 65 | }, 66 | } 67 | 68 | ts := NewSplitter(dictMock) 69 | 70 | splited, err := ts.Split("driehoeksbroodje") 71 | assert.Nil(t, err) 72 | assert.Equal(t, 2, len(splited)) 73 | assert.Equal(t, "driehoek", splited[0]) 74 | assert.Equal(t, "broodje", splited[1]) 75 | } 76 | 77 | func TestInsertCompound(t *testing.T) { 78 | 79 | t.Run("Add a new word", func(t *testing.T) { 80 | ts := CompoundSplit{} 81 | ts.insertCompound(context.Background(), "test", 0) 82 | 83 | assert.Equal(t, 1, 
len(ts.combinations)) 84 | assert.Equal(t, "test", ts.combinations[0].name) 85 | }) 86 | 87 | t.Run("Add a two words", func(t *testing.T) { 88 | ts := CompoundSplit{} 89 | ts.insertCompound(context.Background(), "test", 0) 90 | ts.insertCompound(context.Background(), "testje", 0) 91 | 92 | assert.Equal(t, 2, len(ts.combinations)) 93 | assert.Equal(t, "test", ts.combinations[0].name) 94 | assert.Equal(t, "testje", ts.combinations[1].name) 95 | }) 96 | 97 | t.Run("Add a two words different index", func(t *testing.T) { 98 | ts := CompoundSplit{} 99 | 100 | // phrase: testje 101 | ts.insertCompound(context.Background(), "test", 0) 102 | ts.insertCompound(context.Background(), "stje", 2) 103 | 104 | assert.Equal(t, 2, len(ts.combinations)) 105 | assert.Equal(t, "test", ts.combinations[0].name) 106 | assert.Equal(t, "stje", ts.combinations[1].name) 107 | }) 108 | 109 | t.Run("Add a two words different index", func(t *testing.T) { 110 | ts := CompoundSplit{} 111 | 112 | // phrase: testjenuttig 113 | // 123456789111 114 | // 012 115 | ts.insertCompound(context.Background(), "test", 0) 116 | ts.insertCompound(context.Background(), "nuttig", 8) 117 | 118 | assert.Equal(t, 1, len(ts.combinations)) 119 | phrase := ts.combinations[0] 120 | assert.Equal(t, "test", phrase.name) 121 | assert.Equal(t, "nuttig", phrase.children[0].name) 122 | 123 | }) 124 | 125 | t.Run("Add a two combinations", func(t *testing.T) { 126 | ts := CompoundSplit{} 127 | 128 | // phrase: testjenuttig 129 | // 123456789111 130 | // 012 131 | ts.insertCompound(context.Background(), "test", 0) 132 | ts.insertCompound(context.Background(), "est", 1) 133 | ts.insertCompound(context.Background(), "nuttig", 8) 134 | 135 | assert.Equal(t, 2, len(ts.combinations)) 136 | phrase := ts.combinations[0] 137 | assert.Equal(t, "test", phrase.name) 138 | assert.Equal(t, "nuttig", phrase.children[0].name) 139 | 140 | phrase = ts.combinations[1] 141 | assert.Equal(t, "est", phrase.name) 142 | assert.Equal(t, "nuttig", 
phrase.children[0].name) 143 | }) 144 | 145 | t.Run("Add driehoeksbroodje", func(t *testing.T) { 146 | ts := CompoundSplit{} 147 | 148 | // phrase: driehoeksbroodje 149 | // 1234567891111111 150 | // 0123456 151 | ts.insertCompound(context.Background(), "drie", 0) 152 | ts.insertCompound(context.Background(), "driehoek", 0) 153 | ts.insertCompound(context.Background(), "hoek", 5) 154 | ts.insertCompound(context.Background(), "brood", 10) 155 | ts.insertCompound(context.Background(), "broodje", 10) 156 | 157 | // drie hoek brood 158 | // broodje 159 | 160 | // driehoek brood 161 | // broodje 162 | 163 | assert.Equal(t, 2, len(ts.combinations)) 164 | }) 165 | 166 | } 167 | 168 | func TestNode(t *testing.T) { 169 | 170 | t.Run("New Node", func(t *testing.T) { 171 | node := NewNode("test", 2) 172 | assert.Equal(t, 6, node.endIndex) 173 | }) 174 | 175 | t.Run("Add child", func(t *testing.T) { 176 | node1 := NewNode("test", 2) 177 | node2 := NewNode("case", 6) 178 | node3 := NewNode("ase", 7) 179 | err := node1.AddChild(node2) 180 | assert.Nil(t, err) 181 | err = node1.AddChild(node3) 182 | assert.Nil(t, err) 183 | 184 | assert.Equal(t, 2, len(node1.children)) 185 | }) 186 | 187 | t.Run("Add wrong index", func(t *testing.T) { 188 | node1 := NewNode("test", 2) 189 | node2 := NewNode("esting", 3) 190 | err := node1.AddChild(node2) 191 | assert.NotNil(t, err) 192 | }) 193 | 194 | t.Run("find children before index", func(t *testing.T) { 195 | // testcasees 196 | // 0123456789 197 | test := NewNode("test", 0) 198 | caseN := NewNode("case", 4) 199 | as := NewNode("as", 5) 200 | see := NewNode("see", 6) 201 | es := NewNode("es", 8) 202 | 203 | // test case es 204 | // test as es 205 | // test see 206 | 207 | test.AddChild(caseN) 208 | test.AddChild(as) 209 | test.AddChild(see) 210 | caseN.AddChild(es) 211 | as.AddChild(es) 212 | 213 | // no child nodes that end before index 6 214 | assert.Equal(t, 0, len(test.findChildNodesBeforeIndex(6))) 215 | // as ends at 7 216 | 
assert.Equal(t, 1, len(test.findChildNodesBeforeIndex(7))) 217 | // case ends at 8 218 | assert.Equal(t, 2, len(test.findChildNodesBeforeIndex(8))) 219 | // see ends at 9 220 | assert.Equal(t, 3, len(test.findChildNodesBeforeIndex(9))) 221 | }) 222 | 223 | t.Run("find leaves before index", func(t *testing.T) { 224 | // testcasees 225 | // 0123456789 226 | test := NewNode("test", 0) 227 | caseN := NewNode("case", 4) 228 | as := NewNode("as", 5) 229 | see := NewNode("see", 6) 230 | es := NewNode("es", 8) 231 | 232 | // test case es 233 | // test as es 234 | // test see 235 | 236 | test.AddChild(caseN) 237 | test.AddChild(as) 238 | test.AddChild(see) 239 | caseN.AddChild(es) 240 | as.AddChild(es) 241 | 242 | assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 0))) 243 | assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 3))) 244 | assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4))) 245 | node := test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4)[0] 246 | assert.Equal(t, "test", node.name) 247 | 248 | assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7))) 249 | node = test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7)[0] 250 | assert.Equal(t, "as", node.name) 251 | 252 | assert.Equal(t, 2, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 8))) 253 | }) 254 | 255 | } 256 | 257 | func TestSplitVeryLongWords(t *testing.T) { 258 | dictMock := &DictMock{ 259 | scores: map[string]float64{ 260 | "aaaa": 1.0, 261 | "bbbb": 1.0, 262 | }, 263 | } 264 | 265 | ts := Splitter{ 266 | dict: dictMock, 267 | } 268 | 269 | t1 := time.Now() 270 | 271 | split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaa") 272 | 273 | t2 := time.Now() 274 | diff := t2.Sub(t1) 275 | 276 | assert.Nil(t, err) 277 | assert.Less(t, 0, len(split)) 278 | 279 | if diff > 
time.Millisecond*200 { 280 | fmt.Errorf("Splitter took too long") 281 | t.Fail() 282 | } 283 | } 284 | 285 | func TestSplitTooLongWords(t *testing.T) { 286 | dictMock := &DictMock{ 287 | scores: map[string]float64{ 288 | "aaaa": 1.0, 289 | "bbbb": 1.0, 290 | }, 291 | } 292 | 293 | ts := Splitter{ 294 | dict: dictMock, 295 | } 296 | 297 | split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbb") 298 | 299 | assert.Nil(t, err) 300 | assert.Equal(t, 0, len(split)) 301 | } 302 | 303 | func TestUnboundTree(t *testing.T) { 304 | dictMock := &DictMock{ 305 | scores: map[string]float64{ 306 | "5555": 1.0, 307 | "55555": 1.0, 308 | "5555555555555555": 1.0, 309 | }, 310 | } 311 | 312 | ts := Splitter{ 313 | dict: dictMock, 314 | } 315 | 316 | t1 := time.Now() 317 | 318 | _, err := ts.Split("ql55555555555555555555555555555") 319 | 320 | t2 := time.Now() 321 | diff := t2.Sub(t1) 322 | 323 | assert.Nil(t, err) 324 | 325 | if diff > time.Millisecond*200 { 326 | fmt.Errorf("Splitter took too long") 327 | t.Fail() 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /contextionary/contextionary.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package contextionary; 4 | 5 | service Contextionary { 6 | rpc IsWordStopword(Word) returns (WordStopword) {} 7 | rpc IsWordPresent(Word) returns (WordPresent) {} 8 | rpc SchemaSearch(SchemaSearchParams) returns (SchemaSearchResults) {} 9 | rpc SafeGetSimilarWordsWithCertainty(SimilarWordsParams) returns (SimilarWordsResults) {} 10 | rpc VectorForWord(Word) returns (Vector) {} 11 | rpc MultiVectorForWord(WordList) returns (VectorList) {} 12 | rpc VectorForCorpi(Corpi) returns (Vector) {} 13 | rpc NearestWordsByVector(VectorNNParams) returns (NearestWords) {} 14 | rpc MultiNearestWordsByVector(VectorNNParamsList) returns (NearestWordsList) {} 15 | rpc 
Meta(MetaParams) returns (MetaOverview) {}
  rpc AddExtension(ExtensionInput) returns (AddExtensionResult) {}
}

// A user-defined concept extension to be stored in the contextionary.
message ExtensionInput {
  string concept = 1;
  string definition = 2;
  float weight = 3;
}

message AddExtensionResult { }

message MetaParams {}

// Version and size information about the loaded contextionary.
message MetaOverview {
  string version = 1;
  int64 wordCount = 2;
}

message Word {
  string word = 1;
}

message WordList {
  repeated Word words = 1;
}

message WordPresent {
  bool present = 1;
}

// A word/corpus vector plus the input elements it was derived from.
message Vector {
  repeated VectorEntry entries = 1;
  repeated InputElement source = 2;
};

message InputElement {
  string concept = 1;
  float weight = 2;
  uint64 occurrence = 3;
};

message VectorList {
  repeated Vector vectors = 1;
}

message VectorEntry {
  float Entry = 1;
}

// Parameters for a nearest-neighbour search by vector: n results, k trees.
message VectorNNParams {
  Vector vector = 1;
  int32 k = 2;
  int32 n = 3;
}

message VectorNNParamsList {
  repeated VectorNNParams Params = 1;
}

message Corpi {
  repeated string corpi = 1;
  repeated Override overrides = 2;
}

message Override {
  string word = 1;
  string expression = 2;
}

message WordStopword {
  bool stopword = 1;
}

message SimilarWordsParams {
  string word = 1;
  float certainty = 2;
}

message SimilarWordsResults {
  repeated Word words = 1;
}

message NearestWords {
  repeated string words = 1;
  repeated float distances = 2;
  VectorList vectors = 3;
}

message NearestWordsList {
  repeated NearestWords words = 1;
}

message Keyword {
  string keyword = 1;
  float weight = 2;
}

enum SearchType {
  CLASS=0;
  PROPERTY=1;
};

message SchemaSearchParams {
  SearchType searchType = 1;
  string name = 2;
  repeated Keyword keywords = 3;
  // NOTE(review): field number 4 is skipped — presumably a removed field;
  // consider declaring it with `reserved 4;` to prevent accidental reuse.
  float certainty = 5;
}

message SchemaSearchResults {
  SearchType type = 1;
  repeated SchemaSearchResult results = 2;
}

// NOTE(review): field number 2 is skipped here as well — see note above.
message SchemaSearchResult {
  string name = 1;
  float certainty = 3;
}
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/annoy_test.go:
--------------------------------------------------------------------------------
/* _ _
 *__ _____ __ ___ ___ __ _| |_ ___
 *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
 * \ V V / __/ (_| |\ V /| | (_| | || __/
 * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
 *
 * Copyright © 2016 - 2019 Weaviate. All rights reserved.
 * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
 * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
 * CONTACT: hello@weaviate.io
 */
/*
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
24 | */ 25 | 26 | package annoyindex_test 27 | 28 | import ( 29 | "math" 30 | "math/rand" 31 | "os" 32 | "testing" 33 | 34 | "github.com/weaviate/contextionary/contextionary/core/annoyindex" 35 | 36 | "github.com/stretchr/testify/assert" 37 | "github.com/stretchr/testify/suite" 38 | ) 39 | 40 | type AnnoyTestSuite struct { 41 | suite.Suite 42 | } 43 | 44 | func Round(f float64) float64 { 45 | return math.Floor(f + 0.5) 46 | } 47 | 48 | func RoundPlus(f float64, places int) float64 { 49 | shift := math.Pow(10, float64(places)) 50 | return Round(f*shift) / shift 51 | } 52 | 53 | func (suite *AnnoyTestSuite) SetupTest() { 54 | } 55 | 56 | func (suite *AnnoyTestSuite) TestFileHandling() { 57 | index := annoyindex.NewAnnoyIndexAngular(3) 58 | index.AddItem(0, []float32{0, 0, 1}) 59 | index.AddItem(1, []float32{0, 1, 0}) 60 | index.AddItem(2, []float32{1, 0, 0}) 61 | index.Build(10) 62 | 63 | index.Save("go_test.ann") 64 | 65 | info, err := os.Stat("go_test.ann") 66 | if err != nil { 67 | assert.Fail(suite.T(), "Failed to create file, file not found") 68 | } 69 | if info.Size() == 0 { 70 | assert.Fail(suite.T(), "Failed to create file, file size zero") 71 | } 72 | 73 | annoyindex.DeleteAnnoyIndexAngular(index) 74 | 75 | index = annoyindex.NewAnnoyIndexAngular(3) 76 | if ret := index.Load("go_test.ann"); ret == false { 77 | assert.Fail(suite.T(), "Failed to load file") 78 | } 79 | annoyindex.DeleteAnnoyIndexAngular(index) 80 | 81 | os.Remove("go_test.ann") 82 | } 83 | 84 | func (suite *AnnoyTestSuite) TestGetNnsByVector() { 85 | index := annoyindex.NewAnnoyIndexAngular(3) 86 | index.AddItem(0, []float32{0, 0, 1}) 87 | index.AddItem(1, []float32{0, 1, 0}) 88 | index.AddItem(2, []float32{1, 0, 0}) 89 | index.Build(10) 90 | 91 | var result []int 92 | index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, &result) 93 | assert.Equal(suite.T(), []int{2, 1, 0}, result) 94 | 95 | index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, &result) 96 | assert.Equal(suite.T(), []int{0, 1, 2}, 
result) 97 | 98 | index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, &result) 99 | assert.Equal(suite.T(), []int{2, 0, 1}, result) 100 | 101 | annoyindex.DeleteAnnoyIndexAngular(index) 102 | } 103 | 104 | func (suite *AnnoyTestSuite) TestGetNnsByItem() { 105 | index := annoyindex.NewAnnoyIndexAngular(3) 106 | index.AddItem(0, []float32{2, 1, 0}) 107 | index.AddItem(1, []float32{1, 2, 0}) 108 | index.AddItem(2, []float32{0, 0, 1}) 109 | index.Build(10) 110 | 111 | var result []int 112 | index.GetNnsByItem(0, 3, -1, &result) 113 | assert.Equal(suite.T(), []int{0, 1, 2}, result) 114 | 115 | index.GetNnsByItem(1, 3, -1, &result) 116 | assert.Equal(suite.T(), []int{1, 0, 2}, result) 117 | 118 | annoyindex.DeleteAnnoyIndexAngular(index) 119 | } 120 | 121 | func (suite *AnnoyTestSuite) TestGetItem() { 122 | index := annoyindex.NewAnnoyIndexAngular(3) 123 | index.AddItem(0, []float32{2, 1, 0}) 124 | index.AddItem(1, []float32{1, 2, 0}) 125 | index.AddItem(2, []float32{0, 0, 1}) 126 | index.Build(10) 127 | 128 | var result []float32 129 | 130 | index.GetItem(0, &result) 131 | assert.Equal(suite.T(), []float32{2, 1, 0}, result) 132 | 133 | index.GetItem(1, &result) 134 | assert.Equal(suite.T(), []float32{1, 2, 0}, result) 135 | 136 | index.GetItem(2, &result) 137 | assert.Equal(suite.T(), []float32{0, 0, 1}, result) 138 | 139 | annoyindex.DeleteAnnoyIndexAngular(index) 140 | } 141 | 142 | func (suite *AnnoyTestSuite) TestGetDistance() { 143 | index := annoyindex.NewAnnoyIndexAngular(2) 144 | index.AddItem(0, []float32{0, 1}) 145 | index.AddItem(1, []float32{1, 1}) 146 | index.Build(10) 147 | 148 | assert.Equal(suite.T(), RoundPlus(math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5), 3), RoundPlus(float64(index.GetDistance(0, 1)), 3)) 149 | 150 | annoyindex.DeleteAnnoyIndexAngular(index) 151 | } 152 | 153 | func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() { 154 | index := annoyindex.NewAnnoyIndexEuclidean(10) 155 | 156 | for j := 0; j < 10000; j += 2 { 157 | p := make([]float32, 0, 
10) 158 | for i := 0; i < 10; i++ { 159 | p = append(p, rand.Float32()) 160 | } 161 | x := make([]float32, 0, 10) 162 | for i := 0; i < 10; i++ { 163 | x = append(x, 1+p[i]+rand.Float32()*1e-2) 164 | } 165 | y := make([]float32, 0, 10) 166 | for i := 0; i < 10; i++ { 167 | y = append(y, 1+p[i]+rand.Float32()*1e-2) 168 | } 169 | index.AddItem(j, x) 170 | index.AddItem(j+1, y) 171 | } 172 | index.Build(10) 173 | for j := 0; j < 10000; j += 2 { 174 | var result []int 175 | index.GetNnsByItem(j, 2, -1, &result) 176 | 177 | assert.Equal(suite.T(), result, []int{j, j + 1}) 178 | 179 | index.GetNnsByItem(j+1, 2, -1, &result) 180 | assert.Equal(suite.T(), result, []int{j + 1, j}) 181 | } 182 | annoyindex.DeleteAnnoyIndexEuclidean(index) 183 | } 184 | 185 | func TestAnnoyTestSuite(t *testing.T) { 186 | suite.Run(t, new(AnnoyTestSuite)) 187 | } 188 | -------------------------------------------------------------------------------- /contextionary/core/annoyindex/annoygomodule.h: -------------------------------------------------------------------------------- 1 | #include "annoylib.h" 2 | #include "kissrandom.h" 3 | 4 | namespace GoAnnoy { 5 | 6 | class AnnoyIndex { 7 | protected: 8 | ::AnnoyIndexInterface *ptr; 9 | 10 | int f; 11 | 12 | public: 13 | ~AnnoyIndex() { 14 | delete ptr; 15 | }; 16 | void addItem(int item, const float* w) { 17 | ptr->add_item(item, w); 18 | }; 19 | void build(int q) { 20 | ptr->build(q); 21 | }; 22 | bool save(const char* filename) { 23 | return ptr->save(filename); 24 | }; 25 | void unload() { 26 | ptr->unload(); 27 | }; 28 | bool load(const char* filename) { 29 | return ptr->load(filename); 30 | }; 31 | float getDistance(int i, int j) { 32 | return ptr->get_distance(i, j); 33 | }; 34 | void getNnsByItem(int item, int n, int search_k, vector* result, vector* distances) { 35 | ptr->get_nns_by_item(item, n, search_k, result, distances); 36 | }; 37 | void getNnsByVector(const float* w, int n, int search_k, vector* result, vector* distances) { 38 | 
    ptr->get_nns_by_vector(w, n, search_k, result, distances);
  };
  // Variants without a distances output buffer.
  void getNnsByItem(int item, int n, int search_k, vector* result) {
    ptr->get_nns_by_item(item, n, search_k, result, NULL);
  };
  void getNnsByVector(const float* w, int n, int search_k, vector* result) {
    ptr->get_nns_by_vector(w, n, search_k, result, NULL);
  };

  int getNItems() {
    return (int)ptr->get_n_items();
  };
  void verbose(bool v) {
    ptr->verbose(v);
  };
  void getItem(int item, vector *v) {
    // resize to the index dimensionality before annoy fills the data
    v->resize(this->f);
    ptr->get_item(item, &v->front());
  };
};

// NOTE(review): the template arguments of ::AnnoyIndex / vector in this copy
// of the source appear to have been stripped (likely lost angle-bracket
// content) — confirm against the upstream annoy headers before editing.
class AnnoyIndexAngular : public AnnoyIndex
{
 public:
  AnnoyIndexAngular(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};

class AnnoyIndexEuclidean : public AnnoyIndex {
 public:
  AnnoyIndexEuclidean(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};

class AnnoyIndexManhattan : public AnnoyIndex {
 public:
  AnnoyIndexManhattan(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};
}
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/kissrandom.h:
--------------------------------------------------------------------------------
#ifndef KISSRANDOM_H
#define KISSRANDOM_H

#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned __int32 uint32_t;
typedef unsigned __int32 uint64_t;
#else
// NOTE(review): the include target is missing in this copy of the source
// (presumably <cstdint> or <stdint.h>, lost angle brackets) — confirm
// against upstream. The MSVC uint64_t typedef above also maps to __int32,
// which looks like the same corruption.
#include
#endif

// KISS = "keep it simple, stupid", but high quality random number generator
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
// http://mathforum.org/kb/message.jspa?messageID=6627731
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)

// 32 bit KISS
struct Kiss32Random {
  uint32_t x;
  uint32_t y;
  uint32_t z;
  uint32_t c;

  // seed must be != 0
  Kiss32Random(uint32_t seed = 123456789) {
    x = seed;
    y = 362436000;
    z = 521288629;
    c = 7654321;
  }

  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry
    uint64_t t = 698769069ULL * z + c;
    c = t >> 32;
    z = (uint32_t) t;

    return x + y + z;
  }
  inline int flip() {
    // Draw random 0 or 1
    return kiss() & 1;
  }
  inline size_t index(size_t n) {
    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
    return kiss() % n;
  }
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};

// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
struct Kiss64Random {
  uint64_t x;
  uint64_t y;
  uint64_t z;
  uint64_t c;

  // seed must be != 0
  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
    x = seed;
    y = 362436362436362436ULL;
    z = 1066149217761810ULL;
    c = 123456123456123456ULL;
  }

  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    uint64_t t = (x<<58)+c;
    c = (x>>6);
    x += t;
// NOTE(review): the source is corrupted from here on — the tail of
// Kiss64Random and the beginning of contextionary/core/combined.go
// (including the signature of find_vector_index_for_item_index) were lost.
// Restore both files from upstream before editing this region.
c += (x= idx.offset && item < (idx.offset+idx.size) {
return ItemIndex(item - idx.offset), idx.index, nil
}
}

return 0, nil, fmt.Errorf("out of index")
}

func (ci *CombinedIndex) ItemIndexToWord(item ItemIndex) (string, error) {
	offsetted_index, vi, err := ci.find_vector_index_for_item_index(item)

	if err != nil {
		return "", err
	}

	word, err :=
(*vi).ItemIndexToWord(offsetted_index) 135 | return word, err 136 | } 137 | 138 | func (ci *CombinedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 139 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item) 140 | 141 | if err != nil { 142 | return 0, err 143 | } 144 | 145 | occ, err := (*vi).ItemIndexToOccurrence(offsetted_index) 146 | return occ, err 147 | } 148 | 149 | func (ci *CombinedIndex) OccurrencePercentile(perc int) uint64 { 150 | max := uint64(0) 151 | 152 | for _, index := range ci.indices { 153 | occ := (*index.index).OccurrencePercentile(perc) 154 | if occ > max { 155 | max = occ 156 | } 157 | } 158 | 159 | return max 160 | } 161 | 162 | func (ci *CombinedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 163 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item) 164 | if err != nil { 165 | return nil, errors.NewInternalf(err.Error()) 166 | } 167 | 168 | word, err := (*vi).GetVectorForItemIndex(offsetted_index) 169 | if err != nil { 170 | return word, errors.NewInternalf(err.Error()) 171 | } 172 | 173 | return word, nil 174 | } 175 | 176 | // Compute the distance between two items. 177 | func (ci *CombinedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 178 | v1, err := ci.GetVectorForItemIndex(a) 179 | if err != nil { 180 | return 0.0, err 181 | } 182 | 183 | v2, err := ci.GetVectorForItemIndex(b) 184 | if err != nil { 185 | return 0.0, err 186 | } 187 | 188 | dist, err := v1.Distance(v2) 189 | if err != nil { 190 | return 0.0, err 191 | } 192 | 193 | return dist, nil 194 | } 195 | 196 | // Get the n nearest neighbours of item, examining k trees. 197 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
198 | func (ci *CombinedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 199 | vec, err := ci.GetVectorForItemIndex(item) 200 | if err != nil { 201 | return nil, nil, fmt.Errorf("could not get vector for item index: %s", err) 202 | } 203 | 204 | return ci.GetNnsByVector(*vec, n, k) 205 | } 206 | 207 | type combined_nn_search_result struct { 208 | item ItemIndex 209 | dist float32 210 | } 211 | 212 | type combined_nn_search_results struct { 213 | items []combined_nn_search_result 214 | ci *CombinedIndex 215 | } 216 | 217 | // SafeGetSimilarWords returns n similar words in the contextionary, 218 | // examining k trees. It is guaratueed to have results, even if the word is 219 | // not in the contextionary. In this case the list only contains the word 220 | // itself. It can then still be used for exact match or levensthein-based 221 | // searches against db backends. 222 | func (ci *CombinedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 223 | return safeGetSimilarWordsFromAny(ci, word, n, k) 224 | } 225 | 226 | // SafeGetSimilarWordsWithCertainty returns similar words in the 227 | // contextionary, if they are close enough to match the required certainty. 228 | // It is guaratueed to have results, even if the word is not in the 229 | // contextionary. In this case the list only contains the word itself. It can 230 | // then still be used for exact match or levensthein-based searches against 231 | // db backends. 
232 | func (ci *CombinedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 233 | return safeGetSimilarWordsWithCertaintyFromAny(ci, word, certainty) 234 | } 235 | 236 | func (a combined_nn_search_results) Len() int { return len(a.items) } 237 | func (a combined_nn_search_results) Swap(i, j int) { a.items[i], a.items[j] = a.items[j], a.items[i] } 238 | func (a combined_nn_search_results) Less(i, j int) bool { 239 | // Sort on distance first, if those are the same, sort on lexographical order of the words. 240 | if a.items[i].dist == a.items[j].dist { 241 | wi, err := a.ci.ItemIndexToWord(a.items[i].item) 242 | if err != nil { 243 | panic("should be there") 244 | } 245 | 246 | wj, err := a.ci.ItemIndexToWord(a.items[j].item) 247 | if err != nil { 248 | panic("should be there") 249 | } 250 | return wi < wj 251 | } else { 252 | return a.items[i].dist < a.items[j].dist 253 | } 254 | } 255 | 256 | // Remove a certain element from the result search. 257 | func (a *combined_nn_search_results) Remove(i int) { 258 | a.items = append(a.items[:i], a.items[i+1:]...) 259 | } 260 | 261 | // Get the n nearest neighbours of item, examining k trees. 262 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 263 | func (ci *CombinedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 264 | results := combined_nn_search_results{ 265 | items: make([]combined_nn_search_result, 0), 266 | ci: ci, 267 | } 268 | 269 | for _, item := range ci.indices { 270 | indices, floats, err := (*item.index).GetNnsByVector(vector, n, k) 271 | if err != nil { 272 | return nil, nil, errors.NewInternalf(err.Error()) 273 | } else { 274 | for i, item_idx := range indices { 275 | results.items = append(results.items, combined_nn_search_result{item: item_idx + ItemIndex(item.offset), dist: floats[i]}) 276 | } 277 | } 278 | } 279 | 280 | sort.Sort(results) 281 | 282 | // Now remove duplicates. 
283 | for i := 1; i < len(results.items); { 284 | if results.items[i].item == results.items[i-1].item { 285 | results.Remove(i) 286 | } else { 287 | i++ // only increment if we're not removing. 288 | } 289 | } 290 | 291 | items := make([]ItemIndex, 0) 292 | floats := make([]float32, 0) 293 | 294 | var max_index int 295 | 296 | if n < len(results.items) { 297 | max_index = n 298 | } else { 299 | max_index = len(results.items) 300 | } 301 | 302 | for i := 0; i < max_index; i++ { 303 | items = append(items, results.items[i].item) 304 | floats = append(floats, results.items[i].dist) 305 | } 306 | 307 | return items, floats, nil 308 | } 309 | -------------------------------------------------------------------------------- /contextionary/core/combined_simple_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "testing" 16 | ) 17 | 18 | func TestSimpleCombinedIndex(t *testing.T) { 19 | builder1 := InMemoryBuilder(3) 20 | builder2 := InMemoryBuilder(3) 21 | builder3 := InMemoryBuilder(3) 22 | 23 | builder1.AddWord("a", NewVector([]float32{1, 0, 0})) 24 | builder2.AddWord("b", NewVector([]float32{0, 1, 0})) 25 | builder3.AddWord("c", NewVector([]float32{0, 0, 1})) 26 | 27 | memory_index1 := Contextionary(builder1.Build(3)) 28 | memory_index2 := Contextionary(builder2.Build(3)) 29 | memory_index3 := Contextionary(builder3.Build(3)) 30 | 31 | var indices123 []Contextionary = []Contextionary{memory_index1, memory_index2, memory_index3} 32 | var indices231 []Contextionary = []Contextionary{memory_index2, memory_index3, memory_index1} 33 | var indices312 []Contextionary = []Contextionary{memory_index3, memory_index1, memory_index2} 34 | 35 | t.Run("indices 123", func(t *testing.T) { test_simple_combined(t, indices123) }) 36 | t.Run("indices 231", func(t *testing.T) { test_simple_combined(t, indices231) }) 37 | t.Run("indices 312", func(t *testing.T) { test_simple_combined(t, indices312) }) 38 | } 39 | 40 | func test_simple_combined(t *testing.T, indices []Contextionary) { 41 | ci, err := CombineVectorIndices(indices) 42 | if err != nil { 43 | panic("should work") 44 | } 45 | 46 | a_idx := ci.WordToItemIndex("a") 47 | if !a_idx.IsPresent() { 48 | panic("should be present") 49 | } 50 | 51 | b_idx := ci.WordToItemIndex("b") 52 | if !b_idx.IsPresent() { 53 | panic("should be present") 54 | } 55 | 56 | c_idx := ci.WordToItemIndex("c") 57 | if !c_idx.IsPresent() { 58 | panic("should be present") 59 | } 60 | 61 | items, _, err := ci.GetNnsByItem(a_idx, 3, 3) 62 | if err != nil { 63 | panic("should work") 64 | } 65 | 66 | assert_eq_idx := func(name string, a, b ItemIndex) 
{ 67 | if a != b { 68 | t.Errorf("Expected %v to be at %v, but was at %b", name, a, b) 69 | } 70 | } 71 | 72 | if len(items) != 3 { 73 | t.Errorf("got length %v, expected 3", len(items)) 74 | t.FailNow() 75 | } 76 | 77 | // assert lexicographical order, if distances are equal 78 | 79 | assert_eq_idx("a", a_idx, items[0]) 80 | assert_eq_idx("b", b_idx, items[1]) 81 | assert_eq_idx("c", c_idx, items[2]) 82 | } 83 | -------------------------------------------------------------------------------- /contextionary/core/component_test.go: -------------------------------------------------------------------------------- 1 | // +build sentence 2 | 3 | package contextionary 4 | 5 | import ( 6 | "fmt" 7 | "testing" 8 | ) 9 | 10 | func TestDevelopmentEnvironmentForContextionary(t *testing.T) { 11 | 12 | // Make sure you have run ./tools/dev/gen_simple_contextionary.sh 13 | // from the project root or downloaded a full contextionary prior 14 | // to running those tests. 15 | 16 | c11y, err := LoadVectorFromDisk("../../tools/dev/example.knn", "../../tools/dev/example.idx") 17 | if err != nil { 18 | t.Fatalf("could not generate c11y: %s", err) 19 | } 20 | 21 | fmt.Printf("here's the c11y, do whatever you want with it: %#v", c11y) 22 | 23 | t.Errorf("... add whatever you like!") 24 | } 25 | -------------------------------------------------------------------------------- /contextionary/core/contextionary.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | 13 | // Package contextionary provides the toolset to add context to words. 
14 | package contextionary 15 | 16 | // ItemIndex is an opaque type that models an index number used to identify a 17 | // word. 18 | type ItemIndex int 19 | 20 | // IsPresent can be used after retrieving a word index (which does not error on 21 | // its own), to see if the word was actually present in the contextionary. 22 | func (i *ItemIndex) IsPresent() bool { 23 | return *i >= 0 24 | } 25 | 26 | // Contextionary is the API to decouple the K-nn interface that is needed for 27 | // Weaviate from a concrete implementation. 28 | type Contextionary interface { 29 | 30 | // Return the number of items that is stored in the index. 31 | GetNumberOfItems() int 32 | 33 | // Returns the length of the used vectors. 34 | GetVectorLength() int 35 | 36 | // Look up a word, return an index. 37 | // Check for presence of the index with index.IsPresent() 38 | WordToItemIndex(word string) ItemIndex 39 | 40 | // Based on an index, return the assosiated word. 41 | ItemIndexToWord(item ItemIndex) (string, error) 42 | 43 | // Based on an index, return the assosiated word. 44 | ItemIndexToOccurrence(item ItemIndex) (uint64, error) 45 | 46 | //OccurrencePercentile shows the occurrence of the mentioned percentile in ascending order 47 | OccurrencePercentile(perc int) uint64 48 | 49 | // Get the vector of an item index. 50 | GetVectorForItemIndex(item ItemIndex) (*Vector, error) 51 | 52 | // Compute the distance between two items. 53 | GetDistance(a ItemIndex, b ItemIndex) (float32, error) 54 | 55 | // Get the n nearest neighbours of item, examining k trees. 56 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 57 | GetNnsByItem(item ItemIndex, n, k int) ([]ItemIndex, []float32, error) 58 | 59 | // Get the n nearest neighbours of item, examining k trees. 60 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
61 | GetNnsByVector(vector Vector, n, k int) ([]ItemIndex, []float32, error) 62 | 63 | // SafeGetSimilarWords returns n similar words in the contextionary, 64 | // examining k trees. It is guaratueed to have results, even if the word is 65 | // not in the contextionary. In this case the list only contains the word 66 | // itself. It can then still be used for exact match or levensthein-based 67 | // searches against db backends. 68 | SafeGetSimilarWords(word string, n, k int) ([]string, []float32) 69 | 70 | // SafeGetSimilarWordsWithCertainty returns similar words in the 71 | // contextionary, if they are close enough to match the required certainty. 72 | // It is guaratueed to have results, even if the word is not in the 73 | // contextionary. In this case the list only contains the word itself. It can 74 | // then still be used for exact match or levensthein-based searches against 75 | // db backends. 76 | SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string 77 | } 78 | -------------------------------------------------------------------------------- /contextionary/core/generator/cmd/generator.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package main 13 | 14 | import ( 15 | "os" 16 | 17 | flags "github.com/jessevdk/go-flags" 18 | "github.com/weaviate/contextionary/contextionary/core/generator" 19 | ) 20 | 21 | func main() { 22 | var options generator.Options 23 | var parser = flags.NewParser(&options, flags.Default) 24 | 25 | if _, err := parser.Parse(); err != nil { 26 | if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp { 27 | os.Exit(0) 28 | } else { 29 | os.Exit(1) 30 | } 31 | } 32 | 33 | generator.Generate(options) 34 | } 35 | -------------------------------------------------------------------------------- /contextionary/core/generator/generator.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package generator 13 | 14 | import ( 15 | "bufio" 16 | "bytes" 17 | "encoding/binary" 18 | "encoding/gob" 19 | "encoding/json" 20 | "log" 21 | "os" 22 | "strconv" 23 | "strings" 24 | 25 | "github.com/syndtr/goleveldb/leveldb" 26 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 27 | ) 28 | 29 | type Options struct { 30 | VectorCSVPath string `short:"c" long:"vector-csv-path" description:"Path to the output file of Glove" required:"true"` 31 | TempDBPath string `short:"t" long:"temp-db-path" description:"Location for the temporary database" default:".tmp_import"` 32 | OutputPrefix string `short:"p" long:"output-prefix" description:"The prefix of the names of the files" required:"true"` 33 | K int `short:"k" description:"number of forrests to generate" default:"20"` 34 | } 35 | 36 | type WordVectorInfo struct { 37 | numberOfWords int 38 | vectorWidth int 39 | k int 40 | metadata JsonMetadata 41 | } 42 | 43 | type JsonMetadata struct { 44 | K int `json:"k"` // the number of parallel forrests. 
45 | } 46 | 47 | func Generate(options Options) { 48 | db, err := leveldb.OpenFile(options.TempDBPath, nil) 49 | defer db.Close() 50 | 51 | if err != nil { 52 | log.Fatalf("Could not open temporary database file %+v", err) 53 | } 54 | 55 | file, err := os.Open(options.VectorCSVPath) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | defer file.Close() 60 | 61 | log.Print("Processing and ordering raw trained data") 62 | info := readVectorsFromFileAndInsertIntoLevelDB(db, file) 63 | 64 | info.k = options.K 65 | info.metadata = JsonMetadata{options.K} 66 | 67 | log.Print("Generating wordlist") 68 | createWordList(db, info, options.OutputPrefix+".idx") 69 | 70 | log.Print("Generating k-nn index") 71 | createKnn(db, info, options.OutputPrefix+".knn") 72 | 73 | db.Close() 74 | os.RemoveAll(options.TempDBPath) 75 | } 76 | 77 | // read word vectors, insert them into level db, also return the dimension of the vectors. 78 | func readVectorsFromFileAndInsertIntoLevelDB(db *leveldb.DB, file *os.File) WordVectorInfo { 79 | var vector_length int = -1 80 | var nr_words int = 0 81 | 82 | scanner := bufio.NewScanner(file) 83 | 84 | for scanner.Scan() { 85 | nr_words += 1 86 | parts := strings.Split(scanner.Text(), " ") 87 | 88 | word := parts[0] 89 | if vector_length == -1 { 90 | vector_length = len(parts) - 1 91 | } 92 | 93 | if vector_length != len(parts)-1 { 94 | log.Print("Line corruption found for the word [" + word + "]. Lenght expected " + strconv.Itoa(vector_length) + " but found " + strconv.Itoa(len(parts)) + ". Word will be skipped.") 95 | continue 96 | } 97 | 98 | // pre-allocate a vector for speed. 
99 | vector := make([]float32, vector_length) 100 | 101 | for i := 1; i <= vector_length; i++ { 102 | float, err := strconv.ParseFloat(parts[i], 64) 103 | 104 | if err != nil { 105 | log.Fatal("Error parsing float") 106 | } 107 | 108 | vector[i-1] = float32(float) 109 | } 110 | 111 | var buf bytes.Buffer 112 | if err := gob.NewEncoder(&buf).Encode(vector); err != nil { 113 | log.Fatal("Could not encode vector for temp db storage") 114 | } 115 | 116 | db.Put([]byte(word), buf.Bytes(), nil) 117 | } 118 | 119 | return WordVectorInfo{numberOfWords: nr_words, vectorWidth: vector_length} 120 | } 121 | 122 | func createWordList(db *leveldb.DB, info WordVectorInfo, outputFileName string) { 123 | file, err := os.Create(outputFileName) 124 | if err != nil { 125 | log.Fatal("Could not open wordlist output file") 126 | } 127 | defer file.Close() 128 | 129 | wbuf := bufio.NewWriter(file) 130 | 131 | // Write file header 132 | err = binary.Write(wbuf, binary.LittleEndian, uint64(info.numberOfWords)) 133 | if err != nil { 134 | log.Fatal("Could not write length of wordlist.") 135 | } 136 | 137 | err = binary.Write(wbuf, binary.LittleEndian, uint64(info.vectorWidth)) 138 | if err != nil { 139 | log.Fatal("Could not write with of the vector.") 140 | } 141 | 142 | metadata, err := json.Marshal(info.metadata) 143 | if err != nil { 144 | log.Fatal("Could not serialize metadata.") 145 | } 146 | 147 | err = binary.Write(wbuf, binary.LittleEndian, uint64(len(metadata))) 148 | if err != nil { 149 | log.Fatal("Could not write with of the vector.") 150 | } 151 | 152 | _, err = wbuf.Write(metadata) 153 | if err != nil { 154 | log.Fatal("Could not write the metadata") 155 | } 156 | 157 | var metadata_len = uint64(len(metadata)) 158 | var metadata_padding = 4 - (metadata_len % 4) 159 | for i := 0; uint64(i) < metadata_padding; i++ { 160 | wbuf.WriteByte(byte(0)) 161 | } 162 | 163 | var word_offset uint64 = (2 + uint64(info.numberOfWords)) * 8 // first two uint64's from the header, then the 
table of indices. 164 | word_offset += 8 + metadata_len + metadata_padding // and the metadata length + content & padding 165 | 166 | var orig_word_offset = word_offset 167 | 168 | // Iterate first time over all data, computing indices for all words. 169 | iter := db.NewIterator(nil, nil) 170 | for iter.Next() { 171 | key := iter.Key() 172 | word := string(key) 173 | length := len(word) 174 | err = binary.Write(wbuf, binary.LittleEndian, uint64(word_offset)) 175 | 176 | if err != nil { 177 | log.Fatal("Could not write word offset to wordlist") 178 | } 179 | 180 | // reserve 8 bytes for occurence 181 | word_offset += 8 182 | 183 | word_offset += uint64(length) + 1 184 | 185 | // ensure padding on 4-bytes aligned memory 186 | padding := 4 - (word_offset % 4) 187 | word_offset += padding 188 | } 189 | 190 | iter.Release() 191 | word_offset = orig_word_offset 192 | 193 | // Iterate second time over all data, now inserting the words 194 | iter = db.NewIterator(nil, nil) 195 | for iter.Next() { 196 | key := iter.Key() 197 | word := string(key) 198 | length := len(word) 199 | 200 | // hard-code occurence to 102 for now 201 | err = binary.Write(wbuf, binary.LittleEndian, uint64(102)) 202 | 203 | wbuf.Write([]byte(word)) 204 | wbuf.WriteByte(byte(0)) 205 | word_offset += uint64(length) + 1 206 | 207 | // ensure padding on 4-bytes aligned memory 208 | padding := 4 - (word_offset % 4) 209 | for i := 0; uint64(i) < padding; i++ { 210 | wbuf.WriteByte(byte(0)) 211 | } 212 | 213 | word_offset += padding 214 | } 215 | wbuf.Flush() 216 | iter.Release() 217 | } 218 | 219 | func createKnn(db *leveldb.DB, info WordVectorInfo, outputFileName string) { 220 | var knn annoy.AnnoyIndex = annoy.NewAnnoyIndexEuclidean(info.vectorWidth) 221 | var idx int = -1 222 | 223 | iter := db.NewIterator(nil, nil) 224 | 225 | for iter.Next() { 226 | idx += 1 227 | 228 | vector := make([]float32, info.vectorWidth) 229 | err := gob.NewDecoder(bytes.NewBuffer(iter.Value())).Decode(&vector) 230 | if err != 
nil { 231 | log.Fatalf("Could not decode vector value %+v", err) 232 | } 233 | knn.AddItem(idx, vector) 234 | } 235 | 236 | knn.Build(info.k) // Hardcoded for now. Must be tweaked. 237 | knn.Save(outputFileName) 238 | knn.Unload() 239 | } 240 | -------------------------------------------------------------------------------- /contextionary/core/memory_index.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "fmt" 16 | "sort" 17 | 18 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 19 | ) 20 | 21 | type MemoryIndex struct { 22 | dimensions int 23 | words []string 24 | knn annoy.AnnoyIndex 25 | } 26 | 27 | // Return the number of items that is stored in the index. 28 | func (mi *MemoryIndex) GetNumberOfItems() int { 29 | return len(mi.words) 30 | } 31 | 32 | // Returns the length of the used vectors. 33 | func (mi *MemoryIndex) GetVectorLength() int { 34 | return mi.dimensions 35 | } 36 | 37 | // Look up a word, return an index. 38 | // Perform binary search. 39 | func (mi *MemoryIndex) WordToItemIndex(word string) ItemIndex { 40 | for idx, w := range mi.words { 41 | if word == w { 42 | return ItemIndex(idx) 43 | } 44 | } 45 | 46 | return -1 47 | } 48 | 49 | func (mi *MemoryIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 50 | return 0, nil 51 | } 52 | 53 | func (mi *MemoryIndex) OccurrencePercentile(perc int) uint64 { 54 | return 0 55 | } 56 | 57 | // Based on an index, return the assosiated word. 
58 | func (mi *MemoryIndex) ItemIndexToWord(item ItemIndex) (string, error) { 59 | if item >= 0 && int(item) <= len(mi.words) { 60 | return mi.words[item], nil 61 | } else { 62 | return "", fmt.Errorf("Index out of bounds") 63 | } 64 | } 65 | 66 | // Get the vector of an item index. 67 | // TODO: Is this ever used? Doesn't look like it as part of the investigation 68 | // in gh-25 and gh-26 69 | func (mi *MemoryIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 70 | if item >= 0 && int(item) <= len(mi.words) { 71 | var floats []float32 72 | mi.knn.GetItem(int(item), &floats) 73 | 74 | return &Vector{vector: floats}, nil 75 | } else { 76 | return nil, fmt.Errorf("Index out of bounds") 77 | } 78 | } 79 | 80 | // Compute the distance between two items. 81 | func (mi MemoryIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 82 | if a >= 0 && b >= 0 && int(a) <= len(mi.words) && int(b) <= len(mi.words) { 83 | return mi.knn.GetDistance(int(a), int(b)), nil 84 | } else { 85 | return 0, fmt.Errorf("Index out of bounds") 86 | } 87 | } 88 | 89 | // Get the n nearest neighbours of item, examining k trees. 90 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 91 | func (mi *MemoryIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 92 | if item >= 0 && int(item) <= len(mi.words) { 93 | var items []int 94 | var distances []float32 95 | 96 | mi.knn.GetNnsByItem(int(item), n, k, &items, &distances) 97 | 98 | var indices []ItemIndex = make([]ItemIndex, len(items)) 99 | for i, x := range items { 100 | indices[i] = ItemIndex(x) 101 | } 102 | 103 | return indices, distances, nil 104 | } else { 105 | return nil, nil, fmt.Errorf("Index out of bounds") 106 | } 107 | } 108 | 109 | // Get the n nearest neighbours of item, examining k trees. 110 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
111 | func (mi *MemoryIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 112 | if len(vector.vector) == mi.dimensions { 113 | var items []int 114 | var distances []float32 115 | 116 | mi.knn.GetNnsByVector(vector.vector, n, k, &items, &distances) 117 | 118 | var indices []ItemIndex = make([]ItemIndex, len(items)) 119 | for i, x := range items { 120 | indices[i] = ItemIndex(x) 121 | } 122 | 123 | return indices, distances, nil 124 | } else { 125 | return nil, nil, fmt.Errorf("Wrong vector length provided") 126 | } 127 | } 128 | 129 | // SafeGetSimilarWords returns n similar words in the contextionary, 130 | // examining k trees. It is guaratueed to have results, even if the word is 131 | // not in the contextionary. In this case the list only contains the word 132 | // itself. It can then still be used for exact match or levensthein-based 133 | // searches against db backends. 134 | func (mi *MemoryIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 135 | return safeGetSimilarWordsFromAny(mi, word, n, k) 136 | } 137 | 138 | // SafeGetSimilarWordsWithCertainty returns similar words in the 139 | // contextionary, if they are close enough to match the required certainty. 140 | // It is guaratueed to have results, even if the word is not in the 141 | // contextionary. In this case the list only contains the word itself. It can 142 | // then still be used for exact match or levensthein-based searches against 143 | // db backends. 144 | func (mi *MemoryIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 145 | return safeGetSimilarWordsWithCertaintyFromAny(mi, word, certainty) 146 | } 147 | 148 | // The rest of this file concerns itself with building the Memory Index. 149 | // This is done from the MemoryIndexBuilder struct. 
150 | 151 | type MemoryIndexBuilder struct { 152 | dimensions int 153 | word_vectors mib_pairs 154 | } 155 | 156 | type mib_pair struct { 157 | word string 158 | vector Vector 159 | } 160 | 161 | // Define custom type, and implement functions required for sort.Sort. 162 | type mib_pairs []mib_pair 163 | 164 | func (a mib_pairs) Len() int { return len(a) } 165 | func (a mib_pairs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 166 | func (a mib_pairs) Less(i, j int) bool { return a[i].word < a[j].word } 167 | 168 | // Construct a new builder. 169 | func InMemoryBuilder(dimensions int) *MemoryIndexBuilder { 170 | mib := MemoryIndexBuilder{ 171 | dimensions: dimensions, 172 | word_vectors: make([]mib_pair, 0), 173 | } 174 | 175 | return &mib 176 | } 177 | 178 | // Add a word and it's vector to the builder. 179 | func (mib *MemoryIndexBuilder) AddWord(word string, vector Vector) { 180 | wv := mib_pair{word: word, vector: vector} 181 | mib.word_vectors = append(mib.word_vectors, wv) 182 | } 183 | 184 | // Build an efficient lookup iddex from the builder. 185 | func (mib *MemoryIndexBuilder) Build(trees int) *MemoryIndex { 186 | mi := MemoryIndex{ 187 | dimensions: mib.dimensions, 188 | words: make([]string, 0), 189 | knn: annoy.NewAnnoyIndexEuclidean(mib.dimensions), 190 | } 191 | 192 | // First sort the words; this way we can do binary search on the words. 
193 | sort.Sort(mib.word_vectors) 194 | 195 | // Then fill up the data in the MemoryIndex 196 | for i, pair := range mib.word_vectors { 197 | mi.words = append(mi.words, pair.word) 198 | mi.knn.AddItem(i, pair.vector.vector) 199 | } 200 | 201 | // And instruct Annoy to build it's index 202 | mi.knn.Build(trees) 203 | 204 | return &mi 205 | } 206 | -------------------------------------------------------------------------------- /contextionary/core/mmapped.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "encoding/binary" 16 | "fmt" 17 | "log" 18 | "math" 19 | "os" 20 | "syscall" 21 | 22 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 23 | ) 24 | 25 | type mmappedIndex struct { 26 | word_index *Wordlist 27 | knn annoy.AnnoyIndex 28 | knnRaw []byte 29 | dimensions int 30 | } 31 | 32 | func (m *mmappedIndex) GetNumberOfItems() int { 33 | return int(m.word_index.numberOfWords) 34 | } 35 | 36 | // Returns the length of the used vectors. 
37 | func (m *mmappedIndex) GetVectorLength() int { 38 | return int(m.word_index.vectorWidth) 39 | } 40 | 41 | func (m *mmappedIndex) WordToItemIndex(word string) ItemIndex { 42 | return m.word_index.FindIndexByWord(word) 43 | } 44 | 45 | func (m *mmappedIndex) ItemIndexToWord(item ItemIndex) (string, error) { 46 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 47 | w, _ := m.word_index.getWord(item) 48 | return w, nil 49 | } else { 50 | return "", fmt.Errorf("Index out of bounds") 51 | } 52 | } 53 | 54 | func (m *mmappedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 55 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 56 | _, occ := m.word_index.getWord(item) 57 | return occ, nil 58 | } else { 59 | return 0, fmt.Errorf("Index out of bounds") 60 | } 61 | } 62 | 63 | func (m *mmappedIndex) OccurrencePercentile(perc int) uint64 { 64 | return m.word_index.OccurrencePercentile(perc) 65 | } 66 | 67 | func (m *mmappedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 68 | if item < 0 && item > m.word_index.GetNumberOfWords() { 69 | return nil, fmt.Errorf("Index out of bounds") 70 | } 71 | 72 | var floats []float32 73 | floats = m.getItem(int(item)) 74 | 75 | return &Vector{vector: floats}, nil 76 | } 77 | 78 | func (m *mmappedIndex) getItem(index int) []float32 { 79 | offset := 16 80 | vectorSize := m.dimensions * 4 81 | begin := index*(offset+vectorSize) + offset 82 | end := begin + vectorSize 83 | return vectorFromBytes(m.knnRaw[begin:end]) 84 | } 85 | 86 | func vectorFromBytes(in []byte) []float32 { 87 | out := make([]float32, len(in)/4) 88 | for offset := 0; offset < len(in); offset += 4 { 89 | bits := binary.LittleEndian.Uint32(in[offset : offset+4]) 90 | float := math.Float32frombits(bits) 91 | out[offset/4] = float 92 | } 93 | 94 | return out 95 | } 96 | 97 | // Compute the distance between two items. 
98 | func (m *mmappedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 99 | if a >= 0 && b >= 0 && a <= m.word_index.GetNumberOfWords() && b <= m.word_index.GetNumberOfWords() { 100 | return m.knn.GetDistance(int(a), int(b)), nil 101 | } else { 102 | return 0, fmt.Errorf("Index out of bounds") 103 | } 104 | } 105 | 106 | func (m *mmappedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 107 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 108 | var items []int 109 | var distances []float32 110 | 111 | m.knn.GetNnsByItem(int(item), n, k, &items, &distances) 112 | 113 | var indices []ItemIndex = make([]ItemIndex, len(items)) 114 | for i, x := range items { 115 | indices[i] = ItemIndex(x) 116 | } 117 | 118 | return indices, distances, nil 119 | } else { 120 | return nil, nil, fmt.Errorf("Index out of bounds") 121 | } 122 | } 123 | 124 | func (m *mmappedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 125 | if len(vector.vector) == m.GetVectorLength() { 126 | var items []int 127 | var distances []float32 128 | 129 | m.knn.GetNnsByVector(vector.vector, n, k, &items, &distances) 130 | 131 | var indices []ItemIndex = make([]ItemIndex, len(items)) 132 | for i, x := range items { 133 | indices[i] = ItemIndex(x) 134 | } 135 | 136 | return indices, distances, nil 137 | } else { 138 | return nil, nil, fmt.Errorf("Wrong vector length provided") 139 | } 140 | } 141 | 142 | // SafeGetSimilarWords returns n similar words in the contextionary, 143 | // examining k trees. It is guaratueed to have results, even if the word is 144 | // not in the contextionary. In this case the list only contains the word 145 | // itself. It can then still be used for exact match or levensthein-based 146 | // searches against db backends. 
147 | func (m *mmappedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 148 | return safeGetSimilarWordsFromAny(m, word, n, k) 149 | } 150 | 151 | // SafeGetSimilarWordsWithCertainty returns similar words in the 152 | // contextionary, if they are close enough to match the required certainty. 153 | // It is guaratueed to have results, even if the word is not in the 154 | // contextionary. In this case the list only contains the word itself. It can 155 | // then still be used for exact match or levensthein-based searches against 156 | // db backends. 157 | func (m *mmappedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 158 | return safeGetSimilarWordsWithCertaintyFromAny(m, word, certainty) 159 | } 160 | 161 | func LoadVectorFromDisk(annoy_index string, word_index_file_name string) (Contextionary, error) { 162 | word_index, err := LoadWordlist(word_index_file_name) 163 | 164 | if err != nil { 165 | return nil, fmt.Errorf("Could not load vector: %+v", err) 166 | } 167 | 168 | knn := annoy.NewAnnoyIndexEuclidean(int(word_index.vectorWidth)) 169 | knn.Load(annoy_index) 170 | 171 | knnRaw, err := loadAnnoyIndexDirectly(annoy_index) 172 | if err != nil { 173 | return nil, fmt.Errorf("load raw index: %v", err) 174 | } 175 | 176 | idx := &mmappedIndex{ 177 | word_index: word_index, 178 | knn: knn, 179 | knnRaw: knnRaw, 180 | dimensions: int(word_index.vectorWidth), 181 | } 182 | 183 | return idx, nil 184 | } 185 | 186 | // directly load the annoy index file to avoid memory leaks in the annoy 187 | // go-port of the C library, see #26 188 | func loadAnnoyIndexDirectly(path string) ([]byte, error) { 189 | file, err := os.Open(path) 190 | if err != nil { 191 | log.Fatalf("Can't open the knn file at %s: %+v", path, err) 192 | } 193 | 194 | file_info, err := file.Stat() 195 | if err != nil { 196 | log.Fatalf("Can't stat the knn file at %s: %+v", path, err) 197 | } 198 | 199 | mmap, err := syscall.Mmap(int(file.Fd()), 0, 
int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED) 200 | if err != nil { 201 | log.Fatalf("Can't mmap the knn file %s: %+v", path, err) 202 | } 203 | 204 | return mmap, nil 205 | } 206 | -------------------------------------------------------------------------------- /contextionary/core/similar_words.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package contextionary 12 | 13 | import ( 14 | "regexp" 15 | ) 16 | 17 | const simliarWordsLimit = 15 18 | 19 | func safeGetSimilarWordsFromAny(c11y Contextionary, word string, n, k int) ([]string, []float32) { 20 | i := c11y.WordToItemIndex(word) 21 | if !i.IsPresent() { 22 | return []string{word}, []float32{1} 23 | } 24 | 25 | indices, newCertainties, err := c11y.GetNnsByItem(i, n, k) 26 | if err != nil { 27 | return []string{word}, []float32{1} 28 | } 29 | 30 | var words []string 31 | var certainties []float32 32 | for i, index := range indices { 33 | word, err := c11y.ItemIndexToWord(index) 34 | if err != nil { 35 | continue 36 | } 37 | 38 | if wordHasIllegalCharacters(word) { 39 | continue 40 | } 41 | 42 | words = append(words, word) 43 | certainties = append(certainties, newCertainties[i]) 44 | } 45 | 46 | return words, certainties 47 | } 48 | 49 | func safeGetSimilarWordsWithCertaintyFromAny(c11y Contextionary, word string, certainty float32) []string { 50 | var matchingWords []string 51 | var matchtingCertainties []float32 52 | 53 | count := 0 54 | words, certainties := c11y.SafeGetSimilarWords(word, 100, 32) 55 | for i, word := range words { 56 | if 
count >= simliarWordsLimit { 57 | break 58 | } 59 | 60 | var dist float32 61 | if dist = DistanceToCertainty(certainties[i]); dist < certainty { 62 | continue 63 | } 64 | 65 | count++ 66 | matchingWords = append(matchingWords, alphanumeric(word)) 67 | matchtingCertainties = append(matchtingCertainties, dist) 68 | } 69 | 70 | return matchingWords 71 | } 72 | 73 | func wordHasIllegalCharacters(word string) bool { 74 | // we know that the schema based contextionary uses a leading dollar sign for 75 | // the class and property centroids, so we can easily filter them out 76 | return regexp.MustCompile("^\\$").MatchString(word) 77 | } 78 | 79 | func alphanumeric(word string) string { 80 | return regexp.MustCompile("[^a-zA-Z0-9_]+").ReplaceAllString(word, "") 81 | } 82 | -------------------------------------------------------------------------------- /contextionary/core/similar_words_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package contextionary 12 | 13 | import ( 14 | "testing" 15 | 16 | "github.com/stretchr/testify/assert" 17 | ) 18 | 19 | func TestSimilarWords(t *testing.T) { 20 | 21 | t.Run("with a word that's not in the c11y", func(t *testing.T) { 22 | c := newC11y() 23 | expectedWords := []string{"vehicle"} 24 | 25 | words := c.SafeGetSimilarWordsWithCertainty("vehicle", 0.8) 26 | 27 | assert.Equal(t, expectedWords, words) 28 | }) 29 | 30 | t.Run("with a word thats present and a high certainty", func(t *testing.T) { 31 | c := newC11y() 32 | expectedWords := []string{"car", "automobile"} 33 | 34 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.95) 35 | 36 | assert.Equal(t, expectedWords, words) 37 | }) 38 | 39 | t.Run("with a word thats present and a medium certainty", func(t *testing.T) { 40 | c := newC11y() 41 | expectedWords := []string{"car", "automobile", "airplane"} 42 | 43 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.7) 44 | 45 | assert.Equal(t, expectedWords, words) 46 | }) 47 | 48 | t.Run("with a word thats present and a really low certainty", func(t *testing.T) { 49 | c := newC11y() 50 | expectedWords := []string{"car", "automobile", "airplane", "cabernetsauvignon"} 51 | 52 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.001) 53 | 54 | assert.Equal(t, expectedWords, words) 55 | }) 56 | 57 | } 58 | 59 | func newC11y() Contextionary { 60 | builder := InMemoryBuilder(3) 61 | 62 | builder.AddWord("car", NewVector([]float32{1, 0, 0})) 63 | builder.AddWord("automobile", NewVector([]float32{0.9, 0, 0})) 64 | builder.AddWord("airplane", NewVector([]float32{0.3, 0, 0})) 65 | builder.AddWord("cabernet-sauvignon", NewVector([]float32{0, 0, 10})) 66 | builder.AddWord("$THING[Car]", NewVector([]float32{1, 0, 0})) 67 | 68 | return Contextionary(builder.Build(3)) 69 | } 70 | 
-------------------------------------------------------------------------------- /contextionary/core/stopwords/detector.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package stopwords 12 | 13 | import ( 14 | "encoding/json" 15 | "fmt" 16 | "io/ioutil" 17 | "os" 18 | ) 19 | 20 | // Detector can be used to detect whether a word is a stopword 21 | type Detector struct { 22 | lookup map[string]int 23 | } 24 | 25 | type stopWordDoc struct { 26 | Language string `json:"language"` 27 | Words []string `json:"words"` 28 | } 29 | 30 | // NewFromFile creates an in-memory stopword detector based on a file read once 31 | // at init time 32 | func NewFromFile(path string) (*Detector, error) { 33 | file, err := os.Open(path) 34 | if err != nil { 35 | return nil, fmt.Errorf("could not open file at %s: %v", path, err) 36 | } 37 | 38 | fileBytes, err := ioutil.ReadAll(file) 39 | if err != nil { 40 | return nil, fmt.Errorf("could not read file contents: %v", err) 41 | } 42 | 43 | var doc stopWordDoc 44 | err = json.Unmarshal(fileBytes, &doc) 45 | if err != nil { 46 | return nil, fmt.Errorf("could not unmarshal json: %v", err) 47 | } 48 | 49 | lookup := buildLookupMap(doc.Words) 50 | 51 | return &Detector{ 52 | lookup: lookup, 53 | }, nil 54 | } 55 | 56 | // IsStopWord returns true on stop words, false on all other words 57 | func (d *Detector) IsStopWord(word string) bool { 58 | if _, ok := d.lookup[word]; ok { 59 | return true 60 | } 61 | 62 | return false 63 | } 64 | 65 | func buildLookupMap(words []string) map[string]int { 66 | 
// Opaque type that models a fixed-length vector.
type Vector struct {
	vector []float32
	Source []InputElement
}

// InputElement describes one weighted input concept that contributed to a
// (compound) vector.
type InputElement struct {
	Concept    string
	Weight     float64
	Occurrence uint64
}

// NewVector wraps the raw float slice in a Vector.
func NewVector(vector []float32) Vector {
	return Vector{vector: vector}
}

// Equal reports whether both vectors have identical elements. It returns an
// error if the dimensions differ.
func (v *Vector) Equal(other *Vector) (bool, error) {
	if len(v.vector) != len(other.vector) {
		return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
	}

	for i, val := range v.vector {
		if other.vector[i] != val {
			return false, nil
		}
	}

	return true, nil
}

// EqualEpsilon reports whether both vectors are element-wise equal within
// +/- epsilon. It returns an error if the dimensions differ.
//
// Fixed: the out-of-range check previously used && ("below min AND above
// max"), a condition that can never be true, so EqualEpsilon reported ANY
// two same-length vectors as equal. An element is now correctly rejected
// when it lies below min OR above max.
func (v *Vector) EqualEpsilon(other *Vector, epsilon float32) (bool, error) {
	if len(v.vector) != len(other.vector) {
		return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
	}

	for i, val := range v.vector {
		if other.vector[i] < val-epsilon || other.vector[i] > val+epsilon {
			return false, nil
		}
	}

	return true, nil
}

// Len returns the number of dimensions.
func (v *Vector) Len() int {
	return len(v.vector)
}

// ToString renders the vector as "[x, y, ...]" with six decimal places per
// element.
func (v *Vector) ToString() string {
	str := "["
	for i, val := range v.vector {
		if i > 0 {
			str += ", "
		}
		str += fmt.Sprintf("%.6f", val)
	}

	return str + "]"
}

// ToArray returns a copy of the underlying float slice (nil for an empty
// vector, matching the previous append-based behavior).
func (v *Vector) ToArray() []float32 {
	var returner []float32
	returner = append(returner, v.vector...)
	return returner
}

// Distance returns the euclidean distance between the two vectors. It
// returns an error if the dimensions differ.
func (v *Vector) Distance(other *Vector) (float32, error) {
	if len(v.vector) != len(other.vector) {
		return 0.0, fmt.Errorf("Vectors have different dimensions")
	}

	var sum float32
	for i := 0; i < len(v.vector); i++ {
		x := v.vector[i] - other.vector[i]
		sum += x * x
	}

	return float32(math.Sqrt(float64(sum))), nil
}
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | // //// #include 15 | // //import "C" 16 | 17 | import ( 18 | "bytes" 19 | "encoding/binary" 20 | "encoding/json" 21 | "fmt" 22 | "os" 23 | "sort" 24 | "syscall" 25 | ) 26 | 27 | type Wordlist struct { 28 | vectorWidth uint64 29 | numberOfWords uint64 30 | metadata map[string]interface{} 31 | occurrencePercentiles []uint64 32 | 33 | file os.File 34 | startOfTable int 35 | mmap []byte 36 | } 37 | 38 | func LoadWordlist(path string) (*Wordlist, error) { 39 | file, err := os.Open(path) 40 | if err != nil { 41 | return nil, fmt.Errorf("Can't open the wordlist at %s: %+v", path, err) 42 | } 43 | 44 | file_info, err := file.Stat() 45 | if err != nil { 46 | return nil, fmt.Errorf("Can't stat the wordlist at %s: %+v", path, err) 47 | } 48 | 49 | mmap, err := syscall.Mmap(int(file.Fd()), 0, int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED) 50 | if err != nil { 51 | return nil, fmt.Errorf("Can't mmap the file %s: %+v", path, err) 52 | } 53 | 54 | nrWordsBytes := mmap[0:8] 55 | vectorWidthBytes := mmap[8:16] 56 | metadataLengthBytes := mmap[16:24] 57 | 58 | nrWords := binary.LittleEndian.Uint64(nrWordsBytes) 59 | vectorWidth := binary.LittleEndian.Uint64(vectorWidthBytes) 60 | metadataLength := binary.LittleEndian.Uint64(metadataLengthBytes) 61 | 62 | metadataBytes := mmap[24 : 24+metadataLength] 63 | var metadata map[string]interface{} 64 | 65 | json.Unmarshal(metadataBytes, &metadata) 66 | 67 | // Compute beginning of word list lookup table. 
68 | var start_of_table int = 24 + int(metadataLength) 69 | var offset int = 4 - (start_of_table % 4) 70 | start_of_table += offset 71 | 72 | wl := &Wordlist{ 73 | vectorWidth: vectorWidth, 74 | numberOfWords: nrWords, 75 | metadata: metadata, 76 | startOfTable: start_of_table, 77 | mmap: mmap, 78 | } 79 | 80 | wl.initOccurrencePercentiles() 81 | 82 | return wl, nil 83 | } 84 | 85 | func (w *Wordlist) GetNumberOfWords() ItemIndex { 86 | return ItemIndex(w.numberOfWords) 87 | } 88 | 89 | func (w *Wordlist) OccurrencePercentile(percentile int) uint64 { 90 | if percentile < 0 || percentile > 100 { 91 | panic("incorrect usage of occurrence percentile, must be between 0 and 100") 92 | } 93 | 94 | return w.occurrencePercentiles[percentile] 95 | } 96 | 97 | func (w *Wordlist) FindIndexByWord(_needle string) ItemIndex { 98 | var needle = string([]byte(_needle)) 99 | needle += "\x00" 100 | 101 | var bytes_needle = []byte(needle) 102 | 103 | var low ItemIndex = 0 104 | var high ItemIndex = ItemIndex(w.numberOfWords) - 1 105 | 106 | for low <= high { 107 | var midpoint ItemIndex = (low + high) / 2 108 | 109 | ptr := w.getWordPtr(midpoint) 110 | 111 | // if the last word in the index is shorter than our needle, we would panic 112 | // by accessing a non-existing adress. 
To prevent this, the higher boundary 113 | // can never be higher than the len(index)-1 114 | endPos := 8 + len(bytes_needle) 115 | if endPos >= len(ptr) { 116 | endPos = len(ptr) - 1 117 | } 118 | 119 | // ignore the first 8 bytes as they are reserved for occurrence 120 | word := ptr[8:endPos] 121 | 122 | var cmp = bytes.Compare(bytes_needle, word) 123 | 124 | if cmp == 0 { 125 | return midpoint 126 | } else if cmp < 0 { 127 | high = midpoint - 1 128 | } else { 129 | low = midpoint + 1 130 | } 131 | } 132 | 133 | return -1 134 | } 135 | 136 | func (w *Wordlist) getWordPtr(index ItemIndex) []byte { 137 | entry_addr := ItemIndex(w.startOfTable) + index*8 138 | word_address_bytes := w.mmap[entry_addr : entry_addr+8] 139 | word_address := binary.LittleEndian.Uint64(word_address_bytes) 140 | return w.mmap[word_address:] 141 | } 142 | 143 | func (w *Wordlist) getWord(index ItemIndex) (string, uint64) { 144 | ptr := w.getWordPtr(index) 145 | occurrence := binary.LittleEndian.Uint64(ptr[0:8]) 146 | for i := 8; i < len(ptr); i++ { 147 | if ptr[i] == '\x00' { 148 | return string(ptr[8:i]), occurrence 149 | } 150 | } 151 | 152 | return "", 0 153 | } 154 | 155 | func (w *Wordlist) initOccurrencePercentiles() { 156 | w.occurrencePercentiles = make([]uint64, 101) // make 101 elements longs, so both index 0 and 100 are included 157 | max := int(w.GetNumberOfWords()) 158 | allOccs := make([]uint64, max) 159 | 160 | for i := ItemIndex(0); int(i) < max; i++ { 161 | _, occ := w.getWord(i) 162 | allOccs[i] = occ 163 | } 164 | 165 | sort.Slice(allOccs, func(a, b int) bool { return allOccs[a] < allOccs[b] }) 166 | 167 | for i := 0; i <= 100; i++ { // note that this is 101 elements! 
168 | if i == 0 { 169 | w.occurrencePercentiles[i] = 0 170 | continue 171 | } 172 | 173 | if i == 100 { 174 | w.occurrencePercentiles[i] = allOccs[len(allOccs)-1] 175 | continue 176 | } 177 | 178 | occ := uint64(float64(i) / 100 * float64(len(allOccs))) 179 | w.occurrencePercentiles[i] = occ 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /contextionary/schema/contextionary.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import contextionary "github.com/weaviate/contextionary/contextionary/core" 14 | 15 | // Contextionary composes a regular contextionary with additional 16 | // schema-related query methods 17 | type Contextionary struct { 18 | contextionary.Contextionary 19 | } 20 | 21 | // New creates a new Contextionary from a contextionary.Contextionary which it 22 | // extends with Schema-related search methods 23 | func New(c contextionary.Contextionary) *Contextionary { 24 | return &Contextionary{ 25 | Contextionary: c, 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import ( 14 | "fmt" 15 | "regexp" 16 | "strings" 17 | 18 | "github.com/fatih/camelcase" 19 | pb "github.com/weaviate/contextionary/contextionary" 20 | contextionary "github.com/weaviate/contextionary/contextionary/core" 21 | "github.com/weaviate/contextionary/errors" 22 | ) 23 | 24 | // SearchResult is a single search result. See wrapping Search Results for the Type 25 | type SearchResult struct { 26 | Name string 27 | Certainty float32 28 | } 29 | 30 | // SearchResults is grouping of SearchResults for a SchemaSearch 31 | type SearchResults struct { 32 | Type SearchType 33 | Results []SearchResult 34 | } 35 | 36 | // Len of the result set 37 | func (r SearchResults) Len() int { 38 | return len(r.Results) 39 | } 40 | 41 | // SchemaSearch can be used to search for related classes and properties, see 42 | // documentation of SearchParams for more details on how to use it and 43 | // documentation on *pb.SchemaSearchResults for more details on how to use the return 44 | // value 45 | func (con *Contextionary) SchemaSearch(params *pb.SchemaSearchParams) (*pb.SchemaSearchResults, error) { 46 | p := SearchParams{params} 47 | if err := p.Validate(); err != nil { 48 | return nil, errors.NewInvalidUserInputf("invalid search params: %s", err) 49 | } 50 | 51 | centroid, err := con.centroidFromNameAndKeywords(p) 52 | if err != nil { 53 | return nil, errors.NewInvalidUserInputf("could not build centroid from name and keywords: %s", err) 54 | } 55 | 56 | rawResults, err := con.knnSearch(*centroid) 57 | if err != nil { 58 | return nil, errors.NewInternalf("could not perform knn search: %s", err) 59 | } 60 | 61 | if p.SearchType == pb.SearchType_CLASS { 62 | return con.handleClassSearch(p, rawResults) 63 | } 64 | 65 | // since we have passed validation we know that anything that's not a class 
66 | // search must be a property search 67 | return con.handlePropertySearch(p, rawResults) 68 | } 69 | 70 | func (con *Contextionary) centroidFromNameAndKeywords(p SearchParams) (*contextionary.Vector, error) { 71 | nameVector, err := con.camelCaseWordToVector(p.Name) 72 | if err != nil { 73 | return nil, fmt.Errorf("invalid name in search: %s", err) 74 | } 75 | 76 | if len(p.Keywords) == 0 { 77 | return nameVector, nil 78 | } 79 | 80 | vectors := make([]contextionary.Vector, len(p.Keywords)+1, len(p.Keywords)+1) 81 | weights := make([]float32, len(p.Keywords)+1, len(p.Keywords)+1) 82 | // set last vector to className which always has weight=1 83 | vectors[len(vectors)-1] = *nameVector 84 | weights[len(vectors)-1] = 1 85 | 86 | for i, keyword := range p.Keywords { 87 | kwVector, err := con.wordToVector(keyword.Keyword) 88 | if err != nil { 89 | return nil, fmt.Errorf("invalid keyword in search: %s", err) 90 | } 91 | vectors[i] = *kwVector 92 | weights[i] = keyword.Weight 93 | } 94 | 95 | return contextionary.ComputeWeightedCentroid(vectors, weights) 96 | } 97 | 98 | func (con *Contextionary) camelCaseWordToVector(w string) (*contextionary.Vector, error) { 99 | parts := camelcase.Split(w) 100 | if len(parts) == 1 { 101 | // no camelcasing, no need to build a centroid 102 | return con.wordToVector(w) 103 | } 104 | 105 | vectors := make([]contextionary.Vector, len(parts), len(parts)) 106 | weights := make([]float32, len(parts), len(parts)) 107 | for i, part := range parts { 108 | v, err := con.wordToVector(part) 109 | if err != nil { 110 | return nil, fmt.Errorf("invalid camelCased compound word: %s", err) 111 | } 112 | 113 | vectors[i] = *v 114 | weights[i] = 1 // on camel-casing all parts are weighted equally 115 | } 116 | 117 | return contextionary.ComputeWeightedCentroid(vectors, weights) 118 | } 119 | 120 | func (con *Contextionary) wordToVector(w string) (*contextionary.Vector, error) { 121 | w = strings.ToLower(w) 122 | itemIndex := con.WordToItemIndex(w) 123 
| if ok := itemIndex.IsPresent(); !ok { 124 | return nil, fmt.Errorf( 125 | "the word '%s' is not present in the contextionary and therefore not a valid search term", w) 126 | } 127 | 128 | vector, err := con.GetVectorForItemIndex(itemIndex) 129 | if err != nil { 130 | return nil, fmt.Errorf("could not get vector for word '%s' with itemIndex '%d': %s", 131 | w, itemIndex, err) 132 | } 133 | 134 | return vector, nil 135 | } 136 | 137 | func (con *Contextionary) handleClassSearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) { 138 | return &pb.SchemaSearchResults{ 139 | Type: p.SearchType, 140 | Results: search.extractClassNames(p), 141 | }, nil 142 | } 143 | 144 | func (con *Contextionary) handlePropertySearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) { 145 | return &pb.SchemaSearchResults{ 146 | Type: p.SearchType, 147 | Results: search.extractPropertyNames(p), 148 | }, nil 149 | } 150 | 151 | func (con *Contextionary) knnSearch(vector contextionary.Vector) (rawResults, error) { 152 | list, distances, err := con.GetNnsByVector(vector, 10000, 3) 153 | if err != nil { 154 | return nil, fmt.Errorf("could not get nearest neighbors for vector '%v': %s", vector, err) 155 | } 156 | 157 | results := make(rawResults, len(list), len(list)) 158 | for i := range list { 159 | word, err := con.ItemIndexToWord(list[i]) 160 | if err != nil { 161 | return results, fmt.Errorf("got a result from kNN search, but don't have a word for this index: %s", err) 162 | } 163 | 164 | results[i] = rawResult{ 165 | name: word, 166 | distance: distances[i], 167 | } 168 | } 169 | 170 | return results, nil 171 | } 172 | 173 | // rawResult is a helper struct to contain the results of the kNN-search. It 174 | // does not yet contain the desired output. This means the names can be both 175 | // classes/properties and arbitrary words. 
Furthermore the certainty has not 176 | // yet been normalized , so it is merely the raw kNN distance 177 | type rawResult struct { 178 | name string 179 | distance float32 180 | } 181 | 182 | type rawResults []rawResult 183 | 184 | func (r rawResults) extractClassNames(p SearchParams) []*pb.SchemaSearchResult { 185 | var results []*pb.SchemaSearchResult 186 | regex := regexp.MustCompile(fmt.Sprintf("^\\$%s\\[([A-Za-z]+)\\]$", "OBJECT")) 187 | 188 | for _, rawRes := range r { 189 | if regex.MatchString(rawRes.name) { 190 | certainty := distanceToCertainty(rawRes.distance) 191 | if certainty < p.Certainty { 192 | continue 193 | } 194 | 195 | results = append(results, &pb.SchemaSearchResult{ 196 | Name: regex.FindStringSubmatch(rawRes.name)[1], //safe because we ran .MatchString before 197 | Certainty: certainty, 198 | }) 199 | } 200 | } 201 | 202 | return results 203 | } 204 | 205 | func (r rawResults) extractPropertyNames(p SearchParams) []*pb.SchemaSearchResult { 206 | var results []*pb.SchemaSearchResult 207 | regex := regexp.MustCompile("^\\$[A-Za-z]+\\[[A-Za-z]+\\]\\[([A-Za-z]+)\\]$") 208 | 209 | propsMap := map[string][]*pb.SchemaSearchResult{} 210 | 211 | for _, rawRes := range r { 212 | if regex.MatchString(rawRes.name) { 213 | name := regex.FindStringSubmatch(rawRes.name)[1] //safe because we ran .MatchString before 214 | certainty := distanceToCertainty(rawRes.distance) 215 | if certainty < p.Certainty { 216 | continue 217 | } 218 | 219 | res := &pb.SchemaSearchResult{ 220 | Name: name, 221 | Certainty: certainty, 222 | } 223 | if _, ok := propsMap[name]; !ok { 224 | propsMap[name] = []*pb.SchemaSearchResult{res} 225 | } else { 226 | propsMap[name] = append(propsMap[name], res) 227 | } 228 | } 229 | } 230 | 231 | // now calculate mean of duplicate results 232 | for _, resultsPerName := range propsMap { 233 | results = append(results, &pb.SchemaSearchResult{ 234 | Name: resultsPerName[0].Name, 235 | Certainty: meanCertainty(resultsPerName), 236 | }) 237 | } 
238 | 239 | return results 240 | } 241 | 242 | func meanCertainty(rs []*pb.SchemaSearchResult) float32 { 243 | var compound float32 244 | for _, r := range rs { 245 | compound += r.Certainty 246 | } 247 | 248 | return compound / float32(len(rs)) 249 | } 250 | 251 | func distanceToCertainty(d float32) float32 { 252 | return 1 - d/12 253 | } 254 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search_params.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import ( 14 | "fmt" 15 | 16 | "github.com/fatih/camelcase" 17 | pb "github.com/weaviate/contextionary/contextionary" 18 | ) 19 | 20 | // SearchType to search for either class names or property names 21 | type SearchType string 22 | 23 | const ( 24 | // SearchTypeClass to search the contextionary for class names 25 | SearchTypeClass SearchType = "class" 26 | // SearchTypeProperty to search the contextionary for property names 27 | SearchTypeProperty SearchType = "property" 28 | ) 29 | 30 | // SearchParams to be used for a SchemaSearch. 
See individual properties for 31 | // additional documentation on what they do 32 | type SearchParams struct { 33 | *pb.SchemaSearchParams 34 | } 35 | 36 | // Validate the feasibility of the specified arguments 37 | func (p SearchParams) Validate() error { 38 | if p.Name == "" { 39 | return fmt.Errorf("Name cannot be empty") 40 | } 41 | 42 | if err := p.validateCertaintyOrWeight(p.Certainty); err != nil { 43 | return fmt.Errorf("invalid Certainty: %s", err) 44 | } 45 | 46 | if p.SearchType != pb.SearchType_CLASS && p.SearchType != pb.SearchType_PROPERTY { 47 | return fmt.Errorf( 48 | "SearchType must be SearchType_CLASS or SearchType_PROPERTY, but got '%s'", p.SearchType) 49 | } 50 | 51 | for i, keyword := range p.Keywords { 52 | if err := p.validateKeyword(keyword); err != nil { 53 | return fmt.Errorf("invalid keyword at position %d: %s", i, err) 54 | } 55 | } 56 | 57 | return nil 58 | } 59 | 60 | func (p SearchParams) validateKeyword(kw *pb.Keyword) error { 61 | if kw.Keyword == "" { 62 | return fmt.Errorf("Keyword cannot be empty") 63 | } 64 | 65 | if len(camelcase.Split(kw.Keyword)) > 1 { 66 | return fmt.Errorf("invalid Keyword: keywords cannot be camelCased - "+ 67 | "instead split your keyword up into several keywords, this way each word "+ 68 | "of your camelCased string can have its own weight, got '%s'", kw.Keyword) 69 | } 70 | 71 | if err := p.validateCertaintyOrWeight(kw.Weight); err != nil { 72 | return fmt.Errorf("invalid Weight: %s", err) 73 | } 74 | 75 | return nil 76 | } 77 | 78 | func (p SearchParams) validateCertaintyOrWeight(c float32) error { 79 | if c >= 0 && c <= 1 { 80 | return nil 81 | } 82 | 83 | return fmt.Errorf("must be between 0 and 1, but got '%f'", c) 84 | } 85 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search_params_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / 
package schema

import (
	"errors"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/weaviate/contextionary/contextionary"
)

// Test__SchemaSearch_Validation drives SearchParams.Validate through a table
// of valid and invalid parameter combinations and asserts the exact error
// value (or nil) returned for each. The expected error strings must match
// Validate's output verbatim.
func Test__SchemaSearch_Validation(t *testing.T) {
	tests := schemaSearchTests{
		// baseline: all required fields present and in range
		{
			name: "valid params",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
				},
			},
			expectedError: nil,
		},
		// Name is mandatory
		{
			name: "missing search name",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "",
					Certainty:  0.0,
				},
			},
			expectedError: errors.New("Name cannot be empty"),
		},
		// Certainty must lie within [0, 1]; out-of-range values are reported
		// with the offending value formatted to six decimals.
		{
			name: "certainty too low",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  -4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '-4.000000'"),
		},
		{
			name: "certainty too high",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '4.000000'"),
		},
		// NOTE(review): no test case in this table ever sets a Kind field,
		// so this case relies on Kind's zero value triggering the error —
		// confirm Validate still checks Kind at all.
		{
			name: "missing kind on class search",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  0.5,
				},
			},
			expectedError: errors.New("Kind cannot be empty"),
		},
		// keyword list validation: each keyword needs a non-empty name and a
		// weight within [0, 1]
		{
			name: "valid keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "foobar",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: nil,
		},
		{
			name: "keywords with empty names",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: Keyword cannot be empty"),
		},
		{
			name: "keywords with invalid weights",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "bestkeyword",
						Weight:  1.3,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Weight: " +
				"must be between 0 and 1, but got '1.300000'"),
		},
		// camelCased keywords are rejected so each word can carry its own weight
		{
			name: "CamelCased keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "worstKeyword",
						Weight:  0.8,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Keyword: " +
				"keywords cannot be camelCased - instead split your keyword up into several keywords, " +
				"this way each word of your camelCased string can have its own weight, got 'worstKeyword'"),
		},
	}

	tests.AssertValidation(t)
}

// AssertValidation runs each case as a named subtest and compares the error
// returned by Validate against the expected error by value.
func (s schemaSearchTests) AssertValidation(t *testing.T) {
	for _, test := range s {
		t.Run(test.name, func(t *testing.T) {
			err := test.searchParams.Validate()

			// assert error
			assert.Equal(t, test.expectedError, err, "should match the expected error")

		})
	}
}
-------------------------------------------------------------------------------- /extensions/extension.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | type Extension struct { 4 | Concept string `json:"concept"` 5 | Vector []float32 `json:"vector"` 6 | Occurrence int `json:"occurrence"` 7 | Input ExtensionInput `json:"input"` 8 | } 9 | 10 | type ExtensionInput struct { 11 | Definition string `json:"definition"` 12 | Weight float32 `json:"weight"` 13 | } 14 | -------------------------------------------------------------------------------- /extensions/looker_upper.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | type LookerUpper struct { 8 | repo RetrieverRepo 9 | sync.Mutex 10 | db map[string]Extension 11 | } 12 | 13 | type RetrieverRepo interface { 14 | // WatchAll must send an immediate response after opening (for 15 | // initializiation), then send another response whenver the db has changed 16 | WatchAll() chan WatchResponse 17 | } 18 | 19 | func NewLookerUpper(repo RetrieverRepo) *LookerUpper { 20 | lu := &LookerUpper{ 21 | repo: repo, 22 | db: map[string]Extension{}, 23 | } 24 | lu.initWatcher() 25 | return lu 26 | } 27 | 28 | func (lu *LookerUpper) Lookup(concept string) (*Extension, error) { 29 | lu.Lock() 30 | defer lu.Unlock() 31 | 32 | ext, ok := lu.db[concept] 33 | if !ok { 34 | return nil, nil 35 | } 36 | 37 | return &ext, nil 38 | } 39 | 40 | type WatchResponse []Extension 41 | 42 | func (lu *LookerUpper) initWatcher() { 43 | updateCh := lu.repo.WatchAll() 44 | 45 | go func() { 46 | for res := range updateCh { 47 | lu.updateDB(res) 48 | } 49 | }() 50 | } 51 | 52 | func (lu *LookerUpper) updateDB(list []Extension) { 53 | lu.Lock() 54 | defer lu.Unlock() 55 | 56 | for _, ext := range list { 57 | lu.db[ext.Concept] = ext 58 | } 59 | } 60 | 
package extensions

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// Test_LookerUpper verifies cache behavior: unknown concepts yield
// (nil, nil), and concepts pushed through the repo's watch channel become
// visible to Lookup.
func Test_LookerUpper(t *testing.T) {
	t.Run("looking up a non-existant concept", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)
		extension, err := lu.Lookup("non_existing_concept")
		require.Nil(t, err)
		assert.Nil(t, extension)
	})

	t.Run("looking up existing concepts", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)

		t.Run("with an initial concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "flux_capacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			// NOTE(review): add() only blocks until the watcher *receives* the
			// snapshot; the sleep gives updateDB time to apply it. This is
			// timing-based synchronization and could flake on a loaded CI box.
			time.Sleep(100 * time.Millisecond)
			actual, err := lu.Lookup("flux_capacitor")
			require.Nil(t, err)
			assert.Equal(t, &ext, actual)
		})

		t.Run("with second concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "clux_fapacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			time.Sleep(100 * time.Millisecond)

			// the second snapshot contains both entries, so the first concept
			// must still resolve
			t.Run("looking up the original concept", func(t *testing.T) {
				actual, err := lu.Lookup("flux_capacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "flux_capacitor", actual.Concept)
			})

			t.Run("looking up the second concept concept", func(t *testing.T) {
				actual, err := lu.Lookup("clux_fapacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "clux_fapacitor", actual.Concept)
			})
		})
	})
}

// newFakeRepo builds a fakeRepo with an unbuffered watch channel.
func newFakeRepo() *fakeRepo {
	repo := &fakeRepo{
		ch: make(chan WatchResponse),
	}

	return repo
}

// fakeRepo is a RetrieverRepo test double that replays its accumulated
// extension list on every add().
type fakeRepo struct {
	ch         chan WatchResponse
	extensions []Extension
}

func (f *fakeRepo) WatchAll() chan WatchResponse {
	return f.ch
}

// add appends ex and pushes the full list to the watcher; the send blocks
// until the LookerUpper goroutine receives it (unbuffered channel).
func (f *fakeRepo) add(ex Extension) {
	f.extensions = append(f.extensions, ex)
	f.ch <- f.extensions
}
36 | Debug("received request to add/replace custom extension") 37 | 38 | err := s.validate(concept, input) 39 | if err != nil { 40 | return errors.NewInvalidUserInputf("invalid extension: %v", err) 41 | } 42 | 43 | vector, err := s.vectorizer.Corpi([]string{input.Definition}, nil) 44 | if err != nil { 45 | return errors.NewInternalf("vectorize definition: %v", err) 46 | } 47 | 48 | concept = s.compound(concept) 49 | 50 | ext := Extension{ 51 | Concept: concept, 52 | Input: input, 53 | Vector: vector.ToArray(), // nil-check can be omitted as vectorizer will return non-nil if err==nil 54 | Occurrence: 1000, // TODO: Improve! 55 | } 56 | 57 | s.logger.WithField("action", "extensions_put_prestore"). 58 | WithField("concept", ext.Concept). 59 | WithField("extension", ext). 60 | Debug("calculated vector, about to store in repo") 61 | 62 | err = s.repo.Put(ctx, ext) 63 | if err != nil { 64 | s.logger.WithField("action", "extensions_store_error"). 65 | WithField("concept", ext.Concept). 66 | Errorf("repo put: %v", err) 67 | return errors.NewInternalf("store extension: %v", err) 68 | } 69 | 70 | s.logger.WithField("action", "extensions_put_poststore"). 71 | WithField("concept", ext.Concept). 72 | Debug("successfully stored extension in repo") 73 | 74 | return nil 75 | } 76 | 77 | func (s *Storer) compound(inp string) string { 78 | parts := strings.Split(inp, " ") 79 | return strings.Join(parts, "_") 80 | } 81 | 82 | func (s *Storer) validate(concept string, input ExtensionInput) error { 83 | if len(concept) < 2 { 84 | return fmt.Errorf("concept must have at least two characters") 85 | } 86 | 87 | for _, r := range concept { 88 | if !unicode.IsLower(r) && !unicode.IsSpace(r) && !unicode.IsNumber(r) { 89 | return fmt.Errorf("concept must be made up of all lowercase letters and/or numbers, " + 90 | "for custom compund words use spaces, e.g. 
package extensions

import (
	"context"
	"fmt"
	"testing"

	"github.com/sirupsen/logrus/hooks/test"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"
	core "github.com/weaviate/contextionary/contextionary/core"
)

// Test_Storer covers Storer.Put's input validation (expected error strings
// are matched verbatim, typos included) plus the two happy paths: a single
// word and a space-separated compound that must be stored snake_cased.
func Test_Storer(t *testing.T) {
	t.Run("with invalid inputs", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		type testCase struct {
			concept     string
			inp         ExtensionInput
			expectedErr error
		}

		tests := []testCase{
			testCase{
				concept:     "lowerAndUpperCase",
				expectedErr: fmt.Errorf("invalid extension: concept must be made up of all lowercase letters and/or numbers, for custom compund words use spaces, e.g. 'flux capacitor'"),
				inp:         inp,
			},
			testCase{
				concept:     "a",
				expectedErr: fmt.Errorf("invalid extension: concept must have at least two characters"),
				inp:         inp,
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: definition cannot be empty"),
				inp:         ExtensionInput{Weight: 1},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: -1, Definition: "foo bar"},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: 3, Definition: "foo bar"},
			},
			testCase{ // TODO: add feature, then remove limitation
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weights below 1 (extending an existing concept) not supported yet - coming soon"),
				inp:         ExtensionInput{Weight: 0.7, Definition: "foo bar"},
			},
		}

		for _, test := range tests {
			t.Run(test.concept, func(t *testing.T) {
				err := s.Put(context.Background(), test.concept, test.inp)
				assert.Equal(t, test.expectedErr.Error(), err.Error())
			})
		}
	})

	t.Run("with valid input (single word)", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "capacitor"
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		// the fakeVectorizer always yields {1,2,3}; occurrence is the fixed
		// placeholder value used by Storer.Put
		expectedExtension := Extension{
			Input:      inp,
			Concept:    concept,
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)

	})

	t.Run("with valid input (compound word)", func(t *testing.T) {
		// this is a special case because users will input their words using
		// spaces, but we store them using snake_case
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "flux capacitor"
		inp := ExtensionInput{
			Definition: "an energy source for cars to travel through time",
			Weight:     1,
		}

		expectedExtension := Extension{
			Input:      inp,
			Concept:    "flux_capacitor",
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)
	})
}

// fakeVectorizer is a Vectorizer double returning a fixed vector.
type fakeVectorizer struct{}

func (f *fakeVectorizer) Corpi(corpi []string, overrides map[string]string) (*core.Vector, error) {
	v := core.NewVector([]float32{1, 2, 3})
	return &v, nil
}

// fakeStorerRepo records Put calls via testify/mock so expectations on the
// exact stored Extension can be asserted.
type fakeStorerRepo struct {
	mock.Mock
}

func (f *fakeStorerRepo) Put(ctx context.Context, ext Extension) error {
	args := f.Called(ext)
	return args.Error(0)
}
github.com/sirupsen/logrus v1.6.0 13 | github.com/stretchr/testify v1.6.1 14 | github.com/syndtr/goleveldb v0.0.0-20180708030551-c4c61651e9e3 15 | google.golang.org/grpc v1.24.0 16 | ) 17 | -------------------------------------------------------------------------------- /logparser/parse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "os" 9 | ) 10 | 11 | type logEntry struct { 12 | Action string `json:"action"` 13 | Words []word `json:"words"` 14 | } 15 | 16 | type word struct { 17 | Occurrence int `json:"occurrence"` 18 | Weight float64 `json:"weight"` 19 | Word string `json:"word"` 20 | } 21 | 22 | func main() { 23 | scanner := bufio.NewScanner(os.Stdin) 24 | var results []logEntry 25 | 26 | for scanner.Scan() { 27 | var current logEntry 28 | err := json.Unmarshal(scanner.Bytes(), ¤t) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | 33 | if current.Action == "debug_vector_weights" { 34 | results = append(results, current) 35 | } 36 | } 37 | 38 | marshalled, err := json.MarshalIndent(results, "", " ") 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | fmt.Print(string(marshalled)) 44 | } 45 | -------------------------------------------------------------------------------- /main/splitter_preprocessor.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/weaviate/contextionary/preprocessing" 8 | ) 9 | 10 | func main() { 11 | if len(os.Args) != 5 { 12 | missing := fmt.Errorf("Missing arguments requires: [.idx, .dic, .aff, output_file]") 13 | panic(missing.Error()) 14 | } 15 | 16 | err := preprocessing.GenerateSplittingDictFile(os.Args[1], os.Args[2], os.Args[3], os.Args[4]) 17 | if err != nil { 18 | panic(err.Error()) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
/prepare_docker_buildx.sh: -------------------------------------------------------------------------------- 1 | docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 2 | docker buildx create --name multiarch --driver docker-container --use 3 | docker buildx inspect --bootstrap 4 | -------------------------------------------------------------------------------- /preprocessing/dictionary_pre_processing.go: -------------------------------------------------------------------------------- 1 | package preprocessing 2 | 3 | import ( 4 | "encoding/binary" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "strings" 10 | ) 11 | 12 | // PreprocessDict temp storage for reading in the index file 13 | type PreprocessDict struct { 14 | dict map[string]int 15 | } 16 | 17 | // GenerateSplittingDictFile from 18 | // 19 | // contextionaryIndexFile binary .idx file containing the words for the specific language 20 | // languageDictionaryFile a hunspell .dic file for the specific language 21 | // languageAffixesFile a hunspell .aff file for the specific language 22 | // to reduce file- and hunspell dependencies for the splitter 23 | func GenerateSplittingDictFile(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string, outputFile string) error { 24 | dict := NewPreprocessDict(contextionaryIndexFile, languageDictionaryFile, languageAffixesFile) 25 | out, err := os.Create(outputFile) 26 | if err != nil { 27 | return err 28 | } 29 | defer out.Close() 30 | 31 | for word, occurrence := range dict.dict { 32 | line := fmt.Sprintf("%s,%v\n", word, occurrence) 33 | _, err := out.Write([]byte(line)) 34 | if err != nil { 35 | return err 36 | } 37 | } 38 | return nil 39 | } 40 | 41 | // NewPreprocessDict from 42 | // 43 | // contextionaryIndexFile binary .idx file containing the words for the specific language 44 | // languageDictionaryFile a hunspell .dic file for the specific language 45 | // languageAffixesFile a hunspell .aff file for 
the specific language 46 | func NewPreprocessDict(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string) *PreprocessDict { 47 | dict := &PreprocessDict{ 48 | dict: make(map[string]int, 1200000), 49 | } 50 | hunspellFilter := Hunspell(languageAffixesFile, languageDictionaryFile) 51 | 52 | err := dict.loadContextionary(contextionaryIndexFile, hunspellFilter) 53 | if err != nil { 54 | panic(err.Error()) 55 | } 56 | return dict 57 | } 58 | 59 | // loadContextionary from binary file 60 | func (cd *PreprocessDict) loadContextionary(path string, filter *Hunhandle) error { 61 | data, readFileErr := ioutil.ReadFile(path) 62 | if readFileErr != nil { 63 | return readFileErr 64 | } 65 | 66 | // File format: 67 | // https://github.com/weaviate/weaviate-vector-generator#wordlist-file-format 68 | nrWordsBytes := data[0:8] 69 | //vectorLengthBytes := data[8:16] 70 | metaDataLengthBytes := data[16:24] 71 | 72 | nrWords := binary.LittleEndian.Uint64(nrWordsBytes) 73 | //vectorLength := binary.LittleEndian.Uint64(vectorLengthBytes) 74 | metaDataLength := binary.LittleEndian.Uint64(metaDataLengthBytes) 75 | 76 | // Read meta data 77 | metaDataBytes := data[24 : 24+metaDataLength] 78 | var metadata map[string]interface{} 79 | unMarshalErr := json.Unmarshal(metaDataBytes, &metadata) 80 | if unMarshalErr != nil { 81 | return unMarshalErr 82 | } 83 | 84 | var startOfTable uint64 = 24 + uint64(metaDataLength) 85 | var offset uint64 = 4 - (startOfTable % 4) 86 | startOfTable += offset 87 | 88 | for wordIndex := uint64(0); wordIndex < nrWords; wordIndex++ { 89 | // entryAddress is the index in the data where the pointer to 90 | // the word is located 91 | entryAddress := startOfTable + 8*wordIndex 92 | pointerToWordByte := data[entryAddress : entryAddress+8] 93 | pointerToWord := binary.LittleEndian.Uint64(pointerToWordByte) 94 | word, occurence := getWordAndOccurence(data, pointerToWord) 95 | // Only add the word if it passes the filter 96 | if 
passesFilter(word, filter) { 97 | cd.dict[word] = int(occurence) 98 | } 99 | } 100 | 101 | return nil 102 | } 103 | 104 | // getWordAndOccurence from the data frame indecated by the pointer 105 | func getWordAndOccurence(data []byte, pointer uint64) (string, uint64) { 106 | ocurrence := binary.LittleEndian.Uint64(data[pointer : pointer+8]) 107 | 108 | pointer = pointer + 8 109 | for i := uint64(0); ; i++ { 110 | if data[pointer+i] == '\x00' { 111 | word := string(data[pointer : pointer+i]) 112 | return word, ocurrence 113 | } 114 | } 115 | } 116 | 117 | // passesFilter if the word is in the dictionary of the given language 118 | func passesFilter(word string, filter *Hunhandle) bool { 119 | inDict := filter.Spell(word) 120 | if inDict { 121 | return true 122 | } 123 | // Check if upper case word 124 | inDict = filter.Spell(strings.Title(word)) 125 | return inDict 126 | } 127 | -------------------------------------------------------------------------------- /preprocessing/dictionary_pre_processing_test.go: -------------------------------------------------------------------------------- 1 | package preprocessing 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/weaviate/contextionary/compoundsplitting" 11 | ) 12 | 13 | func TestPreprocessorSplitterDictFile(t *testing.T) { 14 | // Create the file 15 | outputFile := "test_dict.splitdict" 16 | GenerateSplittingDictFile("../test/compoundsplitting/contextionary.idx", "../test/compoundsplitting/nl_NL.dic", "../test/compoundsplitting/nl_NL.aff", outputFile) 17 | 18 | // Validate the output file 19 | file, err := os.Open(outputFile) 20 | if err != nil { 21 | t.Fail() 22 | } 23 | defer file.Close() 24 | 25 | scanner := bufio.NewScanner(file) 26 | found := false 27 | for scanner.Scan() { 28 | line := scanner.Text() 29 | split := strings.Split(line, ",") 30 | if split[0] == "appellantes" { 31 | found = true 32 | break 33 | } 34 | } 35 | assert.True(t, 
package preprocessing

// NOTE(review): the three #include header names below were lost to text
// extraction in the reviewed copy; restored to the headers this file
// plainly requires (free -> stdlib.h, Hunhandle/Hunspell_* -> hunspell.h).
// Confirm against the original file.

// #cgo linux LDFLAGS: -lhunspell
// #cgo darwin LDFLAGS: -lhunspell-1.7 -L/usr/local/Cellar/hunspell/1.7.0_2/lib
// #cgo darwin CFLAGS: -I/usr/local/Cellar/hunspell/1.7.0_2/include/
//
// #include <stdlib.h>
// #include <stdio.h>
// #include <hunspell/hunspell.h>
import "C"
import (
	"reflect"
	"runtime"
	"sync"
	"unsafe"
)

// Code in this file copied/based on
// https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/hunspell.go
// Original is licensed under "MIT License" Original license located at:
// https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/LICENSE

// Hunhandle wraps a C hunspell handle; the mutex serializes calls into the
// C library for the methods that take it.
type Hunhandle struct {
	handle *C.Hunhandle
	lock   *sync.Mutex
}

// Hunspell creates a handle from a .aff and .dic file path. The C handle is
// released by a GC finalizer, so callers need not free it explicitly.
func Hunspell(affpath string, dpath string) *Hunhandle {

	affpathcs := C.CString(affpath)
	defer C.free(unsafe.Pointer(affpathcs))

	dpathcs := C.CString(dpath)
	defer C.free(unsafe.Pointer(dpathcs))

	h := &Hunhandle{lock: new(sync.Mutex)}
	h.handle = C.Hunspell_create(affpathcs, dpathcs)

	runtime.SetFinalizer(h, func(handle *Hunhandle) {
		C.Hunspell_destroy(handle.handle)
		h.handle = nil
	})

	return h
}

// CArrayToString copies a C array of l C strings into a Go []string.
// NOTE(review): constructing a slice via reflect.SliceHeader like this is
// the pattern the upstream copy used; it is deprecated/unsafe in newer Go
// (prefer unsafe.Slice) — revisit when the module's Go version is raised.
func CArrayToString(c **C.char, l int) []string {

	s := []string{}

	hdr := reflect.SliceHeader{
		Data: uintptr(unsafe.Pointer(c)),
		Len:  l,
		Cap:  l,
	}

	for _, v := range *(*[]*C.char)(unsafe.Pointer(&hdr)) {
		s = append(s, C.GoString(v))
	}

	return s
}

// Suggest returns hunspell's spelling suggestions for word. The C suggestion
// list is copied out and freed before returning.
func (handle *Hunhandle) Suggest(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))

	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_suggest(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}

// Add inserts word into the runtime dictionary.
// NOTE(review): unlike Suggest/Stem/Spell this method does not take the
// lock, and the return value assumes a non-zero C result means failure —
// confirm both against the hunspell API before relying on them.
func (handle *Hunhandle) Add(word string) bool {

	cWord := C.CString(word)
	defer C.free(unsafe.Pointer(cWord))

	var r C.int
	r = C.Hunspell_add(handle.handle, cWord)

	if int(r) != 0 {
		return false
	}

	return true
}

// Stem returns the stem(s) hunspell derives for word.
func (handle *Hunhandle) Stem(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))
	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_stem(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}

// Spell reports whether word is spelled correctly according to the loaded
// dictionary (C result 0 means misspelled).
func (handle *Hunhandle) Spell(word string) bool {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))
	handle.lock.Lock()
	res := C.Hunspell_spell(handle.handle, wordcs)
	handle.lock.Unlock()

	if int(res) == 0 {
		return false
	}
	return true
}
package config

import (
	"fmt"
	"os"
	"strconv"

	"github.com/sirupsen/logrus"
)

// Config is used to load application wide config from the environment
type Config struct {
	logger logrus.FieldLogger

	// required data files (env: KNN_FILE, IDX_FILE, STOPWORDS_FILE)
	KNNFile       string
	IDXFile       string
	StopwordsFile string

	// schema/extension storage wiring; all optional with defaults set in init
	SchemaProviderURL       string
	SchemaProviderKey       string
	ExtensionsPrefix        string
	ExtensionsStorageOrigin string
	ExtensionsStorageMode   string

	ServerPort int

	// vectorization tuning knobs; see init for env var names and defaults
	OccurrenceWeightStrategy           string
	OccurrenceWeightLinearFactor       float32
	MaxCompoundWordLength              int
	MaximumBatchSize                   int
	MaximumVectorCacheSize             int
	NeighborOccurrenceIgnorePercentile int

	// NOTE(review): field name carries a typo ("Compund") but is exported
	// and read elsewhere — renaming would break callers.
	EnableCompundSplitting          bool
	CompoundSplittingDictionaryFile string

	LogLevel string
}

// New Config from the environment. Errors if required env vars can't be found
func New(logger logrus.FieldLogger) (*Config, error) {
	cfg := &Config{logger: logger}
	if err := cfg.init(); err != nil {
		return nil, fmt.Errorf("could not load config from env: %v", err)
	}

	return cfg, nil
}

// init reads every supported env var in a fixed order: the three required
// files first, then optional settings with their defaults. It returns the
// first error encountered (missing required var, unparseable number, or
// out-of-range percentile).
func (c *Config) init() error {
	knn, err := c.requiredString("KNN_FILE")
	if err != nil {
		return err
	}
	c.KNNFile = knn

	idx, err := c.requiredString("IDX_FILE")
	if err != nil {
		return err
	}
	c.IDXFile = idx

	sw, err := c.requiredString("STOPWORDS_FILE")
	if err != nil {
		return err
	}
	c.StopwordsFile = sw

	sp := c.optionalString("SCHEMA_PROVIDER_URL", "")
	c.SchemaProviderURL = sp

	spk := c.optionalString("SCHEMA_PROVIDER_KEY", "/weaviate/schema/state")
	c.SchemaProviderKey = spk

	ep := c.optionalString("EXTENSIONS_PREFIX", "/contextionary/")
	c.ExtensionsPrefix = ep

	extMode := c.optionalString("EXTENSIONS_STORAGE_MODE", "weaviate")
	c.ExtensionsStorageMode = extMode

	extOrigin := c.optionalString("EXTENSIONS_STORAGE_ORIGIN", "")
	c.ExtensionsStorageOrigin = extOrigin

	port, err := c.optionalInt("SERVER_PORT", 9999)
	if err != nil {
		return err
	}
	c.ServerPort = port

	factor, err := c.optionalFloat32("OCCURRENCE_WEIGHT_LINEAR_FACTOR", 0.5)
	if err != nil {
		return err
	}
	c.OccurrenceWeightLinearFactor = factor

	ignorePercentile, err := c.optionalInt("NEIGHBOR_OCCURRENCE_IGNORE_PERCENTILE", 5)
	if err != nil {
		return err
	}

	// the only range-validated numeric setting: must be a percentile
	if ignorePercentile < 0 || ignorePercentile > 100 {
		return fmt.Errorf("minimum relative neighbor occurrence must be a value between 0 and 100, got: %d", ignorePercentile)
	}

	c.NeighborOccurrenceIgnorePercentile = ignorePercentile

	strategy := c.optionalString("OCCURRENCE_WEIGHT_STRATEGY", "log")
	c.OccurrenceWeightStrategy = strategy

	// this should match the underlying vector db file, a smaller value than in
	// the vector file will lead to missing out on compound words, whereas a
	// larger value will lead to unnecessary lookups slowing down the
	// vectorization process
	// NOTE(review): the default of 1 effectively disables compound-word
	// lookups — confirm this is intended as the out-of-the-box behavior.
	compoundLength, err := c.optionalInt("MAX_COMPOUND_WORD_LENGTH", 1)
	if err != nil {
		return err
	}
	c.MaxCompoundWordLength = compoundLength

	batchSize, err := c.optionalInt("MAX_BATCH_SIZE", 200)
	if err != nil {
		return err
	}
	c.MaximumBatchSize = batchSize

	vectorCacheSize, err := c.optionalInt("MAX_VECTORCACHE_SIZE", 10000)
	if err != nil {
		return err
	}
	c.MaximumVectorCacheSize = vectorCacheSize

	c.EnableCompundSplitting = c.optionalBool("ENABLE_COMPOUND_SPLITTING", false)

	// the dict file is only required when splitting is switched on
	if c.EnableCompundSplitting {
		compoundSplittingDictionaryFile, err := c.requiredString("COMPOUND_SPLITTING_DICTIONARY_FILE")
		if err != nil {
			return err
		}
		c.CompoundSplittingDictionaryFile = compoundSplittingDictionaryFile
	}

	loglevel := c.optionalString("LOG_LEVEL", "info")
	c.LogLevel = loglevel

	return nil
}
"log") 107 | c.OccurrenceWeightStrategy = strategy 108 | 109 | // this should match the underlying vector db file, a smaller value than in 110 | // the vector file will lead to missing out on compound words, whereas a 111 | // larger value will lead to unnecessary lookups slowing down the 112 | // vectorization process 113 | compoundLength, err := c.optionalInt("MAX_COMPOUND_WORD_LENGTH", 1) 114 | if err != nil { 115 | return err 116 | } 117 | c.MaxCompoundWordLength = compoundLength 118 | 119 | batchSize, err := c.optionalInt("MAX_BATCH_SIZE", 200) 120 | if err != nil { 121 | return err 122 | } 123 | c.MaximumBatchSize = batchSize 124 | 125 | vectorCacheSize, err := c.optionalInt("MAX_VECTORCACHE_SIZE", 10000) 126 | if err != nil { 127 | return err 128 | } 129 | c.MaximumVectorCacheSize = vectorCacheSize 130 | 131 | c.EnableCompundSplitting = c.optionalBool("ENABLE_COMPOUND_SPLITTING", false) 132 | 133 | if c.EnableCompundSplitting { 134 | compoundSplittingDictionaryFile, err := c.requiredString("COMPOUND_SPLITTING_DICTIONARY_FILE") 135 | if err != nil { 136 | return err 137 | } 138 | c.CompoundSplittingDictionaryFile = compoundSplittingDictionaryFile 139 | } 140 | 141 | loglevel := c.optionalString("LOG_LEVEL", "info") 142 | c.LogLevel = loglevel 143 | 144 | return nil 145 | } 146 | 147 | func (c *Config) optionalInt(varName string, defaultValue int) (int, error) { 148 | value := os.Getenv(varName) 149 | if value == "" { 150 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 151 | varName, defaultValue) 152 | return defaultValue, nil 153 | } 154 | 155 | asInt, err := strconv.Atoi(value) 156 | if err != nil { 157 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s", 158 | varName, value, err) 159 | } 160 | 161 | return asInt, nil 162 | } 163 | 164 | func (c *Config) optionalFloat32(varName string, defaultValue float32) (float32, error) { 165 | value := os.Getenv(varName) 166 | if value == "" { 167 | c.logger.Infof("optional 
var '%s' is not set, defaulting to '%v'", 168 | varName, defaultValue) 169 | return defaultValue, nil 170 | } 171 | 172 | asFloat, err := strconv.ParseFloat(value, 32) 173 | if err != nil { 174 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s", 175 | varName, value, err) 176 | } 177 | 178 | return float32(asFloat), nil 179 | } 180 | 181 | func (c *Config) requiredString(varName string) (string, error) { 182 | value := os.Getenv(varName) 183 | if value == "" { 184 | return "", fmt.Errorf("required variable '%s' is not set", varName) 185 | } 186 | 187 | return value, nil 188 | } 189 | 190 | func (c *Config) optionalString(varName, defaultInput string) string { 191 | value := os.Getenv(varName) 192 | if value == "" { 193 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 194 | varName, defaultInput) 195 | return defaultInput 196 | } 197 | 198 | return value 199 | } 200 | 201 | func (c *Config) optionalBool(varName string, defaultInput bool) bool { 202 | value := os.Getenv(varName) 203 | if value == "" { 204 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 205 | varName, defaultInput) 206 | return defaultInput 207 | } 208 | 209 | return value == "true" || value == "1" || value == "on" || value == "enabled" 210 | } 211 | -------------------------------------------------------------------------------- /server/contextionary.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/weaviate/contextionary/compoundsplitting" 8 | 9 | "github.com/weaviate/contextionary/adapters/repos" 10 | core "github.com/weaviate/contextionary/contextionary/core" 11 | "github.com/weaviate/contextionary/contextionary/core/stopwords" 12 | "github.com/weaviate/contextionary/extensions" 13 | ) 14 | 15 | func (s *server) init() error { 16 | s.logger.WithField("config", s.config).Debugf("starting up with this config") 17 | 18 | if err 
:= s.loadRawContextionary(); err != nil { 19 | return err 20 | } 21 | 22 | swDetector, err := stopwords.NewFromFile(s.config.StopwordsFile) 23 | if err != nil { 24 | return err 25 | } 26 | s.stopwordDetector = swDetector 27 | 28 | if err := s.buildContextionary(); err != nil { 29 | return err 30 | } 31 | 32 | var er extensionRepo 33 | var extensionRetriever extensionLookerUpper 34 | 35 | // ExtensionsStorageMode == "weaviate" is now a default storage option 36 | er = repos.NewExtensionsRepo(s.logger, s.config, 1*time.Second) 37 | extensionRetriever = extensions.NewLookerUpper(er) 38 | 39 | compoundSplitter, err := s.initCompoundSplitter() 40 | if err != nil { 41 | return err 42 | } 43 | vectorizer, err := NewVectorizer(s.rawContextionary, s.stopwordDetector, s.config, s.logger, 44 | NewSplitter(), extensionRetriever, compoundSplitter) 45 | if err != nil { 46 | return err 47 | } 48 | 49 | s.vectorizer = vectorizer 50 | s.extensionStorer = extensions.NewStorer(s.vectorizer, er, s.logger) 51 | s.extensionLookerUpper = extensionRetriever 52 | 53 | return nil 54 | } 55 | 56 | func (s *server) loadRawContextionary() error { 57 | c, err := core.LoadVectorFromDisk(s.config.KNNFile, s.config.IDXFile) 58 | if err != nil { 59 | return fmt.Errorf("could not initialize (raw) contextionary: %v", err) 60 | } 61 | 62 | s.rawContextionary = c 63 | return nil 64 | } 65 | 66 | type stopwordDetector interface { 67 | IsStopWord(word string) bool 68 | } 69 | 70 | // any time the schema changes the contextionary needs to be rebuilt. 
71 | func (s *server) buildContextionary() error { 72 | s.combinedContextionary = s.rawContextionary 73 | return nil 74 | } 75 | 76 | func (s *server) initCompoundSplitter() (compoundSplitter, error) { 77 | if s.config.EnableCompundSplitting { 78 | dict, err := compoundsplitting.NewContextionaryDict(s.config.CompoundSplittingDictionaryFile) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return compoundsplitting.NewSplitter(dict), nil 83 | } else { 84 | return compoundsplitting.NewNoopSplitter(), nil 85 | } 86 | } 87 | 88 | type extensionRepo interface { 89 | extensions.RetrieverRepo 90 | extensions.StorerRepo 91 | } 92 | -------------------------------------------------------------------------------- /server/grpc_error.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/weaviate/contextionary/errors" 5 | "google.golang.org/grpc/codes" 6 | "google.golang.org/grpc/status" 7 | ) 8 | 9 | func GrpcErrFromTyped(err error) error { 10 | if err == nil { 11 | return nil 12 | } 13 | 14 | switch err.(type) { 15 | case errors.InvalidUserInput: 16 | return status.Error(codes.InvalidArgument, err.Error()) 17 | case errors.Internal: 18 | return status.Error(codes.Internal, err.Error()) 19 | case errors.NotFound: 20 | return status.Error(codes.NotFound, err.Error()) 21 | default: 22 | return status.Error(codes.Unknown, err.Error()) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /server/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "os" 7 | 8 | "github.com/sirupsen/logrus" 9 | pb "github.com/weaviate/contextionary/contextionary" 10 | core "github.com/weaviate/contextionary/contextionary/core" 11 | "github.com/weaviate/contextionary/extensions" 12 | "github.com/weaviate/contextionary/server/config" 13 | grpc "google.golang.org/grpc" 14 | ) 
15 | 16 | // Version is filled through a build arg 17 | var Version string 18 | 19 | func main() { 20 | server := new() 21 | server.logger.WithField("version", Version).Info() 22 | grpcServer := grpc.NewServer() 23 | pb.RegisterContextionaryServer(grpcServer, server) 24 | lis, err := net.Listen("tcp", fmt.Sprintf(":%d", server.config.ServerPort)) 25 | if err != nil { 26 | server.logger.Errorf("can't listen on port: %s", err) 27 | os.Exit(1) 28 | } 29 | 30 | grpcServer.Serve(lis) 31 | } 32 | 33 | type server struct { 34 | // to be used to serve rpc requests, combination of the raw contextionary 35 | // and the schema 36 | combinedContextionary core.Contextionary 37 | 38 | // initialized at startup, to be used to build the 39 | // schema contextionary 40 | rawContextionary core.Contextionary 41 | 42 | config *config.Config 43 | 44 | logger logrus.FieldLogger 45 | 46 | // ucs 47 | extensionStorer *extensions.Storer 48 | extensionLookerUpper extensionLookerUpper 49 | stopwordDetector stopwordDetector 50 | vectorizer *Vectorizer 51 | } 52 | 53 | // new gRPC server to serve the contextionary 54 | func new() *server { 55 | logger := logrus.New() 56 | logger.SetFormatter(&logrus.JSONFormatter{}) 57 | cfg, err := config.New(logger) 58 | if err != nil { 59 | logger. 60 | WithError(err). 61 | Errorf("cannot start up") 62 | os.Exit(1) 63 | } 64 | 65 | loglevel, err := logrus.ParseLevel(cfg.LogLevel) 66 | if err != nil { 67 | logger. 68 | WithError(err). 69 | Errorf("cannot start up") 70 | os.Exit(1) 71 | } 72 | logger.SetLevel(loglevel) 73 | logger.WithField("log_level", loglevel.String()).Info() 74 | 75 | s := &server{ 76 | config: cfg, 77 | logger: logger, 78 | } 79 | 80 | err = s.init() 81 | if err != nil { 82 | logger. 83 | WithError(err). 
// NewSplitter constructs a ready-to-use Splitter.
func NewSplitter() *Splitter {
	return &Splitter{}
}

// Splitter tokenizes a corpus into words.
type Splitter struct{}

// Split breaks the corpus into tokens, treating every rune that is neither a
// letter nor a number as a separator. Runs of separators never produce empty
// tokens.
func (s *Splitter) Split(corpus string) []string {
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}
	return strings.FieldsFunc(corpus, isSeparator)
}
[]string{"foobar", "baz", "baq"}, 55 | }, 56 | 57 | testcase{ 58 | name: "words containing umlauts (upper and lower)", 59 | input: "Ölpreis über 80 dollar!", 60 | output: []string{"Ölpreis", "über", "80", "dollar"}, 61 | }, 62 | 63 | testcase{ 64 | name: "words containing turkish characters", 65 | input: "Ölpreis über 80 dollar!", 66 | output: []string{"Ölpreis", "über", "80", "dollar"}, 67 | }, 68 | 69 | testcase{ 70 | name: "words containing turkish characters", 71 | input: "Weaviate ayrıca Türkçe konuşabilir", 72 | output: []string{"Weaviate", "ayrıca", "Türkçe", "konuşabilir"}, 73 | }, 74 | 75 | testcase{ 76 | name: "mixed characters including a '<'", 77 | input: "car, car#of,,,,brand 0 { 60 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, "")) 61 | currOperandDigits = nil 62 | } 63 | 64 | // We will eventually append our current operator to the operator stack. 65 | // However, first it must be compared against current operators, if the 66 | // top of the stack has a higher or equal precedence to the current one, 67 | // we will pop that first. 
We continue this pattern until either the 68 | // stack is empty or the topmost element of the stack is of lower 69 | // precedence than the current 70 | for len(operatorStack) > 0 { 71 | topStack := operatorStack[len(operatorStack)-1] 72 | if operatorPrecedence(topStack) < operatorPrecedence(string(r)) { 73 | break 74 | } 75 | 76 | e.parsedStack = append(e.parsedStack, topStack) 77 | operatorStack = operatorStack[:len(operatorStack)-1] 78 | } 79 | operatorStack = append(operatorStack, string(r)) 80 | } 81 | 82 | // in case the expression ends with an operand, we need to check again if the 83 | // temp digit stack still contains elements 84 | if len(currOperandDigits) > 0 { 85 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, "")) 86 | currOperandDigits = nil 87 | } 88 | 89 | // append the remainder of the operatorStack (if any) to the parsed output in 90 | // reverse order 91 | e.parsedStack = append(e.parsedStack, reverseSlice(operatorStack)...) 92 | return nil 93 | } 94 | 95 | func (e *Evaluator) unrecognizedOperator(op string) error { 96 | if op == "(" || op == ")" { 97 | return fmt.Errorf("using parantheses in the expression is not supported") 98 | } 99 | 100 | return fmt.Errorf("unrecognized operator: %s", string(op)) 101 | } 102 | 103 | func (e Evaluator) evaluate() (float64, error) { 104 | var operandStack []float64 105 | for _, item := range e.parsedStack { 106 | if !isOperator(item) { 107 | // not an operator, so it must be an operand 108 | num, err := e.parseNumberOrVariable(item) 109 | if err != nil { 110 | return 0, err 111 | } 112 | 113 | operandStack = append(operandStack, num) 114 | continue 115 | } 116 | 117 | // is an operator 118 | if len(operandStack) < 2 { 119 | return 0, fmt.Errorf("invalid or unsupported math expression") 120 | } 121 | 122 | // note that the topStack is the right operator, whereas topStack-1 is the left! 
123 | op1, op2 := operandStack[len(operandStack)-2], operandStack[len(operandStack)-1] 124 | operandStack = operandStack[:len(operandStack)-2] 125 | 126 | res, err := evaluteOperator(item, op1, op2) 127 | if err != nil { 128 | return 0, err 129 | } 130 | operandStack = append(operandStack, res) 131 | } 132 | 133 | if len(operandStack) != 1 { 134 | return 0, fmt.Errorf("could not evaluate mathematical expression") 135 | } 136 | 137 | return operandStack[0], nil 138 | } 139 | 140 | func evaluteOperator(op string, left, right float64) (float64, error) { 141 | switch op { 142 | case "+": 143 | return left + right, nil 144 | case "-": 145 | return left - right, nil 146 | case "*": 147 | return left * right, nil 148 | case "/": 149 | return left / right, nil 150 | default: 151 | return 0, fmt.Errorf("this should be unreachable - or the implentation of an operator is missing") 152 | } 153 | } 154 | 155 | func isOperator(in string) bool { 156 | switch in { 157 | case "*", "+", "-", "/": 158 | return true 159 | default: 160 | return false 161 | } 162 | } 163 | 164 | // we allow numbers, the dot as a floating point symbol, as well as letters to 165 | // represent variables 166 | func isOperand(r rune) bool { 167 | if unicode.IsLetter(r) || unicode.IsNumber(r) || string(r) == "." 
{ 168 | return true 169 | } 170 | return false 171 | } 172 | 173 | func (e *Evaluator) parseNumberOrVariable(in string) (float64, error) { 174 | r := rune(in[0]) 175 | if unicode.IsNumber(r) { 176 | return strconv.ParseFloat(in, 64) 177 | } else { 178 | if in == "w" { 179 | return e.originalWeight, nil 180 | } 181 | return 0, fmt.Errorf("unrecognized variable '%s', use 'w' to represent original weight", in) 182 | } 183 | } 184 | 185 | func operatorPrecedence(op string) int { 186 | switch op { 187 | case "+", "-": 188 | return 1 189 | case "*", "/": 190 | return 2 191 | default: 192 | return -1 193 | } 194 | } 195 | 196 | // from https://github.com/golang/go/wiki/SliceTricks 197 | func reverseSlice(a []string) []string { 198 | for i := len(a)/2 - 1; i >= 0; i-- { 199 | opp := len(a) - 1 - i 200 | a[i], a[opp] = a[opp], a[i] 201 | } 202 | 203 | return a 204 | } 205 | -------------------------------------------------------------------------------- /server/weight_manipulator_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestWeightManipulator(t *testing.T) { 12 | 13 | type test struct { 14 | originalWeight float64 15 | expression string 16 | expectedResult float64 17 | expectedError error 18 | name string 19 | } 20 | 21 | tests := []test{ 22 | 23 | test{ 24 | originalWeight: 2.0, 25 | expression: "7", 26 | expectedResult: 7.0, 27 | expectedError: nil, 28 | name: "single operand, no operators", 29 | }, 30 | test{ 31 | originalWeight: 2.0, 32 | expression: "17", 33 | expectedResult: 17.0, 34 | expectedError: nil, 35 | name: "single operand, more than one digit", 36 | }, 37 | test{ 38 | originalWeight: 2.0, 39 | expression: "15.662", 40 | expectedResult: 15.662, 41 | expectedError: nil, 42 | name: "single operand, floating point using . 
as decimal", 43 | }, 44 | test{ 45 | originalWeight: 2.0, 46 | expression: "w * 2", 47 | expectedResult: 4.0, 48 | expectedError: nil, 49 | name: "simple multiplication", 50 | }, 51 | test{ 52 | originalWeight: 2.0, 53 | expression: "w * 2 * 3 * 4", 54 | expectedResult: 48.0, 55 | expectedError: nil, 56 | name: "multiplication with several operands", 57 | }, 58 | test{ 59 | originalWeight: 2.0, 60 | expression: "w + 3", 61 | expectedResult: 5.0, 62 | expectedError: nil, 63 | name: "simple addition", 64 | }, 65 | test{ 66 | originalWeight: 2.0, 67 | expression: "w + 3 + 7", 68 | expectedResult: 12.0, 69 | expectedError: nil, 70 | name: "additional with several operands", 71 | }, 72 | test{ 73 | originalWeight: 2.0, 74 | expression: "1+2*3+4", 75 | expectedResult: 11.0, 76 | expectedError: nil, 77 | name: "mixing operators with different precedence", 78 | }, 79 | test{ 80 | originalWeight: 2.0, 81 | expression: "1+2*3-4", 82 | expectedResult: 3.0, 83 | expectedError: nil, 84 | name: "mixing operators with different precedence, including -", 85 | }, 86 | test{ 87 | originalWeight: 2.0, 88 | expression: "1+2/4-4", 89 | expectedResult: -2.5, 90 | expectedError: nil, 91 | name: "mixing operators with different precedence, including /", 92 | }, 93 | test{ 94 | originalWeight: 7.0, 95 | expression: "1+ 2.5/7 * w -4/2", 96 | expectedResult: 1.5, 97 | expectedError: nil, 98 | name: "long expression including all operators", 99 | }, 100 | test{ 101 | originalWeight: 7.0, 102 | expression: "w * w", 103 | expectedResult: 49, 104 | expectedError: nil, 105 | name: "including the weight variable multiple times", 106 | }, 107 | test{ 108 | originalWeight: 7.0, 109 | expression: "2 * (1+3)", 110 | expectedError: fmt.Errorf("using parantheses in the expression is not supported"), 111 | name: "using parantheses", 112 | }, 113 | test{ 114 | originalWeight: 7.0, 115 | expression: "a + b * c", 116 | expectedError: fmt.Errorf("unrecognized variable 'a', use 'w' to represent original 
#!/bin/bash

set -e

# Jump to root directory
cd "$( dirname "${BASH_SOURCE[0]}" )"/..

# set some defaults so we can also run locally; ${VAR:-default} replaces the
# previous repeated `if [ -z ... ]` blocks with the idiomatic form
DOCKER_ORG="${DOCKER_ORG:-semitechnologies}"
DOCKER_REPO="${DOCKER_REPO:-contextionary}"
SOFTWARE_VERSION="${SOFTWARE_VERSION:-local}"
MODEL_VERSION="${MODEL_VERSION:-0.16.0}"
LANGUAGE="${LANGUAGE:-en}"

VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}"

# NOTE(review): there is no separator between ${LANGUAGE} and $VERSION in the
# tag — confirm this matches the tag format produced by build.sh
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION-minimal" c11y-local-journeytest-minimal
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION" c11y-local-journeytest-full

echo "Cleaning up from previous runs"
docker-compose -f ./test/journey/docker-compose.yml down

echo "Starting containers"
docker-compose -f ./test/journey/docker-compose.yml up -d minimal full weaviate

echo "Building tests"
docker-compose -f ./test/journey/docker-compose.yml build test-env

echo "Running tests"
docker-compose -f ./test/journey/docker-compose.yml run test-env go test .
# paths are relative to the docker-compose file, so they point to ./test/journey/ 32 | dockerfile: ./Dockerfile 33 | volumes: 34 | - ./:/testfiles 35 | environment: 36 | DIMENSIONS: "$DIMENSIONS" 37 | 38 | 39 | -------------------------------------------------------------------------------- /test/journey/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/weaviate/contextionary/test/journey 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/stretchr/testify v1.6.1 7 | github.com/weaviate/contextionary v1.1.2-0.20230307155526-f7e24eb73eb0 8 | google.golang.org/grpc v1.24.0 9 | ) 10 | -------------------------------------------------------------------------------- /test/journey/journey_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "strconv" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/assert" 11 | "github.com/stretchr/testify/require" 12 | pb "github.com/weaviate/contextionary/contextionary" 13 | "google.golang.org/grpc" 14 | ) 15 | 16 | var expectedDimensions int 17 | 18 | func init() { 19 | 20 | d, err := strconv.Atoi(os.Getenv("DIMENSIONS")) 21 | if err != nil { 22 | panic(err) 23 | } 24 | 25 | expectedDimensions = d 26 | } 27 | 28 | func Test_Contextionary_Journey(t *testing.T) { 29 | // minimal 30 | connMinimal, err := grpc.Dial("minimal:9999", grpc.WithInsecure()) 31 | if err != nil { 32 | t.Fatalf("couldn't connect to minimal c11y: %s", err) 33 | } 34 | defer connMinimal.Close() 35 | 36 | connFull, err := grpc.Dial("full:9999", grpc.WithInsecure()) 37 | if err != nil { 38 | t.Fatalf("couldn't connect to minimal c11y: %s", err) 39 | } 40 | defer connFull.Close() 41 | 42 | clientMinimal := pb.NewContextionaryClient(connMinimal) 43 | clientFull := pb.NewContextionaryClient(connFull) 44 | 45 | t.Run("the minimal contextionary", func(t *testing.T) { 46 | client := clientMinimal 47 | 48 | 
t.Run("testing words present", func(t *testing.T) { 49 | words := []string{"car", "engine", "automobile", "name"} 50 | 51 | for _, word := range words { 52 | t.Run(word, func(t *testing.T) { 53 | res, err := client.IsWordPresent(context.Background(), &pb.Word{Word: word}) 54 | require.Nil(t, err) 55 | assert.Equal(t, true, res.Present) 56 | }) 57 | } 58 | }) 59 | 60 | t.Run("testing stopwords", func(t *testing.T) { 61 | words := []string{"of", "the"} 62 | 63 | for _, word := range words { 64 | t.Run(word, func(t *testing.T) { 65 | res, err := client.IsWordStopword(context.Background(), &pb.Word{Word: word}) 66 | require.Nil(t, err) 67 | assert.Equal(t, true, res.Stopword) 68 | }) 69 | } 70 | }) 71 | 72 | t.Run("corpi to vector", func(t *testing.T) { 73 | t.Run("only stopwords", func(t *testing.T) { 74 | corpi := []string{"of", "the of"} 75 | _, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi}) 76 | assert.NotNil(t, err) 77 | }) 78 | 79 | t.Run("only stopwords", func(t *testing.T) { 80 | corpi := []string{"car", "car of brand mercedes", "color blue"} 81 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi}) 82 | assert.Nil(t, err) 83 | // TODO: also upgrade minimal one to 600 vectors 84 | assert.Len(t, res.Entries, 300) 85 | }) 86 | 87 | t.Run("two corpi with and without splitting characters should lead to the same vector", func(t *testing.T) { 88 | corpi1 := []string{"car", "car of brand mercedes", "color blue"} 89 | corpi2 := []string{"car,", "car#of,,,,brand maxOcc { 84 | maxOcc = occurrence 85 | } 86 | 87 | occurrences = append(occurrences, occurrence) 88 | presentWords = append(presentWords, word) 89 | } 90 | 91 | } 92 | 93 | // calculate weights by normalizing the occurrences to 0..1 94 | weights := make([]float32, len(occurrences), len(occurrences)) 95 | for i, occ := range occurrences { 96 | // _ = occ 97 | // weights[i] = 1 98 | weight := 1 - float32(occ-minOcc)/float32(maxOcc-minOcc) 99 | weights[i] = 
weight 100 | 101 | // fmt.Printf("%s: %f\n", presentWords[i], weight) 102 | } 103 | 104 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights) 105 | fatal(err) 106 | 107 | // fmt.Printf("%d stop words out of %d removed. %d of the remainder contained\n", stopWords, total, len(vectors)) 108 | 109 | return centroid 110 | 111 | } 112 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/class_vectors/search.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "bytes" 15 | "encoding/json" 16 | "fmt" 17 | "io/ioutil" 18 | "log" 19 | "net/http" 20 | "strings" 21 | 22 | contextionary "github.com/weaviate/contextionary/contextionary/core" 23 | ) 24 | 25 | func searchString(word string, c11y contextionary.Contextionary) { 26 | words := strings.Split(word, " ") 27 | 28 | var usableWords []string 29 | var vectors []contextionary.Vector 30 | var weights []float32 31 | 32 | for _, word := range words { 33 | if isStopWord(word) { 34 | continue 35 | } 36 | 37 | itemIndex := c11y.WordToItemIndex(word) 38 | if ok := itemIndex.IsPresent(); !ok { 39 | log.Fatalf("the word %s is not in the c11y", word) 40 | } 41 | 42 | vector, err := c11y.GetVectorForItemIndex(itemIndex) 43 | if err != nil { 44 | log.Fatalf("could not get vector for word '%s': %v", word, err) 45 | } 46 | 47 | usableWords = append(usableWords, word) 48 | vectors = append(vectors, *vector) 49 | weights = append(weights, 1.0) 50 | } 51 | 52 | stopWordsRatio 
:= float32((len(words) - len(usableWords))) / float32(len(words)) 53 | fmt.Printf("Original Search Term: %s\n", word) 54 | fmt.Printf("After stop word removal: %s (%2.0f%% removed)\n", strings.Join(usableWords, " "), stopWordsRatio*100) 55 | fmt.Printf("\n") 56 | 57 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights) 58 | fatal(err) 59 | 60 | search(centroid.ToArray()) 61 | fmt.Printf("\n\n") 62 | } 63 | 64 | func search(v []float32) { 65 | body := fmt.Sprintf(`{ 66 | "query": { 67 | "function_score": { 68 | "query": { 69 | "bool": { 70 | "filter": { 71 | "match": { 72 | "sampleBoolProp": false 73 | } 74 | } 75 | } 76 | }, 77 | "boost_mode": "replace", 78 | "script_score": { 79 | "script": { 80 | "inline": "binary_vector_score", 81 | "lang": "knn", 82 | "params": { 83 | "cosine": false, 84 | "field": "embedding_vector", 85 | "vector": [ 86 | %s 87 | ] 88 | } 89 | } 90 | } 91 | } 92 | }, 93 | "size": 3 94 | } `, printVector(v)) 95 | 96 | req, _ := http.NewRequest("GET", "http://localhost:9900/documents/_search", bytes.NewReader([]byte(body))) 97 | res, err := (&http.Client{}).Do(req) 98 | if err != nil { 99 | panic(err) 100 | } 101 | 102 | if res.StatusCode != 200 { 103 | bb, _ := ioutil.ReadAll(res.Body) 104 | panic(fmt.Errorf("status is %d: %s", res.StatusCode, bb)) 105 | } 106 | 107 | defer res.Body.Close() 108 | bytes, err := ioutil.ReadAll(res.Body) 109 | if err != nil { 110 | panic(err) 111 | } 112 | 113 | var eres elasticResult 114 | err = json.Unmarshal(bytes, &eres) 115 | if err != nil { 116 | panic(err) 117 | } 118 | 119 | for i, hit := range eres.Hits.Hits { 120 | content := firstChars(hit.Source.Content, 120) 121 | fmt.Printf("\n\tNo: %d\tScore: %2.3f\tName: %s\n\t Content: %s\n", i, hit.Score, hit.Source.Name, content) 122 | } 123 | } 124 | 125 | type elasticResult struct { 126 | Hits elasticHits `json:"hits"` 127 | } 128 | 129 | type elasticHits struct { 130 | Hits []elasticHit `json:"hits"` 131 | } 132 | 133 | type elasticHit 
struct { 134 | Score float32 `json:"_score"` 135 | Source document `json:"_source"` 136 | } 137 | 138 | func firstChars(input string, limit int) string { 139 | if len(input) < limit { 140 | return input 141 | } 142 | return input[:limit] + "..." 143 | } 144 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/comparison/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | root := os.Args[1] 29 | c1Path := root + "/filter-after-glove" 30 | c2Path := root + "/preprocessing" 31 | c3Path := root + "/stopword-removal" 32 | 33 | c1, err := contextionary.LoadVectorFromDisk(c1Path+"/contextionary-en.knn", c1Path+"/contextionary-en.idx") 34 | fatal(err) 35 | 36 | c2, err := contextionary.LoadVectorFromDisk(c2Path+"/contextionary-en.knn", c2Path+"/contextionary-en.idx") 37 | fatal(err) 38 | 39 | c3, err := contextionary.LoadVectorFromDisk(c3Path+"/contextionary-en.knn", c3Path+"/contextionary-en.idx") 40 | fatal(err) 41 | 42 | word := os.Args[2] 43 | c1Dist, c1Words := kNN(word, c1) 44 | c2Dist, c2Words := kNN(word, c2) 45 | c3Dist, c3Words := kNN(word, c3) 46 | 47 | for i := range c1Dist { 48 | fmt.Printf("%f %-15s\t\t\t%f %-15s\t\t\t%f %-15s\n", c1Dist[i], c1Words[i], c2Dist[i], 
c2Words[i], c3Dist[i], c3Words[i]) 49 | } 50 | } 51 | 52 | func kNN(name string, contextionary contextionary.Contextionary) ([]float32, []string) { 53 | itemIndex := contextionary.WordToItemIndex(name) 54 | if ok := itemIndex.IsPresent(); !ok { 55 | fatal(fmt.Errorf("item index for %s is not present", name)) 56 | } 57 | 58 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3) 59 | if err != nil { 60 | fatal(fmt.Errorf("get nns errored: %s", err)) 61 | } 62 | 63 | words := make([]string, len(list), len(list)) 64 | for i := range list { 65 | w, err := contextionary.ItemIndexToWord(list[i]) 66 | if err != nil { 67 | fmt.Printf("error: %s", err) 68 | } 69 | words[i] = w 70 | } 71 | 72 | return distances, words 73 | } 74 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | c13y, err := contextionary.LoadVectorFromDisk("./tools/dev/contextionary-playground/contextionary.knn", "./tools/dev/contextionary-playground/contextionary.idx") 29 | fatal(err) 30 | 31 | fmt.Println("results before building centroid based on keywords: ") 32 | kNN("city", c13y) 33 | 34 | // Combine contextionaries 35 | contextionaries := []contextionary.Contextionary{c13y} 36 | combined, err := contextionary.CombineVectorIndices(contextionaries) 37 | fatal(err) 38 | 39 | fmt.Println("results after building centroid based on keywords: ") 40 | kNN("ocean", combined) 41 | } 42 | 43 | func kNN(name string, contextionary contextionary.Contextionary) { 44 | itemIndex := contextionary.WordToItemIndex(name) 45 | if ok := itemIndex.IsPresent(); !ok { 46 | fatal(fmt.Errorf("item index for %s is not present", name)) 47 | } 48 | 49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 1000000, 3) 50 | if err != nil { 51 | fatal(fmt.Errorf("get nns errored: %s", err)) 52 | } 53 | 54 | for i := range list { 55 | w, err := contextionary.ItemIndexToWord(list[i]) 56 | if err != nil { 57 | fmt.Printf("error: %s", err) 58 | } 59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/schema/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| 
|\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | c11y, err := contextionary.LoadVectorFromDisk("./test/contextionary/example.knn", "./test/contextionary/example.idx") 29 | fatal(err) 30 | 31 | fmt.Println("results before building centroid based on keywords: ") 32 | kNN("city", c11y) 33 | 34 | // Combine contextionaries 35 | contextionaries := []contextionary.Contextionary{c11y} 36 | combined, err := contextionary.CombineVectorIndices(contextionaries) 37 | fatal(err) 38 | 39 | fmt.Println("results after building centroid based on keywords: ") 40 | kNN("ocean", combined) 41 | } 42 | 43 | func kNN(name string, contextionary contextionary.Contextionary) { 44 | itemIndex := contextionary.WordToItemIndex(name) 45 | if ok := itemIndex.IsPresent(); !ok { 46 | fatal(fmt.Errorf("item index for %s is not present", name)) 47 | } 48 | 49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3) 50 | if err != nil { 51 | fatal(fmt.Errorf("get nns errored: %s", err)) 52 | } 53 | 54 | for i := range list { 55 | w, err := contextionary.ItemIndexToWord(list[i]) 56 | if err != nil { 57 | fmt.Printf("error: %s", err) 58 | } 59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /tools/dev/en_test-vectors-small.txt.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weaviate/contextionary/327ffb5f74ff9ede347bd31a8973d79d25fcac9b/tools/dev/en_test-vectors-small.txt.bz2 -------------------------------------------------------------------------------- /tools/dev/gen_simple_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | echo "Unpacking fixture vectors" 5 | rm -f tools/dev/en_test-vectors-small.txt || true 6 | bunzip2 -k tools/dev/en_test-vectors-small.txt.bz2 7 | 8 | # Fake stopword removal by removing the first 10 words. This will become 9 | # obsolete once we have released a new minimal c11y 10 | 11 | # build stopword.json 12 | cat tools/dev/en_test-vectors-small.txt | head | \ 13 | while read -r word _; do echo "$word"; done | jq -nR '[inputs | select(length>0)] | { language: "en", words: . }' > tools/dev/stopwords.json 14 | 15 | # remove stop words 16 | sed -i.bak 1,10d tools/dev/en_test-vectors-small.txt && rm tools/dev/en_test-vectors-small.txt.bak 17 | 18 | if [ -f tools/dev/example.knn ]; then 19 | echo "Fixture contextionary already generated" 20 | else 21 | go run contextionary/core/generator/cmd/generator.go \ 22 | -c tools/dev/en_test-vectors-small.txt \ 23 | -p tools/dev/example 24 | fi 25 | -------------------------------------------------------------------------------- /tools/dev/run.sh: -------------------------------------------------------------------------------- 1 | GO111MODULE=on \ 2 | KNN_FILE="./tools/dev/example.knn" \ 3 | IDX_FILE="./tools/dev/example.idx" \ 4 | STOPWORDS_FILE="./tools/dev/stopwords.json" \ 5 | SCHEMA_PROVIDER_URL="localhost:2379" \ 6 | go run ./server 2>&1 7 | -------------------------------------------------------------------------------- /tools/dev/stopwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "en", 3 | "words": [ 4 | "the", 5 | "of", 6 | "and", 7 | "in", 8 | "to", 9 | "a", 10 | "was", 11 
| "The", 12 | "is", 13 | "for" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /tools/download_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | version="${2}" 7 | 8 | rm -rf ./data && mkdir ./data 9 | 10 | # Download the latest files and remove old ones 11 | for FILE in stopwords.json contextionary.idx contextionary.knn; do 12 | echo "Start Downloading $FILE" && \ 13 | #echo "Downloading url: https://c11y.semi.technology/$version/$language/$FILE" 14 | wget --quiet -O ./data/$FILE "https://c11y.semi.technology/$version/$language/$FILE" && \ 15 | echo "$FILE = done" & 16 | done 17 | 18 | # Wait to finish download 19 | wait 20 | 21 | echo "Done downloading open source contextionary v$VECTORDB_VERSION." 22 | exit 0 23 | -------------------------------------------------------------------------------- /tools/native_build_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #Download contextionary 4 | LANGUAGE=en 5 | MODEL_VERSION=0.16.0 6 | ./tools/download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION" 7 | 8 | #Build the server 9 | VERSION=1.2.0 10 | CGO_ENABLED=1 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -X main.Version=$VERSION" ./server 11 | 12 | #Generate contextionary 13 | tools/dev/gen_simple_contextionary.sh 14 | 15 | #Preprocess splitter dictionary 16 | /bin/bash ./tools/preprocess_splitter_dict_native_build.sh "$LANGUAGE" "./data/contextionary.idx" 17 | 18 | #Copy files to Alpine image 19 | cp ./contextionary-server $PWD 20 | 21 | #Set environment variables 22 | export KNN_FILE=./data/contextionary.knn 23 | export IDX_FILE=./data/contextionary.idx 24 | export STOPWORDS_FILE=./data/stopwords.json 25 | export COMPOUND_SPLITTING_DICTIONARY_FILE=./data/splitter_dict.csv 26 | 27 | #Run the server 28 | ./contextionary-server 
29 | -------------------------------------------------------------------------------- /tools/preprocess_splitter_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | index_file=${2} 7 | 8 | # Get dictionaries 9 | git clone https://github.com/LibreOffice/dictionaries.git 10 | 11 | aff_file="" 12 | dic_file="" 13 | 14 | if [ "$language" == "en" ]; then 15 | aff_file="/app/dictionaries/en/en_US.aff" 16 | dic_file="/app/dictionaries/en/en_US.dic" 17 | fi 18 | if [ "$language" == "de" ]; then 19 | aff_file="/app/dictionaries/de/de_DE_frami.aff" 20 | dic_file="/app/dictionaries/de/de_DE_frami.dic" 21 | fi 22 | if [ "$language" == "nl" ]; then 23 | aff_file="/app/dictionaries/nl_NL/nl_NL.aff" 24 | dic_file="/app/dictionaries/nl_NL/nl_NL.dic" 25 | fi 26 | if [ "$language" == "it" ]; then 27 | aff_file="/app/dictionaries/it_IT/it_IT.aff" 28 | dic_file="/app/dictionaries/it_IT/it_IT.dic" 29 | fi 30 | if [ "$language" == "cs" ]; then 31 | aff_file="/app/dictionaries/cs_CZ/cs_CZ.aff" 32 | dic_file="/app/dictionaries/cs_CZ/cs_CZ.dic" 33 | fi 34 | 35 | if [ "$aff_file" == "" ]; then 36 | echo "Missing dictionary for preprocessor see preprocess_splitter_dict.sh" 37 | exit 3 38 | fi 39 | 40 | echo "Building dict with:" 41 | go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "/app/data/splitter_dict.csv" 42 | 43 | -------------------------------------------------------------------------------- /tools/preprocess_splitter_dict_native_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | index_file=${2} 7 | 8 | # Get dictionaries 9 | git clone https://github.com/LibreOffice/dictionaries.git 10 | 11 | aff_file="" 12 | dic_file="" 13 | 14 | if [ 
"$language" == "de" ]; then 19 | aff_file="./dictionaries/de/de_DE_frami.aff" 20 | dic_file="./dictionaries/de/de_DE_frami.dic" 21 | fi 22 | if [ "$language" == "nl" ]; then 23 | aff_file="./dictionaries/nl_NL/nl_NL.aff" 24 | dic_file="./dictionaries/nl_NL/nl_NL.dic" 25 | fi 26 | if [ "$language" == "it" ]; then 27 | aff_file="./dictionaries/it_IT/it_IT.aff" 28 | dic_file="./dictionaries/it_IT/it_IT.dic" 29 | fi 30 | if [ "$language" == "cs" ]; then 31 | aff_file="./dictionaries/cs_CZ/cs_CZ.aff" 32 | dic_file="./dictionaries/cs_CZ/cs_CZ.dic" 33 | fi 34 | 35 | if [ "$aff_file" == "" ]; then 36 | echo "Missing dictionary for preprocessor see process_splitter_dict.sh" 37 | exit 3 38 | fi 39 | 40 | echo "Building dict with:" 41 | go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "./data/splitter_dict.csv" 42 | 43 | --------------------------------------------------------------------------------