├── .dockerignore ├── .github ├── CODEOWNERS └── workflows │ ├── create-release.yaml │ └── tests.yaml ├── .gitignore ├── Dockerfile.full ├── Dockerfile.local-vectordb ├── Dockerfile.minimal ├── LICENSE ├── README.md ├── adapters └── repos │ └── extensions_weaviate_module.go ├── build.sh ├── client └── client.go ├── compoundsplitting ├── dictionary.go ├── noop_splitter.go ├── splitter.go └── splitter_test.go ├── contextionary ├── contextionary.pb.go ├── contextionary.proto ├── core │ ├── annoyindex │ │ ├── annoy_test.go │ │ ├── annoygomodule.h │ │ ├── annoygomodule_wrap.cxx │ │ ├── annoyindex.go │ │ ├── annoylib.h │ │ └── kissrandom.h │ ├── centroid.go │ ├── centroid_test.go │ ├── certainty.go │ ├── combined.go │ ├── combined_simple_test.go │ ├── component_test.go │ ├── contextionary.go │ ├── generator │ │ ├── cmd │ │ │ └── generator.go │ │ └── generator.go │ ├── indices_test.go │ ├── memory_index.go │ ├── mmapped.go │ ├── similar_words.go │ ├── similar_words_test.go │ ├── stopwords │ │ └── detector.go │ ├── vector.go │ └── wordlist.go └── schema │ ├── contextionary.go │ ├── schema_search.go │ ├── schema_search_params.go │ ├── schema_search_params_test.go │ └── schema_search_test.go ├── errors └── errors.go ├── extensions ├── extension.go ├── looker_upper.go ├── looker_upper_test.go ├── storer.go └── storer_test.go ├── gen_proto_code.sh ├── go.mod ├── go.sum ├── logparser └── parse.go ├── main └── splitter_preprocessor.go ├── prepare_docker_buildx.sh ├── preprocessing ├── dictionary_pre_processing.go ├── dictionary_pre_processing_test.go ├── hunspell.go └── hunspell_test.go ├── server ├── api.go ├── config │ └── config.go ├── contextionary.go ├── corpus_vectorizer.go ├── corpus_vectorizer_test.go ├── grpc_error.go ├── server.go ├── splitter.go ├── splitter_test.go ├── weight_manipulator.go └── weight_manipulator_test.go ├── test ├── compoundsplitting │ ├── contextionary.idx │ ├── nl_NL.aff │ ├── nl_NL.dic │ └── pre_processed_splitter_dict.csv ├── journey.sh └── 
journey │ ├── Dockerfile │ ├── docker-compose.yml │ ├── go.mod │ ├── go.sum │ └── journey_test.go └── tools ├── dev ├── .gitignore ├── contextionary-playground │ ├── .gitignore │ ├── class_vectors │ │ ├── elastic.go │ │ ├── main.go │ │ ├── search.go │ │ ├── stopwords.go │ │ └── texts.go │ ├── comparison │ │ └── main.go │ ├── main.go │ └── schema │ │ └── main.go ├── en_test-vectors-small.txt.bz2 ├── gen_simple_contextionary.sh ├── run.sh └── stopwords.json ├── download_contextionary.sh ├── native_build_contextionary.sh ├── preprocess_splitter_dict.sh └── preprocess_splitter_dict_native_build.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Ci related folders 2 | /.github/ @weaviate/core 3 | build.sh @weaviate/core 4 | prepare_docker_buildx.sh @weaviate/core 5 | -------------------------------------------------------------------------------- /.github/workflows/create-release.yaml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '**' 7 | 8 | jobs: 9 | create-release: 10 | name: Create Release 11 | if: startsWith(github.ref, 'refs/tags') 12 | runs-on: ubuntu-latest-4-cores 13 | strategy: 14 | matrix: 15 | include: 16 | - language: en 17 | model_version: 0.16.0 18 | - language: nl 19 | model_version: 0.16.0 20 | - language: en 21 | model_version: 0.14.0 22 | - language: nl 23 | model_version: 0.14.0 24 | - language: de 25 | model_version: 0.14.0 26 | - language: cs 27 | model_version: 0.14.0 28 | - language: it 29 | model_version: 0.14.0 30 | env: 31 | DOCKER_ORG: semitechnologies 32 | DOCKER_REPO: contextionary 33 | LANGUAGE: ${{matrix.language}} 34 | MODEL_VERSION: ${{matrix.model_version}} 35 | steps: 36 | - uses: 
actions/checkout@v3 37 | - name: Login to Docker Hub 38 | uses: docker/login-action@v2 39 | if: ${{ !github.event.pull_request.head.repo.fork }} 40 | with: 41 | username: ${{secrets.DOCKER_USERNAME}} 42 | password: ${{secrets.DOCKER_PASSWORD}} 43 | - name: Set up Go 44 | uses: actions/setup-go@v3 45 | with: 46 | go-version: 1.19 47 | cache: true 48 | - name: Build and release 49 | run: | 50 | export SOFTWARE_VERSION=${GITHUB_REF##*/} 51 | set -e 52 | ./prepare_docker_buildx.sh 53 | PUSH_MULTIARCH=1 ./build.sh 54 | echo "Success" 55 | gh-release: 56 | name: Create a GitHub Release 57 | if: startsWith(github.ref, 'refs/tags') 58 | runs-on: ubuntu-latest 59 | needs: create-release 60 | steps: 61 | - name: Checkout 62 | uses: actions/checkout@v3 63 | - name: Release 64 | uses: softprops/action-gh-release@v1 65 | with: 66 | generate_release_notes: true 67 | draft: true 68 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '**' 9 | paths-ignore: 10 | - LICENSE 11 | - README.md 12 | pull_request: 13 | 14 | jobs: 15 | tests: 16 | name: Tests 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | include: 21 | - model_version: 0.16.0 22 | dimensions: 300 23 | - model_version: 0.14.0 24 | dimensions: 600 25 | env: 26 | DOCKER_ORG: semitechnologies 27 | DOCKER_REPO: contextionary 28 | LANGUAGE: en 29 | SOFTWARE_VERSION: localtest 30 | MODEL_VERSION: ${{matrix.model_version}} 31 | DIMENSIONS: ${{matrix.dimensions}} 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Login to Docker Hub 35 | uses: docker/login-action@v2 36 | if: ${{ !github.event.pull_request.head.repo.fork }} 37 | with: 38 | username: ${{secrets.DOCKER_USERNAME}} 39 | password: ${{secrets.DOCKER_PASSWORD}} 40 | - name: Set up Go 41 | uses: actions/setup-go@v3 42 | with: 43 | 
go-version: 1.19 44 | cache: true 45 | - name: Build and run journey tests 46 | run: | 47 | set -e 48 | docker buildx version 49 | ./build.sh 50 | ./test/journey.sh 51 | echo "Success" 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | local-vectordb/ 3 | 4 | .idea 5 | 6 | vendor/ -------------------------------------------------------------------------------- /Dockerfile.full: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | 4 | FROM golang:1.13 as builder 5 | WORKDIR /app 6 | 7 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git 8 | 9 | COPY ./tools/download_contextionary.sh ./ 10 | ARG LANGUAGE 11 | ARG MODEL_VERSION 12 | RUN ./download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION" 13 | 14 | COPY go.mod go.sum ./ 15 | RUN go mod download 16 | 17 | COPY . . 
18 | ARG VERSION 19 | ARG TARGETARCH 20 | 21 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=$TARGETARCH go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 22 | 23 | RUN tools/dev/gen_simple_contextionary.sh 24 | RUN mkdir -p ./data 25 | 26 | COPY ./tools/preprocess_splitter_dict.sh ./ 27 | RUN /bin/bash preprocess_splitter_dict.sh "$LANGUAGE" "/app/data/contextionary.idx" 28 | 29 | 30 | FROM alpine 31 | 32 | COPY --from=builder /app/data/contextionary.idx /app/data/contextionary.knn /app/data/stopwords.json /app/data/splitter_dict.csv / 33 | COPY --from=builder /app/contextionary-server / 34 | 35 | ENV KNN_FILE=/contextionary.knn 36 | ENV IDX_FILE=/contextionary.idx 37 | ENV STOPWORDS_FILE=/stopwords.json 38 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv 39 | 40 | ENTRYPOINT [ "/contextionary-server" ] 41 | -------------------------------------------------------------------------------- /Dockerfile.local-vectordb: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | 4 | FROM golang:1.13 as builder 5 | WORKDIR /app 6 | 7 | RUN apt-get update && apt-get install -y bzip2 jq 8 | 9 | COPY go.mod go.sum ./ 10 | RUN go mod download 11 | 12 | COPY . . 
13 | ARG VERSION 14 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 15 | 16 | RUN tools/dev/gen_simple_contextionary.sh 17 | RUN mkdir -p ./data 18 | 19 | 20 | FROM alpine 21 | 22 | COPY local-vectordb/contextionary.idx local-vectordb/contextionary.knn local-vectordb/stopwords.json / 23 | COPY --from=builder /app/contextionary-server / 24 | 25 | ENV KNN_FILE=/contextionary.knn 26 | ENV IDX_FILE=/contextionary.idx 27 | ENV STOPWORDS_FILE=/stopwords.json 28 | 29 | ENTRYPOINT [ "/contextionary-server" ] 30 | -------------------------------------------------------------------------------- /Dockerfile.minimal: -------------------------------------------------------------------------------- 1 | # vi: ft=Dockerfile 2 | 3 | FROM golang:1.13 as builder 4 | WORKDIR /app 5 | 6 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git 7 | 8 | COPY go.mod go.sum ./ 9 | RUN go mod download 10 | 11 | COPY . . 
12 | ARG VERSION 13 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server 14 | 15 | RUN tools/dev/gen_simple_contextionary.sh 16 | RUN mkdir -p ./data 17 | 18 | COPY ./tools/preprocess_splitter_dict.sh ./ 19 | RUN /bin/bash preprocess_splitter_dict.sh "en" "/app/tools/dev/example.idx" 20 | 21 | FROM scratch 22 | 23 | COPY --from=builder /app/tools/dev/example.idx /app/tools/dev/example.knn /app/tools/dev/stopwords.json /app/data/splitter_dict.csv / 24 | COPY --from=builder /app/contextionary-server / 25 | 26 | ENV KNN_FILE=/example.knn 27 | ENV IDX_FILE=/example.idx 28 | ENV STOPWORDS_FILE=/stopwords.json 29 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv 30 | 31 | ENTRYPOINT [ "/contextionary-server" ] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, Weaviate B.V. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weaviate Contextionary Weaviate logo 2 | 3 | > The contextionary powers the semantic, context-based searches in Weaviate. 4 | 5 | Not intended for stand-alone use. Used by [Weaviate - the ML-first vector 6 | search engine](https://github.com/weaviate/weaviate). 7 | 8 | ## Versioning 9 | 10 | The version tag is `-v`. So for 11 | example the app version `0.1.0` deployed with the [contextionary vector db 12 | version](https://c11y.semi.technology/contextionary.json) `0.6.0` of the 13 | English language will have the version `en0.6.0-v0.1.0`. This also 14 | corresponds to the Docker tag. 15 | 16 | ## Languages 17 | 18 | Currently available languages include: 19 | * `en` 20 | * `de` 21 | * `nl` 22 | * `cs` 23 | * `it` 24 | 25 | Other languages coming soon. 26 | 27 | ## Docker Requirements 28 | 29 | The build pipeline makes use of Docker's `buildx` for multi-arch builds. Make 30 | sure you run a Docker version which supports `buildx` and have run `docker 31 | buildx create --use` at least once. 
32 | 33 | ## How to build and test project 34 | 35 | 1. Regenerate schema: 36 | 37 | ```bash 38 | ./gen_proto_code.sh 39 | ``` 40 | 41 | 2. Build image: 42 | 43 | ```bash 44 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh 45 | ``` 46 | 47 | 3. Run journey tests: 48 | 49 | ```bash 50 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh && DIMENSIONS=300 ./test/journey.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /adapters/repos/extensions_weaviate_module.go: -------------------------------------------------------------------------------- 1 | package repos 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "context" 7 | "encoding/json" 8 | "fmt" 9 | "net/http" 10 | "time" 11 | 12 | "github.com/sirupsen/logrus" 13 | "github.com/weaviate/contextionary/extensions" 14 | "github.com/weaviate/contextionary/server/config" 15 | ) 16 | 17 | type ModuleExtensionRepo struct { 18 | client *http.Client 19 | logger logrus.FieldLogger 20 | origin string 21 | watchInterval time.Duration 22 | } 23 | 24 | func NewExtensionsRepo(logger logrus.FieldLogger, 25 | config *config.Config, watchInterval time.Duration) *ModuleExtensionRepo { 26 | client := &http.Client{} 27 | return &ModuleExtensionRepo{ 28 | client: client, 29 | logger: logger, 30 | origin: config.ExtensionsStorageOrigin, 31 | watchInterval: watchInterval, 32 | } 33 | } 34 | 35 | func (r *ModuleExtensionRepo) WatchAll() chan extensions.WatchResponse { 36 | returnCh := make(chan extensions.WatchResponse) 37 | 38 | go func() { 39 | t := time.Tick(r.watchInterval) 40 | for { 41 | r.updateConsumers(returnCh) 42 | <-t 43 | } 44 | }() 45 | 46 | return returnCh 47 | } 48 | 49 | func (f *ModuleExtensionRepo) uri(path string) string { 50 | return fmt.Sprintf("%s%s", f.origin, path) 51 | } 52 | 53 | func (r *ModuleExtensionRepo) updateConsumers(returnCh chan extensions.WatchResponse) { 54 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 55 | defer cancel() 56 | 57 | req, 
err := http.NewRequestWithContext(ctx, "GET", 58 | r.uri("/v1/modules/text2vec-contextionary/extensions-storage/"), nil) 59 | if err != nil { 60 | r.logger.WithField("action", "extensions_retrieve_all"). 61 | WithError(err).Error() 62 | return 63 | } 64 | 65 | res, err := r.client.Do(req) 66 | if err != nil { 67 | r.logger.WithField("action", "extensions_retrieve_all"). 68 | WithError(err).Error() 69 | return 70 | } 71 | 72 | defer res.Body.Close() 73 | if res.StatusCode > 399 { 74 | r.logger.WithField("action", "extensions_retrieve_all"). 75 | WithError(fmt.Errorf("expected status < 399, got %d", res.StatusCode)). 76 | Error() 77 | return 78 | } 79 | 80 | var exts []extensions.Extension 81 | scanner := bufio.NewScanner(res.Body) 82 | for scanner.Scan() { 83 | if err := scanner.Err(); err != nil { 84 | r.logger.WithField("action", "extensions_retrieve_all"). 85 | WithError(err).Error() 86 | return 87 | } 88 | 89 | rawExt := scanner.Bytes() 90 | var ext extensions.Extension 91 | err := json.Unmarshal(rawExt, &ext) 92 | if err != nil { 93 | r.logger.WithField("action", "extensions_retrieve_all"). 
94 | WithError(err).Error() 95 | return 96 | } 97 | 98 | exts = append(exts, ext) 99 | } 100 | 101 | returnCh <- exts 102 | } 103 | 104 | func (r *ModuleExtensionRepo) Put(ctx context.Context, ext extensions.Extension) error { 105 | extBytes, err := json.Marshal(ext) 106 | if err != nil { 107 | return fmt.Errorf("marshal extension to json: %v", err) 108 | } 109 | 110 | req, err := http.NewRequestWithContext(ctx, "PUT", r.uri(fmt.Sprintf( 111 | "/v1/modules/text2vec-contextionary/extensions-storage/%s", ext.Concept)), bytes.NewReader(extBytes)) 112 | 113 | res, err := r.client.Do(req) 114 | if err != nil { 115 | return fmt.Errorf("put: %v", err) 116 | } 117 | 118 | defer res.Body.Close() 119 | if res.StatusCode > 399 { 120 | return fmt.Errorf("expected status < 399, got %d", res.StatusCode) 121 | } 122 | 123 | return nil 124 | } 125 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # set some defaults so we can also run locally 6 | if [ -z "$DOCKER_ORG" ] 7 | then 8 | DOCKER_ORG=semitechnologies 9 | fi 10 | 11 | if [ -z "$DOCKER_REPO" ] 12 | then 13 | DOCKER_REPO=contextionary 14 | fi 15 | 16 | if [ -z "$SOFTWARE_VERSION" ] 17 | then 18 | SOFTWARE_VERSION=local 19 | fi 20 | 21 | if [ -z "$MODEL_VERSION" ] 22 | then 23 | MODEL_VERSION=0.16.0 24 | fi 25 | 26 | if [ -z "$LANGUAGE" ] 27 | then 28 | LANGUAGE=en 29 | fi 30 | 31 | VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}" 32 | 33 | if [ -z "$FULL_VERSION_DOCKERFILE" ] 34 | then 35 | FULL_VERSION_DOCKERFILE=Dockerfile.full 36 | fi 37 | 38 | if [ "$PUSH_MULTIARCH" = "1" ]; then 39 | echo "Build and push multi-arch full version" 40 | echo "Build $LANGUAGE:" 41 | full_version="${LANGUAGE}${VERSION}" 42 | docker buildx build --platform=linux/amd64,linux/arm64 \ 43 | --push \ 44 | -f "$FULL_VERSION_DOCKERFILE" \ 45 | --build-arg VERSION="$full_version" \ 46 | 
--build-arg MODEL_VERSION="$MODEL_VERSION" \ 47 | --build-arg LANGUAGE="$LANGUAGE" \ 48 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" . 49 | else 50 | echo "Build minimal version (english only)" 51 | docker build -f Dockerfile.minimal --build-arg VERSION="$VERSION-minimal" -t "$DOCKER_ORG/$DOCKER_REPO:en$VERSION-minimal" . 52 | 53 | echo "Build single-arch full version" 54 | echo "Build $LANGUAGE:" 55 | full_version="${LANGUAGE}${VERSION}" 56 | docker build \ 57 | -f "$FULL_VERSION_DOCKERFILE" \ 58 | --build-arg VERSION="$full_version" \ 59 | --build-arg MODEL_VERSION="$MODEL_VERSION" \ 60 | --build-arg LANGUAGE="$LANGUAGE" \ 61 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" . 62 | fi 63 | 64 | 65 | -------------------------------------------------------------------------------- /client/client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "strconv" 8 | "strings" 9 | 10 | pb "github.com/weaviate/contextionary/contextionary" 11 | grpc "google.golang.org/grpc" 12 | ) 13 | 14 | func help() { 15 | fmt.Println("the following commands are supported:") 16 | fmt.Printf("\n") 17 | fmt.Printf("\t%-15s%s\n", "meta", "Display meta info, such as versions") 18 | fmt.Printf("\t %s\n", "Usage: client meta") 19 | fmt.Printf("\n") 20 | fmt.Printf("\t%-15s%s\n", "word-present", "Check if the word is present in the db or as an extension") 21 | fmt.Printf("\t %s\n", "Usage: client word-present word") 22 | fmt.Printf("\n") 23 | fmt.Printf("\t%-15s%s\n", "word-stopword", "Check if the word is considered a stopword") 24 | fmt.Printf("\t %s\n", "Usage: client word-stopword word") 25 | fmt.Printf("\n") 26 | fmt.Printf("\t%-15s%s\n", "search", "Search for word or property") 27 | fmt.Printf("\t %s\n", "For usage run client search and see instructions from there") 28 | fmt.Printf("\n") 29 | fmt.Printf("\t%-15s%s\n", "similar-words", "Search for similar words within the specified 
certainty") 30 | fmt.Printf("\t %s\n", "Usage: client similar-words word certainty") 31 | fmt.Printf("\n") 32 | fmt.Printf("\t%-15s%s\n", "extend", "Extend the contextionary with custom concepts") 33 | fmt.Printf("\t %s\n", "Usage: client extend newconcept \"definition of the new concept\"") 34 | fmt.Printf("\n") 35 | fmt.Printf("\t%-15s%s\n", "vectorize", "Vectorize any string") 36 | fmt.Printf("\t %s\n", "Usage: client vectorize \"input string to vectorize\"") 37 | fmt.Printf("\t%-15s%s\n", "multi-vector-for-word", "Vectorize multiple strings") 38 | fmt.Printf("\t %s\n", "Usage: client multi-vector-for-word \"word1 word2 word3 ... wordN\"") 39 | } 40 | 41 | func main() { 42 | conn, err := grpc.Dial("localhost:9999", grpc.WithInsecure()) 43 | if err != nil { 44 | fmt.Fprintf(os.Stderr, "couldn't connect: %s", err) 45 | os.Exit(1) 46 | } 47 | defer conn.Close() 48 | 49 | client := pb.NewContextionaryClient(conn) 50 | 51 | args := os.Args[1:] 52 | if len(args) == 0 { 53 | fmt.Fprintf(os.Stderr, "no command provided, try 'word-present'\n") 54 | os.Exit(1) 55 | } 56 | 57 | cmd := args[0] 58 | switch cmd { 59 | case "help": 60 | help() 61 | case "meta", "version": 62 | meta(client, args[1:]) 63 | case "word-present": 64 | wordPresent(client, args[1:]) 65 | case "word-stopword": 66 | wordStopword(client, args[1:]) 67 | case "search": 68 | search(client, args[1:]) 69 | case "similar-words": 70 | similarWords(client, args[1:]) 71 | case "extend": 72 | extend(client, args[1:]) 73 | case "vectorize": 74 | vectorize(client, args[1:]) 75 | case "multi-vector-for-word": 76 | multiVecForWord(client, args[1:]) 77 | 78 | default: 79 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd) 80 | os.Exit(1) 81 | } 82 | } 83 | func meta(client pb.ContextionaryClient, args []string) { 84 | ctx := context.Background() 85 | 86 | res, err := client.Meta(ctx, &pb.MetaParams{}) 87 | if err != nil { 88 | fmt.Fprintf(os.Stderr, "ERROR: couldn't display meta: %s", err) 89 | os.Exit(1) 90 | } 
91 | 92 | fmt.Printf("%#v\n", res) 93 | } 94 | 95 | func wordPresent(client pb.ContextionaryClient, args []string) { 96 | if len(args) == 0 { 97 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n") 98 | os.Exit(1) 99 | } 100 | 101 | ctx := context.Background() 102 | 103 | for _, word := range args { 104 | res, err := client.IsWordPresent(ctx, &pb.Word{Word: word}) 105 | if err != nil { 106 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err) 107 | os.Exit(1) 108 | } 109 | if res.Present { 110 | fmt.Printf("word '%s' is present in the contextionary\n", word) 111 | } else { 112 | fmt.Printf("word '%s' is NOT present in the contextionary\n", word) 113 | } 114 | } 115 | } 116 | 117 | func similarWords(client pb.ContextionaryClient, args []string) { 118 | var word string 119 | var certainty float32 120 | 121 | if len(args) == 0 { 122 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to find similarities to\n") 123 | os.Exit(1) 124 | } 125 | word = args[0] 126 | 127 | if len(args) == 1 { 128 | fmt.Fprintf(os.Stderr, "need at least one other argument: the minimum required certainty\n") 129 | os.Exit(1) 130 | } 131 | 132 | c, err := strconv.ParseFloat(args[1], 32) 133 | if err != nil { 134 | fmt.Fprintf(os.Stderr, "couldnt parse certainty: %v\n", err) 135 | os.Exit(1) 136 | } 137 | certainty = float32(c) 138 | 139 | res, err := client.SafeGetSimilarWordsWithCertainty(context.Background(), &pb.SimilarWordsParams{ 140 | Certainty: certainty, 141 | Word: word, 142 | }) 143 | if err != nil { 144 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get similar words: %s", err) 145 | os.Exit(1) 146 | } 147 | 148 | for _, word := range res.Words { 149 | fmt.Printf("🥳 %s\n", word.Word) 150 | } 151 | } 152 | func extend(client pb.ContextionaryClient, args []string) { 153 | if len(args) != 2 { 154 | fmt.Fprintf(os.Stderr, "need two arguments, the concept to add/extend and its definition\n") 155 | os.Exit(1) 156 | } 157 
| concept := args[0] 158 | definition := strings.ToLower(args[1]) 159 | 160 | _, err := client.AddExtension(context.Background(), &pb.ExtensionInput{ 161 | Concept: concept, 162 | Definition: definition, 163 | Weight: 1, 164 | }) 165 | if err != nil { 166 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 167 | os.Exit(1) 168 | } else { 169 | fmt.Fprintf(os.Stdout, "Success!") 170 | os.Exit(0) 171 | } 172 | } 173 | 174 | func vectorize(client pb.ContextionaryClient, args []string) { 175 | if len(args) != 1 { 176 | fmt.Fprintf(os.Stderr, "need one argument: the input string to vectorize") 177 | os.Exit(1) 178 | } 179 | input := args[0] 180 | 181 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{ 182 | Corpi: []string{input}, 183 | }) 184 | if err != nil { 185 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 186 | os.Exit(1) 187 | } else { 188 | fmt.Fprintf(os.Stdout, "Success: %v\n", res.Entries) 189 | fmt.Fprintf(os.Stdout, "Source: %v\n", res.Source) 190 | os.Exit(0) 191 | } 192 | } 193 | 194 | func multiVecForWord(client pb.ContextionaryClient, args []string) { 195 | if len(args) < 1 { 196 | fmt.Fprintf(os.Stderr, "need at least one argument: the input word to vectorize") 197 | os.Exit(1) 198 | } 199 | 200 | words := make([]*pb.Word, len(args)) 201 | for i, word := range args { 202 | words[i] = &pb.Word{Word: word} 203 | } 204 | 205 | res, err := client.MultiVectorForWord(context.Background(), &pb.WordList{ 206 | Words: words, 207 | }) 208 | if err != nil { 209 | fmt.Fprintf(os.Stderr, "ERROR: %s", err) 210 | os.Exit(1) 211 | } else { 212 | fmt.Fprintf(os.Stdout, "Success: %v", res.Vectors) 213 | os.Exit(0) 214 | } 215 | } 216 | 217 | func wordStopword(client pb.ContextionaryClient, args []string) { 218 | if len(args) == 0 { 219 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n") 220 | os.Exit(1) 221 | } 222 | 223 | ctx := context.Background() 224 | 225 | for _, word := range args { 226 | res, err := 
client.IsWordStopword(ctx, &pb.Word{Word: word}) 227 | if err != nil { 228 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err) 229 | os.Exit(1) 230 | } 231 | if res.Stopword { 232 | fmt.Printf("word '%s' is a stopword\n", word) 233 | } else { 234 | fmt.Printf("word '%s' is not a stopword\n", word) 235 | } 236 | } 237 | } 238 | 239 | func search(client pb.ContextionaryClient, args []string) { 240 | if len(args) == 0 { 241 | fmt.Fprintf(os.Stderr, "need at least one other argument: either 'class' or 'property' \n") 242 | os.Exit(1) 243 | } 244 | 245 | cmd := args[0] 246 | switch cmd { 247 | case "class": 248 | searchClass(client, args[1:]) 249 | default: 250 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd) 251 | os.Exit(1) 252 | } 253 | } 254 | 255 | func searchClass(client pb.ContextionaryClient, args []string) { 256 | if len(args) == 0 { 257 | fmt.Fprintf(os.Stderr, "need at least one other argument the search term\n") 258 | os.Exit(1) 259 | } 260 | 261 | if len(args) == 1 { 262 | fmt.Fprintf(os.Stderr, "need at least one other argument the desired certainty\n") 263 | os.Exit(1) 264 | } 265 | 266 | searchTerm := args[0] 267 | certainty, err := strconv.ParseFloat(args[1], 32) 268 | if err != nil { 269 | fmt.Fprintf(os.Stderr, "cannot parse certainty '%s'\n", args[1]) 270 | os.Exit(1) 271 | } 272 | 273 | params := &pb.SchemaSearchParams{ 274 | Certainty: float32(certainty), 275 | Name: searchTerm, 276 | } 277 | 278 | ctx := context.Background() 279 | res, err := client.SchemaSearch(ctx, params) 280 | if err != nil { 281 | fmt.Fprintf(os.Stderr, "schema search failed: %s", err) 282 | os.Exit(1) 283 | } 284 | 285 | if len(res.Results) == 0 { 286 | fmt.Println("😵 nothing found") 287 | } 288 | 289 | for _, class := range res.Results { 290 | fmt.Printf("🥳 %s (Certainty: %f)\n", class.Name, class.Certainty) 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /compoundsplitting/dictionary.go: 
-------------------------------------------------------------------------------- 1 | package compoundsplitting 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | 11 | // Dictionary filter for the splitting algorithm 12 | // based on the words in the contextionary 13 | type ContextionaryDict struct { 14 | dict map[string]int // storing the word and its occurrence 15 | } 16 | 17 | // NewContextionaryDict 18 | // uses a dictionary file that was created using the preprocessing procedures 19 | func NewContextionaryDict(contextionaryDictFile string) (*ContextionaryDict, error) { 20 | file, err := os.Open(contextionaryDictFile) 21 | if err != nil { 22 | return nil, err 23 | } 24 | defer file.Close() 25 | 26 | dict := &ContextionaryDict{ 27 | dict: make(map[string]int, 400000), 28 | } 29 | 30 | scanner := bufio.NewScanner(file) 31 | for scanner.Scan() { 32 | line := scanner.Text() 33 | split := strings.Split(line, ",") 34 | occurrence, err := strconv.Atoi(split[1]) 35 | if err != nil { 36 | return nil, err 37 | } 38 | dict.dict[split[0]] = occurrence 39 | } 40 | 41 | return dict, nil 42 | } 43 | 44 | // Contains true if word is in contextionary 45 | func (cd *ContextionaryDict) Contains(word string) bool { 46 | _, exists := cd.dict[word] 47 | return exists 48 | } 49 | 50 | //Score prefers long and few words 51 | func (cd *ContextionaryDict) Score(phrase []string) float64 { 52 | // Prefer longer words as scoring 53 | // Assumption is that the compound words are on average more similar to splittings that 54 | // share most of the characters with the compound. 
// NoopSplitter is a compound splitter that performs no splitting at all.
// It satisfies the splitter contract while always reporting that a word
// cannot be decomposed into compounds.
type NoopSplitter struct{}

// NewNoopSplitter constructs a no-op splitter.
func NewNoopSplitter() NoopSplitter {
	var s NoopSplitter
	return s
}

// Split always yields an empty (non-nil) result and never fails.
func (n NoopSplitter) Split(words string) ([]string, error) {
	empty := []string{}
	return empty, nil
}
phrase is. 23 | // If a compound word can be splitted into multiple phrases it will choose the one with the highest score. 24 | Score(phrase []string) float64 25 | // Contains is true if the word is in the dictionary 26 | Contains(word string) bool 27 | } 28 | 29 | // Splitter builds a tree of compound splits and selects 30 | // the best option based on a scoring mechanism 31 | type Splitter struct { 32 | dict Dictionary 33 | cancelAfter time.Duration 34 | } 35 | 36 | // New Splitter recognizing words given by dict and 37 | // selecting split phrases based on scoring 38 | func NewSplitter(dict Dictionary) *Splitter { 39 | return &Splitter{ 40 | dict: dict, 41 | cancelAfter: cancelSplittingAfter, 42 | } 43 | } 44 | 45 | type CompoundSplit struct { 46 | // Combinations of compound combinations in a phrase 47 | combinations []*Node 48 | } 49 | 50 | // Split a compound word into its compounds 51 | func (sp *Splitter) Split(word string) ([]string, error) { 52 | 53 | if len(word) > maxWordLength { 54 | return []string{}, nil 55 | } 56 | 57 | compoundSplit := CompoundSplit{} 58 | 59 | // spawn a new context that cancels the recursion if we are spending too much 60 | // time on it 61 | ctx, cancel := context.WithTimeout(context.Background(), sp.cancelAfter) 62 | defer cancel() 63 | 64 | err := sp.findAllWordCombinations(ctx, word, &compoundSplit) 65 | if err != nil { 66 | return nil, err 67 | } 68 | combinations := compoundSplit.getAllWordCombinations(ctx) 69 | maxScore := 0.0 70 | maxPhrase := []string{} 71 | for _, combination := range combinations { 72 | currentScore := sp.dict.Score(combination) 73 | if len(maxPhrase) == 0 { 74 | // Initialize if score is negative 75 | maxScore = currentScore 76 | maxPhrase = combination 77 | } 78 | if currentScore > maxScore { 79 | maxScore = currentScore 80 | maxPhrase = combination 81 | } 82 | } 83 | return maxPhrase, nil 84 | } 85 | 86 | func (cs *CompoundSplit) insertCompound(ctx context.Context, word string, 87 | startIndex int) 
error { 88 | compound := NewNode(word, startIndex) 89 | appended := false 90 | for _, combination := range cs.combinations { 91 | // For all possible combinations 92 | 93 | leaves := combination.RecursivelyFindLeavesBeforeIndex(ctx, startIndex) 94 | for _, leave := range leaves { 95 | // Append the new compound to the leaves 96 | 97 | appended = true 98 | err := leave.AddChild(compound) 99 | if err != nil { 100 | return err 101 | } 102 | } 103 | } 104 | if !appended { 105 | // if compound was not added to any leave add it to combinations 106 | cs.combinations = append(cs.combinations, compound) 107 | } 108 | return nil 109 | } 110 | 111 | func (sp *Splitter) findAllWordCombinations(ctx context.Context, str string, compoundSplit *CompoundSplit) error { 112 | compoundsUsed := 0 113 | for offset, _ := range str { 114 | // go from left to right and choose offsetted substring 115 | offsetted := str[offset:] 116 | 117 | for i := 1; i <= len(offsetted); i++ { 118 | // go from left to right to find a word 119 | word := offsetted[:i] 120 | if len(word) < minCompoundWordLength { 121 | continue 122 | } 123 | 124 | if sp.dict.Contains(word) { 125 | compoundsUsed += 1 126 | if compoundsUsed == maxNumberTreeNodes { 127 | // Tree is getting out of bounds stopping for performance 128 | return nil 129 | } 130 | err := compoundSplit.insertCompound(ctx, word, offset) 131 | if err != nil { 132 | return err 133 | } 134 | } 135 | } 136 | } 137 | return nil 138 | } 139 | 140 | func (cs *CompoundSplit) getAllWordCombinations(ctx context.Context) [][]string { 141 | wordCombinations := [][]string{} 142 | 143 | for _, combination := range cs.combinations { 144 | wordCombinations = append(wordCombinations, 145 | combination.RecursivelyBuildNames(ctx)...) 
146 | } 147 | 148 | return wordCombinations 149 | } 150 | 151 | // Node for of the word tree 152 | type Node struct { 153 | name string 154 | children []*Node 155 | startIndex int // inclusiv 156 | endIndex int // exclusive 157 | } 158 | 159 | // NewNode from node name and in compoundword index 160 | func NewNode(name string, startIndex int) *Node { 161 | return &Node{ 162 | name: name, 163 | children: []*Node{}, 164 | startIndex: startIndex, 165 | endIndex: startIndex + len(name), 166 | } 167 | } 168 | 169 | // AddChild node to node 170 | func (node *Node) AddChild(newChildNode *Node) error { 171 | if newChildNode.startIndex < node.endIndex { 172 | return fmt.Errorf("Child starts at %v but this node ends at %v can't add as child", newChildNode.startIndex, node.endIndex) 173 | } 174 | node.children = append(node.children, newChildNode) 175 | return nil 176 | } 177 | 178 | func (node *Node) findChildNodesBeforeIndex(index int) []*Node { 179 | childrensThatEndBeforeIndex := []*Node{} 180 | 181 | for _, child := range node.children { 182 | if child.endIndex <= index { 183 | childrensThatEndBeforeIndex = append(childrensThatEndBeforeIndex, child) 184 | } 185 | } 186 | 187 | return childrensThatEndBeforeIndex 188 | } 189 | 190 | // RecursivelyBuildNames of compounds 191 | func (node *Node) RecursivelyBuildNames(ctx context.Context) [][]string { 192 | compoundName := [][]string{} 193 | if ctx.Err() != nil { 194 | // we've been going recursively too long, abort! 195 | compoundName = append(compoundName, []string{node.name}) 196 | return compoundName 197 | } 198 | 199 | for _, child := range node.children { 200 | childNames := child.RecursivelyBuildNames(ctx) 201 | 202 | for _, childName := range childNames { 203 | // Add the name of this node first 204 | fullName := []string{node.name} 205 | fullName = append(fullName, childName...) 
206 | compoundName = append(compoundName, fullName) 207 | } 208 | } 209 | if len(compoundName) == 0 { 210 | // This is a leave node 211 | compoundName = append(compoundName, []string{node.name}) 212 | } 213 | 214 | return compoundName 215 | } 216 | 217 | // RecursivelyFindLeavesBeforeIndex where to add a new node 218 | func (node *Node) RecursivelyFindLeavesBeforeIndex(ctx context.Context, index int) []*Node { 219 | foundLeaves := []*Node{} 220 | if ctx.Err() != nil { 221 | // we've been going recursively too long, abort! 222 | return foundLeaves 223 | } 224 | 225 | children := node.findChildNodesBeforeIndex(index) 226 | for _, child := range children { 227 | leaves := child.RecursivelyFindLeavesBeforeIndex(ctx, index) 228 | if len(leaves) == 0 { 229 | // There are no leaves this means the child node is already a leave 230 | foundLeaves = append(foundLeaves, child) 231 | } else { 232 | // Found leaves use them instead of direct child 233 | foundLeaves = append(foundLeaves, leaves...) 234 | } 235 | } 236 | 237 | if len(foundLeaves) == 0 && node.endIndex <= index { 238 | // This node is the leave 239 | foundLeaves = append(foundLeaves, node) 240 | } 241 | 242 | return foundLeaves 243 | } 244 | 245 | // NewEmptyTestSplitter creates a splitter, 246 | // that does not know any words and 247 | // thus is not able to split any words 248 | func NewEmptyTestSplitter() *Splitter { 249 | dictMock := &DictMock{ 250 | scores: map[string]float64{}, 251 | } 252 | return &Splitter{ 253 | dict: dictMock, 254 | } 255 | } 256 | 257 | func NewTestSplitter(wordScoreMapping map[string]float64) *Splitter { 258 | dict := &DictMock{ 259 | scores: wordScoreMapping, 260 | } 261 | return &Splitter{ 262 | dict: dict, 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /compoundsplitting/splitter_test.go: -------------------------------------------------------------------------------- 1 | package compoundsplitting 2 | 3 | import ( 4 | "context" 
5 | "fmt" 6 | "testing" 7 | "time" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestSplitTreeSplitter(t *testing.T) { 14 | dictMock := &DictMock{ 15 | scores: map[string]float64{ 16 | "drie": 2.0, 17 | "hoek": 2.0, 18 | "brood": 4.0, 19 | "driehoek": 5.0, 20 | "broodje": 5.0, 21 | }, 22 | } 23 | 24 | ts := Splitter{ 25 | dict: dictMock, 26 | cancelAfter: 500 * time.Millisecond, 27 | } 28 | 29 | // drie hoek brood 30 | // broodje 31 | // driehoek brood 32 | // broodje 33 | 34 | 35 | cs := CompoundSplit{} 36 | 37 | ts.findAllWordCombinations(context.Background(), "driehoeksbroodje", &cs) 38 | 39 | combinations := cs.getAllWordCombinations(context.Background()) 40 | assert.Equal(t, 4, len(combinations)) 41 | for _, combination := range combinations { 42 | fmt.Printf("%v\n", combination) 43 | } 44 | 45 | splited, err := ts.Split("driehoeksbroodje") 46 | assert.Nil(t, err) 47 | require.Equal(t, 2, len(splited)) 48 | assert.Equal(t, "driehoek", splited[0]) 49 | assert.Equal(t, "broodje", splited[1]) 50 | 51 | // Test no result 52 | splited, err = ts.Split("raupenprozessionsspinner") 53 | assert.Nil(t, err) 54 | assert.Equal(t, 0, len(splited), "Expected no result since no substring is in the dict") 55 | } 56 | 57 | func TestNegativeScore(t *testing.T) { 58 | dictMock := &DictMock{ 59 | scores: map[string]float64{ 60 | "drie": -10.0, 61 | "hoek": -10.0, 62 | "brood": -8.0, 63 | "driehoek": -2.0, 64 | "broodje": -2.0, 65 | }, 66 | } 67 | 68 | ts := NewSplitter(dictMock) 69 | 70 | splited, err := ts.Split("driehoeksbroodje") 71 | assert.Nil(t, err) 72 | assert.Equal(t, 2, len(splited)) 73 | assert.Equal(t, "driehoek", splited[0]) 74 | assert.Equal(t, "broodje", splited[1]) 75 | } 76 | 77 | func TestInsertCompound(t *testing.T) { 78 | 79 | t.Run("Add a new word", func(t *testing.T) { 80 | ts := CompoundSplit{} 81 | ts.insertCompound(context.Background(), "test", 0) 82 | 83 | assert.Equal(t, 1, 
len(ts.combinations)) 84 | assert.Equal(t, "test", ts.combinations[0].name) 85 | }) 86 | 87 | t.Run("Add a two words", func(t *testing.T) { 88 | ts := CompoundSplit{} 89 | ts.insertCompound(context.Background(), "test", 0) 90 | ts.insertCompound(context.Background(), "testje", 0) 91 | 92 | assert.Equal(t, 2, len(ts.combinations)) 93 | assert.Equal(t, "test", ts.combinations[0].name) 94 | assert.Equal(t, "testje", ts.combinations[1].name) 95 | }) 96 | 97 | t.Run("Add a two words different index", func(t *testing.T) { 98 | ts := CompoundSplit{} 99 | 100 | // phrase: testje 101 | ts.insertCompound(context.Background(), "test", 0) 102 | ts.insertCompound(context.Background(), "stje", 2) 103 | 104 | assert.Equal(t, 2, len(ts.combinations)) 105 | assert.Equal(t, "test", ts.combinations[0].name) 106 | assert.Equal(t, "stje", ts.combinations[1].name) 107 | }) 108 | 109 | t.Run("Add a two words different index", func(t *testing.T) { 110 | ts := CompoundSplit{} 111 | 112 | // phrase: testjenuttig 113 | // 123456789111 114 | // 012 115 | ts.insertCompound(context.Background(), "test", 0) 116 | ts.insertCompound(context.Background(), "nuttig", 8) 117 | 118 | assert.Equal(t, 1, len(ts.combinations)) 119 | phrase := ts.combinations[0] 120 | assert.Equal(t, "test", phrase.name) 121 | assert.Equal(t, "nuttig", phrase.children[0].name) 122 | 123 | }) 124 | 125 | t.Run("Add a two combinations", func(t *testing.T) { 126 | ts := CompoundSplit{} 127 | 128 | // phrase: testjenuttig 129 | // 123456789111 130 | // 012 131 | ts.insertCompound(context.Background(), "test", 0) 132 | ts.insertCompound(context.Background(), "est", 1) 133 | ts.insertCompound(context.Background(), "nuttig", 8) 134 | 135 | assert.Equal(t, 2, len(ts.combinations)) 136 | phrase := ts.combinations[0] 137 | assert.Equal(t, "test", phrase.name) 138 | assert.Equal(t, "nuttig", phrase.children[0].name) 139 | 140 | phrase = ts.combinations[1] 141 | assert.Equal(t, "est", phrase.name) 142 | assert.Equal(t, "nuttig", 
phrase.children[0].name) 143 | }) 144 | 145 | t.Run("Add driehoeksbroodje", func(t *testing.T) { 146 | ts := CompoundSplit{} 147 | 148 | // phrase: driehoeksbroodje 149 | // 1234567891111111 150 | // 0123456 151 | ts.insertCompound(context.Background(), "drie", 0) 152 | ts.insertCompound(context.Background(), "driehoek", 0) 153 | ts.insertCompound(context.Background(), "hoek", 5) 154 | ts.insertCompound(context.Background(), "brood", 10) 155 | ts.insertCompound(context.Background(), "broodje", 10) 156 | 157 | // drie hoek brood 158 | // broodje 159 | 160 | // driehoek brood 161 | // broodje 162 | 163 | assert.Equal(t, 2, len(ts.combinations)) 164 | }) 165 | 166 | } 167 | 168 | func TestNode(t *testing.T) { 169 | 170 | t.Run("New Node", func(t *testing.T) { 171 | node := NewNode("test", 2) 172 | assert.Equal(t, 6, node.endIndex) 173 | }) 174 | 175 | t.Run("Add child", func(t *testing.T) { 176 | node1 := NewNode("test", 2) 177 | node2 := NewNode("case", 6) 178 | node3 := NewNode("ase", 7) 179 | err := node1.AddChild(node2) 180 | assert.Nil(t, err) 181 | err = node1.AddChild(node3) 182 | assert.Nil(t, err) 183 | 184 | assert.Equal(t, 2, len(node1.children)) 185 | }) 186 | 187 | t.Run("Add wrong index", func(t *testing.T) { 188 | node1 := NewNode("test", 2) 189 | node2 := NewNode("esting", 3) 190 | err := node1.AddChild(node2) 191 | assert.NotNil(t, err) 192 | }) 193 | 194 | t.Run("find children before index", func(t *testing.T) { 195 | // testcasees 196 | // 0123456789 197 | test := NewNode("test", 0) 198 | caseN := NewNode("case", 4) 199 | as := NewNode("as", 5) 200 | see := NewNode("see", 6) 201 | es := NewNode("es", 8) 202 | 203 | // test case es 204 | // test as es 205 | // test see 206 | 207 | test.AddChild(caseN) 208 | test.AddChild(as) 209 | test.AddChild(see) 210 | caseN.AddChild(es) 211 | as.AddChild(es) 212 | 213 | // no child nodes that end before index 6 214 | assert.Equal(t, 0, len(test.findChildNodesBeforeIndex(6))) 215 | // as ends at 7 216 | 
assert.Equal(t, 1, len(test.findChildNodesBeforeIndex(7))) 217 | // case ends at 8 218 | assert.Equal(t, 2, len(test.findChildNodesBeforeIndex(8))) 219 | // see ends at 9 220 | assert.Equal(t, 3, len(test.findChildNodesBeforeIndex(9))) 221 | }) 222 | 223 | t.Run("find leaves before index", func(t *testing.T) { 224 | // testcasees 225 | // 0123456789 226 | test := NewNode("test", 0) 227 | caseN := NewNode("case", 4) 228 | as := NewNode("as", 5) 229 | see := NewNode("see", 6) 230 | es := NewNode("es", 8) 231 | 232 | // test case es 233 | // test as es 234 | // test see 235 | 236 | test.AddChild(caseN) 237 | test.AddChild(as) 238 | test.AddChild(see) 239 | caseN.AddChild(es) 240 | as.AddChild(es) 241 | 242 | assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 0))) 243 | assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 3))) 244 | assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4))) 245 | node := test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4)[0] 246 | assert.Equal(t, "test", node.name) 247 | 248 | assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7))) 249 | node = test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7)[0] 250 | assert.Equal(t, "as", node.name) 251 | 252 | assert.Equal(t, 2, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 8))) 253 | }) 254 | 255 | } 256 | 257 | func TestSplitVeryLongWords(t *testing.T) { 258 | dictMock := &DictMock{ 259 | scores: map[string]float64{ 260 | "aaaa": 1.0, 261 | "bbbb": 1.0, 262 | }, 263 | } 264 | 265 | ts := Splitter{ 266 | dict: dictMock, 267 | } 268 | 269 | t1 := time.Now() 270 | 271 | split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaa") 272 | 273 | t2 := time.Now() 274 | diff := t2.Sub(t1) 275 | 276 | assert.Nil(t, err) 277 | assert.Less(t, 0, len(split)) 278 | 279 | if diff > 
time.Millisecond*200 { 280 | fmt.Errorf("Splitter took too long") 281 | t.Fail() 282 | } 283 | } 284 | 285 | func TestSplitTooLongWords(t *testing.T) { 286 | dictMock := &DictMock{ 287 | scores: map[string]float64{ 288 | "aaaa": 1.0, 289 | "bbbb": 1.0, 290 | }, 291 | } 292 | 293 | ts := Splitter{ 294 | dict: dictMock, 295 | } 296 | 297 | split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbb") 298 | 299 | assert.Nil(t, err) 300 | assert.Equal(t, 0, len(split)) 301 | } 302 | 303 | func TestUnboundTree(t *testing.T) { 304 | dictMock := &DictMock{ 305 | scores: map[string]float64{ 306 | "5555": 1.0, 307 | "55555": 1.0, 308 | "5555555555555555": 1.0, 309 | }, 310 | } 311 | 312 | ts := Splitter{ 313 | dict: dictMock, 314 | } 315 | 316 | t1 := time.Now() 317 | 318 | _, err := ts.Split("ql55555555555555555555555555555") 319 | 320 | t2 := time.Now() 321 | diff := t2.Sub(t1) 322 | 323 | assert.Nil(t, err) 324 | 325 | if diff > time.Millisecond*200 { 326 | fmt.Errorf("Splitter took too long") 327 | t.Fail() 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /contextionary/contextionary.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package contextionary; 4 | 5 | service Contextionary { 6 | rpc IsWordStopword(Word) returns (WordStopword) {} 7 | rpc IsWordPresent(Word) returns (WordPresent) {} 8 | rpc SchemaSearch(SchemaSearchParams) returns (SchemaSearchResults) {} 9 | rpc SafeGetSimilarWordsWithCertainty(SimilarWordsParams) returns (SimilarWordsResults) {} 10 | rpc VectorForWord(Word) returns (Vector) {} 11 | rpc MultiVectorForWord(WordList) returns (VectorList) {} 12 | rpc VectorForCorpi(Corpi) returns (Vector) {} 13 | rpc NearestWordsByVector(VectorNNParams) returns (NearestWords) {} 14 | rpc MultiNearestWordsByVector(VectorNNParamsList) returns (NearestWordsList) {} 15 | rpc 
Meta(MetaParams) returns (MetaOverview) {}
  rpc AddExtension(ExtensionInput) returns (AddExtensionResult) {}
}

// A user-defined concept extension to be stored in the contextionary.
message ExtensionInput {
  string concept = 1;
  string definition = 2;
  float weight = 3;
}

message AddExtensionResult { }

message MetaParams {}

// Version and size information about the loaded contextionary.
message MetaOverview {
  string version = 1;
  int64 wordCount = 2;
}

message Word {
  string word = 1;
}

message WordList {
  repeated Word words = 1;
}

message WordPresent {
  bool present = 1;
}

// A word/corpus vector plus the input elements it was derived from.
message Vector {
  repeated VectorEntry entries = 1;
  repeated InputElement source = 2;
};

message InputElement {
  string concept = 1;
  float weight = 2;
  uint64 occurrence = 3;
};

message VectorList {
  repeated Vector vectors = 1;
}

message VectorEntry {
  float Entry = 1;
}

// Parameters for a nearest-neighbour search by vector: n results, k trees.
message VectorNNParams {
  Vector vector = 1;
  int32 k = 2;
  int32 n = 3;
}

message VectorNNParamsList {
  repeated VectorNNParams Params = 1;
}

message Corpi {
  repeated string corpi = 1;
  repeated Override overrides = 2;
}

message Override {
  string word = 1;
  string expression = 2;
}

message WordStopword {
  bool stopword = 1;
}

message SimilarWordsParams {
  string word = 1;
  float certainty = 2;
}

message SimilarWordsResults {
  repeated Word words = 1;
}

message NearestWords {
  repeated string words = 1;
  repeated float distances = 2;
  VectorList vectors = 3;
}

message NearestWordsList {
  repeated NearestWords words = 1;
}

message Keyword {
  string keyword = 1;
  float weight = 2;
}

enum SearchType {
  CLASS=0;
  PROPERTY=1;
};

message SchemaSearchParams {
  SearchType searchType = 1;
  string name = 2;
  repeated Keyword keywords = 3;
  // NOTE(review): field number 4 is skipped — presumably a removed field;
  // consider declaring it with `reserved 4;` to prevent accidental reuse.
  float certainty = 5;
}

message SchemaSearchResults {
  SearchType type = 1;
  repeated SchemaSearchResult results = 2;
}

// NOTE(review): field number 2 is skipped here as well — see note above.
message SchemaSearchResult {
  string name = 1;
  float certainty = 3;
}
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/annoy_test.go:
--------------------------------------------------------------------------------
/* _ _
 *__ _____ __ ___ ___ __ _| |_ ___
 *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
 * \ V V / __/ (_| |\ V /| | (_| | || __/
 * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
 *
 * Copyright © 2016 - 2019 Weaviate. All rights reserved.
 * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
 * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
 * CONTACT: hello@weaviate.io
 */
/*
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
24 | */ 25 | 26 | package annoyindex_test 27 | 28 | import ( 29 | "math" 30 | "math/rand" 31 | "os" 32 | "testing" 33 | 34 | "github.com/weaviate/contextionary/contextionary/core/annoyindex" 35 | 36 | "github.com/stretchr/testify/assert" 37 | "github.com/stretchr/testify/suite" 38 | ) 39 | 40 | type AnnoyTestSuite struct { 41 | suite.Suite 42 | } 43 | 44 | func Round(f float64) float64 { 45 | return math.Floor(f + 0.5) 46 | } 47 | 48 | func RoundPlus(f float64, places int) float64 { 49 | shift := math.Pow(10, float64(places)) 50 | return Round(f*shift) / shift 51 | } 52 | 53 | func (suite *AnnoyTestSuite) SetupTest() { 54 | } 55 | 56 | func (suite *AnnoyTestSuite) TestFileHandling() { 57 | index := annoyindex.NewAnnoyIndexAngular(3) 58 | index.AddItem(0, []float32{0, 0, 1}) 59 | index.AddItem(1, []float32{0, 1, 0}) 60 | index.AddItem(2, []float32{1, 0, 0}) 61 | index.Build(10) 62 | 63 | index.Save("go_test.ann") 64 | 65 | info, err := os.Stat("go_test.ann") 66 | if err != nil { 67 | assert.Fail(suite.T(), "Failed to create file, file not found") 68 | } 69 | if info.Size() == 0 { 70 | assert.Fail(suite.T(), "Failed to create file, file size zero") 71 | } 72 | 73 | annoyindex.DeleteAnnoyIndexAngular(index) 74 | 75 | index = annoyindex.NewAnnoyIndexAngular(3) 76 | if ret := index.Load("go_test.ann"); ret == false { 77 | assert.Fail(suite.T(), "Failed to load file") 78 | } 79 | annoyindex.DeleteAnnoyIndexAngular(index) 80 | 81 | os.Remove("go_test.ann") 82 | } 83 | 84 | func (suite *AnnoyTestSuite) TestGetNnsByVector() { 85 | index := annoyindex.NewAnnoyIndexAngular(3) 86 | index.AddItem(0, []float32{0, 0, 1}) 87 | index.AddItem(1, []float32{0, 1, 0}) 88 | index.AddItem(2, []float32{1, 0, 0}) 89 | index.Build(10) 90 | 91 | var result []int 92 | index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, &result) 93 | assert.Equal(suite.T(), []int{2, 1, 0}, result) 94 | 95 | index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, &result) 96 | assert.Equal(suite.T(), []int{0, 1, 2}, 
result) 97 | 98 | index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, &result) 99 | assert.Equal(suite.T(), []int{2, 0, 1}, result) 100 | 101 | annoyindex.DeleteAnnoyIndexAngular(index) 102 | } 103 | 104 | func (suite *AnnoyTestSuite) TestGetNnsByItem() { 105 | index := annoyindex.NewAnnoyIndexAngular(3) 106 | index.AddItem(0, []float32{2, 1, 0}) 107 | index.AddItem(1, []float32{1, 2, 0}) 108 | index.AddItem(2, []float32{0, 0, 1}) 109 | index.Build(10) 110 | 111 | var result []int 112 | index.GetNnsByItem(0, 3, -1, &result) 113 | assert.Equal(suite.T(), []int{0, 1, 2}, result) 114 | 115 | index.GetNnsByItem(1, 3, -1, &result) 116 | assert.Equal(suite.T(), []int{1, 0, 2}, result) 117 | 118 | annoyindex.DeleteAnnoyIndexAngular(index) 119 | } 120 | 121 | func (suite *AnnoyTestSuite) TestGetItem() { 122 | index := annoyindex.NewAnnoyIndexAngular(3) 123 | index.AddItem(0, []float32{2, 1, 0}) 124 | index.AddItem(1, []float32{1, 2, 0}) 125 | index.AddItem(2, []float32{0, 0, 1}) 126 | index.Build(10) 127 | 128 | var result []float32 129 | 130 | index.GetItem(0, &result) 131 | assert.Equal(suite.T(), []float32{2, 1, 0}, result) 132 | 133 | index.GetItem(1, &result) 134 | assert.Equal(suite.T(), []float32{1, 2, 0}, result) 135 | 136 | index.GetItem(2, &result) 137 | assert.Equal(suite.T(), []float32{0, 0, 1}, result) 138 | 139 | annoyindex.DeleteAnnoyIndexAngular(index) 140 | } 141 | 142 | func (suite *AnnoyTestSuite) TestGetDistance() { 143 | index := annoyindex.NewAnnoyIndexAngular(2) 144 | index.AddItem(0, []float32{0, 1}) 145 | index.AddItem(1, []float32{1, 1}) 146 | index.Build(10) 147 | 148 | assert.Equal(suite.T(), RoundPlus(math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5), 3), RoundPlus(float64(index.GetDistance(0, 1)), 3)) 149 | 150 | annoyindex.DeleteAnnoyIndexAngular(index) 151 | } 152 | 153 | func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() { 154 | index := annoyindex.NewAnnoyIndexEuclidean(10) 155 | 156 | for j := 0; j < 10000; j += 2 { 157 | p := make([]float32, 0, 
10) 158 | for i := 0; i < 10; i++ { 159 | p = append(p, rand.Float32()) 160 | } 161 | x := make([]float32, 0, 10) 162 | for i := 0; i < 10; i++ { 163 | x = append(x, 1+p[i]+rand.Float32()*1e-2) 164 | } 165 | y := make([]float32, 0, 10) 166 | for i := 0; i < 10; i++ { 167 | y = append(y, 1+p[i]+rand.Float32()*1e-2) 168 | } 169 | index.AddItem(j, x) 170 | index.AddItem(j+1, y) 171 | } 172 | index.Build(10) 173 | for j := 0; j < 10000; j += 2 { 174 | var result []int 175 | index.GetNnsByItem(j, 2, -1, &result) 176 | 177 | assert.Equal(suite.T(), result, []int{j, j + 1}) 178 | 179 | index.GetNnsByItem(j+1, 2, -1, &result) 180 | assert.Equal(suite.T(), result, []int{j + 1, j}) 181 | } 182 | annoyindex.DeleteAnnoyIndexEuclidean(index) 183 | } 184 | 185 | func TestAnnoyTestSuite(t *testing.T) { 186 | suite.Run(t, new(AnnoyTestSuite)) 187 | } 188 | -------------------------------------------------------------------------------- /contextionary/core/annoyindex/annoygomodule.h: -------------------------------------------------------------------------------- 1 | #include "annoylib.h" 2 | #include "kissrandom.h" 3 | 4 | namespace GoAnnoy { 5 | 6 | class AnnoyIndex { 7 | protected: 8 | ::AnnoyIndexInterface *ptr; 9 | 10 | int f; 11 | 12 | public: 13 | ~AnnoyIndex() { 14 | delete ptr; 15 | }; 16 | void addItem(int item, const float* w) { 17 | ptr->add_item(item, w); 18 | }; 19 | void build(int q) { 20 | ptr->build(q); 21 | }; 22 | bool save(const char* filename) { 23 | return ptr->save(filename); 24 | }; 25 | void unload() { 26 | ptr->unload(); 27 | }; 28 | bool load(const char* filename) { 29 | return ptr->load(filename); 30 | }; 31 | float getDistance(int i, int j) { 32 | return ptr->get_distance(i, j); 33 | }; 34 | void getNnsByItem(int item, int n, int search_k, vector* result, vector* distances) { 35 | ptr->get_nns_by_item(item, n, search_k, result, distances); 36 | }; 37 | void getNnsByVector(const float* w, int n, int search_k, vector* result, vector* distances) { 38 | 
    ptr->get_nns_by_vector(w, n, search_k, result, distances);
  };
  // Variants without a distances output buffer.
  void getNnsByItem(int item, int n, int search_k, vector* result) {
    ptr->get_nns_by_item(item, n, search_k, result, NULL);
  };
  void getNnsByVector(const float* w, int n, int search_k, vector* result) {
    ptr->get_nns_by_vector(w, n, search_k, result, NULL);
  };

  int getNItems() {
    return (int)ptr->get_n_items();
  };
  void verbose(bool v) {
    ptr->verbose(v);
  };
  void getItem(int item, vector *v) {
    // resize to the index dimensionality before annoy fills the data
    v->resize(this->f);
    ptr->get_item(item, &v->front());
  };
};

// NOTE(review): the template arguments of ::AnnoyIndex / vector in this copy
// of the source appear to have been stripped (likely lost angle-bracket
// content) — confirm against the upstream annoy headers before editing.
class AnnoyIndexAngular : public AnnoyIndex
{
 public:
  AnnoyIndexAngular(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};

class AnnoyIndexEuclidean : public AnnoyIndex {
 public:
  AnnoyIndexEuclidean(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};

class AnnoyIndexManhattan : public AnnoyIndex {
 public:
  AnnoyIndexManhattan(int f) {
    ptr = new ::AnnoyIndex(f);
    this->f = f;
  }
};
}
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/kissrandom.h:
--------------------------------------------------------------------------------
#ifndef KISSRANDOM_H
#define KISSRANDOM_H

#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned __int32 uint32_t;
typedef unsigned __int32 uint64_t;
#else
// NOTE(review): the include target is missing in this copy of the source
// (presumably <cstdint> or <stdint.h>, lost angle brackets) — confirm
// against upstream. The MSVC uint64_t typedef above also maps to __int32,
// which looks like the same corruption.
#include
#endif

// KISS = "keep it simple, stupid", but high quality random number generator
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
// http://mathforum.org/kb/message.jspa?messageID=6627731
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)

// 32 bit KISS
struct Kiss32Random {
  uint32_t x;
  uint32_t y;
  uint32_t z;
  uint32_t c;

  // seed must be != 0
  Kiss32Random(uint32_t seed = 123456789) {
    x = seed;
    y = 362436000;
    z = 521288629;
    c = 7654321;
  }

  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry
    uint64_t t = 698769069ULL * z + c;
    c = t >> 32;
    z = (uint32_t) t;

    return x + y + z;
  }
  inline int flip() {
    // Draw random 0 or 1
    return kiss() & 1;
  }
  inline size_t index(size_t n) {
    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
    return kiss() % n;
  }
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};

// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
struct Kiss64Random {
  uint64_t x;
  uint64_t y;
  uint64_t z;
  uint64_t c;

  // seed must be != 0
  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
    x = seed;
    y = 362436362436362436ULL;
    z = 1066149217761810ULL;
    c = 123456123456123456ULL;
  }

  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    uint64_t t = (x<<58)+c;
    c = (x>>6);
    x += t;
// NOTE(review): the source is corrupted from here on — the tail of
// Kiss64Random and the beginning of contextionary/core/combined.go
// (including the signature of find_vector_index_for_item_index) were lost.
// Restore both files from upstream before editing this region.
c += (x= idx.offset && item < (idx.offset+idx.size) {
return ItemIndex(item - idx.offset), idx.index, nil
}
}

return 0, nil, fmt.Errorf("out of index")
}

func (ci *CombinedIndex) ItemIndexToWord(item ItemIndex) (string, error) {
	offsetted_index, vi, err := ci.find_vector_index_for_item_index(item)

	if err != nil {
		return "", err
	}

	word, err :=
(*vi).ItemIndexToWord(offsetted_index) 135 | return word, err 136 | } 137 | 138 | func (ci *CombinedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 139 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item) 140 | 141 | if err != nil { 142 | return 0, err 143 | } 144 | 145 | occ, err := (*vi).ItemIndexToOccurrence(offsetted_index) 146 | return occ, err 147 | } 148 | 149 | func (ci *CombinedIndex) OccurrencePercentile(perc int) uint64 { 150 | max := uint64(0) 151 | 152 | for _, index := range ci.indices { 153 | occ := (*index.index).OccurrencePercentile(perc) 154 | if occ > max { 155 | max = occ 156 | } 157 | } 158 | 159 | return max 160 | } 161 | 162 | func (ci *CombinedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 163 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item) 164 | if err != nil { 165 | return nil, errors.NewInternalf(err.Error()) 166 | } 167 | 168 | word, err := (*vi).GetVectorForItemIndex(offsetted_index) 169 | if err != nil { 170 | return word, errors.NewInternalf(err.Error()) 171 | } 172 | 173 | return word, nil 174 | } 175 | 176 | // Compute the distance between two items. 177 | func (ci *CombinedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 178 | v1, err := ci.GetVectorForItemIndex(a) 179 | if err != nil { 180 | return 0.0, err 181 | } 182 | 183 | v2, err := ci.GetVectorForItemIndex(b) 184 | if err != nil { 185 | return 0.0, err 186 | } 187 | 188 | dist, err := v1.Distance(v2) 189 | if err != nil { 190 | return 0.0, err 191 | } 192 | 193 | return dist, nil 194 | } 195 | 196 | // Get the n nearest neighbours of item, examining k trees. 197 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
198 | func (ci *CombinedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 199 | vec, err := ci.GetVectorForItemIndex(item) 200 | if err != nil { 201 | return nil, nil, fmt.Errorf("could not get vector for item index: %s", err) 202 | } 203 | 204 | return ci.GetNnsByVector(*vec, n, k) 205 | } 206 | 207 | type combined_nn_search_result struct { 208 | item ItemIndex 209 | dist float32 210 | } 211 | 212 | type combined_nn_search_results struct { 213 | items []combined_nn_search_result 214 | ci *CombinedIndex 215 | } 216 | 217 | // SafeGetSimilarWords returns n similar words in the contextionary, 218 | // examining k trees. It is guaratueed to have results, even if the word is 219 | // not in the contextionary. In this case the list only contains the word 220 | // itself. It can then still be used for exact match or levensthein-based 221 | // searches against db backends. 222 | func (ci *CombinedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 223 | return safeGetSimilarWordsFromAny(ci, word, n, k) 224 | } 225 | 226 | // SafeGetSimilarWordsWithCertainty returns similar words in the 227 | // contextionary, if they are close enough to match the required certainty. 228 | // It is guaratueed to have results, even if the word is not in the 229 | // contextionary. In this case the list only contains the word itself. It can 230 | // then still be used for exact match or levensthein-based searches against 231 | // db backends. 
232 | func (ci *CombinedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 233 | return safeGetSimilarWordsWithCertaintyFromAny(ci, word, certainty) 234 | } 235 | 236 | func (a combined_nn_search_results) Len() int { return len(a.items) } 237 | func (a combined_nn_search_results) Swap(i, j int) { a.items[i], a.items[j] = a.items[j], a.items[i] } 238 | func (a combined_nn_search_results) Less(i, j int) bool { 239 | // Sort on distance first, if those are the same, sort on lexographical order of the words. 240 | if a.items[i].dist == a.items[j].dist { 241 | wi, err := a.ci.ItemIndexToWord(a.items[i].item) 242 | if err != nil { 243 | panic("should be there") 244 | } 245 | 246 | wj, err := a.ci.ItemIndexToWord(a.items[j].item) 247 | if err != nil { 248 | panic("should be there") 249 | } 250 | return wi < wj 251 | } else { 252 | return a.items[i].dist < a.items[j].dist 253 | } 254 | } 255 | 256 | // Remove a certain element from the result search. 257 | func (a *combined_nn_search_results) Remove(i int) { 258 | a.items = append(a.items[:i], a.items[i+1:]...) 259 | } 260 | 261 | // Get the n nearest neighbours of item, examining k trees. 262 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 263 | func (ci *CombinedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 264 | results := combined_nn_search_results{ 265 | items: make([]combined_nn_search_result, 0), 266 | ci: ci, 267 | } 268 | 269 | for _, item := range ci.indices { 270 | indices, floats, err := (*item.index).GetNnsByVector(vector, n, k) 271 | if err != nil { 272 | return nil, nil, errors.NewInternalf(err.Error()) 273 | } else { 274 | for i, item_idx := range indices { 275 | results.items = append(results.items, combined_nn_search_result{item: item_idx + ItemIndex(item.offset), dist: floats[i]}) 276 | } 277 | } 278 | } 279 | 280 | sort.Sort(results) 281 | 282 | // Now remove duplicates. 
283 | for i := 1; i < len(results.items); { 284 | if results.items[i].item == results.items[i-1].item { 285 | results.Remove(i) 286 | } else { 287 | i++ // only increment if we're not removing. 288 | } 289 | } 290 | 291 | items := make([]ItemIndex, 0) 292 | floats := make([]float32, 0) 293 | 294 | var max_index int 295 | 296 | if n < len(results.items) { 297 | max_index = n 298 | } else { 299 | max_index = len(results.items) 300 | } 301 | 302 | for i := 0; i < max_index; i++ { 303 | items = append(items, results.items[i].item) 304 | floats = append(floats, results.items[i].dist) 305 | } 306 | 307 | return items, floats, nil 308 | } 309 | -------------------------------------------------------------------------------- /contextionary/core/combined_simple_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "testing" 16 | ) 17 | 18 | func TestSimpleCombinedIndex(t *testing.T) { 19 | builder1 := InMemoryBuilder(3) 20 | builder2 := InMemoryBuilder(3) 21 | builder3 := InMemoryBuilder(3) 22 | 23 | builder1.AddWord("a", NewVector([]float32{1, 0, 0})) 24 | builder2.AddWord("b", NewVector([]float32{0, 1, 0})) 25 | builder3.AddWord("c", NewVector([]float32{0, 0, 1})) 26 | 27 | memory_index1 := Contextionary(builder1.Build(3)) 28 | memory_index2 := Contextionary(builder2.Build(3)) 29 | memory_index3 := Contextionary(builder3.Build(3)) 30 | 31 | var indices123 []Contextionary = []Contextionary{memory_index1, memory_index2, memory_index3} 32 | var indices231 []Contextionary = []Contextionary{memory_index2, memory_index3, memory_index1} 33 | var indices312 []Contextionary = []Contextionary{memory_index3, memory_index1, memory_index2} 34 | 35 | t.Run("indices 123", func(t *testing.T) { test_simple_combined(t, indices123) }) 36 | t.Run("indices 231", func(t *testing.T) { test_simple_combined(t, indices231) }) 37 | t.Run("indices 312", func(t *testing.T) { test_simple_combined(t, indices312) }) 38 | } 39 | 40 | func test_simple_combined(t *testing.T, indices []Contextionary) { 41 | ci, err := CombineVectorIndices(indices) 42 | if err != nil { 43 | panic("should work") 44 | } 45 | 46 | a_idx := ci.WordToItemIndex("a") 47 | if !a_idx.IsPresent() { 48 | panic("should be present") 49 | } 50 | 51 | b_idx := ci.WordToItemIndex("b") 52 | if !b_idx.IsPresent() { 53 | panic("should be present") 54 | } 55 | 56 | c_idx := ci.WordToItemIndex("c") 57 | if !c_idx.IsPresent() { 58 | panic("should be present") 59 | } 60 | 61 | items, _, err := ci.GetNnsByItem(a_idx, 3, 3) 62 | if err != nil { 63 | panic("should work") 64 | } 65 | 66 | assert_eq_idx := func(name string, a, b ItemIndex) 
{ 67 | if a != b { 68 | t.Errorf("Expected %v to be at %v, but was at %b", name, a, b) 69 | } 70 | } 71 | 72 | if len(items) != 3 { 73 | t.Errorf("got length %v, expected 3", len(items)) 74 | t.FailNow() 75 | } 76 | 77 | // assert lexicographical order, if distances are equal 78 | 79 | assert_eq_idx("a", a_idx, items[0]) 80 | assert_eq_idx("b", b_idx, items[1]) 81 | assert_eq_idx("c", c_idx, items[2]) 82 | } 83 | -------------------------------------------------------------------------------- /contextionary/core/component_test.go: -------------------------------------------------------------------------------- 1 | // +build sentence 2 | 3 | package contextionary 4 | 5 | import ( 6 | "fmt" 7 | "testing" 8 | ) 9 | 10 | func TestDevelopmentEnvironmentForContextionary(t *testing.T) { 11 | 12 | // Make sure you have run ./tools/dev/gen_simple_contextionary.sh 13 | // from the project root or downloaded a full contextionary prior 14 | // to running those tests. 15 | 16 | c11y, err := LoadVectorFromDisk("../../tools/dev/example.knn", "../../tools/dev/example.idx") 17 | if err != nil { 18 | t.Fatalf("could not generate c11y: %s", err) 19 | } 20 | 21 | fmt.Printf("here's the c11y, do whatever you want with it: %#v", c11y) 22 | 23 | t.Errorf("... add whatever you like!") 24 | } 25 | -------------------------------------------------------------------------------- /contextionary/core/contextionary.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | 13 | // Package contextionary provides the toolset to add context to words. 
14 | package contextionary 15 | 16 | // ItemIndex is an opaque type that models an index number used to identify a 17 | // word. 18 | type ItemIndex int 19 | 20 | // IsPresent can be used after retrieving a word index (which does not error on 21 | // its own), to see if the word was actually present in the contextionary. 22 | func (i *ItemIndex) IsPresent() bool { 23 | return *i >= 0 24 | } 25 | 26 | // Contextionary is the API to decouple the K-nn interface that is needed for 27 | // Weaviate from a concrete implementation. 28 | type Contextionary interface { 29 | 30 | // Return the number of items that is stored in the index. 31 | GetNumberOfItems() int 32 | 33 | // Returns the length of the used vectors. 34 | GetVectorLength() int 35 | 36 | // Look up a word, return an index. 37 | // Check for presence of the index with index.IsPresent() 38 | WordToItemIndex(word string) ItemIndex 39 | 40 | // Based on an index, return the assosiated word. 41 | ItemIndexToWord(item ItemIndex) (string, error) 42 | 43 | // Based on an index, return the assosiated word. 44 | ItemIndexToOccurrence(item ItemIndex) (uint64, error) 45 | 46 | //OccurrencePercentile shows the occurrence of the mentioned percentile in ascending order 47 | OccurrencePercentile(perc int) uint64 48 | 49 | // Get the vector of an item index. 50 | GetVectorForItemIndex(item ItemIndex) (*Vector, error) 51 | 52 | // Compute the distance between two items. 53 | GetDistance(a ItemIndex, b ItemIndex) (float32, error) 54 | 55 | // Get the n nearest neighbours of item, examining k trees. 56 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 57 | GetNnsByItem(item ItemIndex, n, k int) ([]ItemIndex, []float32, error) 58 | 59 | // Get the n nearest neighbours of item, examining k trees. 60 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
61 | GetNnsByVector(vector Vector, n, k int) ([]ItemIndex, []float32, error) 62 | 63 | // SafeGetSimilarWords returns n similar words in the contextionary, 64 | // examining k trees. It is guaratueed to have results, even if the word is 65 | // not in the contextionary. In this case the list only contains the word 66 | // itself. It can then still be used for exact match or levensthein-based 67 | // searches against db backends. 68 | SafeGetSimilarWords(word string, n, k int) ([]string, []float32) 69 | 70 | // SafeGetSimilarWordsWithCertainty returns similar words in the 71 | // contextionary, if they are close enough to match the required certainty. 72 | // It is guaratueed to have results, even if the word is not in the 73 | // contextionary. In this case the list only contains the word itself. It can 74 | // then still be used for exact match or levensthein-based searches against 75 | // db backends. 76 | SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string 77 | } 78 | -------------------------------------------------------------------------------- /contextionary/core/generator/cmd/generator.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package main 13 | 14 | import ( 15 | "os" 16 | 17 | flags "github.com/jessevdk/go-flags" 18 | "github.com/weaviate/contextionary/contextionary/core/generator" 19 | ) 20 | 21 | func main() { 22 | var options generator.Options 23 | var parser = flags.NewParser(&options, flags.Default) 24 | 25 | if _, err := parser.Parse(); err != nil { 26 | if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp { 27 | os.Exit(0) 28 | } else { 29 | os.Exit(1) 30 | } 31 | } 32 | 33 | generator.Generate(options) 34 | } 35 | -------------------------------------------------------------------------------- /contextionary/core/generator/generator.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package generator 13 | 14 | import ( 15 | "bufio" 16 | "bytes" 17 | "encoding/binary" 18 | "encoding/gob" 19 | "encoding/json" 20 | "log" 21 | "os" 22 | "strconv" 23 | "strings" 24 | 25 | "github.com/syndtr/goleveldb/leveldb" 26 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 27 | ) 28 | 29 | type Options struct { 30 | VectorCSVPath string `short:"c" long:"vector-csv-path" description:"Path to the output file of Glove" required:"true"` 31 | TempDBPath string `short:"t" long:"temp-db-path" description:"Location for the temporary database" default:".tmp_import"` 32 | OutputPrefix string `short:"p" long:"output-prefix" description:"The prefix of the names of the files" required:"true"` 33 | K int `short:"k" description:"number of forrests to generate" default:"20"` 34 | } 35 | 36 | type WordVectorInfo struct { 37 | numberOfWords int 38 | vectorWidth int 39 | k int 40 | metadata JsonMetadata 41 | } 42 | 43 | type JsonMetadata struct { 44 | K int `json:"k"` // the number of parallel forrests. 
45 | } 46 | 47 | func Generate(options Options) { 48 | db, err := leveldb.OpenFile(options.TempDBPath, nil) 49 | defer db.Close() 50 | 51 | if err != nil { 52 | log.Fatalf("Could not open temporary database file %+v", err) 53 | } 54 | 55 | file, err := os.Open(options.VectorCSVPath) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | defer file.Close() 60 | 61 | log.Print("Processing and ordering raw trained data") 62 | info := readVectorsFromFileAndInsertIntoLevelDB(db, file) 63 | 64 | info.k = options.K 65 | info.metadata = JsonMetadata{options.K} 66 | 67 | log.Print("Generating wordlist") 68 | createWordList(db, info, options.OutputPrefix+".idx") 69 | 70 | log.Print("Generating k-nn index") 71 | createKnn(db, info, options.OutputPrefix+".knn") 72 | 73 | db.Close() 74 | os.RemoveAll(options.TempDBPath) 75 | } 76 | 77 | // read word vectors, insert them into level db, also return the dimension of the vectors. 78 | func readVectorsFromFileAndInsertIntoLevelDB(db *leveldb.DB, file *os.File) WordVectorInfo { 79 | var vector_length int = -1 80 | var nr_words int = 0 81 | 82 | scanner := bufio.NewScanner(file) 83 | 84 | for scanner.Scan() { 85 | nr_words += 1 86 | parts := strings.Split(scanner.Text(), " ") 87 | 88 | word := parts[0] 89 | if vector_length == -1 { 90 | vector_length = len(parts) - 1 91 | } 92 | 93 | if vector_length != len(parts)-1 { 94 | log.Print("Line corruption found for the word [" + word + "]. Lenght expected " + strconv.Itoa(vector_length) + " but found " + strconv.Itoa(len(parts)) + ". Word will be skipped.") 95 | continue 96 | } 97 | 98 | // pre-allocate a vector for speed. 
99 | vector := make([]float32, vector_length) 100 | 101 | for i := 1; i <= vector_length; i++ { 102 | float, err := strconv.ParseFloat(parts[i], 64) 103 | 104 | if err != nil { 105 | log.Fatal("Error parsing float") 106 | } 107 | 108 | vector[i-1] = float32(float) 109 | } 110 | 111 | var buf bytes.Buffer 112 | if err := gob.NewEncoder(&buf).Encode(vector); err != nil { 113 | log.Fatal("Could not encode vector for temp db storage") 114 | } 115 | 116 | db.Put([]byte(word), buf.Bytes(), nil) 117 | } 118 | 119 | return WordVectorInfo{numberOfWords: nr_words, vectorWidth: vector_length} 120 | } 121 | 122 | func createWordList(db *leveldb.DB, info WordVectorInfo, outputFileName string) { 123 | file, err := os.Create(outputFileName) 124 | if err != nil { 125 | log.Fatal("Could not open wordlist output file") 126 | } 127 | defer file.Close() 128 | 129 | wbuf := bufio.NewWriter(file) 130 | 131 | // Write file header 132 | err = binary.Write(wbuf, binary.LittleEndian, uint64(info.numberOfWords)) 133 | if err != nil { 134 | log.Fatal("Could not write length of wordlist.") 135 | } 136 | 137 | err = binary.Write(wbuf, binary.LittleEndian, uint64(info.vectorWidth)) 138 | if err != nil { 139 | log.Fatal("Could not write with of the vector.") 140 | } 141 | 142 | metadata, err := json.Marshal(info.metadata) 143 | if err != nil { 144 | log.Fatal("Could not serialize metadata.") 145 | } 146 | 147 | err = binary.Write(wbuf, binary.LittleEndian, uint64(len(metadata))) 148 | if err != nil { 149 | log.Fatal("Could not write with of the vector.") 150 | } 151 | 152 | _, err = wbuf.Write(metadata) 153 | if err != nil { 154 | log.Fatal("Could not write the metadata") 155 | } 156 | 157 | var metadata_len = uint64(len(metadata)) 158 | var metadata_padding = 4 - (metadata_len % 4) 159 | for i := 0; uint64(i) < metadata_padding; i++ { 160 | wbuf.WriteByte(byte(0)) 161 | } 162 | 163 | var word_offset uint64 = (2 + uint64(info.numberOfWords)) * 8 // first two uint64's from the header, then the 
table of indices. 164 | word_offset += 8 + metadata_len + metadata_padding // and the metadata length + content & padding 165 | 166 | var orig_word_offset = word_offset 167 | 168 | // Iterate first time over all data, computing indices for all words. 169 | iter := db.NewIterator(nil, nil) 170 | for iter.Next() { 171 | key := iter.Key() 172 | word := string(key) 173 | length := len(word) 174 | err = binary.Write(wbuf, binary.LittleEndian, uint64(word_offset)) 175 | 176 | if err != nil { 177 | log.Fatal("Could not write word offset to wordlist") 178 | } 179 | 180 | // reserve 8 bytes for occurence 181 | word_offset += 8 182 | 183 | word_offset += uint64(length) + 1 184 | 185 | // ensure padding on 4-bytes aligned memory 186 | padding := 4 - (word_offset % 4) 187 | word_offset += padding 188 | } 189 | 190 | iter.Release() 191 | word_offset = orig_word_offset 192 | 193 | // Iterate second time over all data, now inserting the words 194 | iter = db.NewIterator(nil, nil) 195 | for iter.Next() { 196 | key := iter.Key() 197 | word := string(key) 198 | length := len(word) 199 | 200 | // hard-code occurence to 102 for now 201 | err = binary.Write(wbuf, binary.LittleEndian, uint64(102)) 202 | 203 | wbuf.Write([]byte(word)) 204 | wbuf.WriteByte(byte(0)) 205 | word_offset += uint64(length) + 1 206 | 207 | // ensure padding on 4-bytes aligned memory 208 | padding := 4 - (word_offset % 4) 209 | for i := 0; uint64(i) < padding; i++ { 210 | wbuf.WriteByte(byte(0)) 211 | } 212 | 213 | word_offset += padding 214 | } 215 | wbuf.Flush() 216 | iter.Release() 217 | } 218 | 219 | func createKnn(db *leveldb.DB, info WordVectorInfo, outputFileName string) { 220 | var knn annoy.AnnoyIndex = annoy.NewAnnoyIndexEuclidean(info.vectorWidth) 221 | var idx int = -1 222 | 223 | iter := db.NewIterator(nil, nil) 224 | 225 | for iter.Next() { 226 | idx += 1 227 | 228 | vector := make([]float32, info.vectorWidth) 229 | err := gob.NewDecoder(bytes.NewBuffer(iter.Value())).Decode(&vector) 230 | if err != 
nil { 231 | log.Fatalf("Could not decode vector value %+v", err) 232 | } 233 | knn.AddItem(idx, vector) 234 | } 235 | 236 | knn.Build(info.k) // Hardcoded for now. Must be tweaked. 237 | knn.Save(outputFileName) 238 | knn.Unload() 239 | } 240 | -------------------------------------------------------------------------------- /contextionary/core/memory_index.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "fmt" 16 | "sort" 17 | 18 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 19 | ) 20 | 21 | type MemoryIndex struct { 22 | dimensions int 23 | words []string 24 | knn annoy.AnnoyIndex 25 | } 26 | 27 | // Return the number of items that is stored in the index. 28 | func (mi *MemoryIndex) GetNumberOfItems() int { 29 | return len(mi.words) 30 | } 31 | 32 | // Returns the length of the used vectors. 33 | func (mi *MemoryIndex) GetVectorLength() int { 34 | return mi.dimensions 35 | } 36 | 37 | // Look up a word, return an index. 38 | // Perform binary search. 39 | func (mi *MemoryIndex) WordToItemIndex(word string) ItemIndex { 40 | for idx, w := range mi.words { 41 | if word == w { 42 | return ItemIndex(idx) 43 | } 44 | } 45 | 46 | return -1 47 | } 48 | 49 | func (mi *MemoryIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 50 | return 0, nil 51 | } 52 | 53 | func (mi *MemoryIndex) OccurrencePercentile(perc int) uint64 { 54 | return 0 55 | } 56 | 57 | // Based on an index, return the assosiated word. 
58 | func (mi *MemoryIndex) ItemIndexToWord(item ItemIndex) (string, error) { 59 | if item >= 0 && int(item) <= len(mi.words) { 60 | return mi.words[item], nil 61 | } else { 62 | return "", fmt.Errorf("Index out of bounds") 63 | } 64 | } 65 | 66 | // Get the vector of an item index. 67 | // TODO: Is this ever used? Doesn't look like it as part of the investigation 68 | // in gh-25 and gh-26 69 | func (mi *MemoryIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 70 | if item >= 0 && int(item) <= len(mi.words) { 71 | var floats []float32 72 | mi.knn.GetItem(int(item), &floats) 73 | 74 | return &Vector{vector: floats}, nil 75 | } else { 76 | return nil, fmt.Errorf("Index out of bounds") 77 | } 78 | } 79 | 80 | // Compute the distance between two items. 81 | func (mi MemoryIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 82 | if a >= 0 && b >= 0 && int(a) <= len(mi.words) && int(b) <= len(mi.words) { 83 | return mi.knn.GetDistance(int(a), int(b)), nil 84 | } else { 85 | return 0, fmt.Errorf("Index out of bounds") 86 | } 87 | } 88 | 89 | // Get the n nearest neighbours of item, examining k trees. 90 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 91 | func (mi *MemoryIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 92 | if item >= 0 && int(item) <= len(mi.words) { 93 | var items []int 94 | var distances []float32 95 | 96 | mi.knn.GetNnsByItem(int(item), n, k, &items, &distances) 97 | 98 | var indices []ItemIndex = make([]ItemIndex, len(items)) 99 | for i, x := range items { 100 | indices[i] = ItemIndex(x) 101 | } 102 | 103 | return indices, distances, nil 104 | } else { 105 | return nil, nil, fmt.Errorf("Index out of bounds") 106 | } 107 | } 108 | 109 | // Get the n nearest neighbours of item, examining k trees. 110 | // Returns an array of indices, and of distances between item and the n-nearest neighbors. 
111 | func (mi *MemoryIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 112 | if len(vector.vector) == mi.dimensions { 113 | var items []int 114 | var distances []float32 115 | 116 | mi.knn.GetNnsByVector(vector.vector, n, k, &items, &distances) 117 | 118 | var indices []ItemIndex = make([]ItemIndex, len(items)) 119 | for i, x := range items { 120 | indices[i] = ItemIndex(x) 121 | } 122 | 123 | return indices, distances, nil 124 | } else { 125 | return nil, nil, fmt.Errorf("Wrong vector length provided") 126 | } 127 | } 128 | 129 | // SafeGetSimilarWords returns n similar words in the contextionary, 130 | // examining k trees. It is guaratueed to have results, even if the word is 131 | // not in the contextionary. In this case the list only contains the word 132 | // itself. It can then still be used for exact match or levensthein-based 133 | // searches against db backends. 134 | func (mi *MemoryIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 135 | return safeGetSimilarWordsFromAny(mi, word, n, k) 136 | } 137 | 138 | // SafeGetSimilarWordsWithCertainty returns similar words in the 139 | // contextionary, if they are close enough to match the required certainty. 140 | // It is guaratueed to have results, even if the word is not in the 141 | // contextionary. In this case the list only contains the word itself. It can 142 | // then still be used for exact match or levensthein-based searches against 143 | // db backends. 144 | func (mi *MemoryIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 145 | return safeGetSimilarWordsWithCertaintyFromAny(mi, word, certainty) 146 | } 147 | 148 | // The rest of this file concerns itself with building the Memory Index. 149 | // This is done from the MemoryIndexBuilder struct. 
150 | 151 | type MemoryIndexBuilder struct { 152 | dimensions int 153 | word_vectors mib_pairs 154 | } 155 | 156 | type mib_pair struct { 157 | word string 158 | vector Vector 159 | } 160 | 161 | // Define custom type, and implement functions required for sort.Sort. 162 | type mib_pairs []mib_pair 163 | 164 | func (a mib_pairs) Len() int { return len(a) } 165 | func (a mib_pairs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 166 | func (a mib_pairs) Less(i, j int) bool { return a[i].word < a[j].word } 167 | 168 | // Construct a new builder. 169 | func InMemoryBuilder(dimensions int) *MemoryIndexBuilder { 170 | mib := MemoryIndexBuilder{ 171 | dimensions: dimensions, 172 | word_vectors: make([]mib_pair, 0), 173 | } 174 | 175 | return &mib 176 | } 177 | 178 | // Add a word and it's vector to the builder. 179 | func (mib *MemoryIndexBuilder) AddWord(word string, vector Vector) { 180 | wv := mib_pair{word: word, vector: vector} 181 | mib.word_vectors = append(mib.word_vectors, wv) 182 | } 183 | 184 | // Build an efficient lookup iddex from the builder. 185 | func (mib *MemoryIndexBuilder) Build(trees int) *MemoryIndex { 186 | mi := MemoryIndex{ 187 | dimensions: mib.dimensions, 188 | words: make([]string, 0), 189 | knn: annoy.NewAnnoyIndexEuclidean(mib.dimensions), 190 | } 191 | 192 | // First sort the words; this way we can do binary search on the words. 
193 | sort.Sort(mib.word_vectors) 194 | 195 | // Then fill up the data in the MemoryIndex 196 | for i, pair := range mib.word_vectors { 197 | mi.words = append(mi.words, pair.word) 198 | mi.knn.AddItem(i, pair.vector.vector) 199 | } 200 | 201 | // And instruct Annoy to build it's index 202 | mi.knn.Build(trees) 203 | 204 | return &mi 205 | } 206 | -------------------------------------------------------------------------------- /contextionary/core/mmapped.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | import ( 15 | "encoding/binary" 16 | "fmt" 17 | "log" 18 | "math" 19 | "os" 20 | "syscall" 21 | 22 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex" 23 | ) 24 | 25 | type mmappedIndex struct { 26 | word_index *Wordlist 27 | knn annoy.AnnoyIndex 28 | knnRaw []byte 29 | dimensions int 30 | } 31 | 32 | func (m *mmappedIndex) GetNumberOfItems() int { 33 | return int(m.word_index.numberOfWords) 34 | } 35 | 36 | // Returns the length of the used vectors. 
37 | func (m *mmappedIndex) GetVectorLength() int { 38 | return int(m.word_index.vectorWidth) 39 | } 40 | 41 | func (m *mmappedIndex) WordToItemIndex(word string) ItemIndex { 42 | return m.word_index.FindIndexByWord(word) 43 | } 44 | 45 | func (m *mmappedIndex) ItemIndexToWord(item ItemIndex) (string, error) { 46 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 47 | w, _ := m.word_index.getWord(item) 48 | return w, nil 49 | } else { 50 | return "", fmt.Errorf("Index out of bounds") 51 | } 52 | } 53 | 54 | func (m *mmappedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) { 55 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 56 | _, occ := m.word_index.getWord(item) 57 | return occ, nil 58 | } else { 59 | return 0, fmt.Errorf("Index out of bounds") 60 | } 61 | } 62 | 63 | func (m *mmappedIndex) OccurrencePercentile(perc int) uint64 { 64 | return m.word_index.OccurrencePercentile(perc) 65 | } 66 | 67 | func (m *mmappedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) { 68 | if item < 0 && item > m.word_index.GetNumberOfWords() { 69 | return nil, fmt.Errorf("Index out of bounds") 70 | } 71 | 72 | var floats []float32 73 | floats = m.getItem(int(item)) 74 | 75 | return &Vector{vector: floats}, nil 76 | } 77 | 78 | func (m *mmappedIndex) getItem(index int) []float32 { 79 | offset := 16 80 | vectorSize := m.dimensions * 4 81 | begin := index*(offset+vectorSize) + offset 82 | end := begin + vectorSize 83 | return vectorFromBytes(m.knnRaw[begin:end]) 84 | } 85 | 86 | func vectorFromBytes(in []byte) []float32 { 87 | out := make([]float32, len(in)/4) 88 | for offset := 0; offset < len(in); offset += 4 { 89 | bits := binary.LittleEndian.Uint32(in[offset : offset+4]) 90 | float := math.Float32frombits(bits) 91 | out[offset/4] = float 92 | } 93 | 94 | return out 95 | } 96 | 97 | // Compute the distance between two items. 
98 | func (m *mmappedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) { 99 | if a >= 0 && b >= 0 && a <= m.word_index.GetNumberOfWords() && b <= m.word_index.GetNumberOfWords() { 100 | return m.knn.GetDistance(int(a), int(b)), nil 101 | } else { 102 | return 0, fmt.Errorf("Index out of bounds") 103 | } 104 | } 105 | 106 | func (m *mmappedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) { 107 | if item >= 0 && item <= m.word_index.GetNumberOfWords() { 108 | var items []int 109 | var distances []float32 110 | 111 | m.knn.GetNnsByItem(int(item), n, k, &items, &distances) 112 | 113 | var indices []ItemIndex = make([]ItemIndex, len(items)) 114 | for i, x := range items { 115 | indices[i] = ItemIndex(x) 116 | } 117 | 118 | return indices, distances, nil 119 | } else { 120 | return nil, nil, fmt.Errorf("Index out of bounds") 121 | } 122 | } 123 | 124 | func (m *mmappedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) { 125 | if len(vector.vector) == m.GetVectorLength() { 126 | var items []int 127 | var distances []float32 128 | 129 | m.knn.GetNnsByVector(vector.vector, n, k, &items, &distances) 130 | 131 | var indices []ItemIndex = make([]ItemIndex, len(items)) 132 | for i, x := range items { 133 | indices[i] = ItemIndex(x) 134 | } 135 | 136 | return indices, distances, nil 137 | } else { 138 | return nil, nil, fmt.Errorf("Wrong vector length provided") 139 | } 140 | } 141 | 142 | // SafeGetSimilarWords returns n similar words in the contextionary, 143 | // examining k trees. It is guaratueed to have results, even if the word is 144 | // not in the contextionary. In this case the list only contains the word 145 | // itself. It can then still be used for exact match or levensthein-based 146 | // searches against db backends. 
147 | func (m *mmappedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) { 148 | return safeGetSimilarWordsFromAny(m, word, n, k) 149 | } 150 | 151 | // SafeGetSimilarWordsWithCertainty returns similar words in the 152 | // contextionary, if they are close enough to match the required certainty. 153 | // It is guaratueed to have results, even if the word is not in the 154 | // contextionary. In this case the list only contains the word itself. It can 155 | // then still be used for exact match or levensthein-based searches against 156 | // db backends. 157 | func (m *mmappedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string { 158 | return safeGetSimilarWordsWithCertaintyFromAny(m, word, certainty) 159 | } 160 | 161 | func LoadVectorFromDisk(annoy_index string, word_index_file_name string) (Contextionary, error) { 162 | word_index, err := LoadWordlist(word_index_file_name) 163 | 164 | if err != nil { 165 | return nil, fmt.Errorf("Could not load vector: %+v", err) 166 | } 167 | 168 | knn := annoy.NewAnnoyIndexEuclidean(int(word_index.vectorWidth)) 169 | knn.Load(annoy_index) 170 | 171 | knnRaw, err := loadAnnoyIndexDirectly(annoy_index) 172 | if err != nil { 173 | return nil, fmt.Errorf("load raw index: %v", err) 174 | } 175 | 176 | idx := &mmappedIndex{ 177 | word_index: word_index, 178 | knn: knn, 179 | knnRaw: knnRaw, 180 | dimensions: int(word_index.vectorWidth), 181 | } 182 | 183 | return idx, nil 184 | } 185 | 186 | // directly load the annoy index file to avoid memory leaks in the annoy 187 | // go-port of the C library, see #26 188 | func loadAnnoyIndexDirectly(path string) ([]byte, error) { 189 | file, err := os.Open(path) 190 | if err != nil { 191 | log.Fatalf("Can't open the knn file at %s: %+v", path, err) 192 | } 193 | 194 | file_info, err := file.Stat() 195 | if err != nil { 196 | log.Fatalf("Can't stat the knn file at %s: %+v", path, err) 197 | } 198 | 199 | mmap, err := syscall.Mmap(int(file.Fd()), 0, 
int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED) 200 | if err != nil { 201 | log.Fatalf("Can't mmap the knn file %s: %+v", path, err) 202 | } 203 | 204 | return mmap, nil 205 | } 206 | -------------------------------------------------------------------------------- /contextionary/core/similar_words.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package contextionary 12 | 13 | import ( 14 | "regexp" 15 | ) 16 | 17 | const simliarWordsLimit = 15 18 | 19 | func safeGetSimilarWordsFromAny(c11y Contextionary, word string, n, k int) ([]string, []float32) { 20 | i := c11y.WordToItemIndex(word) 21 | if !i.IsPresent() { 22 | return []string{word}, []float32{1} 23 | } 24 | 25 | indices, newCertainties, err := c11y.GetNnsByItem(i, n, k) 26 | if err != nil { 27 | return []string{word}, []float32{1} 28 | } 29 | 30 | var words []string 31 | var certainties []float32 32 | for i, index := range indices { 33 | word, err := c11y.ItemIndexToWord(index) 34 | if err != nil { 35 | continue 36 | } 37 | 38 | if wordHasIllegalCharacters(word) { 39 | continue 40 | } 41 | 42 | words = append(words, word) 43 | certainties = append(certainties, newCertainties[i]) 44 | } 45 | 46 | return words, certainties 47 | } 48 | 49 | func safeGetSimilarWordsWithCertaintyFromAny(c11y Contextionary, word string, certainty float32) []string { 50 | var matchingWords []string 51 | var matchtingCertainties []float32 52 | 53 | count := 0 54 | words, certainties := c11y.SafeGetSimilarWords(word, 100, 32) 55 | for i, word := range words { 56 | if 
count >= simliarWordsLimit { 57 | break 58 | } 59 | 60 | var dist float32 61 | if dist = DistanceToCertainty(certainties[i]); dist < certainty { 62 | continue 63 | } 64 | 65 | count++ 66 | matchingWords = append(matchingWords, alphanumeric(word)) 67 | matchtingCertainties = append(matchtingCertainties, dist) 68 | } 69 | 70 | return matchingWords 71 | } 72 | 73 | func wordHasIllegalCharacters(word string) bool { 74 | // we know that the schema based contextionary uses a leading dollar sign for 75 | // the class and property centroids, so we can easily filter them out 76 | return regexp.MustCompile("^\\$").MatchString(word) 77 | } 78 | 79 | func alphanumeric(word string) string { 80 | return regexp.MustCompile("[^a-zA-Z0-9_]+").ReplaceAllString(word, "") 81 | } 82 | -------------------------------------------------------------------------------- /contextionary/core/similar_words_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package contextionary 12 | 13 | import ( 14 | "testing" 15 | 16 | "github.com/stretchr/testify/assert" 17 | ) 18 | 19 | func TestSimilarWords(t *testing.T) { 20 | 21 | t.Run("with a word that's not in the c11y", func(t *testing.T) { 22 | c := newC11y() 23 | expectedWords := []string{"vehicle"} 24 | 25 | words := c.SafeGetSimilarWordsWithCertainty("vehicle", 0.8) 26 | 27 | assert.Equal(t, expectedWords, words) 28 | }) 29 | 30 | t.Run("with a word thats present and a high certainty", func(t *testing.T) { 31 | c := newC11y() 32 | expectedWords := []string{"car", "automobile"} 33 | 34 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.95) 35 | 36 | assert.Equal(t, expectedWords, words) 37 | }) 38 | 39 | t.Run("with a word thats present and a medium certainty", func(t *testing.T) { 40 | c := newC11y() 41 | expectedWords := []string{"car", "automobile", "airplane"} 42 | 43 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.7) 44 | 45 | assert.Equal(t, expectedWords, words) 46 | }) 47 | 48 | t.Run("with a word thats present and a really low certainty", func(t *testing.T) { 49 | c := newC11y() 50 | expectedWords := []string{"car", "automobile", "airplane", "cabernetsauvignon"} 51 | 52 | words := c.SafeGetSimilarWordsWithCertainty("car", 0.001) 53 | 54 | assert.Equal(t, expectedWords, words) 55 | }) 56 | 57 | } 58 | 59 | func newC11y() Contextionary { 60 | builder := InMemoryBuilder(3) 61 | 62 | builder.AddWord("car", NewVector([]float32{1, 0, 0})) 63 | builder.AddWord("automobile", NewVector([]float32{0.9, 0, 0})) 64 | builder.AddWord("airplane", NewVector([]float32{0.3, 0, 0})) 65 | builder.AddWord("cabernet-sauvignon", NewVector([]float32{0, 0, 10})) 66 | builder.AddWord("$THING[Car]", NewVector([]float32{1, 0, 0})) 67 | 68 | return Contextionary(builder.Build(3)) 69 | } 70 | 
-------------------------------------------------------------------------------- /contextionary/core/stopwords/detector.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package stopwords 12 | 13 | import ( 14 | "encoding/json" 15 | "fmt" 16 | "io/ioutil" 17 | "os" 18 | ) 19 | 20 | // Detector can be used to detect whether a word is a stopword 21 | type Detector struct { 22 | lookup map[string]int 23 | } 24 | 25 | type stopWordDoc struct { 26 | Language string `json:"language"` 27 | Words []string `json:"words"` 28 | } 29 | 30 | // NewFromFile creates an in-memory stopword detector based on a file read once 31 | // at init time 32 | func NewFromFile(path string) (*Detector, error) { 33 | file, err := os.Open(path) 34 | if err != nil { 35 | return nil, fmt.Errorf("could not open file at %s: %v", path, err) 36 | } 37 | 38 | fileBytes, err := ioutil.ReadAll(file) 39 | if err != nil { 40 | return nil, fmt.Errorf("could not read file contents: %v", err) 41 | } 42 | 43 | var doc stopWordDoc 44 | err = json.Unmarshal(fileBytes, &doc) 45 | if err != nil { 46 | return nil, fmt.Errorf("could not unmarshal json: %v", err) 47 | } 48 | 49 | lookup := buildLookupMap(doc.Words) 50 | 51 | return &Detector{ 52 | lookup: lookup, 53 | }, nil 54 | } 55 | 56 | // IsStopWord returns true on stop words, false on all other words 57 | func (d *Detector) IsStopWord(word string) bool { 58 | if _, ok := d.lookup[word]; ok { 59 | return true 60 | } 61 | 62 | return false 63 | } 64 | 65 | func buildLookupMap(words []string) map[string]int { 66 | 
// Opaque type that models a fixed-length vector.
type Vector struct {
	vector []float32
	Source []InputElement
}

// InputElement describes one weighted input concept that contributed to a
// (compound) vector.
type InputElement struct {
	Concept    string
	Weight     float64
	Occurrence uint64
}

// NewVector wraps the raw float slice in a Vector.
func NewVector(vector []float32) Vector {
	return Vector{vector: vector}
}

// Equal reports whether both vectors have identical elements. It returns an
// error if the dimensions differ.
func (v *Vector) Equal(other *Vector) (bool, error) {
	if len(v.vector) != len(other.vector) {
		return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
	}

	for i, val := range v.vector {
		if other.vector[i] != val {
			return false, nil
		}
	}

	return true, nil
}

// EqualEpsilon reports whether both vectors are element-wise equal within
// +/- epsilon. It returns an error if the dimensions differ.
//
// Fixed: the out-of-range check previously used && ("below min AND above
// max"), a condition that can never be true, so EqualEpsilon reported ANY
// two same-length vectors as equal. An element is now correctly rejected
// when it lies below min OR above max.
func (v *Vector) EqualEpsilon(other *Vector, epsilon float32) (bool, error) {
	if len(v.vector) != len(other.vector) {
		return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
	}

	for i, val := range v.vector {
		if other.vector[i] < val-epsilon || other.vector[i] > val+epsilon {
			return false, nil
		}
	}

	return true, nil
}

// Len returns the number of dimensions.
func (v *Vector) Len() int {
	return len(v.vector)
}

// ToString renders the vector as "[x, y, ...]" with six decimal places per
// element.
func (v *Vector) ToString() string {
	str := "["
	for i, val := range v.vector {
		if i > 0 {
			str += ", "
		}
		str += fmt.Sprintf("%.6f", val)
	}

	return str + "]"
}

// ToArray returns a copy of the underlying float slice (nil for an empty
// vector, matching the previous append-based behavior).
func (v *Vector) ToArray() []float32 {
	var returner []float32
	returner = append(returner, v.vector...)
	return returner
}

// Distance returns the euclidean distance between the two vectors. It
// returns an error if the dimensions differ.
func (v *Vector) Distance(other *Vector) (float32, error) {
	if len(v.vector) != len(other.vector) {
		return 0.0, fmt.Errorf("Vectors have different dimensions")
	}

	var sum float32
	for i := 0; i < len(v.vector); i++ {
		x := v.vector[i] - other.vector[i]
		sum += x * x
	}

	return float32(math.Sqrt(float64(sum))), nil
}
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */ 12 | package contextionary 13 | 14 | // //// #include 15 | // //import "C" 16 | 17 | import ( 18 | "bytes" 19 | "encoding/binary" 20 | "encoding/json" 21 | "fmt" 22 | "os" 23 | "sort" 24 | "syscall" 25 | ) 26 | 27 | type Wordlist struct { 28 | vectorWidth uint64 29 | numberOfWords uint64 30 | metadata map[string]interface{} 31 | occurrencePercentiles []uint64 32 | 33 | file os.File 34 | startOfTable int 35 | mmap []byte 36 | } 37 | 38 | func LoadWordlist(path string) (*Wordlist, error) { 39 | file, err := os.Open(path) 40 | if err != nil { 41 | return nil, fmt.Errorf("Can't open the wordlist at %s: %+v", path, err) 42 | } 43 | 44 | file_info, err := file.Stat() 45 | if err != nil { 46 | return nil, fmt.Errorf("Can't stat the wordlist at %s: %+v", path, err) 47 | } 48 | 49 | mmap, err := syscall.Mmap(int(file.Fd()), 0, int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED) 50 | if err != nil { 51 | return nil, fmt.Errorf("Can't mmap the file %s: %+v", path, err) 52 | } 53 | 54 | nrWordsBytes := mmap[0:8] 55 | vectorWidthBytes := mmap[8:16] 56 | metadataLengthBytes := mmap[16:24] 57 | 58 | nrWords := binary.LittleEndian.Uint64(nrWordsBytes) 59 | vectorWidth := binary.LittleEndian.Uint64(vectorWidthBytes) 60 | metadataLength := binary.LittleEndian.Uint64(metadataLengthBytes) 61 | 62 | metadataBytes := mmap[24 : 24+metadataLength] 63 | var metadata map[string]interface{} 64 | 65 | json.Unmarshal(metadataBytes, &metadata) 66 | 67 | // Compute beginning of word list lookup table. 
68 | var start_of_table int = 24 + int(metadataLength) 69 | var offset int = 4 - (start_of_table % 4) 70 | start_of_table += offset 71 | 72 | wl := &Wordlist{ 73 | vectorWidth: vectorWidth, 74 | numberOfWords: nrWords, 75 | metadata: metadata, 76 | startOfTable: start_of_table, 77 | mmap: mmap, 78 | } 79 | 80 | wl.initOccurrencePercentiles() 81 | 82 | return wl, nil 83 | } 84 | 85 | func (w *Wordlist) GetNumberOfWords() ItemIndex { 86 | return ItemIndex(w.numberOfWords) 87 | } 88 | 89 | func (w *Wordlist) OccurrencePercentile(percentile int) uint64 { 90 | if percentile < 0 || percentile > 100 { 91 | panic("incorrect usage of occurrence percentile, must be between 0 and 100") 92 | } 93 | 94 | return w.occurrencePercentiles[percentile] 95 | } 96 | 97 | func (w *Wordlist) FindIndexByWord(_needle string) ItemIndex { 98 | var needle = string([]byte(_needle)) 99 | needle += "\x00" 100 | 101 | var bytes_needle = []byte(needle) 102 | 103 | var low ItemIndex = 0 104 | var high ItemIndex = ItemIndex(w.numberOfWords) - 1 105 | 106 | for low <= high { 107 | var midpoint ItemIndex = (low + high) / 2 108 | 109 | ptr := w.getWordPtr(midpoint) 110 | 111 | // if the last word in the index is shorter than our needle, we would panic 112 | // by accessing a non-existing adress. 
To prevent this, the higher boundary 113 | // can never be higher than the len(index)-1 114 | endPos := 8 + len(bytes_needle) 115 | if endPos >= len(ptr) { 116 | endPos = len(ptr) - 1 117 | } 118 | 119 | // ignore the first 8 bytes as they are reserved for occurrence 120 | word := ptr[8:endPos] 121 | 122 | var cmp = bytes.Compare(bytes_needle, word) 123 | 124 | if cmp == 0 { 125 | return midpoint 126 | } else if cmp < 0 { 127 | high = midpoint - 1 128 | } else { 129 | low = midpoint + 1 130 | } 131 | } 132 | 133 | return -1 134 | } 135 | 136 | func (w *Wordlist) getWordPtr(index ItemIndex) []byte { 137 | entry_addr := ItemIndex(w.startOfTable) + index*8 138 | word_address_bytes := w.mmap[entry_addr : entry_addr+8] 139 | word_address := binary.LittleEndian.Uint64(word_address_bytes) 140 | return w.mmap[word_address:] 141 | } 142 | 143 | func (w *Wordlist) getWord(index ItemIndex) (string, uint64) { 144 | ptr := w.getWordPtr(index) 145 | occurrence := binary.LittleEndian.Uint64(ptr[0:8]) 146 | for i := 8; i < len(ptr); i++ { 147 | if ptr[i] == '\x00' { 148 | return string(ptr[8:i]), occurrence 149 | } 150 | } 151 | 152 | return "", 0 153 | } 154 | 155 | func (w *Wordlist) initOccurrencePercentiles() { 156 | w.occurrencePercentiles = make([]uint64, 101) // make 101 elements longs, so both index 0 and 100 are included 157 | max := int(w.GetNumberOfWords()) 158 | allOccs := make([]uint64, max) 159 | 160 | for i := ItemIndex(0); int(i) < max; i++ { 161 | _, occ := w.getWord(i) 162 | allOccs[i] = occ 163 | } 164 | 165 | sort.Slice(allOccs, func(a, b int) bool { return allOccs[a] < allOccs[b] }) 166 | 167 | for i := 0; i <= 100; i++ { // note that this is 101 elements! 
168 | if i == 0 { 169 | w.occurrencePercentiles[i] = 0 170 | continue 171 | } 172 | 173 | if i == 100 { 174 | w.occurrencePercentiles[i] = allOccs[len(allOccs)-1] 175 | continue 176 | } 177 | 178 | occ := uint64(float64(i) / 100 * float64(len(allOccs))) 179 | w.occurrencePercentiles[i] = occ 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /contextionary/schema/contextionary.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import contextionary "github.com/weaviate/contextionary/contextionary/core" 14 | 15 | // Contextionary composes a regular contextionary with additional 16 | // schema-related query methods 17 | type Contextionary struct { 18 | contextionary.Contextionary 19 | } 20 | 21 | // New creates a new Contextionary from a contextionary.Contextionary which it 22 | // extends with Schema-related search methods 23 | func New(c contextionary.Contextionary) *Contextionary { 24 | return &Contextionary{ 25 | Contextionary: c, 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import ( 14 | "fmt" 15 | "regexp" 16 | "strings" 17 | 18 | "github.com/fatih/camelcase" 19 | pb "github.com/weaviate/contextionary/contextionary" 20 | contextionary "github.com/weaviate/contextionary/contextionary/core" 21 | "github.com/weaviate/contextionary/errors" 22 | ) 23 | 24 | // SearchResult is a single search result. See wrapping Search Results for the Type 25 | type SearchResult struct { 26 | Name string 27 | Certainty float32 28 | } 29 | 30 | // SearchResults is grouping of SearchResults for a SchemaSearch 31 | type SearchResults struct { 32 | Type SearchType 33 | Results []SearchResult 34 | } 35 | 36 | // Len of the result set 37 | func (r SearchResults) Len() int { 38 | return len(r.Results) 39 | } 40 | 41 | // SchemaSearch can be used to search for related classes and properties, see 42 | // documentation of SearchParams for more details on how to use it and 43 | // documentation on *pb.SchemaSearchResults for more details on how to use the return 44 | // value 45 | func (con *Contextionary) SchemaSearch(params *pb.SchemaSearchParams) (*pb.SchemaSearchResults, error) { 46 | p := SearchParams{params} 47 | if err := p.Validate(); err != nil { 48 | return nil, errors.NewInvalidUserInputf("invalid search params: %s", err) 49 | } 50 | 51 | centroid, err := con.centroidFromNameAndKeywords(p) 52 | if err != nil { 53 | return nil, errors.NewInvalidUserInputf("could not build centroid from name and keywords: %s", err) 54 | } 55 | 56 | rawResults, err := con.knnSearch(*centroid) 57 | if err != nil { 58 | return nil, errors.NewInternalf("could not perform knn search: %s", err) 59 | } 60 | 61 | if p.SearchType == pb.SearchType_CLASS { 62 | return con.handleClassSearch(p, rawResults) 63 | } 64 | 65 | // since we have passed validation we know that anything that's not a class 
66 | // search must be a property search 67 | return con.handlePropertySearch(p, rawResults) 68 | } 69 | 70 | func (con *Contextionary) centroidFromNameAndKeywords(p SearchParams) (*contextionary.Vector, error) { 71 | nameVector, err := con.camelCaseWordToVector(p.Name) 72 | if err != nil { 73 | return nil, fmt.Errorf("invalid name in search: %s", err) 74 | } 75 | 76 | if len(p.Keywords) == 0 { 77 | return nameVector, nil 78 | } 79 | 80 | vectors := make([]contextionary.Vector, len(p.Keywords)+1, len(p.Keywords)+1) 81 | weights := make([]float32, len(p.Keywords)+1, len(p.Keywords)+1) 82 | // set last vector to className which always has weight=1 83 | vectors[len(vectors)-1] = *nameVector 84 | weights[len(vectors)-1] = 1 85 | 86 | for i, keyword := range p.Keywords { 87 | kwVector, err := con.wordToVector(keyword.Keyword) 88 | if err != nil { 89 | return nil, fmt.Errorf("invalid keyword in search: %s", err) 90 | } 91 | vectors[i] = *kwVector 92 | weights[i] = keyword.Weight 93 | } 94 | 95 | return contextionary.ComputeWeightedCentroid(vectors, weights) 96 | } 97 | 98 | func (con *Contextionary) camelCaseWordToVector(w string) (*contextionary.Vector, error) { 99 | parts := camelcase.Split(w) 100 | if len(parts) == 1 { 101 | // no camelcasing, no need to build a centroid 102 | return con.wordToVector(w) 103 | } 104 | 105 | vectors := make([]contextionary.Vector, len(parts), len(parts)) 106 | weights := make([]float32, len(parts), len(parts)) 107 | for i, part := range parts { 108 | v, err := con.wordToVector(part) 109 | if err != nil { 110 | return nil, fmt.Errorf("invalid camelCased compound word: %s", err) 111 | } 112 | 113 | vectors[i] = *v 114 | weights[i] = 1 // on camel-casing all parts are weighted equally 115 | } 116 | 117 | return contextionary.ComputeWeightedCentroid(vectors, weights) 118 | } 119 | 120 | func (con *Contextionary) wordToVector(w string) (*contextionary.Vector, error) { 121 | w = strings.ToLower(w) 122 | itemIndex := con.WordToItemIndex(w) 123 
| if ok := itemIndex.IsPresent(); !ok { 124 | return nil, fmt.Errorf( 125 | "the word '%s' is not present in the contextionary and therefore not a valid search term", w) 126 | } 127 | 128 | vector, err := con.GetVectorForItemIndex(itemIndex) 129 | if err != nil { 130 | return nil, fmt.Errorf("could not get vector for word '%s' with itemIndex '%d': %s", 131 | w, itemIndex, err) 132 | } 133 | 134 | return vector, nil 135 | } 136 | 137 | func (con *Contextionary) handleClassSearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) { 138 | return &pb.SchemaSearchResults{ 139 | Type: p.SearchType, 140 | Results: search.extractClassNames(p), 141 | }, nil 142 | } 143 | 144 | func (con *Contextionary) handlePropertySearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) { 145 | return &pb.SchemaSearchResults{ 146 | Type: p.SearchType, 147 | Results: search.extractPropertyNames(p), 148 | }, nil 149 | } 150 | 151 | func (con *Contextionary) knnSearch(vector contextionary.Vector) (rawResults, error) { 152 | list, distances, err := con.GetNnsByVector(vector, 10000, 3) 153 | if err != nil { 154 | return nil, fmt.Errorf("could not get nearest neighbors for vector '%v': %s", vector, err) 155 | } 156 | 157 | results := make(rawResults, len(list), len(list)) 158 | for i := range list { 159 | word, err := con.ItemIndexToWord(list[i]) 160 | if err != nil { 161 | return results, fmt.Errorf("got a result from kNN search, but don't have a word for this index: %s", err) 162 | } 163 | 164 | results[i] = rawResult{ 165 | name: word, 166 | distance: distances[i], 167 | } 168 | } 169 | 170 | return results, nil 171 | } 172 | 173 | // rawResult is a helper struct to contain the results of the kNN-search. It 174 | // does not yet contain the desired output. This means the names can be both 175 | // classes/properties and arbitrary words. 
Furthermore the certainty has not 176 | // yet been normalized , so it is merely the raw kNN distance 177 | type rawResult struct { 178 | name string 179 | distance float32 180 | } 181 | 182 | type rawResults []rawResult 183 | 184 | func (r rawResults) extractClassNames(p SearchParams) []*pb.SchemaSearchResult { 185 | var results []*pb.SchemaSearchResult 186 | regex := regexp.MustCompile(fmt.Sprintf("^\\$%s\\[([A-Za-z]+)\\]$", "OBJECT")) 187 | 188 | for _, rawRes := range r { 189 | if regex.MatchString(rawRes.name) { 190 | certainty := distanceToCertainty(rawRes.distance) 191 | if certainty < p.Certainty { 192 | continue 193 | } 194 | 195 | results = append(results, &pb.SchemaSearchResult{ 196 | Name: regex.FindStringSubmatch(rawRes.name)[1], //safe because we ran .MatchString before 197 | Certainty: certainty, 198 | }) 199 | } 200 | } 201 | 202 | return results 203 | } 204 | 205 | func (r rawResults) extractPropertyNames(p SearchParams) []*pb.SchemaSearchResult { 206 | var results []*pb.SchemaSearchResult 207 | regex := regexp.MustCompile("^\\$[A-Za-z]+\\[[A-Za-z]+\\]\\[([A-Za-z]+)\\]$") 208 | 209 | propsMap := map[string][]*pb.SchemaSearchResult{} 210 | 211 | for _, rawRes := range r { 212 | if regex.MatchString(rawRes.name) { 213 | name := regex.FindStringSubmatch(rawRes.name)[1] //safe because we ran .MatchString before 214 | certainty := distanceToCertainty(rawRes.distance) 215 | if certainty < p.Certainty { 216 | continue 217 | } 218 | 219 | res := &pb.SchemaSearchResult{ 220 | Name: name, 221 | Certainty: certainty, 222 | } 223 | if _, ok := propsMap[name]; !ok { 224 | propsMap[name] = []*pb.SchemaSearchResult{res} 225 | } else { 226 | propsMap[name] = append(propsMap[name], res) 227 | } 228 | } 229 | } 230 | 231 | // now calculate mean of duplicate results 232 | for _, resultsPerName := range propsMap { 233 | results = append(results, &pb.SchemaSearchResult{ 234 | Name: resultsPerName[0].Name, 235 | Certainty: meanCertainty(resultsPerName), 236 | }) 237 | } 
238 | 239 | return results 240 | } 241 | 242 | func meanCertainty(rs []*pb.SchemaSearchResult) float32 { 243 | var compound float32 244 | for _, r := range rs { 245 | compound += r.Certainty 246 | } 247 | 248 | return compound / float32(len(rs)) 249 | } 250 | 251 | func distanceToCertainty(d float32) float32 { 252 | return 1 - d/12 253 | } 254 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search_params.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package schema 12 | 13 | import ( 14 | "fmt" 15 | 16 | "github.com/fatih/camelcase" 17 | pb "github.com/weaviate/contextionary/contextionary" 18 | ) 19 | 20 | // SearchType to search for either class names or property names 21 | type SearchType string 22 | 23 | const ( 24 | // SearchTypeClass to search the contextionary for class names 25 | SearchTypeClass SearchType = "class" 26 | // SearchTypeProperty to search the contextionary for property names 27 | SearchTypeProperty SearchType = "property" 28 | ) 29 | 30 | // SearchParams to be used for a SchemaSearch. 
See individual properties for 31 | // additional documentation on what they do 32 | type SearchParams struct { 33 | *pb.SchemaSearchParams 34 | } 35 | 36 | // Validate the feasibility of the specified arguments 37 | func (p SearchParams) Validate() error { 38 | if p.Name == "" { 39 | return fmt.Errorf("Name cannot be empty") 40 | } 41 | 42 | if err := p.validateCertaintyOrWeight(p.Certainty); err != nil { 43 | return fmt.Errorf("invalid Certainty: %s", err) 44 | } 45 | 46 | if p.SearchType != pb.SearchType_CLASS && p.SearchType != pb.SearchType_PROPERTY { 47 | return fmt.Errorf( 48 | "SearchType must be SearchType_CLASS or SearchType_PROPERTY, but got '%s'", p.SearchType) 49 | } 50 | 51 | for i, keyword := range p.Keywords { 52 | if err := p.validateKeyword(keyword); err != nil { 53 | return fmt.Errorf("invalid keyword at position %d: %s", i, err) 54 | } 55 | } 56 | 57 | return nil 58 | } 59 | 60 | func (p SearchParams) validateKeyword(kw *pb.Keyword) error { 61 | if kw.Keyword == "" { 62 | return fmt.Errorf("Keyword cannot be empty") 63 | } 64 | 65 | if len(camelcase.Split(kw.Keyword)) > 1 { 66 | return fmt.Errorf("invalid Keyword: keywords cannot be camelCased - "+ 67 | "instead split your keyword up into several keywords, this way each word "+ 68 | "of your camelCased string can have its own weight, got '%s'", kw.Keyword) 69 | } 70 | 71 | if err := p.validateCertaintyOrWeight(kw.Weight); err != nil { 72 | return fmt.Errorf("invalid Weight: %s", err) 73 | } 74 | 75 | return nil 76 | } 77 | 78 | func (p SearchParams) validateCertaintyOrWeight(c float32) error { 79 | if c >= 0 && c <= 1 { 80 | return nil 81 | } 82 | 83 | return fmt.Errorf("must be between 0 and 1, but got '%f'", c) 84 | } 85 | -------------------------------------------------------------------------------- /contextionary/schema/schema_search_params_test.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / 
package schema

import (
	"errors"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/weaviate/contextionary/contextionary"
)

// Test__SchemaSearch_Validation drives SearchParams.Validate through a table
// of valid and invalid parameter combinations and asserts the exact error
// value (or nil) returned for each. The expected error strings must match
// Validate's output verbatim.
func Test__SchemaSearch_Validation(t *testing.T) {
	tests := schemaSearchTests{
		// baseline: all required fields present and in range
		{
			name: "valid params",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
				},
			},
			expectedError: nil,
		},
		// Name is mandatory
		{
			name: "missing search name",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "",
					Certainty:  0.0,
				},
			},
			expectedError: errors.New("Name cannot be empty"),
		},
		// Certainty must lie within [0, 1]; out-of-range values are reported
		// with the offending value formatted to six decimals.
		{
			name: "certainty too low",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  -4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '-4.000000'"),
		},
		{
			name: "certainty too high",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '4.000000'"),
		},
		// NOTE(review): no test case in this table ever sets a Kind field,
		// so this case relies on Kind's zero value triggering the error —
		// confirm Validate still checks Kind at all.
		{
			name: "missing kind on class search",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  0.5,
				},
			},
			expectedError: errors.New("Kind cannot be empty"),
		},
		// keyword list validation: each keyword needs a non-empty name and a
		// weight within [0, 1]
		{
			name: "valid keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "foobar",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: nil,
		},
		{
			name: "keywords with empty names",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: Keyword cannot be empty"),
		},
		{
			name: "keywords with invalid weights",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "bestkeyword",
						Weight:  1.3,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Weight: " +
				"must be between 0 and 1, but got '1.300000'"),
		},
		// camelCased keywords are rejected so each word can carry its own weight
		{
			name: "CamelCased keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "worstKeyword",
						Weight:  0.8,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Keyword: " +
				"keywords cannot be camelCased - instead split your keyword up into several keywords, " +
				"this way each word of your camelCased string can have its own weight, got 'worstKeyword'"),
		},
	}

	tests.AssertValidation(t)
}

// AssertValidation runs each case as a named subtest and compares the error
// returned by Validate against the expected error by value.
func (s schemaSearchTests) AssertValidation(t *testing.T) {
	for _, test := range s {
		t.Run(test.name, func(t *testing.T) {
			err := test.searchParams.Validate()

			// assert error
			assert.Equal(t, test.expectedError, err, "should match the expected error")

		})
	}
}
-------------------------------------------------------------------------------- /extensions/extension.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | type Extension struct { 4 | Concept string `json:"concept"` 5 | Vector []float32 `json:"vector"` 6 | Occurrence int `json:"occurrence"` 7 | Input ExtensionInput `json:"input"` 8 | } 9 | 10 | type ExtensionInput struct { 11 | Definition string `json:"definition"` 12 | Weight float32 `json:"weight"` 13 | } 14 | -------------------------------------------------------------------------------- /extensions/looker_upper.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | type LookerUpper struct { 8 | repo RetrieverRepo 9 | sync.Mutex 10 | db map[string]Extension 11 | } 12 | 13 | type RetrieverRepo interface { 14 | // WatchAll must send an immediate response after opening (for 15 | // initializiation), then send another response whenver the db has changed 16 | WatchAll() chan WatchResponse 17 | } 18 | 19 | func NewLookerUpper(repo RetrieverRepo) *LookerUpper { 20 | lu := &LookerUpper{ 21 | repo: repo, 22 | db: map[string]Extension{}, 23 | } 24 | lu.initWatcher() 25 | return lu 26 | } 27 | 28 | func (lu *LookerUpper) Lookup(concept string) (*Extension, error) { 29 | lu.Lock() 30 | defer lu.Unlock() 31 | 32 | ext, ok := lu.db[concept] 33 | if !ok { 34 | return nil, nil 35 | } 36 | 37 | return &ext, nil 38 | } 39 | 40 | type WatchResponse []Extension 41 | 42 | func (lu *LookerUpper) initWatcher() { 43 | updateCh := lu.repo.WatchAll() 44 | 45 | go func() { 46 | for res := range updateCh { 47 | lu.updateDB(res) 48 | } 49 | }() 50 | } 51 | 52 | func (lu *LookerUpper) updateDB(list []Extension) { 53 | lu.Lock() 54 | defer lu.Unlock() 55 | 56 | for _, ext := range list { 57 | lu.db[ext.Concept] = ext 58 | } 59 | } 60 | 
package extensions

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// Test_LookerUpper verifies cache behavior: unknown concepts yield
// (nil, nil), and concepts pushed through the repo's watch channel become
// visible to Lookup.
func Test_LookerUpper(t *testing.T) {
	t.Run("looking up a non-existant concept", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)
		extension, err := lu.Lookup("non_existing_concept")
		require.Nil(t, err)
		assert.Nil(t, extension)
	})

	t.Run("looking up existing concepts", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)

		t.Run("with an initial concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "flux_capacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			// NOTE(review): add() only blocks until the watcher *receives* the
			// snapshot; the sleep gives updateDB time to apply it. This is
			// timing-based synchronization and could flake on a loaded CI box.
			time.Sleep(100 * time.Millisecond)
			actual, err := lu.Lookup("flux_capacitor")
			require.Nil(t, err)
			assert.Equal(t, &ext, actual)
		})

		t.Run("with second concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "clux_fapacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			time.Sleep(100 * time.Millisecond)

			// the second snapshot contains both entries, so the first concept
			// must still resolve
			t.Run("looking up the original concept", func(t *testing.T) {
				actual, err := lu.Lookup("flux_capacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "flux_capacitor", actual.Concept)
			})

			t.Run("looking up the second concept concept", func(t *testing.T) {
				actual, err := lu.Lookup("clux_fapacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "clux_fapacitor", actual.Concept)
			})
		})
	})
}

// newFakeRepo builds a fakeRepo with an unbuffered watch channel.
func newFakeRepo() *fakeRepo {
	repo := &fakeRepo{
		ch: make(chan WatchResponse),
	}

	return repo
}

// fakeRepo is a RetrieverRepo test double that replays its accumulated
// extension list on every add().
type fakeRepo struct {
	ch         chan WatchResponse
	extensions []Extension
}

func (f *fakeRepo) WatchAll() chan WatchResponse {
	return f.ch
}

// add appends ex and pushes the full list to the watcher; the send blocks
// until the LookerUpper goroutine receives it (unbuffered channel).
func (f *fakeRepo) add(ex Extension) {
	f.extensions = append(f.extensions, ex)
	f.ch <- f.extensions
}
36 | Debug("received request to add/replace custom extension") 37 | 38 | err := s.validate(concept, input) 39 | if err != nil { 40 | return errors.NewInvalidUserInputf("invalid extension: %v", err) 41 | } 42 | 43 | vector, err := s.vectorizer.Corpi([]string{input.Definition}, nil) 44 | if err != nil { 45 | return errors.NewInternalf("vectorize definition: %v", err) 46 | } 47 | 48 | concept = s.compound(concept) 49 | 50 | ext := Extension{ 51 | Concept: concept, 52 | Input: input, 53 | Vector: vector.ToArray(), // nil-check can be omitted as vectorizer will return non-nil if err==nil 54 | Occurrence: 1000, // TODO: Improve! 55 | } 56 | 57 | s.logger.WithField("action", "extensions_put_prestore"). 58 | WithField("concept", ext.Concept). 59 | WithField("extension", ext). 60 | Debug("calculated vector, about to store in repo") 61 | 62 | err = s.repo.Put(ctx, ext) 63 | if err != nil { 64 | s.logger.WithField("action", "extensions_store_error"). 65 | WithField("concept", ext.Concept). 66 | Errorf("repo put: %v", err) 67 | return errors.NewInternalf("store extension: %v", err) 68 | } 69 | 70 | s.logger.WithField("action", "extensions_put_poststore"). 71 | WithField("concept", ext.Concept). 72 | Debug("successfully stored extension in repo") 73 | 74 | return nil 75 | } 76 | 77 | func (s *Storer) compound(inp string) string { 78 | parts := strings.Split(inp, " ") 79 | return strings.Join(parts, "_") 80 | } 81 | 82 | func (s *Storer) validate(concept string, input ExtensionInput) error { 83 | if len(concept) < 2 { 84 | return fmt.Errorf("concept must have at least two characters") 85 | } 86 | 87 | for _, r := range concept { 88 | if !unicode.IsLower(r) && !unicode.IsSpace(r) && !unicode.IsNumber(r) { 89 | return fmt.Errorf("concept must be made up of all lowercase letters and/or numbers, " + 90 | "for custom compund words use spaces, e.g. 
package extensions

import (
	"context"
	"fmt"
	"testing"

	"github.com/sirupsen/logrus/hooks/test"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"
	core "github.com/weaviate/contextionary/contextionary/core"
)

// Test_Storer covers Storer.Put's input validation (expected error strings
// are matched verbatim, typos included) plus the two happy paths: a single
// word and a space-separated compound that must be stored snake_cased.
func Test_Storer(t *testing.T) {
	t.Run("with invalid inputs", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		type testCase struct {
			concept     string
			inp         ExtensionInput
			expectedErr error
		}

		tests := []testCase{
			testCase{
				concept:     "lowerAndUpperCase",
				expectedErr: fmt.Errorf("invalid extension: concept must be made up of all lowercase letters and/or numbers, for custom compund words use spaces, e.g. 'flux capacitor'"),
				inp:         inp,
			},
			testCase{
				concept:     "a",
				expectedErr: fmt.Errorf("invalid extension: concept must have at least two characters"),
				inp:         inp,
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: definition cannot be empty"),
				inp:         ExtensionInput{Weight: 1},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: -1, Definition: "foo bar"},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: 3, Definition: "foo bar"},
			},
			testCase{ // TODO: add feature, then remove limitation
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weights below 1 (extending an existing concept) not supported yet - coming soon"),
				inp:         ExtensionInput{Weight: 0.7, Definition: "foo bar"},
			},
		}

		for _, test := range tests {
			t.Run(test.concept, func(t *testing.T) {
				err := s.Put(context.Background(), test.concept, test.inp)
				assert.Equal(t, test.expectedErr.Error(), err.Error())
			})
		}
	})

	t.Run("with valid input (single word)", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "capacitor"
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		// the fakeVectorizer always yields {1,2,3}; occurrence is the fixed
		// placeholder value used by Storer.Put
		expectedExtension := Extension{
			Input:      inp,
			Concept:    concept,
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)

	})

	t.Run("with valid input (compound word)", func(t *testing.T) {
		// this is a special case because users will input their words using
		// spaces, but we store them using snake_case
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "flux capacitor"
		inp := ExtensionInput{
			Definition: "an energy source for cars to travel through time",
			Weight:     1,
		}

		expectedExtension := Extension{
			Input:      inp,
			Concept:    "flux_capacitor",
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)
	})
}

// fakeVectorizer is a Vectorizer double returning a fixed vector.
type fakeVectorizer struct{}

func (f *fakeVectorizer) Corpi(corpi []string, overrides map[string]string) (*core.Vector, error) {
	v := core.NewVector([]float32{1, 2, 3})
	return &v, nil
}

// fakeStorerRepo records Put calls via testify/mock so expectations on the
// exact stored Extension can be asserted.
type fakeStorerRepo struct {
	mock.Mock
}

func (f *fakeStorerRepo) Put(ctx context.Context, ext Extension) error {
	args := f.Called(ext)
	return args.Error(0)
}
github.com/sirupsen/logrus v1.6.0 13 | github.com/stretchr/testify v1.6.1 14 | github.com/syndtr/goleveldb v0.0.0-20180708030551-c4c61651e9e3 15 | google.golang.org/grpc v1.24.0 16 | ) 17 | -------------------------------------------------------------------------------- /logparser/parse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "os" 9 | ) 10 | 11 | type logEntry struct { 12 | Action string `json:"action"` 13 | Words []word `json:"words"` 14 | } 15 | 16 | type word struct { 17 | Occurrence int `json:"occurrence"` 18 | Weight float64 `json:"weight"` 19 | Word string `json:"word"` 20 | } 21 | 22 | func main() { 23 | scanner := bufio.NewScanner(os.Stdin) 24 | var results []logEntry 25 | 26 | for scanner.Scan() { 27 | var current logEntry 28 | err := json.Unmarshal(scanner.Bytes(), ¤t) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | 33 | if current.Action == "debug_vector_weights" { 34 | results = append(results, current) 35 | } 36 | } 37 | 38 | marshalled, err := json.MarshalIndent(results, "", " ") 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | fmt.Print(string(marshalled)) 44 | } 45 | -------------------------------------------------------------------------------- /main/splitter_preprocessor.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/weaviate/contextionary/preprocessing" 8 | ) 9 | 10 | func main() { 11 | if len(os.Args) != 5 { 12 | missing := fmt.Errorf("Missing arguments requires: [.idx, .dic, .aff, output_file]") 13 | panic(missing.Error()) 14 | } 15 | 16 | err := preprocessing.GenerateSplittingDictFile(os.Args[1], os.Args[2], os.Args[3], os.Args[4]) 17 | if err != nil { 18 | panic(err.Error()) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
/prepare_docker_buildx.sh: -------------------------------------------------------------------------------- 1 | docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 2 | docker buildx create --name multiarch --driver docker-container --use 3 | docker buildx inspect --bootstrap 4 | -------------------------------------------------------------------------------- /preprocessing/dictionary_pre_processing.go: -------------------------------------------------------------------------------- 1 | package preprocessing 2 | 3 | import ( 4 | "encoding/binary" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "strings" 10 | ) 11 | 12 | // PreprocessDict temp storage for reading in the index file 13 | type PreprocessDict struct { 14 | dict map[string]int 15 | } 16 | 17 | // GenerateSplittingDictFile from 18 | // 19 | // contextionaryIndexFile binary .idx file containing the words for the specific language 20 | // languageDictionaryFile a hunspell .dic file for the specific language 21 | // languageAffixesFile a hunspell .aff file for the specific language 22 | // to reduce file- and hunspell dependencies for the splitter 23 | func GenerateSplittingDictFile(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string, outputFile string) error { 24 | dict := NewPreprocessDict(contextionaryIndexFile, languageDictionaryFile, languageAffixesFile) 25 | out, err := os.Create(outputFile) 26 | if err != nil { 27 | return err 28 | } 29 | defer out.Close() 30 | 31 | for word, occurrence := range dict.dict { 32 | line := fmt.Sprintf("%s,%v\n", word, occurrence) 33 | _, err := out.Write([]byte(line)) 34 | if err != nil { 35 | return err 36 | } 37 | } 38 | return nil 39 | } 40 | 41 | // NewPreprocessDict from 42 | // 43 | // contextionaryIndexFile binary .idx file containing the words for the specific language 44 | // languageDictionaryFile a hunspell .dic file for the specific language 45 | // languageAffixesFile a hunspell .aff file for 
the specific language 46 | func NewPreprocessDict(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string) *PreprocessDict { 47 | dict := &PreprocessDict{ 48 | dict: make(map[string]int, 1200000), 49 | } 50 | hunspellFilter := Hunspell(languageAffixesFile, languageDictionaryFile) 51 | 52 | err := dict.loadContextionary(contextionaryIndexFile, hunspellFilter) 53 | if err != nil { 54 | panic(err.Error()) 55 | } 56 | return dict 57 | } 58 | 59 | // loadContextionary from binary file 60 | func (cd *PreprocessDict) loadContextionary(path string, filter *Hunhandle) error { 61 | data, readFileErr := ioutil.ReadFile(path) 62 | if readFileErr != nil { 63 | return readFileErr 64 | } 65 | 66 | // File format: 67 | // https://github.com/weaviate/weaviate-vector-generator#wordlist-file-format 68 | nrWordsBytes := data[0:8] 69 | //vectorLengthBytes := data[8:16] 70 | metaDataLengthBytes := data[16:24] 71 | 72 | nrWords := binary.LittleEndian.Uint64(nrWordsBytes) 73 | //vectorLength := binary.LittleEndian.Uint64(vectorLengthBytes) 74 | metaDataLength := binary.LittleEndian.Uint64(metaDataLengthBytes) 75 | 76 | // Read meta data 77 | metaDataBytes := data[24 : 24+metaDataLength] 78 | var metadata map[string]interface{} 79 | unMarshalErr := json.Unmarshal(metaDataBytes, &metadata) 80 | if unMarshalErr != nil { 81 | return unMarshalErr 82 | } 83 | 84 | var startOfTable uint64 = 24 + uint64(metaDataLength) 85 | var offset uint64 = 4 - (startOfTable % 4) 86 | startOfTable += offset 87 | 88 | for wordIndex := uint64(0); wordIndex < nrWords; wordIndex++ { 89 | // entryAddress is the index in the data where the pointer to 90 | // the word is located 91 | entryAddress := startOfTable + 8*wordIndex 92 | pointerToWordByte := data[entryAddress : entryAddress+8] 93 | pointerToWord := binary.LittleEndian.Uint64(pointerToWordByte) 94 | word, occurence := getWordAndOccurence(data, pointerToWord) 95 | // Only add the word if it passes the filter 96 | if 
passesFilter(word, filter) { 97 | cd.dict[word] = int(occurence) 98 | } 99 | } 100 | 101 | return nil 102 | } 103 | 104 | // getWordAndOccurence from the data frame indecated by the pointer 105 | func getWordAndOccurence(data []byte, pointer uint64) (string, uint64) { 106 | ocurrence := binary.LittleEndian.Uint64(data[pointer : pointer+8]) 107 | 108 | pointer = pointer + 8 109 | for i := uint64(0); ; i++ { 110 | if data[pointer+i] == '\x00' { 111 | word := string(data[pointer : pointer+i]) 112 | return word, ocurrence 113 | } 114 | } 115 | } 116 | 117 | // passesFilter if the word is in the dictionary of the given language 118 | func passesFilter(word string, filter *Hunhandle) bool { 119 | inDict := filter.Spell(word) 120 | if inDict { 121 | return true 122 | } 123 | // Check if upper case word 124 | inDict = filter.Spell(strings.Title(word)) 125 | return inDict 126 | } 127 | -------------------------------------------------------------------------------- /preprocessing/dictionary_pre_processing_test.go: -------------------------------------------------------------------------------- 1 | package preprocessing 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/weaviate/contextionary/compoundsplitting" 11 | ) 12 | 13 | func TestPreprocessorSplitterDictFile(t *testing.T) { 14 | // Create the file 15 | outputFile := "test_dict.splitdict" 16 | GenerateSplittingDictFile("../test/compoundsplitting/contextionary.idx", "../test/compoundsplitting/nl_NL.dic", "../test/compoundsplitting/nl_NL.aff", outputFile) 17 | 18 | // Validate the output file 19 | file, err := os.Open(outputFile) 20 | if err != nil { 21 | t.Fail() 22 | } 23 | defer file.Close() 24 | 25 | scanner := bufio.NewScanner(file) 26 | found := false 27 | for scanner.Scan() { 28 | line := scanner.Text() 29 | split := strings.Split(line, ",") 30 | if split[0] == "appellantes" { 31 | found = true 32 | break 33 | } 34 | } 35 | assert.True(t, 
package preprocessing

// NOTE(review): the three #include header names below were lost to text
// extraction in the reviewed copy; restored to the headers this file
// plainly requires (free -> stdlib.h, Hunhandle/Hunspell_* -> hunspell.h).
// Confirm against the original file.

// #cgo linux LDFLAGS: -lhunspell
// #cgo darwin LDFLAGS: -lhunspell-1.7 -L/usr/local/Cellar/hunspell/1.7.0_2/lib
// #cgo darwin CFLAGS: -I/usr/local/Cellar/hunspell/1.7.0_2/include/
//
// #include <stdlib.h>
// #include <stdio.h>
// #include <hunspell/hunspell.h>
import "C"
import (
	"reflect"
	"runtime"
	"sync"
	"unsafe"
)

// Code in this file copied/based on
// https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/hunspell.go
// Original is licensed under "MIT License" Original license located at:
// https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/LICENSE

// Hunhandle wraps a C hunspell handle; the mutex serializes calls into the
// C library for the methods that take it.
type Hunhandle struct {
	handle *C.Hunhandle
	lock   *sync.Mutex
}

// Hunspell creates a handle from a .aff and .dic file path. The C handle is
// released by a GC finalizer, so callers need not free it explicitly.
func Hunspell(affpath string, dpath string) *Hunhandle {

	affpathcs := C.CString(affpath)
	defer C.free(unsafe.Pointer(affpathcs))

	dpathcs := C.CString(dpath)
	defer C.free(unsafe.Pointer(dpathcs))

	h := &Hunhandle{lock: new(sync.Mutex)}
	h.handle = C.Hunspell_create(affpathcs, dpathcs)

	runtime.SetFinalizer(h, func(handle *Hunhandle) {
		C.Hunspell_destroy(handle.handle)
		h.handle = nil
	})

	return h
}

// CArrayToString copies a C array of l C strings into a Go []string.
// NOTE(review): constructing a slice via reflect.SliceHeader like this is
// the pattern the upstream copy used; it is deprecated/unsafe in newer Go
// (prefer unsafe.Slice) — revisit when the module's Go version is raised.
func CArrayToString(c **C.char, l int) []string {

	s := []string{}

	hdr := reflect.SliceHeader{
		Data: uintptr(unsafe.Pointer(c)),
		Len:  l,
		Cap:  l,
	}

	for _, v := range *(*[]*C.char)(unsafe.Pointer(&hdr)) {
		s = append(s, C.GoString(v))
	}

	return s
}

// Suggest returns hunspell's spelling suggestions for word. The C suggestion
// list is copied out and freed before returning.
func (handle *Hunhandle) Suggest(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))

	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_suggest(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}

// Add inserts word into the runtime dictionary.
// NOTE(review): unlike Suggest/Stem/Spell this method does not take the
// lock, and the return value assumes a non-zero C result means failure —
// confirm both against the hunspell API before relying on them.
func (handle *Hunhandle) Add(word string) bool {

	cWord := C.CString(word)
	defer C.free(unsafe.Pointer(cWord))

	var r C.int
	r = C.Hunspell_add(handle.handle, cWord)

	if int(r) != 0 {
		return false
	}

	return true
}

// Stem returns the stem(s) hunspell derives for word.
func (handle *Hunhandle) Stem(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))
	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_stem(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}

// Spell reports whether word is spelled correctly according to the loaded
// dictionary (C result 0 means misspelled).
func (handle *Hunhandle) Spell(word string) bool {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))
	handle.lock.Lock()
	res := C.Hunspell_spell(handle.handle, wordcs)
	handle.lock.Unlock()

	if int(res) == 0 {
		return false
	}
	return true
}
package config

import (
	"fmt"
	"os"
	"strconv"

	"github.com/sirupsen/logrus"
)

// Config is used to load application wide config from the environment
type Config struct {
	logger logrus.FieldLogger

	// required data files (env: KNN_FILE, IDX_FILE, STOPWORDS_FILE)
	KNNFile       string
	IDXFile       string
	StopwordsFile string

	// schema/extension storage wiring; all optional with defaults set in init
	SchemaProviderURL       string
	SchemaProviderKey       string
	ExtensionsPrefix        string
	ExtensionsStorageOrigin string
	ExtensionsStorageMode   string

	ServerPort int

	// vectorization tuning knobs; see init for env var names and defaults
	OccurrenceWeightStrategy           string
	OccurrenceWeightLinearFactor       float32
	MaxCompoundWordLength              int
	MaximumBatchSize                   int
	MaximumVectorCacheSize             int
	NeighborOccurrenceIgnorePercentile int

	// NOTE(review): field name carries a typo ("Compund") but is exported
	// and read elsewhere — renaming would break callers.
	EnableCompundSplitting          bool
	CompoundSplittingDictionaryFile string

	LogLevel string
}

// New Config from the environment. Errors if required env vars can't be found
func New(logger logrus.FieldLogger) (*Config, error) {
	cfg := &Config{logger: logger}
	if err := cfg.init(); err != nil {
		return nil, fmt.Errorf("could not load config from env: %v", err)
	}

	return cfg, nil
}

// init reads every supported env var in a fixed order: the three required
// files first, then optional settings with their defaults. It returns the
// first error encountered (missing required var, unparseable number, or
// out-of-range percentile).
func (c *Config) init() error {
	knn, err := c.requiredString("KNN_FILE")
	if err != nil {
		return err
	}
	c.KNNFile = knn

	idx, err := c.requiredString("IDX_FILE")
	if err != nil {
		return err
	}
	c.IDXFile = idx

	sw, err := c.requiredString("STOPWORDS_FILE")
	if err != nil {
		return err
	}
	c.StopwordsFile = sw

	sp := c.optionalString("SCHEMA_PROVIDER_URL", "")
	c.SchemaProviderURL = sp

	spk := c.optionalString("SCHEMA_PROVIDER_KEY", "/weaviate/schema/state")
	c.SchemaProviderKey = spk

	ep := c.optionalString("EXTENSIONS_PREFIX", "/contextionary/")
	c.ExtensionsPrefix = ep

	extMode := c.optionalString("EXTENSIONS_STORAGE_MODE", "weaviate")
	c.ExtensionsStorageMode = extMode

	extOrigin := c.optionalString("EXTENSIONS_STORAGE_ORIGIN", "")
	c.ExtensionsStorageOrigin = extOrigin

	port, err := c.optionalInt("SERVER_PORT", 9999)
	if err != nil {
		return err
	}
	c.ServerPort = port

	factor, err := c.optionalFloat32("OCCURRENCE_WEIGHT_LINEAR_FACTOR", 0.5)
	if err != nil {
		return err
	}
	c.OccurrenceWeightLinearFactor = factor

	ignorePercentile, err := c.optionalInt("NEIGHBOR_OCCURRENCE_IGNORE_PERCENTILE", 5)
	if err != nil {
		return err
	}

	// the only range-validated numeric setting: must be a percentile
	if ignorePercentile < 0 || ignorePercentile > 100 {
		return fmt.Errorf("minimum relative neighbor occurrence must be a value between 0 and 100, got: %d", ignorePercentile)
	}

	c.NeighborOccurrenceIgnorePercentile = ignorePercentile

	strategy := c.optionalString("OCCURRENCE_WEIGHT_STRATEGY", "log")
	c.OccurrenceWeightStrategy = strategy

	// this should match the underlying vector db file, a smaller value than in
	// the vector file will lead to missing out on compound words, whereas a
	// larger value will lead to unnecessary lookups slowing down the
	// vectorization process
	// NOTE(review): the default of 1 effectively disables compound-word
	// lookups — confirm this is intended as the out-of-the-box behavior.
	compoundLength, err := c.optionalInt("MAX_COMPOUND_WORD_LENGTH", 1)
	if err != nil {
		return err
	}
	c.MaxCompoundWordLength = compoundLength

	batchSize, err := c.optionalInt("MAX_BATCH_SIZE", 200)
	if err != nil {
		return err
	}
	c.MaximumBatchSize = batchSize

	vectorCacheSize, err := c.optionalInt("MAX_VECTORCACHE_SIZE", 10000)
	if err != nil {
		return err
	}
	c.MaximumVectorCacheSize = vectorCacheSize

	c.EnableCompundSplitting = c.optionalBool("ENABLE_COMPOUND_SPLITTING", false)

	// the dict file is only required when splitting is switched on
	if c.EnableCompundSplitting {
		compoundSplittingDictionaryFile, err := c.requiredString("COMPOUND_SPLITTING_DICTIONARY_FILE")
		if err != nil {
			return err
		}
		c.CompoundSplittingDictionaryFile = compoundSplittingDictionaryFile
	}

	loglevel := c.optionalString("LOG_LEVEL", "info")
	c.LogLevel = loglevel

	return nil
}
"log") 107 | c.OccurrenceWeightStrategy = strategy 108 | 109 | // this should match the underlying vector db file, a smaller value than in 110 | // the vector file will lead to missing out on compound words, whereas a 111 | // larger value will lead to unnecessary lookups slowing down the 112 | // vectorization process 113 | compoundLength, err := c.optionalInt("MAX_COMPOUND_WORD_LENGTH", 1) 114 | if err != nil { 115 | return err 116 | } 117 | c.MaxCompoundWordLength = compoundLength 118 | 119 | batchSize, err := c.optionalInt("MAX_BATCH_SIZE", 200) 120 | if err != nil { 121 | return err 122 | } 123 | c.MaximumBatchSize = batchSize 124 | 125 | vectorCacheSize, err := c.optionalInt("MAX_VECTORCACHE_SIZE", 10000) 126 | if err != nil { 127 | return err 128 | } 129 | c.MaximumVectorCacheSize = vectorCacheSize 130 | 131 | c.EnableCompundSplitting = c.optionalBool("ENABLE_COMPOUND_SPLITTING", false) 132 | 133 | if c.EnableCompundSplitting { 134 | compoundSplittingDictionaryFile, err := c.requiredString("COMPOUND_SPLITTING_DICTIONARY_FILE") 135 | if err != nil { 136 | return err 137 | } 138 | c.CompoundSplittingDictionaryFile = compoundSplittingDictionaryFile 139 | } 140 | 141 | loglevel := c.optionalString("LOG_LEVEL", "info") 142 | c.LogLevel = loglevel 143 | 144 | return nil 145 | } 146 | 147 | func (c *Config) optionalInt(varName string, defaultValue int) (int, error) { 148 | value := os.Getenv(varName) 149 | if value == "" { 150 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 151 | varName, defaultValue) 152 | return defaultValue, nil 153 | } 154 | 155 | asInt, err := strconv.Atoi(value) 156 | if err != nil { 157 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s", 158 | varName, value, err) 159 | } 160 | 161 | return asInt, nil 162 | } 163 | 164 | func (c *Config) optionalFloat32(varName string, defaultValue float32) (float32, error) { 165 | value := os.Getenv(varName) 166 | if value == "" { 167 | c.logger.Infof("optional 
var '%s' is not set, defaulting to '%v'", 168 | varName, defaultValue) 169 | return defaultValue, nil 170 | } 171 | 172 | asFloat, err := strconv.ParseFloat(value, 32) 173 | if err != nil { 174 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s", 175 | varName, value, err) 176 | } 177 | 178 | return float32(asFloat), nil 179 | } 180 | 181 | func (c *Config) requiredString(varName string) (string, error) { 182 | value := os.Getenv(varName) 183 | if value == "" { 184 | return "", fmt.Errorf("required variable '%s' is not set", varName) 185 | } 186 | 187 | return value, nil 188 | } 189 | 190 | func (c *Config) optionalString(varName, defaultInput string) string { 191 | value := os.Getenv(varName) 192 | if value == "" { 193 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 194 | varName, defaultInput) 195 | return defaultInput 196 | } 197 | 198 | return value 199 | } 200 | 201 | func (c *Config) optionalBool(varName string, defaultInput bool) bool { 202 | value := os.Getenv(varName) 203 | if value == "" { 204 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'", 205 | varName, defaultInput) 206 | return defaultInput 207 | } 208 | 209 | return value == "true" || value == "1" || value == "on" || value == "enabled" 210 | } 211 | -------------------------------------------------------------------------------- /server/contextionary.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/weaviate/contextionary/compoundsplitting" 8 | 9 | "github.com/weaviate/contextionary/adapters/repos" 10 | core "github.com/weaviate/contextionary/contextionary/core" 11 | "github.com/weaviate/contextionary/contextionary/core/stopwords" 12 | "github.com/weaviate/contextionary/extensions" 13 | ) 14 | 15 | func (s *server) init() error { 16 | s.logger.WithField("config", s.config).Debugf("starting up with this config") 17 | 18 | if err 
:= s.loadRawContextionary(); err != nil { 19 | return err 20 | } 21 | 22 | swDetector, err := stopwords.NewFromFile(s.config.StopwordsFile) 23 | if err != nil { 24 | return err 25 | } 26 | s.stopwordDetector = swDetector 27 | 28 | if err := s.buildContextionary(); err != nil { 29 | return err 30 | } 31 | 32 | var er extensionRepo 33 | var extensionRetriever extensionLookerUpper 34 | 35 | // ExtensionsStorageMode == "weaviate" is now a default storage option 36 | er = repos.NewExtensionsRepo(s.logger, s.config, 1*time.Second) 37 | extensionRetriever = extensions.NewLookerUpper(er) 38 | 39 | compoundSplitter, err := s.initCompoundSplitter() 40 | if err != nil { 41 | return err 42 | } 43 | vectorizer, err := NewVectorizer(s.rawContextionary, s.stopwordDetector, s.config, s.logger, 44 | NewSplitter(), extensionRetriever, compoundSplitter) 45 | if err != nil { 46 | return err 47 | } 48 | 49 | s.vectorizer = vectorizer 50 | s.extensionStorer = extensions.NewStorer(s.vectorizer, er, s.logger) 51 | s.extensionLookerUpper = extensionRetriever 52 | 53 | return nil 54 | } 55 | 56 | func (s *server) loadRawContextionary() error { 57 | c, err := core.LoadVectorFromDisk(s.config.KNNFile, s.config.IDXFile) 58 | if err != nil { 59 | return fmt.Errorf("could not initialize (raw) contextionary: %v", err) 60 | } 61 | 62 | s.rawContextionary = c 63 | return nil 64 | } 65 | 66 | type stopwordDetector interface { 67 | IsStopWord(word string) bool 68 | } 69 | 70 | // any time the schema changes the contextionary needs to be rebuilt. 
71 | func (s *server) buildContextionary() error { 72 | s.combinedContextionary = s.rawContextionary 73 | return nil 74 | } 75 | 76 | func (s *server) initCompoundSplitter() (compoundSplitter, error) { 77 | if s.config.EnableCompundSplitting { 78 | dict, err := compoundsplitting.NewContextionaryDict(s.config.CompoundSplittingDictionaryFile) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return compoundsplitting.NewSplitter(dict), nil 83 | } else { 84 | return compoundsplitting.NewNoopSplitter(), nil 85 | } 86 | } 87 | 88 | type extensionRepo interface { 89 | extensions.RetrieverRepo 90 | extensions.StorerRepo 91 | } 92 | -------------------------------------------------------------------------------- /server/grpc_error.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/weaviate/contextionary/errors" 5 | "google.golang.org/grpc/codes" 6 | "google.golang.org/grpc/status" 7 | ) 8 | 9 | func GrpcErrFromTyped(err error) error { 10 | if err == nil { 11 | return nil 12 | } 13 | 14 | switch err.(type) { 15 | case errors.InvalidUserInput: 16 | return status.Error(codes.InvalidArgument, err.Error()) 17 | case errors.Internal: 18 | return status.Error(codes.Internal, err.Error()) 19 | case errors.NotFound: 20 | return status.Error(codes.NotFound, err.Error()) 21 | default: 22 | return status.Error(codes.Unknown, err.Error()) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /server/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "os" 7 | 8 | "github.com/sirupsen/logrus" 9 | pb "github.com/weaviate/contextionary/contextionary" 10 | core "github.com/weaviate/contextionary/contextionary/core" 11 | "github.com/weaviate/contextionary/extensions" 12 | "github.com/weaviate/contextionary/server/config" 13 | grpc "google.golang.org/grpc" 14 | ) 
15 | 16 | // Version is filled through a build arg 17 | var Version string 18 | 19 | func main() { 20 | server := new() 21 | server.logger.WithField("version", Version).Info() 22 | grpcServer := grpc.NewServer() 23 | pb.RegisterContextionaryServer(grpcServer, server) 24 | lis, err := net.Listen("tcp", fmt.Sprintf(":%d", server.config.ServerPort)) 25 | if err != nil { 26 | server.logger.Errorf("can't listen on port: %s", err) 27 | os.Exit(1) 28 | } 29 | 30 | grpcServer.Serve(lis) 31 | } 32 | 33 | type server struct { 34 | // to be used to serve rpc requests, combination of the raw contextionary 35 | // and the schema 36 | combinedContextionary core.Contextionary 37 | 38 | // initialized at startup, to be used to build the 39 | // schema contextionary 40 | rawContextionary core.Contextionary 41 | 42 | config *config.Config 43 | 44 | logger logrus.FieldLogger 45 | 46 | // ucs 47 | extensionStorer *extensions.Storer 48 | extensionLookerUpper extensionLookerUpper 49 | stopwordDetector stopwordDetector 50 | vectorizer *Vectorizer 51 | } 52 | 53 | // new gRPC server to serve the contextionary 54 | func new() *server { 55 | logger := logrus.New() 56 | logger.SetFormatter(&logrus.JSONFormatter{}) 57 | cfg, err := config.New(logger) 58 | if err != nil { 59 | logger. 60 | WithError(err). 61 | Errorf("cannot start up") 62 | os.Exit(1) 63 | } 64 | 65 | loglevel, err := logrus.ParseLevel(cfg.LogLevel) 66 | if err != nil { 67 | logger. 68 | WithError(err). 69 | Errorf("cannot start up") 70 | os.Exit(1) 71 | } 72 | logger.SetLevel(loglevel) 73 | logger.WithField("log_level", loglevel.String()).Info() 74 | 75 | s := &server{ 76 | config: cfg, 77 | logger: logger, 78 | } 79 | 80 | err = s.init() 81 | if err != nil { 82 | logger. 83 | WithError(err). 
// NewSplitter constructs a ready-to-use Splitter.
func NewSplitter() *Splitter {
	return &Splitter{}
}

// Splitter tokenizes a corpus into words.
type Splitter struct{}

// Split breaks the corpus into tokens, treating every rune that is neither a
// letter nor a number as a separator. Runs of separators never produce empty
// tokens.
func (s *Splitter) Split(corpus string) []string {
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}
	return strings.FieldsFunc(corpus, isSeparator)
}
[]string{"foobar", "baz", "baq"}, 55 | }, 56 | 57 | testcase{ 58 | name: "words containing umlauts (upper and lower)", 59 | input: "Ölpreis über 80 dollar!", 60 | output: []string{"Ölpreis", "über", "80", "dollar"}, 61 | }, 62 | 63 | testcase{ 64 | name: "words containing turkish characters", 65 | input: "Ölpreis über 80 dollar!", 66 | output: []string{"Ölpreis", "über", "80", "dollar"}, 67 | }, 68 | 69 | testcase{ 70 | name: "words containing turkish characters", 71 | input: "Weaviate ayrıca Türkçe konuşabilir", 72 | output: []string{"Weaviate", "ayrıca", "Türkçe", "konuşabilir"}, 73 | }, 74 | 75 | testcase{ 76 | name: "mixed characters including a '<'", 77 | input: "car, car#of,,,,brand 0 { 60 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, "")) 61 | currOperandDigits = nil 62 | } 63 | 64 | // We will eventually append our current operator to the operator stack. 65 | // However, first it must be compared against current operators, if the 66 | // top of the stack has a higher or equal precedence to the current one, 67 | // we will pop that first. 
We continue this pattern until either the 68 | // stack is empty or the topmost element of the stack is of lower 69 | // precedence than the current 70 | for len(operatorStack) > 0 { 71 | topStack := operatorStack[len(operatorStack)-1] 72 | if operatorPrecedence(topStack) < operatorPrecedence(string(r)) { 73 | break 74 | } 75 | 76 | e.parsedStack = append(e.parsedStack, topStack) 77 | operatorStack = operatorStack[:len(operatorStack)-1] 78 | } 79 | operatorStack = append(operatorStack, string(r)) 80 | } 81 | 82 | // in case the expression ends with an operand, we need to check again if the 83 | // temp digit stack still contains elements 84 | if len(currOperandDigits) > 0 { 85 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, "")) 86 | currOperandDigits = nil 87 | } 88 | 89 | // append the remainder of the operatorStack (if any) to the parsed output in 90 | // reverse order 91 | e.parsedStack = append(e.parsedStack, reverseSlice(operatorStack)...) 92 | return nil 93 | } 94 | 95 | func (e *Evaluator) unrecognizedOperator(op string) error { 96 | if op == "(" || op == ")" { 97 | return fmt.Errorf("using parantheses in the expression is not supported") 98 | } 99 | 100 | return fmt.Errorf("unrecognized operator: %s", string(op)) 101 | } 102 | 103 | func (e Evaluator) evaluate() (float64, error) { 104 | var operandStack []float64 105 | for _, item := range e.parsedStack { 106 | if !isOperator(item) { 107 | // not an operator, so it must be an operand 108 | num, err := e.parseNumberOrVariable(item) 109 | if err != nil { 110 | return 0, err 111 | } 112 | 113 | operandStack = append(operandStack, num) 114 | continue 115 | } 116 | 117 | // is an operator 118 | if len(operandStack) < 2 { 119 | return 0, fmt.Errorf("invalid or unsupported math expression") 120 | } 121 | 122 | // note that the topStack is the right operator, whereas topStack-1 is the left! 
123 | op1, op2 := operandStack[len(operandStack)-2], operandStack[len(operandStack)-1] 124 | operandStack = operandStack[:len(operandStack)-2] 125 | 126 | res, err := evaluteOperator(item, op1, op2) 127 | if err != nil { 128 | return 0, err 129 | } 130 | operandStack = append(operandStack, res) 131 | } 132 | 133 | if len(operandStack) != 1 { 134 | return 0, fmt.Errorf("could not evaluate mathematical expression") 135 | } 136 | 137 | return operandStack[0], nil 138 | } 139 | 140 | func evaluteOperator(op string, left, right float64) (float64, error) { 141 | switch op { 142 | case "+": 143 | return left + right, nil 144 | case "-": 145 | return left - right, nil 146 | case "*": 147 | return left * right, nil 148 | case "/": 149 | return left / right, nil 150 | default: 151 | return 0, fmt.Errorf("this should be unreachable - or the implentation of an operator is missing") 152 | } 153 | } 154 | 155 | func isOperator(in string) bool { 156 | switch in { 157 | case "*", "+", "-", "/": 158 | return true 159 | default: 160 | return false 161 | } 162 | } 163 | 164 | // we allow numbers, the dot as a floating point symbol, as well as letters to 165 | // represent variables 166 | func isOperand(r rune) bool { 167 | if unicode.IsLetter(r) || unicode.IsNumber(r) || string(r) == "." 
{ 168 | return true 169 | } 170 | return false 171 | } 172 | 173 | func (e *Evaluator) parseNumberOrVariable(in string) (float64, error) { 174 | r := rune(in[0]) 175 | if unicode.IsNumber(r) { 176 | return strconv.ParseFloat(in, 64) 177 | } else { 178 | if in == "w" { 179 | return e.originalWeight, nil 180 | } 181 | return 0, fmt.Errorf("unrecognized variable '%s', use 'w' to represent original weight", in) 182 | } 183 | } 184 | 185 | func operatorPrecedence(op string) int { 186 | switch op { 187 | case "+", "-": 188 | return 1 189 | case "*", "/": 190 | return 2 191 | default: 192 | return -1 193 | } 194 | } 195 | 196 | // from https://github.com/golang/go/wiki/SliceTricks 197 | func reverseSlice(a []string) []string { 198 | for i := len(a)/2 - 1; i >= 0; i-- { 199 | opp := len(a) - 1 - i 200 | a[i], a[opp] = a[opp], a[i] 201 | } 202 | 203 | return a 204 | } 205 | -------------------------------------------------------------------------------- /server/weight_manipulator_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestWeightManipulator(t *testing.T) { 12 | 13 | type test struct { 14 | originalWeight float64 15 | expression string 16 | expectedResult float64 17 | expectedError error 18 | name string 19 | } 20 | 21 | tests := []test{ 22 | 23 | test{ 24 | originalWeight: 2.0, 25 | expression: "7", 26 | expectedResult: 7.0, 27 | expectedError: nil, 28 | name: "single operand, no operators", 29 | }, 30 | test{ 31 | originalWeight: 2.0, 32 | expression: "17", 33 | expectedResult: 17.0, 34 | expectedError: nil, 35 | name: "single operand, more than one digit", 36 | }, 37 | test{ 38 | originalWeight: 2.0, 39 | expression: "15.662", 40 | expectedResult: 15.662, 41 | expectedError: nil, 42 | name: "single operand, floating point using . 
as decimal", 43 | }, 44 | test{ 45 | originalWeight: 2.0, 46 | expression: "w * 2", 47 | expectedResult: 4.0, 48 | expectedError: nil, 49 | name: "simple multiplication", 50 | }, 51 | test{ 52 | originalWeight: 2.0, 53 | expression: "w * 2 * 3 * 4", 54 | expectedResult: 48.0, 55 | expectedError: nil, 56 | name: "multiplication with several operands", 57 | }, 58 | test{ 59 | originalWeight: 2.0, 60 | expression: "w + 3", 61 | expectedResult: 5.0, 62 | expectedError: nil, 63 | name: "simple addition", 64 | }, 65 | test{ 66 | originalWeight: 2.0, 67 | expression: "w + 3 + 7", 68 | expectedResult: 12.0, 69 | expectedError: nil, 70 | name: "additional with several operands", 71 | }, 72 | test{ 73 | originalWeight: 2.0, 74 | expression: "1+2*3+4", 75 | expectedResult: 11.0, 76 | expectedError: nil, 77 | name: "mixing operators with different precedence", 78 | }, 79 | test{ 80 | originalWeight: 2.0, 81 | expression: "1+2*3-4", 82 | expectedResult: 3.0, 83 | expectedError: nil, 84 | name: "mixing operators with different precedence, including -", 85 | }, 86 | test{ 87 | originalWeight: 2.0, 88 | expression: "1+2/4-4", 89 | expectedResult: -2.5, 90 | expectedError: nil, 91 | name: "mixing operators with different precedence, including /", 92 | }, 93 | test{ 94 | originalWeight: 7.0, 95 | expression: "1+ 2.5/7 * w -4/2", 96 | expectedResult: 1.5, 97 | expectedError: nil, 98 | name: "long expression including all operators", 99 | }, 100 | test{ 101 | originalWeight: 7.0, 102 | expression: "w * w", 103 | expectedResult: 49, 104 | expectedError: nil, 105 | name: "including the weight variable multiple times", 106 | }, 107 | test{ 108 | originalWeight: 7.0, 109 | expression: "2 * (1+3)", 110 | expectedError: fmt.Errorf("using parantheses in the expression is not supported"), 111 | name: "using parantheses", 112 | }, 113 | test{ 114 | originalWeight: 7.0, 115 | expression: "a + b * c", 116 | expectedError: fmt.Errorf("unrecognized variable 'a', use 'w' to represent original 
#!/bin/bash

set -e

# Jump to root directory
cd "$( dirname "${BASH_SOURCE[0]}" )"/..

# set some defaults so we can also run locally; ${VAR:-default} replaces the
# previous repeated `if [ -z ... ]` blocks with the idiomatic form
DOCKER_ORG="${DOCKER_ORG:-semitechnologies}"
DOCKER_REPO="${DOCKER_REPO:-contextionary}"
SOFTWARE_VERSION="${SOFTWARE_VERSION:-local}"
MODEL_VERSION="${MODEL_VERSION:-0.16.0}"
LANGUAGE="${LANGUAGE:-en}"

VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}"

# NOTE(review): there is no separator between ${LANGUAGE} and $VERSION in the
# tag — confirm this matches the tag format produced by build.sh
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION-minimal" c11y-local-journeytest-minimal
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION" c11y-local-journeytest-full

echo "Cleaning up from previous runs"
docker-compose -f ./test/journey/docker-compose.yml down

echo "Starting containers"
docker-compose -f ./test/journey/docker-compose.yml up -d minimal full weaviate

echo "Building tests"
docker-compose -f ./test/journey/docker-compose.yml build test-env

echo "Running tests"
docker-compose -f ./test/journey/docker-compose.yml run test-env go test .
# paths are relative to the docker-compose file, so they point to ./test/journey/ 32 | dockerfile: ./Dockerfile 33 | volumes: 34 | - ./:/testfiles 35 | environment: 36 | DIMENSIONS: "$DIMENSIONS" 37 | 38 | 39 | -------------------------------------------------------------------------------- /test/journey/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/weaviate/contextionary/test/journey 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/stretchr/testify v1.6.1 7 | github.com/weaviate/contextionary v1.1.2-0.20230307155526-f7e24eb73eb0 8 | google.golang.org/grpc v1.24.0 9 | ) 10 | -------------------------------------------------------------------------------- /test/journey/journey_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "strconv" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/assert" 11 | "github.com/stretchr/testify/require" 12 | pb "github.com/weaviate/contextionary/contextionary" 13 | "google.golang.org/grpc" 14 | ) 15 | 16 | var expectedDimensions int 17 | 18 | func init() { 19 | 20 | d, err := strconv.Atoi(os.Getenv("DIMENSIONS")) 21 | if err != nil { 22 | panic(err) 23 | } 24 | 25 | expectedDimensions = d 26 | } 27 | 28 | func Test_Contextionary_Journey(t *testing.T) { 29 | // minimal 30 | connMinimal, err := grpc.Dial("minimal:9999", grpc.WithInsecure()) 31 | if err != nil { 32 | t.Fatalf("couldn't connect to minimal c11y: %s", err) 33 | } 34 | defer connMinimal.Close() 35 | 36 | connFull, err := grpc.Dial("full:9999", grpc.WithInsecure()) 37 | if err != nil { 38 | t.Fatalf("couldn't connect to minimal c11y: %s", err) 39 | } 40 | defer connFull.Close() 41 | 42 | clientMinimal := pb.NewContextionaryClient(connMinimal) 43 | clientFull := pb.NewContextionaryClient(connFull) 44 | 45 | t.Run("the minimal contextionary", func(t *testing.T) { 46 | client := clientMinimal 47 | 48 | 
t.Run("testing words present", func(t *testing.T) { 49 | words := []string{"car", "engine", "automobile", "name"} 50 | 51 | for _, word := range words { 52 | t.Run(word, func(t *testing.T) { 53 | res, err := client.IsWordPresent(context.Background(), &pb.Word{Word: word}) 54 | require.Nil(t, err) 55 | assert.Equal(t, true, res.Present) 56 | }) 57 | } 58 | }) 59 | 60 | t.Run("testing stopwords", func(t *testing.T) { 61 | words := []string{"of", "the"} 62 | 63 | for _, word := range words { 64 | t.Run(word, func(t *testing.T) { 65 | res, err := client.IsWordStopword(context.Background(), &pb.Word{Word: word}) 66 | require.Nil(t, err) 67 | assert.Equal(t, true, res.Stopword) 68 | }) 69 | } 70 | }) 71 | 72 | t.Run("corpi to vector", func(t *testing.T) { 73 | t.Run("only stopwords", func(t *testing.T) { 74 | corpi := []string{"of", "the of"} 75 | _, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi}) 76 | assert.NotNil(t, err) 77 | }) 78 | 79 | t.Run("only stopwords", func(t *testing.T) { 80 | corpi := []string{"car", "car of brand mercedes", "color blue"} 81 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi}) 82 | assert.Nil(t, err) 83 | // TODO: also upgrade minimal one to 600 vectors 84 | assert.Len(t, res.Entries, 300) 85 | }) 86 | 87 | t.Run("two corpi with and without splitting characters should lead to the same vector", func(t *testing.T) { 88 | corpi1 := []string{"car", "car of brand mercedes", "color blue"} 89 | corpi2 := []string{"car,", "car#of,,,,brand maxOcc { 84 | maxOcc = occurrence 85 | } 86 | 87 | occurrences = append(occurrences, occurrence) 88 | presentWords = append(presentWords, word) 89 | } 90 | 91 | } 92 | 93 | // calculate weights by normalizing the occurrences to 0..1 94 | weights := make([]float32, len(occurrences), len(occurrences)) 95 | for i, occ := range occurrences { 96 | // _ = occ 97 | // weights[i] = 1 98 | weight := 1 - float32(occ-minOcc)/float32(maxOcc-minOcc) 99 | weights[i] = 
weight 100 | 101 | // fmt.Printf("%s: %f\n", presentWords[i], weight) 102 | } 103 | 104 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights) 105 | fatal(err) 106 | 107 | // fmt.Printf("%d stop words out of %d removed. %d of the remainder contained\n", stopWords, total, len(vectors)) 108 | 109 | return centroid 110 | 111 | } 112 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/class_vectors/search.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "bytes" 15 | "encoding/json" 16 | "fmt" 17 | "io/ioutil" 18 | "log" 19 | "net/http" 20 | "strings" 21 | 22 | contextionary "github.com/weaviate/contextionary/contextionary/core" 23 | ) 24 | 25 | func searchString(word string, c11y contextionary.Contextionary) { 26 | words := strings.Split(word, " ") 27 | 28 | var usableWords []string 29 | var vectors []contextionary.Vector 30 | var weights []float32 31 | 32 | for _, word := range words { 33 | if isStopWord(word) { 34 | continue 35 | } 36 | 37 | itemIndex := c11y.WordToItemIndex(word) 38 | if ok := itemIndex.IsPresent(); !ok { 39 | log.Fatalf("the word %s is not in the c11y", word) 40 | } 41 | 42 | vector, err := c11y.GetVectorForItemIndex(itemIndex) 43 | if err != nil { 44 | log.Fatalf("could not get vector for word '%s': %v", word, err) 45 | } 46 | 47 | usableWords = append(usableWords, word) 48 | vectors = append(vectors, *vector) 49 | weights = append(weights, 1.0) 50 | } 51 | 52 | stopWordsRatio 
:= float32((len(words) - len(usableWords))) / float32(len(words)) 53 | fmt.Printf("Original Search Term: %s\n", word) 54 | fmt.Printf("After stop word removal: %s (%2.0f%% removed)\n", strings.Join(usableWords, " "), stopWordsRatio*100) 55 | fmt.Printf("\n") 56 | 57 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights) 58 | fatal(err) 59 | 60 | search(centroid.ToArray()) 61 | fmt.Printf("\n\n") 62 | } 63 | 64 | func search(v []float32) { 65 | body := fmt.Sprintf(`{ 66 | "query": { 67 | "function_score": { 68 | "query": { 69 | "bool": { 70 | "filter": { 71 | "match": { 72 | "sampleBoolProp": false 73 | } 74 | } 75 | } 76 | }, 77 | "boost_mode": "replace", 78 | "script_score": { 79 | "script": { 80 | "inline": "binary_vector_score", 81 | "lang": "knn", 82 | "params": { 83 | "cosine": false, 84 | "field": "embedding_vector", 85 | "vector": [ 86 | %s 87 | ] 88 | } 89 | } 90 | } 91 | } 92 | }, 93 | "size": 3 94 | } `, printVector(v)) 95 | 96 | req, _ := http.NewRequest("GET", "http://localhost:9900/documents/_search", bytes.NewReader([]byte(body))) 97 | res, err := (&http.Client{}).Do(req) 98 | if err != nil { 99 | panic(err) 100 | } 101 | 102 | if res.StatusCode != 200 { 103 | bb, _ := ioutil.ReadAll(res.Body) 104 | panic(fmt.Errorf("status is %d: %s", res.StatusCode, bb)) 105 | } 106 | 107 | defer res.Body.Close() 108 | bytes, err := ioutil.ReadAll(res.Body) 109 | if err != nil { 110 | panic(err) 111 | } 112 | 113 | var eres elasticResult 114 | err = json.Unmarshal(bytes, &eres) 115 | if err != nil { 116 | panic(err) 117 | } 118 | 119 | for i, hit := range eres.Hits.Hits { 120 | content := firstChars(hit.Source.Content, 120) 121 | fmt.Printf("\n\tNo: %d\tScore: %2.3f\tName: %s\n\t Content: %s\n", i, hit.Score, hit.Source.Name, content) 122 | } 123 | } 124 | 125 | type elasticResult struct { 126 | Hits elasticHits `json:"hits"` 127 | } 128 | 129 | type elasticHits struct { 130 | Hits []elasticHit `json:"hits"` 131 | } 132 | 133 | type elasticHit 
struct { 134 | Score float32 `json:"_score"` 135 | Source document `json:"_source"` 136 | } 137 | 138 | func firstChars(input string, limit int) string { 139 | if len(input) < limit { 140 | return input 141 | } 142 | return input[:limit] + "..." 143 | } 144 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/comparison/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | root := os.Args[1] 29 | c1Path := root + "/filter-after-glove" 30 | c2Path := root + "/preprocessing" 31 | c3Path := root + "/stopword-removal" 32 | 33 | c1, err := contextionary.LoadVectorFromDisk(c1Path+"/contextionary-en.knn", c1Path+"/contextionary-en.idx") 34 | fatal(err) 35 | 36 | c2, err := contextionary.LoadVectorFromDisk(c2Path+"/contextionary-en.knn", c2Path+"/contextionary-en.idx") 37 | fatal(err) 38 | 39 | c3, err := contextionary.LoadVectorFromDisk(c3Path+"/contextionary-en.knn", c3Path+"/contextionary-en.idx") 40 | fatal(err) 41 | 42 | word := os.Args[2] 43 | c1Dist, c1Words := kNN(word, c1) 44 | c2Dist, c2Words := kNN(word, c2) 45 | c3Dist, c3Words := kNN(word, c3) 46 | 47 | for i := range c1Dist { 48 | fmt.Printf("%f %-15s\t\t\t%f %-15s\t\t\t%f %-15s\n", c1Dist[i], c1Words[i], c2Dist[i], 
c2Words[i], c3Dist[i], c3Words[i]) 49 | } 50 | } 51 | 52 | func kNN(name string, contextionary contextionary.Contextionary) ([]float32, []string) { 53 | itemIndex := contextionary.WordToItemIndex(name) 54 | if ok := itemIndex.IsPresent(); !ok { 55 | fatal(fmt.Errorf("item index for %s is not present", name)) 56 | } 57 | 58 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3) 59 | if err != nil { 60 | fatal(fmt.Errorf("get nns errored: %s", err)) 61 | } 62 | 63 | words := make([]string, len(list), len(list)) 64 | for i := range list { 65 | w, err := contextionary.ItemIndexToWord(list[i]) 66 | if err != nil { 67 | fmt.Printf("error: %s", err) 68 | } 69 | words[i] = w 70 | } 71 | 72 | return distances, words 73 | } 74 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| |\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | c13y, err := contextionary.LoadVectorFromDisk("./tools/dev/contextionary-playground/contextionary.knn", "./tools/dev/contextionary-playground/contextionary.idx") 29 | fatal(err) 30 | 31 | fmt.Println("results before building centroid based on keywords: ") 32 | kNN("city", c13y) 33 | 34 | // Combine contextionaries 35 | contextionaries := []contextionary.Contextionary{c13y} 36 | combined, err := contextionary.CombineVectorIndices(contextionaries) 37 | fatal(err) 38 | 39 | fmt.Println("results after building centroid based on keywords: ") 40 | kNN("ocean", combined) 41 | } 42 | 43 | func kNN(name string, contextionary contextionary.Contextionary) { 44 | itemIndex := contextionary.WordToItemIndex(name) 45 | if ok := itemIndex.IsPresent(); !ok { 46 | fatal(fmt.Errorf("item index for %s is not present", name)) 47 | } 48 | 49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 1000000, 3) 50 | if err != nil { 51 | fatal(fmt.Errorf("get nns errored: %s", err)) 52 | } 53 | 54 | for i := range list { 55 | w, err := contextionary.ItemIndexToWord(list[i]) 56 | if err != nil { 57 | fmt.Printf("error: %s", err) 58 | } 59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /tools/dev/contextionary-playground/schema/main.go: -------------------------------------------------------------------------------- 1 | /* _ _ 2 | *__ _____ __ ___ ___ __ _| |_ ___ 3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 | * \ V V / __/ (_| 
|\ V /| | (_| | || __/ 5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 | * 7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved. 8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE 9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt) 10 | * CONTACT: hello@weaviate.io 11 | */package main 12 | 13 | import ( 14 | "fmt" 15 | "os" 16 | 17 | contextionary "github.com/weaviate/contextionary/contextionary/core" 18 | ) 19 | 20 | func fatal(err error) { 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | } 26 | 27 | func main() { 28 | c11y, err := contextionary.LoadVectorFromDisk("./test/contextionary/example.knn", "./test/contextionary/example.idx") 29 | fatal(err) 30 | 31 | fmt.Println("results before building centroid based on keywords: ") 32 | kNN("city", c11y) 33 | 34 | // Combine contextionaries 35 | contextionaries := []contextionary.Contextionary{c11y} 36 | combined, err := contextionary.CombineVectorIndices(contextionaries) 37 | fatal(err) 38 | 39 | fmt.Println("results after building centroid based on keywords: ") 40 | kNN("ocean", combined) 41 | } 42 | 43 | func kNN(name string, contextionary contextionary.Contextionary) { 44 | itemIndex := contextionary.WordToItemIndex(name) 45 | if ok := itemIndex.IsPresent(); !ok { 46 | fatal(fmt.Errorf("item index for %s is not present", name)) 47 | } 48 | 49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3) 50 | if err != nil { 51 | fatal(fmt.Errorf("get nns errored: %s", err)) 52 | } 53 | 54 | for i := range list { 55 | w, err := contextionary.ItemIndexToWord(list[i]) 56 | if err != nil { 57 | fmt.Printf("error: %s", err) 58 | } 59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /tools/dev/en_test-vectors-small.txt.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weaviate/contextionary/327ffb5f74ff9ede347bd31a8973d79d25fcac9b/tools/dev/en_test-vectors-small.txt.bz2 -------------------------------------------------------------------------------- /tools/dev/gen_simple_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | echo "Unpacking fixture vectors" 5 | rm -f tools/dev/en_test-vectors-small.txt || true 6 | bunzip2 -k tools/dev/en_test-vectors-small.txt.bz2 7 | 8 | # Fake stopword removal by removing the first 10 words. This will become 9 | # obsolete once we have released a new minimal c11y 10 | 11 | # build stopword.json 12 | cat tools/dev/en_test-vectors-small.txt | head | \ 13 | while read -r word _; do echo "$word"; done | jq -nR '[inputs | select(length>0)] | { language: "en", words: . }' > tools/dev/stopwords.json 14 | 15 | # remove stop words 16 | sed -i.bak 1,10d tools/dev/en_test-vectors-small.txt && rm tools/dev/en_test-vectors-small.txt.bak 17 | 18 | if [ -f tools/dev/example.knn ]; then 19 | echo "Fixture contextionary already generated" 20 | else 21 | go run contextionary/core/generator/cmd/generator.go \ 22 | -c tools/dev/en_test-vectors-small.txt \ 23 | -p tools/dev/example 24 | fi 25 | -------------------------------------------------------------------------------- /tools/dev/run.sh: -------------------------------------------------------------------------------- 1 | GO111MODULE=on \ 2 | KNN_FILE="./tools/dev/example.knn" \ 3 | IDX_FILE="./tools/dev/example.idx" \ 4 | STOPWORDS_FILE="./tools/dev/stopwords.json" \ 5 | SCHEMA_PROVIDER_URL="localhost:2379" \ 6 | go run ./server 2>&1 7 | -------------------------------------------------------------------------------- /tools/dev/stopwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "en", 3 | "words": [ 4 | "the", 5 | "of", 6 | "and", 7 | "in", 8 | "to", 9 | "a", 10 | "was", 11 
| "The", 12 | "is", 13 | "for" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /tools/download_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | version="${2}" 7 | 8 | rm -rf ./data && mkdir ./data 9 | 10 | # Download the latest files and remove old ones 11 | for FILE in stopwords.json contextionary.idx contextionary.knn; do 12 | echo "Start Downloading $FILE" && \ 13 | #echo "Downloading url: https://c11y.semi.technology/$version/$language/$FILE" 14 | wget --quiet -O ./data/$FILE "https://c11y.semi.technology/$version/$language/$FILE" && \ 15 | echo "$FILE = done" & 16 | done 17 | 18 | # Wait to finish download 19 | wait 20 | 21 | echo "Done downloading open source contextionary v$VECTORDB_VERSION." 22 | exit 0 23 | -------------------------------------------------------------------------------- /tools/native_build_contextionary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #Download contextionary 4 | LANGUAGE=en 5 | MODEL_VERSION=0.16.0 6 | ./tools/download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION" 7 | 8 | #Build the server 9 | VERSION=1.2.0 10 | CGO_ENABLED=1 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -X main.Version=$VERSION" ./server 11 | 12 | #Generate contextionary 13 | tools/dev/gen_simple_contextionary.sh 14 | 15 | #Preprocess splitter dictionary 16 | /bin/bash ./tools/preprocess_splitter_dict_native_build.sh "$LANGUAGE" "./data/contextionary.idx" 17 | 18 | #Copy files to Alpine image 19 | cp ./contextionary-server $PWD 20 | 21 | #Set environment variables 22 | export KNN_FILE=./data/contextionary.knn 23 | export IDX_FILE=./data/contextionary.idx 24 | export STOPWORDS_FILE=./data/stopwords.json 25 | export COMPOUND_SPLITTING_DICTIONARY_FILE=./data/splitter_dict.csv 26 | 27 | #Run the server 28 | ./contextionary-server 
29 | -------------------------------------------------------------------------------- /tools/preprocess_splitter_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | index_file=${2} 7 | 8 | # Get dictionaries 9 | git clone https://github.com/LibreOffice/dictionaries.git 10 | 11 | aff_file="" 12 | dic_file="" 13 | 14 | if [ "$language" == "en" ]; then 15 | aff_file="/app/dictionaries/en/en_US.aff" 16 | dic_file="/app/dictionaries/en/en_US.dic" 17 | fi 18 | if [ "$language" == "de" ]; then 19 | aff_file="/app/dictionaries/de/de_DE_frami.aff" 20 | dic_file="/app/dictionaries/de/de_DE_frami.dic" 21 | fi 22 | if [ "$language" == "nl" ]; then 23 | aff_file="/app/dictionaries/nl_NL/nl_NL.aff" 24 | dic_file="/app/dictionaries/nl_NL/nl_NL.dic" 25 | fi 26 | if [ "$language" == "it" ]; then 27 | aff_file="/app/dictionaries/it_IT/it_IT.aff" 28 | dic_file="/app/dictionaries/it_IT/it_IT.dic" 29 | fi 30 | if [ "$language" == "cs" ]; then 31 | aff_file="/app/dictionaries/cs_CZ/cs_CZ.aff" 32 | dic_file="/app/dictionaries/cs_CZ/cs_CZ.dic" 33 | fi 34 | 35 | if [ "$aff_file" == "" ]; then 36 | echo "Missing dictionary for preprocessor see preprocess_splitter_dict.sh" 37 | exit 3 38 | fi 39 | 40 | echo "Building dict with:" 41 | go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "/app/data/splitter_dict.csv" 42 | 43 | -------------------------------------------------------------------------------- /tools/preprocess_splitter_dict_native_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | language=${1} 6 | index_file=${2} 7 | 8 | # Get dictionaries 9 | git clone https://github.com/LibreOffice/dictionaries.git 10 | 11 | aff_file="" 12 | dic_file="" 13 | 14 | if [ 
"$language" == "de" ]; then 19 | aff_file="./dictionaries/de/de_DE_frami.aff" 20 | dic_file="./dictionaries/de/de_DE_frami.dic" 21 | fi 22 | if [ "$language" == "nl" ]; then 23 | aff_file="./dictionaries/nl_NL/nl_NL.aff" 24 | dic_file="./dictionaries/nl_NL/nl_NL.dic" 25 | fi 26 | if [ "$language" == "it" ]; then 27 | aff_file="./dictionaries/it_IT/it_IT.aff" 28 | dic_file="./dictionaries/it_IT/it_IT.dic" 29 | fi 30 | if [ "$language" == "cs" ]; then 31 | aff_file="./dictionaries/cs_CZ/cs_CZ.aff" 32 | dic_file="./dictionaries/cs_CZ/cs_CZ.dic" 33 | fi 34 | 35 | if [ "$aff_file" == "" ]; then 36 | echo "Missing dictionary for preprocessor see process_splitter_dict.sh" 37 | exit 3 38 | fi 39 | 40 | echo "Building dict with:" 41 | go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "./data/splitter_dict.csv" 42 | 43 | --------------------------------------------------------------------------------