├── .dockerignore
├── .github
├── CODEOWNERS
└── workflows
│ ├── create-release.yaml
│ └── tests.yaml
├── .gitignore
├── Dockerfile.full
├── Dockerfile.local-vectordb
├── Dockerfile.minimal
├── LICENSE
├── README.md
├── adapters
└── repos
│ └── extensions_weaviate_module.go
├── build.sh
├── client
└── client.go
├── compoundsplitting
├── dictionary.go
├── noop_splitter.go
├── splitter.go
└── splitter_test.go
├── contextionary
├── contextionary.pb.go
├── contextionary.proto
├── core
│ ├── annoyindex
│ │ ├── annoy_test.go
│ │ ├── annoygomodule.h
│ │ ├── annoygomodule_wrap.cxx
│ │ ├── annoyindex.go
│ │ ├── annoylib.h
│ │ └── kissrandom.h
│ ├── centroid.go
│ ├── centroid_test.go
│ ├── certainty.go
│ ├── combined.go
│ ├── combined_simple_test.go
│ ├── component_test.go
│ ├── contextionary.go
│ ├── generator
│ │ ├── cmd
│ │ │ └── generator.go
│ │ └── generator.go
│ ├── indices_test.go
│ ├── memory_index.go
│ ├── mmapped.go
│ ├── similar_words.go
│ ├── similar_words_test.go
│ ├── stopwords
│ │ └── detector.go
│ ├── vector.go
│ └── wordlist.go
└── schema
│ ├── contextionary.go
│ ├── schema_search.go
│ ├── schema_search_params.go
│ ├── schema_search_params_test.go
│ └── schema_search_test.go
├── errors
└── errors.go
├── extensions
├── extension.go
├── looker_upper.go
├── looker_upper_test.go
├── storer.go
└── storer_test.go
├── gen_proto_code.sh
├── go.mod
├── go.sum
├── logparser
└── parse.go
├── main
└── splitter_preprocessor.go
├── prepare_docker_buildx.sh
├── preprocessing
├── dictionary_pre_processing.go
├── dictionary_pre_processing_test.go
├── hunspell.go
└── hunspell_test.go
├── server
├── api.go
├── config
│ └── config.go
├── contextionary.go
├── corpus_vectorizer.go
├── corpus_vectorizer_test.go
├── grpc_error.go
├── server.go
├── splitter.go
├── splitter_test.go
├── weight_manipulator.go
└── weight_manipulator_test.go
├── test
├── compoundsplitting
│ ├── contextionary.idx
│ ├── nl_NL.aff
│ ├── nl_NL.dic
│ └── pre_processed_splitter_dict.csv
├── journey.sh
└── journey
│ ├── Dockerfile
│ ├── docker-compose.yml
│ ├── go.mod
│ ├── go.sum
│ └── journey_test.go
└── tools
├── dev
├── .gitignore
├── contextionary-playground
│ ├── .gitignore
│ ├── class_vectors
│ │ ├── elastic.go
│ │ ├── main.go
│ │ ├── search.go
│ │ ├── stopwords.go
│ │ └── texts.go
│ ├── comparison
│ │ └── main.go
│ ├── main.go
│ └── schema
│ │ └── main.go
├── en_test-vectors-small.txt.bz2
├── gen_simple_contextionary.sh
├── run.sh
└── stopwords.json
├── download_contextionary.sh
├── native_build_contextionary.sh
├── preprocess_splitter_dict.sh
└── preprocess_splitter_dict_native_build.sh
/.dockerignore:
--------------------------------------------------------------------------------
1 | data/
2 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Ci related folders
2 | /.github/ @weaviate/core
3 | build.sh @weaviate/core
4 | prepare_docker_buildx.sh @weaviate/core
5 |
--------------------------------------------------------------------------------
/.github/workflows/create-release.yaml:
--------------------------------------------------------------------------------
name: Create Release

on:
  push:
    tags:
      - '**'

jobs:
  create-release:
    name: Create Release
    if: startsWith(github.ref, 'refs/tags')
    runs-on: ubuntu-latest-4-cores
    strategy:
      matrix:
        include:
          # model versions are quoted so YAML cannot coerce them into numbers
          - language: en
            model_version: "0.16.0"
          - language: nl
            model_version: "0.16.0"
          - language: en
            model_version: "0.14.0"
          - language: nl
            model_version: "0.14.0"
          - language: de
            model_version: "0.14.0"
          - language: cs
            model_version: "0.14.0"
          - language: it
            model_version: "0.14.0"
    env:
      DOCKER_ORG: semitechnologies
      DOCKER_REPO: contextionary
      LANGUAGE: ${{matrix.language}}
      MODEL_VERSION: ${{matrix.model_version}}
    steps:
      - uses: actions/checkout@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        if: ${{ !github.event.pull_request.head.repo.fork }}
        with:
          username: ${{secrets.DOCKER_USERNAME}}
          password: ${{secrets.DOCKER_PASSWORD}}
      - name: Set up Go
        uses: actions/setup-go@v3
        with:
          # quoted: an unquoted 1.19 is a YAML float; a bump to 1.20 would
          # silently be read as 1.2
          go-version: "1.19"
          cache: true
      - name: Build and release
        run: |
          export SOFTWARE_VERSION=${GITHUB_REF##*/}
          set -e
          ./prepare_docker_buildx.sh
          PUSH_MULTIARCH=1 ./build.sh
          echo "Success"
  gh-release:
    name: Create a GitHub Release
    if: startsWith(github.ref, 'refs/tags')
    runs-on: ubuntu-latest
    needs: create-release
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          generate_release_notes: true
          draft: true
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
name: Tests

on:
  push:
    branches:
      - master
    tags:
      - '**'
    paths-ignore:
      - LICENSE
      - README.md
  pull_request:

jobs:
  tests:
    name: Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          # model versions are quoted so YAML cannot coerce them into numbers
          - model_version: "0.16.0"
            dimensions: 300
          - model_version: "0.14.0"
            dimensions: 600
    env:
      DOCKER_ORG: semitechnologies
      DOCKER_REPO: contextionary
      LANGUAGE: en
      SOFTWARE_VERSION: localtest
      MODEL_VERSION: ${{matrix.model_version}}
      DIMENSIONS: ${{matrix.dimensions}}
    steps:
      - uses: actions/checkout@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        if: ${{ !github.event.pull_request.head.repo.fork }}
        with:
          username: ${{secrets.DOCKER_USERNAME}}
          password: ${{secrets.DOCKER_PASSWORD}}
      - name: Set up Go
        uses: actions/setup-go@v3
        with:
          # quoted: an unquoted 1.19 is a YAML float; a bump to 1.20 would
          # silently be read as 1.2
          go-version: "1.19"
          cache: true
      - name: Build and run journey tests
        run: |
          set -e
          docker buildx version
          ./build.sh
          ./test/journey.sh
          echo "Success"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | local-vectordb/
3 |
4 | .idea
5 |
6 | vendor/
--------------------------------------------------------------------------------
/Dockerfile.full:
--------------------------------------------------------------------------------
1 | # vi: ft=Dockerfile
2 |
3 |
4 | FROM golang:1.13 as builder
5 | WORKDIR /app
6 |
7 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git
8 |
9 | COPY ./tools/download_contextionary.sh ./
10 | ARG LANGUAGE
11 | ARG MODEL_VERSION
12 | RUN ./download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION"
13 |
14 | COPY go.mod go.sum ./
15 | RUN go mod download
16 |
17 | COPY . .
18 | ARG VERSION
19 | ARG TARGETARCH
20 |
21 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=$TARGETARCH go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server
22 |
23 | RUN tools/dev/gen_simple_contextionary.sh
24 | RUN mkdir -p ./data
25 |
26 | COPY ./tools/preprocess_splitter_dict.sh ./
27 | RUN /bin/bash preprocess_splitter_dict.sh "$LANGUAGE" "/app/data/contextionary.idx"
28 |
29 |
30 | FROM alpine
31 |
32 | COPY --from=builder /app/data/contextionary.idx /app/data/contextionary.knn /app/data/stopwords.json /app/data/splitter_dict.csv /
33 | COPY --from=builder /app/contextionary-server /
34 |
35 | ENV KNN_FILE=/contextionary.knn
36 | ENV IDX_FILE=/contextionary.idx
37 | ENV STOPWORDS_FILE=/stopwords.json
38 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv
39 |
40 | ENTRYPOINT [ "/contextionary-server" ]
41 |
--------------------------------------------------------------------------------
/Dockerfile.local-vectordb:
--------------------------------------------------------------------------------
1 | # vi: ft=Dockerfile
2 |
3 |
4 | FROM golang:1.13 as builder
5 | WORKDIR /app
6 |
7 | RUN apt-get update && apt-get install -y bzip2 jq
8 |
9 | COPY go.mod go.sum ./
10 | RUN go mod download
11 |
12 | COPY . .
13 | ARG VERSION
14 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server
15 |
16 | RUN tools/dev/gen_simple_contextionary.sh
17 | RUN mkdir -p ./data
18 |
19 |
20 | FROM alpine
21 |
22 | COPY local-vectordb/contextionary.idx local-vectordb/contextionary.knn local-vectordb/stopwords.json /
23 | COPY --from=builder /app/contextionary-server /
24 |
25 | ENV KNN_FILE=/contextionary.knn
26 | ENV IDX_FILE=/contextionary.idx
27 | ENV STOPWORDS_FILE=/stopwords.json
28 |
29 | ENTRYPOINT [ "/contextionary-server" ]
30 |
--------------------------------------------------------------------------------
/Dockerfile.minimal:
--------------------------------------------------------------------------------
1 | # vi: ft=Dockerfile
2 |
3 | FROM golang:1.13 as builder
4 | WORKDIR /app
5 |
6 | RUN apt-get update && apt-get install -y bzip2 jq hunspell libhunspell-dev git
7 |
8 | COPY go.mod go.sum ./
9 | RUN go mod download
10 |
11 | COPY . .
12 | ARG VERSION
13 | RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -extldflags '-static' -X main.Version=$VERSION" ./server
14 |
15 | RUN tools/dev/gen_simple_contextionary.sh
16 | RUN mkdir -p ./data
17 |
18 | COPY ./tools/preprocess_splitter_dict.sh ./
19 | RUN /bin/bash preprocess_splitter_dict.sh "en" "/app/tools/dev/example.idx"
20 |
21 | FROM scratch
22 |
23 | COPY --from=builder /app/tools/dev/example.idx /app/tools/dev/example.knn /app/tools/dev/stopwords.json /app/data/splitter_dict.csv /
24 | COPY --from=builder /app/contextionary-server /
25 |
26 | ENV KNN_FILE=/example.knn
27 | ENV IDX_FILE=/example.idx
28 | ENV STOPWORDS_FILE=/stopwords.json
29 | ENV COMPOUND_SPLITTING_DICTIONARY_FILE=/splitter_dict.csv
30 |
31 | ENTRYPOINT [ "/contextionary-server" ]
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2020, Weaviate B.V.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of the copyright holder nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weaviate Contextionary
2 |
3 | > The contextionary powers the semantic, context-based searches in Weaviate.
4 |
5 | Not intended for stand-alone use. Used by [Weaviate - the ML-first vector
6 | search engine](https://github.com/weaviate/weaviate).
7 |
8 | ## Versioning
9 |
10 | The version tag is `<language><model-version>-v<software-version>`. So for
11 | example the app version `0.1.0` deployed with the [contextionary vector db
12 | version](https://c11y.semi.technology/contextionary.json) `0.6.0` of the
13 | English language will have the version `en0.6.0-v0.1.0`. This also
14 | corresponds to the Docker tag.
15 |
16 | ## Languages
17 |
18 | Currently available languages include:
19 | * `en`
20 | * `de`
21 | * `nl`
22 | * `cs`
23 | * `it`
24 |
25 | Other languages coming soon.
26 |
27 | ## Docker Requirements
28 |
29 | The build pipeline makes use of Docker's `buildx` for multi-arch builds. Make
30 | sure you run a Docker version which supports `buildx` and have run `docker
31 | buildx create --use` at least once.
32 |
33 | ## How to build and test project
34 |
35 | 1. Regenerate schema:
36 |
37 | ```bash
38 | ./gen_proto_code.sh
39 | ```
40 |
41 | 2. Build image:
42 |
43 | ```bash
44 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh
45 | ```
46 |
47 | 3. Run journey tests:
48 |
49 | ```bash
50 | LANGUAGE=en MODEL_VERSION=0.16.0 ./build.sh && DIMENSIONS=300 ./test/journey.sh
51 | ```
52 |
--------------------------------------------------------------------------------
/adapters/repos/extensions_weaviate_module.go:
--------------------------------------------------------------------------------
1 | package repos
2 |
3 | import (
4 | "bufio"
5 | "bytes"
6 | "context"
7 | "encoding/json"
8 | "fmt"
9 | "net/http"
10 | "time"
11 |
12 | "github.com/sirupsen/logrus"
13 | "github.com/weaviate/contextionary/extensions"
14 | "github.com/weaviate/contextionary/server/config"
15 | )
16 |
17 | type ModuleExtensionRepo struct {
18 | client *http.Client
19 | logger logrus.FieldLogger
20 | origin string
21 | watchInterval time.Duration
22 | }
23 |
24 | func NewExtensionsRepo(logger logrus.FieldLogger,
25 | config *config.Config, watchInterval time.Duration) *ModuleExtensionRepo {
26 | client := &http.Client{}
27 | return &ModuleExtensionRepo{
28 | client: client,
29 | logger: logger,
30 | origin: config.ExtensionsStorageOrigin,
31 | watchInterval: watchInterval,
32 | }
33 | }
34 |
35 | func (r *ModuleExtensionRepo) WatchAll() chan extensions.WatchResponse {
36 | returnCh := make(chan extensions.WatchResponse)
37 |
38 | go func() {
39 | t := time.Tick(r.watchInterval)
40 | for {
41 | r.updateConsumers(returnCh)
42 | <-t
43 | }
44 | }()
45 |
46 | return returnCh
47 | }
48 |
49 | func (f *ModuleExtensionRepo) uri(path string) string {
50 | return fmt.Sprintf("%s%s", f.origin, path)
51 | }
52 |
53 | func (r *ModuleExtensionRepo) updateConsumers(returnCh chan extensions.WatchResponse) {
54 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
55 | defer cancel()
56 |
57 | req, err := http.NewRequestWithContext(ctx, "GET",
58 | r.uri("/v1/modules/text2vec-contextionary/extensions-storage/"), nil)
59 | if err != nil {
60 | r.logger.WithField("action", "extensions_retrieve_all").
61 | WithError(err).Error()
62 | return
63 | }
64 |
65 | res, err := r.client.Do(req)
66 | if err != nil {
67 | r.logger.WithField("action", "extensions_retrieve_all").
68 | WithError(err).Error()
69 | return
70 | }
71 |
72 | defer res.Body.Close()
73 | if res.StatusCode > 399 {
74 | r.logger.WithField("action", "extensions_retrieve_all").
75 | WithError(fmt.Errorf("expected status < 399, got %d", res.StatusCode)).
76 | Error()
77 | return
78 | }
79 |
80 | var exts []extensions.Extension
81 | scanner := bufio.NewScanner(res.Body)
82 | for scanner.Scan() {
83 | if err := scanner.Err(); err != nil {
84 | r.logger.WithField("action", "extensions_retrieve_all").
85 | WithError(err).Error()
86 | return
87 | }
88 |
89 | rawExt := scanner.Bytes()
90 | var ext extensions.Extension
91 | err := json.Unmarshal(rawExt, &ext)
92 | if err != nil {
93 | r.logger.WithField("action", "extensions_retrieve_all").
94 | WithError(err).Error()
95 | return
96 | }
97 |
98 | exts = append(exts, ext)
99 | }
100 |
101 | returnCh <- exts
102 | }
103 |
104 | func (r *ModuleExtensionRepo) Put(ctx context.Context, ext extensions.Extension) error {
105 | extBytes, err := json.Marshal(ext)
106 | if err != nil {
107 | return fmt.Errorf("marshal extension to json: %v", err)
108 | }
109 |
110 | req, err := http.NewRequestWithContext(ctx, "PUT", r.uri(fmt.Sprintf(
111 | "/v1/modules/text2vec-contextionary/extensions-storage/%s", ext.Concept)), bytes.NewReader(extBytes))
112 |
113 | res, err := r.client.Do(req)
114 | if err != nil {
115 | return fmt.Errorf("put: %v", err)
116 | }
117 |
118 | defer res.Body.Close()
119 | if res.StatusCode > 399 {
120 | return fmt.Errorf("expected status < 399, got %d", res.StatusCode)
121 | }
122 |
123 | return nil
124 | }
125 |
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | # set some defaults so we can also run locally
6 | if [ -z "$DOCKER_ORG" ]
7 | then
8 | DOCKER_ORG=semitechnologies
9 | fi
10 |
11 | if [ -z "$DOCKER_REPO" ]
12 | then
13 | DOCKER_REPO=contextionary
14 | fi
15 |
16 | if [ -z "$SOFTWARE_VERSION" ]
17 | then
18 | SOFTWARE_VERSION=local
19 | fi
20 |
21 | if [ -z "$MODEL_VERSION" ]
22 | then
23 | MODEL_VERSION=0.16.0
24 | fi
25 |
26 | if [ -z "$LANGUAGE" ]
27 | then
28 | LANGUAGE=en
29 | fi
30 |
31 | VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}"
32 |
33 | if [ -z "$FULL_VERSION_DOCKERFILE" ]
34 | then
35 | FULL_VERSION_DOCKERFILE=Dockerfile.full
36 | fi
37 |
38 | if [ "$PUSH_MULTIARCH" = "1" ]; then
39 | echo "Build and push multi-arch full version"
40 | echo "Build $LANGUAGE:"
41 | full_version="${LANGUAGE}${VERSION}"
42 | docker buildx build --platform=linux/amd64,linux/arm64 \
43 | --push \
44 | -f "$FULL_VERSION_DOCKERFILE" \
45 | --build-arg VERSION="$full_version" \
46 | --build-arg MODEL_VERSION="$MODEL_VERSION" \
47 | --build-arg LANGUAGE="$LANGUAGE" \
48 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" .
49 | else
50 | echo "Build minimal version (english only)"
51 | docker build -f Dockerfile.minimal --build-arg VERSION="$VERSION-minimal" -t "$DOCKER_ORG/$DOCKER_REPO:en$VERSION-minimal" .
52 |
53 | echo "Build single-arch full version"
54 | echo "Build $LANGUAGE:"
55 | full_version="${LANGUAGE}${VERSION}"
56 | docker build \
57 | -f "$FULL_VERSION_DOCKERFILE" \
58 | --build-arg VERSION="$full_version" \
59 | --build-arg MODEL_VERSION="$MODEL_VERSION" \
60 | --build-arg LANGUAGE="$LANGUAGE" \
61 | -t "$DOCKER_ORG/$DOCKER_REPO:$full_version" .
62 | fi
63 |
64 |
65 |
--------------------------------------------------------------------------------
/client/client.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "os"
7 | "strconv"
8 | "strings"
9 |
10 | pb "github.com/weaviate/contextionary/contextionary"
11 | grpc "google.golang.org/grpc"
12 | )
13 |
14 | func help() {
15 | fmt.Println("the following commands are supported:")
16 | fmt.Printf("\n")
17 | fmt.Printf("\t%-15s%s\n", "meta", "Display meta info, such as versions")
18 | fmt.Printf("\t %s\n", "Usage: client meta")
19 | fmt.Printf("\n")
20 | fmt.Printf("\t%-15s%s\n", "word-present", "Check if the word is present in the db or as an extension")
21 | fmt.Printf("\t %s\n", "Usage: client word-present word")
22 | fmt.Printf("\n")
23 | fmt.Printf("\t%-15s%s\n", "word-stopword", "Check if the word is considered a stopword")
24 | fmt.Printf("\t %s\n", "Usage: client word-stopword word")
25 | fmt.Printf("\n")
26 | fmt.Printf("\t%-15s%s\n", "search", "Search for word or property")
27 | fmt.Printf("\t %s\n", "For usage run client search and see instructions from there")
28 | fmt.Printf("\n")
29 | fmt.Printf("\t%-15s%s\n", "similar-words", "Search for similar words within the specified certainty")
30 | fmt.Printf("\t %s\n", "Usage: client similar-words word certainty")
31 | fmt.Printf("\n")
32 | fmt.Printf("\t%-15s%s\n", "extend", "Extend the contextionary with custom concepts")
33 | fmt.Printf("\t %s\n", "Usage: client extend newconcept \"definition of the new concept\"")
34 | fmt.Printf("\n")
35 | fmt.Printf("\t%-15s%s\n", "vectorize", "Vectorize any string")
36 | fmt.Printf("\t %s\n", "Usage: client vectorize \"input string to vectorize\"")
37 | fmt.Printf("\t%-15s%s\n", "multi-vector-for-word", "Vectorize multiple strings")
38 | fmt.Printf("\t %s\n", "Usage: client multi-vector-for-word \"word1 word2 word3 ... wordN\"")
39 | }
40 |
41 | func main() {
42 | conn, err := grpc.Dial("localhost:9999", grpc.WithInsecure())
43 | if err != nil {
44 | fmt.Fprintf(os.Stderr, "couldn't connect: %s", err)
45 | os.Exit(1)
46 | }
47 | defer conn.Close()
48 |
49 | client := pb.NewContextionaryClient(conn)
50 |
51 | args := os.Args[1:]
52 | if len(args) == 0 {
53 | fmt.Fprintf(os.Stderr, "no command provided, try 'word-present'\n")
54 | os.Exit(1)
55 | }
56 |
57 | cmd := args[0]
58 | switch cmd {
59 | case "help":
60 | help()
61 | case "meta", "version":
62 | meta(client, args[1:])
63 | case "word-present":
64 | wordPresent(client, args[1:])
65 | case "word-stopword":
66 | wordStopword(client, args[1:])
67 | case "search":
68 | search(client, args[1:])
69 | case "similar-words":
70 | similarWords(client, args[1:])
71 | case "extend":
72 | extend(client, args[1:])
73 | case "vectorize":
74 | vectorize(client, args[1:])
75 | case "multi-vector-for-word":
76 | multiVecForWord(client, args[1:])
77 |
78 | default:
79 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd)
80 | os.Exit(1)
81 | }
82 | }
83 | func meta(client pb.ContextionaryClient, args []string) {
84 | ctx := context.Background()
85 |
86 | res, err := client.Meta(ctx, &pb.MetaParams{})
87 | if err != nil {
88 | fmt.Fprintf(os.Stderr, "ERROR: couldn't display meta: %s", err)
89 | os.Exit(1)
90 | }
91 |
92 | fmt.Printf("%#v\n", res)
93 | }
94 |
95 | func wordPresent(client pb.ContextionaryClient, args []string) {
96 | if len(args) == 0 {
97 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n")
98 | os.Exit(1)
99 | }
100 |
101 | ctx := context.Background()
102 |
103 | for _, word := range args {
104 | res, err := client.IsWordPresent(ctx, &pb.Word{Word: word})
105 | if err != nil {
106 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err)
107 | os.Exit(1)
108 | }
109 | if res.Present {
110 | fmt.Printf("word '%s' is present in the contextionary\n", word)
111 | } else {
112 | fmt.Printf("word '%s' is NOT present in the contextionary\n", word)
113 | }
114 | }
115 | }
116 |
117 | func similarWords(client pb.ContextionaryClient, args []string) {
118 | var word string
119 | var certainty float32
120 |
121 | if len(args) == 0 {
122 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to find similarities to\n")
123 | os.Exit(1)
124 | }
125 | word = args[0]
126 |
127 | if len(args) == 1 {
128 | fmt.Fprintf(os.Stderr, "need at least one other argument: the minimum required certainty\n")
129 | os.Exit(1)
130 | }
131 |
132 | c, err := strconv.ParseFloat(args[1], 32)
133 | if err != nil {
134 | fmt.Fprintf(os.Stderr, "couldnt parse certainty: %v\n", err)
135 | os.Exit(1)
136 | }
137 | certainty = float32(c)
138 |
139 | res, err := client.SafeGetSimilarWordsWithCertainty(context.Background(), &pb.SimilarWordsParams{
140 | Certainty: certainty,
141 | Word: word,
142 | })
143 | if err != nil {
144 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get similar words: %s", err)
145 | os.Exit(1)
146 | }
147 |
148 | for _, word := range res.Words {
149 | fmt.Printf("🥳 %s\n", word.Word)
150 | }
151 | }
152 | func extend(client pb.ContextionaryClient, args []string) {
153 | if len(args) != 2 {
154 | fmt.Fprintf(os.Stderr, "need two arguments, the concept to add/extend and its definition\n")
155 | os.Exit(1)
156 | }
157 | concept := args[0]
158 | definition := strings.ToLower(args[1])
159 |
160 | _, err := client.AddExtension(context.Background(), &pb.ExtensionInput{
161 | Concept: concept,
162 | Definition: definition,
163 | Weight: 1,
164 | })
165 | if err != nil {
166 | fmt.Fprintf(os.Stderr, "ERROR: %s", err)
167 | os.Exit(1)
168 | } else {
169 | fmt.Fprintf(os.Stdout, "Success!")
170 | os.Exit(0)
171 | }
172 | }
173 |
174 | func vectorize(client pb.ContextionaryClient, args []string) {
175 | if len(args) != 1 {
176 | fmt.Fprintf(os.Stderr, "need one argument: the input string to vectorize")
177 | os.Exit(1)
178 | }
179 | input := args[0]
180 |
181 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{
182 | Corpi: []string{input},
183 | })
184 | if err != nil {
185 | fmt.Fprintf(os.Stderr, "ERROR: %s", err)
186 | os.Exit(1)
187 | } else {
188 | fmt.Fprintf(os.Stdout, "Success: %v\n", res.Entries)
189 | fmt.Fprintf(os.Stdout, "Source: %v\n", res.Source)
190 | os.Exit(0)
191 | }
192 | }
193 |
194 | func multiVecForWord(client pb.ContextionaryClient, args []string) {
195 | if len(args) < 1 {
196 | fmt.Fprintf(os.Stderr, "need at least one argument: the input word to vectorize")
197 | os.Exit(1)
198 | }
199 |
200 | words := make([]*pb.Word, len(args))
201 | for i, word := range args {
202 | words[i] = &pb.Word{Word: word}
203 | }
204 |
205 | res, err := client.MultiVectorForWord(context.Background(), &pb.WordList{
206 | Words: words,
207 | })
208 | if err != nil {
209 | fmt.Fprintf(os.Stderr, "ERROR: %s", err)
210 | os.Exit(1)
211 | } else {
212 | fmt.Fprintf(os.Stdout, "Success: %v", res.Vectors)
213 | os.Exit(0)
214 | }
215 | }
216 |
217 | func wordStopword(client pb.ContextionaryClient, args []string) {
218 | if len(args) == 0 {
219 | fmt.Fprintf(os.Stderr, "need at least one other argument: the word you want to check\n")
220 | os.Exit(1)
221 | }
222 |
223 | ctx := context.Background()
224 |
225 | for _, word := range args {
226 | res, err := client.IsWordStopword(ctx, &pb.Word{Word: word})
227 | if err != nil {
228 | fmt.Fprintf(os.Stderr, "ERROR: couldn't get word: %s", err)
229 | os.Exit(1)
230 | }
231 | if res.Stopword {
232 | fmt.Printf("word '%s' is a stopword\n", word)
233 | } else {
234 | fmt.Printf("word '%s' is not a stopword\n", word)
235 | }
236 | }
237 | }
238 |
239 | func search(client pb.ContextionaryClient, args []string) {
240 | if len(args) == 0 {
241 | fmt.Fprintf(os.Stderr, "need at least one other argument: either 'class' or 'property' \n")
242 | os.Exit(1)
243 | }
244 |
245 | cmd := args[0]
246 | switch cmd {
247 | case "class":
248 | searchClass(client, args[1:])
249 | default:
250 | fmt.Fprintf(os.Stderr, "unknown command '%s'\n", cmd)
251 | os.Exit(1)
252 | }
253 | }
254 |
255 | func searchClass(client pb.ContextionaryClient, args []string) {
256 | if len(args) == 0 {
257 | fmt.Fprintf(os.Stderr, "need at least one other argument the search term\n")
258 | os.Exit(1)
259 | }
260 |
261 | if len(args) == 1 {
262 | fmt.Fprintf(os.Stderr, "need at least one other argument the desired certainty\n")
263 | os.Exit(1)
264 | }
265 |
266 | searchTerm := args[0]
267 | certainty, err := strconv.ParseFloat(args[1], 32)
268 | if err != nil {
269 | fmt.Fprintf(os.Stderr, "cannot parse certainty '%s'\n", args[1])
270 | os.Exit(1)
271 | }
272 |
273 | params := &pb.SchemaSearchParams{
274 | Certainty: float32(certainty),
275 | Name: searchTerm,
276 | }
277 |
278 | ctx := context.Background()
279 | res, err := client.SchemaSearch(ctx, params)
280 | if err != nil {
281 | fmt.Fprintf(os.Stderr, "schema search failed: %s", err)
282 | os.Exit(1)
283 | }
284 |
285 | if len(res.Results) == 0 {
286 | fmt.Println("😵 nothing found")
287 | }
288 |
289 | for _, class := range res.Results {
290 | fmt.Printf("🥳 %s (Certainty: %f)\n", class.Name, class.Certainty)
291 | }
292 | }
293 |
--------------------------------------------------------------------------------
/compoundsplitting/dictionary.go:
--------------------------------------------------------------------------------
1 | package compoundsplitting
2 |
import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)
9 |
10 |
11 | // Dictionary filter for the splitting algorithm
12 | // based on the words in the contextionary
13 | type ContextionaryDict struct {
14 | dict map[string]int // storing the word and its occurrence
15 | }
16 |
17 | // NewContextionaryDict
18 | // uses a dictionary file that was created using the preprocessing procedures
19 | func NewContextionaryDict(contextionaryDictFile string) (*ContextionaryDict, error) {
20 | file, err := os.Open(contextionaryDictFile)
21 | if err != nil {
22 | return nil, err
23 | }
24 | defer file.Close()
25 |
26 | dict := &ContextionaryDict{
27 | dict: make(map[string]int, 400000),
28 | }
29 |
30 | scanner := bufio.NewScanner(file)
31 | for scanner.Scan() {
32 | line := scanner.Text()
33 | split := strings.Split(line, ",")
34 | occurrence, err := strconv.Atoi(split[1])
35 | if err != nil {
36 | return nil, err
37 | }
38 | dict.dict[split[0]] = occurrence
39 | }
40 |
41 | return dict, nil
42 | }
43 |
44 | // Contains true if word is in contextionary
45 | func (cd *ContextionaryDict) Contains(word string) bool {
46 | _, exists := cd.dict[word]
47 | return exists
48 | }
49 |
50 | //Score prefers long and few words
51 | func (cd *ContextionaryDict) Score(phrase []string) float64 {
52 | // Prefer longer words as scoring
53 | // Assumption is that the compound words are on average more similar to splittings that
54 | // share most of the characters with the compound.
55 | lenScore := 0
56 | for _, word := range phrase {
57 | lenScore += len(word)
58 | }
59 |
60 | // Give a boost for less words
61 | if len(phrase) == 2 {
62 | lenScore += 3
63 | }
64 | if len(phrase) == 3 {
65 | lenScore += 1
66 | }
67 |
68 | return float64(lenScore)
69 | }
70 |
71 |
72 | // DictMock used for unit testing
73 | type DictMock struct {
74 | scores map[string]float64
75 | }
76 |
77 | // Contains
78 | func (dm *DictMock) Contains(word string) bool {
79 | _, exists := dm.scores[word]
80 | return exists
81 | }
82 |
83 | // Score
84 | func (dm *DictMock) Score(phrase []string) float64 {
85 | score := 0.0
86 | for _, word := range phrase {
87 | score += dm.scores[word]
88 | }
89 | return score
90 | }
91 |
--------------------------------------------------------------------------------
/compoundsplitting/noop_splitter.go:
--------------------------------------------------------------------------------
1 | package compoundsplitting
2 |
3 | type NoopSplitter struct{}
4 |
5 | func NewNoopSplitter() NoopSplitter {
6 | return NoopSplitter{}
7 | }
8 |
9 | func (n NoopSplitter) Split(words string) ([]string, error) {
10 | return []string{}, nil
11 | }
12 |
--------------------------------------------------------------------------------
/compoundsplitting/splitter.go:
--------------------------------------------------------------------------------
1 | package compoundsplitting
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "time"
7 | )
8 |
// minCompoundWordLength prevents the splitting into very small (often not real) words
// to prevent a bloated tree
const minCompoundWordLength = 4

// maxWordLength prevents a tree from growing too big when adding very long strings
const maxWordLength = 100

// maxNumberTreeNodes caps how many dictionary hits may be inserted into the
// split tree for one input word before splitting stops (performance guard).
const maxNumberTreeNodes = 20

// cancelSplittingAfter bounds the total time spent building and walking the
// split tree for a single word.
const cancelSplittingAfter = 500 * time.Millisecond

// Dictionary provides vocabulary membership checks and phrase scoring.
type Dictionary interface {
	// Score receives a phrase of words and gives a score on how "good" this phrase is.
	// If a compound word can be splitted into multiple phrases it will choose the one with the highest score.
	Score(phrase []string) float64
	// Contains is true if the word is in the dictionary
	Contains(word string) bool
}
28 |
// Splitter builds a tree of compound splits and selects
// the best option based on a scoring mechanism.
type Splitter struct {
	dict        Dictionary    // vocabulary and scoring backend
	cancelAfter time.Duration // time budget for a single Split call
}
35 |
// NewSplitter returns a Splitter recognizing words given by dict and
// selecting split phrases based on the dictionary's scoring.
func NewSplitter(dict Dictionary) *Splitter {
	return &Splitter{
		dict:        dict,
		cancelAfter: cancelSplittingAfter,
	}
}
44 |
// CompoundSplit holds the forest of candidate compound combinations built
// up while scanning a single input word.
type CompoundSplit struct {
	// Combinations of compound combinations in a phrase; each entry is the
	// root of one candidate tree.
	combinations []*Node
}
49 |
50 | // Split a compound word into its compounds
51 | func (sp *Splitter) Split(word string) ([]string, error) {
52 |
53 | if len(word) > maxWordLength {
54 | return []string{}, nil
55 | }
56 |
57 | compoundSplit := CompoundSplit{}
58 |
59 | // spawn a new context that cancels the recursion if we are spending too much
60 | // time on it
61 | ctx, cancel := context.WithTimeout(context.Background(), sp.cancelAfter)
62 | defer cancel()
63 |
64 | err := sp.findAllWordCombinations(ctx, word, &compoundSplit)
65 | if err != nil {
66 | return nil, err
67 | }
68 | combinations := compoundSplit.getAllWordCombinations(ctx)
69 | maxScore := 0.0
70 | maxPhrase := []string{}
71 | for _, combination := range combinations {
72 | currentScore := sp.dict.Score(combination)
73 | if len(maxPhrase) == 0 {
74 | // Initialize if score is negative
75 | maxScore = currentScore
76 | maxPhrase = combination
77 | }
78 | if currentScore > maxScore {
79 | maxScore = currentScore
80 | maxPhrase = combination
81 | }
82 | }
83 | return maxPhrase, nil
84 | }
85 |
86 | func (cs *CompoundSplit) insertCompound(ctx context.Context, word string,
87 | startIndex int) error {
88 | compound := NewNode(word, startIndex)
89 | appended := false
90 | for _, combination := range cs.combinations {
91 | // For all possible combinations
92 |
93 | leaves := combination.RecursivelyFindLeavesBeforeIndex(ctx, startIndex)
94 | for _, leave := range leaves {
95 | // Append the new compound to the leaves
96 |
97 | appended = true
98 | err := leave.AddChild(compound)
99 | if err != nil {
100 | return err
101 | }
102 | }
103 | }
104 | if !appended {
105 | // if compound was not added to any leave add it to combinations
106 | cs.combinations = append(cs.combinations, compound)
107 | }
108 | return nil
109 | }
110 |
111 | func (sp *Splitter) findAllWordCombinations(ctx context.Context, str string, compoundSplit *CompoundSplit) error {
112 | compoundsUsed := 0
113 | for offset, _ := range str {
114 | // go from left to right and choose offsetted substring
115 | offsetted := str[offset:]
116 |
117 | for i := 1; i <= len(offsetted); i++ {
118 | // go from left to right to find a word
119 | word := offsetted[:i]
120 | if len(word) < minCompoundWordLength {
121 | continue
122 | }
123 |
124 | if sp.dict.Contains(word) {
125 | compoundsUsed += 1
126 | if compoundsUsed == maxNumberTreeNodes {
127 | // Tree is getting out of bounds stopping for performance
128 | return nil
129 | }
130 | err := compoundSplit.insertCompound(ctx, word, offset)
131 | if err != nil {
132 | return err
133 | }
134 | }
135 | }
136 | }
137 | return nil
138 | }
139 |
140 | func (cs *CompoundSplit) getAllWordCombinations(ctx context.Context) [][]string {
141 | wordCombinations := [][]string{}
142 |
143 | for _, combination := range cs.combinations {
144 | wordCombinations = append(wordCombinations,
145 | combination.RecursivelyBuildNames(ctx)...)
146 | }
147 |
148 | return wordCombinations
149 | }
150 |
// Node of the word tree. Each node covers the byte range
// [startIndex, endIndex) of the original compound word.
type Node struct {
	name       string
	children   []*Node
	startIndex int // inclusive
	endIndex   int // exclusive
}
158 |
// NewNode creates a tree node for a compound starting at startIndex in
// the full word; endIndex is derived from the compound's byte length.
func NewNode(name string, startIndex int) *Node {
	return &Node{
		name:       name,
		children:   []*Node{},
		startIndex: startIndex,
		endIndex:   startIndex + len(name),
	}
}
168 |
169 | // AddChild node to node
170 | func (node *Node) AddChild(newChildNode *Node) error {
171 | if newChildNode.startIndex < node.endIndex {
172 | return fmt.Errorf("Child starts at %v but this node ends at %v can't add as child", newChildNode.startIndex, node.endIndex)
173 | }
174 | node.children = append(node.children, newChildNode)
175 | return nil
176 | }
177 |
178 | func (node *Node) findChildNodesBeforeIndex(index int) []*Node {
179 | childrensThatEndBeforeIndex := []*Node{}
180 |
181 | for _, child := range node.children {
182 | if child.endIndex <= index {
183 | childrensThatEndBeforeIndex = append(childrensThatEndBeforeIndex, child)
184 | }
185 | }
186 |
187 | return childrensThatEndBeforeIndex
188 | }
189 |
// RecursivelyBuildNames returns every phrase (list of compound names)
// reachable by walking from this node down to each leaf. If the context
// has expired, recursion is cut short and only this node's own name is
// returned.
func (node *Node) RecursivelyBuildNames(ctx context.Context) [][]string {
	compoundName := [][]string{}
	if ctx.Err() != nil {
		// we've been going recursively too long, abort!
		compoundName = append(compoundName, []string{node.name})
		return compoundName
	}

	for _, child := range node.children {
		childNames := child.RecursivelyBuildNames(ctx)

		for _, childName := range childNames {
			// Add the name of this node first
			fullName := []string{node.name}
			fullName = append(fullName, childName...)
			compoundName = append(compoundName, fullName)
		}
	}
	if len(compoundName) == 0 {
		// This is a leaf node
		compoundName = append(compoundName, []string{node.name})
	}

	return compoundName
}
216 |
// RecursivelyFindLeavesBeforeIndex returns the deepest nodes in this
// subtree whose byte range ends at or before index — the attachment
// points for a new compound starting at index. If the context has
// expired, the search aborts and returns no leaves.
func (node *Node) RecursivelyFindLeavesBeforeIndex(ctx context.Context, index int) []*Node {
	foundLeaves := []*Node{}
	if ctx.Err() != nil {
		// we've been going recursively too long, abort!
		return foundLeaves
	}

	children := node.findChildNodesBeforeIndex(index)
	for _, child := range children {
		leaves := child.RecursivelyFindLeavesBeforeIndex(ctx, index)
		if len(leaves) == 0 {
			// There are no leaves, which means the child node is itself a leaf
			foundLeaves = append(foundLeaves, child)
		} else {
			// Found leaves, use them instead of the direct child
			foundLeaves = append(foundLeaves, leaves...)
		}
	}

	if len(foundLeaves) == 0 && node.endIndex <= index {
		// This node is the leaf
		foundLeaves = append(foundLeaves, node)
	}

	return foundLeaves
}
244 |
245 | // NewEmptyTestSplitter creates a splitter,
246 | // that does not know any words and
247 | // thus is not able to split any words
248 | func NewEmptyTestSplitter() *Splitter {
249 | dictMock := &DictMock{
250 | scores: map[string]float64{},
251 | }
252 | return &Splitter{
253 | dict: dictMock,
254 | }
255 | }
256 |
257 | func NewTestSplitter(wordScoreMapping map[string]float64) *Splitter {
258 | dict := &DictMock{
259 | scores: wordScoreMapping,
260 | }
261 | return &Splitter{
262 | dict: dict,
263 | }
264 | }
265 |
--------------------------------------------------------------------------------
/compoundsplitting/splitter_test.go:
--------------------------------------------------------------------------------
1 | package compoundsplitting
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "testing"
7 | "time"
8 |
9 | "github.com/stretchr/testify/assert"
10 | "github.com/stretchr/testify/require"
11 | )
12 |
// TestSplitTreeSplitter exercises tree building and best-phrase selection
// end to end on a Dutch compound word.
func TestSplitTreeSplitter(t *testing.T) {
	dictMock := &DictMock{
		scores: map[string]float64{
			"drie":     2.0,
			"hoek":     2.0,
			"brood":    4.0,
			"driehoek": 5.0,
			"broodje":  5.0,
		},
	}

	ts := Splitter{
		dict:        dictMock,
		cancelAfter: 500 * time.Millisecond,
	}

	// expected combinations:
	// drie hoek brood
	//           broodje
	// driehoek brood
	//          broodje

	cs := CompoundSplit{}

	ts.findAllWordCombinations(context.Background(), "driehoeksbroodje", &cs)

	combinations := cs.getAllWordCombinations(context.Background())
	assert.Equal(t, 4, len(combinations))
	for _, combination := range combinations {
		fmt.Printf("%v\n", combination)
	}

	// "driehoek broodje" scores 10, the highest, and must win
	splited, err := ts.Split("driehoeksbroodje")
	assert.Nil(t, err)
	require.Equal(t, 2, len(splited))
	assert.Equal(t, "driehoek", splited[0])
	assert.Equal(t, "broodje", splited[1])

	// Test no result
	splited, err = ts.Split("raupenprozessionsspinner")
	assert.Nil(t, err)
	assert.Equal(t, 0, len(splited), "Expected no result since no substring is in the dict")
}
56 |
// TestNegativeScore ensures best-phrase selection also works when every
// dictionary score is negative (the first candidate seeds the maximum).
func TestNegativeScore(t *testing.T) {
	dictMock := &DictMock{
		scores: map[string]float64{
			"drie":     -10.0,
			"hoek":     -10.0,
			"brood":    -8.0,
			"driehoek": -2.0,
			"broodje":  -2.0,
		},
	}

	ts := NewSplitter(dictMock)

	splited, err := ts.Split("driehoeksbroodje")
	assert.Nil(t, err)
	assert.Equal(t, 2, len(splited))
	assert.Equal(t, "driehoek", splited[0])
	assert.Equal(t, "broodje", splited[1])
}
76 |
// TestInsertCompound verifies how compounds are attached to the
// combination forest: overlapping compounds become parallel roots, while
// a compound starting after an existing leaf is attached as its child.
func TestInsertCompound(t *testing.T) {

	t.Run("Add a new word", func(t *testing.T) {
		ts := CompoundSplit{}
		ts.insertCompound(context.Background(), "test", 0)

		assert.Equal(t, 1, len(ts.combinations))
		assert.Equal(t, "test", ts.combinations[0].name)
	})

	t.Run("Add a two words", func(t *testing.T) {
		ts := CompoundSplit{}
		ts.insertCompound(context.Background(), "test", 0)
		ts.insertCompound(context.Background(), "testje", 0)

		// same start index: both become roots
		assert.Equal(t, 2, len(ts.combinations))
		assert.Equal(t, "test", ts.combinations[0].name)
		assert.Equal(t, "testje", ts.combinations[1].name)
	})

	t.Run("Add a two words different index", func(t *testing.T) {
		ts := CompoundSplit{}

		// phrase: testje — "stje" overlaps "test", so it cannot be a child
		ts.insertCompound(context.Background(), "test", 0)
		ts.insertCompound(context.Background(), "stje", 2)

		assert.Equal(t, 2, len(ts.combinations))
		assert.Equal(t, "test", ts.combinations[0].name)
		assert.Equal(t, "stje", ts.combinations[1].name)
	})

	t.Run("Add a two words different index", func(t *testing.T) {
		ts := CompoundSplit{}

		// phrase: testjenuttig
		//         123456789111
		//                  012
		ts.insertCompound(context.Background(), "test", 0)
		ts.insertCompound(context.Background(), "nuttig", 8)

		// "nuttig" starts after "test" ends, so it becomes its child
		assert.Equal(t, 1, len(ts.combinations))
		phrase := ts.combinations[0]
		assert.Equal(t, "test", phrase.name)
		assert.Equal(t, "nuttig", phrase.children[0].name)

	})

	t.Run("Add a two combinations", func(t *testing.T) {
		ts := CompoundSplit{}

		// phrase: testjenuttig
		//         123456789111
		//                  012
		ts.insertCompound(context.Background(), "test", 0)
		ts.insertCompound(context.Background(), "est", 1)
		ts.insertCompound(context.Background(), "nuttig", 8)

		// "nuttig" fits behind both roots and is attached to each
		assert.Equal(t, 2, len(ts.combinations))
		phrase := ts.combinations[0]
		assert.Equal(t, "test", phrase.name)
		assert.Equal(t, "nuttig", phrase.children[0].name)

		phrase = ts.combinations[1]
		assert.Equal(t, "est", phrase.name)
		assert.Equal(t, "nuttig", phrase.children[0].name)
	})

	t.Run("Add driehoeksbroodje", func(t *testing.T) {
		ts := CompoundSplit{}

		// phrase: driehoeksbroodje
		//         1234567891111111
		//                  0123456
		ts.insertCompound(context.Background(), "drie", 0)
		ts.insertCompound(context.Background(), "driehoek", 0)
		ts.insertCompound(context.Background(), "hoek", 5)
		ts.insertCompound(context.Background(), "brood", 10)
		ts.insertCompound(context.Background(), "broodje", 10)

		// drie hoek brood
		//           broodje

		// driehoek brood
		//          broodje

		assert.Equal(t, 2, len(ts.combinations))
	})

}
167 |
// TestNode covers the Node primitives: index bookkeeping, child
// attachment rules, and the child/leaf search used when inserting
// compounds.
func TestNode(t *testing.T) {

	t.Run("New Node", func(t *testing.T) {
		node := NewNode("test", 2)
		// endIndex = startIndex + len(name)
		assert.Equal(t, 6, node.endIndex)
	})

	t.Run("Add child", func(t *testing.T) {
		node1 := NewNode("test", 2)
		node2 := NewNode("case", 6)
		node3 := NewNode("ase", 7)
		err := node1.AddChild(node2)
		assert.Nil(t, err)
		err = node1.AddChild(node3)
		assert.Nil(t, err)

		assert.Equal(t, 2, len(node1.children))
	})

	t.Run("Add wrong index", func(t *testing.T) {
		// child overlapping its parent must be rejected
		node1 := NewNode("test", 2)
		node2 := NewNode("esting", 3)
		err := node1.AddChild(node2)
		assert.NotNil(t, err)
	})

	t.Run("find children before index", func(t *testing.T) {
		// testcasees
		// 0123456789
		test := NewNode("test", 0)
		caseN := NewNode("case", 4)
		as := NewNode("as", 5)
		see := NewNode("see", 6)
		es := NewNode("es", 8)

		// test case es
		// test as es
		// test see

		test.AddChild(caseN)
		test.AddChild(as)
		test.AddChild(see)
		caseN.AddChild(es)
		as.AddChild(es)

		// no child nodes that end before index 6
		assert.Equal(t, 0, len(test.findChildNodesBeforeIndex(6)))
		// as ends at 7
		assert.Equal(t, 1, len(test.findChildNodesBeforeIndex(7)))
		// case ends at 8
		assert.Equal(t, 2, len(test.findChildNodesBeforeIndex(8)))
		// see ends at 9
		assert.Equal(t, 3, len(test.findChildNodesBeforeIndex(9)))
	})

	t.Run("find leaves before index", func(t *testing.T) {
		// testcasees
		// 0123456789
		test := NewNode("test", 0)
		caseN := NewNode("case", 4)
		as := NewNode("as", 5)
		see := NewNode("see", 6)
		es := NewNode("es", 8)

		// test case es
		// test as es
		// test see

		test.AddChild(caseN)
		test.AddChild(as)
		test.AddChild(see)
		caseN.AddChild(es)
		as.AddChild(es)

		assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 0)))
		assert.Equal(t, 0, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 3)))
		assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4)))
		node := test.RecursivelyFindLeavesBeforeIndex(context.Background(), 4)[0]
		assert.Equal(t, "test", node.name)

		assert.Equal(t, 1, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7)))
		node = test.RecursivelyFindLeavesBeforeIndex(context.Background(), 7)[0]
		assert.Equal(t, "as", node.name)

		assert.Equal(t, 2, len(test.RecursivelyFindLeavesBeforeIndex(context.Background(), 8)))
	})

}
256 |
257 | func TestSplitVeryLongWords(t *testing.T) {
258 | dictMock := &DictMock{
259 | scores: map[string]float64{
260 | "aaaa": 1.0,
261 | "bbbb": 1.0,
262 | },
263 | }
264 |
265 | ts := Splitter{
266 | dict: dictMock,
267 | }
268 |
269 | t1 := time.Now()
270 |
271 | split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaa")
272 |
273 | t2 := time.Now()
274 | diff := t2.Sub(t1)
275 |
276 | assert.Nil(t, err)
277 | assert.Less(t, 0, len(split))
278 |
279 | if diff > time.Millisecond*200 {
280 | fmt.Errorf("Splitter took too long")
281 | t.Fail()
282 | }
283 | }
284 |
// TestSplitTooLongWords ensures words longer than maxWordLength (here
// 104 bytes > 100) are rejected with an empty result instead of being
// split.
func TestSplitTooLongWords(t *testing.T) {
	dictMock := &DictMock{
		scores: map[string]float64{
			"aaaa": 1.0,
			"bbbb": 1.0,
		},
	}

	ts := Splitter{
		dict: dictMock,
	}

	split, err := ts.Split("aaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbbaaaabbbb")

	assert.Nil(t, err)
	assert.Equal(t, 0, len(split))
}
302 |
303 | func TestUnboundTree(t *testing.T) {
304 | dictMock := &DictMock{
305 | scores: map[string]float64{
306 | "5555": 1.0,
307 | "55555": 1.0,
308 | "5555555555555555": 1.0,
309 | },
310 | }
311 |
312 | ts := Splitter{
313 | dict: dictMock,
314 | }
315 |
316 | t1 := time.Now()
317 |
318 | _, err := ts.Split("ql55555555555555555555555555555")
319 |
320 | t2 := time.Now()
321 | diff := t2.Sub(t1)
322 |
323 | assert.Nil(t, err)
324 |
325 | if diff > time.Millisecond*200 {
326 | fmt.Errorf("Splitter took too long")
327 | t.Fail()
328 | }
329 | }
330 |
--------------------------------------------------------------------------------
/contextionary/contextionary.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto3";
2 |
3 | package contextionary;
4 |
// Contextionary is the gRPC API of the contextionary service: word
// presence/stopword lookups, similarity and schema search, vectorization
// of words and corpora, nearest-neighbour queries, meta information and
// user-defined concept extensions.
service Contextionary {
  rpc IsWordStopword(Word) returns (WordStopword) {}
  rpc IsWordPresent(Word) returns (WordPresent) {}
  rpc SchemaSearch(SchemaSearchParams) returns (SchemaSearchResults) {}
  rpc SafeGetSimilarWordsWithCertainty(SimilarWordsParams) returns (SimilarWordsResults) {}
  rpc VectorForWord(Word) returns (Vector) {}
  rpc MultiVectorForWord(WordList) returns (VectorList) {}
  rpc VectorForCorpi(Corpi) returns (Vector) {}
  rpc NearestWordsByVector(VectorNNParams) returns (NearestWords) {}
  rpc MultiNearestWordsByVector(VectorNNParamsList) returns (NearestWordsList) {}
  rpc Meta(MetaParams) returns (MetaOverview) {}
  rpc AddExtension(ExtensionInput) returns (AddExtensionResult) {}
}
18 |
19 | message ExtensionInput {
20 | string concept = 1;
21 | string definition = 2;
22 | float weight = 3;
23 | }
24 |
25 | message AddExtensionResult { }
26 |
27 | message MetaParams {}
28 |
29 | message MetaOverview {
30 | string version = 1;
31 | int64 wordCount = 2;
32 | }
33 |
34 | message Word {
35 | string word = 1;
36 | }
37 |
38 | message WordList {
39 | repeated Word words = 1;
40 | }
41 |
42 | message WordPresent {
43 | bool present = 1;
44 | }
45 |
46 | message Vector {
47 | repeated VectorEntry entries = 1;
48 | repeated InputElement source = 2;
49 | };
50 |
51 | message InputElement {
52 | string concept = 1;
53 | float weight = 2;
54 | uint64 occurrence = 3;
55 | };
56 |
57 | message VectorList {
58 | repeated Vector vectors = 1;
59 | }
60 |
61 | message VectorEntry {
62 | float Entry = 1;
63 | }
64 |
65 | message VectorNNParams {
66 | Vector vector = 1;
67 | int32 k = 2;
68 | int32 n = 3;
69 | }
70 |
71 | message VectorNNParamsList {
72 | repeated VectorNNParams Params = 1;
73 | }
74 |
75 | message Corpi {
76 | repeated string corpi = 1;
77 | repeated Override overrides = 2;
78 | }
79 |
80 | message Override {
81 | string word = 1;
82 | string expression = 2;
83 | }
84 |
85 | message WordStopword {
86 | bool stopword = 1;
87 | }
88 |
89 | message SimilarWordsParams {
90 | string word = 1;
91 | float certainty = 2;
92 | }
93 |
94 | message SimilarWordsResults {
95 | repeated Word words = 1;
96 | }
97 |
98 | message NearestWords {
99 | repeated string words = 1;
100 | repeated float distances = 2;
101 | VectorList vectors = 3;
102 | }
103 |
104 | message NearestWordsList {
105 | repeated NearestWords words = 1;
106 | }
107 |
108 | message Keyword {
109 | string keyword = 1;
110 | float weight = 2;
111 | }
112 |
113 | enum SearchType {
114 | CLASS=0;
115 | PROPERTY=1;
116 | };
117 |
118 | message SchemaSearchParams {
119 | SearchType searchType = 1;
120 | string name = 2;
121 | repeated Keyword keywords = 3;
122 | float certainty = 5;
123 | }
124 |
125 | message SchemaSearchResults {
126 | SearchType type = 1;
127 | repeated SchemaSearchResult results = 2;
128 | }
129 |
130 | message SchemaSearchResult {
131 | string name = 1;
132 | float certainty = 3;
133 | }
134 |
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/annoy_test.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | /*
13 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not
14 | # use this file except in compliance with the License. You may obtain a copy of
15 | # the License at
16 | #
17 | # http://www.apache.org/licenses/LICENSE-2.0
18 | #
19 | # Unless required by applicable law or agreed to in writing, software
20 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
21 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
22 | # License for the specific language governing permissions and limitations under
23 | # the License.
24 | */
25 |
26 | package annoyindex_test
27 |
28 | import (
29 | "math"
30 | "math/rand"
31 | "os"
32 | "testing"
33 |
34 | "github.com/weaviate/contextionary/contextionary/core/annoyindex"
35 |
36 | "github.com/stretchr/testify/assert"
37 | "github.com/stretchr/testify/suite"
38 | )
39 |
// AnnoyTestSuite groups the annoy index tests under testify's suite
// runner; it carries no shared state.
type AnnoyTestSuite struct {
	suite.Suite
}
43 |
// Round returns f rounded half-up to the nearest integer value.
func Round(f float64) float64 {
	shifted := f + 0.5
	return math.Floor(shifted)
}
47 |
// RoundPlus rounds f half-up to the given number of decimal places.
func RoundPlus(f float64, places int) float64 {
	scale := math.Pow(10, float64(places))
	// inline of Round: floor(x + 0.5) rounds half-up
	return math.Floor(f*scale+0.5) / scale
}
52 |
// SetupTest is intentionally empty; each test builds and frees its own index.
func (suite *AnnoyTestSuite) SetupTest() {
}
55 |
// TestFileHandling builds a tiny index, saves it to disk, checks the file
// exists and is non-empty, reloads it, and cleans up the file.
func (suite *AnnoyTestSuite) TestFileHandling() {
	index := annoyindex.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	index.Save("go_test.ann")

	info, err := os.Stat("go_test.ann")
	if err != nil {
		assert.Fail(suite.T(), "Failed to create file, file not found")
	}
	if info.Size() == 0 {
		assert.Fail(suite.T(), "Failed to create file, file size zero")
	}

	annoyindex.DeleteAnnoyIndexAngular(index)

	// reload into a fresh index to prove the on-disk format round-trips
	index = annoyindex.NewAnnoyIndexAngular(3)
	if ret := index.Load("go_test.ann"); ret == false {
		assert.Fail(suite.T(), "Failed to load file")
	}
	annoyindex.DeleteAnnoyIndexAngular(index)

	os.Remove("go_test.ann")
}
83 |
// TestGetNnsByVector checks nearest-neighbour ordering for query vectors
// against three axis-aligned unit vectors (angular distance).
func (suite *AnnoyTestSuite) TestGetNnsByVector() {
	index := annoyindex.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	var result []int
	index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, &result)
	assert.Equal(suite.T(), []int{2, 1, 0}, result)

	index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, &result)
	assert.Equal(suite.T(), []int{0, 1, 2}, result)

	index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, &result)
	assert.Equal(suite.T(), []int{2, 0, 1}, result)

	annoyindex.DeleteAnnoyIndexAngular(index)
}
103 |
// TestGetNnsByItem checks nearest-neighbour ordering when querying by a
// stored item's id rather than by an explicit vector.
func (suite *AnnoyTestSuite) TestGetNnsByItem() {
	index := annoyindex.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{2, 1, 0})
	index.AddItem(1, []float32{1, 2, 0})
	index.AddItem(2, []float32{0, 0, 1})
	index.Build(10)

	var result []int
	index.GetNnsByItem(0, 3, -1, &result)
	assert.Equal(suite.T(), []int{0, 1, 2}, result)

	index.GetNnsByItem(1, 3, -1, &result)
	assert.Equal(suite.T(), []int{1, 0, 2}, result)

	annoyindex.DeleteAnnoyIndexAngular(index)
}
120 |
// TestGetItem checks that stored vectors are returned unchanged when
// fetched back by item id.
func (suite *AnnoyTestSuite) TestGetItem() {
	index := annoyindex.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{2, 1, 0})
	index.AddItem(1, []float32{1, 2, 0})
	index.AddItem(2, []float32{0, 0, 1})
	index.Build(10)

	var result []float32

	index.GetItem(0, &result)
	assert.Equal(suite.T(), []float32{2, 1, 0}, result)

	index.GetItem(1, &result)
	assert.Equal(suite.T(), []float32{1, 2, 0}, result)

	index.GetItem(2, &result)
	assert.Equal(suite.T(), []float32{0, 0, 1}, result)

	annoyindex.DeleteAnnoyIndexAngular(index)
}
141 |
// TestGetDistance compares the index's angular distance between two 2-d
// vectors against the closed-form value sqrt(2 * (1 - cos 45°)), rounded
// to 3 decimal places.
func (suite *AnnoyTestSuite) TestGetDistance() {
	index := annoyindex.NewAnnoyIndexAngular(2)
	index.AddItem(0, []float32{0, 1})
	index.AddItem(1, []float32{1, 1})
	index.Build(10)

	assert.Equal(suite.T(), RoundPlus(math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5), 3), RoundPlus(float64(index.GetDistance(0, 1)), 3))

	annoyindex.DeleteAnnoyIndexAngular(index)
}
152 |
// TestLargeEuclideanIndex inserts 10000 items in close pairs (x, y both
// near a random base point p) and checks that each item's nearest
// neighbours are itself followed by its pair partner.
func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() {
	index := annoyindex.NewAnnoyIndexEuclidean(10)

	for j := 0; j < 10000; j += 2 {
		p := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			p = append(p, rand.Float32())
		}
		// x and y only differ from p (and each other) by tiny jitter (1e-2)
		x := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			x = append(x, 1+p[i]+rand.Float32()*1e-2)
		}
		y := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			y = append(y, 1+p[i]+rand.Float32()*1e-2)
		}
		index.AddItem(j, x)
		index.AddItem(j+1, y)
	}
	index.Build(10)
	for j := 0; j < 10000; j += 2 {
		var result []int
		index.GetNnsByItem(j, 2, -1, &result)

		assert.Equal(suite.T(), result, []int{j, j + 1})

		index.GetNnsByItem(j+1, 2, -1, &result)
		assert.Equal(suite.T(), result, []int{j + 1, j})
	}
	annoyindex.DeleteAnnoyIndexEuclidean(index)
}
184 |
// TestAnnoyTestSuite is the entry point that hands the suite to go test.
func TestAnnoyTestSuite(t *testing.T) {
	suite.Run(t, new(AnnoyTestSuite))
}
188 |
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/annoygomodule.h:
--------------------------------------------------------------------------------
1 | #include "annoylib.h"
2 | #include "kissrandom.h"
3 |
4 | namespace GoAnnoy {
5 |
6 | class AnnoyIndex {
7 | protected:
8 | ::AnnoyIndexInterface *ptr;
9 |
10 | int f;
11 |
12 | public:
13 | ~AnnoyIndex() {
14 | delete ptr;
15 | };
16 | void addItem(int item, const float* w) {
17 | ptr->add_item(item, w);
18 | };
19 | void build(int q) {
20 | ptr->build(q);
21 | };
22 | bool save(const char* filename) {
23 | return ptr->save(filename);
24 | };
25 | void unload() {
26 | ptr->unload();
27 | };
28 | bool load(const char* filename) {
29 | return ptr->load(filename);
30 | };
31 | float getDistance(int i, int j) {
32 | return ptr->get_distance(i, j);
33 | };
34 | void getNnsByItem(int item, int n, int search_k, vector* result, vector* distances) {
35 | ptr->get_nns_by_item(item, n, search_k, result, distances);
36 | };
37 | void getNnsByVector(const float* w, int n, int search_k, vector* result, vector* distances) {
38 | ptr->get_nns_by_vector(w, n, search_k, result, distances);
39 | };
40 | void getNnsByItem(int item, int n, int search_k, vector* result) {
41 | ptr->get_nns_by_item(item, n, search_k, result, NULL);
42 | };
43 | void getNnsByVector(const float* w, int n, int search_k, vector* result) {
44 | ptr->get_nns_by_vector(w, n, search_k, result, NULL);
45 | };
46 |
47 | int getNItems() {
48 | return (int)ptr->get_n_items();
49 | };
50 | void verbose(bool v) {
51 | ptr->verbose(v);
52 | };
53 | void getItem(int item, vector *v) {
54 | v->resize(this->f);
55 | ptr->get_item(item, &v->front());
56 | };
57 | };
58 |
59 | class AnnoyIndexAngular : public AnnoyIndex
60 | {
61 | public:
62 | AnnoyIndexAngular(int f) {
63 | ptr = new ::AnnoyIndex(f);
64 | this->f = f;
65 | }
66 | };
67 |
68 | class AnnoyIndexEuclidean : public AnnoyIndex {
69 | public:
70 | AnnoyIndexEuclidean(int f) {
71 | ptr = new ::AnnoyIndex(f);
72 | this->f = f;
73 | }
74 | };
75 |
76 | class AnnoyIndexManhattan : public AnnoyIndex {
77 | public:
78 | AnnoyIndexManhattan(int f) {
79 | ptr = new ::AnnoyIndex(f);
80 | this->f = f;
81 | }
82 | };
83 | }
84 |
--------------------------------------------------------------------------------
/contextionary/core/annoyindex/kissrandom.h:
--------------------------------------------------------------------------------
1 | #ifndef KISSRANDOM_H
2 | #define KISSRANDOM_H
3 |
4 | #if defined(_MSC_VER) && _MSC_VER == 1500
5 | typedef unsigned __int32 uint32_t;
6 | typedef unsigned __int32 uint64_t;
7 | #else
8 | #include
9 | #endif
10 |
11 | // KISS = "keep it simple, stupid", but high quality random number generator
12 | // http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
13 | // http://mathforum.org/kb/message.jspa?messageID=6627731
14 | // https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
15 |
16 | // 32 bit KISS
// 32 bit KISS random number generator: combines a linear congruential
// generator (x), an xorshift generator (y) and a multiply-with-carry
// generator (z, c) and sums their outputs.
struct Kiss32Random {
  uint32_t x;  // LCG state
  uint32_t y;  // xorshift state
  uint32_t z;  // MWC state
  uint32_t c;  // MWC carry

  // seed must be != 0
  Kiss32Random(uint32_t seed = 123456789) {
    x = seed;
    y = 362436000;
    z = 521288629;
    c = 7654321;
  }

  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry
    uint64_t t = 698769069ULL * z + c;
    c = t >> 32;
    z = (uint32_t) t;

    return x + y + z;
  }
  inline int flip() {
    // Draw random 0 or 1
    return kiss() & 1;
  }
  inline size_t index(size_t n) {
    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
    return kiss() % n;
  }
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};
59 |
60 | // 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
61 | struct Kiss64Random {
62 | uint64_t x;
63 | uint64_t y;
64 | uint64_t z;
65 | uint64_t c;
66 |
67 | // seed must be != 0
68 | Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
69 | x = seed;
70 | y = 362436362436362436ULL;
71 | z = 1066149217761810ULL;
72 | c = 123456123456123456ULL;
73 | }
74 |
75 | uint64_t kiss() {
76 | // Linear congruence generator
77 | z = 6906969069LL*z+1234567;
78 |
79 | // Xor shift
80 | y ^= (y<<13);
81 | y ^= (y>>17);
82 | y ^= (y<<43);
83 |
84 | // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
85 | uint64_t t = (x<<58)+c;
86 | c = (x>>6);
87 | x += t;
88 | c += (x= idx.offset && item < (idx.offset+idx.size) {
120 | return ItemIndex(item - idx.offset), idx.index, nil
121 | }
122 | }
123 |
124 | return 0, nil, fmt.Errorf("out of index")
125 | }
126 |
127 | func (ci *CombinedIndex) ItemIndexToWord(item ItemIndex) (string, error) {
128 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item)
129 |
130 | if err != nil {
131 | return "", err
132 | }
133 |
134 | word, err := (*vi).ItemIndexToWord(offsetted_index)
135 | return word, err
136 | }
137 |
138 | func (ci *CombinedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) {
139 | offsetted_index, vi, err := ci.find_vector_index_for_item_index(item)
140 |
141 | if err != nil {
142 | return 0, err
143 | }
144 |
145 | occ, err := (*vi).ItemIndexToOccurrence(offsetted_index)
146 | return occ, err
147 | }
148 |
149 | func (ci *CombinedIndex) OccurrencePercentile(perc int) uint64 {
150 | max := uint64(0)
151 |
152 | for _, index := range ci.indices {
153 | occ := (*index.index).OccurrencePercentile(perc)
154 | if occ > max {
155 | max = occ
156 | }
157 | }
158 |
159 | return max
160 | }
161 |
// GetVectorForItemIndex resolves a global item index to the vector stored
// in the owning sub-index. All failures are wrapped as internal errors.
func (ci *CombinedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) {
	offsetted_index, vi, err := ci.find_vector_index_for_item_index(item)
	if err != nil {
		return nil, errors.NewInternalf(err.Error())
	}

	word, err := (*vi).GetVectorForItemIndex(offsetted_index)
	if err != nil {
		return word, errors.NewInternalf(err.Error())
	}

	return word, nil
}
175 |
176 | // Compute the distance between two items.
177 | func (ci *CombinedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) {
178 | v1, err := ci.GetVectorForItemIndex(a)
179 | if err != nil {
180 | return 0.0, err
181 | }
182 |
183 | v2, err := ci.GetVectorForItemIndex(b)
184 | if err != nil {
185 | return 0.0, err
186 | }
187 |
188 | dist, err := v1.Distance(v2)
189 | if err != nil {
190 | return 0.0, err
191 | }
192 |
193 | return dist, nil
194 | }
195 |
// GetNnsByItem returns the n nearest neighbours of the given item,
// examining k trees, by first resolving the item's vector and then
// searching by vector across all sub-indices.
// Returns the neighbour indices and their distances to the item.
func (ci *CombinedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) {
	vec, err := ci.GetVectorForItemIndex(item)
	if err != nil {
		return nil, nil, fmt.Errorf("could not get vector for item index: %s", err)
	}

	return ci.GetNnsByVector(*vec, n, k)
}
206 |
// combined_nn_search_result is a single nearest-neighbour hit: the item's
// index in the combined (offset) space plus its distance to the query.
type combined_nn_search_result struct {
	item ItemIndex
	dist float32
}

// combined_nn_search_results collects hits from all sub-indices. It keeps a
// back-reference to the CombinedIndex so that sorting can resolve words for
// lexicographic tie-breaking (see Less).
type combined_nn_search_results struct {
	items []combined_nn_search_result
	ci    *CombinedIndex
}
216 |
217 | // SafeGetSimilarWords returns n similar words in the contextionary,
218 | // examining k trees. It is guaratueed to have results, even if the word is
219 | // not in the contextionary. In this case the list only contains the word
220 | // itself. It can then still be used for exact match or levensthein-based
221 | // searches against db backends.
222 | func (ci *CombinedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) {
223 | return safeGetSimilarWordsFromAny(ci, word, n, k)
224 | }
225 |
226 | // SafeGetSimilarWordsWithCertainty returns similar words in the
227 | // contextionary, if they are close enough to match the required certainty.
228 | // It is guaratueed to have results, even if the word is not in the
229 | // contextionary. In this case the list only contains the word itself. It can
230 | // then still be used for exact match or levensthein-based searches against
231 | // db backends.
232 | func (ci *CombinedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string {
233 | return safeGetSimilarWordsWithCertaintyFromAny(ci, word, certainty)
234 | }
235 |
// Len reports the number of collected search results (sort.Interface).
func (a combined_nn_search_results) Len() int { return len(a.items) }

// Swap exchanges two results in place (sort.Interface).
func (a combined_nn_search_results) Swap(i, j int) { a.items[i], a.items[j] = a.items[j], a.items[i] }
238 | func (a combined_nn_search_results) Less(i, j int) bool {
239 | // Sort on distance first, if those are the same, sort on lexographical order of the words.
240 | if a.items[i].dist == a.items[j].dist {
241 | wi, err := a.ci.ItemIndexToWord(a.items[i].item)
242 | if err != nil {
243 | panic("should be there")
244 | }
245 |
246 | wj, err := a.ci.ItemIndexToWord(a.items[j].item)
247 | if err != nil {
248 | panic("should be there")
249 | }
250 | return wi < wj
251 | } else {
252 | return a.items[i].dist < a.items[j].dist
253 | }
254 | }
255 |
256 | // Remove a certain element from the result search.
257 | func (a *combined_nn_search_results) Remove(i int) {
258 | a.items = append(a.items[:i], a.items[i+1:]...)
259 | }
260 |
261 | // Get the n nearest neighbours of item, examining k trees.
262 | // Returns an array of indices, and of distances between item and the n-nearest neighbors.
263 | func (ci *CombinedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) {
264 | results := combined_nn_search_results{
265 | items: make([]combined_nn_search_result, 0),
266 | ci: ci,
267 | }
268 |
269 | for _, item := range ci.indices {
270 | indices, floats, err := (*item.index).GetNnsByVector(vector, n, k)
271 | if err != nil {
272 | return nil, nil, errors.NewInternalf(err.Error())
273 | } else {
274 | for i, item_idx := range indices {
275 | results.items = append(results.items, combined_nn_search_result{item: item_idx + ItemIndex(item.offset), dist: floats[i]})
276 | }
277 | }
278 | }
279 |
280 | sort.Sort(results)
281 |
282 | // Now remove duplicates.
283 | for i := 1; i < len(results.items); {
284 | if results.items[i].item == results.items[i-1].item {
285 | results.Remove(i)
286 | } else {
287 | i++ // only increment if we're not removing.
288 | }
289 | }
290 |
291 | items := make([]ItemIndex, 0)
292 | floats := make([]float32, 0)
293 |
294 | var max_index int
295 |
296 | if n < len(results.items) {
297 | max_index = n
298 | } else {
299 | max_index = len(results.items)
300 | }
301 |
302 | for i := 0; i < max_index; i++ {
303 | items = append(items, results.items[i].item)
304 | floats = append(floats, results.items[i].dist)
305 | }
306 |
307 | return items, floats, nil
308 | }
309 |
--------------------------------------------------------------------------------
/contextionary/core/combined_simple_test.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package contextionary
13 |
14 | import (
15 | "testing"
16 | )
17 |
18 | func TestSimpleCombinedIndex(t *testing.T) {
19 | builder1 := InMemoryBuilder(3)
20 | builder2 := InMemoryBuilder(3)
21 | builder3 := InMemoryBuilder(3)
22 |
23 | builder1.AddWord("a", NewVector([]float32{1, 0, 0}))
24 | builder2.AddWord("b", NewVector([]float32{0, 1, 0}))
25 | builder3.AddWord("c", NewVector([]float32{0, 0, 1}))
26 |
27 | memory_index1 := Contextionary(builder1.Build(3))
28 | memory_index2 := Contextionary(builder2.Build(3))
29 | memory_index3 := Contextionary(builder3.Build(3))
30 |
31 | var indices123 []Contextionary = []Contextionary{memory_index1, memory_index2, memory_index3}
32 | var indices231 []Contextionary = []Contextionary{memory_index2, memory_index3, memory_index1}
33 | var indices312 []Contextionary = []Contextionary{memory_index3, memory_index1, memory_index2}
34 |
35 | t.Run("indices 123", func(t *testing.T) { test_simple_combined(t, indices123) })
36 | t.Run("indices 231", func(t *testing.T) { test_simple_combined(t, indices231) })
37 | t.Run("indices 312", func(t *testing.T) { test_simple_combined(t, indices312) })
38 | }
39 |
40 | func test_simple_combined(t *testing.T, indices []Contextionary) {
41 | ci, err := CombineVectorIndices(indices)
42 | if err != nil {
43 | panic("should work")
44 | }
45 |
46 | a_idx := ci.WordToItemIndex("a")
47 | if !a_idx.IsPresent() {
48 | panic("should be present")
49 | }
50 |
51 | b_idx := ci.WordToItemIndex("b")
52 | if !b_idx.IsPresent() {
53 | panic("should be present")
54 | }
55 |
56 | c_idx := ci.WordToItemIndex("c")
57 | if !c_idx.IsPresent() {
58 | panic("should be present")
59 | }
60 |
61 | items, _, err := ci.GetNnsByItem(a_idx, 3, 3)
62 | if err != nil {
63 | panic("should work")
64 | }
65 |
66 | assert_eq_idx := func(name string, a, b ItemIndex) {
67 | if a != b {
68 | t.Errorf("Expected %v to be at %v, but was at %b", name, a, b)
69 | }
70 | }
71 |
72 | if len(items) != 3 {
73 | t.Errorf("got length %v, expected 3", len(items))
74 | t.FailNow()
75 | }
76 |
77 | // assert lexicographical order, if distances are equal
78 |
79 | assert_eq_idx("a", a_idx, items[0])
80 | assert_eq_idx("b", b_idx, items[1])
81 | assert_eq_idx("c", c_idx, items[2])
82 | }
83 |
--------------------------------------------------------------------------------
/contextionary/core/component_test.go:
--------------------------------------------------------------------------------
1 | // +build sentence
2 |
3 | package contextionary
4 |
5 | import (
6 | "fmt"
7 | "testing"
8 | )
9 |
// TestDevelopmentEnvironmentForContextionary is a manual development
// scaffold (guarded by the 'sentence' build tag at the top of the file):
// it loads a locally generated contextionary and deliberately fails so that
// any printed output is shown by the test runner.
func TestDevelopmentEnvironmentForContextionary(t *testing.T) {

	// Make sure you have run ./tools/dev/gen_simple_contextionary.sh
	// from the project root or downloaded a full contextionary prior
	// to running those tests.

	c11y, err := LoadVectorFromDisk("../../tools/dev/example.knn", "../../tools/dev/example.idx")
	if err != nil {
		t.Fatalf("could not generate c11y: %s", err)
	}

	fmt.Printf("here's the c11y, do whatever you want with it: %#v", c11y)

	// intentional failure so the test output above is always displayed
	t.Errorf("... add whatever you like!")
}
25 |
--------------------------------------------------------------------------------
/contextionary/core/contextionary.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 |
13 | // Package contextionary provides the toolset to add context to words.
14 | package contextionary
15 |
// ItemIndex is an opaque type that models an index number used to identify a
// word. Negative values mean "not found" (see IsPresent); lookups return -1
// for unknown words.
type ItemIndex int

// IsPresent can be used after retrieving a word index (which does not error on
// its own), to see if the word was actually present in the contextionary.
// Any non-negative index counts as present.
func (i *ItemIndex) IsPresent() bool {
	return *i >= 0
}
25 |
// Contextionary is the API to decouple the K-nn interface that is needed for
// Weaviate from a concrete implementation.
type Contextionary interface {

	// Return the number of items that is stored in the index.
	GetNumberOfItems() int

	// Returns the length of the used vectors.
	GetVectorLength() int

	// Look up a word, return an index.
	// Check for presence of the index with index.IsPresent()
	WordToItemIndex(word string) ItemIndex

	// Based on an index, return the associated word.
	ItemIndexToWord(item ItemIndex) (string, error)

	// Based on an index, return the associated word's occurrence count.
	ItemIndexToOccurrence(item ItemIndex) (uint64, error)

	// OccurrencePercentile shows the occurrence of the mentioned percentile
	// in ascending order
	OccurrencePercentile(perc int) uint64

	// Get the vector of an item index.
	GetVectorForItemIndex(item ItemIndex) (*Vector, error)

	// Compute the distance between two items.
	GetDistance(a ItemIndex, b ItemIndex) (float32, error)

	// Get the n nearest neighbours of item, examining k trees.
	// Returns an array of indices, and of distances between item and the n-nearest neighbors.
	GetNnsByItem(item ItemIndex, n, k int) ([]ItemIndex, []float32, error)

	// Get the n nearest neighbours of the given vector, examining k trees.
	// Returns an array of indices, and of distances between the vector and the n-nearest neighbors.
	GetNnsByVector(vector Vector, n, k int) ([]ItemIndex, []float32, error)

	// SafeGetSimilarWords returns n similar words in the contextionary,
	// examining k trees. It is guaranteed to have results, even if the word is
	// not in the contextionary. In this case the list only contains the word
	// itself. It can then still be used for exact match or levenshtein-based
	// searches against db backends.
	SafeGetSimilarWords(word string, n, k int) ([]string, []float32)

	// SafeGetSimilarWordsWithCertainty returns similar words in the
	// contextionary, if they are close enough to match the required certainty.
	// It is guaranteed to have results, even if the word is not in the
	// contextionary. In this case the list only contains the word itself. It
	// can then still be used for exact match or levenshtein-based searches
	// against db backends.
	SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string
}
78 |
--------------------------------------------------------------------------------
/contextionary/core/generator/cmd/generator.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package main
13 |
14 | import (
15 | "os"
16 |
17 | flags "github.com/jessevdk/go-flags"
18 | "github.com/weaviate/contextionary/contextionary/core/generator"
19 | )
20 |
21 | func main() {
22 | var options generator.Options
23 | var parser = flags.NewParser(&options, flags.Default)
24 |
25 | if _, err := parser.Parse(); err != nil {
26 | if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp {
27 | os.Exit(0)
28 | } else {
29 | os.Exit(1)
30 | }
31 | }
32 |
33 | generator.Generate(options)
34 | }
35 |
--------------------------------------------------------------------------------
/contextionary/core/generator/generator.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package generator
13 |
14 | import (
15 | "bufio"
16 | "bytes"
17 | "encoding/binary"
18 | "encoding/gob"
19 | "encoding/json"
20 | "log"
21 | "os"
22 | "strconv"
23 | "strings"
24 |
25 | "github.com/syndtr/goleveldb/leveldb"
26 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex"
27 | )
28 |
// Options holds the generator's command-line flags, parsed by go-flags via
// the struct tags below.
type Options struct {
	VectorCSVPath string `short:"c" long:"vector-csv-path" description:"Path to the output file of Glove" required:"true"`
	TempDBPath    string `short:"t" long:"temp-db-path" description:"Location for the temporary database" default:".tmp_import"`
	OutputPrefix  string `short:"p" long:"output-prefix" description:"The prefix of the names of the files" required:"true"`
	K             int    `short:"k" description:"number of forrests to generate" default:"20"`
}
35 |
// WordVectorInfo summarizes the staged training data: word count, vector
// width, the number of Annoy trees (k), and the JSON metadata that is
// persisted into the wordlist header.
type WordVectorInfo struct {
	numberOfWords int
	vectorWidth   int
	k             int
	metadata      JsonMetadata
}
42 |
// JsonMetadata is the metadata blob serialized into the wordlist header.
type JsonMetadata struct {
	K int `json:"k"` // the number of parallel forests
}
46 |
47 | func Generate(options Options) {
48 | db, err := leveldb.OpenFile(options.TempDBPath, nil)
49 | defer db.Close()
50 |
51 | if err != nil {
52 | log.Fatalf("Could not open temporary database file %+v", err)
53 | }
54 |
55 | file, err := os.Open(options.VectorCSVPath)
56 | if err != nil {
57 | log.Fatal(err)
58 | }
59 | defer file.Close()
60 |
61 | log.Print("Processing and ordering raw trained data")
62 | info := readVectorsFromFileAndInsertIntoLevelDB(db, file)
63 |
64 | info.k = options.K
65 | info.metadata = JsonMetadata{options.K}
66 |
67 | log.Print("Generating wordlist")
68 | createWordList(db, info, options.OutputPrefix+".idx")
69 |
70 | log.Print("Generating k-nn index")
71 | createKnn(db, info, options.OutputPrefix+".knn")
72 |
73 | db.Close()
74 | os.RemoveAll(options.TempDBPath)
75 | }
76 |
77 | // read word vectors, insert them into level db, also return the dimension of the vectors.
78 | func readVectorsFromFileAndInsertIntoLevelDB(db *leveldb.DB, file *os.File) WordVectorInfo {
79 | var vector_length int = -1
80 | var nr_words int = 0
81 |
82 | scanner := bufio.NewScanner(file)
83 |
84 | for scanner.Scan() {
85 | nr_words += 1
86 | parts := strings.Split(scanner.Text(), " ")
87 |
88 | word := parts[0]
89 | if vector_length == -1 {
90 | vector_length = len(parts) - 1
91 | }
92 |
93 | if vector_length != len(parts)-1 {
94 | log.Print("Line corruption found for the word [" + word + "]. Lenght expected " + strconv.Itoa(vector_length) + " but found " + strconv.Itoa(len(parts)) + ". Word will be skipped.")
95 | continue
96 | }
97 |
98 | // pre-allocate a vector for speed.
99 | vector := make([]float32, vector_length)
100 |
101 | for i := 1; i <= vector_length; i++ {
102 | float, err := strconv.ParseFloat(parts[i], 64)
103 |
104 | if err != nil {
105 | log.Fatal("Error parsing float")
106 | }
107 |
108 | vector[i-1] = float32(float)
109 | }
110 |
111 | var buf bytes.Buffer
112 | if err := gob.NewEncoder(&buf).Encode(vector); err != nil {
113 | log.Fatal("Could not encode vector for temp db storage")
114 | }
115 |
116 | db.Put([]byte(word), buf.Bytes(), nil)
117 | }
118 |
119 | return WordVectorInfo{numberOfWords: nr_words, vectorWidth: vector_length}
120 | }
121 |
// createWordList writes the wordlist file: a binary little-endian header
// (word count, vector width, metadata length, JSON metadata, padded to a
// 4-byte boundary), then a table of one uint64 byte-offset per word, then
// the word records themselves (8-byte occurrence, NUL-terminated word,
// padded to a 4-byte boundary). Words are emitted in LevelDB iteration
// order, i.e. sorted by word, so record order lines up with the k-nn ids.
func createWordList(db *leveldb.DB, info WordVectorInfo, outputFileName string) {
	file, err := os.Create(outputFileName)
	if err != nil {
		log.Fatal("Could not open wordlist output file")
	}
	defer file.Close()

	wbuf := bufio.NewWriter(file)

	// Write file header
	err = binary.Write(wbuf, binary.LittleEndian, uint64(info.numberOfWords))
	if err != nil {
		log.Fatal("Could not write length of wordlist.")
	}

	err = binary.Write(wbuf, binary.LittleEndian, uint64(info.vectorWidth))
	if err != nil {
		log.Fatal("Could not write with of the vector.")
	}

	metadata, err := json.Marshal(info.metadata)
	if err != nil {
		log.Fatal("Could not serialize metadata.")
	}

	err = binary.Write(wbuf, binary.LittleEndian, uint64(len(metadata)))
	if err != nil {
		log.Fatal("Could not write with of the vector.")
	}

	_, err = wbuf.Write(metadata)
	if err != nil {
		log.Fatal("Could not write the metadata")
	}

	// Pad the metadata to a 4-byte boundary. NOTE(review): when metadata_len
	// is already a multiple of 4 this writes 4 extra padding bytes; the
	// offset arithmetic below adds the same amount, so the file stays
	// self-consistent.
	var metadata_len = uint64(len(metadata))
	var metadata_padding = 4 - (metadata_len % 4)
	for i := 0; uint64(i) < metadata_padding; i++ {
		wbuf.WriteByte(byte(0))
	}

	// The first word record starts after the two uint64 header fields, the
	// offset table (one uint64 per word), and the metadata block.
	var word_offset uint64 = (2 + uint64(info.numberOfWords)) * 8 // first two uint64's from the header, then the table of indices.
	word_offset += 8 + metadata_len + metadata_padding            // and the metadata length + content & padding

	var orig_word_offset = word_offset

	// Iterate first time over all data, computing indices for all words.
	iter := db.NewIterator(nil, nil)
	for iter.Next() {
		key := iter.Key()
		word := string(key)
		length := len(word)
		err = binary.Write(wbuf, binary.LittleEndian, uint64(word_offset))

		if err != nil {
			log.Fatal("Could not write word offset to wordlist")
		}

		// reserve 8 bytes for occurence
		word_offset += 8

		// word bytes plus NUL terminator
		word_offset += uint64(length) + 1

		// ensure padding on 4-bytes aligned memory
		padding := 4 - (word_offset % 4)
		word_offset += padding
	}

	iter.Release()
	word_offset = orig_word_offset

	// Iterate second time over all data, now inserting the words
	iter = db.NewIterator(nil, nil)
	for iter.Next() {
		key := iter.Key()
		word := string(key)
		length := len(word)

		// hard-code occurence to 102 for now
		// NOTE(review): this write error is ignored; a failure would only
		// surface via wbuf.Flush() below (also unchecked) — consider
		// checking both.
		err = binary.Write(wbuf, binary.LittleEndian, uint64(102))

		wbuf.Write([]byte(word))
		wbuf.WriteByte(byte(0))
		// NOTE(review): unlike pass one, word_offset is not advanced by 8
		// for the occurrence here; since 8 % 4 == 0 the padding computed
		// below still matches pass one, so the offsets stay in sync.
		word_offset += uint64(length) + 1

		// ensure padding on 4-bytes aligned memory
		padding := 4 - (word_offset % 4)
		for i := 0; uint64(i) < padding; i++ {
			wbuf.WriteByte(byte(0))
		}

		word_offset += padding
	}
	wbuf.Flush()
	iter.Release()
}
218 |
219 | func createKnn(db *leveldb.DB, info WordVectorInfo, outputFileName string) {
220 | var knn annoy.AnnoyIndex = annoy.NewAnnoyIndexEuclidean(info.vectorWidth)
221 | var idx int = -1
222 |
223 | iter := db.NewIterator(nil, nil)
224 |
225 | for iter.Next() {
226 | idx += 1
227 |
228 | vector := make([]float32, info.vectorWidth)
229 | err := gob.NewDecoder(bytes.NewBuffer(iter.Value())).Decode(&vector)
230 | if err != nil {
231 | log.Fatalf("Could not decode vector value %+v", err)
232 | }
233 | knn.AddItem(idx, vector)
234 | }
235 |
236 | knn.Build(info.k) // Hardcoded for now. Must be tweaked.
237 | knn.Save(outputFileName)
238 | knn.Unload()
239 | }
240 |
--------------------------------------------------------------------------------
/contextionary/core/memory_index.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package contextionary
13 |
14 | import (
15 | "fmt"
16 | "sort"
17 |
18 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex"
19 | )
20 |
// MemoryIndex is an in-memory Contextionary implementation: a word slice
// kept sorted by the builder (slice position == Annoy item id) plus an
// Annoy index for k-nn lookups.
type MemoryIndex struct {
	dimensions int
	words      []string
	knn        annoy.AnnoyIndex
}
26 |
// GetNumberOfItems returns the number of words stored in the index.
func (mi *MemoryIndex) GetNumberOfItems() int {
	return len(mi.words)
}
31 |
// GetVectorLength returns the dimensionality of the stored vectors.
func (mi *MemoryIndex) GetVectorLength() int {
	return mi.dimensions
}
36 |
37 | // Look up a word, return an index.
38 | // Perform binary search.
39 | func (mi *MemoryIndex) WordToItemIndex(word string) ItemIndex {
40 | for idx, w := range mi.words {
41 | if word == w {
42 | return ItemIndex(idx)
43 | }
44 | }
45 |
46 | return -1
47 | }
48 |
// ItemIndexToOccurrence is a stub: in-memory indices carry no occurrence
// data, so this always returns 0 with no error.
func (mi *MemoryIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) {
	return 0, nil
}
52 |
// OccurrencePercentile is a stub: without occurrence data the percentile is
// always 0.
func (mi *MemoryIndex) OccurrencePercentile(perc int) uint64 {
	return 0
}
56 |
57 | // Based on an index, return the assosiated word.
58 | func (mi *MemoryIndex) ItemIndexToWord(item ItemIndex) (string, error) {
59 | if item >= 0 && int(item) <= len(mi.words) {
60 | return mi.words[item], nil
61 | } else {
62 | return "", fmt.Errorf("Index out of bounds")
63 | }
64 | }
65 |
66 | // Get the vector of an item index.
67 | // TODO: Is this ever used? Doesn't look like it as part of the investigation
68 | // in gh-25 and gh-26
69 | func (mi *MemoryIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) {
70 | if item >= 0 && int(item) <= len(mi.words) {
71 | var floats []float32
72 | mi.knn.GetItem(int(item), &floats)
73 |
74 | return &Vector{vector: floats}, nil
75 | } else {
76 | return nil, fmt.Errorf("Index out of bounds")
77 | }
78 | }
79 |
80 | // Compute the distance between two items.
81 | func (mi MemoryIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) {
82 | if a >= 0 && b >= 0 && int(a) <= len(mi.words) && int(b) <= len(mi.words) {
83 | return mi.knn.GetDistance(int(a), int(b)), nil
84 | } else {
85 | return 0, fmt.Errorf("Index out of bounds")
86 | }
87 | }
88 |
89 | // Get the n nearest neighbours of item, examining k trees.
90 | // Returns an array of indices, and of distances between item and the n-nearest neighbors.
91 | func (mi *MemoryIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) {
92 | if item >= 0 && int(item) <= len(mi.words) {
93 | var items []int
94 | var distances []float32
95 |
96 | mi.knn.GetNnsByItem(int(item), n, k, &items, &distances)
97 |
98 | var indices []ItemIndex = make([]ItemIndex, len(items))
99 | for i, x := range items {
100 | indices[i] = ItemIndex(x)
101 | }
102 |
103 | return indices, distances, nil
104 | } else {
105 | return nil, nil, fmt.Errorf("Index out of bounds")
106 | }
107 | }
108 |
109 | // Get the n nearest neighbours of item, examining k trees.
110 | // Returns an array of indices, and of distances between item and the n-nearest neighbors.
111 | func (mi *MemoryIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) {
112 | if len(vector.vector) == mi.dimensions {
113 | var items []int
114 | var distances []float32
115 |
116 | mi.knn.GetNnsByVector(vector.vector, n, k, &items, &distances)
117 |
118 | var indices []ItemIndex = make([]ItemIndex, len(items))
119 | for i, x := range items {
120 | indices[i] = ItemIndex(x)
121 | }
122 |
123 | return indices, distances, nil
124 | } else {
125 | return nil, nil, fmt.Errorf("Wrong vector length provided")
126 | }
127 | }
128 |
129 | // SafeGetSimilarWords returns n similar words in the contextionary,
130 | // examining k trees. It is guaratueed to have results, even if the word is
131 | // not in the contextionary. In this case the list only contains the word
132 | // itself. It can then still be used for exact match or levensthein-based
133 | // searches against db backends.
134 | func (mi *MemoryIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) {
135 | return safeGetSimilarWordsFromAny(mi, word, n, k)
136 | }
137 |
138 | // SafeGetSimilarWordsWithCertainty returns similar words in the
139 | // contextionary, if they are close enough to match the required certainty.
140 | // It is guaratueed to have results, even if the word is not in the
141 | // contextionary. In this case the list only contains the word itself. It can
142 | // then still be used for exact match or levensthein-based searches against
143 | // db backends.
144 | func (mi *MemoryIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string {
145 | return safeGetSimilarWordsWithCertaintyFromAny(mi, word, certainty)
146 | }
147 |
148 | // The rest of this file concerns itself with building the Memory Index.
149 | // This is done from the MemoryIndexBuilder struct.
150 |
// MemoryIndexBuilder accumulates word/vector pairs before Build turns them
// into a searchable MemoryIndex.
type MemoryIndexBuilder struct {
	dimensions   int
	word_vectors mib_pairs
}
155 |
// mib_pair couples one word with its vector during index construction.
type mib_pair struct {
	word   string
	vector Vector
}
160 |
// Define custom type, and implement functions required for sort.Sort.
// Pairs are ordered lexicographically by word so the built index supports
// binary search on its word slice.
type mib_pairs []mib_pair

func (a mib_pairs) Len() int           { return len(a) }
func (a mib_pairs) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a mib_pairs) Less(i, j int) bool { return a[i].word < a[j].word }
167 |
168 | // Construct a new builder.
169 | func InMemoryBuilder(dimensions int) *MemoryIndexBuilder {
170 | mib := MemoryIndexBuilder{
171 | dimensions: dimensions,
172 | word_vectors: make([]mib_pair, 0),
173 | }
174 |
175 | return &mib
176 | }
177 |
178 | // Add a word and it's vector to the builder.
179 | func (mib *MemoryIndexBuilder) AddWord(word string, vector Vector) {
180 | wv := mib_pair{word: word, vector: vector}
181 | mib.word_vectors = append(mib.word_vectors, wv)
182 | }
183 |
184 | // Build an efficient lookup iddex from the builder.
185 | func (mib *MemoryIndexBuilder) Build(trees int) *MemoryIndex {
186 | mi := MemoryIndex{
187 | dimensions: mib.dimensions,
188 | words: make([]string, 0),
189 | knn: annoy.NewAnnoyIndexEuclidean(mib.dimensions),
190 | }
191 |
192 | // First sort the words; this way we can do binary search on the words.
193 | sort.Sort(mib.word_vectors)
194 |
195 | // Then fill up the data in the MemoryIndex
196 | for i, pair := range mib.word_vectors {
197 | mi.words = append(mi.words, pair.word)
198 | mi.knn.AddItem(i, pair.vector.vector)
199 | }
200 |
201 | // And instruct Annoy to build it's index
202 | mi.knn.Build(trees)
203 |
204 | return &mi
205 | }
206 |
--------------------------------------------------------------------------------
/contextionary/core/mmapped.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package contextionary
13 |
14 | import (
15 | "encoding/binary"
16 | "fmt"
17 | "log"
18 | "math"
19 | "os"
20 | "syscall"
21 |
22 | annoy "github.com/weaviate/contextionary/contextionary/core/annoyindex"
23 | )
24 |
// mmappedIndex is a Contextionary implementation backed by on-disk data: a
// Wordlist for word/occurrence lookups and an Annoy index for k-nn, with
// the raw Annoy file bytes kept around for direct vector reads (see
// getItem).
type mmappedIndex struct {
	word_index *Wordlist
	knn        annoy.AnnoyIndex
	knnRaw     []byte
	dimensions int
}
31 |
// GetNumberOfItems returns the number of words in the underlying wordlist.
func (m *mmappedIndex) GetNumberOfItems() int {
	return int(m.word_index.numberOfWords)
}
35 |
// GetVectorLength returns the dimensionality of the stored vectors.
func (m *mmappedIndex) GetVectorLength() int {
	return int(m.word_index.vectorWidth)
}
40 |
// WordToItemIndex looks the word up in the wordlist; the result is negative
// when the word is not present (check with ItemIndex.IsPresent).
func (m *mmappedIndex) WordToItemIndex(word string) ItemIndex {
	return m.word_index.FindIndexByWord(word)
}
44 |
45 | func (m *mmappedIndex) ItemIndexToWord(item ItemIndex) (string, error) {
46 | if item >= 0 && item <= m.word_index.GetNumberOfWords() {
47 | w, _ := m.word_index.getWord(item)
48 | return w, nil
49 | } else {
50 | return "", fmt.Errorf("Index out of bounds")
51 | }
52 | }
53 |
54 | func (m *mmappedIndex) ItemIndexToOccurrence(item ItemIndex) (uint64, error) {
55 | if item >= 0 && item <= m.word_index.GetNumberOfWords() {
56 | _, occ := m.word_index.getWord(item)
57 | return occ, nil
58 | } else {
59 | return 0, fmt.Errorf("Index out of bounds")
60 | }
61 | }
62 |
// OccurrencePercentile delegates to the wordlist's occurrence statistics.
func (m *mmappedIndex) OccurrencePercentile(perc int) uint64 {
	return m.word_index.OccurrencePercentile(perc)
}
66 |
67 | func (m *mmappedIndex) GetVectorForItemIndex(item ItemIndex) (*Vector, error) {
68 | if item < 0 && item > m.word_index.GetNumberOfWords() {
69 | return nil, fmt.Errorf("Index out of bounds")
70 | }
71 |
72 | var floats []float32
73 | floats = m.getItem(int(item))
74 |
75 | return &Vector{vector: floats}, nil
76 | }
77 |
// getItem reads the vector for the given item straight out of the raw Annoy
// index bytes, bypassing the Annoy bindings.
// NOTE(review): this assumes each Annoy node is a 16-byte header followed
// by dimensions*4 bytes of little-endian float32 data — confirm against the
// node layout in annoylib.h for the Euclidean metric.
func (m *mmappedIndex) getItem(index int) []float32 {
	offset := 16
	vectorSize := m.dimensions * 4
	begin := index*(offset+vectorSize) + offset
	end := begin + vectorSize
	return vectorFromBytes(m.knnRaw[begin:end])
}
85 |
// vectorFromBytes decodes a sequence of little-endian IEEE-754 float32
// values (4 bytes each) from a raw byte slice.
func vectorFromBytes(in []byte) []float32 {
	count := len(in) / 4
	out := make([]float32, count)
	for i := 0; i < count; i++ {
		bits := binary.LittleEndian.Uint32(in[i*4 : i*4+4])
		out[i] = math.Float32frombits(bits)
	}

	return out
}
96 |
97 | // Compute the distance between two items.
98 | func (m *mmappedIndex) GetDistance(a ItemIndex, b ItemIndex) (float32, error) {
99 | if a >= 0 && b >= 0 && a <= m.word_index.GetNumberOfWords() && b <= m.word_index.GetNumberOfWords() {
100 | return m.knn.GetDistance(int(a), int(b)), nil
101 | } else {
102 | return 0, fmt.Errorf("Index out of bounds")
103 | }
104 | }
105 |
106 | func (m *mmappedIndex) GetNnsByItem(item ItemIndex, n int, k int) ([]ItemIndex, []float32, error) {
107 | if item >= 0 && item <= m.word_index.GetNumberOfWords() {
108 | var items []int
109 | var distances []float32
110 |
111 | m.knn.GetNnsByItem(int(item), n, k, &items, &distances)
112 |
113 | var indices []ItemIndex = make([]ItemIndex, len(items))
114 | for i, x := range items {
115 | indices[i] = ItemIndex(x)
116 | }
117 |
118 | return indices, distances, nil
119 | } else {
120 | return nil, nil, fmt.Errorf("Index out of bounds")
121 | }
122 | }
123 |
124 | func (m *mmappedIndex) GetNnsByVector(vector Vector, n int, k int) ([]ItemIndex, []float32, error) {
125 | if len(vector.vector) == m.GetVectorLength() {
126 | var items []int
127 | var distances []float32
128 |
129 | m.knn.GetNnsByVector(vector.vector, n, k, &items, &distances)
130 |
131 | var indices []ItemIndex = make([]ItemIndex, len(items))
132 | for i, x := range items {
133 | indices[i] = ItemIndex(x)
134 | }
135 |
136 | return indices, distances, nil
137 | } else {
138 | return nil, nil, fmt.Errorf("Wrong vector length provided")
139 | }
140 | }
141 |
// SafeGetSimilarWords returns n similar words in the contextionary,
// examining k trees. It is guaranteed to have results, even if the word is
// not in the contextionary. In this case the list only contains the word
// itself. It can then still be used for exact match or levenshtein-based
// searches against db backends.
func (m *mmappedIndex) SafeGetSimilarWords(word string, n, k int) ([]string, []float32) {
	return safeGetSimilarWordsFromAny(m, word, n, k)
}
150 |
// SafeGetSimilarWordsWithCertainty returns similar words in the
// contextionary, if they are close enough to match the required certainty.
// It is guaranteed to have results, even if the word is not in the
// contextionary. In this case the list only contains the word itself. It can
// then still be used for exact match or levenshtein-based searches against
// db backends.
func (m *mmappedIndex) SafeGetSimilarWordsWithCertainty(word string, certainty float32) []string {
	return safeGetSimilarWordsWithCertaintyFromAny(m, word, certainty)
}
160 |
161 | func LoadVectorFromDisk(annoy_index string, word_index_file_name string) (Contextionary, error) {
162 | word_index, err := LoadWordlist(word_index_file_name)
163 |
164 | if err != nil {
165 | return nil, fmt.Errorf("Could not load vector: %+v", err)
166 | }
167 |
168 | knn := annoy.NewAnnoyIndexEuclidean(int(word_index.vectorWidth))
169 | knn.Load(annoy_index)
170 |
171 | knnRaw, err := loadAnnoyIndexDirectly(annoy_index)
172 | if err != nil {
173 | return nil, fmt.Errorf("load raw index: %v", err)
174 | }
175 |
176 | idx := &mmappedIndex{
177 | word_index: word_index,
178 | knn: knn,
179 | knnRaw: knnRaw,
180 | dimensions: int(word_index.vectorWidth),
181 | }
182 |
183 | return idx, nil
184 | }
185 |
// loadAnnoyIndexDirectly mmaps the annoy index file directly to avoid memory
// leaks in the annoy go-port of the C library, see #26.
//
// Previously every failure path called log.Fatalf, killing the whole process
// even though the signature promises an error return; failures are now
// logged and returned so the caller (LoadVectorFromDisk) can handle them.
func loadAnnoyIndexDirectly(path string) ([]byte, error) {
	file, err := os.Open(path)
	if err != nil {
		log.Printf("Can't open the knn file at %s: %+v", path, err)
		return nil, fmt.Errorf("Can't open the knn file at %s: %+v", path, err)
	}
	// closing the fd after mmap is safe: the mapping stays valid, and not
	// closing leaked one descriptor per load
	defer file.Close()

	file_info, err := file.Stat()
	if err != nil {
		log.Printf("Can't stat the knn file at %s: %+v", path, err)
		return nil, fmt.Errorf("Can't stat the knn file at %s: %+v", path, err)
	}

	mmap, err := syscall.Mmap(int(file.Fd()), 0, int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED)
	if err != nil {
		log.Printf("Can't mmap the knn file %s: %+v", path, err)
		return nil, fmt.Errorf("Can't mmap the knn file %s: %+v", path, err)
	}

	return mmap, nil
}
206 |
--------------------------------------------------------------------------------
/contextionary/core/similar_words.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package contextionary
12 |
13 | import (
14 | "regexp"
15 | )
16 |
17 | const simliarWordsLimit = 15
18 |
19 | func safeGetSimilarWordsFromAny(c11y Contextionary, word string, n, k int) ([]string, []float32) {
20 | i := c11y.WordToItemIndex(word)
21 | if !i.IsPresent() {
22 | return []string{word}, []float32{1}
23 | }
24 |
25 | indices, newCertainties, err := c11y.GetNnsByItem(i, n, k)
26 | if err != nil {
27 | return []string{word}, []float32{1}
28 | }
29 |
30 | var words []string
31 | var certainties []float32
32 | for i, index := range indices {
33 | word, err := c11y.ItemIndexToWord(index)
34 | if err != nil {
35 | continue
36 | }
37 |
38 | if wordHasIllegalCharacters(word) {
39 | continue
40 | }
41 |
42 | words = append(words, word)
43 | certainties = append(certainties, newCertainties[i])
44 | }
45 |
46 | return words, certainties
47 | }
48 |
49 | func safeGetSimilarWordsWithCertaintyFromAny(c11y Contextionary, word string, certainty float32) []string {
50 | var matchingWords []string
51 | var matchtingCertainties []float32
52 |
53 | count := 0
54 | words, certainties := c11y.SafeGetSimilarWords(word, 100, 32)
55 | for i, word := range words {
56 | if count >= simliarWordsLimit {
57 | break
58 | }
59 |
60 | var dist float32
61 | if dist = DistanceToCertainty(certainties[i]); dist < certainty {
62 | continue
63 | }
64 |
65 | count++
66 | matchingWords = append(matchingWords, alphanumeric(word))
67 | matchtingCertainties = append(matchtingCertainties, dist)
68 | }
69 |
70 | return matchingWords
71 | }
72 |
// wordHasIllegalCharacters reports whether a word is an internal entry
// rather than a real word. The schema-based contextionary uses a leading
// dollar sign for class and property centroids, so those are filtered by
// prefix.
func wordHasIllegalCharacters(word string) bool {
	// plain byte check replaces the previous regexp ("^\$"), which was
	// re-compiled on every single call
	return len(word) > 0 && word[0] == '$'
}
78 |
// nonAlphanumeric matches every character that is not a letter, digit or
// underscore; compiled once at package init instead of on every call
var nonAlphanumeric = regexp.MustCompile("[^a-zA-Z0-9_]+")

// alphanumeric strips all characters other than [a-zA-Z0-9_] from the word.
func alphanumeric(word string) string {
	return nonAlphanumeric.ReplaceAllString(word, "")
}
82 |
--------------------------------------------------------------------------------
/contextionary/core/similar_words_test.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package contextionary
12 |
13 | import (
14 | "testing"
15 |
16 | "github.com/stretchr/testify/assert"
17 | )
18 |
// TestSimilarWords exercises SafeGetSimilarWordsWithCertainty against the
// tiny in-memory contextionary from newC11y: unknown words fall back to the
// word itself, and lowering the certainty threshold grows the result set.
func TestSimilarWords(t *testing.T) {

	t.Run("with a word that's not in the c11y", func(t *testing.T) {
		c := newC11y()
		// "vehicle" is not in the fixture, so the word itself is returned
		expectedWords := []string{"vehicle"}

		words := c.SafeGetSimilarWordsWithCertainty("vehicle", 0.8)

		assert.Equal(t, expectedWords, words)
	})

	t.Run("with a word thats present and a high certainty", func(t *testing.T) {
		c := newC11y()
		expectedWords := []string{"car", "automobile"}

		words := c.SafeGetSimilarWordsWithCertainty("car", 0.95)

		assert.Equal(t, expectedWords, words)
	})

	t.Run("with a word thats present and a medium certainty", func(t *testing.T) {
		c := newC11y()
		expectedWords := []string{"car", "automobile", "airplane"}

		words := c.SafeGetSimilarWordsWithCertainty("car", 0.7)

		assert.Equal(t, expectedWords, words)
	})

	t.Run("with a word thats present and a really low certainty", func(t *testing.T) {
		c := newC11y()
		// note the alphanumeric stripping of "cabernet-sauvignon"; the
		// "$THING[Car]" centroid must never appear in the results
		expectedWords := []string{"car", "automobile", "airplane", "cabernetsauvignon"}

		words := c.SafeGetSimilarWordsWithCertainty("car", 0.001)

		assert.Equal(t, expectedWords, words)
	})

}
58 |
// newC11y builds a small 3-dimensional in-memory contextionary fixture.
// "$THING[Car]" verifies that $-prefixed centroid entries are filtered out
// of similar-word results, and "cabernet-sauvignon" exercises the
// alphanumeric stripping of results.
func newC11y() Contextionary {
	builder := InMemoryBuilder(3)

	builder.AddWord("car", NewVector([]float32{1, 0, 0}))
	builder.AddWord("automobile", NewVector([]float32{0.9, 0, 0}))
	builder.AddWord("airplane", NewVector([]float32{0.3, 0, 0}))
	builder.AddWord("cabernet-sauvignon", NewVector([]float32{0, 0, 10}))
	builder.AddWord("$THING[Car]", NewVector([]float32{1, 0, 0}))

	return Contextionary(builder.Build(3))
}
70 |
--------------------------------------------------------------------------------
/contextionary/core/stopwords/detector.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package stopwords
12 |
13 | import (
14 | "encoding/json"
15 | "fmt"
16 | "io/ioutil"
17 | "os"
18 | )
19 |
// Detector can be used to detect whether a word is a stopword
type Detector struct {
	// lookup maps each stopword to 1; only key presence is checked
	lookup map[string]int
}
24 |
// stopWordDoc mirrors the layout of the JSON stopword file: a language tag
// plus the list of stopwords.
type stopWordDoc struct {
	Language string `json:"language"`
	Words []string `json:"words"`
}
29 |
30 | // NewFromFile creates an in-memory stopword detector based on a file read once
31 | // at init time
32 | func NewFromFile(path string) (*Detector, error) {
33 | file, err := os.Open(path)
34 | if err != nil {
35 | return nil, fmt.Errorf("could not open file at %s: %v", path, err)
36 | }
37 |
38 | fileBytes, err := ioutil.ReadAll(file)
39 | if err != nil {
40 | return nil, fmt.Errorf("could not read file contents: %v", err)
41 | }
42 |
43 | var doc stopWordDoc
44 | err = json.Unmarshal(fileBytes, &doc)
45 | if err != nil {
46 | return nil, fmt.Errorf("could not unmarshal json: %v", err)
47 | }
48 |
49 | lookup := buildLookupMap(doc.Words)
50 |
51 | return &Detector{
52 | lookup: lookup,
53 | }, nil
54 | }
55 |
56 | // IsStopWord returns true on stop words, false on all other words
57 | func (d *Detector) IsStopWord(word string) bool {
58 | if _, ok := d.lookup[word]; ok {
59 | return true
60 | }
61 |
62 | return false
63 | }
64 |
// buildLookupMap turns the stopword list into a set-like map for O(1)
// membership checks. Values are always 1; only key presence matters.
func buildLookupMap(words []string) map[string]int {
	lookup := make(map[string]int, len(words))
	for _, w := range words {
		lookup[w] = 1
	}
	return lookup
}
73 |
--------------------------------------------------------------------------------
/contextionary/core/vector.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package contextionary
13 |
14 | import (
15 | "fmt"
16 | "math"
17 | )
18 |
// Opaque type that models a fixed-length vector.
type Vector struct {
	vector []float32
	Source []InputElement
}

// InputElement describes one input concept that contributed to a vector,
// with its weight and its occurrence count in the contextionary.
type InputElement struct {
	Concept string
	Weight float64
	Occurrence uint64
}
30 |
// NewVector wraps a raw float32 slice in a Vector. The slice is NOT copied;
// the caller must not mutate it afterwards.
func NewVector(vector []float32) Vector {
	return Vector{vector: vector}
}
34 |
35 | func (v *Vector) Equal(other *Vector) (bool, error) {
36 | if len(v.vector) != len(other.vector) {
37 | return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
38 | }
39 |
40 | for i, v := range v.vector {
41 | if other.vector[i] != v {
42 | return false, nil
43 | }
44 | }
45 |
46 | return true, nil
47 | }
48 |
49 | func (v *Vector) EqualEpsilon(other *Vector, epsilon float32) (bool, error) {
50 | if len(v.vector) != len(other.vector) {
51 | return false, fmt.Errorf("Vectors have different dimensions; %v vs %v", len(v.vector), len(other.vector))
52 | }
53 |
54 | for i, v := range v.vector {
55 | v_min := v - epsilon
56 | v_max := v + epsilon
57 | if other.vector[i] < v_min && other.vector[i] > v_max {
58 | return false, nil
59 | }
60 | }
61 |
62 | return true, nil
63 | }
64 |
// Len returns the number of dimensions of the vector.
func (v *Vector) Len() int {
	return len(v.vector)
}
68 |
69 | func (v *Vector) ToString() string {
70 | str := "["
71 | first := true
72 | for _, i := range v.vector {
73 | if first {
74 | first = false
75 | } else {
76 | str += ", "
77 | }
78 |
79 | str += fmt.Sprintf("%.6f", i)
80 | }
81 |
82 | str += "]"
83 |
84 | return str
85 | }
86 |
87 | func (v *Vector) ToArray() []float32 {
88 |
89 | var returner []float32
90 |
91 | for _, i := range v.vector {
92 | returner = append(returner, i)
93 | }
94 |
95 | return returner
96 | }
97 |
98 | func (v *Vector) Distance(other *Vector) (float32, error) {
99 | var sum float32
100 |
101 | if len(v.vector) != len(other.vector) {
102 | return 0.0, fmt.Errorf("Vectors have different dimensions")
103 | }
104 |
105 | for i := 0; i < len(v.vector); i++ {
106 | x := v.vector[i] - other.vector[i]
107 | sum += x * x
108 | }
109 |
110 | return float32(math.Sqrt(float64(sum))), nil
111 | }
112 |
--------------------------------------------------------------------------------
/contextionary/core/wordlist.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */
12 | package contextionary
13 |
14 | // //// #include
15 | // //import "C"
16 |
17 | import (
18 | "bytes"
19 | "encoding/binary"
20 | "encoding/json"
21 | "fmt"
22 | "os"
23 | "sort"
24 | "syscall"
25 | )
26 |
// Wordlist provides access to an mmapped wordlist file: a fixed 24-byte
// header, a JSON metadata blob, and a lookup table of occurrence-prefixed,
// NUL-terminated words (see LoadWordlist for the exact layout).
type Wordlist struct {
	vectorWidth uint64
	numberOfWords uint64
	metadata map[string]interface{}
	// occurrencePercentiles[p] holds the value for percentile p (0-100),
	// precomputed by initOccurrencePercentiles
	occurrencePercentiles []uint64

	// NOTE(review): this field is never assigned in the visible code and is
	// an os.File value rather than *os.File — confirm whether it is needed
	file os.File
	startOfTable int
	mmap []byte
}
37 |
38 | func LoadWordlist(path string) (*Wordlist, error) {
39 | file, err := os.Open(path)
40 | if err != nil {
41 | return nil, fmt.Errorf("Can't open the wordlist at %s: %+v", path, err)
42 | }
43 |
44 | file_info, err := file.Stat()
45 | if err != nil {
46 | return nil, fmt.Errorf("Can't stat the wordlist at %s: %+v", path, err)
47 | }
48 |
49 | mmap, err := syscall.Mmap(int(file.Fd()), 0, int(file_info.Size()), syscall.PROT_READ, syscall.MAP_SHARED)
50 | if err != nil {
51 | return nil, fmt.Errorf("Can't mmap the file %s: %+v", path, err)
52 | }
53 |
54 | nrWordsBytes := mmap[0:8]
55 | vectorWidthBytes := mmap[8:16]
56 | metadataLengthBytes := mmap[16:24]
57 |
58 | nrWords := binary.LittleEndian.Uint64(nrWordsBytes)
59 | vectorWidth := binary.LittleEndian.Uint64(vectorWidthBytes)
60 | metadataLength := binary.LittleEndian.Uint64(metadataLengthBytes)
61 |
62 | metadataBytes := mmap[24 : 24+metadataLength]
63 | var metadata map[string]interface{}
64 |
65 | json.Unmarshal(metadataBytes, &metadata)
66 |
67 | // Compute beginning of word list lookup table.
68 | var start_of_table int = 24 + int(metadataLength)
69 | var offset int = 4 - (start_of_table % 4)
70 | start_of_table += offset
71 |
72 | wl := &Wordlist{
73 | vectorWidth: vectorWidth,
74 | numberOfWords: nrWords,
75 | metadata: metadata,
76 | startOfTable: start_of_table,
77 | mmap: mmap,
78 | }
79 |
80 | wl.initOccurrencePercentiles()
81 |
82 | return wl, nil
83 | }
84 |
// GetNumberOfWords returns the total number of words in the wordlist.
func (w *Wordlist) GetNumberOfWords() ItemIndex {
	return ItemIndex(w.numberOfWords)
}
88 |
// OccurrencePercentile returns the precomputed occurrence value at the given
// percentile (0-100 inclusive). Out-of-range arguments panic, as they are a
// programming error rather than user input.
func (w *Wordlist) OccurrencePercentile(percentile int) uint64 {
	if percentile < 0 || percentile > 100 {
		panic("incorrect usage of occurrence percentile, must be between 0 and 100")
	}

	return w.occurrencePercentiles[percentile]
}
96 |
// FindIndexByWord binary-searches the sorted word lookup table for the given
// word and returns its index, or -1 if the word is not present. The needle
// is NUL-terminated before comparison to match the on-disk representation.
func (w *Wordlist) FindIndexByWord(_needle string) ItemIndex {
	var needle = string([]byte(_needle))
	needle += "\x00"

	var bytes_needle = []byte(needle)

	var low ItemIndex = 0
	var high ItemIndex = ItemIndex(w.numberOfWords) - 1

	for low <= high {
		var midpoint ItemIndex = (low + high) / 2

		ptr := w.getWordPtr(midpoint)

		// if the last word in the index is shorter than our needle, we would panic
		// by accessing a non-existing address. To prevent this, the higher boundary
		// can never be higher than the len(index)-1
		endPos := 8 + len(bytes_needle)
		if endPos >= len(ptr) {
			endPos = len(ptr) - 1
		}

		// ignore the first 8 bytes as they are reserved for occurrence
		word := ptr[8:endPos]

		var cmp = bytes.Compare(bytes_needle, word)

		if cmp == 0 {
			return midpoint
		} else if cmp < 0 {
			high = midpoint - 1
		} else {
			low = midpoint + 1
		}
	}

	return -1
}
135 |
// getWordPtr resolves the lookup-table entry for the given index and returns
// a slice into the mmapped file starting at that word's record: 8 bytes of
// occurrence count followed by the NUL-terminated word.
func (w *Wordlist) getWordPtr(index ItemIndex) []byte {
	entry_addr := ItemIndex(w.startOfTable) + index*8
	word_address_bytes := w.mmap[entry_addr : entry_addr+8]
	word_address := binary.LittleEndian.Uint64(word_address_bytes)
	return w.mmap[word_address:]
}
142 |
143 | func (w *Wordlist) getWord(index ItemIndex) (string, uint64) {
144 | ptr := w.getWordPtr(index)
145 | occurrence := binary.LittleEndian.Uint64(ptr[0:8])
146 | for i := 8; i < len(ptr); i++ {
147 | if ptr[i] == '\x00' {
148 | return string(ptr[8:i]), occurrence
149 | }
150 | }
151 |
152 | return "", 0
153 | }
154 |
155 | func (w *Wordlist) initOccurrencePercentiles() {
156 | w.occurrencePercentiles = make([]uint64, 101) // make 101 elements longs, so both index 0 and 100 are included
157 | max := int(w.GetNumberOfWords())
158 | allOccs := make([]uint64, max)
159 |
160 | for i := ItemIndex(0); int(i) < max; i++ {
161 | _, occ := w.getWord(i)
162 | allOccs[i] = occ
163 | }
164 |
165 | sort.Slice(allOccs, func(a, b int) bool { return allOccs[a] < allOccs[b] })
166 |
167 | for i := 0; i <= 100; i++ { // note that this is 101 elements!
168 | if i == 0 {
169 | w.occurrencePercentiles[i] = 0
170 | continue
171 | }
172 |
173 | if i == 100 {
174 | w.occurrencePercentiles[i] = allOccs[len(allOccs)-1]
175 | continue
176 | }
177 |
178 | occ := uint64(float64(i) / 100 * float64(len(allOccs)))
179 | w.occurrencePercentiles[i] = occ
180 | }
181 | }
182 |
--------------------------------------------------------------------------------
/contextionary/schema/contextionary.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package schema
12 |
13 | import contextionary "github.com/weaviate/contextionary/contextionary/core"
14 |
// Contextionary composes a regular contextionary with additional
// schema-related query methods. The embedded interface keeps the full core
// API available on this type.
type Contextionary struct {
	contextionary.Contextionary
}
20 |
// New creates a new Contextionary from a contextionary.Contextionary which it
// extends with Schema-related search methods
func New(c contextionary.Contextionary) *Contextionary {
	return &Contextionary{
		Contextionary: c,
	}
}
28 |
--------------------------------------------------------------------------------
/contextionary/schema/schema_search.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package schema
12 |
13 | import (
14 | "fmt"
15 | "regexp"
16 | "strings"
17 |
18 | "github.com/fatih/camelcase"
19 | pb "github.com/weaviate/contextionary/contextionary"
20 | contextionary "github.com/weaviate/contextionary/contextionary/core"
21 | "github.com/weaviate/contextionary/errors"
22 | )
23 |
// SearchResult is a single search result. See wrapping Search Results for the Type
type SearchResult struct {
	Name string
	Certainty float32
}

// SearchResults is grouping of SearchResults for a SchemaSearch
type SearchResults struct {
	Type SearchType
	Results []SearchResult
}

// Len of the result set
func (r SearchResults) Len() int {
	return len(r.Results)
}
40 |
41 | // SchemaSearch can be used to search for related classes and properties, see
42 | // documentation of SearchParams for more details on how to use it and
43 | // documentation on *pb.SchemaSearchResults for more details on how to use the return
44 | // value
45 | func (con *Contextionary) SchemaSearch(params *pb.SchemaSearchParams) (*pb.SchemaSearchResults, error) {
46 | p := SearchParams{params}
47 | if err := p.Validate(); err != nil {
48 | return nil, errors.NewInvalidUserInputf("invalid search params: %s", err)
49 | }
50 |
51 | centroid, err := con.centroidFromNameAndKeywords(p)
52 | if err != nil {
53 | return nil, errors.NewInvalidUserInputf("could not build centroid from name and keywords: %s", err)
54 | }
55 |
56 | rawResults, err := con.knnSearch(*centroid)
57 | if err != nil {
58 | return nil, errors.NewInternalf("could not perform knn search: %s", err)
59 | }
60 |
61 | if p.SearchType == pb.SearchType_CLASS {
62 | return con.handleClassSearch(p, rawResults)
63 | }
64 |
65 | // since we have passed validation we know that anything that's not a class
66 | // search must be a property search
67 | return con.handlePropertySearch(p, rawResults)
68 | }
69 |
70 | func (con *Contextionary) centroidFromNameAndKeywords(p SearchParams) (*contextionary.Vector, error) {
71 | nameVector, err := con.camelCaseWordToVector(p.Name)
72 | if err != nil {
73 | return nil, fmt.Errorf("invalid name in search: %s", err)
74 | }
75 |
76 | if len(p.Keywords) == 0 {
77 | return nameVector, nil
78 | }
79 |
80 | vectors := make([]contextionary.Vector, len(p.Keywords)+1, len(p.Keywords)+1)
81 | weights := make([]float32, len(p.Keywords)+1, len(p.Keywords)+1)
82 | // set last vector to className which always has weight=1
83 | vectors[len(vectors)-1] = *nameVector
84 | weights[len(vectors)-1] = 1
85 |
86 | for i, keyword := range p.Keywords {
87 | kwVector, err := con.wordToVector(keyword.Keyword)
88 | if err != nil {
89 | return nil, fmt.Errorf("invalid keyword in search: %s", err)
90 | }
91 | vectors[i] = *kwVector
92 | weights[i] = keyword.Weight
93 | }
94 |
95 | return contextionary.ComputeWeightedCentroid(vectors, weights)
96 | }
97 |
98 | func (con *Contextionary) camelCaseWordToVector(w string) (*contextionary.Vector, error) {
99 | parts := camelcase.Split(w)
100 | if len(parts) == 1 {
101 | // no camelcasing, no need to build a centroid
102 | return con.wordToVector(w)
103 | }
104 |
105 | vectors := make([]contextionary.Vector, len(parts), len(parts))
106 | weights := make([]float32, len(parts), len(parts))
107 | for i, part := range parts {
108 | v, err := con.wordToVector(part)
109 | if err != nil {
110 | return nil, fmt.Errorf("invalid camelCased compound word: %s", err)
111 | }
112 |
113 | vectors[i] = *v
114 | weights[i] = 1 // on camel-casing all parts are weighted equally
115 | }
116 |
117 | return contextionary.ComputeWeightedCentroid(vectors, weights)
118 | }
119 |
120 | func (con *Contextionary) wordToVector(w string) (*contextionary.Vector, error) {
121 | w = strings.ToLower(w)
122 | itemIndex := con.WordToItemIndex(w)
123 | if ok := itemIndex.IsPresent(); !ok {
124 | return nil, fmt.Errorf(
125 | "the word '%s' is not present in the contextionary and therefore not a valid search term", w)
126 | }
127 |
128 | vector, err := con.GetVectorForItemIndex(itemIndex)
129 | if err != nil {
130 | return nil, fmt.Errorf("could not get vector for word '%s' with itemIndex '%d': %s",
131 | w, itemIndex, err)
132 | }
133 |
134 | return vector, nil
135 | }
136 |
// handleClassSearch wraps the class-name extraction of the raw kNN results
// into the protobuf result type.
func (con *Contextionary) handleClassSearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) {
	return &pb.SchemaSearchResults{
		Type: p.SearchType,
		Results: search.extractClassNames(p),
	}, nil
}
143 |
// handlePropertySearch wraps the property-name extraction of the raw kNN
// results into the protobuf result type.
func (con *Contextionary) handlePropertySearch(p SearchParams, search rawResults) (*pb.SchemaSearchResults, error) {
	return &pb.SchemaSearchResults{
		Type: p.SearchType,
		Results: search.extractPropertyNames(p),
	}, nil
}
150 |
151 | func (con *Contextionary) knnSearch(vector contextionary.Vector) (rawResults, error) {
152 | list, distances, err := con.GetNnsByVector(vector, 10000, 3)
153 | if err != nil {
154 | return nil, fmt.Errorf("could not get nearest neighbors for vector '%v': %s", vector, err)
155 | }
156 |
157 | results := make(rawResults, len(list), len(list))
158 | for i := range list {
159 | word, err := con.ItemIndexToWord(list[i])
160 | if err != nil {
161 | return results, fmt.Errorf("got a result from kNN search, but don't have a word for this index: %s", err)
162 | }
163 |
164 | results[i] = rawResult{
165 | name: word,
166 | distance: distances[i],
167 | }
168 | }
169 |
170 | return results, nil
171 | }
172 |
// rawResult is a helper struct to contain the results of the kNN-search. It
// does not yet contain the desired output. This means the names can be both
// classes/properties and arbitrary words. Furthermore the certainty has not
// yet been normalized, so it is merely the raw kNN distance
type rawResult struct {
	name string
	distance float32
}

// rawResults is the ordered result list of one kNN search.
type rawResults []rawResult
183 |
184 | func (r rawResults) extractClassNames(p SearchParams) []*pb.SchemaSearchResult {
185 | var results []*pb.SchemaSearchResult
186 | regex := regexp.MustCompile(fmt.Sprintf("^\\$%s\\[([A-Za-z]+)\\]$", "OBJECT"))
187 |
188 | for _, rawRes := range r {
189 | if regex.MatchString(rawRes.name) {
190 | certainty := distanceToCertainty(rawRes.distance)
191 | if certainty < p.Certainty {
192 | continue
193 | }
194 |
195 | results = append(results, &pb.SchemaSearchResult{
196 | Name: regex.FindStringSubmatch(rawRes.name)[1], //safe because we ran .MatchString before
197 | Certainty: certainty,
198 | })
199 | }
200 | }
201 |
202 | return results
203 | }
204 |
205 | func (r rawResults) extractPropertyNames(p SearchParams) []*pb.SchemaSearchResult {
206 | var results []*pb.SchemaSearchResult
207 | regex := regexp.MustCompile("^\\$[A-Za-z]+\\[[A-Za-z]+\\]\\[([A-Za-z]+)\\]$")
208 |
209 | propsMap := map[string][]*pb.SchemaSearchResult{}
210 |
211 | for _, rawRes := range r {
212 | if regex.MatchString(rawRes.name) {
213 | name := regex.FindStringSubmatch(rawRes.name)[1] //safe because we ran .MatchString before
214 | certainty := distanceToCertainty(rawRes.distance)
215 | if certainty < p.Certainty {
216 | continue
217 | }
218 |
219 | res := &pb.SchemaSearchResult{
220 | Name: name,
221 | Certainty: certainty,
222 | }
223 | if _, ok := propsMap[name]; !ok {
224 | propsMap[name] = []*pb.SchemaSearchResult{res}
225 | } else {
226 | propsMap[name] = append(propsMap[name], res)
227 | }
228 | }
229 | }
230 |
231 | // now calculate mean of duplicate results
232 | for _, resultsPerName := range propsMap {
233 | results = append(results, &pb.SchemaSearchResult{
234 | Name: resultsPerName[0].Name,
235 | Certainty: meanCertainty(resultsPerName),
236 | })
237 | }
238 |
239 | return results
240 | }
241 |
242 | func meanCertainty(rs []*pb.SchemaSearchResult) float32 {
243 | var compound float32
244 | for _, r := range rs {
245 | compound += r.Certainty
246 | }
247 |
248 | return compound / float32(len(rs))
249 | }
250 |
// distanceToCertainty normalizes a raw kNN distance into a 0-1 certainty.
// NOTE(review): the divisor 12 looks like an assumed maximum distance —
// confirm against the contextionary's actual distance scale.
func distanceToCertainty(d float32) float32 {
	return 1 - d/12
}
254 |
--------------------------------------------------------------------------------
/contextionary/schema/schema_search_params.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package schema
12 |
13 | import (
14 | "fmt"
15 |
16 | "github.com/fatih/camelcase"
17 | pb "github.com/weaviate/contextionary/contextionary"
18 | )
19 |
// SearchType to search for either class names or property names
type SearchType string

const (
	// SearchTypeClass to search the contextionary for class names
	SearchTypeClass SearchType = "class"
	// SearchTypeProperty to search the contextionary for property names
	SearchTypeProperty SearchType = "property"
)

// SearchParams to be used for a SchemaSearch. See individual properties for
// additional documentation on what they do. It embeds the protobuf params
// and adds validation on top.
type SearchParams struct {
	*pb.SchemaSearchParams
}
35 |
36 | // Validate the feasibility of the specified arguments
37 | func (p SearchParams) Validate() error {
38 | if p.Name == "" {
39 | return fmt.Errorf("Name cannot be empty")
40 | }
41 |
42 | if err := p.validateCertaintyOrWeight(p.Certainty); err != nil {
43 | return fmt.Errorf("invalid Certainty: %s", err)
44 | }
45 |
46 | if p.SearchType != pb.SearchType_CLASS && p.SearchType != pb.SearchType_PROPERTY {
47 | return fmt.Errorf(
48 | "SearchType must be SearchType_CLASS or SearchType_PROPERTY, but got '%s'", p.SearchType)
49 | }
50 |
51 | for i, keyword := range p.Keywords {
52 | if err := p.validateKeyword(keyword); err != nil {
53 | return fmt.Errorf("invalid keyword at position %d: %s", i, err)
54 | }
55 | }
56 |
57 | return nil
58 | }
59 |
// validateKeyword ensures a keyword is non-empty, not camelCased (compound
// keywords must be split so each part can carry its own weight), and that
// its weight lies within [0, 1].
func (p SearchParams) validateKeyword(kw *pb.Keyword) error {
	if kw.Keyword == "" {
		return fmt.Errorf("Keyword cannot be empty")
	}

	if len(camelcase.Split(kw.Keyword)) > 1 {
		return fmt.Errorf("invalid Keyword: keywords cannot be camelCased - "+
			"instead split your keyword up into several keywords, this way each word "+
			"of your camelCased string can have its own weight, got '%s'", kw.Keyword)
	}

	if err := p.validateCertaintyOrWeight(kw.Weight); err != nil {
		return fmt.Errorf("invalid Weight: %s", err)
	}

	return nil
}
77 |
78 | func (p SearchParams) validateCertaintyOrWeight(c float32) error {
79 | if c >= 0 && c <= 1 {
80 | return nil
81 | }
82 |
83 | return fmt.Errorf("must be between 0 and 1, but got '%f'", c)
84 | }
85 |
--------------------------------------------------------------------------------
/contextionary/schema/schema_search_params_test.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package schema
12 |
13 | import (
14 | "errors"
15 | "testing"
16 |
17 | "github.com/stretchr/testify/assert"
18 | "github.com/weaviate/contextionary/contextionary"
19 | )
20 |
// Test__SchemaSearch_Validation runs SearchParams.Validate against a table of
// valid and invalid parameter combinations and asserts the exact error (or
// nil) returned for each case via AssertValidation.
func Test__SchemaSearch_Validation(t *testing.T) {
	tests := schemaSearchTests{
		{
			// fully valid baseline case
			name: "valid params",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
				},
			},
			expectedError: nil,
		},
		{
			name: "missing search name",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "",
					Certainty:  0.0,
				},
			},
			expectedError: errors.New("Name cannot be empty"),
		},
		{
			// certainty is validated to the inclusive range [0, 1]
			name: "certainty too low",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  -4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '-4.000000'"),
		},
		{
			name: "certainty too high",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  4,
				},
			},
			expectedError: errors.New("invalid Certainty: must be between 0 and 1, but got '4.000000'"),
		},
		{
			// NOTE(review): relies on a Kind check in Validate that is not
			// visible in this excerpt - confirm against schema_search_params.go
			name: "missing kind on class search",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "bestName",
					Certainty:  0.5,
				},
			},
			expectedError: errors.New("Kind cannot be empty"),
		},
		{
			name: "valid keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "foobar",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: nil,
		},
		{
			name: "keywords with empty names",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{
						{
							Keyword: "",
							Weight:  1.0,
						},
					},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: Keyword cannot be empty"),
		},
		{
			// keyword weights share the same [0, 1] validation as certainty
			name: "keywords with invalid weights",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "bestkeyword",
						Weight:  1.3,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Weight: " +
				"must be between 0 and 1, but got '1.300000'"),
		},
		{
			// camelCased keywords are rejected so each word can be weighted
			// individually
			name: "CamelCased keywords",
			searchParams: SearchParams{
				SchemaSearchParams: &contextionary.SchemaSearchParams{
					SearchType: contextionary.SearchType_CLASS,
					Name:       "foo",
					Certainty:  1.0,
					Keywords: []*contextionary.Keyword{{
						Keyword: "worstKeyword",
						Weight:  0.8,
					}},
				},
			},
			expectedError: errors.New("invalid keyword at position 0: invalid Keyword: " +
				"keywords cannot be camelCased - instead split your keyword up into several keywords, " +
				"this way each word of your camelCased string can have its own weight, got 'worstKeyword'"),
		},
	}

	tests.AssertValidation(t)
}
149 |
150 | func (s schemaSearchTests) AssertValidation(t *testing.T) {
151 | for _, test := range s {
152 | t.Run(test.name, func(t *testing.T) {
153 | err := test.searchParams.Validate()
154 |
155 | // assert error
156 | assert.Equal(t, test.expectedError, err, "should match the expected error")
157 |
158 | })
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/errors/errors.go:
--------------------------------------------------------------------------------
1 | package errors
2 |
3 | import "fmt"
4 |
// InvalidUserInput indicates a client-side error
type InvalidUserInput struct {
	msg string
}

// Error implements the error interface.
func (e InvalidUserInput) Error() string {
	return e.msg
}

// NewInvalidUserInputf constructs an InvalidUserInput from an fmt.Errorf-style
// format string and arguments.
func NewInvalidUserInputf(format string, args ...interface{}) InvalidUserInput {
	msg := fmt.Sprintf(format, args...)
	return InvalidUserInput{msg: msg}
}
18 |
// Internal indicates something went wrong during processing
type Internal struct {
	msg string
}

// Error implements the error interface.
func (e Internal) Error() string {
	return e.msg
}

// NewInternalf constructs an Internal error from an fmt.Errorf-style format
// string and arguments.
func NewInternalf(format string, args ...interface{}) Internal {
	msg := fmt.Sprintf(format, args...)
	return Internal{msg: msg}
}
32 |
// NotFound indicates the desired resource doesn't exist
type NotFound struct {
	msg string
}

// Error implements the error interface.
func (e NotFound) Error() string {
	return e.msg
}

// NewNotFoundf constructs a NotFound error from an fmt.Errorf-style format
// string and arguments.
func NewNotFoundf(format string, args ...interface{}) NotFound {
	msg := fmt.Sprintf(format, args...)
	return NotFound{msg: msg}
}
46 |
--------------------------------------------------------------------------------
/extensions/extension.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
// Extension is a user-defined custom concept, stored together with the
// vector calculated for it.
type Extension struct {
	// Concept is the (snake_cased) name the extension is stored under.
	Concept string `json:"concept"`
	// Vector is the vector calculated from the input's definition.
	Vector []float32 `json:"vector"`
	// Occurrence weights the concept; currently hard-coded to 1000 by Storer.
	Occurrence int `json:"occurrence"`
	// Input is the original user input that produced this extension.
	Input ExtensionInput `json:"input"`
}

// ExtensionInput is the user-provided part of an extension.
type ExtensionInput struct {
	// Definition is the free-text definition that gets vectorized.
	Definition string `json:"definition"`
	// Weight must be between 0 and 1; currently only exactly 1 is accepted
	// (see Storer.validate).
	Weight float32 `json:"weight"`
}
14 |
--------------------------------------------------------------------------------
/extensions/looker_upper.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "sync"
5 | )
6 |
// LookerUpper keeps an in-memory view of all stored extensions and answers
// concept lookups against it. The embedded mutex guards db, which is
// updated asynchronously by the watcher goroutine started in initWatcher.
type LookerUpper struct {
	repo RetrieverRepo
	sync.Mutex
	db map[string]Extension
}

// RetrieverRepo is the read side of the extension storage.
type RetrieverRepo interface {
	// WatchAll must send an immediate response after opening (for
	// initialization), then send another response whenever the db has changed
	WatchAll() chan WatchResponse
}
18 |
19 | func NewLookerUpper(repo RetrieverRepo) *LookerUpper {
20 | lu := &LookerUpper{
21 | repo: repo,
22 | db: map[string]Extension{},
23 | }
24 | lu.initWatcher()
25 | return lu
26 | }
27 |
28 | func (lu *LookerUpper) Lookup(concept string) (*Extension, error) {
29 | lu.Lock()
30 | defer lu.Unlock()
31 |
32 | ext, ok := lu.db[concept]
33 | if !ok {
34 | return nil, nil
35 | }
36 |
37 | return &ext, nil
38 | }
39 |
// WatchResponse is the full list of extensions sent on every repo update.
type WatchResponse []Extension
41 |
// initWatcher subscribes to the repo and spawns a goroutine that folds every
// received update into the local db. The goroutine exits when the repo
// closes the channel.
func (lu *LookerUpper) initWatcher() {
	updateCh := lu.repo.WatchAll()

	go func() {
		for res := range updateCh {
			lu.updateDB(res)
		}
	}()
}
51 |
52 | func (lu *LookerUpper) updateDB(list []Extension) {
53 | lu.Lock()
54 | defer lu.Unlock()
55 |
56 | for _, ext := range list {
57 | lu.db[ext.Concept] = ext
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/extensions/looker_upper_test.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/stretchr/testify/assert"
8 | "github.com/stretchr/testify/require"
9 | )
10 |
// Test_LookerUpper exercises lookups against the watcher-fed in-memory db:
// before any data arrives, after a first update, and after a second update
// (which must extend, not replace, the db).
func Test_LookerUpper(t *testing.T) {
	t.Run("looking up a non-existant concept", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)
		extension, err := lu.Lookup("non_existing_concept")
		require.Nil(t, err)
		assert.Nil(t, extension)
	})

	t.Run("looking up existing concepts", func(t *testing.T) {
		repo := newFakeRepo()
		lu := NewLookerUpper(repo)

		t.Run("with an initial concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "flux_capacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			// give the watcher goroutine time to consume the update
			// NOTE(review): sleep-based synchronization can flake under load
			time.Sleep(100 * time.Millisecond)
			actual, err := lu.Lookup("flux_capacitor")
			require.Nil(t, err)
			assert.Equal(t, &ext, actual)
		})

		t.Run("with second concept", func(t *testing.T) {
			ext := Extension{
				Concept:    "clux_fapacitor",
				Vector:     []float32{0, 1, 2},
				Occurrence: 1000,
			}
			repo.add(ext)
			time.Sleep(100 * time.Millisecond)

			// the first concept must survive the second update
			t.Run("looking up the original concept", func(t *testing.T) {
				actual, err := lu.Lookup("flux_capacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "flux_capacitor", actual.Concept)
			})

			t.Run("looking up the second concept concept", func(t *testing.T) {
				actual, err := lu.Lookup("clux_fapacitor")
				require.Nil(t, err)
				require.NotNil(t, actual)
				assert.Equal(t, "clux_fapacitor", actual.Concept)
			})
		})
	})
}
62 |
63 | func newFakeRepo() *fakeRepo {
64 | repo := &fakeRepo{
65 | ch: make(chan WatchResponse),
66 | }
67 |
68 | return repo
69 | }
70 |
// fakeRepo is an in-memory RetrieverRepo for tests: it replays the full
// extension list over an unbuffered channel on every add.
type fakeRepo struct {
	ch         chan WatchResponse
	extensions []Extension
}
75 |
// WatchAll implements RetrieverRepo by handing out the fake's channel.
func (f *fakeRepo) WatchAll() chan WatchResponse {
	return f.ch
}
79 |
// add appends ex and pushes the full extension list onto the watch channel.
// The channel is unbuffered, so this blocks until a watcher consumes it.
func (f *fakeRepo) add(ex Extension) {
	f.extensions = append(f.extensions, ex)
	f.ch <- f.extensions
}
84 |
--------------------------------------------------------------------------------
/extensions/storer.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "strings"
7 | "unicode"
8 |
9 | "github.com/sirupsen/logrus"
10 | core "github.com/weaviate/contextionary/contextionary/core"
11 | "github.com/weaviate/contextionary/errors"
12 | )
13 |
// Vectorizer produces a single vector for one or more text corpora.
// (overrides presumably maps words to replacement spellings - semantics are
// defined by the implementing vectorizer; verify against corpus_vectorizer.go)
type Vectorizer interface {
	Corpi(corpi []string, overrides map[string]string) (*core.Vector, error)
}

// StorerRepo is the write side of the extension storage.
type StorerRepo interface {
	Put(ctx context.Context, ext Extension) error
}

// Storer validates, vectorizes and persists user-defined extensions.
type Storer struct {
	vectorizer Vectorizer
	repo       StorerRepo
	logger     logrus.FieldLogger
}
27 |
28 | func NewStorer(vectorizer Vectorizer, repo StorerRepo, logger logrus.FieldLogger) *Storer {
29 | return &Storer{vectorizer, repo, logger}
30 | }
31 |
// Put validates concept and input, vectorizes the definition, and persists
// the resulting Extension under the snake_cased concept name. Validation
// failures return an errors.InvalidUserInput; vectorizer or repo failures
// return an errors.Internal.
func (s *Storer) Put(ctx context.Context, concept string, input ExtensionInput) error {
	s.logger.WithField("action", "extensions_put").
		WithField("concept", concept).
		WithField("extension", input).
		Debug("received request to add/replace custom extension")

	err := s.validate(concept, input)
	if err != nil {
		return errors.NewInvalidUserInputf("invalid extension: %v", err)
	}

	vector, err := s.vectorizer.Corpi([]string{input.Definition}, nil)
	if err != nil {
		return errors.NewInternalf("vectorize definition: %v", err)
	}

	// spaces become underscores for storage ("flux capacitor" -> "flux_capacitor")
	concept = s.compound(concept)

	ext := Extension{
		Concept:    concept,
		Input:      input,
		Vector:     vector.ToArray(), // nil-check can be omitted as vectorizer will return non-nil if err==nil
		Occurrence: 1000,             // TODO: Improve!
	}

	s.logger.WithField("action", "extensions_put_prestore").
		WithField("concept", ext.Concept).
		WithField("extension", ext).
		Debug("calculated vector, about to store in repo")

	err = s.repo.Put(ctx, ext)
	if err != nil {
		s.logger.WithField("action", "extensions_store_error").
			WithField("concept", ext.Concept).
			Errorf("repo put: %v", err)
		return errors.NewInternalf("store extension: %v", err)
	}

	s.logger.WithField("action", "extensions_put_poststore").
		WithField("concept", ext.Concept).
		Debug("successfully stored extension in repo")

	return nil
}
76 |
77 | func (s *Storer) compound(inp string) string {
78 | parts := strings.Split(inp, " ")
79 | return strings.Join(parts, "_")
80 | }
81 |
// validate checks the user-supplied concept and input before any vectorizing
// happens, returning a plain error describing the first violation found.
//
// Note: the combination of the last two checks means only Weight == 1 is
// currently accepted; lower weights are explicitly rejected until extending
// existing concepts is supported.
func (s *Storer) validate(concept string, input ExtensionInput) error {
	if len(concept) < 2 {
		return fmt.Errorf("concept must have at least two characters")
	}

	// only lowercase letters, digits and spaces are allowed; spaces mark
	// compound words and are later converted to underscores
	for _, r := range concept {
		if !unicode.IsLower(r) && !unicode.IsSpace(r) && !unicode.IsNumber(r) {
			// NOTE(review): "compund" is misspelled, but storer_test.go asserts
			// this message verbatim - fix both together, not here
			return fmt.Errorf("concept must be made up of all lowercase letters and/or numbers, " +
				"for custom compund words use spaces, e.g. 'flux capacitor'")
		}
	}

	if len(input.Definition) == 0 {
		return fmt.Errorf("definition cannot be empty")
	}

	if input.Weight > 1 || input.Weight < 0 {
		return fmt.Errorf("weight must be between 0 and 1")
	}

	if input.Weight < 1 {
		return fmt.Errorf("weights below 1 (extending an existing concept) not supported yet - coming soon")
	}

	return nil
}
108 |
--------------------------------------------------------------------------------
/extensions/storer_test.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "testing"
7 |
8 | "github.com/sirupsen/logrus/hooks/test"
9 | "github.com/stretchr/testify/assert"
10 | "github.com/stretchr/testify/mock"
11 | "github.com/stretchr/testify/require"
12 | core "github.com/weaviate/contextionary/contextionary/core"
13 | )
14 |
// Test_Storer covers Storer.Put: a table of validation failures, then the two
// happy paths (single-word and compound concepts), asserting the exact
// Extension handed to the repo.
func Test_Storer(t *testing.T) {
	t.Run("with invalid inputs", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		// a valid input reused by cases that test concept-name validation
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		type testCase struct {
			concept     string
			inp         ExtensionInput
			expectedErr error
		}

		tests := []testCase{
			testCase{
				concept:     "lowerAndUpperCase",
				expectedErr: fmt.Errorf("invalid extension: concept must be made up of all lowercase letters and/or numbers, for custom compund words use spaces, e.g. 'flux capacitor'"),
				inp:         inp,
			},
			testCase{
				concept:     "a",
				expectedErr: fmt.Errorf("invalid extension: concept must have at least two characters"),
				inp:         inp,
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: definition cannot be empty"),
				inp:         ExtensionInput{Weight: 1},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: -1, Definition: "foo bar"},
			},
			testCase{
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weight must be between 0 and 1"),
				inp:         ExtensionInput{Weight: 3, Definition: "foo bar"},
			},
			testCase{ // TODO: add feature, then remove limitation
				concept:     "foo",
				expectedErr: fmt.Errorf("invalid extension: weights below 1 (extending an existing concept) not supported yet - coming soon"),
				inp:         ExtensionInput{Weight: 0.7, Definition: "foo bar"},
			},
		}

		for _, test := range tests {
			t.Run(test.concept, func(t *testing.T) {
				err := s.Put(context.Background(), test.concept, test.inp)
				assert.Equal(t, test.expectedErr.Error(), err.Error())
			})
		}
	})

	t.Run("with valid input (single word)", func(t *testing.T) {
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "capacitor"
		inp := ExtensionInput{
			Definition: "an electrical device to store energy in the short term",
			Weight:     1,
		}

		// the fakeVectorizer always returns {1, 2, 3}; Occurrence is the
		// Storer's hard-coded 1000
		expectedExtension := Extension{
			Input:      inp,
			Concept:    concept,
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)

	})

	t.Run("with valid input (compound word)", func(t *testing.T) {
		// this is a special case because users will input their words using
		// spaces, but we store them using snake_case
		repo := &fakeStorerRepo{}
		logger, _ := test.NewNullLogger()
		s := NewStorer(&fakeVectorizer{}, repo, logger)
		concept := "flux capacitor"
		inp := ExtensionInput{
			Definition: "an energy source for cars to travel through time",
			Weight:     1,
		}

		expectedExtension := Extension{
			Input:      inp,
			Concept:    "flux_capacitor",
			Vector:     []float32{1, 2, 3},
			Occurrence: 1000,
		}
		repo.On("Put", expectedExtension).Return(nil)
		err := s.Put(context.Background(), concept, inp)
		require.Nil(t, err)
		repo.AssertExpectations(t)
	})
}
119 |
// fakeVectorizer satisfies Vectorizer with a fixed vector, so tests don't
// need a real contextionary.
type fakeVectorizer struct{}

// Corpi ignores its input and always returns the vector {1, 2, 3}.
func (f *fakeVectorizer) Corpi(corpi []string, overrides map[string]string) (*core.Vector, error) {
	v := core.NewVector([]float32{1, 2, 3})
	return &v, nil
}
126 |
// fakeStorerRepo is a testify mock of StorerRepo, used to assert exactly
// which Extension gets stored.
type fakeStorerRepo struct {
	mock.Mock
}

// Put records the call (ctx is deliberately not part of the expectation) and
// returns the programmed error.
func (f *fakeStorerRepo) Put(ctx context.Context, ext Extension) error {
	args := f.Called(ext)
	return args.Error(0)
}
135 |
--------------------------------------------------------------------------------
/gen_proto_code.sh:
--------------------------------------------------------------------------------
1 | protoc -I contextionary/ contextionary/contextionary.proto --go_out=plugins=grpc:contextionary
2 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/weaviate/contextionary
2 |
3 | go 1.13
4 |
5 | require (
6 | github.com/fatih/camelcase v1.0.0
7 | github.com/golang/protobuf v1.4.3
8 | github.com/golang/snappy v0.0.3 // indirect
9 | github.com/jessevdk/go-flags v1.4.0
10 | github.com/onsi/ginkgo v1.15.2 // indirect
11 | github.com/onsi/gomega v1.11.0 // indirect
12 | github.com/sirupsen/logrus v1.6.0
13 | github.com/stretchr/testify v1.6.1
14 | github.com/syndtr/goleveldb v0.0.0-20180708030551-c4c61651e9e3
15 | google.golang.org/grpc v1.24.0
16 | )
17 |
--------------------------------------------------------------------------------
/logparser/parse.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 | "os"
9 | )
10 |
// logEntry is one structured JSON log line; only entries whose Action is
// "debug_vector_weights" are kept by this tool.
type logEntry struct {
	Action string `json:"action"`
	Words  []word `json:"words"`
}

// word is a single weighted word as it appears in a debug_vector_weights
// log entry.
type word struct {
	Occurrence int     `json:"occurrence"`
	Weight     float64 `json:"weight"`
	Word       string  `json:"word"`
}
21 |
22 | func main() {
23 | scanner := bufio.NewScanner(os.Stdin)
24 | var results []logEntry
25 |
26 | for scanner.Scan() {
27 | var current logEntry
28 | err := json.Unmarshal(scanner.Bytes(), ¤t)
29 | if err != nil {
30 | log.Fatal(err)
31 | }
32 |
33 | if current.Action == "debug_vector_weights" {
34 | results = append(results, current)
35 | }
36 | }
37 |
38 | marshalled, err := json.MarshalIndent(results, "", " ")
39 | if err != nil {
40 | log.Fatal(err)
41 | }
42 |
43 | fmt.Print(string(marshalled))
44 | }
45 |
--------------------------------------------------------------------------------
/main/splitter_preprocessor.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/weaviate/contextionary/preprocessing"
8 | )
9 |
10 | func main() {
11 | if len(os.Args) != 5 {
12 | missing := fmt.Errorf("Missing arguments requires: [.idx, .dic, .aff, output_file]")
13 | panic(missing.Error())
14 | }
15 |
16 | err := preprocessing.GenerateSplittingDictFile(os.Args[1], os.Args[2], os.Args[3], os.Args[4])
17 | if err != nil {
18 | panic(err.Error())
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/prepare_docker_buildx.sh:
--------------------------------------------------------------------------------
1 | docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
2 | docker buildx create --name multiarch --driver docker-container --use
3 | docker buildx inspect --bootstrap
4 |
--------------------------------------------------------------------------------
/preprocessing/dictionary_pre_processing.go:
--------------------------------------------------------------------------------
1 | package preprocessing
2 |
import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"strings"
)
11 |
// PreprocessDict temp storage for reading in the index file
type PreprocessDict struct {
	// dict maps each accepted word to its occurrence count
	dict map[string]int
}
16 |
17 | // GenerateSplittingDictFile from
18 | //
19 | // contextionaryIndexFile binary .idx file containing the words for the specific language
20 | // languageDictionaryFile a hunspell .dic file for the specific language
21 | // languageAffixesFile a hunspell .aff file for the specific language
22 | // to reduce file- and hunspell dependencies for the splitter
23 | func GenerateSplittingDictFile(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string, outputFile string) error {
24 | dict := NewPreprocessDict(contextionaryIndexFile, languageDictionaryFile, languageAffixesFile)
25 | out, err := os.Create(outputFile)
26 | if err != nil {
27 | return err
28 | }
29 | defer out.Close()
30 |
31 | for word, occurrence := range dict.dict {
32 | line := fmt.Sprintf("%s,%v\n", word, occurrence)
33 | _, err := out.Write([]byte(line))
34 | if err != nil {
35 | return err
36 | }
37 | }
38 | return nil
39 | }
40 |
// NewPreprocessDict builds the word/occurrence dictionary from
//
//	contextionaryIndexFile binary .idx file containing the words for the specific language
//	languageDictionaryFile a hunspell .dic file for the specific language
//	languageAffixesFile a hunspell .aff file for the specific language
//
// NOTE(review): panics (rather than returning an error) when the index file
// cannot be read or parsed - callers cannot recover from a bad input file.
func NewPreprocessDict(contextionaryIndexFile string, languageDictionaryFile string, languageAffixesFile string) *PreprocessDict {
	// pre-size for a full contextionary to avoid repeated map growth
	dict := &PreprocessDict{
		dict: make(map[string]int, 1200000),
	}
	hunspellFilter := Hunspell(languageAffixesFile, languageDictionaryFile)

	err := dict.loadContextionary(contextionaryIndexFile, hunspellFilter)
	if err != nil {
		panic(err.Error())
	}
	return dict
}
58 |
// loadContextionary reads the binary contextionary index at path and fills
// cd.dict with every word that passes the hunspell filter, mapped to its
// occurrence count.
func (cd *PreprocessDict) loadContextionary(path string, filter *Hunhandle) error {
	data, readFileErr := ioutil.ReadFile(path)
	if readFileErr != nil {
		return readFileErr
	}

	// File format:
	// https://github.com/weaviate/weaviate-vector-generator#wordlist-file-format
	// header: three little-endian uint64s - word count, vector length
	// (unused here) and metadata length
	nrWordsBytes := data[0:8]
	//vectorLengthBytes := data[8:16]
	metaDataLengthBytes := data[16:24]

	nrWords := binary.LittleEndian.Uint64(nrWordsBytes)
	//vectorLength := binary.LittleEndian.Uint64(vectorLengthBytes)
	metaDataLength := binary.LittleEndian.Uint64(metaDataLengthBytes)

	// Read meta data (parsed but currently unused beyond validation)
	metaDataBytes := data[24 : 24+metaDataLength]
	var metadata map[string]interface{}
	unMarshalErr := json.Unmarshal(metaDataBytes, &metadata)
	if unMarshalErr != nil {
		return unMarshalErr
	}

	// the word table starts 4-byte aligned after the metadata
	// NOTE(review): when startOfTable is already a multiple of 4 this adds a
	// full 4 bytes instead of 0 - presumably the writer pads the same way,
	// but verify against the generator before changing it
	var startOfTable uint64 = 24 + uint64(metaDataLength)
	var offset uint64 = 4 - (startOfTable % 4)
	startOfTable += offset

	for wordIndex := uint64(0); wordIndex < nrWords; wordIndex++ {
		// entryAddress is the index in the data where the pointer to
		// the word is located
		entryAddress := startOfTable + 8*wordIndex
		pointerToWordByte := data[entryAddress : entryAddress+8]
		pointerToWord := binary.LittleEndian.Uint64(pointerToWordByte)
		word, occurence := getWordAndOccurence(data, pointerToWord)
		// Only add the word if it passes the filter
		if passesFilter(word, filter) {
			cd.dict[word] = int(occurence)
		}
	}

	return nil
}
103 |
// getWordAndOccurence reads one dictionary entry starting at pointer: the
// first 8 bytes are the little-endian occurrence count, followed by the
// NUL-terminated word itself.
//
// Uses bytes.IndexByte instead of the previous hand-rolled byte loop, which
// would index past the end of data (and panic) on a malformed, unterminated
// entry; that case now degrades to "rest of the buffer".
func getWordAndOccurence(data []byte, pointer uint64) (string, uint64) {
	occurrence := binary.LittleEndian.Uint64(data[pointer : pointer+8])

	wordStart := pointer + 8
	end := bytes.IndexByte(data[wordStart:], '\x00')
	if end < 0 {
		// unterminated entry: return the remaining bytes rather than panic
		return string(data[wordStart:]), occurrence
	}

	return string(data[wordStart : wordStart+uint64(end)]), occurrence
}
116 |
117 | // passesFilter if the word is in the dictionary of the given language
118 | func passesFilter(word string, filter *Hunhandle) bool {
119 | inDict := filter.Spell(word)
120 | if inDict {
121 | return true
122 | }
123 | // Check if upper case word
124 | inDict = filter.Spell(strings.Title(word))
125 | return inDict
126 | }
127 |
--------------------------------------------------------------------------------
/preprocessing/dictionary_pre_processing_test.go:
--------------------------------------------------------------------------------
1 | package preprocessing
2 |
3 | import (
4 | "bufio"
5 | "os"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/stretchr/testify/assert"
10 | "github.com/weaviate/contextionary/compoundsplitting"
11 | )
12 |
13 | func TestPreprocessorSplitterDictFile(t *testing.T) {
14 | // Create the file
15 | outputFile := "test_dict.splitdict"
16 | GenerateSplittingDictFile("../test/compoundsplitting/contextionary.idx", "../test/compoundsplitting/nl_NL.dic", "../test/compoundsplitting/nl_NL.aff", outputFile)
17 |
18 | // Validate the output file
19 | file, err := os.Open(outputFile)
20 | if err != nil {
21 | t.Fail()
22 | }
23 | defer file.Close()
24 |
25 | scanner := bufio.NewScanner(file)
26 | found := false
27 | for scanner.Scan() {
28 | line := scanner.Text()
29 | split := strings.Split(line, ",")
30 | if split[0] == "appellantes" {
31 | found = true
32 | break
33 | }
34 | }
35 | assert.True(t, found)
36 |
37 | if err := scanner.Err(); err != nil {
38 | t.Fail()
39 | }
40 |
41 | err = file.Close()
42 | if err != nil {
43 | t.Fail()
44 | }
45 |
46 | // Load from output file
47 | dict, err := compoundsplitting.NewContextionaryDict(outputFile)
48 | if err != nil {
49 | t.Fail()
50 | }
51 |
52 | assert.True(t, dict.Contains("amsterdam"))
53 | assert.True(t, dict.Contains("appellante"))
54 | assert.True(t, dict.Contains("appellantes"))
55 |
56 | // Remove test file
57 | err = os.Remove(outputFile)
58 | if err != nil {
59 | t.Fail()
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/preprocessing/hunspell.go:
--------------------------------------------------------------------------------
1 | package preprocessing
2 |
3 | // #cgo linux LDFLAGS: -lhunspell
4 | // #cgo darwin LDFLAGS: -lhunspell-1.7 -L/usr/local/Cellar/hunspell/1.7.0_2/lib
5 | // #cgo darwin CFLAGS: -I/usr/local/Cellar/hunspell/1.7.0_2/include/
6 | //
// #include <stdlib.h>
// #include <stdio.h>
// #include <hunspell/hunspell.h>
10 | import "C"
11 | import (
12 | "reflect"
13 | "runtime"
14 | "sync"
15 | "unsafe"
16 | )
17 |
18 | // Code in this file copied/based on
19 | // https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/hunspell.go
20 | // Original is licensed under "MIT License" Original license located at:
21 | // https://github.com/sthorne/go-hunspell/blob/99efdad5368d3e39a44c8cdaf101c33a4f20f8b9/LICENSE
22 |
// Hunhandle wraps a C hunspell instance. The lock serializes all calls into
// the C library through this handle.
type Hunhandle struct {
	handle *C.Hunhandle
	lock   *sync.Mutex
}
27 |
28 | func Hunspell(affpath string, dpath string) *Hunhandle {
29 |
30 | affpathcs := C.CString(affpath)
31 | defer C.free(unsafe.Pointer(affpathcs))
32 |
33 | dpathcs := C.CString(dpath)
34 | defer C.free(unsafe.Pointer(dpathcs))
35 |
36 | h := &Hunhandle{lock: new(sync.Mutex)}
37 | h.handle = C.Hunspell_create(affpathcs, dpathcs)
38 |
39 | runtime.SetFinalizer(h, func(handle *Hunhandle) {
40 | C.Hunspell_destroy(handle.handle)
41 | h.handle = nil
42 | })
43 |
44 | return h
45 | }
46 |
// CArrayToString copies a C array of l char pointers into a Go string slice.
// The reflect.SliceHeader trick views the C memory as a []*C.char without
// copying; each element is then copied into an owned Go string.
// NOTE(review): reflect.SliceHeader is deprecated in newer Go releases; it
// is fine for the go 1.13 toolchain this module targets.
func CArrayToString(c **C.char, l int) []string {

	s := []string{}

	hdr := reflect.SliceHeader{
		Data: uintptr(unsafe.Pointer(c)),
		Len:  l,
		Cap:  l,
	}

	for _, v := range *(*[]*C.char)(unsafe.Pointer(&hdr)) {
		s = append(s, C.GoString(v))
	}

	return s
}
63 |
// Suggest returns hunspell's spelling suggestions for word. The handle lock
// is held around the C call; the C-side list is freed before returning.
func (handle *Hunhandle) Suggest(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))

	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_suggest(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	// copy into Go memory before releasing the C list
	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}
79 |
80 | func (handle *Hunhandle) Add(word string) bool {
81 |
82 | cWord := C.CString(word)
83 | defer C.free(unsafe.Pointer(cWord))
84 |
85 | var r C.int
86 | r = C.Hunspell_add(handle.handle, cWord)
87 |
88 | if int(r) != 0 {
89 | return false
90 | }
91 |
92 | return true
93 | }
94 |
// Stem returns the possible stems of word as reported by hunspell. Like
// Suggest, it locks around the C call and frees the C-side list after
// copying the results into Go memory.
func (handle *Hunhandle) Stem(word string) []string {
	wordcs := C.CString(word)
	defer C.free(unsafe.Pointer(wordcs))
	var carray **C.char
	var length C.int
	handle.lock.Lock()
	length = C.Hunspell_stem(handle.handle, &carray, wordcs)
	handle.lock.Unlock()

	words := CArrayToString(carray, int(length))

	C.Hunspell_free_list(handle.handle, &carray, length)
	return words
}
109 |
110 | func (handle *Hunhandle) Spell(word string) bool {
111 | wordcs := C.CString(word)
112 | defer C.free(unsafe.Pointer(wordcs))
113 | handle.lock.Lock()
114 | res := C.Hunspell_spell(handle.handle, wordcs)
115 | handle.lock.Unlock()
116 |
117 | if int(res) == 0 {
118 | return false
119 | }
120 | return true
121 | }
122 |
--------------------------------------------------------------------------------
/preprocessing/hunspell_test.go:
--------------------------------------------------------------------------------
1 | package preprocessing
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | )
8 |
// TestImplementation smoke-tests the cgo hunspell bindings against the
// bundled Dutch dictionary fixtures.
func TestImplementation(t *testing.T) {

	hsp := Hunspell("../test/compoundsplitting/nl_NL.aff", "../test/compoundsplitting/nl_NL.dic")

	assert.True(t, hsp.Spell("Amsterdam"))
	assert.True(t, hsp.Spell("appellante"))
	assert.True(t, hsp.Spell("appellantes"))

}
18 |
--------------------------------------------------------------------------------
/server/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "strconv"
7 |
8 | "github.com/sirupsen/logrus"
9 | )
10 |
// Config is used to load application wide config from the environment
type Config struct {
	logger logrus.FieldLogger
	// required file paths (KNN_FILE, IDX_FILE, STOPWORDS_FILE)
	KNNFile       string
	IDXFile       string
	StopwordsFile string

	// optional endpoints/keys for the schema provider and extension storage
	SchemaProviderURL       string
	SchemaProviderKey       string
	ExtensionsPrefix        string
	ExtensionsStorageOrigin string
	ExtensionsStorageMode   string

	// port the server listens on (SERVER_PORT, default 9999)
	ServerPort int

	// weighting and sizing knobs; see init for the env var names and defaults
	OccurrenceWeightStrategy           string
	OccurrenceWeightLinearFactor       float32
	MaxCompoundWordLength              int
	MaximumBatchSize                   int
	MaximumVectorCacheSize             int
	NeighborOccurrenceIgnorePercentile int

	// NOTE(review): "Compund" is a typo, but the field is exported - renaming
	// would break callers
	EnableCompundSplitting          bool
	CompoundSplittingDictionaryFile string

	LogLevel string
}
38 |
39 | // New Config from the environment. Errors if required env vars can't be found
40 | func New(logger logrus.FieldLogger) (*Config, error) {
41 | cfg := &Config{logger: logger}
42 | if err := cfg.init(); err != nil {
43 | return nil, fmt.Errorf("could not load config from env: %v", err)
44 | }
45 |
46 | return cfg, nil
47 | }
48 |
49 | func (c *Config) init() error {
50 | knn, err := c.requiredString("KNN_FILE")
51 | if err != nil {
52 | return err
53 | }
54 | c.KNNFile = knn
55 |
56 | idx, err := c.requiredString("IDX_FILE")
57 | if err != nil {
58 | return err
59 | }
60 | c.IDXFile = idx
61 |
62 | sw, err := c.requiredString("STOPWORDS_FILE")
63 | if err != nil {
64 | return err
65 | }
66 | c.StopwordsFile = sw
67 |
68 | sp := c.optionalString("SCHEMA_PROVIDER_URL", "")
69 | c.SchemaProviderURL = sp
70 |
71 | spk := c.optionalString("SCHEMA_PROVIDER_KEY", "/weaviate/schema/state")
72 | c.SchemaProviderKey = spk
73 |
74 | ep := c.optionalString("EXTENSIONS_PREFIX", "/contextionary/")
75 | c.ExtensionsPrefix = ep
76 |
77 | extMode := c.optionalString("EXTENSIONS_STORAGE_MODE", "weaviate")
78 | c.ExtensionsStorageMode = extMode
79 |
80 | extOrigin := c.optionalString("EXTENSIONS_STORAGE_ORIGIN", "")
81 | c.ExtensionsStorageOrigin = extOrigin
82 |
83 | port, err := c.optionalInt("SERVER_PORT", 9999)
84 | if err != nil {
85 | return err
86 | }
87 | c.ServerPort = port
88 |
89 | factor, err := c.optionalFloat32("OCCURRENCE_WEIGHT_LINEAR_FACTOR", 0.5)
90 | if err != nil {
91 | return err
92 | }
93 | c.OccurrenceWeightLinearFactor = factor
94 |
95 | ignorePercentile, err := c.optionalInt("NEIGHBOR_OCCURRENCE_IGNORE_PERCENTILE", 5)
96 | if err != nil {
97 | return err
98 | }
99 |
100 | if ignorePercentile < 0 || ignorePercentile > 100 {
101 | return fmt.Errorf("minimum relative neighbor occurrence must be a value between 0 and 100, got: %d", ignorePercentile)
102 | }
103 |
104 | c.NeighborOccurrenceIgnorePercentile = ignorePercentile
105 |
106 | strategy := c.optionalString("OCCURRENCE_WEIGHT_STRATEGY", "log")
107 | c.OccurrenceWeightStrategy = strategy
108 |
109 | // this should match the underlying vector db file, a smaller value than in
110 | // the vector file will lead to missing out on compound words, whereas a
111 | // larger value will lead to unnecessary lookups slowing down the
112 | // vectorization process
113 | compoundLength, err := c.optionalInt("MAX_COMPOUND_WORD_LENGTH", 1)
114 | if err != nil {
115 | return err
116 | }
117 | c.MaxCompoundWordLength = compoundLength
118 |
119 | batchSize, err := c.optionalInt("MAX_BATCH_SIZE", 200)
120 | if err != nil {
121 | return err
122 | }
123 | c.MaximumBatchSize = batchSize
124 |
125 | vectorCacheSize, err := c.optionalInt("MAX_VECTORCACHE_SIZE", 10000)
126 | if err != nil {
127 | return err
128 | }
129 | c.MaximumVectorCacheSize = vectorCacheSize
130 |
131 | c.EnableCompundSplitting = c.optionalBool("ENABLE_COMPOUND_SPLITTING", false)
132 |
133 | if c.EnableCompundSplitting {
134 | compoundSplittingDictionaryFile, err := c.requiredString("COMPOUND_SPLITTING_DICTIONARY_FILE")
135 | if err != nil {
136 | return err
137 | }
138 | c.CompoundSplittingDictionaryFile = compoundSplittingDictionaryFile
139 | }
140 |
141 | loglevel := c.optionalString("LOG_LEVEL", "info")
142 | c.LogLevel = loglevel
143 |
144 | return nil
145 | }
146 |
147 | func (c *Config) optionalInt(varName string, defaultValue int) (int, error) {
148 | value := os.Getenv(varName)
149 | if value == "" {
150 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'",
151 | varName, defaultValue)
152 | return defaultValue, nil
153 | }
154 |
155 | asInt, err := strconv.Atoi(value)
156 | if err != nil {
157 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s",
158 | varName, value, err)
159 | }
160 |
161 | return asInt, nil
162 | }
163 |
164 | func (c *Config) optionalFloat32(varName string, defaultValue float32) (float32, error) {
165 | value := os.Getenv(varName)
166 | if value == "" {
167 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'",
168 | varName, defaultValue)
169 | return defaultValue, nil
170 | }
171 |
172 | asFloat, err := strconv.ParseFloat(value, 32)
173 | if err != nil {
174 | return 0, fmt.Errorf("cannot convert value of var '%s' ('%v') to int: %s",
175 | varName, value, err)
176 | }
177 |
178 | return float32(asFloat), nil
179 | }
180 |
181 | func (c *Config) requiredString(varName string) (string, error) {
182 | value := os.Getenv(varName)
183 | if value == "" {
184 | return "", fmt.Errorf("required variable '%s' is not set", varName)
185 | }
186 |
187 | return value, nil
188 | }
189 |
190 | func (c *Config) optionalString(varName, defaultInput string) string {
191 | value := os.Getenv(varName)
192 | if value == "" {
193 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'",
194 | varName, defaultInput)
195 | return defaultInput
196 | }
197 |
198 | return value
199 | }
200 |
201 | func (c *Config) optionalBool(varName string, defaultInput bool) bool {
202 | value := os.Getenv(varName)
203 | if value == "" {
204 | c.logger.Infof("optional var '%s' is not set, defaulting to '%v'",
205 | varName, defaultInput)
206 | return defaultInput
207 | }
208 |
209 | return value == "true" || value == "1" || value == "on" || value == "enabled"
210 | }
211 |
--------------------------------------------------------------------------------
/server/contextionary.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "time"
6 |
7 | "github.com/weaviate/contextionary/compoundsplitting"
8 |
9 | "github.com/weaviate/contextionary/adapters/repos"
10 | core "github.com/weaviate/contextionary/contextionary/core"
11 | "github.com/weaviate/contextionary/contextionary/core/stopwords"
12 | "github.com/weaviate/contextionary/extensions"
13 | )
14 |
// init wires up all server dependencies in order: raw contextionary vectors,
// stopword detector, the combined contextionary, the weaviate-backed
// extension repo, the compound splitter, and finally the vectorizer plus the
// extension storer/looker-upper built on top of them. Order matters: the
// vectorizer consumes everything initialized before it.
func (s *server) init() error {
	s.logger.WithField("config", s.config).Debugf("starting up with this config")

	if err := s.loadRawContextionary(); err != nil {
		return err
	}

	swDetector, err := stopwords.NewFromFile(s.config.StopwordsFile)
	if err != nil {
		return err
	}
	s.stopwordDetector = swDetector

	if err := s.buildContextionary(); err != nil {
		return err
	}

	var er extensionRepo
	var extensionRetriever extensionLookerUpper

	// ExtensionsStorageMode == "weaviate" is now a default storage option
	er = repos.NewExtensionsRepo(s.logger, s.config, 1*time.Second)
	extensionRetriever = extensions.NewLookerUpper(er)

	compoundSplitter, err := s.initCompoundSplitter()
	if err != nil {
		return err
	}
	vectorizer, err := NewVectorizer(s.rawContextionary, s.stopwordDetector, s.config, s.logger,
		NewSplitter(), extensionRetriever, compoundSplitter)
	if err != nil {
		return err
	}

	// the storer vectorizes extensions on write, hence it shares the vectorizer
	s.vectorizer = vectorizer
	s.extensionStorer = extensions.NewStorer(s.vectorizer, er, s.logger)
	s.extensionLookerUpper = extensionRetriever

	return nil
}
55 |
56 | func (s *server) loadRawContextionary() error {
57 | c, err := core.LoadVectorFromDisk(s.config.KNNFile, s.config.IDXFile)
58 | if err != nil {
59 | return fmt.Errorf("could not initialize (raw) contextionary: %v", err)
60 | }
61 |
62 | s.rawContextionary = c
63 | return nil
64 | }
65 |
// stopwordDetector reports whether a word is a stopword and should therefore
// be ignored during vectorization.
type stopwordDetector interface {
	IsStopWord(word string) bool
}
69 |
// any time the schema changes the contextionary needs to be rebuilt.
// Currently this is a pass-through: the combined contextionary is simply the
// raw one with no schema-derived additions.
func (s *server) buildContextionary() error {
	s.combinedContextionary = s.rawContextionary
	return nil
}
75 |
76 | func (s *server) initCompoundSplitter() (compoundSplitter, error) {
77 | if s.config.EnableCompundSplitting {
78 | dict, err := compoundsplitting.NewContextionaryDict(s.config.CompoundSplittingDictionaryFile)
79 | if err != nil {
80 | return nil, err
81 | }
82 | return compoundsplitting.NewSplitter(dict), nil
83 | } else {
84 | return compoundsplitting.NewNoopSplitter(), nil
85 | }
86 | }
87 |
// extensionRepo combines read and write access to stored extensions; the
// weaviate-backed repo satisfies both embedded interfaces.
type extensionRepo interface {
	extensions.RetrieverRepo
	extensions.StorerRepo
}
92 |
--------------------------------------------------------------------------------
/server/grpc_error.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/weaviate/contextionary/errors"
5 | "google.golang.org/grpc/codes"
6 | "google.golang.org/grpc/status"
7 | )
8 |
9 | func GrpcErrFromTyped(err error) error {
10 | if err == nil {
11 | return nil
12 | }
13 |
14 | switch err.(type) {
15 | case errors.InvalidUserInput:
16 | return status.Error(codes.InvalidArgument, err.Error())
17 | case errors.Internal:
18 | return status.Error(codes.Internal, err.Error())
19 | case errors.NotFound:
20 | return status.Error(codes.NotFound, err.Error())
21 | default:
22 | return status.Error(codes.Unknown, err.Error())
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/server/server.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "net"
6 | "os"
7 |
8 | "github.com/sirupsen/logrus"
9 | pb "github.com/weaviate/contextionary/contextionary"
10 | core "github.com/weaviate/contextionary/contextionary/core"
11 | "github.com/weaviate/contextionary/extensions"
12 | "github.com/weaviate/contextionary/server/config"
13 | grpc "google.golang.org/grpc"
14 | )
15 |
16 | // Version is filled through a build arg
17 | var Version string
18 |
19 | func main() {
20 | server := new()
21 | server.logger.WithField("version", Version).Info()
22 | grpcServer := grpc.NewServer()
23 | pb.RegisterContextionaryServer(grpcServer, server)
24 | lis, err := net.Listen("tcp", fmt.Sprintf(":%d", server.config.ServerPort))
25 | if err != nil {
26 | server.logger.Errorf("can't listen on port: %s", err)
27 | os.Exit(1)
28 | }
29 |
30 | grpcServer.Serve(lis)
31 | }
32 |
// server implements the contextionary gRPC service. It is wired up once at
// startup (see init()) and then serves requests concurrently.
type server struct {
	// to be used to serve rpc requests, combination of the raw contextionary
	// and the schema
	combinedContextionary core.Contextionary

	// initialized at startup, to be used to build the
	// schema contextionary
	rawContextionary core.Contextionary

	// application-wide configuration loaded from the environment
	config *config.Config

	logger logrus.FieldLogger

	// ucs (use cases): extension storage/lookup, stopword detection and
	// corpus vectorization
	extensionStorer      *extensions.Storer
	extensionLookerUpper extensionLookerUpper
	stopwordDetector     stopwordDetector
	vectorizer           *Vectorizer
}
52 |
53 | // new gRPC server to serve the contextionary
54 | func new() *server {
55 | logger := logrus.New()
56 | logger.SetFormatter(&logrus.JSONFormatter{})
57 | cfg, err := config.New(logger)
58 | if err != nil {
59 | logger.
60 | WithError(err).
61 | Errorf("cannot start up")
62 | os.Exit(1)
63 | }
64 |
65 | loglevel, err := logrus.ParseLevel(cfg.LogLevel)
66 | if err != nil {
67 | logger.
68 | WithError(err).
69 | Errorf("cannot start up")
70 | os.Exit(1)
71 | }
72 | logger.SetLevel(loglevel)
73 | logger.WithField("log_level", loglevel.String()).Info()
74 |
75 | s := &server{
76 | config: cfg,
77 | logger: logger,
78 | }
79 |
80 | err = s.init()
81 | if err != nil {
82 | logger.
83 | WithError(err).
84 | Errorf("cannot start up")
85 | os.Exit(1)
86 | }
87 |
88 | return s
89 | }
90 |
--------------------------------------------------------------------------------
/server/splitter.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "strings"
5 | "unicode"
6 | )
7 |
// NewSplitter constructs a stateless corpus splitter.
func NewSplitter() *Splitter {
	return new(Splitter)
}

// Splitter tokenizes a corpus into words.
type Splitter struct{}

// Split breaks corpus into tokens, treating every rune that is neither a
// letter nor a number as a separator (runs of separators yield no empty
// tokens).
func (s *Splitter) Split(corpus string) []string {
	isSeparator := func(c rune) bool {
		return !(unicode.IsLetter(c) || unicode.IsNumber(c))
	}
	return strings.FieldsFunc(corpus, isSeparator)
}
19 |
--------------------------------------------------------------------------------
/server/splitter_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | )
8 |
9 | func Test_Splitter(t *testing.T) {
10 | type testcase struct {
11 | name string
12 | input string
13 | output []string
14 | }
15 |
16 | tests := []testcase{
17 | testcase{
18 | name: "single word",
19 | input: "single",
20 | output: []string{"single"},
21 | },
22 | testcase{
23 | name: "words separated by space",
24 | input: "hello my name is John",
25 | output: []string{"hello", "my", "name", "is", "John"},
26 | },
27 | testcase{
28 | name: "multiple spaces in between words",
29 | input: "hello John",
30 | output: []string{"hello", "John"},
31 | },
32 |
33 | testcase{
34 | name: "words with numbers",
35 | input: "foo1 foo2",
36 | output: []string{"foo1", "foo2"},
37 | },
38 |
39 | testcase{
40 | name: "hyphenated words",
41 | input: "r2-d2",
42 | output: []string{"r2", "d2"},
43 | },
44 |
45 | testcase{
46 | name: "on commas (with and without spaces)",
47 | input: "jane, john,anna",
48 | output: []string{"jane", "john", "anna"},
49 | },
50 |
51 | testcase{
52 | name: "on other characters",
53 | input: "foobar baz#(*@@baq",
54 | output: []string{"foobar", "baz", "baq"},
55 | },
56 |
57 | testcase{
58 | name: "words containing umlauts (upper and lower)",
59 | input: "Ölpreis über 80 dollar!",
60 | output: []string{"Ölpreis", "über", "80", "dollar"},
61 | },
62 |
63 | testcase{
64 | name: "words containing turkish characters",
65 | input: "Ölpreis über 80 dollar!",
66 | output: []string{"Ölpreis", "über", "80", "dollar"},
67 | },
68 |
69 | testcase{
70 | name: "words containing turkish characters",
71 | input: "Weaviate ayrıca Türkçe konuşabilir",
72 | output: []string{"Weaviate", "ayrıca", "Türkçe", "konuşabilir"},
73 | },
74 |
75 | testcase{
76 | name: "mixed characters including a '<'",
77 | input: "car, car#of,,,,brand 0 {
60 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, ""))
61 | currOperandDigits = nil
62 | }
63 |
64 | // We will eventually append our current operator to the operator stack.
65 | // However, first it must be compared against current operators, if the
66 | // top of the stack has a higher or equal precedence to the current one,
67 | // we will pop that first. We continue this pattern until either the
68 | // stack is empty or the topmost element of the stack is of lower
69 | // precedence than the current
70 | for len(operatorStack) > 0 {
71 | topStack := operatorStack[len(operatorStack)-1]
72 | if operatorPrecedence(topStack) < operatorPrecedence(string(r)) {
73 | break
74 | }
75 |
76 | e.parsedStack = append(e.parsedStack, topStack)
77 | operatorStack = operatorStack[:len(operatorStack)-1]
78 | }
79 | operatorStack = append(operatorStack, string(r))
80 | }
81 |
82 | // in case the expression ends with an operand, we need to check again if the
83 | // temp digit stack still contains elements
84 | if len(currOperandDigits) > 0 {
85 | e.parsedStack = append(e.parsedStack, strings.Join(currOperandDigits, ""))
86 | currOperandDigits = nil
87 | }
88 |
89 | // append the remainder of the operatorStack (if any) to the parsed output in
90 | // reverse order
91 | e.parsedStack = append(e.parsedStack, reverseSlice(operatorStack)...)
92 | return nil
93 | }
94 |
95 | func (e *Evaluator) unrecognizedOperator(op string) error {
96 | if op == "(" || op == ")" {
97 | return fmt.Errorf("using parantheses in the expression is not supported")
98 | }
99 |
100 | return fmt.Errorf("unrecognized operator: %s", string(op))
101 | }
102 |
// evaluate reduces the postfix (RPN) token stack produced by parsing using an
// operand stack: operands are pushed, each operator pops two operands and
// pushes the result. A well-formed expression leaves exactly one value on
// the stack, which is returned.
func (e Evaluator) evaluate() (float64, error) {
	var operandStack []float64
	for _, item := range e.parsedStack {
		if !isOperator(item) {
			// not an operator, so it must be an operand
			num, err := e.parseNumberOrVariable(item)
			if err != nil {
				return 0, err
			}

			operandStack = append(operandStack, num)
			continue
		}

		// is an operator
		if len(operandStack) < 2 {
			// an operator without two operands means the input was malformed
			return 0, fmt.Errorf("invalid or unsupported math expression")
		}

		// note that the topStack is the right operator, whereas topStack-1 is the left!
		op1, op2 := operandStack[len(operandStack)-2], operandStack[len(operandStack)-1]
		operandStack = operandStack[:len(operandStack)-2]

		res, err := evaluteOperator(item, op1, op2)
		if err != nil {
			return 0, err
		}
		operandStack = append(operandStack, res)
	}

	// anything other than exactly one leftover value means unbalanced input
	if len(operandStack) != 1 {
		return 0, fmt.Errorf("could not evaluate mathematical expression")
	}

	return operandStack[0], nil
}
139 |
// evaluteOperator applies the binary operator op to left and right. The
// error branch should be unreachable because callers only pass tokens that
// isOperator accepted.
// (the historical misspelling of the name is kept — callers reference it)
func evaluteOperator(op string, left, right float64) (float64, error) {
	apply := map[string]func(a, b float64) float64{
		"+": func(a, b float64) float64 { return a + b },
		"-": func(a, b float64) float64 { return a - b },
		"*": func(a, b float64) float64 { return a * b },
		"/": func(a, b float64) float64 { return a / b },
	}

	fn, ok := apply[op]
	if !ok {
		return 0, fmt.Errorf("this should be unreachable - or the implentation of an operator is missing")
	}
	return fn(left, right), nil
}
154 |
// isOperator reports whether in is one of the four supported binary
// operator tokens.
func isOperator(in string) bool {
	return in == "*" || in == "+" || in == "-" || in == "/"
}
163 |
// we allow numbers, the dot as a floating point symbol, as well as letters to
// represent variables
func isOperand(r rune) bool {
	// compare runes directly (string(r) == "." allocated a string per call)
	// and return the condition itself instead of an if/return-true/false
	return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '.'
}
172 |
173 | func (e *Evaluator) parseNumberOrVariable(in string) (float64, error) {
174 | r := rune(in[0])
175 | if unicode.IsNumber(r) {
176 | return strconv.ParseFloat(in, 64)
177 | } else {
178 | if in == "w" {
179 | return e.originalWeight, nil
180 | }
181 | return 0, fmt.Errorf("unrecognized variable '%s', use 'w' to represent original weight", in)
182 | }
183 | }
184 |
// operatorPrecedence returns the binding strength of op for the shunting
// stage: multiplication/division bind tighter (2) than addition/subtraction
// (1); unknown tokens get -1.
func operatorPrecedence(op string) int {
	precedences := map[string]int{
		"+": 1,
		"-": 1,
		"*": 2,
		"/": 2,
	}

	if p, ok := precedences[op]; ok {
		return p
	}
	return -1
}
195 |
// reverseSlice reverses a in place (two-pointer swap) and also returns it
// for call-site convenience.
func reverseSlice(a []string) []string {
	for left, right := 0, len(a)-1; left < right; left, right = left+1, right-1 {
		a[left], a[right] = a[right], a[left]
	}

	return a
}
205 |
--------------------------------------------------------------------------------
/server/weight_manipulator_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/stretchr/testify/assert"
8 | "github.com/stretchr/testify/require"
9 | )
10 |
11 | func TestWeightManipulator(t *testing.T) {
12 |
13 | type test struct {
14 | originalWeight float64
15 | expression string
16 | expectedResult float64
17 | expectedError error
18 | name string
19 | }
20 |
21 | tests := []test{
22 |
23 | test{
24 | originalWeight: 2.0,
25 | expression: "7",
26 | expectedResult: 7.0,
27 | expectedError: nil,
28 | name: "single operand, no operators",
29 | },
30 | test{
31 | originalWeight: 2.0,
32 | expression: "17",
33 | expectedResult: 17.0,
34 | expectedError: nil,
35 | name: "single operand, more than one digit",
36 | },
37 | test{
38 | originalWeight: 2.0,
39 | expression: "15.662",
40 | expectedResult: 15.662,
41 | expectedError: nil,
42 | name: "single operand, floating point using . as decimal",
43 | },
44 | test{
45 | originalWeight: 2.0,
46 | expression: "w * 2",
47 | expectedResult: 4.0,
48 | expectedError: nil,
49 | name: "simple multiplication",
50 | },
51 | test{
52 | originalWeight: 2.0,
53 | expression: "w * 2 * 3 * 4",
54 | expectedResult: 48.0,
55 | expectedError: nil,
56 | name: "multiplication with several operands",
57 | },
58 | test{
59 | originalWeight: 2.0,
60 | expression: "w + 3",
61 | expectedResult: 5.0,
62 | expectedError: nil,
63 | name: "simple addition",
64 | },
65 | test{
66 | originalWeight: 2.0,
67 | expression: "w + 3 + 7",
68 | expectedResult: 12.0,
69 | expectedError: nil,
70 | name: "additional with several operands",
71 | },
72 | test{
73 | originalWeight: 2.0,
74 | expression: "1+2*3+4",
75 | expectedResult: 11.0,
76 | expectedError: nil,
77 | name: "mixing operators with different precedence",
78 | },
79 | test{
80 | originalWeight: 2.0,
81 | expression: "1+2*3-4",
82 | expectedResult: 3.0,
83 | expectedError: nil,
84 | name: "mixing operators with different precedence, including -",
85 | },
86 | test{
87 | originalWeight: 2.0,
88 | expression: "1+2/4-4",
89 | expectedResult: -2.5,
90 | expectedError: nil,
91 | name: "mixing operators with different precedence, including /",
92 | },
93 | test{
94 | originalWeight: 7.0,
95 | expression: "1+ 2.5/7 * w -4/2",
96 | expectedResult: 1.5,
97 | expectedError: nil,
98 | name: "long expression including all operators",
99 | },
100 | test{
101 | originalWeight: 7.0,
102 | expression: "w * w",
103 | expectedResult: 49,
104 | expectedError: nil,
105 | name: "including the weight variable multiple times",
106 | },
107 | test{
108 | originalWeight: 7.0,
109 | expression: "2 * (1+3)",
110 | expectedError: fmt.Errorf("using parantheses in the expression is not supported"),
111 | name: "using parantheses",
112 | },
113 | test{
114 | originalWeight: 7.0,
115 | expression: "a + b * c",
116 | expectedError: fmt.Errorf("unrecognized variable 'a', use 'w' to represent original weight"),
117 | name: "using a variable other than w",
118 | },
119 | }
120 |
121 | for _, test := range tests {
122 | t.Run(test.name, func(t *testing.T) {
123 | res, err := NewEvaluator(test.expression, test.originalWeight).Do()
124 | require.Equal(t, test.expectedError, err)
125 | assert.Equal(t, test.expectedResult, res)
126 | })
127 |
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/test/compoundsplitting/contextionary.idx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate/contextionary/327ffb5f74ff9ede347bd31a8973d79d25fcac9b/test/compoundsplitting/contextionary.idx
--------------------------------------------------------------------------------
/test/journey.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

# Jump to root directory
cd "$( dirname "${BASH_SOURCE[0]}" )"/..

# set some defaults so we can also run locally; ${VAR:-default} substitutes
# when the variable is unset OR empty, exactly like the previous -z checks
DOCKER_ORG="${DOCKER_ORG:-semitechnologies}"
DOCKER_REPO="${DOCKER_REPO:-contextionary}"
SOFTWARE_VERSION="${SOFTWARE_VERSION:-local}"
MODEL_VERSION="${MODEL_VERSION:-0.16.0}"
LANGUAGE="${LANGUAGE:-en}"

VERSION="${MODEL_VERSION}-${SOFTWARE_VERSION}"

# retag the previously built images under the names the compose file expects
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION-minimal" c11y-local-journeytest-minimal
docker tag "$DOCKER_ORG/$DOCKER_REPO:${LANGUAGE}$VERSION" c11y-local-journeytest-full

echo "Cleaning up from previous runs"
docker-compose -f ./test/journey/docker-compose.yml down

echo "Starting containers"
docker-compose -f ./test/journey/docker-compose.yml up -d minimal full weaviate

echo "Building tests"
docker-compose -f ./test/journey/docker-compose.yml build test-env

echo "Running tests"
docker-compose -f ./test/journey/docker-compose.yml run test-env go test .
51 |
--------------------------------------------------------------------------------
/test/journey/Dockerfile:
--------------------------------------------------------------------------------
# Test-runner image for the journey tests; the test sources themselves are
# volume-mounted at runtime (see docker-compose.yml).
FROM golang:1.13
WORKDIR /testfiles
# copy only the module files so the dependency download below is cached as a
# layer until go.mod/go.sum change
COPY go.mod go.sum ./
RUN go mod download
5 |
6 |
--------------------------------------------------------------------------------
/test/journey/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.4"
services:
  weaviate:
    image: semitechnologies/weaviate:1.18.0
    ports:
      - "8080:8080"
    environment:
      LOG_LEVEL: "debug"
      CONTEXTIONARY_URL: host.docker.internal:9999
      # quoted so YAML keeps env values as strings instead of ints
      QUERY_DEFAULTS_LIMIT: "20"
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: "./data"
      DEFAULT_VECTORIZER_MODULE: text2vec-contextionary
  minimal:
    image: c11y-local-journeytest-minimal
    environment:
      EXTENSIONS_STORAGE_MODE: weaviate
      EXTENSIONS_STORAGE_ORIGIN: http://weaviate:8080
  full:
    image: c11y-local-journeytest-full
    environment:
      EXTENSIONS_STORAGE_MODE: weaviate
      EXTENSIONS_STORAGE_ORIGIN: http://weaviate:8080
      LOG_LEVEL: debug
      # quoted so YAML keeps env values as strings instead of ints
      MAX_COMPOUND_WORD_LENGTH: "4"
      MAX_BATCH_SIZE: "200"
    ports:
      - "9999:9999"
  test-env:
    build:
      context: . # paths are relative to the docker-compose file, so they point to ./test/journey/
      dockerfile: ./Dockerfile
    volumes:
      - ./:/testfiles
    environment:
      DIMENSIONS: "$DIMENSIONS"
37 |
38 |
39 |
--------------------------------------------------------------------------------
/test/journey/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/weaviate/contextionary/test/journey
2 |
3 | go 1.13
4 |
5 | require (
6 | github.com/stretchr/testify v1.6.1
7 | github.com/weaviate/contextionary v1.1.2-0.20230307155526-f7e24eb73eb0
8 | google.golang.org/grpc v1.24.0
9 | )
10 |
--------------------------------------------------------------------------------
/test/journey/journey_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "os"
7 | "strconv"
8 | "testing"
9 |
10 | "github.com/stretchr/testify/assert"
11 | "github.com/stretchr/testify/require"
12 | pb "github.com/weaviate/contextionary/contextionary"
13 | "google.golang.org/grpc"
14 | )
15 |
16 | var expectedDimensions int
17 |
18 | func init() {
19 |
20 | d, err := strconv.Atoi(os.Getenv("DIMENSIONS"))
21 | if err != nil {
22 | panic(err)
23 | }
24 |
25 | expectedDimensions = d
26 | }
27 |
28 | func Test_Contextionary_Journey(t *testing.T) {
29 | // minimal
30 | connMinimal, err := grpc.Dial("minimal:9999", grpc.WithInsecure())
31 | if err != nil {
32 | t.Fatalf("couldn't connect to minimal c11y: %s", err)
33 | }
34 | defer connMinimal.Close()
35 |
36 | connFull, err := grpc.Dial("full:9999", grpc.WithInsecure())
37 | if err != nil {
38 | t.Fatalf("couldn't connect to minimal c11y: %s", err)
39 | }
40 | defer connFull.Close()
41 |
42 | clientMinimal := pb.NewContextionaryClient(connMinimal)
43 | clientFull := pb.NewContextionaryClient(connFull)
44 |
45 | t.Run("the minimal contextionary", func(t *testing.T) {
46 | client := clientMinimal
47 |
48 | t.Run("testing words present", func(t *testing.T) {
49 | words := []string{"car", "engine", "automobile", "name"}
50 |
51 | for _, word := range words {
52 | t.Run(word, func(t *testing.T) {
53 | res, err := client.IsWordPresent(context.Background(), &pb.Word{Word: word})
54 | require.Nil(t, err)
55 | assert.Equal(t, true, res.Present)
56 | })
57 | }
58 | })
59 |
60 | t.Run("testing stopwords", func(t *testing.T) {
61 | words := []string{"of", "the"}
62 |
63 | for _, word := range words {
64 | t.Run(word, func(t *testing.T) {
65 | res, err := client.IsWordStopword(context.Background(), &pb.Word{Word: word})
66 | require.Nil(t, err)
67 | assert.Equal(t, true, res.Stopword)
68 | })
69 | }
70 | })
71 |
72 | t.Run("corpi to vector", func(t *testing.T) {
73 | t.Run("only stopwords", func(t *testing.T) {
74 | corpi := []string{"of", "the of"}
75 | _, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi})
76 | assert.NotNil(t, err)
77 | })
78 |
79 | t.Run("only stopwords", func(t *testing.T) {
80 | corpi := []string{"car", "car of brand mercedes", "color blue"}
81 | res, err := client.VectorForCorpi(context.Background(), &pb.Corpi{Corpi: corpi})
82 | assert.Nil(t, err)
83 | // TODO: also upgrade minimal one to 600 vectors
84 | assert.Len(t, res.Entries, 300)
85 | })
86 |
87 | t.Run("two corpi with and without splitting characters should lead to the same vector", func(t *testing.T) {
88 | corpi1 := []string{"car", "car of brand mercedes", "color blue"}
89 | corpi2 := []string{"car,", "car#of,,,,brand maxOcc {
84 | maxOcc = occurrence
85 | }
86 |
87 | occurrences = append(occurrences, occurrence)
88 | presentWords = append(presentWords, word)
89 | }
90 |
91 | }
92 |
93 | // calculate weights by normalizing the occurrences to 0..1
94 | weights := make([]float32, len(occurrences), len(occurrences))
95 | for i, occ := range occurrences {
96 | // _ = occ
97 | // weights[i] = 1
98 | weight := 1 - float32(occ-minOcc)/float32(maxOcc-minOcc)
99 | weights[i] = weight
100 |
101 | // fmt.Printf("%s: %f\n", presentWords[i], weight)
102 | }
103 |
104 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights)
105 | fatal(err)
106 |
107 | // fmt.Printf("%d stop words out of %d removed. %d of the remainder contained\n", stopWords, total, len(vectors))
108 |
109 | return centroid
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/tools/dev/contextionary-playground/class_vectors/search.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package main
12 |
13 | import (
14 | "bytes"
15 | "encoding/json"
16 | "fmt"
17 | "io/ioutil"
18 | "log"
19 | "net/http"
20 | "strings"
21 |
22 | contextionary "github.com/weaviate/contextionary/contextionary/core"
23 | )
24 |
25 | func searchString(word string, c11y contextionary.Contextionary) {
26 | words := strings.Split(word, " ")
27 |
28 | var usableWords []string
29 | var vectors []contextionary.Vector
30 | var weights []float32
31 |
32 | for _, word := range words {
33 | if isStopWord(word) {
34 | continue
35 | }
36 |
37 | itemIndex := c11y.WordToItemIndex(word)
38 | if ok := itemIndex.IsPresent(); !ok {
39 | log.Fatalf("the word %s is not in the c11y", word)
40 | }
41 |
42 | vector, err := c11y.GetVectorForItemIndex(itemIndex)
43 | if err != nil {
44 | log.Fatalf("could not get vector for word '%s': %v", word, err)
45 | }
46 |
47 | usableWords = append(usableWords, word)
48 | vectors = append(vectors, *vector)
49 | weights = append(weights, 1.0)
50 | }
51 |
52 | stopWordsRatio := float32((len(words) - len(usableWords))) / float32(len(words))
53 | fmt.Printf("Original Search Term: %s\n", word)
54 | fmt.Printf("After stop word removal: %s (%2.0f%% removed)\n", strings.Join(usableWords, " "), stopWordsRatio*100)
55 | fmt.Printf("\n")
56 |
57 | centroid, err := contextionary.ComputeWeightedCentroid(vectors, weights)
58 | fatal(err)
59 |
60 | search(centroid.ToArray())
61 | fmt.Printf("\n\n")
62 | }
63 |
64 | func search(v []float32) {
65 | body := fmt.Sprintf(`{
66 | "query": {
67 | "function_score": {
68 | "query": {
69 | "bool": {
70 | "filter": {
71 | "match": {
72 | "sampleBoolProp": false
73 | }
74 | }
75 | }
76 | },
77 | "boost_mode": "replace",
78 | "script_score": {
79 | "script": {
80 | "inline": "binary_vector_score",
81 | "lang": "knn",
82 | "params": {
83 | "cosine": false,
84 | "field": "embedding_vector",
85 | "vector": [
86 | %s
87 | ]
88 | }
89 | }
90 | }
91 | }
92 | },
93 | "size": 3
94 | } `, printVector(v))
95 |
96 | req, _ := http.NewRequest("GET", "http://localhost:9900/documents/_search", bytes.NewReader([]byte(body)))
97 | res, err := (&http.Client{}).Do(req)
98 | if err != nil {
99 | panic(err)
100 | }
101 |
102 | if res.StatusCode != 200 {
103 | bb, _ := ioutil.ReadAll(res.Body)
104 | panic(fmt.Errorf("status is %d: %s", res.StatusCode, bb))
105 | }
106 |
107 | defer res.Body.Close()
108 | bytes, err := ioutil.ReadAll(res.Body)
109 | if err != nil {
110 | panic(err)
111 | }
112 |
113 | var eres elasticResult
114 | err = json.Unmarshal(bytes, &eres)
115 | if err != nil {
116 | panic(err)
117 | }
118 |
119 | for i, hit := range eres.Hits.Hits {
120 | content := firstChars(hit.Source.Content, 120)
121 | fmt.Printf("\n\tNo: %d\tScore: %2.3f\tName: %s\n\t Content: %s\n", i, hit.Score, hit.Source.Name, content)
122 | }
123 | }
124 |
// elasticResult mirrors the top-level shape of an Elasticsearch search
// response; only the "hits" envelope is decoded.
type elasticResult struct {
	Hits elasticHits `json:"hits"`
}

// elasticHits is the inner "hits" envelope containing the actual hit list.
type elasticHits struct {
	Hits []elasticHit `json:"hits"`
}

// elasticHit is a single search hit: its relevance score and the stored
// document (the `document` type is declared elsewhere in this file).
type elasticHit struct {
	Score float32 `json:"_score"`
	Source document `json:"_source"`
}
137 |
// firstChars returns input truncated to at most limit characters (runes),
// appending "..." only when something was actually cut off.
//
// Truncation operates on runes rather than bytes so multi-byte UTF-8
// characters are never split in half. An input of exactly limit characters
// is returned unchanged (the previous byte-based version wrongly appended
// "..." in that case).
func firstChars(input string, limit int) string {
	runes := []rune(input)
	if len(runes) <= limit {
		return input
	}
	return string(runes[:limit]) + "..."
}
144 |
--------------------------------------------------------------------------------
/tools/dev/contextionary-playground/comparison/main.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package main
12 |
13 | import (
14 | "fmt"
15 | "os"
16 |
17 | contextionary "github.com/weaviate/contextionary/contextionary/core"
18 | )
19 |
// fatal prints err and terminates the process with exit code 1.
// A nil err is a no-op.
func fatal(err error) {
	if err == nil {
		return
	}
	fmt.Println(err.Error())
	os.Exit(1)
}
26 |
27 | func main() {
28 | root := os.Args[1]
29 | c1Path := root + "/filter-after-glove"
30 | c2Path := root + "/preprocessing"
31 | c3Path := root + "/stopword-removal"
32 |
33 | c1, err := contextionary.LoadVectorFromDisk(c1Path+"/contextionary-en.knn", c1Path+"/contextionary-en.idx")
34 | fatal(err)
35 |
36 | c2, err := contextionary.LoadVectorFromDisk(c2Path+"/contextionary-en.knn", c2Path+"/contextionary-en.idx")
37 | fatal(err)
38 |
39 | c3, err := contextionary.LoadVectorFromDisk(c3Path+"/contextionary-en.knn", c3Path+"/contextionary-en.idx")
40 | fatal(err)
41 |
42 | word := os.Args[2]
43 | c1Dist, c1Words := kNN(word, c1)
44 | c2Dist, c2Words := kNN(word, c2)
45 | c3Dist, c3Words := kNN(word, c3)
46 |
47 | for i := range c1Dist {
48 | fmt.Printf("%f %-15s\t\t\t%f %-15s\t\t\t%f %-15s\n", c1Dist[i], c1Words[i], c2Dist[i], c2Words[i], c3Dist[i], c3Words[i])
49 | }
50 | }
51 |
52 | func kNN(name string, contextionary contextionary.Contextionary) ([]float32, []string) {
53 | itemIndex := contextionary.WordToItemIndex(name)
54 | if ok := itemIndex.IsPresent(); !ok {
55 | fatal(fmt.Errorf("item index for %s is not present", name))
56 | }
57 |
58 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3)
59 | if err != nil {
60 | fatal(fmt.Errorf("get nns errored: %s", err))
61 | }
62 |
63 | words := make([]string, len(list), len(list))
64 | for i := range list {
65 | w, err := contextionary.ItemIndexToWord(list[i])
66 | if err != nil {
67 | fmt.Printf("error: %s", err)
68 | }
69 | words[i] = w
70 | }
71 |
72 | return distances, words
73 | }
74 |
--------------------------------------------------------------------------------
/tools/dev/contextionary-playground/main.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package main
12 |
13 | import (
14 | "fmt"
15 | "os"
16 |
17 | contextionary "github.com/weaviate/contextionary/contextionary/core"
18 | )
19 |
// fatal prints err and terminates the process with exit code 1.
// A nil err is a no-op.
func fatal(err error) {
	if err == nil {
		return
	}
	fmt.Println(err.Error())
	os.Exit(1)
}
26 |
27 | func main() {
28 | c13y, err := contextionary.LoadVectorFromDisk("./tools/dev/contextionary-playground/contextionary.knn", "./tools/dev/contextionary-playground/contextionary.idx")
29 | fatal(err)
30 |
31 | fmt.Println("results before building centroid based on keywords: ")
32 | kNN("city", c13y)
33 |
34 | // Combine contextionaries
35 | contextionaries := []contextionary.Contextionary{c13y}
36 | combined, err := contextionary.CombineVectorIndices(contextionaries)
37 | fatal(err)
38 |
39 | fmt.Println("results after building centroid based on keywords: ")
40 | kNN("ocean", combined)
41 | }
42 |
43 | func kNN(name string, contextionary contextionary.Contextionary) {
44 | itemIndex := contextionary.WordToItemIndex(name)
45 | if ok := itemIndex.IsPresent(); !ok {
46 | fatal(fmt.Errorf("item index for %s is not present", name))
47 | }
48 |
49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 1000000, 3)
50 | if err != nil {
51 | fatal(fmt.Errorf("get nns errored: %s", err))
52 | }
53 |
54 | for i := range list {
55 | w, err := contextionary.ItemIndexToWord(list[i])
56 | if err != nil {
57 | fmt.Printf("error: %s", err)
58 | }
59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w)
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/tools/dev/contextionary-playground/schema/main.go:
--------------------------------------------------------------------------------
1 | /* _ _
2 | *__ _____ __ ___ ___ __ _| |_ ___
3 | *\ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
4 | * \ V V / __/ (_| |\ V /| | (_| | || __/
5 | * \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
6 | *
7 | * Copyright © 2016 - 2019 Weaviate. All rights reserved.
8 | * LICENSE: https://github.com/weaviate/weaviate/blob/master/LICENSE
9 | * DESIGN & CONCEPT: Bob van Luijt (@bobvanluijt)
10 | * CONTACT: hello@weaviate.io
11 | */package main
12 |
13 | import (
14 | "fmt"
15 | "os"
16 |
17 | contextionary "github.com/weaviate/contextionary/contextionary/core"
18 | )
19 |
// fatal prints err and terminates the process with exit code 1.
// A nil err is a no-op.
func fatal(err error) {
	if err == nil {
		return
	}
	fmt.Println(err.Error())
	os.Exit(1)
}
26 |
27 | func main() {
28 | c11y, err := contextionary.LoadVectorFromDisk("./test/contextionary/example.knn", "./test/contextionary/example.idx")
29 | fatal(err)
30 |
31 | fmt.Println("results before building centroid based on keywords: ")
32 | kNN("city", c11y)
33 |
34 | // Combine contextionaries
35 | contextionaries := []contextionary.Contextionary{c11y}
36 | combined, err := contextionary.CombineVectorIndices(contextionaries)
37 | fatal(err)
38 |
39 | fmt.Println("results after building centroid based on keywords: ")
40 | kNN("ocean", combined)
41 | }
42 |
43 | func kNN(name string, contextionary contextionary.Contextionary) {
44 | itemIndex := contextionary.WordToItemIndex(name)
45 | if ok := itemIndex.IsPresent(); !ok {
46 | fatal(fmt.Errorf("item index for %s is not present", name))
47 | }
48 |
49 | list, distances, err := contextionary.GetNnsByItem(itemIndex, 20, 3)
50 | if err != nil {
51 | fatal(fmt.Errorf("get nns errored: %s", err))
52 | }
53 |
54 | for i := range list {
55 | w, err := contextionary.ItemIndexToWord(list[i])
56 | if err != nil {
57 | fmt.Printf("error: %s", err)
58 | }
59 | fmt.Printf("\n%d %f %s\n", list[i], distances[i], w)
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/tools/dev/en_test-vectors-small.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate/contextionary/327ffb5f74ff9ede347bd31a8973d79d25fcac9b/tools/dev/en_test-vectors-small.txt.bz2
--------------------------------------------------------------------------------
/tools/dev/gen_simple_contextionary.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -e

echo "Unpacking fixture vectors"
# remove any stale unpacked copy so bunzip2 -k does not refuse to overwrite
# (rm -f never fails, so the previous "|| true" was dead code)
rm -f tools/dev/en_test-vectors-small.txt
bunzip2 -k tools/dev/en_test-vectors-small.txt.bz2

# Fake stopword removal by removing the first 10 words. This will become
# obsolete once we have released a new minimal c11y

# build stopwords.json from the first 10 words of the vector file
head tools/dev/en_test-vectors-small.txt | \
  while read -r word _; do echo "$word"; done | \
  jq -nR '[inputs | select(length>0)] | { language: "en", words: . }' > tools/dev/stopwords.json

# remove the same first 10 lines (stop words) from the vector file
sed -i.bak 1,10d tools/dev/en_test-vectors-small.txt && rm tools/dev/en_test-vectors-small.txt.bak

if [ -f tools/dev/example.knn ]; then
  echo "Fixture contextionary already generated"
else
  go run contextionary/core/generator/cmd/generator.go \
    -c tools/dev/en_test-vectors-small.txt \
    -p tools/dev/example
fi
25 |
--------------------------------------------------------------------------------
/tools/dev/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Run the contextionary dev server against the locally generated fixture
# files (produced by tools/dev/gen_simple_contextionary.sh).
GO111MODULE=on \
KNN_FILE="./tools/dev/example.knn" \
IDX_FILE="./tools/dev/example.idx" \
STOPWORDS_FILE="./tools/dev/stopwords.json" \
SCHEMA_PROVIDER_URL="localhost:2379" \
go run ./server 2>&1
7 |
--------------------------------------------------------------------------------
/tools/dev/stopwords.json:
--------------------------------------------------------------------------------
1 | {
2 | "language": "en",
3 | "words": [
4 | "the",
5 | "of",
6 | "and",
7 | "in",
8 | "to",
9 | "a",
10 | "was",
11 | "The",
12 | "is",
13 | "for"
14 | ]
15 | }
16 |
--------------------------------------------------------------------------------
/tools/download_contextionary.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Download the stopword list and contextionary index/vector files for the
# given language and model version into ./data.
#
# Usage: download_contextionary.sh <language> <version>

set -e

language=${1}
version="${2}"

rm -rf ./data && mkdir ./data

# Download the files in parallel, remembering each background pid: a bare
# `wait` always exits 0, so without per-pid waits a failed download was
# silently ignored even under `set -e`.
pids=()
for FILE in stopwords.json contextionary.idx contextionary.knn; do
  echo "Start Downloading $FILE"
  (
    wget --quiet -O "./data/$FILE" "https://c11y.semi.technology/$version/$language/$FILE" &&
    echo "$FILE = done"
  ) &
  pids+=("$!")
done

# Wait for every download and propagate any non-zero exit status
for pid in "${pids[@]}"; do
  wait "$pid"
done

# previous version referenced the undefined $VECTORDB_VERSION here
echo "Done downloading open source contextionary v$version."
exit 0
23 |
--------------------------------------------------------------------------------
/tools/native_build_contextionary.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Native (non-Docker) build-and-run of the contextionary server:
# download the model, build the binary, generate fixtures, preprocess the
# compound-splitter dictionary, then start the server.
set -e

# Download contextionary
LANGUAGE=en
MODEL_VERSION=0.16.0
./tools/download_contextionary.sh "$LANGUAGE" "$MODEL_VERSION"

# Build the server
VERSION=1.2.0
CGO_ENABLED=1 go build -o ./contextionary-server -a -tags netgo -ldflags "-w -X main.Version=$VERSION" ./server

# Generate contextionary fixtures
tools/dev/gen_simple_contextionary.sh

# Preprocess splitter dictionary (use the native-build variant, which works
# with relative paths instead of the Docker image's /app prefix)
/bin/bash ./tools/preprocess_splitter_dict_native_build.sh "$LANGUAGE" "./data/contextionary.idx"

# NOTE: the Docker build copied the binary into the image at this point;
# natively the binary is already in $PWD, and the old
# `cp ./contextionary-server $PWD` always failed with "same file".

# Set environment variables consumed by the server
export KNN_FILE=./data/contextionary.knn
export IDX_FILE=./data/contextionary.idx
export STOPWORDS_FILE=./data/stopwords.json
export COMPOUND_SPLITTING_DICTIONARY_FILE=./data/splitter_dict.csv

# Run the server
./contextionary-server
29 |
--------------------------------------------------------------------------------
/tools/preprocess_splitter_dict.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Build the compound-splitter dictionary CSV for the given language inside
# the Docker image (paths are rooted at /app).
#
# Usage: preprocess_splitter_dict.sh <language> <index_file>

set -e

language=${1}
index_file=${2}

# Get dictionaries — skip the clone when a previous run already fetched
# them: `git clone` into an existing directory fails and would abort the
# script under `set -e`.
if [ ! -d dictionaries ]; then
  git clone https://github.com/LibreOffice/dictionaries.git
fi

aff_file=""
dic_file=""

# Map the requested language to its LibreOffice hunspell dictionary pair
case "$language" in
  en)
    aff_file="/app/dictionaries/en/en_US.aff"
    dic_file="/app/dictionaries/en/en_US.dic"
    ;;
  de)
    aff_file="/app/dictionaries/de/de_DE_frami.aff"
    dic_file="/app/dictionaries/de/de_DE_frami.dic"
    ;;
  nl)
    aff_file="/app/dictionaries/nl_NL/nl_NL.aff"
    dic_file="/app/dictionaries/nl_NL/nl_NL.dic"
    ;;
  it)
    aff_file="/app/dictionaries/it_IT/it_IT.aff"
    dic_file="/app/dictionaries/it_IT/it_IT.dic"
    ;;
  cs)
    aff_file="/app/dictionaries/cs_CZ/cs_CZ.aff"
    dic_file="/app/dictionaries/cs_CZ/cs_CZ.dic"
    ;;
esac

if [ "$aff_file" == "" ]; then
  echo "Missing dictionary for preprocessor see process_splitter_dict.sh"
  exit 3
fi

echo "Building dict with:"
go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "/app/data/splitter_dict.csv"
--------------------------------------------------------------------------------
/tools/preprocess_splitter_dict_native_build.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Build the compound-splitter dictionary CSV for the given language in a
# native (non-Docker) build — identical to preprocess_splitter_dict.sh but
# with relative paths instead of the /app prefix.
#
# Usage: preprocess_splitter_dict_native_build.sh <language> <index_file>

set -e

language=${1}
index_file=${2}

# Get dictionaries — skip the clone when a previous run already fetched
# them: `git clone` into an existing directory fails and would abort the
# script under `set -e`.
if [ ! -d dictionaries ]; then
  git clone https://github.com/LibreOffice/dictionaries.git
fi

aff_file=""
dic_file=""

# Map the requested language to its LibreOffice hunspell dictionary pair
case "$language" in
  en)
    aff_file="./dictionaries/en/en_US.aff"
    dic_file="./dictionaries/en/en_US.dic"
    ;;
  de)
    aff_file="./dictionaries/de/de_DE_frami.aff"
    dic_file="./dictionaries/de/de_DE_frami.dic"
    ;;
  nl)
    aff_file="./dictionaries/nl_NL/nl_NL.aff"
    dic_file="./dictionaries/nl_NL/nl_NL.dic"
    ;;
  it)
    aff_file="./dictionaries/it_IT/it_IT.aff"
    dic_file="./dictionaries/it_IT/it_IT.dic"
    ;;
  cs)
    aff_file="./dictionaries/cs_CZ/cs_CZ.aff"
    dic_file="./dictionaries/cs_CZ/cs_CZ.dic"
    ;;
esac

if [ "$aff_file" == "" ]; then
  echo "Missing dictionary for preprocessor see process_splitter_dict.sh"
  exit 3
fi

echo "Building dict with:"
go run main/splitter_preprocessor.go "$index_file" "$dic_file" "$aff_file" "./data/splitter_dict.csv"
--------------------------------------------------------------------------------