├── .github
├── koursaros.jpg
├── logo.svg
└── overview.svg
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── flows
├── .DS_Store
├── factchecking
│ ├── index
│ │ ├── docker-compose.yml
│ │ ├── flow.py
│ │ └── helm
│ │ │ ├── .helmignore
│ │ │ ├── Chart.yaml
│ │ │ ├── templates
│ │ │ ├── NOTES.txt
│ │ │ ├── main.yaml
│ │ │ ├── service.yaml
│ │ │ └── statefulset.yaml
│ │ │ └── values.yaml
│ ├── query
│ │ ├── flow.py
│ │ └── helm
│ │ │ ├── .helmignore
│ │ │ ├── Chart.yaml
│ │ │ ├── templates
│ │ │ ├── NOTES.txt
│ │ │ ├── main.yaml
│ │ │ ├── service.yaml
│ │ │ └── statefulset.yaml
│ │ │ └── values.yaml
│ └── train
│ │ └── train-compose.yml
└── yc_demo
│ ├── .DS_Store
│ ├── docker-compose-temp.yml
│ ├── docker-compose.yml
│ ├── flow.py
│ ├── helm
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates
│ │ ├── NOTES.txt
│ │ ├── main.yaml
│ │ ├── service.yaml
│ │ └── statefulset.yaml
│ └── values.yaml
│ ├── index.k
│ └── query.k
├── koursaros
├── __init__.py
├── chart
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates
│ │ ├── NOTES.txt
│ │ ├── main.yaml
│ │ ├── service.yaml
│ │ └── statefulset.yaml
│ └── values.yaml
├── cli
│ ├── __init__.py
│ ├── __main__.py
│ ├── build
│ │ └── __init__.py
│ ├── deploy
│ │ └── __init__.py
│ ├── manager.py
│ ├── show
│ │ └── __init__.py
│ ├── test
│ │ └── __init__.py
│ └── utils.py
├── flow
│ └── __init__.py
├── hub
│ ├── client
│ │ ├── .DS_Store
│ │ ├── postgres
│ │ │ ├── Dockerfile
│ │ │ ├── postgres.py
│ │ │ ├── testrerank.yml
│ │ │ └── wikititles.yml
│ │ └── sheet
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ ├── client.py
│ │ │ └── test.csv
│ ├── encoder
│ │ ├── robertainfer
│ │ │ ├── Dockerfile
│ │ │ └── dim64.yml
│ │ └── textbyte
│ │ │ ├── Dockerfile
│ │ │ ├── max1024.yml
│ │ │ ├── max256.yml
│ │ │ └── textbyte.py
│ ├── httpclient
│ │ └── http
│ │ │ └── Dockerfile
│ ├── indexer
│ │ ├── faisscpu
│ │ │ ├── Dockerfile
│ │ │ └── base.yml
│ │ ├── keyword
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ └── keyword.py
│ │ ├── lvdb
│ │ │ ├── Dockerfile
│ │ │ └── base.yml
│ │ ├── rocksdb
│ │ │ ├── Dockerfile
│ │ │ └── base.yml
│ │ ├── simple_dict
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ └── simple_dict.py
│ │ └── whoosh
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ └── whoosh.py
│ ├── preprocessor
│ │ ├── sentsplit
│ │ │ ├── Dockerfile
│ │ │ └── jsonmode.yml
│ │ └── unary
│ │ │ ├── Dockerfile
│ │ │ └── text.yml
│ ├── router
│ │ ├── block
│ │ │ ├── Dockerfile
│ │ │ ├── block.py
│ │ │ ├── block_query.yml
│ │ │ └── block_train.yml
│ │ ├── log
│ │ │ ├── Dockerfile
│ │ │ └── log.py
│ │ ├── rerank
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ └── rerank.py
│ │ └── resp_req
│ │ │ ├── Dockerfile
│ │ │ ├── base.yml
│ │ │ └── resp_req.py
│ └── tests
│ │ ├── reviews_sample.csv
│ │ ├── sonnets_small.txt
│ │ ├── test_block.py
│ │ ├── test_keyword.py
│ │ ├── test_reranker.py
│ │ ├── test_textbyte_encoder.py
│ │ ├── test_whoosh.py
│ │ └── yaml
│ │ ├── test-joint.yml
│ │ ├── test-keyword.yml
│ │ ├── test-reranker.yml
│ │ └── test-whoosh.yml
└── repo_creds
│ ├── README.md
│ └── __init__.py
├── requirements.txt
├── setup.py
├── tutorials
├── deploy_custom_model.md
└── fact_check.md
└── utils
├── modeling
├── __init__.py
├── data.py
├── migrating.py
├── model.py
└── models
│ ├── __init__.py
│ ├── generative_transformer.py
│ └── transformer_model.py
├── predictor
├── __init__.py
└── __main__.py
├── trainer
├── __init__.py
└── __main__.py
├── utils
├── __init__.py
├── bucket
│ └── __init__.py
├── cuda
│ ├── __init__.py
│ └── apex.sh
├── database
│ ├── __init__.py
│ └── psql.py
└── misc
│ ├── __init__.py
│ └── tree.sh
└── yamls.py
/.github/koursaros.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/.github/koursaros.jpg
--------------------------------------------------------------------------------
/.github/logo.svg:
--------------------------------------------------------------------------------
1 | Asset 1 Koursaros
--------------------------------------------------------------------------------
/.github/overview.svg:
--------------------------------------------------------------------------------
1 | Asset 2 1 2 3 4 S e r vi c e S tub
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | MANIFEST
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | .pytest_cache/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | db.sqlite3
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # Environments
84 | .env
85 | .venv
86 | env/
87 | venv/
88 | ENV/
89 | env.bak/
90 | venv.bak/
91 |
92 | # Spyder project settings
93 | .spyderproject
94 | .spyproject
95 |
96 | # Rope project settings
97 | .ropeproject
98 |
99 | # mkdocs documentation
100 | /site
101 |
102 | # mypy
103 | .mypy_cache/
104 |
105 | # Jetbrains
106 | .idea
107 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 koursaros-ai
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include koursaros *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Blog •
13 | Highlights •
14 | Overview •
15 | Install •
16 | Getting Started •
17 | Documentation •
18 | Tutorials •
19 | Contributing
20 |
21 |
22 | Koursaros is a distributed cloud platform for developing and deploying neural search and inference applications.
23 |
24 | Koursaros leverages a general-purpose microservice architecture to enable low-latency, scalable deep neural network training and can be directly deployed to kubernetes for production.
25 |
26 | ## Description
27 | This page is a work in progress.
28 |
29 | ## Results
30 |
31 |
45 |
46 | ## Install
47 | ### Requirements
48 | You need Python 3.6 or later to run Koursaros.
49 |
50 | ### Stable Version
51 | #### Installing via pip
52 | We recommend installing Koursaros via pip:
53 | ```
54 | pip3 install koursaros
55 | ```
56 | Installation will use Python wheels from PyPI, available for OSX, Linux, and Windows.
57 |
58 | ### Latest Version
59 | ### Installing via pip-git
60 | You can install the latest version from Git:
61 | ```
62 | pip3 install git+https://git@github.com/koursaros-ai/koursaros.git
63 | ```
64 |
65 | ## Getting Started
66 | ### Creating a pipeline
67 | ```
68 | kctl deploy app
69 | ```
70 |
71 |
72 | ## Tutorials
73 | - Use Koursaros to get SoTA results in dev environment on the fever.ai benchmark using pretrained models.
74 | - Training custom models and deploying them as stubs
75 | - Training Elastic Search BM25 algorithm using Ax Bayesian Optimizer (coming soon)
76 | - Deploying fever.ai pipeline to production (Coming Soon)
77 |
--------------------------------------------------------------------------------
/flows/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/.DS_Store
--------------------------------------------------------------------------------
/flows/factchecking/index/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.4'
2 | services:
3 | Frontend0:
4 | image: gnes/gnes:latest-alpine
5 | command: frontend --port_in 61973 --port_out 54596 --port_ctrl 57120 --parallel_backend
6 | process
7 | ports:
8 | - 8800:8800
9 | sentsplit:
10 | image: hub-preprocessor:latest-sentsplit
11 | command: --port_in 54596 --port_out 60639 --socket_in PULL_CONNECT --socket_out
12 | PUB_BIND --port_ctrl 56881 --parallel_backend process --num_parallel 2 --yaml_path
13 | jsonmode.yml
14 | deploy:
15 | replicas: 2
16 | textbyte:
17 | image: hub-encoder:latest-textbyte
18 | command: --port_in 60639 --port_out 58737 --socket_in SUB_CONNECT --port_ctrl
19 | 54010 --parallel_backend process --num_parallel 2 --yaml_path max256.yml
20 | deploy:
21 | replicas: 2
22 | keyword:
23 | image: hub-indexer:latest-keyword
24 | command: --port_in 58737 --port_out 61340 --socket_in PULL_CONNECT --socket_out
25 | PUSH_CONNECT --port_ctrl 64855 --parallel_backend process --num_parallel 2 --yaml_path
26 | base.yml
27 | deploy:
28 | replicas: 2
29 | lvdb:
30 | image: hub-indexer:latest-lvdb
31 | command: --port_in 60639 --port_out 61340 --socket_in SUB_CONNECT --socket_out
32 | PUSH_CONNECT --port_ctrl 54746 --parallel_backend process --num_parallel 2 --yaml_path
33 | base.yml
34 | deploy:
35 | replicas: 2
36 | basereducerouter:
37 | image: gnes/gnes:latest-alpine
38 | command: route --port_in 61340 --port_out 61973 --socket_out PUSH_CONNECT --port_ctrl
39 | 57894 --parallel_backend process --yaml_path BaseReduceRouter --num_part 2
--------------------------------------------------------------------------------
/flows/factchecking/index/flow.py:
--------------------------------------------------------------------------------
1 | from koursaros.gnes_addons import Flow
2 |
3 | flow = (
4 | Flow(check_version=True)
5 | .add_client(name='postgres', yaml_path='wikititles.yml')
6 | .add_preprocessor(name='sentsplit', replicas=2, storage='1Gi', yaml_path='jsonmode.yml')
7 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2, yaml_path='max256.yml')
8 | .add_indexer(name='keyword', replicas=2, yaml_path='base.yml')
9 | .add_indexer(name='lvdb', recv_from='sentsplit', replicas=2, yaml_path='base.yml')
10 | .add_router(name='basereducerouter', num_part=2, recv_from=['keyword', 'lvdb'], yaml_path='BaseReduceRouter')
11 | )
12 |
13 | # check out what the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS)
14 | # funny!
15 |
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 | .vscode/
23 |
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for GNES
4 | name: gnes
5 | version: 0.1.0
6 |
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Deployed flow!
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/templates/main.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- with .Values.services }}
3 | {{- range list .frontend .preprocessor .encoder .indexer .router }}
4 | {{- range . }}
5 | ---
6 | {{ include "statefulset" .}}
7 | ---
8 | {{ include "service" .}}
9 | {{ end }}
10 | {{ end }}
11 | {{ end }}
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "service" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: v1
5 | kind: Service
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | selector:
10 | app: {{ $name }}
11 | clusterIP: None
12 | ports:
13 | {{- if .port_in }}
14 | - name: in
15 | port: {{ .port_in }}
16 | protocol: TCP
17 | {{- end }}
18 | {{- if .port_out }}
19 | - name: out
20 | port: {{ .port_out }}
21 | protocol: TCP
22 | {{- end -}}
23 | {{- if .grpc_port }}
24 | - name: grpc
25 | port: {{ .grpc_port }}
26 | protocol: TCP
27 | {{- end -}}
28 | {{- if .ctrl_port }}
29 | - name: ctrl
30 | port: {{ .ctrl_port }}
31 | protocol: TCP
32 | {{- end -}}
33 | {{ if .load_balancer }}
34 | type: LoadBalancer
35 | {{ end }}
36 | {{- end -}}
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/templates/statefulset.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "statefulset" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: apps/v1
5 | kind: StatefulSet
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | replicas: {{ .replicas }}
10 | selector:
11 | matchLabels:
12 | app: {{ $name }}
13 | volumeClaimTemplates:
14 | - metadata:
15 | name: {{ $name }}
16 | spec:
17 | accessModes:
18 | - ReadWriteOnce
19 | {{- if .storage }}
20 | resources:
21 | requests:
22 | storage: {{ .storage }}
23 | {{- end }}
24 | template:
25 | metadata:
26 | labels:
27 | app: {{ $name }}
28 | spec:
29 | containers:
30 | - name: {{ $name }}
31 | image: {{ .image }}
32 | args:
33 | {{- range .command }}
34 | - {{ . | quote }}
35 | {{- end }}
36 | imagePullPolicy: null
37 | ports:
38 | {{- if .port_in }}
39 | - name: in
40 | containerPort: {{ .port_in }}
41 | protocol: TCP
42 | {{- end }}
43 | {{- if .port_out }}
44 | - name: out
45 | containerPort: {{ .port_out }}
46 | protocol: TCP
47 | {{- end }}
48 | {{- if .grpc_port }}
49 | - name: grpc
50 | containerPort: {{ .grpc_port }}
51 | protocol: TCP
52 | {{- end }}
53 | {{- if .grpc_port }}
54 | - name: ctrl
55 | containerPort: {{ .port_ctrl }}
56 | protocol: TCP
57 | {{- end }}
58 | resources:
59 | requests:
60 | {{- if .cpu }}
61 | cpu: {{ .cpu }}
62 | {{- end }}
63 | {{- if .memory }}
64 | memory: {{ .memory }}
65 | {{- end }}
66 |
67 | {{- if .resources -}}
68 | {{- toYaml .resources | nindent 8 -}}
69 | {{- end -}}
70 | {{- end -}}
--------------------------------------------------------------------------------
/flows/factchecking/index/helm/values.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | frontend:
3 | - name: Frontend0
4 | app: frontend
5 | model: base
6 | port_in: 61973
7 | port_out: 54596
8 | ctrl_port:
9 | grpc_port: 8800
10 | command:
11 | - frontend
12 | - --port_in
13 | - '61973'
14 | - --port_out
15 | - '54596'
16 | - --port_ctrl
17 | - '57120'
18 | - --parallel_backend
19 | - process
20 | replicas: 1
21 | storage: 500Mi
22 | memory: 500Mi
23 | cpu: 300m
24 | image: gnes/gnes:latest-alpine
25 | preprocessor:
26 | - name: sentsplit
27 | app: preprocessor
28 | model: sentsplit
29 | port_in: 54596
30 | port_out: 60639
31 | ctrl_port:
32 | grpc_port:
33 | command:
34 | - --port_in
35 | - '54596'
36 | - --port_out
37 | - '60639'
38 | - --socket_in
39 | - PULL_CONNECT
40 | - --socket_out
41 | - PUB_BIND
42 | - --port_ctrl
43 | - '56881'
44 | - --parallel_backend
45 | - process
46 | - --num_parallel
47 | - '2'
48 | - --yaml_path
49 | - jsonmode.yml
50 | replicas: 2
51 | storage: 1Gi
52 | memory: 1Gi
53 | cpu: 1Gi
54 | image: hub-preprocessor:latest-sentsplit
55 | encoder:
56 | - name: textbyte
57 | app: encoder
58 | model: textbyte
59 | port_in: 60639
60 | port_out: 58737
61 | ctrl_port:
62 | grpc_port:
63 | command:
64 | - --port_in
65 | - '60639'
66 | - --port_out
67 | - '58737'
68 | - --socket_in
69 | - SUB_CONNECT
70 | - --port_ctrl
71 | - '54010'
72 | - --parallel_backend
73 | - process
74 | - --num_parallel
75 | - '2'
76 | - --yaml_path
77 | - max256.yml
78 | replicas: 2
79 | storage: 500Mi
80 | memory: 500Mi
81 | cpu: 300m
82 | image: hub-encoder:latest-textbyte
83 | indexer:
84 | - name: keyword
85 | app: indexer
86 | model: keyword
87 | port_in: 58737
88 | port_out: 61340
89 | ctrl_port:
90 | grpc_port:
91 | command:
92 | - --port_in
93 | - '58737'
94 | - --port_out
95 | - '61340'
96 | - --socket_in
97 | - PULL_CONNECT
98 | - --socket_out
99 | - PUSH_CONNECT
100 | - --port_ctrl
101 | - '64855'
102 | - --parallel_backend
103 | - process
104 | - --num_parallel
105 | - '2'
106 | - --yaml_path
107 | - base.yml
108 | replicas: 2
109 | storage: 500Mi
110 | memory: 500Mi
111 | cpu: 300m
112 | image: hub-indexer:latest-keyword
113 | - name: lvdb
114 | app: indexer
115 | model: lvdb
116 | port_in: 60639
117 | port_out: 61340
118 | ctrl_port:
119 | grpc_port:
120 | command:
121 | - --port_in
122 | - '60639'
123 | - --port_out
124 | - '61340'
125 | - --socket_in
126 | - SUB_CONNECT
127 | - --socket_out
128 | - PUSH_CONNECT
129 | - --port_ctrl
130 | - '54746'
131 | - --parallel_backend
132 | - process
133 | - --num_parallel
134 | - '2'
135 | - --yaml_path
136 | - base.yml
137 | replicas: 2
138 | storage: 500Mi
139 | memory: 500Mi
140 | cpu: 300m
141 | image: hub-indexer:latest-lvdb
142 | router:
143 | - name: basereducerouter
144 | app: router
145 | model: basereducerouter
146 | port_in: 61340
147 | port_out: 61973
148 | ctrl_port:
149 | grpc_port:
150 | command:
151 | - route
152 | - --port_in
153 | - '61340'
154 | - --port_out
155 | - '61973'
156 | - --socket_out
157 | - PUSH_CONNECT
158 | - --port_ctrl
159 | - '57894'
160 | - --parallel_backend
161 | - process
162 | - --yaml_path
163 | - BaseReduceRouter
164 | - --num_part
165 | - '2'
166 | replicas: 1
167 | storage: 500Mi
168 | memory: 500Mi
169 | cpu: 300m
170 | image: gnes/gnes:latest-alpine
--------------------------------------------------------------------------------
/flows/factchecking/query/flow.py:
--------------------------------------------------------------------------------
1 | from koursaros.gnes_addons import Flow
2 |
3 | flow = (
4 | Flow(check_version=True)
5 | .add_client(name='postgres', yaml_path='clients/postgres/wikititles.yml')
6 | .add_preprocessor(name='sentsplit', replicas=2,
7 | yaml_path='services/preprocessors/sentsplit/jsonmode.yml')
8 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2,
9 | yaml_path='services/encoders/textbyte/max256.yml')
10 | .add_indexer(name='keyword', replicas=2,
11 | yaml_path='services/indexers/keyword/base.yml')
12 | .add_indexer(name='lvdb', replicas=2, yaml_path='services/indexers/lvdb/base.yml')
13 | .add_encoder(name='robertainfer', replicas=2,
14 | yaml_path='services/encoders/robertainfer/dim64.yml')
15 | .add_router(name='reduce', num_part=2, yaml_path='BaseReduceRouter')
16 | )
17 |
18 |
19 | # check out what the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS)
20 | # funny!
21 |
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 | .vscode/
23 |
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for GNES
4 | name: gnes
5 | version: 0.1.0
6 |
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | raise NotImplementedError
2 |
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/templates/main.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- with .Values.services }}
3 | {{- range list .frontend .preprocessors .encoders .indexers .routers }}
4 | {{- range . }}
5 | ---
6 | {{ include "statefulset" .}}
7 | ---
8 | {{ include "service" .}}
9 | {{ end }}
10 | {{ end }}
11 | {{ end }}
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "service" -}}
3 | apiVersion: v1
4 | kind: Service
5 | spec:
6 | selector:
7 | app: {{ .name }}
8 | clusterIP: None
9 | ports:
10 | {{- if .port_in }}
11 | - name: in
12 | port: {{ .port_in }}
13 | protocol: TCP
14 | {{- end }}
15 | {{- if .port_out }}
16 | - name: out
17 | port: {{ .port_out }}
18 | protocol: TCP
19 | {{- end -}}
20 | {{- if .grpc_port }}
21 | - name: grpc
22 | port: {{ .grpc_port }}
23 | protocol: TCP
24 | {{- end -}}
25 | {{- if .grpc_port }}
26 | - name: ctrl
27 | port: {{ .port_ctrl }}
28 | protocol: TCP
29 | {{- end -}}
30 | {{ if .load_balancer }}
31 | type: LoadBalancer
32 | {{ end }}
33 | {{- end -}}
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/templates/statefulset.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "statefulset" -}}
3 | apiVersion: apps/v1
4 | kind: StatefulSet
5 | spec:
6 | replicas: {{ .replicas }}
7 | selector:
8 | matchLabels:
9 | app: {{ .name }}
10 | volumeClaimTemplates:
11 | accessModes: 'ReadWriteOnce'
12 | resources:
13 | requests:
14 | storage: {{ .storage }}
15 | template:
16 | spec:
17 | containers:
18 | - name: {{ .name }}
19 | image: {{ .image }}
20 | args: {{ .command }}
21 | imagePullPolicy: null
22 | ports:
23 | {{- if .port_in }}
24 | - name: in
25 | containerPort: {{ .port_in }}
26 | protocol: TCP
27 | {{- end }}
28 | {{- if .port_out }}
29 | - name: out
30 | containerPort: {{ .port_out }}
31 | protocol: TCP
32 | {{- end -}}
33 | {{- if .grpc_port }}
34 | - name: grpc
35 | containerPort: {{ .grpc_port }}
36 | protocol: TCP
37 | {{- end -}}
38 | {{- if .grpc_port }}
39 | - name: ctrl
40 | containerPort: {{ .port_ctrl }}
41 | protocol: TCP
42 | {{- end -}}
43 | resources:
44 | requests:
45 | cpu: {{ .cpu }}
46 | memory: {{ .memory }}
47 |
48 | {{- if .resources -}}
49 | {{- toYaml .resources | nindent 8 -}}
50 | {{- end -}}
51 | {{- end -}}
--------------------------------------------------------------------------------
/flows/factchecking/query/helm/values.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | frontends:
3 | - name: Frontend0
4 | port_in: 63152
5 | port_out: 49972
6 | ctrl_port:
7 | grpc_port: 8800
8 | command: frontend --port_in 63152 --port_out 49972 --port_ctrl 55922 --parallel_backend
9 | process
10 | replicas: 1
11 | storage:
12 | memory:
13 | cpu:
14 | image: gnes-frontend:Frontend0
15 | preprocessors:
16 | - name: sent_split
17 | port_in: 49972
18 | port_out: 53012
19 | ctrl_port:
20 | grpc_port:
21 | command: preprocess --port_in 49972 --port_out 53012 --socket_in PULL_CONNECT
22 | --port_ctrl 54583 --parallel_backend process --yaml_path services/preprocessors/sent_split/json_mode.yml
23 | replicas: 2
24 | storage:
25 | memory:
26 | cpu:
27 | image: gnes-preprocessor:sent_split
28 | encoders:
29 | - name: text_byte
30 | port_in: 53012
31 | port_out: 54139
32 | ctrl_port:
33 | grpc_port:
34 | command: encode --port_in 53012 --port_out 54139 --socket_in PULL_CONNECT --port_ctrl
35 | 51629 --parallel_backend process --yaml_path services/encoders/text_byte/max_256.yml
36 | replicas: 2
37 | storage:
38 | memory:
39 | cpu:
40 | image: gnes-encoder:text_byte
41 | - name: roberta_infer
42 | port_in: 55961
43 | port_out: 52539
44 | ctrl_port:
45 | grpc_port:
46 | command: encode --port_in 55961 --port_out 52539 --socket_in PULL_CONNECT --port_ctrl
47 | 52568 --parallel_backend process --yaml_path services/encoders/roberta_infer/dim_64.yml
48 | replicas: 2
49 | storage:
50 | memory:
51 | cpu:
52 | image: gnes-encoder:roberta_infer
53 | indexers:
54 | - name: keyword
55 | port_in: 54139
56 | port_out: 60943
57 | ctrl_port:
58 | grpc_port:
59 | command: index --port_in 54139 --port_out 60943 --socket_in PULL_CONNECT --port_ctrl
60 | 63670 --parallel_backend process --yaml_path services/indexers/keyword/base.yml
61 | replicas: 2
62 | storage:
63 | memory:
64 | cpu:
65 | image: gnes-indexer:keyword
66 | - name: lvdb
67 | port_in: 60943
68 | port_out: 55961
69 | ctrl_port:
70 | grpc_port:
71 | command: index --port_in 60943 --port_out 55961 --socket_in PULL_CONNECT --port_ctrl
72 | 55890 --parallel_backend process --yaml_path services/indexers/lvdb/base.yml
73 | replicas: 2
74 | storage:
75 | memory:
76 | cpu:
77 | image: gnes-indexer:lvdb
78 | routers:
79 | - name: Reduce
80 | port_in: 52539
81 | port_out: 63152
82 | ctrl_port:
83 | grpc_port:
84 | command: route --port_in 52539 --port_out 63152 --socket_in PULL_CONNECT --socket_out
85 | PUSH_CONNECT --port_ctrl 50250 --parallel_backend process --yaml_path BaseReduceRouter
86 | --num_part 2
87 | replicas: 1
88 | storage:
89 | memory:
90 | cpu:
91 | image: gnes-router:Reduce
--------------------------------------------------------------------------------
/flows/factchecking/train/train-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.4'
2 | services:
3 | Frontend00:
4 | image: gnes/gnes:latest-alpine
5 | command: frontend --grpc_port 5566 --port_out 62401 --socket_out PUSH_BIND --port_in
6 | 60753 --socket_in PULL_BIND --host_in Encoder20 --host_out Preprocessor10
7 | ports:
8 | - 5566:5566
9 | Preprocessor10:
10 | image: services/preprocessors:word-split-preprocessor
11 | command: preprocess --port_in 62401 --socket_in PULL_CONNECT --port_out 54470
12 | --socket_out PUSH_CONNECT --yaml_path SentSplitPreprocessor
13 | --host_in Frontend00 --host_out Encoder20
14 | deploy:
15 | replicas: 3
16 | restart_policy:
17 | condition: on-failure
18 | max_attempts: 3
19 | Encoder20:
20 | image: services/encoders:siamese-bert
21 | command: --port_in 54470 --socket_in PULL_BIND --port_out 60753 --socket_out PUSH_CONNECT
22 | --host_out Frontend00 --host_in Preprocessor10
--------------------------------------------------------------------------------
/flows/yc_demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/yc_demo/.DS_Store
--------------------------------------------------------------------------------
/flows/yc_demo/docker-compose-temp.yml:
--------------------------------------------------------------------------------
1 | services:
2 | block4:
3 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path block_train.yml
4 | --host_in router3 --port_in 58842 --port_out 55503
5 | ports: [55503:55503]
6 | frontend2:
7 | command: --socket_in PULL_BIND --socket_out PUSH_BIND --port_in 64750 --port_out
8 | 56531
9 | ports: [64750:64750, 56531:56531]
10 | http1:
11 | command: --socket_in RPC_BIND --socket_out RPC_CONNECT --port_in 61501 --host_out
12 | frontend2 --port_out 64750
13 | ports: [61501:61501]
14 | keyword7: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path
15 | base.yml --host_in textbyte6 --port_in 59483 --host_out rerank9 --port_out 64772}
16 | rerank9: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path
17 | base.yml --host_in router8 --port_in 56224 --host_out frontend2 --port_out 64750}
18 | router3:
19 | command: --socket_in PULL_CONNECT --socket_out PUB_BIND --yaml_path BaseRouter
20 | --host_in frontend2 --port_in 56531 --port_out 58842
21 | ports: [58842:58842]
22 | router8:
23 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path BaseRouter
24 | --host_in router3 --port_in 58842 --port_out 56224
25 | ports: [56224:56224]
26 | textbyte6:
27 | command: --socket_in PULL_CONNECT --socket_out PUSH_BIND --yaml_path max1024.yml
28 | --host_in unary5 --port_in 64036 --port_out 59483
29 | ports: [59483:59483]
30 | unary5:
31 | command: doc_type=1 --socket_in PULL_CONNECT --socket_out PUSH_BIND --yaml_path
32 | text.yml --host_in block4 --port_in 55503 --port_out 64036
33 | ports: [64036:64036]
34 | version: 3.4
35 |
--------------------------------------------------------------------------------
/flows/yc_demo/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.4'
2 | services:
3 | http:
4 | image: hub-httpclient:latest-http
5 | command: --grpc_host Frontend0 --start_doc_id 1
6 | ports:
7 | - 80:80
8 | Frontend0:
9 | image: gnes/gnes:latest-alpine
10 | command: frontend --port_in 57105 --port_out 65502 --port_ctrl 55166 --parallel_backend
11 | process
12 | Router0:
13 | image: gnes/gnes:latest-alpine
14 | command: route --port_in 65502 --port_out 58609 --socket_in PULL_CONNECT --socket_out
15 | PUB_BIND --port_ctrl 49407 --parallel_backend process --yaml_path BaseRouter
16 | --host_in Frontend0
17 | block:
18 | image: hub-router:latest-block
19 | command: --port_in 58609 --port_out 53283 --socket_in SUB_CONNECT --port_ctrl
20 | 52423 --parallel_backend process --yaml_path block_train.yml --host_in Router0
21 | unary:
22 | image: hub-preprocessor:latest-unary
23 | command: --port_in 53283 --port_out 51714 --socket_in PULL_CONNECT --port_ctrl
24 | 55377 --parallel_backend process --yaml_path text.yml --host_in block
25 | textbyte:
26 | image: hub-encoder:latest-textbyte
27 | command: --port_in 51714 --port_out 62690 --socket_in PULL_CONNECT --port_ctrl
28 | 57360 --parallel_backend process --yaml_path max1024.yml --host_in unary
29 | # --socket_out PUB_BIND # FOR INDEXING
30 | # whoosh:
31 | # image: hub-indexer:latest-whoosh
32 | # command: --port_in 62690 --port_out 57105 --socket_in SUB_CONNECT --port_ctrl
33 | # 60258 --parallel_backend process --yaml_path base.yml --host_in textbyte
34 | # --host_out Frontend0 --socket_out PUSH_CONNECT
35 | # volumes:
36 | # - ./.cache:/workspace
37 | # rocksdb:
38 | # image: hub-indexer:latest-rocksdb
39 | # command: --port_in 62690 --port_out 57105 --socket_in SUB_CONNECT --port_ctrl
40 | # 60258 --parallel_backend process --yaml_path base.yml --host_in textbyte
41 | # --host_out Frontend0 --socket_out PUSH_CONNECT
42 | # volumes:
43 | # - ./.cache:/workspace
44 | # FOR QUERYING
45 | whoosh:
46 | image: hub-indexer:latest-whoosh
47 | command: --port_in 62690 --port_out 61233 --socket_in PULL_CONNECT --port_ctrl
48 | 60258 --parallel_backend process --yaml_path base.yml --host_in textbyte
49 | volumes:
50 | - ./.cache:/workspace
51 | rocksdb:
52 | image: hub-indexer:latest-simple_dict
53 | command: --port_in 61233 --port_out 62155 --socket_in PULL_CONNECT --port_ctrl
54 | 60234 --parallel_backend process --yaml_path base.yml --host_in whoosh
55 | --host_out rerank --socket_out PUSH_CONNECT
56 | volumes:
57 | - ./.cache:/workspace
58 | # Router1:
59 | # image: gnes/gnes:latest-alpine
60 | # command: route --port_in 58609 --port_out 62155 --socket_in SUB_CONNECT --socket_out
61 | # PUSH_CONNECT --port_ctrl 50381 --parallel_backend process --yaml_path BaseRouter
62 | # --host_in Router0 --host_out rerank
63 | rerank:
64 | image: hub-router:latest-rerank
65 | command: --port_in 62155 --port_out 57105 --socket_out PUSH_CONNECT --port_ctrl
66 | 56641 --parallel_backend process --yaml_path base.yml --host_out Frontend0
--------------------------------------------------------------------------------
/flows/yc_demo/flow.py:
--------------------------------------------------------------------------------
1 | from koursaros.gnes_addons import Flow
2 |
3 |
# GNES flow for the yc_demo pipeline:
#   http client -> frontend -> routers -> preprocessor -> encoder
#   -> two chained indexers -> rerank router.
flow = (
    Flow(with_frontend=False)
    .add_http_client(name='http')
    .add_frontend(copy_flow=False)
    .add_router(yaml_path='BaseRouter')  # presumably auto-named Router0 (see recv_from below) — TODO confirm
    .add_router(name='block', yaml_path='block_train.yml')
    .add_preprocessor(name='unary', yaml_path='text.yml', doc_type=1)
    .add_encoder(name='textbyte', yaml_path='max1024.yml')
    .add_indexer(name='whoosh', yaml_path='base.yml')
    # NOTE(review): docker-compose.yml names this service "rocksdb" while the
    # image is hub-indexer:latest-simple_dict, and `rerank` below receives
    # from 'rocksdb' — verify the naming is consistent.
    .add_indexer(name='simple_dict', yaml_path='base.yml')
    .add_router(yaml_path='BaseRouter', recv_from=['Router0'])
    .add_router(name='rerank', yaml_path='base.yml', recv_from=['rocksdb', 'Router1'])
)
17 |
--------------------------------------------------------------------------------
/flows/yc_demo/helm/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 | .vscode/
23 |
--------------------------------------------------------------------------------
/flows/yc_demo/helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for GNES
4 | name: gnes
5 | version: 0.1.0
6 |
--------------------------------------------------------------------------------
/flows/yc_demo/helm/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Deployed flow!
--------------------------------------------------------------------------------
/flows/yc_demo/helm/templates/main.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- with .Values.services }}
{{- range list .httpclient .frontend .preprocessor .encoder .indexer .router }}
4 | {{- range . }}
5 | ---
6 | {{ include "statefulset" .}}
7 | ---
8 | {{ include "service" .}}
9 | {{ end }}
10 | {{ end }}
11 | {{ end }}
--------------------------------------------------------------------------------
/flows/yc_demo/helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "service" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: v1
5 | kind: Service
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | selector:
10 | app: {{ $name }}
11 | clusterIP: None
12 | ports:
13 | {{- if .port_in }}
14 | - name: in
15 | port: {{ .port_in }}
16 | protocol: TCP
17 | {{- end }}
18 | {{- if .port_out }}
19 | - name: out
20 | port: {{ .port_out }}
21 | protocol: TCP
22 | {{- end -}}
23 | {{- if .grpc_port }}
24 | - name: grpc
25 | port: {{ .grpc_port }}
26 | protocol: TCP
27 | {{- end -}}
28 | {{- if .ctrl_port }}
29 | - name: ctrl
30 | port: {{ .ctrl_port }}
31 | protocol: TCP
32 | {{- end -}}
33 | {{ if .load_balancer }}
34 | type: LoadBalancer
35 | {{ end }}
36 | {{- end -}}
--------------------------------------------------------------------------------
/flows/yc_demo/helm/templates/statefulset.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "statefulset" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: apps/v1
5 | kind: StatefulSet
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | replicas: {{ .replicas }}
10 | selector:
11 | matchLabels:
12 | app: {{ $name }}
13 | volumeClaimTemplates:
14 | - metadata:
15 | name: {{ $name }}
16 | spec:
17 | accessModes:
18 | - ReadWriteOnce
19 | {{- if .storage }}
20 | resources:
21 | requests:
22 | storage: {{ .storage }}
23 | {{- end }}
24 | template:
25 | metadata:
26 | labels:
27 | app: {{ $name }}
28 | spec:
29 | containers:
30 | - name: {{ $name }}
31 | image: {{ .image }}
32 | args:
33 | {{- range .command }}
34 | - {{ . | quote }}
35 | {{- end }}
36 | imagePullPolicy: null
37 | ports:
38 | {{- if .port_in }}
39 | - name: in
40 | containerPort: {{ .port_in }}
41 | protocol: TCP
42 | {{- end }}
43 | {{- if .port_out }}
44 | - name: out
45 | containerPort: {{ .port_out }}
46 | protocol: TCP
47 | {{- end }}
48 | {{- if .grpc_port }}
49 | - name: grpc
50 | containerPort: {{ .grpc_port }}
51 | protocol: TCP
52 | {{- end }}
        {{- if .ctrl_port }}
        - name: ctrl
          containerPort: {{ .ctrl_port }}
          protocol: TCP
        {{- end }}
58 | resources:
59 | requests:
60 | {{- if .cpu }}
61 | cpu: {{ .cpu }}
62 | {{- end }}
63 | {{- if .memory }}
64 | memory: {{ .memory }}
65 | {{- end }}
66 |
67 | {{- if .resources -}}
68 | {{- toYaml .resources | nindent 8 -}}
69 | {{- end -}}
70 | {{- end -}}
--------------------------------------------------------------------------------
/flows/yc_demo/helm/values.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | httpclient:
3 | - name: http
4 | app: httpclient
5 | model: http
6 | port_in:
7 | port_out:
8 | ctrl_port:
9 | grpc_port:
10 | command: []
11 | replicas: 1
12 | storage: 500Mi
13 | memory: 500Mi
14 | cpu: 300m
15 | image: hub-httpclient:latest-http
16 | frontend:
17 | - name: Frontend0
18 | app: frontend
19 | model: base
20 | port_in: 57105
21 | port_out: 65502
22 | ctrl_port:
23 | grpc_port:
24 | command:
25 | - frontend
26 | - --port_in
27 | - '57105'
28 | - --port_out
29 | - '65502'
30 | - --port_ctrl
31 | - '55166'
32 | - --parallel_backend
33 | - process
34 | replicas: 1
35 | storage: 500Mi
36 | memory: 500Mi
37 | cpu: 300m
38 | image: gnes/gnes:latest-alpine
39 | router:
40 | - name: Router0
41 | app: router
42 | model: base
43 | port_in: 65502
44 | port_out: 58609
45 | ctrl_port:
46 | grpc_port:
47 | command:
48 | - route
49 | - --port_in
50 | - '65502'
51 | - --port_out
52 | - '58609'
53 | - --socket_in
54 | - PULL_CONNECT
55 | - --socket_out
56 | - PUB_BIND
57 | - --port_ctrl
58 | - '49407'
59 | - --parallel_backend
60 | - process
61 | - --yaml_path
62 | - BaseRouter
63 | replicas: 1
64 | storage: 500Mi
65 | memory: 500Mi
66 | cpu: 300m
67 | image: gnes/gnes:latest-alpine
68 | - name: block
69 | app: router
70 | model: block
71 | port_in: 58609
72 | port_out: 53283
73 | ctrl_port:
74 | grpc_port:
75 | command:
76 | - --port_in
77 | - '58609'
78 | - --port_out
79 | - '53283'
80 | - --socket_in
81 | - SUB_CONNECT
82 | - --port_ctrl
83 | - '52423'
84 | - --parallel_backend
85 | - process
86 | - --yaml_path
87 | - block_train.yml
88 | replicas: 1
89 | storage: 500Mi
90 | memory: 500Mi
91 | cpu: 300m
92 | image: hub-router:latest-block
93 | - name: Router1
94 | app: router
95 | model: base
96 | port_in: 58609
97 | port_out: 62155
98 | ctrl_port:
99 | grpc_port:
100 | command:
101 | - route
102 | - --port_in
103 | - '58609'
104 | - --port_out
105 | - '62155'
106 | - --socket_in
107 | - SUB_CONNECT
108 | - --socket_out
109 | - PUSH_CONNECT
110 | - --port_ctrl
111 | - '50381'
112 | - --parallel_backend
113 | - process
114 | - --yaml_path
115 | - BaseRouter
116 | replicas: 1
117 | storage: 500Mi
118 | memory: 500Mi
119 | cpu: 300m
120 | image: gnes/gnes:latest-alpine
121 | - name: rerank
122 | app: router
123 | model: rerank
124 | port_in: 62155
125 | port_out: 57105
126 | ctrl_port:
127 | grpc_port:
128 | command:
129 | - --port_in
130 | - '62155'
131 | - --port_out
132 | - '57105'
133 | - --socket_out
134 | - PUSH_CONNECT
135 | - --port_ctrl
136 | - '56641'
137 | - --parallel_backend
138 | - process
139 | - --yaml_path
140 | - base.yml
141 | replicas: 1
142 | storage: 500Mi
143 | memory: 500Mi
144 | cpu: 300m
145 | image: hub-router:latest-rerank
146 | preprocessor:
147 | - name: unary
148 | app: preprocessor
149 | model: unary
150 | port_in: 53283
151 | port_out: 51714
152 | ctrl_port:
153 | grpc_port:
154 | command:
155 | - --port_in
156 | - '53283'
157 | - --port_out
158 | - '51714'
159 | - --socket_in
160 | - PULL_CONNECT
161 | - --port_ctrl
162 | - '55377'
163 | - --parallel_backend
164 | - process
165 | - --yaml_path
166 | - text.yml
167 | replicas: 1
168 | storage: 500Mi
169 | memory: 500Mi
170 | cpu: 300m
171 | image: hub-preprocessor:latest-unary
172 | encoder:
173 | - name: textbyte
174 | app: encoder
175 | model: textbyte
176 | port_in: 51714
177 | port_out: 62690
178 | ctrl_port:
179 | grpc_port:
180 | command:
181 | - --port_in
182 | - '51714'
183 | - --port_out
184 | - '62690'
185 | - --socket_in
186 | - PULL_CONNECT
187 | - --port_ctrl
188 | - '57360'
189 | - --parallel_backend
190 | - process
191 | - --yaml_path
192 | - max1024.yml
193 | replicas: 1
194 | storage: 500Mi
195 | memory: 500Mi
196 | cpu: 300m
197 | image: hub-encoder:latest-textbyte
198 | indexer:
199 | - name: keyword
200 | app: indexer
201 | model: keyword
202 | port_in: 62690
203 | port_out: 62155
204 | ctrl_port:
205 | grpc_port:
206 | command:
207 | - --port_in
208 | - '62690'
209 | - --port_out
210 | - '62155'
211 | - --socket_in
212 | - PULL_CONNECT
213 | - --socket_out
214 | - PUSH_CONNECT
215 | - --port_ctrl
216 | - '60258'
217 | - --parallel_backend
218 | - process
219 | - --yaml_path
220 | - base.yml
221 | replicas: 1
222 | storage: 500Mi
223 | memory: 500Mi
224 | cpu: 300m
225 | image: hub-indexer:latest-keyword
--------------------------------------------------------------------------------
/flows/yc_demo/index.k:
--------------------------------------------------------------------------------
1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD
2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 |
3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend
4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route
5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: |
6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: |
7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUB: |
8 | 7 | indexer | whoosh | 1 | base.yml | SUB:6 | PUSH:2 |
9 | 8 | indexer | rocksdb | 1 | base.yml | SUB:6 | PUSH:2 |
10 |
--------------------------------------------------------------------------------
/flows/yc_demo/query.k:
--------------------------------------------------------------------------------
1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD
2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 |
3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend
4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route
5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: |
6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: |
7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUSH: |
8 | 7 | indexer | whoosh | 1 | base.yml | PULL:6 | PUSH: |
9 | 8 | indexer | rocksdb | 1 | base.yml | PULL:7 | PUB:9 |
10 | 9 | router | rerank | 1 | base.yml | SUB: | PUSH:2 |
11 | 10| router | block | 1 | block_query.yml | SUB:3 | PUB:9 |
12 |
--------------------------------------------------------------------------------
/koursaros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/__init__.py
--------------------------------------------------------------------------------
/koursaros/chart/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 | .vscode/
23 |
--------------------------------------------------------------------------------
/koursaros/chart/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for GNES
4 | name: gnes
5 | version: 0.1.0
6 |
--------------------------------------------------------------------------------
/koursaros/chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Deployed flow!
--------------------------------------------------------------------------------
/koursaros/chart/templates/main.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- with .Values.services }}
{{- range list .httpclient .frontend .preprocessor .encoder .indexer .router }}
4 | {{- range . }}
5 | ---
6 | {{ include "statefulset" .}}
7 | ---
8 | {{ include "service" .}}
9 | {{ end }}
10 | {{ end }}
11 | {{ end }}
--------------------------------------------------------------------------------
/koursaros/chart/templates/service.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "service" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: v1
5 | kind: Service
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | selector:
10 | app: {{ $name }}
11 | clusterIP: None
12 | ports:
13 | {{- if .port_in }}
14 | - name: in
15 | port: {{ .port_in }}
16 | protocol: TCP
17 | {{- end }}
18 | {{- if .port_out }}
19 | - name: out
20 | port: {{ .port_out }}
21 | protocol: TCP
22 | {{- end -}}
23 | {{- if .grpc_port }}
24 | - name: grpc
25 | port: {{ .grpc_port }}
26 | protocol: TCP
27 | {{- end -}}
28 | {{- if .ctrl_port }}
29 | - name: ctrl
30 | port: {{ .ctrl_port }}
31 | protocol: TCP
32 | {{- end -}}
33 | {{ if .load_balancer }}
34 | type: LoadBalancer
35 | {{ end }}
36 | {{- end -}}
--------------------------------------------------------------------------------
/koursaros/chart/templates/statefulset.yaml:
--------------------------------------------------------------------------------
1 |
2 | {{- define "statefulset" -}}
3 | {{- $name := printf "%s-%s" .app .model -}}
4 | apiVersion: apps/v1
5 | kind: StatefulSet
6 | metadata:
7 | name: {{ $name }}
8 | spec:
9 | replicas: {{ .replicas }}
10 | selector:
11 | matchLabels:
12 | app: {{ $name }}
13 | volumeClaimTemplates:
14 | - metadata:
15 | name: {{ $name }}
16 | spec:
17 | accessModes:
18 | - ReadWriteOnce
19 | {{- if .storage }}
20 | resources:
21 | requests:
22 | storage: {{ .storage }}
23 | {{- end }}
24 | template:
25 | metadata:
26 | labels:
27 | app: {{ $name }}
28 | spec:
29 | containers:
30 | - name: {{ $name }}
31 | image: {{ .image }}
32 | args:
33 | {{- range .command }}
34 | - {{ . | quote }}
35 | {{- end }}
36 | imagePullPolicy: null
37 | ports:
38 | {{- if .port_in }}
39 | - name: in
40 | containerPort: {{ .port_in }}
41 | protocol: TCP
42 | {{- end }}
43 | {{- if .port_out }}
44 | - name: out
45 | containerPort: {{ .port_out }}
46 | protocol: TCP
47 | {{- end }}
48 | {{- if .grpc_port }}
49 | - name: grpc
50 | containerPort: {{ .grpc_port }}
51 | protocol: TCP
52 | {{- end }}
        {{- if .ctrl_port }}
        - name: ctrl
          containerPort: {{ .ctrl_port }}
          protocol: TCP
        {{- end }}
58 | resources:
59 | requests:
60 | {{- if .cpu }}
61 | cpu: {{ .cpu }}
62 | {{- end }}
63 | {{- if .memory }}
64 | memory: {{ .memory }}
65 | {{- end }}
66 |
67 | {{- if .resources -}}
68 | {{- toYaml .resources | nindent 8 -}}
69 | {{- end -}}
70 | {{- end -}}
--------------------------------------------------------------------------------
/koursaros/chart/values.yaml:
--------------------------------------------------------------------------------
1 |
2 | nameOverride: ""
3 | fullnameOverride: ""
4 | imagePullPolicy: IfNotPresent
5 |
services:
  httpclient: {}
  frontend: {}
  preprocessor: {}
  encoder: {}
  indexer: {}
  router: {}
12 |
13 | # Valid options for all services are:
14 | # - name: name of the service
15 | # image: -
16 | # port_in: -
17 | # port_out: -
18 | # command: command on container entrance
19 | # replicas: -
20 | # storage: storage on stateful claim
21 | # memory: ram
22 | # cpu: -
# storage and memory are suffixed with Gi (gibibytes) or Mi (mebibytes)
# cpu is suffixed with m (milliCPU, i.e. 1/1000 of a CPU)
--------------------------------------------------------------------------------
/koursaros/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/koursaros/cli/__main__.py:
--------------------------------------------------------------------------------
1 |
2 | from .manager import AppManager
3 | from .deploy import deploy
4 | from .test import test
5 | from .show import show
6 | from .build import build
7 | import click
8 |
9 |
@click.group()
@click.pass_context
def kctl(ctx):
    """
    kctl controls the \033[1;3;4;34mKoursaros\033[0m platform.
    Find more information at: https://github.com/koursaros-ai/koursaros
    """
    # One shared AppManager instance, handed to subcommands via @click.pass_obj.
    ctx.obj = AppManager()
18 |
19 |
# Attach each subcommand group to the root `kctl` group.
kctl.add_command(deploy)
kctl.add_command(test)
kctl.add_command(show)
kctl.add_command(build)


def main():
    # Console-script entry point; prog_name makes --help show the
    # package name instead of "__main__.py".
    kctl(prog_name=__package__)


if __name__ == "__main__":
    main()
32 |
--------------------------------------------------------------------------------
/koursaros/cli/build/__init__.py:
--------------------------------------------------------------------------------
1 | from koursaros.repo_creds import get_creds
2 | import click
3 | from shutil import copytree, rmtree
4 |
5 |
# Top-level `kctl build` command group; subcommands attach via @build.command().
@click.group()
def build():
    """Build docker images."""
10 |
@build.command()
@click.argument('flow_path')
@click.option('-p', '--push')
@click.option('-c', '--creds')
@click.option('-n', '--no-caches', multiple=True)
@click.pass_obj
def flow(app_manager, flow_path, push, creds, no_caches):
    """Build images for a pipeline. """

    # docker login up front so the per-image pushes below can succeed
    if push:
        if creds is None:
            raise ValueError('--creds repository must be specified if pushing')

        hub_creds = get_creds(creds).dockerhub
        app_manager.call('docker login -u %s -p %s' % (
            hub_creds.username, hub_creds.password), shell=True)

    # app_manager.call('eval $(minikube docker-env)', shell=True)

    _flow = app_manager.get_flow(flow_path)

    # images containing '/' (e.g. gnes/gnes:latest-alpine) are upstream
    # images and are not rebuilt locally
    for service in _flow.services.values():
        if '/' not in service['image']:
            path = str(app_manager.find_model(service['app'], service['model']))
            tag = service['image']
            app_manager.logger.critical('Building %s from %s...' % (tag, path))
            # services named via -n/--no-caches are rebuilt from scratch
            cache = '--no-cache ' if service.get('name', None) in no_caches else ''
            _build = 'docker build ' + cache + '-t %s %s' % (tag, path)
            app_manager.call(_build, shell=True)

            if push:
                app_manager.logger.critical('Pushing %s...' % tag)
                app_manager.call('docker push %s/%s' % (push, tag), shell=True)

    """save swarm yaml"""
    _flow.swarm()
    # app_manager.logger.critical('Saved swarm yaml to %s' % str(out_path))

    """save helm chart"""
    # out_path = _flow.path.parent.joinpath('helm')
    # rmtree(str(out_path), ignore_errors=True)
    # copytree(str(app_manager.pkg_root.joinpath('chart')), str(out_path))
    # _flow.path.parent.joinpath('helm/values.yaml').write_text(helm_yaml)
    # app_manager.logger.critical('Saved helm chart to %s' % str(out_path))
--------------------------------------------------------------------------------
/koursaros/cli/deploy/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | import click
3 | from tqdm import tqdm
4 | import time
5 | import importlib.util
6 |
7 |
# Top-level `kctl deploy` command group.
@click.group()
def deploy():
    """Deploy gnes services."""
11 |
12 |
@deploy.group()
def flow():
    """Deploy a pipeline with compose or k8s. """


# NOTE(review): @deploy.group() already registers `flow` on `deploy`;
# this add_command is redundant (though harmless).
deploy.add_command(flow)
19 |
20 |
@flow.command()
@click.argument('flow_path')
@click.pass_obj
def compose(app_manager, flow_path):
    """Tear down and restart the flow's docker-compose stack."""
    compose_file = app_manager.get_flow(flow_path).path.parent.joinpath('docker-compose.yml')
    # bring any running stack down first, then start fresh
    for action in ('down', 'up'):
        app_manager.call('docker-compose -f %s %s' % (str(compose_file), action), shell=True)
30 |
31 |
@flow.command()
@click.argument('flow_name')
@click.pass_obj
def swarm(app_manager, flow_name):
    """Remove and redeploy the flow as a docker swarm stack."""
    path = app_manager.get_flow(flow_name).path.parent.joinpath('docker-compose.yml')
    rm = 'docker stack rm %s' % flow_name
    app_manager.call(rm, shell=True)
    app_manager.logger.critical('Waiting for docker network resources...')
    # ~15s progress-bar pause so the removed stack's network resources free up.
    # NOTE(review): a list comprehension used purely for side effects.
    [time.sleep(0.15) for _ in tqdm(range(100))]
    stack = 'docker stack deploy --compose-file %s %s' % (str(path), flow_name)
    app_manager.call(stack, shell=True)
43 |
44 |
@flow.command()
@click.argument('flow_name')
@click.option('-d', '--dryrun', is_flag=True)
@click.pass_obj
def k8s(app_manager, flow_name, dryrun):
    """Purge existing helm releases and install the flow's chart."""
    chart_dir = app_manager.get_flow(flow_name).path.parent.joinpath('helm')
    # wipe every existing release before installing this one
    app_manager.call('helm delete --purge $(helm ls --all --short)', shell=True)
    flags = '--dry-run --debug ' if dryrun else ''
    app_manager.call('helm install ' + flags + str(chart_dir), shell=True)
55 |
56 |
# Unknown options/extra args are accepted and forwarded verbatim to the client.
@deploy.command(context_settings=dict(
    ignore_unknown_options=True,
    allow_extra_args=True))
@click.argument('client_name')
@click.pass_context
def client(ctx, client_name):
    """Deploy a client. """
    app_manager = ctx.obj
    # dynamically load hub/client/<client_name>/client.py
    path = app_manager.find_model('client', client_name).joinpath('client.py')
    if not path.exists():
        raise FileNotFoundError('Could not find %s' % path)
    spec = importlib.util.spec_from_file_location(client_name, path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # the loaded module is expected to expose a Client class with a run() method;
    # leftover CLI args become its constructor arguments
    module.Client(*ctx.args).run()
72 |
73 |
74 |
--------------------------------------------------------------------------------
/koursaros/cli/manager.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from gnes.helper import set_logger
4 | from importlib import machinery
5 | from koursaros.flow import Flow
6 | from pathlib import Path
7 | from typing import List
8 | import subprocess
9 | import git
10 | import os
11 |
12 |
class AppManager:
    """Manager that keeps track of all of the koursaros
    paths and packages. Passed around at runtime to make
    things more efficient.
    """

    def __init__(self):
        # root of whatever git checkout kctl is run from
        self.git_root = Path(git.Repo(
            '.', search_parent_directories=True).working_tree_dir)
        # installed koursaros package root (parent of the cli package)
        self.pkg_root = Path(__file__).parent.parent

        self.logger = set_logger('kctl')
        # scratch cache directory at <git root>/.k
        self.cache = self.git_root.joinpath('.k')
        self.cache.mkdir(exist_ok=True)

    def call(self, cmd: List[str], shell=False):
        """Log and run *cmd* via subprocess.call.

        :param cmd: argv list, or a single shell string when shell=True
        :param shell: passed straight through to subprocess.call
        """
        string = cmd if shell else ' '.join(cmd)
        self.logger.critical('subprocess.call: "%s"' % string)
        subprocess.call(cmd, shell=shell)

    @staticmethod
    def check_exists(path: 'Path'):
        """Raise FileNotFoundError if *path* does not exist."""
        if not path.exists():
            raise FileNotFoundError(path.absolute())

    def find_model(self, app: str, model: str) -> 'Path':
        """Return the hub path for *app*/*model*, e.g. hub/encoder/textbyte."""
        path = self.pkg_root.joinpath('hub', app, model)
        self.check_exists(path)
        return path

    def get_flow(self, path) -> 'Flow':
        """Load a Flow from the `.k` file at *path*."""
        path = Path(path)
        self.check_exists(path)
        return Flow(path)
49 |
--------------------------------------------------------------------------------
/koursaros/cli/show/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import webbrowser
4 | import click
5 |
6 |
# Top-level `kctl show` command group.
@click.group()
def show():
    """Show gnes architecture."""
10 |
11 |
@show.command()
@click.argument('flow_path')
@click.pass_obj
def flow(app_manager, flow_path):
    """Open the flow's mermaid diagram in a browser."""
    # fix: the docstring (which click uses as the --help text) previously read
    # "Deploy a pipeline with compose or k8s." — copy-pasted from the deploy
    # command and wrong for `kctl show flow`.
    url = app_manager.get_flow(flow_path).mermaid_url

    try:
        webbrowser.open_new_tab(url)
    except webbrowser.Error as ex:
        # no browser available — fall back to printing the URL
        app_manager.logger.critical(
            '%s\nCould not open browser... Please visit:\n%s' % (ex, url))
24 |
--------------------------------------------------------------------------------
/koursaros/cli/test/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | import requests
3 | import click
4 | import json
5 |
6 |
# `kctl test` command group; ctx is unused here but required by pass_context.
@click.group()
@click.pass_context
def test(ctx):
    """Test a running pipeline"""
11 |
12 |
def log_json_res(res):
    """Pretty-print a JSON HTTP response body through the TEST logger.

    :param res: a requests-style response whose ``.content`` is JSON bytes
    """
    # fix: set_logger was referenced but never imported in this module,
    # so every call raised NameError at runtime
    from gnes.helper import set_logger
    logger = set_logger('TEST')
    logger.info(json.dumps(json.loads(res.content), indent=4)
                .encode().decode("unicode_escape"))
17 |
18 |
@test.command()
@click.argument('pipeline_name')
@click.pass_context
def pipeline(ctx, pipeline_name):
    """Smoke-test a running pipeline by POSTing a sample payload."""
    # fix: set_logger was referenced but never imported in this module,
    # so this command raised NameError at runtime
    from gnes.helper import set_logger
    logger = set_logger('TEST')

    # only the 'telephone' pipeline is currently supported; other names no-op
    if pipeline_name == 'telephone':
        url = 'http://localhost:5000'
        headers = {'Content-Type': 'application/json'}

        translations = json.dumps({
            'translations': [{
                'lang': 'en',
                'text': input('What would you like to translate?\t')
            }]
        })

        # NOTE(review): assumes the gnes logger exposes a `.bold` method — confirm
        logger.bold('POSTING %s on %s' % (translations, url))
        res = requests.post(url + '/send', data=translations, headers=headers)
        log_json_res(res)
        logger.bold('REQ STATUS')
        res = requests.get(url + '/status', data=translations, headers=headers)
        log_json_res(res)
41 | log_json_res(res)
42 |
--------------------------------------------------------------------------------
/koursaros/cli/utils.py:
--------------------------------------------------------------------------------
def decorator_group(decorators):
    """Bundle several decorators into a single decorator.

    The returned decorator applies each member of *decorators* in order,
    innermost first (the first listed decorator wraps the function directly).

    :param decorators: iterable of decorators
    :return: a single decorator equivalent to applying them all

    Example:
        deploy_options = decorator_group([
            click.option('-c', '--connection', required=True),
            click.option('-r', '--rebind', is_flag=True),
            click.option('-d', '--debug', is_flag=True),
        ])
    """
    def combined(func):
        wrapped = func
        for deco in decorators:
            wrapped = deco(wrapped)
        return wrapped
    return combined
20 |
--------------------------------------------------------------------------------
/koursaros/flow/__init__.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import random
3 | from collections import defaultdict
4 | from base64 import b64encode
5 | from ruamel.yaml import YAML
6 |
APPS = ['httpclient', 'frontend', 'router', 'preprocessor', 'encoder', 'indexer']
IN_SOCKS = ['PULL', 'SUB', 'RPC']   # valid receiving socket types
OUT_SOCKS = ['PUSH', 'PUB', 'RPC']  # valid sending socket types


def _parse_sock(col, valid, label):
    """Parse one 'SOCK:peer' column into ['SOCK_BIND'|'SOCK_CONNECT', peer|None].

    A present numeric peer id means the socket CONNECTs to that service;
    an empty id means the socket BINDs locally.

    :param col: raw column text, e.g. 'PULL:3' or 'PUSH:'
    :param valid: allowed socket names (IN_SOCKS or OUT_SOCKS)
    :param label: 'in' or 'out', used in error messages
    :raises ValueError: on malformed input
    """
    sock = col.split(':')
    if len(sock) != 2:
        raise ValueError('":" not found in %s' % sock)
    if sock[0] not in valid:
        raise ValueError('"%s" not in %s' % (sock[0], valid))
    if sock[1] and not sock[1].isnumeric():
        raise ValueError('%s sock "%s" is not numeric' % (label, sock[1]))
    sock[1] = int(sock[1]) if sock[1] else None
    sock[0] += '_CONNECT' if sock[1] else '_BIND'
    return sock


def parse_line(line):
    """Parse one pipe-delimited `.k` flow row into a service-spec dict.

    Expected columns: ID | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD.

    Returns a dict with keys: line, id, app, model, image, reps, yaml_path,
    i, o, command.  (The previous implementation returned ``vars()``, which
    leaked every incidental local into the result; the explicit dict pins
    the interface and stops `id` shadowing the builtin.)

    :raises ValueError: on any malformed column, wrapping the line context.
    """
    try:
        line = [x.strip() for x in line.split('|')]

        if len(line) != 8:
            raise ValueError('Expected %s columns on line: %s' % (8, line))

        if not line[0].isnumeric():
            raise ValueError('expected numeric id but got %s' % line[0])
        service_id = int(line[0])

        app = line[1]
        if app not in APPS:
            raise ValueError('app must be in %s not %s' % (APPS, line[1]))

        model = line[2] if line[2] else None
        if model and not model.isidentifier():
            raise ValueError('model must be python identifier "%s"' % line[2])

        # services with a custom model get a hub image; stock services use gnes
        image = 'hub-%s:latest-%s' % (app, model) if model else 'gnes/gnes:latest-alpine'

        if not line[3].isnumeric():
            raise ValueError('replicas must be numeric not "%s"' % line[3])
        reps = int(line[3])

        yaml_path = line[4] if line[4] else None

        i = _parse_sock(line[5], IN_SOCKS, 'in')
        o = _parse_sock(line[6], OUT_SOCKS, 'out')

        command = line[7] if line[7] else None

        return {'line': line, 'id': service_id, 'app': app, 'model': model,
                'image': image, 'reps': reps, 'yaml_path': yaml_path,
                'i': i, 'o': o, 'command': command}

    except ValueError as e:
        raise ValueError('Error on line: %s\n\n%s' % (line, e))
65 |
66 |
class Flow:
    """In-memory representation of a flow definition file.

    Each non-comment line of the file describes one service (parsed by
    ``parse_line``).  The flow can then be rendered as a docker-compose
    file (``swarm``) or visualized as a mermaid graph URL (``mermaid_url``).
    """

    def __init__(self, path: 'Path'):
        self.services = dict()
        # for every *binding* service id: the ids of services connecting
        # to it ('ins' = sending into it, 'outs' = reading from it)
        self.ports = defaultdict(
            lambda: {'ins': set(), 'outs': set()})
        self.path = path
        self.lines = []
        # pool of unique local ports, shuffled so ports handed out per run
        # are unlikely to clash with a previous run's
        self.p = list(range(53001, 65001))
        random.shuffle(self.p)

        with Path(path).open() as fh:
            for line in fh:
                self.add_line(line)

    def add_line(self, line: str):
        """Parse one flow line and register its service ('#' lines skipped)."""
        if not line.strip().startswith('#'):
            self.lines += [line]
            service = parse_line(line)
            self._add_service(service)

    def _add_service(self, s: dict):
        """Register a parsed service dict and assign it local ports."""
        # if this service connects in/out to another id, record the edge
        # against the bound service's port entry
        in_id = s['i'][1]
        if in_id:
            self.ports[in_id]['outs'].add(s['id'])

        out_id = s['o'][1]
        if out_id:
            self.ports[out_id]['ins'].add(s['id'])

        s['name'] = s['model'] + str(s['id']) if s['model'] else s['app'] + str(s['id'])
        s['local_in'] = self.p.pop()
        s['local_out'] = self.p.pop()
        self.services[s['id']] = s

    def swarm(self):
        """Render the flow as ./docker-compose.yml (compose v3.4 format)."""
        y = {'version': '3.4', 'services': {}}
        for s in self.services.values():
            new = dict(volumes=['./.cache:/workspace'], image=s['image'])
            new['command'] = [s['command']] if s['command'] else []
            in_id = s['i'][1]
            out_id = s['o'][1]

            if s['app'] != 'httpclient':

                new['command'] += ['--socket_in', s['i'][0], '--socket_out', s['o'][0]]

                if s['yaml_path']:
                    new['command'] += ['--yaml_path', s['yaml_path']]

                # if connecting in
                if in_id:
                    new['command'] += ['--host_in', self.services[in_id]['name']]
                    new['command'] += ['--port_in', self.services[in_id]['local_out']]
                # if binding in
                else:
                    new['command'] += ['--port_in', s['local_in']]

                # if connecting out
                if out_id:
                    new['command'] += ['--host_out', self.services[out_id]['name']]
                    new['command'] += ['--port_out', self.services[out_id]['local_in']]
                # if binding out
                else:
                    new['command'] += ['--port_out', s['local_out']]

            else:
                new['ports'] = ['80:80']
                new['command'] += ['--grpc_host', self.services[out_id]['name']]

            new['command'] = ' '.join([str(x) for x in new['command']])
            y['services'][s['name']] = new

        # fix: close the output file instead of leaking the handle (the
        # original passed a bare open() straight into dump)
        with open('docker-compose.yml', 'w') as fh:
            YAML().dump(y, fh)

    @property
    def mermaid_url(self):
        """URL to a mermaid live-editor rendering of the flow graph."""
        app_colors = dict(
            httpclient=('#FFE0E0', '#000', '1px'),
            frontend=('#FFE0E0', '#000', '1px'),
            router=('#C9E8D2', '#000', '1px'),
            encoder=('#FFDAAF', '#000', '1px'),
            preprocessor=('#CED7EF', '#000', '1px'),
            indexer=('#FFFBC1', '#000', '1px'),
        )

        lines = ['graph TD']
        for cls, fmt in app_colors.items():
            lines += ['classDef {} fill:{},stroke:{},stroke-width:{};'.format(cls, *fmt)]

        def edge(left_s, right_s):
            # one mermaid edge labeled with the two socket types
            return ['{ln}--{lt}-{rt}-->{rn}'.format(
                ln=left_s['name'],
                lt=left_s['o'][0],
                rt=right_s['i'][0],
                rn=right_s['name']
            )]

        for bound_id, port in self.ports.items():
            bound_s = self.services[bound_id]

            for conn_id in port['ins']:
                conn_s = self.services[conn_id]
                lines += edge(conn_s, bound_s)

            for conn_id in port['outs']:
                conn_s = self.services[conn_id]
                lines += edge(bound_s, conn_s)

        for s in self.services.values():
            lines += ['class {} {};'.format(s['name'], s['app'])]

        return 'https://mermaidjs.github.io/mermaid-live-editor/#/view/' + b64encode('\n'.join(lines).encode()).decode()
183 |
184 |
185 |
186 |
--------------------------------------------------------------------------------
/koursaros/hub/client/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/client/.DS_Store
--------------------------------------------------------------------------------
/koursaros/hub/client/postgres/Dockerfile:
--------------------------------------------------------------------------------
# gnes base image on debian buster (provides pip + the gnes runtime)
FROM gnes/gnes:latest-buster

# build deps for psycopg2 (libpq headers, gcc, python headers) and git
# for the pip VCS install below
RUN apt update
RUN apt install libpq-dev gcc python3-dev musl-dev git -y
RUN pip install psycopg2 git+https://git@github.com/koursaros-ai/koursaros.git

# ship the client script and its yaml parameter files
ADD *.py *.yml ./

# NOTE(review): --start_doc_id 1 presumably matches the incremental-id
# check in postgres.py (ids must start at 1) — confirm
ENTRYPOINT ["python", "postgres.py", "--start_doc_id", "1"]
--------------------------------------------------------------------------------
/koursaros/hub/client/postgres/postgres.py:
--------------------------------------------------------------------------------
1 | from gnes.cli.parser import set_client_cli_parser
2 | from koursaros.repo_creds import get_creds
3 | from gnes.client.cli import CLIClient
4 | from gnes.base import TrainableBase
5 | import traceback
6 | import psycopg2
7 | import json
8 | import os
9 |
# accepted values for the 'send_type' parameter merged in from the yaml config
VALID_MODES = ['json', 'raw']
11 |
12 |
class PostgresClient(CLIClient):
    """gnes CLI client that streams rows of a postgres table as byte messages.

    Connection credentials come from a creds repo (``--creds``); query
    parameters (table, id_column, data_columns, send_type, limit) are merged
    into ``args`` from the yaml file by ``__main__``.
    """

    @property
    def bytes_generator(self):
        """Yield one encoded message per table row, ordered by id ascending."""
        try:
            args = self.args
            creds = get_creds(args.creds)

            psql = creds.postgres
            os.environ['PGSSLMODE'] = psql.sslmode
            os.environ['PGSSLROOTCERT'] = psql.sslrootcert.path

            # NOTE(review): identifiers are interpolated directly into the
            # SQL string; acceptable only because they come from a local
            # yaml config, not untrusted input.
            column_names = [args.id_column] + args.data_columns
            query = 'SELECT %s FROM %s' % (', '.join(column_names), args.table)
            query += ' ORDER BY %s ASC' % args.id_column
            # fix: guard a missing/None limit — the yaml may omit it, and
            # the original `args.limit > 0` raised TypeError on None
            limit = getattr(args, 'limit', None)
            if limit is not None and limit > 0:
                query += ' LIMIT %d' % limit

            connection = psycopg2.connect(user=psql.username,
                                          password=psql.password,
                                          host=psql.host,
                                          port=psql.port,
                                          dbname=psql.dbname)
            cursor = connection.cursor()
            cursor.execute(query)

            if args.send_type not in VALID_MODES:
                raise ValueError('"mode" parameter must be one of %s' % VALID_MODES)

            for i, (_id, *row) in enumerate(cursor):
                msg_id = i + 1
                if msg_id != _id:
                    raise ValueError(
                        '"%s" column must by an incremental id starting from 1. '
                        'Got id %s for row %s' % (args.id_column, _id, msg_id))

                if args.send_type == 'json':
                    # fix: the original did json.dumps(zip(columns, row))
                    # where `columns` was a joined *string* — zip paired
                    # characters with values and a zip object is not
                    # JSON-serializable anyway; emit a column->value dict
                    yield json.dumps(dict(zip(args.data_columns, row))).encode()
                elif args.send_type == 'raw':
                    yield ''.join(row).encode()

        # fix: narrowed from a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; log the traceback as before
        except Exception:
            self.logger.error(traceback.format_exc())

    def query_callback(self, req, resp):
        # fix: logger.info(req, resp) treats `resp` as a %-format argument
        # for `req` and crashes when the record is rendered; pass an
        # explicit format string with lazy args instead
        self.logger.info('%s, %s', req, resp)
59 |
60 |
if __name__ == '__main__':
    # build the gnes client CLI, add the postgres-specific flags, merge in
    # the yaml parameters, then hand everything to the client
    parser = set_client_cli_parser()
    parser.add_argument('--limit', type=int,
                        help='number of postgres rows (-1 for unlimited)')
    parser.add_argument('--creds', type=str, required=True,
                        help='cred repo set up according to git:koursaros-ai/koursaros.credentials spec')
    parser.add_argument('--yaml_path', type=str)
    cli_args = parser.parse_args()
    for param, value in TrainableBase.load_yaml(cli_args.yaml_path)['parameters'].items():
        setattr(cli_args, param, value)
    PostgresClient(cli_args)
71 |
--------------------------------------------------------------------------------
/koursaros/hub/client/postgres/testrerank.yml:
--------------------------------------------------------------------------------
parameters:
  # NOTE(review): postgres.py reads 'send_type', 'id_column' and
  # 'data_columns' from this parameters block — the 'columns' and 'mode'
  # keys below look stale and 'id_column' is missing; confirm against
  # postgres.py before using this config.
  table: 'test.train_article_rerank'
  columns: ['claim', 'label']
  mode: json
6 |
--------------------------------------------------------------------------------
/koursaros/hub/client/postgres/wikititles.yml:
--------------------------------------------------------------------------------
1 | parameters:
2 | table: wiki.articles
3 | id_column: id
4 | data_columns: [title]
5 | send_type: raw
6 | limit: 100
7 |
8 |
9 |
--------------------------------------------------------------------------------
/koursaros/hub/client/sheet/Dockerfile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/client/sheet/Dockerfile
--------------------------------------------------------------------------------
/koursaros/hub/client/sheet/base.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/client/sheet/base.yml
--------------------------------------------------------------------------------
/koursaros/hub/client/sheet/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pathlib
3 | import csv
4 | import json
5 |
# content type sent with every post to the local gateway
HEADERS = {'Content-Type': 'application/json'}
# request modes the gateway exposes as endpoints (and valid Client modes)
MODES = ['index', 'train', 'query']
8 |
9 |
class Client:
    """Reads a csv file and posts its rows to a local HTTP gateway.

    ``mode`` selects both the endpoint ('index', 'train' or 'query') and
    the method used to turn each csv row into a request body.  The whole
    csv is sent as one newline-joined payload during construction.
    """

    def __init__(self, mode, path, limit=None):
        self.path = pathlib.Path(path)
        self.csv = csv.DictReader(self.path.open())
        self.mode = mode
        self.limit = limit
        if mode not in MODES:
            raise ValueError('%s is not valid. Please choose one of %s' % (mode, MODES))

        self.iter_csv(getattr(self, mode))

    def post(self, data):
        """POST ``data`` to the endpoint for the current mode; keep the result."""
        print('Posting:', data)
        endpoint = 'http://localhost:80/%s' % self.mode
        payload = json.loads(requests.post(endpoint, data=data, headers=HEADERS).content)
        # unwrap the first 'res' entry when the gateway nests its answer
        self.result = json.loads(payload['res'][0]) if 'res' in payload else payload
        print('Returned:', self.result)

    def iter_csv(self, get_body_from_row):
        """Build one body per row (up to the limit) and post them all at once."""
        bodies = []
        for count, row in enumerate(self.csv):
            bodies.append(get_body_from_row(row))
            if self.limit is not None and count > self.limit:
                break
        self.post('\n'.join(bodies).encode())

    def index(self, row):
        """Row -> json body whose 'data' field is the row's second value."""
        second_value = list(row.values())[1]
        body = {'data': second_value, **row}
        return json.dumps(body, ensure_ascii=False)

    def train(self, row):
        """Row -> json body, verbatim."""
        return json.dumps(row, ensure_ascii=False)

    def query(self, row):
        """Row -> its first column value as the query text."""
        return next(iter(row.values()))

    def query_one(self, text):
        """Send a single query string and return the top hit's text."""
        self.mode = 'query'
        self.post(text.encode())
        return self.text()

    def text(self):
        """Text of the first chunk of the top-ranked result."""
        top_hit = self.result['search']['topkResults'][0]
        return top_hit['doc']['chunks'][0]['text']
--------------------------------------------------------------------------------
/koursaros/hub/client/sheet/test.csv:
--------------------------------------------------------------------------------
1 | i,x,y,same_security
2 | 0,semtech corp,semtech corporation,
3 | 1,vanguard mid cap index,vanguard midcap index - a,
4 | 2,spdr gold trust gold shares,spdr gold trust spdr gold shares,
5 | 3,vanguard total bond index adm,vanguard total bond market index,
6 | 4,oakmark international fund class i,oakmark international cl i,
7 | 5,pfizer inc div: 1.200,pfizer inc com,
8 | 6,spartan global ex us index fid adv cl,sptn glb xus idx adv,
9 | 7,vanguard total bond market idx-adm,vanguard total bond market index fund investor shares,
10 | 8,banco latinoamericano de exportacio class e com stk npv,banco latinoamericano come-e,
11 | 9,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc spons ads repr 0.10 ord cls a us0.00005,
12 | 10,whole foods market,whole foods markets inc div: 0.540,
13 | 11,walgreens boots alliance inc com,walgreens boots alli,
14 | 12,diageo plc new gb spon adr,diageo p l c spon adr new,
15 | 13,guggenheim bulletshares 2016,guggenheim bulletshares 2016 high yield,
16 | 14,vanguard small-cap index adm,vanguard small-cap index fund inst,
17 | 15,emerging markets,vanguard ftse emerging marke,
18 | 16,spdr s&p 500 etf iv,s&p 500 index spdr,
19 | 17,tegna inc com,tegna inc,
20 | 18,deere & company,deere co,
21 | 19,vanguard mid-cap index,vanguard mid-cap index fund institutional plus shares,
22 | 20,jpmorgan chase & co div: 1.760,jpmorgan chase & co,
23 | 21,american funds europacific growth fund - r6,af europac growth r6,
24 | 22,vanguard total bond market idx-adm,vang tot bd mk is pl,
25 | 23,unitedhealth gp inc div: 2.000,unitedhealth group incorporated,
26 | 24,american intl group inc warrant 01/19/2021,american intl gro 21 wtswarrants exp 01/19/21,
27 | 25,fifth street finance corp com,fifth street financial corp com,
28 | 26,ishares jpm embi global core,ishares jpm usd emrg mkt bnd etf,
29 | 27,metwest tot rtn bd m,metropolitan west tot ret bond,
30 | 28,exelixis inc com,exelixis inc,
31 | 29,glenmede large cap gwth,glenmede large cap growth,
32 | 30,af europac growth r6,american funds europacific growth r6,
33 | 31,dreamworks animation skg cl a,dreamworks animation skg inc cl a,
34 | 32,dfa us small cap value port instl,dfa u s small cap value cl i,
35 | 33,vanguard ltd-trm t/e adm,vanguard limited-term tax-exempt fund,
36 | 34,ishares trust msci united kingdom etf,ishares msci u k etf shs,
37 | 35,pimco total return cl a,pimco total return fund adm,
38 | 36,amg yacktman fund service class,amg yacktman service,
39 | 37,vanguard intermediate-term treasury fund admiral shares,vang intm treas adm,
40 | 38,pimco incm cl d,pimco fds income instl,
41 | 39,vanguard growth index fund investor shares,vang growth idx adm,
42 | 40,vanguard mid cap index,vanguard mid-cap index fund admiral shares,
43 | 41,vanguard value index fund institutional shares,vanguard value index inv,
44 | 42,vanguard target retirement 2060 fund,vanguard tgt rtrmnt 2060,
45 | 43,netflix inc,netflix inc.,
46 | 44,cisco sys inc com,cisco sys inc,
47 | 45,pimco income fund cl p,pimco income fd i,
48 | 46,united parcel service-cl b,united parcel svc inc cl b,
49 | 47,michael kors holdings ltd com npv,michael kors hldgs ltd,
50 | 48,alaska air group inc com,alaska air group inc,
51 | 49,vanguard total bond market index adm,vanguard ttl bnd mrk indx inst,
52 | 50,vanguard intermediate-term investment-grade fund admiral shares,vanguard intermediate term inv gr fd inv cl,
53 | 51,oppenhmr develpng mkts y,oppenheimer developing markets cl y,
54 | 52,texas instrs incorporated,texas instruments inc,
55 | 53,sptn intl index fai,spartan intl index fid adv class,
56 | 54,heartland pmt sys inc com,heartland paymnt sys,
57 | 55,vanguard total bond market index fund institutional shares,vanguard total bond market index,
58 | 56,ariel fund inv,ariel fund,
59 | 57,flir sys inc,flir systems inc,
60 | 58,pimco income fund cl d,pimco income instl,
61 | 59,vanguard shortterm investgrade adm,vanguard shrt trm invmnt grd-inv,
62 | 60,bristol myers squibb company,bristol myers squibb co,
63 | 61,metlife inc,metlife incorporated div: 1.500,
64 | 62,nxp semiconductors n v com,nxp semiconductors nv,
65 | 63,novo nordisk a/s-adr nvorepstg 1/2 cl b sh,novo-nordisk a-s fadr 1 adr reps 1 ord shs,
66 | 64,vanguard total bond market index fund institutional shares,vanguard total bond market idx instl pls,
67 | 65,accenture ltd ord,accenture plc ireland,
68 | 66,pimco total ret fd instl,pimco total return fund cl p,
69 | 67,pimco income a,pimco incm inst cl,
70 | 68,t. rowe price institutional large cap growth,t. rowe price institutional large cap growth fund,
71 | 69,hsbc hldgs plc spons adr new,hsbc hldgs plc spon adr new,
72 | 70,carnival corp ord (panama),carnival corp com,
73 | 71,baidu inc spons ads repr 0.10 ord cls a us0.00005,baidu inc - spon adr,
74 | 72,vanguard mid-cap index fund institutional shares,vanguard mid cap index ins,
75 | 73,ishares iboxx $ invt grade corp bd,ishares iboxx ig corp bond,
76 | 74,ultimate software group inc,ultimate software gp,
77 | 75,pimco income fund cl d,pimco income a,
78 | 76,franklin income series cl a,franklin incm fd cl a,
79 | 77,ormat technologies inc,ormat technologies,
80 | 78,aqr mgd futures strat fd cl i,aqr aqr mngd futures strategy i,
81 | 79,ishares russell midcap growth,ishares russell midcap g etf div: 0.903,
82 | 80,vanguard target retirement 2045 fund,vang target ret 2045,
83 | 81,vanguard total intl stk,vanguard total intl etf,
84 | 82,sptn inter treas bnd investor class,sptn int tr idx adv,
85 | 83,artisan intl value fund inv,artisan international value,
86 | 84,"stag industrials, inc. com",stag industrial inc com,
87 | 85,parnassus endeavor fd,parnassus endeavor fund investor shares,
88 | 86,johnson & johnson div: 3.000,johnson & johnson jnj,
89 | 87,eaton vance floating rate fd cl a,eaton vance bond fund cl i,
90 | 88,vanguard total bond index adm,vanguard ttl bnd mrk indx inst,
91 | 89,pimco total ret fd instl,pimco total return fund - class r,
92 | 90,pimco total return fund cl d,pimco tot return adm,
93 | 91,tivo inc com,tidewater inc com new,
94 | 92,zimmer biomet hldgs,zimmer biomet holdings inc com,
95 | 93,ford mtr company del com par $0.01,ford mtr co del com par $0.01,
96 | 94,guggenheim bullet shrs 2018 hi y c bd etf,guggenheim bulletshares 2018,
97 | 95,apple inc com,apple incorporated,
98 | 96,ishares jpm usd emr etf,ishares jpm usd emrg mkt bnd e tf,
99 | 97,edison international cmn,edison intl,
100 | 98,conagra foods inc div: 1.000,conagra foods inc,
101 | 99,advanced micro devices,advanced micro devices inc,
102 | 100,american tower corporation reit,american tower reit inc (hldg co) shs,
103 | 101,vang sm cap idx adm,vanguard small cap index fund,
104 | 102,vanguard short-term bond index fund investor shares,vanguard short-term bond index,
105 | 103,"vanguard small cap index, adm",vnguard index trust small cap idx instl,
106 | 104,ishares jpm usd emrg mkt bnd e tf,ishares jpm embi global core,
107 | 105,blackrock strat income i,blackrock strategic income opptys investor cl a,
108 | 106,ishares russell midcap,ishares russell mid-cap etf,
109 | 107,ishares core msci emg mkts etf,harding loevner emerging mkts,
110 | 108,vanguard intl equity index fds ftse emerging mkts etf,vanguard ftse emerging mark etf iv,
111 | 109,american funds europacfic r5,american euro pac gr r5,
112 | 110,vanguard crsp us small cap ind ex,vanguard small cap etf,
113 | 111,delta air lines inc dela new,delta air lines inc. (de),
114 | 112,ishares 20+ yr treasu bond etf div: 3.107,ishares 20+ year treasury bo,
115 | 113,sptn glb xus idx fai,spartan global ex us index fid adv cl,
116 | 114,fidelity new insights i,fidelity advisor new insights fund cl i,
117 | 115,ishares tr nat amt free bd,ishares tr natl mun bd etf fd,
118 | 116,vanguard small cap index adm,vanguard small-cap index fund institutional shares,
119 | 117,first eagle global fd cl a,first eagle global class a,
120 | 118,t rowe price mid cap growth,mid-cap growth fund,
121 | 119,skyworks solutions,skyworks solutions inc com,
122 | 120,tile shop hldgs inc,tile shop hldg inc,
123 | 121,t. rowe price health sciences,t rowe price health science fund inc,
124 | 122,coca-cola co/the,coca-cola company,
125 | 123,atwood oceanics inc com,atwood oceanics inc.,
126 | 124,dodge & cox funds income fund,dodge & cox income fund n/l,
127 | 125,vanguard small cap index fund,vnguard index trust small cap idx instl,
128 | 126,vanguard mid-cap value etf,vanguard mid cap value etf,
129 | 127,american funds euro pacific growth r6,am fnd europacfic grth r6,
130 | 128,wisdomtree intl smallcp dividend etf,wisdomtree tr intl smallcap divid fd isin #us9 sedol #b17fg17,
131 | 129,boeing company cmn,boeing company,
132 | 130,ishares msci emerging markets,harding loevner emerging mkts,
133 | 131,transocean ltd zug namen akt,transocean ltd ord,
134 | 132,vanguard small-cap index fund admiral shares,vanguard small-cap index adm,
135 | 133,pimco total ret fd instl,pimco total return fd cl c,
136 | 134,jpmorgan equity income fund cl a,jpmorgan us equity fund - class r6,
137 | 135,energy sector index spdr,energy sector spdr etf,
138 | 136,vanguard total internatlbnd etf iv,vanguard charlotte total intl bd index fd etf,
139 | 137,t rowe price international discovery fund,t. rowe price international discovery,
140 | 138,united sts stl cp (new),united states stl corp new,
141 | 139,coca cola co,coca-cola company,
142 | 140,spdr barclays high yield bond (jnk),spdr barclays capital high yield bond et,
143 | 141,vanguard smallcap index fund,vanguard small-cap index fund institutional shares,
144 | 142,fidelity select utilities portfolio,fid sel utilities,
145 | 143,select sector spdr trust the technology select sector spdr fund,technology sector sp etf,
146 | 144,guggenheim s&p 500 equal we cons etf,guggenheim s&p 500 equalwe cons etf,
147 | 145,visa inc class a shares,visa inc cl a div: 0.560,
148 | 146,skyworks solutions inc com,skyworks solutions inc,
149 | 147,alibaba group holding ltd spons ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord,
150 | 148,pimco income instl,pimco incm inst cl,
151 | 149,vanguard growth index inv,vanguard growth index fund admiral shares,
152 | 150,canadian natl railway co,canadian natl ry co f,
153 | 151,first tr exchange traded fd dow jones internet index fd,first tr exchange traded fd dow jones in,
154 | 152,vanguard total bond index adm,vanguard total bond market index i,
155 | 153,ishares gold tr,ishares gold tr ishares,
156 | 154,oppenheimer developing markets y fund,oppenheimer developing markets cl y,
157 | 155,vanguard total bond market index adm,vang tot bd mk is pl,
158 | 156,t. rowe price new income,guidemark core fixed income,
159 | 157,vanguard mid-cap index fund institutional shares,vang midcap idx inst,
160 | 158,mfs international new discovery r5,afs international growth & income fund cl f1,
161 | 159,fitbit inc,fitbit inc cl a,
162 | 160,vanguard growth index fund investor shares,vanguard growth index fd admiral share,
163 | 161,comcast corp (new) class a div: 1.100,comcast corp cl a,
164 | 162,invesco diversified dividend investor cl,fidelity advisor diversified international fund cl c,
165 | 163,blackrock high yld bd port cl k,blackrock high yield bond portfolio svc,
166 | 164,sina com ord (caym is),sina corporation com,
167 | 165,t.rowe price new horizons-t,new horizons fund,
168 | 166,vanguard value etf (vtv),vanguard value etf,
169 | 167,berkshire hathawayinc del cl b new,berkshire hathawayinc,
170 | 168,schlumberger limited com usd0.01,schlumberger ltd.,
171 | 169,union pacific corp,union pac corp com,
172 | 170,alps etf tr alerian mlp,alps alerian mlp etf,
173 | 171,vanguard div growth fd investor shrs,vanguard dividend growth fund investor shares,
174 | 172,tyson foods inc-cl a tsn,tyson foods inc class a,
175 | 173,american mutual fund-a,american mutual fund cl a,
176 | 174,canadian national railway,canadian natl railway company com,
177 | 175,deutsche x-trackers msci eafe hedged equity etf,deutsche x-trackers msci eafe equity etf,
178 | 176,vanguard total bond index adm,vanguard total bond market idx instl pls,
179 | 177,western digital corp,western digital corp com,
180 | 178,ishares core msci emg mkts etf,harding loevner emerg mrkts port adv,
181 | 179,pimco high income fd com shs,pimco high income fund,
182 | 180,ishares inc core msci emerging mkts etf,harding loevner emerging mkts,
183 | 181,time warner inc com,time warner inc,
184 | 182,vanguard international growth fund admiral,vanguard international growth fund admiral shares,
185 | 183,ishares trust core msci total intl stk etf,ishares core msci ttl int stk,
186 | 184,ishares iboxx invt gradebond etf,ishares iboxx $ invt grade corp bd,
187 | 185,interactive brokers group inc. com,interactive brokers class a,
188 | 186,adobe systems,adobe systems incorporated,
189 | 187,vang tot bd mkt adm,vanguard total bond market idx instl pls,
190 | 188,yandex nv com,yandex n.v. com usd0.01 cl a,
191 | 189,vanguard small cap index - a,vanguard small-cap index fund inst,
192 | 190,sirius xm hldgs inc com isin #us5 sedol #bgldk10,sirius xm hldgs inc com,
193 | 191,vanguard target retirement 203 5 fund,vang target ret 2035,
194 | 192,communications sales&leas inc div: 2.400,communications sales&leas inc com,
195 | 193,vanguard index fds vanguard total stk mkt etf,us total stock market,
196 | 194,t. rowe price equity income fund,t. rowe price equity income,
197 | 195,ishares tr core us aggt bd etf,ishares core us aggregate bond etf,
198 | 196,american funds europacific growthr3,american funds europacific growth fund,
199 | 197,lazard emerging mkts eqty port opn,lzrd emrg mkts eq o,
200 | 198,pimco income a,pimco income administrative,
201 | 199,american express co,american express company,
202 | 200,taser international,taser intl inc del com,
203 | 201,vanguard short term tax exempt fd investor shr,vanguard short-term tax-exempt fund investor shares,
204 | 202,ishares core msci emerging markets etf,harding loevner emerg mrkts port adv,
205 | 203,vanguard institutional index fund institutional shares,vanguard institl index,
206 | 204,trp real estate adv,t rowe price real estate fund adv cl,
207 | 205,jp morgan chase & co com,jpmorgan chase & co div: 1.760,
208 | 206,vanguard 500 index fund admira l,vanguard 500 index fund admiral class,
209 | 207,dollar gen corp new com,dollar general corp,
210 | 208,us silica holdings inc,u s silica hldgs inc com,
211 | 209,alphabet inc cap stk cl c cap stk cl c,alphabet inc cl c,
212 | 210,ishares msci usa min volility etf,proshares short vix short term etf,
213 | 211,fidelity low-priced stock,fid low priced stk,
214 | 212,vang st invstgrd inv,vanguard short-term invest-grade,
215 | 213,goldman sachs mangd futures strategy a,aqr aqr mngd futures strategy i,
216 | 214,select sector spdr trust health care select index,health care select spdr fund,
217 | 215,metropolitan west fds total ret cl i,metropolitan west tot ret bond,
218 | 216,chubb limited com,chubb ltd,
219 | 217,vanguard ftse emerg mkts etf,vanguard intl equity index fds ftse emerging mkts etf,
220 | 218,energy transfer partners un,cheniere energy partners lp com,
221 | 219,baron partners,baron partners fund,
222 | 220,prudential financial inc cmn,prudential finl inc,
223 | 221,t rowe price retirement 2050 fund,t. rowe price retirement 2050 fund,
224 | 222,templeton global bond class a,templeton global bd r,
225 | 223,dominion resources inc va new,dominion resources inc/va,
226 | 224,ishares tr core us aggt bd etf,ishares core u.s. aggregate,
227 | 225,citigroup inc,citigroup inc new div: 0.200,
228 | 226,invesco comstock fund cl a,invesco comstock y,
229 | 227,oppen developing mkts a,oppenheimer developing mkts fd cl a,
230 | 228,alphabet inc shs cl a,alphabet inc voting,
231 | 229,national grid new adr each repr 5 ord gbp0.11395,national grid plc new spon adr,
232 | 230,ishares russell 3000 index etf,ishares russell 3000 etf,
233 | 231,titan international inc com,titan international inc,
234 | 232,proshares tr ii ultra bloomberg crude oi,proshares ultra bloomberg crude oil etf,
235 | 233,ishares core msci emerging markets etf,harding loevner emerging mkts,
236 | 234,vang tot bd mkt adm,vanguard total bond market index i,
237 | 235,mfs global total return fund cl a,mfs global total return cl a,
238 | 236,metropolitan west total return m,metropolitan west tot ret bond,
239 | 237,blckrck inflation protect,blackrock inflation protected bond instl,
240 | 238,dfa real estate securities i,dfa real estate securities fund institutional class,
241 | 239,leucadia natl corp com,leucadia national co,
242 | 240,pimco income fd i,pimco income instl,
243 | 241,trp retirement 2045,t. rowe price retirement 2045,
244 | 242,wal-mart stores inc com isin #us9311421039 sedol #2936921,scana corp new com isin #us7 sedol #2545844,
245 | 243,oppenheimer developing market a,oppenheimer developing mkts fd cl a,
246 | 244,nuveen high yield muni bond fund cl i,nuveen high yield municipal bond a,
247 | 245,vanguard short term invt grade admiral,vanguard short-term investment-grade fund investor shares,
248 | 246,the growth fund of america,amer fds grwth fd amr a,
249 | 247,fireeye inc,fireeye inc com usd0.0001,
250 | 248,templeton global bond fund advisor class,templeton global bond fund adv cl,
251 | 249,ishares inc core msci emerging mkts etf,harding loevner emerg mrkts port adv,
252 | 250,silver wheaton corp. ads,silver wheaton corporation com npv isin #ca6 sedol #b058zx6,
253 | 251,dfa us small cap value prtf instl,dfa us sm cap value,
254 | 252,united states oil fund lp exchange-traded fund,united states oil fund lp unit,
255 | 253,pimco total return instl,fund: pimco total return admin,
256 | 254,tesla motors inc com,tesla motors inc.,
257 | 255,schwab short term us treasury etf,schwab strategic tr short-term us treasury,
258 | 256,vanguard total bond index adm,vang tot bd mk is pl,
259 | 257,van small cap index admir,vang sm cap idx inst,
260 | 258,facebook incorporated class a,facebook inc cl a,
261 | 259,vanguard ext market index inst,vanguard extended market idx adm,
262 | 260,t. rowe price new horizons,new horizons fund,
263 | 261,vanguard total bond market index fund admiral shares,vanguard bond index total mkt investor,
264 | 262,lloyds banking group plc div: 0.129,lloyds banking group plc,
265 | 263,vbr:vanguard small-cap value etf,vanguard small cap valueetf iv,
266 | 264,constellation brands inc cl a,constellation brand class a,
267 | 265,jp morgan chase & co com,jp morgan chase & co,
268 | 266,pimco total return instl,pimco total return fund instl cl,
269 | 267,ishares gold etf,ishares gold trust com,
270 | 268,schwab intl core equity,schwab intl core eqty fd instl cl,
271 | 269,vanguard total bond market index-admiral,vang tot bd mkt inst,
272 | 270,pimco total return instl,pimco total return fund adm,
273 | 271,alibaba group hldg ltd sponsor,alibaba group hldg ltd adr,
274 | 272,ishares russell 1000 growth,russell 1000 growth (ishares),
275 | 273,walt disney co,disney,
276 | 274,vanguard s&p 500 etf (voo),vanguard index fds s&p 500 etf,
277 | 275,ishares msci eafe min volatility etf,ishares trust msci eafe min volatil etf,
278 | 276,kraft heinz co com,kraft heinz co div: 2.300,
279 | 277,metr w tot rtn bond cl m,metropolitan west tot ret bond,
280 | 278,berkshire hathaway cl-b new,berkshire hathaway inc.,
281 | 279,momenta pharmaceuticals,momenta pharmaceuticals inc com,
282 | 280,colgate palmolive,colgate palmolive co com,
283 | 281,ishares inc core msci emerging mkts etf,ishares msci emerging markets,
284 | 282,powershares qqq etf,powershares qqq trust sr 1 etf,
285 | 283,vanguard small-cap index fund admiral,vanguard small-cap index fund institutional shares,
286 | 284,corning inc cm,corning inc,
287 | 285,vereit inc reit,vereit inc,
288 | 286,ishares core msci emerging div: 0.995,ishares core msci emg mkts etf,
289 | 287,american funds europacific growth-r6,american funds europacific growth fund class r-6,
290 | 288,national oilwell varco inc com,national-oilwell varco inc,
291 | 289,vanguard 500 index fund admiral shares,vnguard 500 index admiral shares,
292 | 290,facebook inc.,facebook inc class a,
293 | 291,whole foods market inc,whole foods markets inc div: 0.540,
294 | 292,bhp billiton plc - adr,bhp billiton plc spons adr each rep 2 ord usd0.50,
295 | 293,ishares russell 2000 growth etf iv,ishares russell 2000 grwth etf div: 1.243,
296 | 294,osterweis strategic income fund,professionally mgd ptfl osterweis strategic inc fd,
297 | 295,vanguard small cap value etf,vanguard small cap valueetf iv,
298 | 296,wa core plus bond i,western asset core plus bond fd cl fi,
299 | 297,kraft heinz co div: 2.300,kraft heinz co com,
300 | 298,nokia corp-spon adr,nokia corp cls a adr (finnish),
301 | 299,citrix systems inc.,citrix systems inc,
302 | 300,devry education group inc div: 0.360,devry education group,
303 | 301,vanguard gnma fund admiral shares,vanguard gnma fund investor share,
304 | 302,vanguard star investor class,vanguard star fund investor shares,
305 | 303,vanguard total bond market index adm,vanguard total bond market index fund investor shares,
306 | 304,ishares gold tr,ishares gold etf,
307 | 305,eaton corp plc com,eaton corp plc f,
308 | 306,pepsico inc com,pepsico inc cmn,
309 | 307,wal-mart stores inc com,wal-mart stores inc.,
310 | 308,whole foods mkt inc com,whole foods market,
311 | 309,american capital world growth and income fd a,american capital world grth & inc a,
312 | 310,vanguard mid cap index fund - admiral,vanguard mid cap index fund admiral class,
313 | 311,fidelity corporate bond fund,baird core plus bond inst,
314 | 312,spartan extended mkt index fid adv class,spartan extended mkt index investor cl,
315 | 313,nxp semiconductors f,nxp semiconductors n v,
316 | 314,columbia dividend income,col dividend inc z,
317 | 315,vanguard sml-cap ind-adm,vanguard small-cap index fund admiral,
318 | 316,vanguard index 500 port,vanguard 500 index fund-inv,
319 | 317,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc sponsored adr repstg ord shares class a,
320 | 318,delaware value fund institutional,delaware value cl a,
321 | 319,vanguard total bond market index fund admiral shares,vanguard total bond market index inv,
322 | 320,arista networks inc,arista networks inc com usd0.0001,
323 | 321,ishares 1-3 yr treasury bnd etf,ishares 1-3 yr treasury bnd et f,
324 | 322,united sts oil fd lp units,united states oil fund lp exchange-traded fund,
325 | 323,pimco income fund cl p,pimco fds income instl,
326 | 324,diageo plc fadr 1 adr reps 4 ord shs,diageo p l c spon adr new,
327 | 325,harbor capital appreciation instl,harbor capital appreciation,
328 | 326,price t rowe group inc com isin #us74144t1088 sedol #2702337,c h robinson worldwide inc com new isin #us8 sedol #2116228,
329 | 327,ishares s&p midcap fund,ishares core s&p mid capetf,
330 | 328,pimco total return cl a,fund: pimco total return admin,
331 | 329,johnson and johnson,johnson and johnson com,
332 | 330,ishares msci cda etf,ishares msci canada index,
333 | 331,carnival corp com,carnival corp f,
334 | 332,priceline group,priceline grp inc com new,
335 | 333,westport innovation f,westport innovations inc,
336 | 334,wisdomtree emerging markets high dividend fund etf,wisdomtree emrg mrkt hg div etf,
337 | 335,d.r. horton inc,d r horton co,
338 | 336,vanguard total bond market index adm,vanguard total bond market idx instl pls,
339 | 337,ishares 7-10 year treas bond etf,ishares barclays 7-10 year treasury bond,
340 | 338,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus shares,
341 | 339,vanguard sml-cap ind-adm,vanguard small-cap index fund institutional shares,
342 | 340,vanguard interm-term investment-grde adm,vanguard intermediate term inv gr fd inv cl,
343 | 341,teva pharmaceuticals adr,teva pharmaceuticals ind ltd israel adr,
344 | 342,dfa intl small cap value cl i,dfa intl small cap value port instl,
345 | 343,prudential financial inc div: 2.800,prudential finl inc,
346 | 344,af bond fd amer r6,the bond fund of america-a,
347 | 345,allergan plc,allergan plc f,
348 | 346,vale sa,vale s a adr,
349 | 347,vanguard short-term corporate bond etf,vanguard short-term corporate bond,
350 | 348,vanguard index fds vanguard small cap growth vipers formerly,vanguard small-cap grwth etf,
351 | 349,vanguard short-term bondetf,vanguard short term etf,
352 | 350,goldman sachs mgd futures strat a,aqr mgd futures strat fd cl i,
353 | 351,vanguard total bond index adm,vanguard total bond market index fund investor shares,
354 | 352,xinyuan real estate com,xinyuan real estate co ltd spon adr,
355 | 353,ishares core u.s. aggregate bond etf,ishares core total us bond market etf,
356 | 354,devon energy corp,devon energy corporation (new) cmn,
357 | 355,berkshire hathaway class b,berkshire hathawayinc,
358 | 356,vanguard reit index investor,vanguard reit index inv,
359 | 357,markel corp hldg co,markel corp,
360 | 358,vanguard high-yield corporate fund investor shares,vanguard high yield corp fund admiral share,
361 | 359,oneok inc new div: 2.460,oneok inc cm (new),
362 | 360,iron mtn inc new com div: 1.940,iron mtn inc reit,
363 | 361,howard hughes corp com,howard hughes corp,
364 | 362,af bond fd amer r6,the bond fund of america,
365 | 363,kraft heinz co,kraft heinz co div: 2.300,
366 | 364,costco wholesale crp del,costco wholesale co,
367 | 365,first trust amex biotechnology index fund,first trust nyse arca biotechnology index fund,
368 | 366,american tower corporation isin #us0 sedol #b7fbfl2,american tower corp reit,
369 | 367,the growth fund of america,the growth fund of america-529a (1005),
370 | 368,priceline group inc com,priceline group inc,
371 | 369,pimco total return fund instl cl,total return fund (pimco),
372 | 370,southwest gas corp.,southwest gas corp div: 1.800,
373 | 371,vanguard dividend growth fund investor shares,vanguard dividend growth fund,
374 | 372,j p morgan chase & co,jpmorgan chase & co div: 1.760,
375 | 373,merck & company inc new,merck & co inc new,
376 | 374,pimco incm cl d,pimco income fd i,
377 | 375,vanguard fixed income secs inter term invt grade fd admiral cl,vanguard intermediate term inv gr fd inv cl,
378 | 376,vanguard growth index fund investor shares,vanguard growth index admiral,
379 | 377,guggenheim bulletshares 2018,guggenheim bulletshares 2018 high yield corp bd,
380 | 378,vanguard total bond market div: 2.009,vanguard total bond mkt,
381 | 379,blackhawk network hldgs inc cl a,blackhawk netwk hldgs inc,
382 | 380,vanguard balanced index fd inv cl shrs,vanguard balanced index fund admiral shares,
383 | 381,vanguard mid-cap value index fund,vanguard mid-cap value etf,
384 | 382,ishares tr natl mun bd etf fd,ishares nationl amt freemuni etf,
385 | 383,bank amer corp com,bank of america corp.,
386 | 384,russell 1000 growth (ishares),ishares russell 1000 grw etf div: 1.363,
387 | 385,american fd growth fd of america cl a,amer fds grwth fd amr a,
388 | 386,ishares us real estate etf,ishares u s real estate etf,
389 | 387,dfa emerging markets portfolio,dfa emrging markets,
390 | 388,alphabet inc-cl a,alphabet inc. class a,
391 | 389,raytheon co (new) div: 2.680,raytheon co com,
392 | 390,charles schwab new,schwab charles corp new,
393 | 391,alibaba group holding lt,alibaba group hldg ltd sponsor,
394 | 392,oakmark international i,oakmark fds oakmark intl,
395 | 393,mastercard incorporated cmn class a,mastercard inc-class a,
396 | 394,kimberly-clark corp,kimberly-clark corp.,
397 | 395,ishares 7-10 yr treasry bd etf div: 1.973,ishares barclays 7-10 year treasury bond,
398 | 396,vanguard small-cap index fund admiral shares,vanguard small-cap index fund inst,
399 | 397,foot locker inc com isin #us9 sedol #2980906,foot locker inc com isin #us3448491049 sedol #2980906,
400 | 398,vanguard ttl bond mkt idx adm,vanguard total bond market index inv,
401 | 399,chesapeake energy corporation oklahoma,chesapeake energy corp,
402 | 400,charles schwab corporation cmn,schwab charles corp new,
403 | 401,vanguard intl equity index fds ftse all world ex usa small cap index fd etf shs,vanguard ftse all world ex us small cap etf,
404 | 402,chicago bridge & iron co nv,chicago bridge & iron company n.v. eur0.01 reg,
405 | 403,amer fds grwth fd amr a,american gr fd of america a,
406 | 404,amg managers real estate securities fund,amg managers real estate securities fd,
407 | 405,american new perspective class a,new perspective fund cl a,
408 | 406,lloyds banking group plc sp adr,lloyds banking group plc div: 0.129,
409 | 407,vanguard 500 idx adm,vanguard s&p 500 index - a,
410 | 408,vanguard total bond market index admiral,vanguard total bond market index fund institutional shares,
411 | 409,j p morgan chase & co,jp morgan chase & co,
412 | 410,halliburton co hldg,halliburton company,
413 | 411,guggenheim bulletshares 2018 high yield corp bd,claymore exchange traded fd trust guggenheim bltshrs 2018 high yld cp bd,
414 | 412,ishares russell 2000 value etf,ishares russell 2000 value etf iv,
415 | 413,pimco total return a,total return fund (pimco),
416 | 414,vanguard index fds vanguard reit etf formerly vanguard index tr to 05/24/01 reit viper shs,vanguard index fds vanguard reit etf formerly vanguard index,
417 | 415,powershares emrg mkts sovrgn dbt etf,powershares emerging markets sovereign d,
418 | 416,vang tot bd mkt adm,vanguard total bond market index,
419 | 417,arena pharmaceuticals inc com,arena pharmaceuticals,
420 | 418,nokia corp spon adr f1 adr rep 1 nokia corps,nokia corp sponsored adr,
421 | 419,vanguard value index fund admiral shares,vanguard value index fund institutional shares,
422 | 420,vanguard total int bd idx etf,vanguard charlotte total intl bd index fd etf,
423 | 421,pimco total ret fd instl,pimco total return r,
424 | 422,intl business machines,intl business mach,
425 | 423,harbor international fund,harbor international,
426 | 424,vanguard intl bond index etf,vanguard total internatlbnd etf iv,
427 | 425,vanguard extended market index institutional class,vang ext mkt idx ins,
428 | 426,caterpillar inc del,caterpillar inc,
429 | 427,visa inc com cl a,salesforce.com inc com,
430 | 428,pimco total return admin,total return fund (pimco),
431 | 429,pepsico inc.,pepsico inc nc div: 2.810,
432 | 430,powershares fin pfd portfoli,powershares etf financial pfd portfolio,
433 | 431,fid sel biotech,fidelity select biotechnology,
434 | 432,arcelormittal sa luxembourg ny registry sh isin #us4 sedol #b295f26,arcelormittal sa (luxembourg),
435 | 433,realty incm corp reit,realty income corporation com,
436 | 434,vanguard total international stock index fund admiral shares,vanguard ttl intl stk ind adm,
437 | 435,c h robinson worldwide inc com new isin #us8 sedol #2116228,c.h. robinson worldwide inc,
438 | 436,sprint corp shs series -,sprint corp,
439 | 437,google inc cl c,alphabet inc cap stk cl c,
440 | 438,templeton glbal bond adv,templeton global bond adv,
441 | 439,apollo investments corp com,apollo invt corp com,
442 | 440,sptn us bond idx is,spartan us bond indx fidelity adv class,
443 | 441,ishares inc msci emrg mkts min volatility etf,ishares msci markets minvol etf,
444 | 442,vanguard total bond market index admiral,vanguard total bond market idx-adm,
445 | 443,southwest gas corp div: 1.800,southwest gas corp,
446 | 444,vanguard ftse developd mkt etf,vanguard ftse dev markets etf,
447 | 445,spdr nuveen barclays muni bond etf,spdr nuveen barclays muni,
448 | 446,dodge & cox interntl stock,dodge & cox international stock,
449 | 447,alexion pharms inc,alexion pharmaceuticals inc,
450 | 448,vanguard total bond market idx-adm,vanguard ttl bnd mrk indx inst,
451 | 449,general mtrs co,general motors co.,
452 | 450,zoes kitchen inc com isin #us7 sedol #bl95n36,zoes kitchen inc com,
453 | 451,loomis sayles bond fund cl i,loomis sayles mlti-asset inc a,
454 | 452,athenahealth inc delaware,athenahealth inc,
455 | 453,ishares silver trust etf,ishares silver shares,
456 | 454,emerging markets,harding loevner emerging mkts,
457 | 455,markel corp hldg co,markel corp holding company,
458 | 456,ameriprise financial inc,ameriprise finl inc,
459 | 457,vanguard malvern fds etf,vanguard short term inflation protected,
460 | 458,alibaba group hldg limited sponsored ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord,
461 | 459,american funds american hi inc tr r3,american high-income trust,
462 | 460,first eagle global i,first eagle global fund cl i,
463 | 461,van mid cap index adm m4940,vanguard mid cap index,
464 | 462,chesapeake energy corp,chesapeake energy corp com,
465 | 463,starbucks corp. cmn,starbucks corp washington div: 0.800,
466 | 464,mc donalds corp div: 3.560,mcdonalds corp,
467 | 465,american funds europacifc r3,american funds europacific r3,
468 | 466,block h & r inc,block h&r inc,
469 | 467,vanguard ftse developed mkts etf,vanguard ftse developed market etf,
470 | 468,vang tot bd mkt adm,vang tot bd mk is pl,
471 | 469,pimco investment grade corporate bond fund - class a,fidelity conservativ income bond fd cl i,
472 | 470,vanguard mid cap index fund - admiral,vanguard mid cap index fd,
473 | 471,vanguard intrmd-term bond index adm,vanguard inter-term bond index port inv,
474 | 472,infinera corporation com isin #us1 sedol #b1yb5y4,infinera corp com,
475 | 473,vanguard crsp us small cap index,vanguard small cap etf,
476 | 474,oneok partners lp lp,oneok partners l p unit ltd partnership,
477 | 475,american mutual fund,american fd american mutual fd cl f2,
478 | 476,american funds europacific growth-r5,american europacific growth,
479 | 477,vang tot bd mkt adm,vanguard ttl bnd mrk indx inst,
480 | 478,breitburn energy partners lp c,breitburn energy partners lp com,
481 | 479,new york community bancorp inc.,new york community,
482 | 480,pimco income instl,pimco income administrative,
483 | 481,select sector spdr trust technology select index,sector spdr tr shs ben int technology,
484 | 482,vanguard ttl bond mkt idx adm,vanguard bond index total mkt investor,
485 | 483,tmpl global bond a,templeton global bond fund r,
486 | 484,ishares 20+ year treasury bo,ishares 20+ year,
487 | 485,vang sm cap idx adm,vnguard index trust small cap idx instl,
488 | 486,linkedin corp class a,linkedin corp-a,
489 | 487,primecap odyssey stock,primecap odyssey stock fund,
490 | 488,time warner inc,time warner inc com new,
491 | 489,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus,
492 | 490,marathon pete corporation,loews corporation div: 0.250,
493 | 491,ishares core msci emerging etf,harding loevner emerging mkts,
494 | 492,vanguard short term tax exempt admiral share,vanguard short-term tax-exempt fund investor shares,
495 | 493,ford motor com,ford mtr co,
496 | 494,vanguard ftse all world ex us etf,vanguard ftse all-world ex-u,
497 | 495,vmware inc cl a com,vmware inc.,
498 | 496,royal dutch shell plc spons adr a,royal dutch shell plc sponsored adr repstg a shs,
499 | 497,spdr nuveen barclays municipal bond etf,spdr nuveen barclays capital m div: 0.556,
500 | 498,united parcel service cl b,united parcel service inc cl b,
501 | 499,vang smcp gr idx adm,vanguard small cap growth index admiral,
502 | 500,novo-nordisk a s adr,novo-nordisk a s adr isin #us6 sedol #2651202,
503 | 501,fidelity mass muni income,nuveen equity premium income,
504 | 502,ishares russell mid-cap value etf,ishares russell mid cap value etf iv,
505 | 503,yamana gold inc cmn,yamana gold inc com,
506 | 504,vanguard total bond market idx-adm,vanguard total bond market idx instl pls,
507 | 505,otter tail corp com,otter tail corporation cmn,
508 | 506,sptn intl index ins,sptn intl index adv,
509 | 507,cnooc ltd. adr (sponsored),cnooc limited adr fsponsored adr 1 adr rep 100 cl h ord,
510 | 508,eaton vance global macro abs ret a,eaton vance global macro abslte rt cl a,
511 | 509,dfa u s small cap value cl i,dfa us small cap value prtf instl,
512 | 510,sears canada inc (canada),sears cda inc,
513 | 511,at&t inc com isin #us00206r1023 sedol #2831811,franklin res inc com isin #us8 sedol #2350684,
514 | 512,vanguard total international bond index etf,vanguard total international bond et,
515 | 513,wisdomtree japan hedged equity -,wisdomtree japan hedged eq,
516 | 514,templeton global bond fund advisor class,templeton glbal bond adv,
517 | 515,trp health sciences,t. rowe price health sciences fund,
--------------------------------------------------------------------------------
/koursaros/hub/encoder/robertainfer/Dockerfile:
--------------------------------------------------------------------------------
# Encoder service image: bundles the local config files and runs `gnes encode`.
FROM gnes/gnes:latest-alpine

# bake the component's python/yaml configs into the image root
ADD *.py *.yml ./

ENTRYPOINT ["gnes", "encode"]
--------------------------------------------------------------------------------
/koursaros/hub/encoder/robertainfer/dim64.yml:
--------------------------------------------------------------------------------
# Character-embedding encoder config producing 64-dimensional vectors.
!CharEmbeddingEncoder
parameters:
  dim: 64
--------------------------------------------------------------------------------
/koursaros/hub/encoder/textbyte/Dockerfile:
--------------------------------------------------------------------------------
# Encoder service image for TextByteEncoder: runs `gnes encode` with the
# local textbyte.py loaded via --py_path.
FROM gnes/gnes:latest-alpine

ADD *.py *.yml ./

# (removed leftover `RUN echo 'yo'` debug layer — it only bloated the image history)

ENTRYPOINT ["gnes", "encode", "--py_path", "textbyte.py"]
--------------------------------------------------------------------------------
/koursaros/hub/encoder/textbyte/max1024.yml:
--------------------------------------------------------------------------------
# TextByteEncoder config: pad/truncate each text to 1024 bytes.
!TextByteEncoder
parameters:
  max_seq_len: 1024
--------------------------------------------------------------------------------
/koursaros/hub/encoder/textbyte/max256.yml:
--------------------------------------------------------------------------------
# TextByteEncoder config: pad/truncate each text to 256 bytes.
!TextByteEncoder
parameters:
  max_seq_len: 256


--------------------------------------------------------------------------------
/koursaros/hub/encoder/textbyte/textbyte.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 |
5 | from gnes.encoder.base import BaseTextEncoder
6 | from gnes.helper import batching
7 |
8 |
class TextByteEncoder(BaseTextEncoder):
    """Encode each text into a fixed-length vector of its raw UTF-8 bytes.

    Each sentence is UTF-8 encoded, truncated to ``max_seq_len`` bytes
    without splitting a multi-byte character, then right-padded with NUL
    bytes into a ``uint8`` vector. Useful for byte-level text search.
    """
    is_trained = True  # no training step required

    def __init__(self, max_seq_len: int, *args, **kwargs):
        """
        :param max_seq_len: length in bytes of every output vector
        """
        super().__init__(*args, **kwargs)
        self._msl = max_seq_len

    def pad_and_vector(self, sent: str) -> np.ndarray:
        """Return *sent* as a NUL-padded uint8 vector of length ``max_seq_len``."""
        raw = sent.encode()[:self._msl]
        # Truncation may have split a multi-byte character. The old code
        # stripped exactly 2 bytes and decoded unconditionally, which still
        # raised for a 4-byte char split after its first byte; drop trailing
        # bytes one at a time until the remainder decodes cleanly.
        while raw:
            try:
                raw.decode()
                break
            except UnicodeDecodeError:
                raw = raw[:-1]
        padded = raw + b'\x00' * (self._msl - len(raw))
        return np.frombuffer(padded, dtype=np.uint8)

    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        """Encode a batch of sentences into a (len(text), max_seq_len) uint8 array."""
        return np.stack([self.pad_and_vector(sent) for sent in text])
29 |
30 |
--------------------------------------------------------------------------------
/koursaros/hub/httpclient/http/Dockerfile:
--------------------------------------------------------------------------------
# HTTP client gateway image: serves `gnes client http` from a fork of gnes.
# NOTE(review): base image ships CUDA but nothing here obviously needs a GPU — confirm.
FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime

RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp
# installs gnes from a personal fork, not PyPI — pinned only to the branch HEAD
RUN pip install git+https://github.com/colethienes/gnes.git --no-cache-dir --compile

COPY . ./workspace
WORKDIR /workspace

ENTRYPOINT ["gnes", "client", "http"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/faisscpu/Dockerfile:
--------------------------------------------------------------------------------
# FAISS (CPU) indexer service image: runs `gnes index` with the baked-in YAML config.
FROM gnes/hub-indexer:latest-faiss-cpu

ADD *.yml ./

ENTRYPOINT ["gnes", "index"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/faisscpu/base.yml:
--------------------------------------------------------------------------------
# FAISS indexer config: HNSW32 index over 64-dim vectors, persisted in /workspace.
# num_dim must match the upstream encoder's output dimension (e.g. robertainfer dim64).
!FaissIndexer
parameters:
  data_path: /workspace
  index_key: HNSW32
  num_dim: 64
--------------------------------------------------------------------------------
/koursaros/hub/indexer/keyword/Dockerfile:
--------------------------------------------------------------------------------
# Keyword indexer service image: pyahocorasick needs a C toolchain to build on alpine.
FROM gnes/gnes:latest-alpine

RUN apk add gcc python3-dev musl-dev
RUN pip install pyahocorasick

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "index", "--py_path", "keyword.py"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/keyword/base.yml:
--------------------------------------------------------------------------------
# KeywordIndexer with default parameters (see keyword.py).
!KeywordIndexer {}


--------------------------------------------------------------------------------
/koursaros/hub/indexer/keyword/keyword.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple
2 | import numpy as np
3 | from collections import defaultdict
4 |
5 | from gnes.indexer.base import BaseChunkIndexer as BCI
6 |
7 |
class KeywordIndexer(BCI):
    """Chunk indexer for exact keyword matching via the Aho-Corasick
    automaton (pyahocorasick).

    Vectors are expected to be NUL-padded UTF-8 byte vectors (as produced
    by ``TextByteEncoder``); results are ranked by number of matches.
    """

    def __init__(self, *args, **kwargs):
        """Initialize an empty Aho-Corasick automaton."""
        super().__init__(*args, **kwargs)
        import ahocorasick  # local import keeps the dependency service-scoped
        self._automaton = ahocorasick.Automaton()
        self.size = 0  # number of indexed keywords

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs):
        """Register each decoded keyword under its (doc_id, offset) key."""
        if vectors.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        for key, vector in zip(keys, vectors):
            self._automaton.add_word(self.decode_textbytes(vector), key)
            self.size += 1

        # demoted from error: routine trace, not a failure
        self.logger.debug(list(self._automaton.keys()))

    def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
        """Return, per query key, up to *top_k* (doc_id, offset, weight,
        relevance) tuples ranked by match count."""
        if keys.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')
        elif not self.size > 0:
            # was a bare print(); use the service logger instead
            self.logger.warning('Warning: empty index queried')
            # NOTE(review): returns one flat empty list, not one per key — confirm callers handle
            return []

        # rebuild on every query since add() may have extended the trie
        self._automaton.make_automaton()

        ret = []
        for key in keys:
            ret_i = defaultdict(int)
            for _, (doc_id, offset) in self._automaton.iter(self.decode_textbytes(key)):
                ret_i[(doc_id, offset)] += 1

            # _doc_id, _offset, _weight, _relevance
            results = [(*k, 1.0, v) for k, v in ret_i.items()]
            # topk by number of keyword matches
            ret.append(sorted(results, reverse=True, key=lambda x: x[-1])[:top_k])

        return ret

    @staticmethod
    def decode_textbytes(vector: np.ndarray):
        """Strip NUL padding and decode a uint8 vector back to text."""
        return vector.tobytes().rstrip(b'\x00').decode()
54 |
--------------------------------------------------------------------------------
/koursaros/hub/indexer/lvdb/Dockerfile:
--------------------------------------------------------------------------------
# LevelDB indexer service image.
FROM gnes/gnes:latest-buster

# Quote the requirement: unquoted `plyvel>=1.0.5` in shell-form RUN was parsed
# as a redirection to a file named `=1.0.5`, silently dropping the version pin.
RUN pip install "plyvel>=1.0.5" --no-cache-dir --compile

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "index"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/lvdb/base.yml:
--------------------------------------------------------------------------------
# LevelDB-backed doc indexer; database files live under /workspace (mount to persist).
!LVDBIndexer
parameters:
  data_path: /workspace
--------------------------------------------------------------------------------
/koursaros/hub/indexer/rocksdb/Dockerfile:
--------------------------------------------------------------------------------
# RocksDB indexer service image, built on a fork of gnes.
FROM gnes/gnes:latest-buster

# Combine `apt-get update` with every install in one layer: a cached
# standalone update layer can otherwise pair with a stale package list.
RUN apt-get update && apt-get install -y \
    python-dev librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libgflags-dev \
    git
RUN pip install python-rocksdb --no-cache-dir --compile
RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp
RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "index"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/rocksdb/base.yml:
--------------------------------------------------------------------------------
# RocksDB-backed doc indexer; database files live under /workspace (mount to persist).
!RocksDBIndexer
parameters:
  data_path: /workspace
--------------------------------------------------------------------------------
/koursaros/hub/indexer/simple_dict/Dockerfile:
--------------------------------------------------------------------------------
# In-memory dict indexer service image, built on a fork of gnes.
FROM gnes/gnes:latest-buster

# Merge update + install into one layer so a cached `apt-get update`
# can never pair with a stale package list.
RUN apt-get update && apt-get install -y git
RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp
RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "index", "--py_path", "simple_dict.py"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/simple_dict/base.yml:
--------------------------------------------------------------------------------
1 | !SimpleDictIndexer {}
--------------------------------------------------------------------------------
/koursaros/hub/indexer/simple_dict/simple_dict.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from google.protobuf.json_format import MessageToJson, Parse
4 |
5 | from gnes.indexer.base import BaseDocIndexer as BDI
6 | from gnes.proto import gnes_pb2
7 |
8 |
class SimpleDictIndexer(BDI):
    """In-memory doc indexer: stores each document as a JSON string keyed
    by its doc id. Non-persistent — contents live only in this process.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._content = {}  # doc_id -> JSON-serialized gnes_pb2.Document

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Store *docs* under the matching *keys*, overwriting duplicates."""
        # demoted from error: these are routine traces, not failures
        self.logger.debug(keys)
        self.logger.debug(docs)
        self._content.update({k: MessageToJson(d) for (k, d) in zip(keys, docs)})

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Return the documents for *keys*; raises KeyError for unknown ids."""
        self.logger.debug(keys)  # demoted from error: routine trace
        return [Parse(self._content[k], gnes_pb2.Document()) for k in keys]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/whoosh/Dockerfile:
--------------------------------------------------------------------------------
# Whoosh full-text indexer service image.
FROM gnes/gnes:latest-buster

RUN pip install whoosh

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "index", "--py_path", "whoosh.py"]
--------------------------------------------------------------------------------
/koursaros/hub/indexer/whoosh/base.yml:
--------------------------------------------------------------------------------
# Whoosh full-text chunk indexer; index directory is /workspace (mount to persist).
!WhooshIndexer
parameters:
  data_path: /workspace
--------------------------------------------------------------------------------
/koursaros/hub/indexer/whoosh/whoosh.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple
2 | import numpy as np
3 | import os, os.path
4 | from whoosh import index, scoring
5 | from whoosh.fields import Schema, TEXT, NUMERIC
6 | from whoosh.analysis import StemmingAnalyzer
7 | from whoosh.qparser import QueryParser
8 | from whoosh.writing import SegmentWriter
9 | from whoosh.codec import default_codec
10 | from whoosh.automata import lev
11 | from whoosh.searching import Searcher
12 | from whoosh import collectors
13 |
14 | import glob
15 |
16 | from gnes.indexer.base import BaseChunkIndexer as BCI
17 |
18 |
class WhooshIndexer(BCI):
    """Chunk indexer backed by a Whoosh full-text index with TF-IDF scoring.

    Vectors are expected to be NUL-padded UTF-8 byte vectors (as produced
    by ``TextByteEncoder``); each chunk body is indexed with a stemming
    analyzer under its (doc_id, offset) key.
    """

    def __init__(self, data_path, *args, **kwargs):
        """
        :param data_path: directory holding the Whoosh index; created if missing
        """
        super().__init__(*args, **kwargs)
        schema = Schema(doc_id=NUMERIC(stored=True),
                        offset=NUMERIC(stored=True),
                        body=TEXT(analyzer=StemmingAnalyzer()))
        if not os.path.exists(data_path):
            os.mkdir(data_path)
            # demoted from error: not fatal, but the index dies with the container
            self.logger.warning('Please mount volume for persisting index.')
        try:
            self.ix = index.open_dir(data_path)
        except index.EmptyIndexError:  # was a bare except:; only "no index yet" is expected here
            self.logger.warning('Creating empty whoosh index')
            self.ix = index.create_in(data_path, schema)

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs):
        """Index each decoded chunk body under its (doc_id, offset) key."""
        # demoted from error (and fixed 'Recieved' typo): routine trace
        self.logger.debug('Received add index request')
        self.logger.debug(keys)
        if vectors.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        writer = self.ix.writer()
        for key, vector in zip(keys, vectors):
            body = self.decode_textbytes(vector)
            writer.add_document(doc_id=key[0], offset=key[1], body=body)

        writer.commit()

    def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
        """Run each key as a TF-IDF query; return per-key lists of
        (doc_id, offset, weight, relevance) tuples."""
        if keys.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        ret = []
        qp = QueryParser("body", schema=self.ix.schema)
        with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            for key in keys:
                query = qp.parse(self.decode_textbytes(key))
                ret.append([
                    (result['doc_id'], result['offset'], 1.0, 1.0)
                    for result in searcher.search(query, limit=top_k)])
        self.logger.debug(ret)  # demoted from error: routine trace
        return ret

    @staticmethod
    def decode_textbytes(vector: np.ndarray):
        """Strip NUL padding and decode a uint8 vector back to text."""
        return vector.tobytes().rstrip(b'\x00').decode()
--------------------------------------------------------------------------------
/koursaros/hub/preprocessor/sentsplit/Dockerfile:
--------------------------------------------------------------------------------
# Sentence-split preprocessor service image: runs `gnes preprocess`.
FROM gnes/gnes:latest-alpine

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "preprocess"]
--------------------------------------------------------------------------------
/koursaros/hub/preprocessor/sentsplit/jsonmode.yml:
--------------------------------------------------------------------------------
# Sentence-splitting preprocessor; is_json presumably makes it parse the raw
# document bytes as JSON before splitting — verify against gnes SentSplitPreprocessor.
!SentSplitPreprocessor
parameters:
  is_json: True
--------------------------------------------------------------------------------
/koursaros/hub/preprocessor/unary/Dockerfile:
--------------------------------------------------------------------------------
# Unary (pass-through) preprocessor service image: runs `gnes preprocess`.
FROM gnes/gnes:latest-alpine

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "preprocess"]
--------------------------------------------------------------------------------
/koursaros/hub/preprocessor/unary/text.yml:
--------------------------------------------------------------------------------
# Unary preprocessor treating each raw document as a single chunk.
# doc_type is a gnes_pb2.Document.DocType code; 1 presumably means TEXT — confirm.
!UnaryPreprocessor
parameters:
  doc_type: 1
--------------------------------------------------------------------------------
/koursaros/hub/router/block/Dockerfile:
--------------------------------------------------------------------------------
# Block router service image: runs `gnes route` with the local block.py loaded.
FROM gnes/gnes:latest-alpine

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "route", "--py_path", "block.py"]
--------------------------------------------------------------------------------
/koursaros/hub/router/block/block.py:
--------------------------------------------------------------------------------
1 | from gnes.router.base import BaseRouter
2 | from gnes.proto import gnes_pb2
3 | from gnes.service.base import BlockMessage
4 | from typing import List
5 |
6 |
class BlockRouter(BaseRouter):
    """Router that drops messages whose runtime is in a configured block list.

    :param block: runtime names (e.g. 'train', 'search') to block; defaults to none
    """

    def __init__(self, block: List[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # `block=[]` as a default was a shared mutable default argument;
        # use None as the sentinel and build a fresh list per instance.
        self.block = block if block is not None else []

    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """
        Raise BlockMessage to stop propagation when the message's runtime
        is in the block list; otherwise let it pass through.
        :param msg: incoming message
        """

        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')
        self.logger.debug(runtime)  # demoted from error: routine trace

        if runtime in self.block:
            self.logger.info('Blocking %s msg...' % runtime)
            raise BlockMessage
26 |
--------------------------------------------------------------------------------
/koursaros/hub/router/block/block_query.yml:
--------------------------------------------------------------------------------
# BlockRouter with an empty block list: pure pass-through (query flows).
!BlockRouter
parameters:
  block: []
--------------------------------------------------------------------------------
/koursaros/hub/router/block/block_train.yml:
--------------------------------------------------------------------------------
# BlockRouter that drops messages from the 'train' runtime.
!BlockRouter
parameters:
  block: [train]
--------------------------------------------------------------------------------
/koursaros/hub/router/log/Dockerfile:
--------------------------------------------------------------------------------
# Logging router service image: runs `gnes route` verbosely with log.py loaded.
FROM gnes/gnes:latest-alpine

ADD *.py *.yml ./

ENTRYPOINT ["gnes", "--verbose", "route", "--py_path", "log.py"]
--------------------------------------------------------------------------------
/koursaros/hub/router/log/log.py:
--------------------------------------------------------------------------------
1 |
2 | from gnes.router.base import BaseRouter
3 |
4 |
class LogRouter(BaseRouter):
    """Pass-through router that logs every message it receives.

    Useful for debugging a flow: place it between services to inspect the
    'gnes_pb2.Message' traffic without altering it. (The previous docstring
    was copied verbatim from the base router class and described the wrong thing.)
    """

    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """
        Log the incoming message at INFO level; the message is forwarded unchanged.
        :param msg: incoming message
        """
        self.logger.info(msg)
17 |
--------------------------------------------------------------------------------
/koursaros/hub/router/rerank/Dockerfile:
--------------------------------------------------------------------------------
# Rerank router service image (transformers-based cross-encoder reranker).
FROM pytorch/pytorch:1.2-cuda10.0-cudnn7-runtime

RUN pip install -U transformers gnes --no-cache-dir --compile

WORKDIR /
ADD *.py *.yml ./
# (removed `RUN nvidia-smi`: `docker build` has no GPU runtime by default, so
# this probe fails the build on any GPU-less host; check GPUs at run time instead)

ENTRYPOINT ["gnes", "route", "--py_path", "rerank.py"]
--------------------------------------------------------------------------------
/koursaros/hub/router/rerank/base.yml:
--------------------------------------------------------------------------------
# RerankRouter config: HuggingFace model name and the cache/data directory
# (mount /workspace to persist downloaded weights).
!RerankRouter
parameters:
  model_name: bert-base-uncased
  data_dir: /workspace

--------------------------------------------------------------------------------
/koursaros/hub/router/rerank/rerank.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from gnes.router.base import BaseRouter
4 | from gnes.proto import gnes_pb2
5 | from gnes.helper import batching
6 | from gnes.service.base import BlockMessage
7 |
8 |
9 | from transformers import *
10 | import torch
11 | import torch.nn
12 | import numpy as np
13 |
14 |
class RerankRouter(BaseRouter):
    """Rescores search results with a transformer sequence-classification model.

    In 'train' mode the model is fine-tuned on (query, candidate, label)
    triples carried in the request docs; in 'search' mode the router joins
    the blocked request (the query text) with its response (the candidate
    results), scores every (query, candidate) pair, and rewrites
    ``topk_results`` in descending score order.
    """

    def __init__(self, model_name: str = None, data_dir: str = None, *args, **kwargs):
        """
        :param model_name: transformers checkpoint name, e.g. 'bert-base-uncased'
        :param data_dir: cache directory for model weights and tokenizer files
        """
        super().__init__(*args, **kwargs)
        self.model_name = model_name
        self.data_dir = data_dir
        self.max_grad_norm = 1.0  # gradient-clipping threshold used in training
        self.lr = 1e-3
        # request_id -> query text (request seen first) or scored results
        # (response seen first); pairs the two halves of one search.
        # NOTE(review): entries are never evicted once matched.
        self.query_dict = dict()

    def post_init(self):
        """Load model/tokenizer and create the optimizer in the worker process."""
        model_config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.data_dir)
        model_config.num_labels = 1  # single output head -> regression-style scoring
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # BUG FIX: a torch.device never compares equal to the plain string
        # "cpu", so the original warning could not fire; compare device.type.
        if self.device.type == "cpu": self.logger.error("RUNNING ON CPU")
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
                                                                               config=model_config,
                                                                               cache_dir=self.data_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.data_dir)
        self.rerank_model.to(self.device)

        self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
        self.scheduler = ConstantLRSchedule(self.optimizer)

    def get_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult') -> str:
        return x.doc.doc_id

    def set_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult', k: str) -> None:
        x.doc.doc_id = k

    # @batching
    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """Train on or rerank ``msg``, blocking (via BlockMessage) until both
        halves of a search have been seen.

        :param msg: incoming train request, search request, or search response
        """
        all_scored_results = [sr for sr in msg.response.search.topk_results]
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')

        if runtime == 'train':  # training samples are given
            inputs = []
            labels = []
            for doc in msg.request.train.docs:
                ex = json.loads(doc.raw_bytes)
                inputs.append(
                    self.tokenizer.encode_plus(ex['Query'], ex['Candidate'], add_special_tokens=True))
                labels.append(float(ex['Label']))

            labels = torch.tensor(labels, dtype=torch.float).to(self.device)

        elif runtime == 'search':
            if msg.WhichOneof('body') == 'request':
                self.logger.error('got request')
                if not msg.request.request_id in self.query_dict:
                    # Request arrived first: stash the query, wait for the response.
                    self.query_dict[msg.request.request_id] = msg.request.search.query.raw_bytes.decode()
                    raise BlockMessage
                else:
                    # Response arrived first: its scored results were stashed below.
                    query = msg.request.search.query.raw_bytes.decode()
                    all_scored_results = self.query_dict[msg.request.request_id]
            else:
                self.logger.error('got response')
                if not msg.response.request_id in self.query_dict:
                    # BUG FIX: this is a *response* message, so key the stash
                    # on msg.response.request_id; msg.request is the unset
                    # half of the oneof and only yields the default id, so
                    # the request half could never find the stored results.
                    self.query_dict[msg.response.request_id] = all_scored_results
                    raise BlockMessage
                else:
                    query = self.query_dict[msg.response.request_id]
            inputs = [
                self.tokenizer.encode_plus(
                    query,
                    sr.doc.chunks[0].text,
                    add_special_tokens=True,
                ) for sr in all_scored_results]
            self.logger.error([sr.doc.chunks[0].text for sr in all_scored_results])
            labels = None

        else:
            raise BlockMessage

        if len(inputs) == 0:
            print("Warning: empty input set, ignoring.")
            return

        # Right-pad every example to the longest sequence in the batch.
        max_len = max(len(t['input_ids']) for t in inputs)
        input_ids = [t['input_ids'] + [0] * (max_len - len(t['input_ids'])) for t in inputs]
        token_type_ids = [t['token_type_ids'] + [0] * (max_len - len(t['token_type_ids'])) for t in inputs]
        attention_mask = [[1] * len(t['input_ids']) + [0] * (max_len - len(t['input_ids'])) for t in inputs]

        input_ids = torch.tensor(input_ids).to(self.device)
        token_type_ids = torch.tensor(token_type_ids).to(self.device)
        attention_mask = torch.tensor(attention_mask).to(self.device)

        if labels is not None:
            # Training: a single optimization step on this batch.
            loss = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                     labels=labels, attention_mask=attention_mask)[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.rerank_model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.rerank_model.zero_grad()
            msg.response.train.status = gnes_pb2.Response.Status.SUCCESS

        else:
            # Inference: score each (query, candidate) pair, keep the best k.
            with torch.no_grad():
                logits = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                           attention_mask=attention_mask)[0]
            scores = np.squeeze(logits.detach().cpu().numpy())
            if len(logits) == 1:
                scores = [scores]  # squeeze collapses a single row to a 0-d scalar
            ranked_results = []
            for sr, score in zip(all_scored_results, scores):
                ranked_results.append((sr.doc, score))

            k = msg.response.search.top_k
            top_k = sorted(ranked_results, key=lambda x: x[1], reverse=True)[:k]

            msg.response.search.ClearField('topk_results')
            for doc, score in top_k:
                sr = msg.response.search.topk_results.add()
                sr.score.value = float(score)
                sr.doc.CopyFrom(doc)
--------------------------------------------------------------------------------
/koursaros/hub/router/resp_req/Dockerfile:
--------------------------------------------------------------------------------
# Response/request router image: lightweight alpine gnes base.
FROM gnes/gnes:latest-alpine

# Copy the router implementation and any yaml configs into the image.
ADD *.py *.yml ./

# Run gnes in routing mode using the router defined in resp_req.py.
ENTRYPOINT ["gnes", "route", "--py_path", "resp_req.py"]
--------------------------------------------------------------------------------
/koursaros/hub/router/resp_req/base.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/router/resp_req/base.yml
--------------------------------------------------------------------------------
/koursaros/hub/router/resp_req/resp_req.py:
--------------------------------------------------------------------------------
1 | from gnes.router.base import BaseRouter
2 | from gnes.proto import gnes_pb2
3 |
class RespReqRouter(BaseRouter):
    """Router that inspects the body type of each incoming message.

    NOTE(review): this router looks unfinished — the 'index' branch builds
    a fresh message but never populates or forwards it; confirm intent.
    """

    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """
        Log the incoming message
        :param msg: incoming message
        """

        # body is a protobuf oneof: resolve it twice to get the concrete
        # runtime kind ('index', 'search', 'train', ...) of the payload.
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')
        print('recieved msg')
        print(msg)
        print(runtime)
        if runtime == 'index':
            req = gnes_pb2.Message()
--------------------------------------------------------------------------------
/koursaros/hub/tests/sonnets_small.txt:
--------------------------------------------------------------------------------
1 | From fairest creatures we desire increase,
2 | That thereby beauty's rose might never die,
3 | But as the riper should by time decease,
4 | His tender heir might bear his memory:
5 | But thou contracted to thine own bright eyes,
6 | Feed'st thy light's flame with self-substantial fuel,
7 | Making a famine where abundance lies,
8 | Thy self thy foe, to thy sweet self too cruel:
9 | Thou that art now the world's fresh ornament,
10 | And only herald to the gaudy spring,
11 | Within thine own bud buriest thy content,
12 | And, tender churl, mak'st waste in niggarding:
13 | Pity the world, or else this glutton be,
14 | To eat the world's due, by the grave and thee.
15 |
16 | When forty winters shall besiege thy brow,
17 | And dig deep trenches in thy beauty's field,
18 | Thy youth's proud livery so gazed on now,
19 | Will be a totter'd weed of small worth held:
20 | Then being asked, where all thy beauty lies,
21 | Where all the treasure of thy lusty days;
22 | To say, within thine own deep sunken eyes,
23 | Were an all-eating shame, and thriftless praise.
24 | How much more praise deserv'd thy beauty's use,
25 | If thou couldst answer 'This fair child of mine
26 | Shall sum my count, and make my old excuse,'
27 | Proving his beauty by succession thine!
28 | This were to be new made when thou art old,
29 | And see thy blood warm when thou feel'st it cold.
30 |
31 | Look in thy glass and tell the face thou viewest
32 | Now is the time that face should form another;
33 | Whose fresh repair if now thou not renewest,
34 | Thou dost beguile the world, unbless some mother.
35 | For where is she so fair whose uneared womb
36 | Disdains the tillage of thy husbandry?
37 | Or who is he so fond will be the tomb
38 | Of his self-love, to stop posterity?
39 | Thou art thy mother's glass and she in thee
40 | Calls back the lovely April of her prime;
41 | So thou through windows of thine age shalt see,
42 | Despite of wrinkles, this thy golden time.
43 | But if thou live, remembered not to be,
44 | Die single and thine image dies with thee.
45 |
46 | Unthrifty loveliness, why dost thou spend
47 | Upon thy self thy beauty's legacy?
48 | Nature's bequest gives nothing, but doth lend,
49 | And being frank she lends to those are free:
50 | Then, beauteous niggard, why dost thou abuse
51 | The bounteous largess given thee to give?
52 | Profitless usurer, why dost thou use
53 | So great a sum of sums, yet canst not live?
54 | For having traffic with thy self alone,
55 | Thou of thy self thy sweet self dost deceive:
56 | Then how when nature calls thee to be gone,
57 | What acceptable audit canst thou leave?
58 | Thy unused beauty must be tombed with thee,
59 | Which, used, lives th' executor to be.
60 |
61 | Those hours, that with gentle work did frame
62 | The lovely gaze where every eye doth dwell,
63 | Will play the tyrants to the very same
64 | And that unfair which fairly doth excel;
65 | For never-resting time leads summer on
66 | To hideous winter, and confounds him there;
67 | Sap checked with frost, and lusty leaves quite gone,
68 | Beauty o'er-snowed and bareness every where:
69 | Then were not summer's distillation left,
70 | A liquid prisoner pent in walls of glass,
71 | Beauty's effect with beauty were bereft,
72 | Nor it, nor no remembrance what it was:
73 | But flowers distilled, though they with winter meet,
74 | Leese but their show; their substance still lives sweet.
75 |
76 | Then let not winter's ragged hand deface,
77 | In thee thy summer, ere thou be distilled:
78 | Make sweet some vial; treasure thou some place
79 | With beauty's treasure ere it be self-killed.
80 | That use is not forbidden usury,
81 | Which happies those that pay the willing loan;
82 | That's for thy self to breed another thee,
83 | Or ten times happier, be it ten for one;
84 | Ten times thy self were happier than thou art,
85 | If ten of thine ten times refigured thee:
86 | Then what could death do if thou shouldst depart,
87 | Leaving thee living in posterity?
88 | Be not self-willed, for thou art much too fair
89 | To be death's conquest and make worms thine heir.
90 |
91 | Lo! in the orient when the gracious light
92 | Lifts up his burning head, each under eye
93 | Doth homage to his new-appearing sight,
94 | Serving with looks his sacred majesty;
95 | And having climbed the steep-up heavenly hill,
96 | Resembling strong youth in his middle age,
97 | Yet mortal looks adore his beauty still,
98 | Attending on his golden pilgrimage:
99 | But when from highmost pitch, with weary car,
100 | Like feeble age, he reeleth from the day,
101 | The eyes, 'fore duteous, now converted are
102 | From his low tract, and look another way:
103 | So thou, thyself outgoing in thy noon
104 | Unlooked on diest unless thou get a son.
--------------------------------------------------------------------------------
/koursaros/hub/tests/test_block.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 |
4 | from gnes.cli.parser import set_router_parser, _set_client_parser
5 | from gnes.service.router import RouterService
6 | from gnes.service.base import SocketType
7 | from gnes.client.base import ZmqClient
8 | from gnes.proto import gnes_pb2
9 |
10 |
class TestBlock(unittest.TestCase):
    """Drive the block router: send a 'train' message followed by an
    'index' message and wait for whatever the router lets through."""

    def setUp(self):
        base = os.path.dirname(__file__)
        self.rerank_router_yaml = os.path.join(base, '../', 'router/block/block_train.yml')
        self.python_code = os.path.join(base, '../', 'router/block/block.py')

        self.args = set_router_parser().parse_args([
            '--yaml_path', self.rerank_router_yaml,
            '--socket_out', str(SocketType.PUB_BIND),
            '--py_path', self.python_code,
        ])
        # The client's in/out ports mirror the router's out/in ports.
        self.c_args = _set_client_parser().parse_args([
            '--port_in', str(self.args.port_out),
            '--port_out', str(self.args.port_in),
            '--socket_in', str(SocketType.SUB_CONNECT),
        ])

    def test_block_router(self):
        with RouterService(self.args), ZmqClient(self.c_args) as client:
            train_msg = gnes_pb2.Message()
            train_msg.request.train.docs.add()
            client.send_message(train_msg)

            index_msg = gnes_pb2.Message()
            index_msg.request.index.docs.add()
            client.send_message(index_msg)

            client.recv_message()
--------------------------------------------------------------------------------
/koursaros/hub/tests/test_keyword.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from gnes.proto import gnes_pb2
5 | from gnes.client.base import ZmqClient
6 | from gnes.service.base import SocketType
7 | from gnes.cli.parser import set_router_parser, _set_client_parser
8 | from gnes.service.indexer import IndexerService
9 | import numpy as np
10 |
11 |
class TestKeyword(unittest.TestCase):
    """Index sonnet lines with the keyword indexer service, then verify each
    line can be retrieved by querying with its own text."""

    def setUp(self):
        dirname = os.path.dirname(__file__)
        self.yaml = os.path.join(dirname, 'yaml', 'test-keyword.yml')
        self.python_code = os.path.join(dirname, '../', 'indexer/keyword/keyword.py')

        self.test_str = []
        self.test_vec = []
        self._msl = 512  # fixed byte width of the fake text "embeddings"
        with open(os.path.join(dirname, 'sonnets_small.txt')) as f:
            for line in f:
                line = line.strip()
                if line == '': continue
                # NOTE(review): padding uses the *character* count of `line`;
                # identical to the byte count only for pure-ASCII input.
                self.test_vec.append(np.frombuffer(
                    line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)),
                    dtype=np.uint8
                ))
                self.test_str.append(line)

    def test_keyword(self):
        args = set_router_parser().parse_args([
            '--yaml_path', self.yaml,
            '--socket_out', str(SocketType.PUB_BIND),
            '--py_path', self.python_code,
        ])
        args.as_response = True
        c_args = _set_client_parser().parse_args([
            '--port_in', str(args.port_out),
            '--port_out', str(args.port_in),
            '--socket_in', str(SocketType.SUB_CONNECT)
        ])
        with IndexerService(args), ZmqClient(c_args) as c1:
            # Index every line as its own doc with one chunk.
            msg = gnes_pb2.Message()
            for i, vec in enumerate(self.test_vec):
                doc = msg.request.index.docs.add()
                doc.doc_id = i
                doc.raw_text = self.test_str[i]
                c = doc.chunks.add()
                c.doc_id = i
                c.offset = 0
                c.embedding.data = vec.tobytes()
                for d in vec.shape:
                    c.embedding.shape.extend([d])
                c.embedding.dtype = str(vec.dtype)
                c.text = self.test_str[i]
            c1.send_message(msg)

            r = c1.recv_message()
            # BUG FIX: TestCase.assert_ is a deprecated alias (removed in
            # Python 3.12); use the supported assertTrue/assertEqual.
            self.assertTrue(r.response.index)

            # Query each line's text back and expect its own doc id on top.
            for i, vec in enumerate(self.test_vec):
                msg = gnes_pb2.Message()
                msg.request.search.query.doc_id = 1
                msg.request.search.top_k = 1
                c = msg.request.search.query.chunks.add()
                c.doc_id = 1
                c.embedding.data = vec.tobytes()
                for d in vec.shape:
                    c.embedding.shape.extend([d])
                c.embedding.dtype = str(vec.dtype)
                c.offset = 0
                c.weight = 1
                c.text = self.test_str[i]
                c1.send_message(msg)
                r = c1.recv_message()
                self.assertEqual(r.response.search.topk_results[0].chunk.doc_id, i)

    def tearDown(self):
        pass
--------------------------------------------------------------------------------
/koursaros/hub/tests/test_reranker.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | import json
4 |
5 | from gnes.proto import gnes_pb2
6 | from gnes.client.base import ZmqClient
7 | from gnes.service.base import SocketType
8 | from gnes.cli.parser import set_router_parser, _set_client_parser
9 | from gnes.service.router import RouterService
10 |
11 |
class TestReranker(unittest.TestCase):
    """Drives the rerank router with fake search responses and training docs."""

    def setUp(self):
        here = os.path.dirname(__file__)
        self.rerank_router_yaml = os.path.join(here, 'yaml', 'test-reranker.yml')
        self.python_code = os.path.join(here, '../', 'router/rerank/rerank.py')

        with open(os.path.join(here, 'sonnets_small.txt')) as f:
            self.test_str = [stripped for stripped in (l.strip() for l in f) if stripped]

        self.args = set_router_parser().parse_args([
            '--yaml_path', self.rerank_router_yaml,
            '--socket_out', str(SocketType.PUB_BIND),
            '--py_path', self.python_code
        ])
        # Client ports mirror the router's so they talk to each other.
        self.c_args = _set_client_parser().parse_args([
            '--port_in', str(self.args.port_out),
            '--port_out', str(self.args.port_in),
            '--socket_in', str(SocketType.SUB_CONNECT)
        ])

    def test_rerank_train(self):
        with RouterService(self.args), ZmqClient(self.c_args) as c1:
            # First: a small fake search response to be reranked.
            msg = gnes_pb2.Message()
            msg.response.search.ClearField('topk_results')
            msg.request.search.query.raw_text = 'This is a query'

            for doc_id, text in enumerate(self.test_str[:5]):
                result = msg.response.search.topk_results.add()
                result.score.value = 0.1
                result.doc.doc_id = doc_id
                result.doc.raw_text = text

            msg.envelope.num_part.extend([1])
            msg.response.search.top_k = 5
            c1.send_message(msg)

            print(c1.recv_message())

            # Then: labelled training docs for one fine-tuning pass.
            msg = gnes_pb2.Message()

            for doc_id, text in enumerate(self.test_str):
                doc = msg.request.train.docs.add()
                doc.doc_id = doc_id
                doc.raw_bytes = json.dumps({
                    'Query' : 'test query',
                    'Candidate' : text,
                    'Label' : 1.0 if doc_id % 2 == 0 else 0.0
                }).encode('utf-8')

            msg.envelope.num_part.extend([1])
            c1.send_message(msg)
            print(c1.recv_message())

    @unittest.skip("SKIPPING QUERY TEST")
    def test_rerank(self):
        with RouterService(self.args), ZmqClient(self.c_args) as c1:
            # A full result set should come back truncated to top_k.
            msg = gnes_pb2.Message()
            msg.response.search.ClearField('topk_results')
            msg.request.search.query.raw_text = 'This is a query'

            for doc_id, text in enumerate(self.test_str):
                result = msg.response.search.topk_results.add()
                result.score.value = 0.1
                result.doc.doc_id = doc_id
                result.doc.raw_text = text

            msg.envelope.num_part.extend([1])
            msg.response.search.top_k = 5
            c1.send_message(msg)

            r = c1.recv_message()
            self.assertSequenceEqual(r.envelope.num_part, [1])
            self.assertEqual(len(r.response.search.topk_results), 5)

            # A single-result set should come back with exactly one result.
            msg = gnes_pb2.Message()
            msg.response.search.ClearField('topk_results')

            for doc_id, text in enumerate(self.test_str[:1]):
                result = msg.response.search.topk_results.add()
                result.score.value = 0.1
                result.doc.doc_id = doc_id
                result.doc.raw_text = text

            msg.envelope.num_part.extend([1])
            msg.response.search.top_k = 5
            c1.send_message(msg)

            r = c1.recv_message()
            self.assertSequenceEqual(r.envelope.num_part, [1])
            self.assertEqual(len(r.response.search.topk_results), 1)

    def tearDown(self):
        pass
--------------------------------------------------------------------------------
/koursaros/hub/tests/test_textbyte_encoder.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from koursaros.hub.encoder.textbyte.textbyte import TextByteEncoder
3 | import pathlib
4 | import csv
5 |
6 | import numpy as np
7 |
8 | class TestTextByte(unittest.TestCase):
9 |
10 | def setUp(self) -> None:
11 | self.msl = 1024
12 | self.model = TextByteEncoder(self.msl)
13 | self.path = pathlib.Path('reviews_sample.csv')
14 | self.csv = csv.DictReader(self.path.open())
15 |
16 | def test_textbyte(self):
17 | to_encode = []
18 | for row in self.csv:
19 | to_encode.append(list(row.values())[1])
20 | vectors = self.model.encode(to_encode)
21 | for vec in vectors:
22 | self.assertEqual(len(vec), self.msl)
23 | for vector in vectors:
24 | self.decode_textbytes(vector)
25 | self.decode_textbytes(vectors)
26 |
27 | @staticmethod
28 | def decode_textbytes(vector: np.ndarray):
29 | return vector.tobytes().rstrip(b'\x00').decode()
--------------------------------------------------------------------------------
/koursaros/hub/tests/test_whoosh.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from unittest import SkipTest
4 |
5 | from gnes.proto import gnes_pb2
6 | from gnes.client.base import ZmqClient
7 | from gnes.service.base import SocketType
8 | from gnes.cli.parser import set_router_parser, _set_client_parser
9 | from gnes.service.indexer import IndexerService
10 | import numpy as np
11 |
12 |
class TestWhoosh(unittest.TestCase):
    """Integration tests for the whoosh indexer service, plus the joint
    rocksdb+whoosh indexer (currently skipped)."""

    def setUp(self):
        dirname = os.path.dirname(__file__)
        self.yaml = os.path.join(dirname, 'yaml', 'test-whoosh.yml')
        self.yaml_joint = os.path.join(dirname, 'yaml', 'test-joint.yml')
        self.python_code = os.path.join(dirname, '../', 'indexer/whoosh/whoosh.py')

        self.test_str = []
        self.test_vec = []
        self._msl = 512  # fixed byte width of the fake text "embeddings"
        with open(os.path.join(dirname, 'sonnets_small.txt')) as f:
            for line in f:
                line = line.strip()
                if line == '': continue
                # NOTE(review): padding uses the *character* count of `line`;
                # identical to the byte count only for pure-ASCII input.
                self.test_vec.append(np.frombuffer(
                    line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)),
                    dtype=np.uint8
                ))
                self.test_str.append(line)

    def _make_args(self, yaml_path):
        """Build matching (router args, client args) for the given indexer yaml."""
        args = set_router_parser().parse_args([
            '--yaml_path', yaml_path,
            '--socket_out', str(SocketType.PUB_BIND),
            '--py_path', self.python_code,
        ])
        args.as_response = True
        c_args = _set_client_parser().parse_args([
            '--port_in', str(args.port_out),
            '--port_out', str(args.port_in),
            '--socket_in', str(SocketType.SUB_CONNECT)
        ])
        return args, c_args

    def _index_and_search(self, args, c_args):
        """Index every sonnet line, then query each line's text back."""
        with IndexerService(args), ZmqClient(c_args) as c1:
            msg = gnes_pb2.Message()
            for i, vec in enumerate(self.test_vec):
                doc = msg.request.index.docs.add()
                doc.doc_id = i
                doc.raw_text = self.test_str[i]
                c = doc.chunks.add()
                c.doc_id = i
                c.offset = 0
                c.embedding.data = vec.tobytes()
                for d in vec.shape:
                    c.embedding.shape.extend([d])
                c.embedding.dtype = str(vec.dtype)
                c.text = self.test_str[i]
            c1.send_message(msg)

            r = c1.recv_message()
            # BUG FIX: TestCase.assert_ is a deprecated alias (removed in
            # Python 3.12); use the supported assertTrue.
            self.assertTrue(r.response.index)

            for i, vec in enumerate(self.test_vec):
                msg = gnes_pb2.Message()
                msg.request.search.query.doc_id = 1
                msg.request.search.top_k = 1
                c = msg.request.search.query.chunks.add()
                c.doc_id = 1
                c.embedding.data = vec.tobytes()
                for d in vec.shape:
                    c.embedding.shape.extend([d])
                c.embedding.dtype = str(vec.dtype)
                c.offset = 0
                c.weight = 1
                c.text = self.test_str[i]
                c1.send_message(msg)
                r = c1.recv_message()
                # Best-effort check preserved from the original tests; a miss
                # or an empty result is tolerated rather than failed. The
                # bare `except:` is narrowed so real errors still surface.
                # (Also removed a committed `pdb.set_trace()` breakpoint that
                # made this test hang in any non-interactive run.)
                try:
                    self.assertTrue(r.response.search.topk_results[0].chunk.doc_id == i)
                except (AssertionError, IndexError):
                    pass

    def test_whoosh(self):
        args, c_args = self._make_args(self.yaml)
        self._index_and_search(args, c_args)

    @SkipTest
    def test_joint(self):
        args, c_args = self._make_args(self.yaml_joint)
        self._index_and_search(args, c_args)

    def tearDown(self):
        pass
--------------------------------------------------------------------------------
/koursaros/hub/tests/yaml/test-joint.yml:
--------------------------------------------------------------------------------
1 | !JointIndexer
2 | components:
3 | - !RocksDBIndexer
4 | parameters:
5 | data_path: ./idx.doc_content
6 | - !WhooshIndexer
7 | parameters:
8 | data_path: ./idx.whoosh
--------------------------------------------------------------------------------
/koursaros/hub/tests/yaml/test-keyword.yml:
--------------------------------------------------------------------------------
1 | !KeywordIndexer {}
--------------------------------------------------------------------------------
/koursaros/hub/tests/yaml/test-reranker.yml:
--------------------------------------------------------------------------------
1 | !RerankRouter
2 | parameters:
3 | model_name: bert-base-uncased
--------------------------------------------------------------------------------
/koursaros/hub/tests/yaml/test-whoosh.yml:
--------------------------------------------------------------------------------
1 | !WhooshIndexer
2 | parameters:
3 | data_path: ./idx.doc_content
--------------------------------------------------------------------------------
/koursaros/repo_creds/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Description
3 |
4 | This module allows you to pull secure credentials into your python
5 | script. It assumes that you create a private git repository with
6 | your credentials in them prior to using get_creds().
7 |
8 | ## At a glance
9 |
10 | You can create a repository that looks like this:
11 |
12 | ```
13 | creds
14 | ├── creds.yaml
15 | ├── google
16 | │ └── bluehat.json
17 | └── postgres
18 | └── postgres.pem
19 | ```
20 |
21 | And a `creds.yaml` that looks like this:
22 | ```yaml
23 | creds:
24 | postgres:
25 | host: !!str 12.345.678.910
26 | username: !!str postgres
27 | password: !!str my_password
28 | replicas: !!int 5
29 | dbname: !!str fever
30 | sslmode: !!str verify-ca
31 | sslrootcert: !file postgres/postgres.pem
32 | google:
33 | app_creds: !file google/bluehat.json
34 | ```
35 |
36 | Let's say the repo you make is `madhatter/creds`, your username is `alice` and your password is `cheshire`.
37 | You can get your credentials in a python script by doing the following:
38 | ```python
39 | from koursaros.credentials import get_creds
40 | from sys import argv
41 |
42 | # retrieve repo creds by adding login to script
43 | creds = get_creds('alice:cheshire@madhatter/creds')
44 | # or with cmd line args
45 | creds = get_creds(argv[1])
46 | # NOTE: you don't need to log in if your git credentials are stored locally
47 |
48 |
49 | # the !! denotes native python types. You can access them like:
50 | creds.postgres.password # my_password
51 | creds.postgres.replicas # 5
52 |
53 | # the special !file tag means that it is a file. You can access
54 | # three attributes from file objects (path, bytes, text):
55 | creds.google.app_creds.path # '/absolute/path/to/google/app_creds/bluehat.json'
56 | creds.google.app_creds.bytes # b'{"client_id": "293480342342034"}'
57 | creds.google.app_creds.text # '{"client_id": "293480342342034"}'
58 | ```
59 |
60 | ## How it works
61 | The `get_creds()` function clones the specified repo and caches it to the koursaros.credentials
62 | directory. If the creds repo already exists, the repo is git pulled.
--------------------------------------------------------------------------------
/koursaros/repo_creds/__init__.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from pathlib import Path
3 | from box import Box
4 | import git
5 |
6 |
7 | DIR = Path(__file__).parent.absolute()
8 |
9 |
class FileCred(yaml.YAMLObject):
    """YAML tag handler for ``!file`` entries in creds.yaml.

    Resolves the tagged relative path against the cloned creds repository
    and exposes the file's contents as ``.bytes``, ``.text`` and its
    absolute ``.path``.
    """

    yaml_loader = yaml.SafeLoader
    yaml_tag = '!file'

    def __init__(self, relative_path):
        resolved = self.repo_path.joinpath(relative_path)
        self.bytes = resolved.read_bytes()
        self.text = resolved.read_text()
        self.path = str(resolved)

    @classmethod
    def from_yaml(cls, loader, node):
        # The scalar node's value is the relative path after the !file tag.
        return cls(node.value)

    @classmethod
    def set_repo_path(cls, repo_path):
        # Stored on the class so instances created during yaml parsing can
        # resolve their relative paths against the cloned repo.
        cls.repo_path = repo_path
28 |
def get_creds(git_dsn):
    """Clone (or refresh) the private creds repo and return its creds.yaml
    contents as an attribute-accessible Box.

    :param git_dsn: ``<user>:<password>@<owner>/<repo>`` github locator
    :return: Box over the ``creds`` mapping of creds.yaml
    """
    login, repo = git_dsn.split('@')
    login += '@'
    repo_path = DIR.joinpath(repo)
    repo_path.parent.mkdir(exist_ok=True)
    FileCred.set_repo_path(repo_path)

    if repo_path.exists():
        # Already cached locally: just pull the latest credentials.
        git.Git(repo_path).pull()
    else:
        git.Git(repo_path.parent).clone("https://%sgithub.com/%s" % (login, repo))

    parsed = yaml.safe_load(repo_path.joinpath('creds.yaml').read_text())
    return Box(parsed['creds'])
45 |
46 |
47 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | distro==1.4.0
2 | python-box
3 | tqdm
4 | torch
5 | transformers
6 | termcolor
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from pathlib import Path

# Distribution metadata for the koursaros package; the `kctl` console
# script is exposed as the CLI entry point.
setup(
    name='koursaros',
    packages=find_packages(),
    include_package_data=True,  # ship non-.py files declared in MANIFEST.in
    version='0.0.1',
    license='MIT',
    description='Koursaros is a distributed, cloud-'
                'native platform for developing and deploying '
                'automated information retrieval and inference applications.',
    long_description=Path('README.md').read_text(),
    author='Koursaros',
    author_email='cole.thienes@gmail.com',
    url='https://github.com/koursaros-ai/koursaros',
    # download_url='https://github.com/koursaros-ai/koursaros/archive/0.0.1.tar.gz',
    keywords=['koursaros', 'distributed', 'cloud-native', 'neural', 'inference'],
    install_requires=[
        'PyYAML', 'gitpython', 'python-box', 'gnes', 'tqdm', 'tabulate', 'click'],
    entry_points={'console_scripts': ['kctl=koursaros.cli.__main__:main']},
    classifiers=[
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
)
--------------------------------------------------------------------------------
/tutorials/deploy_custom_model.md:
--------------------------------------------------------------------------------
1 | # Training + Deploying a Custom Transformer Model in 5 Minutes
2 |
3 | ## Training Sentence Classification or Regression
4 |
5 | Make sure you've installed the koursaros training package.
6 |
7 | Create a .yaml file for your model in the /services directory. Your project should look like:
8 |
9 | ```
10 | |-bases/
11 | |-pipelines/
12 | |---my_pipeline.yaml
13 | |-services/
14 | |---[name].yaml
15 | ```
16 |
17 | For loading mnli from a postgres table, the yaml file should look like this:
18 | ```yaml
19 | service:
20 | base:
21 | repo: gs://
22 | task:
23 | labels: # if classification, else nothing
24 | - neutral
25 | - contradiction
26 | - entailment
27 | training:
28 | checkpoint: bert-base-uncased # see transformers for options, or use custom filename
29 | epochs: 3
30 | learning_rate: 1e-05
31 | ```
32 |
33 | ### Loading data from postgresql
34 |
35 | For loading training data from postgres (recommended), add this to the service yaml. Adjust the schema and tables to point to your train / test data.
36 | ```yaml
37 | data:
38 | source: postgres
39 | schema: mnli
40 | train: train_set
41 | test: test_set
42 | ```
43 |
44 | And adjust your environment variables accordingly:
45 | ```bash
46 | export PGHOST=
47 | export PGUSER=
48 | export PGPASS=
49 | export PGDBNAME=
50 | # for ssl
51 | export PGSSLMODE=verify-ca
52 | export PGSSLROOTCERT=
53 | ```
54 |
55 | ### Loading data from tsv / excel
56 |
57 | ```yaml
58 | data:
59 | source: tsv
60 | train: train_set.tsv
61 | test: test_set.tsv
62 | ```
63 |
64 | ---
65 |
66 | **NOTE**
67 |
68 | The format for tables or TSV files for training should be: one example per row, with the first sentence (`sent_a`), an optional second sentence (`sent_b`), and the label as the last column.
69 |
70 | ---
71 |
72 | ### Run training and push model to bucket
73 |
74 | Run `kctl train services/mnli.yaml`. The model will be cached locally, unless you specify a google storage bucket to upload to for deployment. Read about authentication in the google cloud storage API.
75 |
76 | ## Deploying
77 |
78 |
79 |
80 | ### Set up App
81 |
82 | <!-- TODO: complete the deployment instructions for this section -->
--------------------------------------------------------------------------------
/tutorials/fact_check.md:
--------------------------------------------------------------------------------
1 | # Creating a SoTA Production Fact Checker from Wikipedia
2 |
3 | ## Create App
4 | ## Train or Download Pretrained Models
5 | ## Dump Wikipedia to Elastic Search
6 | ## Benchmark
--------------------------------------------------------------------------------
/utils/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | from koursaros.modeling.models import MODELS
2 | from koursaros.yamls import Yaml
3 | from kctl.logger import set_logger
4 |
5 | logger = set_logger('MODELS')
6 |
def model_filename_resolver(name):
    """Resolve a model name to the path of its service yaml.

    Names that already end in ``.yaml`` (or ``.yml``) are returned
    unchanged; bare names are mapped to ``./services/<name>.yaml``.
    """
    if name.endswith(('.yaml', '.yml')):
        return name
    return f'./services/{name}.yaml'
11 |
def model_from_yaml(filename, **kwargs):
    """Parse the yaml at *filename* and build the matching model instance."""
    return model_from_config(Yaml(filename), **kwargs)
15 |
def model_from_config(config, training=False):
    """Instantiate the first registered model class that supports config.arch.

    Raises NotImplementedError when no class in MODELS claims the architecture.
    """
    matched = next((cls for cls in MODELS if config.arch in cls.architectures()), None)
    if matched is None:
        logger.error('Unsupported model architecture {}'.format(config.arch))
        raise NotImplementedError()
    instance = matched(config, training)
    logger.info('Loaded model {}'.format(config.arch))
    return instance
24 |
--------------------------------------------------------------------------------
/utils/modeling/data.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
def get_rows_from_tsv(fname):
    """Read all rows from a delimited text file.

    The delimiter is inferred from the extension: tab for ``.tsv``,
    comma otherwise. Returns a list of row lists.

    Bug fix: the original returned a ``csv.reader`` bound to a file that
    the ``with`` block had already closed, so iterating it raised
    ``ValueError: I/O operation on closed file``. Rows are now
    materialized before the file closes.
    """
    delimiter = '\t' if fname.split('.')[-1] == 'tsv' else ','
    # newline='' lets the csv module handle line endings (per csv docs).
    with open(fname, newline='') as file:
        return list(csv.reader(file, delimiter=delimiter))
10 |
def select_all(schema, table, random=True):
    """Build a ``SELECT *`` query for schema.table, shuffled unless random=False."""
    order_clause = ' order by random()' if random else ''
    return f'select * from {schema}.{table}{order_clause}'
--------------------------------------------------------------------------------
/utils/modeling/migrating.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import transformers
3 | from fairseq.models import roberta
4 | from fairseq.data.data_utils import collate_tokens
5 | import time
6 | import torch.nn.functional as F
7 | import torch.hub
8 |
# def roberta_to_transformer(path_to_roberta, path_to_yaml):
#     model = RobertaModel.from_pretrained(path_to_roberta, checkpoint_file='model.pt')
#     model.eval()

# Maximum token length per encoded sequence; inputs are truncated to this.
MAX_LENGTH = 256
# When True, every sequence is zero-padded up to MAX_LENGTH.
PAD = True
15 |
def predict_transformers(model, tokenizer):
    """Build a prediction closure over a transformers model/tokenizer pair.

    The returned function takes parallel sequences of sentences, encodes
    them as a batch, and returns per-class log-probabilities as lists.
    """
    def predict_fn(*args):
        encoded = time_fn(transformers_encode_batch, tokenizer, *args)
        outputs = model(input_ids=encoded[0],
                        attention_mask=encoded[1],
                        token_type_ids=encoded[2])
        return F.log_softmax(outputs[0], dim=-1).tolist()
    return predict_fn
29 |
30 |
def predict_roberta(model):
    """Build a prediction closure over a fairseq RoBERTa (torch.hub) model."""
    def pred_fn(*args):
        encoded = [model.encode(*pair)[:MAX_LENGTH] for pair in zip(*args)]
        batch = time_fn(collate_tokens, encoded, pad_idx=1)
        return model.predict('mnli', *batch).tolist()
    return pred_fn
37 |
38 |
def benchmark(pred_fn, n):
    """Call pred_fn n times on a fixed sentence pair, asserting it yields a list."""
    sample = ('All work and no play.', 'Make jack a very dull boy.')
    for _ in range(n):
        result = pred_fn(*sample)
        assert type(result) is list
43 |
44 |
def benchmark_mnli(samples):
    """Benchmark MNLI inference: huggingface transformers vs. fairseq torch.hub.

    Loads both roberta-large-mnli variants, then times `samples` repeated
    predictions through each framework's closure.

    Fix: the retry used a bare ``except:``, which also swallows
    KeyboardInterrupt/SystemExit; narrowed to ``except Exception``.
    """
    torch_hub_model = time_fn(torch.hub.load, 'pytorch/fairseq','roberta.large.mnli')
    try:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli')
    except Exception:
        # A corrupt or partial local cache can make loading fail; retry
        # with a forced re-download.
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli', force_download=True)
    transformers_tokenizer = time_fn(transformers.RobertaTokenizer.from_pretrained, 'roberta-large-mnli')
    pred_functions = {
        'transformers' : predict_transformers(transformers_model, transformers_tokenizer),
        'torch_hub' : predict_roberta(torch_hub_model)
    }
    for framework, pred_fn in pred_functions.items():
        print(f'Benchmarking {framework} with {samples} samples')
        time_fn(benchmark, pred_fn, samples)
61 |
62 | ### HELPERS
63 |
def time_fn(fn, *args, **kwargs):
    """Execute fn(*args, **kwargs), print its wall-clock duration, return its result."""
    started = time.time()
    result = fn(*args, **kwargs)
    elapsed = time.time() - started
    print(f'Took {elapsed} seconds to run {fn.__name__}')
    return result
69 |
70 |
def transformer_to_features(tokenizer, *args):
    """Tokenize one sample into (input_ids, attention_mask, token_type_ids).

    Sequences are truncated to MAX_LENGTH and, when PAD is set,
    left-padded with zeros up to MAX_LENGTH.
    """
    encoded = tokenizer.encode_plus(
        *args,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        truncate_first_sequence=True
    )
    input_ids = encoded["input_ids"][:MAX_LENGTH]
    token_type_ids = encoded["token_type_ids"][:MAX_LENGTH]

    # Real (non-pad) tokens attend; padding below is masked with zeros.
    attention_mask = [1] * len(input_ids)

    if PAD:
        pad_len = MAX_LENGTH - len(input_ids)
        zeros = [0] * pad_len
        input_ids = zeros + input_ids
        attention_mask = zeros + attention_mask
        token_type_ids = zeros + token_type_ids

    return (input_ids, attention_mask, token_type_ids)
91 |
92 |
def transformers_encode_batch(tokenizer, *args):
    """Encode a batch of samples into three parallel lists.

    Returns (all_input_ids, all_attention_masks, all_token_type_ids),
    one entry per sample in the zipped argument sequences.
    """
    ids, masks, segments = [], [], []
    for sample in zip(*args):
        input_ids, attention_mask, token_type_ids = transformer_to_features(tokenizer, *sample)
        ids.append(input_ids)
        masks.append(attention_mask)
        segments.append(token_type_ids)
    return ids, masks, segments
103 |
104 |
if __name__ == '__main__':
    # Smoke-test entry point: compare both frameworks on 10 samples.
    benchmark_mnli(10)
--------------------------------------------------------------------------------
/utils/modeling/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | from koursaros.utils.database.psql import Conn
3 | from koursaros.utils.misc import gb_free_space
4 | from koursaros.utils.bucket import bucket_contains, download_and_unzip
5 | from kctl.logger import set_logger
6 | from .data import *
7 |
8 | logger = set_logger('MODELS')
9 |
class Model(object):
    """Base class for models configured by a service yaml.

    Resolves where the model checkpoint comes from, in priority order:
    a stock checkpoint (config has no `training` section), a locally
    cached trained model, a `<hash>.tar.gz` archive in the configured
    bucket, or a fresh checkpoint initialized for training.
    """

    def __init__(self, config, training):
        """
        :param config: parsed service yaml (exposes .hash, .checkpoint, .repo, ...)
        :param training: True when invoked by the training entrypoint
        :raises SystemError: insufficient disk space, or deploying an untrained model
        """
        # Checkpoints are large; refuse to start on a nearly-full disk.
        if gb_free_space() < 3:
            logger.error("There is not enough space on your disk, please allocate more!")
            raise SystemError

        self.config = config
        self.version = config.hash  # yaml hash versions the cached artifacts
        self.dir = '.model-data'
        os.makedirs(self.dir, exist_ok=True)  # race-free create-if-missing

        self.ckpt_dir = f'{self.dir}/{self.version}/'
        # Define data_dir unconditionally: subclass helpers (feature caching,
        # evaluation output) reference it even for pre-trained models, but it
        # was previously only set on the training path.
        self.data_dir = os.path.join(self.dir, self.version)
        logger.info("Local model cache dir %s" % self.ckpt_dir)

        if 'training' not in self.config:  # use a default model
            logger.info('Loading model from default checkpoint')
            self.checkpoint = self.config.checkpoint
            self.trained = True
        elif os.path.exists(self.ckpt_dir + 'config.json') and not training:
            # Model already trained and cached locally.
            logger.info('Loading trained model')
            self.checkpoint = self.ckpt_dir
            self.trained = True
        elif bucket_contains(f'{self.version}.tar.gz'):
            logger.info(f'Downloading and extracting from bucket {self.config.repo}')
            download_and_unzip(self.config.repo.split('//')[-1],
                               f'{self.version}.tar.gz', self.dir)
            self.checkpoint = self.ckpt_dir
            # Sanity check: the archive must contain a transformers-style config.json.
            assert os.path.exists(self.ckpt_dir + 'config.json')
            self.trained = True
        else:  # init model for training
            logger.info('Initializing model for training')
            if not training:
                logger.error('Please train model before deploying')
                raise SystemError
            os.makedirs(self.data_dir, exist_ok=True)
            os.makedirs(self.ckpt_dir, exist_ok=True)
            self.checkpoint = config.training.checkpoint
            self.trained = False

    def get_data(self):
        """
        Get training data based on yaml config and connection.

        :return: (train_rows, test_rows) iterables of row tuples
        """
        data = self.config.training.data
        if data.source == 'postgres':
            p = Conn()
            query_fn = p.query
            return query_fn(select_all(data.schema, data.train)), \
                   query_fn(select_all(data.schema, data.test))
        else:
            return get_rows_from_tsv(data.train), get_rows_from_tsv(data.test)

    def train(self):
        """
        Runs training as defined in the model yaml. Saves model to directory
        .cache/
        :return: evaluation metric
        """
        raise NotImplementedError()

    def run(self, *args):
        """
        Runs inference on arbitrary args
        :param args: sent_a, sent_b for classification / regression task.
        :return:
        """
        raise NotImplementedError()

    def save_model(self):
        """Persist the trained model (append hash of yaml to checkpoint)."""
        raise NotImplementedError()

    @staticmethod
    def architectures():
        """Return the `arch` config values this model class supports."""
        raise NotImplementedError()

    def getInputProto(self):
        raise NotImplementedError()

    def getOutputProto(self):
        raise NotImplementedError()
96 |
97 |
--------------------------------------------------------------------------------
/utils/modeling/models/__init__.py:
--------------------------------------------------------------------------------
from koursaros.modeling.models.transformer_model import TransformerModel

# Registry of available model classes; the package-level model_from_config
# picks the first class whose architectures() contains config.arch.
MODELS = [TransformerModel]
--------------------------------------------------------------------------------
/utils/modeling/models/generative_transformer.py:
--------------------------------------------------------------------------------
1 | from koursaros.modeling.model import Model
2 |
3 | from tqdm import trange
4 |
5 | import torch
6 | import torch.nn.functional as F
7 | import numpy as np
8 |
9 | from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig
10 |
11 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
12 | from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
13 | from transformers import XLNetLMHeadModel, XLNetTokenizer
14 | from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
15 | from transformers import XLMWithLMHeadModel, XLMTokenizer
16 |
17 |
MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

# Every pretrained checkpoint name across the supported generative config classes.
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig)), ())


# Prompt-padding text. NOTE(review): not referenced anywhere in the visible
# code of this module — presumably intended for XLNet-style prompt padding;
# confirm before removing.
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. """

# Map of `base` config values to (LM-head model class, tokenizer class).
MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet-gen': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
    'xlm-gen': (XLMWithLMHeadModel, XLMTokenizer),
}
41 |
class GenerativeTransformer(Model):
    """Text-generation model wrapping a transformers LM-head checkpoint.

    Supported bases are the keys of MODEL_CLASSES; `run` samples a
    continuation token-by-token with top-k / nucleus filtering.
    """

    def __init__(self, *args):
        super().__init__(*args)
        model_class, tokenizer_class = MODEL_CLASSES[self.config.base]
        self.model = model_class.from_pretrained(self.config.checkpoint)
        # Bug fix: was `from_pretraiend` (typo), which raised AttributeError
        # on construction.
        self.tokenizer = tokenizer_class.from_pretrained(self.config.checkpoint)

    def set_seed(self, args):
        """Seed numpy/torch (and all CUDA devices when args.n_gpu > 0)."""
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if args.n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    def top_k_top_p_filtering(self, logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
        """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
            Args:
                logits: logits distribution shape (vocabulary size)
                top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                    Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
        top_k = min(top_k, logits.size(-1))  # Safety check
        if top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value

        if top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = filter_value
        return logits

    def run(self, raw_text):
        """Generate a continuation of raw_text (same token length as the prompt)."""
        context_tokens = self.tokenizer.encode(raw_text)
        out = self.sample_sequence(
            context=context_tokens,
            length=len(context_tokens)
        )
        # Drop the prompt tokens; keep only the generated continuation.
        out = out[0, len(context_tokens):].tolist()

        text = self.tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
        # text = text[: text.find(args.stop_token) if args.stop_token else None]
        return text

    def sample_sequence(self, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.9, is_xlnet=False,
                        xlm_lang=None, device='cpu'):
        """Sample `length` tokens autoregressively from the model given `context`."""
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0).repeat(num_samples, 1)
        generated = context
        with torch.no_grad():
            for _ in trange(length):

                inputs = {'input_ids': generated}
                if is_xlnet:
                    # XLNet is a direct (predict same token, not next token) and bi-directional model by default
                    # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
                    input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
                    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float,
                                            device=device)
                    perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
                    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
                    target_mapping[0, 0, -1] = 1.0  # predict last token
                    inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

                if xlm_lang is not None:
                    inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

                outputs = self.model(
                    **inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
                next_token_logits = outputs[0][0, -1, :] / temperature
                filtered_logits = self.top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
        return generated
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/utils/modeling/models/transformer_model.py:
--------------------------------------------------------------------------------
1 | from ..model import Model
2 | import torch.nn, torch.tensor, torch.distributed, torch.jit
3 | from transformers import *
4 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
5 | TensorDataset, DistributedSampler)
6 | from tensorboardX import SummaryWriter
7 | from tqdm import tqdm
8 | import numpy as np
9 | import os
10 | from kctl.logger import set_logger
11 |
12 | from koursaros.utils.misc import batch_list
13 |
14 | logger = set_logger('MODELS')
15 |
# Map of `arch` config values to (config class, sequence-classification
# model class, tokenizer class) from the transformers library.
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}
23 |
24 | class TransformerModel(Model):
25 |
def __init__(self, *args, **kwargs):
    """Build transformer config/model/tokenizer from the resolved checkpoint.

    Extra kwargs are forwarded to `model.from_pretrained`. Only
    classification and regression tasks are supported.
    """
    super().__init__(*args)
    if self.config.task in ('classification', 'regression'):
        config, model, tokenizer = MODEL_CLASSES[self.config.arch]
    else:
        raise NotImplementedError()

    self.model_config = config.from_pretrained(self.checkpoint, cache_dir=self.dir)
    self.model_config.num_labels = len(self.config.labels)
    self.model_config.torchscript = True  # enable torch.jit.trace for deployment
    self.model = model.from_pretrained(self.checkpoint, config=self.model_config,
                                       cache_dir=self.dir, **kwargs)
    self.tokenizer = tokenizer.from_pretrained(self.checkpoint, cache_dir=self.dir)

    # Training/runtime hyperparameters (fixed defaults unless noted).
    self.batch_size = self.config.training.batch_size
    self.max_grad_norm = 1.0
    self.weight_decay = 0.0
    self.n_gpu = 1
    self.local_rank = -1  # -1 means non-distributed
    self.gradient_accumulation_steps = 1
    self.max_length = 256
    self.fp16 = False  # flipped on in do_train when apex is importable
    self.evaluate_during_training = True
    # XLNet uses segment id 4 for padding; all other archs use 0.
    self.pad_token_segment_id = 4 if self.config.arch == 'xlnet' else 0
    self.pad_on_left = True
    self.pad_token = 0
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
    self.pad = True
    self.label_map = {label: i for i, label in enumerate(self.config.labels)}
    if self.trained:
        print('Tracing model for deployment...')
        self.model.eval()
        # self.trace_model()
    # Metric used by do_train to decide whether a checkpoint improved.
    if self.config.task == 'classification':
        self.best_checkpoint_metric = 'acc'
    elif self.config.task == 'regression':
        # NOTE(review): do_train keeps checkpoints when this metric INCREASES,
        # which looks inverted for a loss — confirm intended behavior.
        self.best_checkpoint_metric = 'loss'
63 |
def inputs_from_batch(self, batch):
    """Map a (ids, mask, segments[, labels]) batch tuple to model kwargs."""
    kwargs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    if self.config.arch != 'distilbert':
        # Segment ids only apply to BERT and XLNet; other archs take None.
        uses_segments = self.config.arch in ['bert', 'xlnet']
        kwargs['token_type_ids'] = batch[2] if uses_segments else None
    if len(batch) > 3:
        kwargs['labels'] = batch[3]
    return kwargs
73 |
def tuple_inputs(self, inputs):
    """Order model kwargs into the positional tuple torch.jit.trace expects."""
    ordered_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(inputs[key] for key in ordered_keys)
80 |
def trace_model(self):
    """JIT-trace self.model on one synthetic example for deployment."""
    example = InputExample(
        guid=1,
        text_a="Once upon a time there was a boy",
        text_b="He liked to write code all day long"
    )
    features = [self.example_to_feature(example)]
    batch = self.features_to_inputs(features, True)
    trace_args = self.tuple_inputs(self.inputs_from_batch(batch))
    self.model = torch.jit.trace(self.model, trace_args)
93 |
def train(self, force_build_features=False):
    """Run training; thin wrapper around do_train.

    NOTE: a batch-size back-off retry on training failure was sketched
    here but is currently disabled.
    """
    return self.do_train(force_build_features=force_build_features)
101 |
def do_train(self, force_build_features=False):
    """Run the training loop over one pass of the training dataloader.

    Evaluates on the test set every `eval_and_save_every` optimizer steps,
    checkpoints whenever the best metric improves, and returns
    (global_step, mean training loss).
    """
    ### In Transformers, optimizer and schedules are splitted and instantiated like this:

    tb_writer = SummaryWriter()

    train_dataset, test_dataset = self.get_data()
    train_dataset = self.load_and_cache_examples(train_dataset, force_build_features=force_build_features)
    epochs = int(self.config.training.epochs)
    optimizer = AdamW(self.model.parameters(), lr=float(self.config.training.learning_rate),
                      correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    # Warm up over the first 6% of examples.
    num_warmup_steps = int(0.06 * len(train_dataset))
    # NOTE(review): t_total assumes epochs * steps, but the loop below iterates
    # the dataloader exactly once — confirm intended epoch handling.
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps,
                                     t_total=(self.config.training.epochs * len(train_dataset) / self.batch_size))

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size)

    t_total = len(train_dataloader)

    # Prepare optimizer and schedule (linear warmup and decay)
    # NOTE(review): optimizer_grouped_parameters is built but never passed to
    # AdamW (constructed above with plain model.parameters()) — dead code or bug.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # Mixed precision via NVIDIA apex when available; otherwise stay fp32.
    try:
        from apex import amp
        model, optimizer = amp.initialize(self.model, optimizer)
        self.fp16 = True
    except ImportError:
        logger.warning("Please install fp16 from https://github.com/NVIDIA/apex for better performance")
        self.fp16 = False

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d" % len(train_dataset))
    logger.info(" Num Epochs = %d" % epochs)
    # NOTE(review): '%' binds tighter than '*', so this repeats the formatted
    # string world_size times instead of multiplying the batch size.
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d" %
                self.batch_size * (
                    torch.distributed.get_world_size() if self.local_rank != -1 else 1))
    logger.info(" Total optimization steps = %d" % t_total)

    # Evaluate/checkpoint `eval_freq` times per epoch (default 2).
    if not 'eval_freq' in self.config.training:
        self.eval_freq = 2
    else:
        self.eval_freq = self.config.training.eval_freq

    self.eval_and_save_every = len(train_dataset) // self.batch_size // self.eval_freq

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    self.model.zero_grad()
    # Running count of predicted labels, used for the progress-bar description.
    label_count = [0] * len(self.config.labels)
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.local_rank not in [-1, 0])
    num_correct = 0
    prev_best = None
    for step, batch in enumerate(epoch_iterator):
        self.model.train()
        correct_labels = batch[3]
        batch = tuple(t.to(self.device) for t in batch)

        inputs = self.inputs_from_batch(batch)
        outputs = self.model(**inputs)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        logits = outputs[1]
        preds = logits.detach().cpu().numpy()
        preds = np.argmax(preds, axis=1)
        for pred in preds:
            label_count[pred] += 1
        num_correct += np.sum(preds == correct_labels.detach().cpu().numpy())
        if step > 0:
            # Show running train accuracy and per-label prediction counts.
            epoch_iterator.set_description("Accuracy: %.2f Label Counts: %s"
                                           % (num_correct / (step*self.batch_size), label_count))
            epoch_iterator.refresh()  # to show immediately the update

        if self.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

        tr_loss += loss.item()
        if (step + 1) % self.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            self.model.zero_grad()
            global_step += 1

            if self.local_rank in [-1, 0] and global_step % self.eval_and_save_every == 0:
                # Log metrics
                # NOTE(review): `results` is only bound when
                # evaluate_during_training is True — the comparison below would
                # raise NameError otherwise; confirm.
                if self.local_rank == -1 and self.evaluate_during_training:
                    results = self.evaluate(test_dataset)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss) / self.eval_and_save_every, global_step)
                logging_loss = tr_loss
                # Checkpoint whenever the tracked metric improves.
                if prev_best is None or results[self.best_checkpoint_metric] > prev_best:
                    prev_best = results[self.best_checkpoint_metric]
                    self.save_model()

    if self.local_rank in [-1, 0]:
        tb_writer.close()

    # Final evaluation; keep the last model if it beats the best seen so far.
    result = self.evaluate(test_dataset)
    if prev_best is None or result[self.best_checkpoint_metric] > prev_best:
        self.save_model()

    return global_step, tr_loss / global_step
219 |
def save_model(self):
    """Persist model weights and tokenizer to the checkpoint directory."""
    # Unwrap (Distributed)DataParallel if the model was wrapped.
    target = getattr(self.model, 'module', self.model)
    target.save_pretrained(self.ckpt_dir)
    self.tokenizer.save_pretrained(self.ckpt_dir)
226 |
def evaluate(self, test_dataset):
    """Evaluate the current model on test_dataset.

    Returns {'loss': ...} plus {'acc': ...} for classification tasks;
    results are also logged and written to <data_dir>/eval/eval_results.txt.
    """
    eval_dataset = self.load_and_cache_examples(test_dataset, evaluate=True)
    # NOTE(review): self.data_dir is only assigned on the training path of
    # Model.__init__ — confirm evaluate() is unreachable for bucket/default
    # checkpoint models, or this raises AttributeError.
    eval_output_dir = os.path.join(self.data_dir, 'eval')

    if not os.path.exists(eval_output_dir) and self.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if self.local_rank == -1 else DistributedSampler(
        eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d" % len(eval_dataset))
    logger.info(" Batch size = %d" % self.batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if self.config.arch != 'distilbert':
                # Segment ids only apply to BERT and XLNet; others take None.
                inputs['token_type_ids'] = batch[2] if self.config.arch in ['bert',
                                                                            'xlnet'] else None
            outputs = self.model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Accumulate logits and gold labels across batches for final metrics.
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    result = {
        "loss": eval_loss
    }
    if self.config.task == "classification":
        preds = np.argmax(preds, axis=1)
        result['acc'] = np.sum(preds == out_label_ids) / len(preds)
    elif self.config.task == "regression":
        preds = np.squeeze(preds)

    # Write metrics to disk and mirror them to the log.
    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s" % (key, str(result[key])))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
288 |
def example_to_feature(self, example):
    """Convert one InputExample into padded InputFeatures.

    Returns None (and logs a warning) when a classification example has a
    label outside label_map.
    """
    encoded = self.tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=self.max_length,
        truncate_first_sequence=True  # truncating the first sequence in priority
    )
    input_ids = encoded["input_ids"][:self.max_length]
    token_type_ids = encoded["token_type_ids"][:self.max_length]

    # Real tokens attend; padding added below is masked out with zeros.
    attention_mask = [1] * len(input_ids)

    if self.pad:
        pad_len = self.max_length - len(input_ids)
        id_pad = [self.pad_token] * pad_len
        mask_pad = [0] * pad_len
        segment_pad = [self.pad_token_segment_id] * pad_len
        if self.pad_on_left:
            input_ids = id_pad + input_ids
            attention_mask = mask_pad + attention_mask
            token_type_ids = segment_pad + token_type_ids
        else:
            input_ids = input_ids + id_pad
            attention_mask = attention_mask + mask_pad
            token_type_ids = token_type_ids + segment_pad

    label = None
    if example.label is not None:
        if self.config.task == "classification":
            if example.label not in self.label_map:
                logger.warning("UNKNOWN LABEL %s, ignoring" % example.label)
                return
            label = self.label_map[example.label]
        elif self.config.task == "regression":
            label = float(example.label)
        else:
            logger.error("Only supported tasks are classification and regression")
            raise NotImplementedError()

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
333 |
def features_to_inputs(self, features, inference):
    """Stack InputFeatures into device tensors; labels omitted for inference."""
    def stack_long(values):
        return torch.tensor(values, dtype=torch.long).to(self.device)

    input_ids = stack_long([f.input_ids for f in features])
    attention_mask = stack_long([f.attention_mask for f in features])
    token_type_ids = stack_long([f.token_type_ids for f in features])

    if inference:
        return input_ids, attention_mask, token_type_ids

    if self.config.task == "classification":
        labels = stack_long([f.label for f in features])
    elif self.config.task == "regression":
        labels = torch.tensor([f.label for f in features], dtype=torch.float).to(self.device)
    else:
        raise NotImplementedError()
    return input_ids, attention_mask, token_type_ids, labels
348 |
349 |
350 | def load_and_cache_examples(self, data, evaluate=False, force_build_features=False):
351 | if self.local_rank not in [-1, 0] and not evaluate:
352 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
353 |
354 | cached_features_file = os.path.join(self.data_dir, 'features' if not evaluate else 'eval-features')
355 | if os.path.exists(os.path.join(cached_features_file)) and not force_build_features:
356 | logger.info("Loading features from cached file %s" % cached_features_file)
357 | features = torch.load(cached_features_file)
358 | else:
359 | logger.info("Creating features from dataset file at %s" % cached_features_file)
360 |
361 | examples = [
362 | InputExample(guid=i,
363 | text_a=ex[0],
364 | text_b=ex[1] if len(ex) == 3 else None,
365 | label=ex[-1]) for i, ex in enumerate(data)
366 | ]
367 |
368 | features = []
369 | for (ex_index, example) in enumerate(examples):
370 | if ex_index % 10000 == 0:
371 | logger.info("Writing example %d" % (ex_index))
372 | features.append(self.example_to_feature(example))
373 |
374 | if self.local_rank in [-1, 0]:
375 | logger.info("Saving features into cached file %s" % cached_features_file)
376 | torch.save(features, cached_features_file)
377 |
378 | if self.local_rank == 0 and not evaluate:
379 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
380 |
381 | # Convert to Tensors and build dataset
382 | dataset = TensorDataset(*self.features_to_inputs(features, False))
383 | return dataset
384 |
385 | def pred_from_output(self, outputs):
386 | logits = outputs[0]
387 | preds = logits.detach().cpu().numpy()
388 | if self.config.task == 'classification':
389 | preds = np.argmax(preds, axis=1)
390 | return [self.config.labels[int(pred)] for pred in preds]
391 | elif self.config.task == 'regression':
392 | return np.squeeze(preds)
393 | else:
394 | raise NotImplementedError()
395 |
396 | def run(self, *args):
397 | examples = [
398 | InputExample(
399 | guid=str(i),
400 | text_a=arg[0],
401 | text_b=None if len(arg) < 2 else arg[1]
402 | ) for i, arg in enumerate(zip(*args))
403 | ]
404 | features = [self.example_to_feature(example) for example in examples]
405 | all_inputs = self.features_to_inputs(features, True)
406 | inputs = self.inputs_from_batch(all_inputs)
407 | outputs = self.model(*self.tuple_inputs(inputs))
408 | return self.pred_from_output(outputs)
409 |
410 | def multi_gpu_training(self):
411 | # multi-gpu training (should be after apex fp16 initialization)
412 | if self.n_gpu > 1:
413 | model = torch.nn.DataParallel(self.model)
414 | # Distributed training (should be after apex fp16 initialization)
415 | if self.local_rank != -1:
416 | model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.local_rank],
417 | output_device=self.local_rank,
418 | find_unused_parameters=True)
419 |
420 | @staticmethod
421 | def architectures():
422 | return list(MODEL_CLASSES.keys())
423 |
--------------------------------------------------------------------------------
/utils/predictor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/predictor/__init__.py
--------------------------------------------------------------------------------
/utils/predictor/__main__.py:
--------------------------------------------------------------------------------
1 | from koursaros.modeling import model_from_yaml
2 | import sys, os
3 | from koursaros.utils.database.psql import Conn
4 | from koursaros.utils.misc import batch_list
5 | from koursaros.modeling.data import *
6 | import csv
7 | import time
8 |
9 | BATCH_SIZE = int(os.environ.get('BATCH_SIZE') or 4)
10 |
def predict(model_file, data_source, data_target, truncate=False):
    """Run a model over rows from a tsv/csv file or a postgres table.

    :param model_file: yaml config passed to model_from_yaml
    :param data_source: path to a .tsv/.csv file, or a 'schema.table' name
    :param data_target: output file path, or target table name
    :param truncate: when writing to postgres, truncate the target first

    Rows are batched, each batch is transposed so the last column is the
    row id and the rest are model inputs; (id, prediction) pairs are
    flushed roughly every 500 rows.
    """
    model = model_from_yaml(model_file)
    extension = data_source.split('.')[-1]
    if extension in ['tsv', 'csv']:
        rows = get_rows_from_tsv(data_source)
        # Bug fix: the csv branch previously used the string 'csv' as the
        # delimiter; the csv module requires a single character, so any
        # .csv input crashed. Use ',' instead.
        delimiter = '\t' if extension == 'tsv' else ','
        open(data_target, 'w+').close()  # touch file (and close the handle)

        def write_fn(buffer):
            # Context manager so the handle is closed after every flush.
            with open(data_target, 'a') as file:
                writer = csv.writer(file, delimiter=delimiter)
                writer.writerows(buffer)

    else:
        p = Conn()
        query_fn = p.query
        schema, table = data_source.split('.')
        if truncate:
            p.execute(f'''truncate table {data_target}''')
        rows = query_fn(select_all(schema, table, random=False))

        def write_fn(buffer):
            p.insert(data_target, buffer)
            p.commit()

    buffer = []
    i = 0
    start = time.time()
    for step, batch in enumerate(batch_list(rows, BATCH_SIZE)):
        transposed = tuple(zip(*batch))
        inputs = transposed[:-1]
        ids = transposed[-1]
        buffer.extend(zip(ids, model.run(*inputs)))
        i += BATCH_SIZE
        if i > 500:
            total = step * BATCH_SIZE
            print('dumping example {}, rate: {} per second'.format(total, total / (time.time() - start)))
            write_fn(buffer)
            buffer = []
            i = 0

    if len(buffer) > 0:
        write_fn(buffer)
53 |
if __name__ == '__main__':
    # CLI: <model_file> <data_source> [data_target] [-t]
    argv = sys.argv
    target = argv[3] if len(argv) > 3 else './predictions.tsv'
    wants_truncate = len(argv) > 4 and argv[4] == '-t'
    predict(argv[1], argv[2], target, truncate=wants_truncate)
--------------------------------------------------------------------------------
/utils/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from koursaros.modeling import model_from_yaml
2 |
def train(file):
    """Load the model described by the yaml *file* in training mode and train it."""
    model_from_yaml(file, training=True).train()
--------------------------------------------------------------------------------
/utils/trainer/__main__.py:
--------------------------------------------------------------------------------
import sys
from . import train

if __name__ == '__main__':
    # CLI entry point: python -m <package>.trainer <model_yaml>
    filename = sys.argv[1]
    train(filename)
--------------------------------------------------------------------------------
/utils/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/utils/__init__.py
--------------------------------------------------------------------------------
/utils/utils/bucket/__init__.py:
--------------------------------------------------------------------------------
1 | from google.cloud import storage
2 | import tarfile
3 | import os
4 | from pathlib import Path
5 |
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a single blob from a GCS bucket to a local file."""
    client = storage.Client()
    target = client.get_bucket(bucket_name).blob(source_blob_name)
    target.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(source_blob_name, destination_file_name))
17 |
def bucket_contains(filename, bucket_name="poloma-models"):
    """Return True if a blob with exactly this name exists in the bucket.

    :param filename: full blob name to look for
    :param bucket_name: bucket to search; defaults to the previously
        hard-coded "poloma-models" for backward compatibility

    Bug fix: the original compared the Blob *object* to the filename
    string, which can never be equal; compare blob.name instead.
    """
    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name)
    return any(blob.name == filename for blob in blobs)
24 |
def download_and_unzip(bucket_name, source_blob_name, out_dir, archive=False):
    """Fetch a blob into *out_dir* (skipped if already present) and
    optionally extract it as a gzipped tarball.

    :param bucket_name: GCS bucket to download from
    :param source_blob_name: blob path; only its basename is kept locally
    :param out_dir: local directory for the file and for extraction
    :param archive: when True, treat the file as a .tar.gz and extract it
    """
    fname = source_blob_name.split("/")[-1]
    # os.path.join instead of naive string concatenation, so out_dir works
    # with or without a trailing separator.
    destination = os.path.join(out_dir, fname)
    if not os.path.isfile(destination):
        download_blob(bucket_name, source_blob_name, destination)
        assert os.path.isfile(destination)
    if archive:
        # Context manager guarantees the tarball is closed even if
        # extraction raises.
        with tarfile.open(destination, "r:gz") as tar:
            tar.extractall(out_dir)
        print(f'extracted {destination} to {out_dir}')
--------------------------------------------------------------------------------
/utils/utils/cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/utils/cuda/__init__.py
--------------------------------------------------------------------------------
/utils/utils/cuda/apex.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Build and install NVIDIA apex from source with its C++ and CUDA
# extensions enabled (needed for fused kernels / fp16 utilities).
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex
--------------------------------------------------------------------------------
/utils/utils/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .psql import *
2 |
3 |
4 |
--------------------------------------------------------------------------------
/utils/utils/database/psql.py:
--------------------------------------------------------------------------------
1 | from psycopg2 import extensions, extras
2 | import os
3 |
4 |
def is_nested(nested):
    """Validate that every element of *nested* is a list or tuple.

    Despite the name this is a validator, not a predicate: it returns None
    on success and raises on failure.

    :raises ValueError: if any element is not a list or tuple
    """
    if any(not isinstance(i, (list, tuple)) for i in nested):
        # Replaces the original unprofessional error message.
        raise ValueError('Only nested lists/tuples can be dumped.')
8 |
9 |
class Conn(extensions.connection):
    """Thin psycopg2 connection with env-var defaults and query helpers.

    Connection parameters fall back to the PGHOST / PGUSER / PGPASS /
    PGDBNAME environment variables when not passed explicitly.
    """

    def __init__(self, host=None, user=None, password=None, dbname=None, sslmode=None, cert_path=None):
        if sslmode:
            os.environ['PGSSLMODE'] = sslmode
        if cert_path:
            os.environ['PGSSLROOTCERT'] = cert_path
        if not host:
            host = os.environ.get('PGHOST')
        if not user:
            user = os.environ.get('PGUSER')
        if not password:
            password = os.environ.get('PGPASS')
        if not dbname:
            dbname = os.environ.get('PGDBNAME')
        dsn = f"dbname='{dbname}' user='{user}' host='{host}' password='{password}'"
        super(Conn, self).__init__(dsn=dsn)

    def _set_columns(self, cur):
        # Remember the column names of the most recent result set.
        self.columns = [desc.name for desc in cur.description]

    def execute(self, query):
        """Execute a statement, discarding any result."""
        cur = self.cursor()
        cur.execute(query)

    def iter_rows(self, query):
        """Execute *query* and return the cursor for row-by-row iteration."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        return cur

    def iter_chunk(self, query, chunksize):
        """Execute *query* and yield lists of up to *chunksize* rows."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        chunk = cur.fetchmany(chunksize)
        while chunk:
            yield chunk
            chunk = cur.fetchmany(chunksize)

    def query(self, query):
        """Execute *query* and return all rows (column names on self.columns)."""
        cur = self.cursor()
        cur.execute(query)
        fetched = cur.fetchall()
        self._set_columns(cur)
        return fetched

    def insert(self, table, nested):
        """Bulk-insert rows (a list of lists/tuples) into *table*.

        NOTE(review): *table* is interpolated into the SQL (identifiers
        cannot be bound as parameters); it must come from trusted code,
        never from user input.
        """
        is_nested(nested)
        cur = self.cursor()
        template = f'INSERT INTO {table} VALUES %s'
        extras.execute_values(cur, template, nested)

    def table_exists(self, schema, table):
        """Return True if *schema*.*table* exists.

        Security fix: schema/table are now bound as query parameters
        instead of being interpolated into the SQL string.
        """
        query = '''
            SELECT EXISTS (
                SELECT
                FROM information_schema.tables
                WHERE table_schema = %s
                AND table_name = %s
            );
        '''
        cur = self.cursor()
        cur.execute(query, (schema, table))
        return cur.fetchone()[0]

    def database_exists(self, database):
        """Return True if a database named *database* exists.

        Security fix: the database name is bound as a query parameter.
        """
        query = '''
            SELECT EXISTS (
                SELECT
                FROM pg_database
                WHERE datname = %s
            )
        '''
        cur = self.cursor()
        cur.execute(query, (database,))
        return cur.fetchone()[0]

    def create_database(self, database):
        """Create a database via a server-side `createdb` call.

        WARNING(review): COPY ... TO PROGRAM runs a shell command on the
        database server and interpolates *database* unescaped — only ever
        call this with a trusted, validated name.
        """
        query = f'''
            COPY (SELECT 1) TO PROGRAM 'createdb {database}';
        '''
        cur = self.cursor()
        cur.execute(query)
93 |
--------------------------------------------------------------------------------
/utils/utils/misc/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 |
4 | BOLD = '\033[1m{}\033[0m'
5 |
6 |
def gb_free_space():
    """Return free disk space (in GB) on the filesystem of the current directory."""
    stats = os.statvfs(os.getcwd())
    free_bytes = stats.f_frsize * stats.f_bfree
    return free_bytes / 1e+9
10 |
11 |
12 | def batch_fn(batch_size, call_fn, items):
13 | buffer = []
14 | for item in items:
15 | buffer.append(item)
16 | if len(buffer) % batch_size == 0:
17 | yield call_fn(buffer), buffer
18 | buffer = []
19 | if len(buffer) > 0:
20 | yield call_fn(buffer), buffer
21 |
22 |
def batch_list(arr, n):
    """Yield consecutive chunks of *arr* of length *n* (the last may be shorter)."""
    chunk = []
    for position, element in enumerate(arr):
        chunk.append(element)
        if (position + 1) % n == 0:
            yield chunk
            chunk = []
    if chunk:
        yield chunk
32 |
--------------------------------------------------------------------------------
/utils/utils/misc/tree.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Print a tree-like view of the current directory using only ls/grep/sed:
# list recursively, keep the directory header lines (those ending in ':'),
# strip the trailing colon, then rewrite each path component into
# indentation with '|--'-style connectors.
ls -R | grep ":$" | sed -e 's/:$//' -e 's/[^-][^\/]*\//--/g' -e 's/^/ /' -e 's/-/|/'
--------------------------------------------------------------------------------
/utils/yamls.py:
--------------------------------------------------------------------------------
1 | from yaml import safe_load
2 | from hashlib import md5
3 | from enum import Enum
4 | from box import Box
5 |
6 |
class YamlType(Enum):
    """Recognized top-level yaml document types.

    A yaml file is classified by which of these names (lowercased)
    appears as a top-level key.
    """
    BASE = 0
    PIPELINE = 1
    SERVICE = 2
11 |
12 |
def Yaml(path):
    """
    Pseudo-class for managing a yaml file as a python object (a Box).

    The file must contain exactly one of the top-level keys named by
    YamlType (lowercased); that key's contents become the returned Box,
    annotated with the source path, raw text, detected type and an md5
    hash of the file text.

    :param path: path to .yaml file
    :raises ValueError: if no recognized top-level key is present
    """
    __type__ = None
    # Fix: read via a context manager instead of open(path).read(), which
    # leaked the file handle.
    with open(path) as f:
        __text__ = f.read()
    yaml = safe_load(__text__)

    for yaml_type in YamlType:
        if yaml_type.name.lower() in yaml:
            __type__ = yaml_type

    if __type__ is None:
        raise ValueError('Invalid yaml type for %s' % path)

    box = Box(yaml[__type__.name.lower()])
    box.__path__ = path
    box.__text__ = __text__
    box.__type__ = __type__
    box.hash = md5(__text__.encode()).hexdigest()
    return box
36 |
--------------------------------------------------------------------------------