├── .github ├── koursaros.jpg ├── logo.svg └── overview.svg ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── flows ├── .DS_Store ├── factchecking │ ├── index │ │ ├── docker-compose.yml │ │ ├── flow.py │ │ └── helm │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── main.yaml │ │ │ ├── service.yaml │ │ │ └── statefulset.yaml │ │ │ └── values.yaml │ ├── query │ │ ├── flow.py │ │ └── helm │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── main.yaml │ │ │ ├── service.yaml │ │ │ └── statefulset.yaml │ │ │ └── values.yaml │ └── train │ │ └── train-compose.yml └── yc_demo │ ├── .DS_Store │ ├── docker-compose-temp.yml │ ├── docker-compose.yml │ ├── flow.py │ ├── helm │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── main.yaml │ │ ├── service.yaml │ │ └── statefulset.yaml │ └── values.yaml │ ├── index.k │ └── query.k ├── koursaros ├── __init__.py ├── chart │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── main.yaml │ │ ├── service.yaml │ │ └── statefulset.yaml │ └── values.yaml ├── cli │ ├── __init__.py │ ├── __main__.py │ ├── build │ │ └── __init__.py │ ├── deploy │ │ └── __init__.py │ ├── manager.py │ ├── show │ │ └── __init__.py │ ├── test │ │ └── __init__.py │ └── utils.py ├── flow │ └── __init__.py ├── hub │ ├── client │ │ ├── .DS_Store │ │ ├── postgres │ │ │ ├── Dockerfile │ │ │ ├── postgres.py │ │ │ ├── testrerank.yml │ │ │ └── wikititles.yml │ │ └── sheet │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ ├── client.py │ │ │ └── test.csv │ ├── encoder │ │ ├── robertainfer │ │ │ ├── Dockerfile │ │ │ └── dim64.yml │ │ └── textbyte │ │ │ ├── Dockerfile │ │ │ ├── max1024.yml │ │ │ ├── max256.yml │ │ │ └── textbyte.py │ ├── httpclient │ │ └── http │ │ │ └── Dockerfile │ ├── indexer │ │ ├── faisscpu │ │ │ ├── Dockerfile │ │ │ └── base.yml │ │ ├── keyword │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── keyword.py │ │ ├── lvdb │ │ 
│ ├── Dockerfile │ │ │ └── base.yml │ │ ├── rocksdb │ │ │ ├── Dockerfile │ │ │ └── base.yml │ │ ├── simple_dict │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── simple_dict.py │ │ └── whoosh │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── whoosh.py │ ├── preprocessor │ │ ├── sentsplit │ │ │ ├── Dockerfile │ │ │ └── jsonmode.yml │ │ └── unary │ │ │ ├── Dockerfile │ │ │ └── text.yml │ ├── router │ │ ├── block │ │ │ ├── Dockerfile │ │ │ ├── block.py │ │ │ ├── block_query.yml │ │ │ └── block_train.yml │ │ ├── log │ │ │ ├── Dockerfile │ │ │ └── log.py │ │ ├── rerank │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── rerank.py │ │ └── resp_req │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── resp_req.py │ └── tests │ │ ├── reviews_sample.csv │ │ ├── sonnets_small.txt │ │ ├── test_block.py │ │ ├── test_keyword.py │ │ ├── test_reranker.py │ │ ├── test_textbyte_encoder.py │ │ ├── test_whoosh.py │ │ └── yaml │ │ ├── test-joint.yml │ │ ├── test-keyword.yml │ │ ├── test-reranker.yml │ │ └── test-whoosh.yml └── repo_creds │ ├── README.md │ └── __init__.py ├── requirements.txt ├── setup.py ├── tutorials ├── deploy_custom_model.md └── fact_check.md └── utils ├── modeling ├── __init__.py ├── data.py ├── migrating.py ├── model.py └── models │ ├── __init__.py │ ├── generative_transformer.py │ └── transformer_model.py ├── predictor ├── __init__.py └── __main__.py ├── trainer ├── __init__.py └── __main__.py ├── utils ├── __init__.py ├── bucket │ └── __init__.py ├── cuda │ ├── __init__.py │ └── apex.sh ├── database │ ├── __init__.py │ └── psql.py └── misc │ ├── __init__.py │ └── tree.sh └── yamls.py /.github/koursaros.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/.github/koursaros.jpg -------------------------------------------------------------------------------- /.github/logo.svg: 
-------------------------------------------------------------------------------- 1 | Asset 1Koursaros -------------------------------------------------------------------------------- /.github/overview.svg: -------------------------------------------------------------------------------- 1 | Asset 21234ServiceStub -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a fact-checking 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | db.sqlite3 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # Jetbrains 106 | .idea 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 koursaros-ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include koursaros * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Koursaros 2 | 3 |
4 | 5 |

6 | 7 | PyPI - License 8 | 9 |

10 | 11 |

12 | Blog • 13 | Highlights • 14 | Overview • 15 | Install • 16 | Getting Started • 17 | Documentation • 18 | Tutorials • 19 | Contributing 20 |

21 | 22 | Koursaros is a distributed cloud platform for developing and deploying neural search and inference applications. 23 | 24 | Koursaros leverages a general-purpose microservice architecture to enable low-latency, scalable deep neural network training and can be directly deployed to kubernetes for production. 25 | 26 | ## Description 27 | This is page is a work in progress. 28 | 29 | ## Results 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 41 | 42 | 43 | 44 |
BenchmarkLabel AccuracyPaperModels
fever.ai 40 | 0.7396 (2nd)An Automated Fact Checker in Era of Fake Newscoming soon
45 | 46 | ## Install 47 | ### Requirements 48 | You need Python 3.6 or later to run Koursaros. 49 | 50 | ### Stable Version 51 | #### Installing via pip 52 | We recommend installing Koursaros via pip: 53 | ``` 54 | pip3 install koursaros 55 | ``` 56 | Installation will use Python wheels from PyPI, available for OSX, Linux, and Windows. 57 | 58 | ### Latest Version 59 | ### Installing via pip-git 60 | You can install the latest version from Git: 61 | ``` 62 | pip3 install git+https://git@github.com/koursaros-ai/koursaros.git 63 | ``` 64 | 65 | ## Getting Started 66 | ### Creating a pipeline 67 | ``` 68 | kctl deploy app 69 | ``` 70 | 71 | 72 | ## Tutorials 73 | - Use Koursaros to get SoTA results in dev environment on the fever.ai benchmark using pretrained models. 74 | - Training custom models and deploying them as stubs 75 | - Training Elastic Search BM25 algorithm using Ax Bayesian Optimizer (coming soon) 76 | - Deploying fever.ai pipeline to production (Coming Soon) 77 | -------------------------------------------------------------------------------- /flows/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/.DS_Store -------------------------------------------------------------------------------- /flows/factchecking/index/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | Frontend0: 4 | image: gnes/gnes:latest-alpine 5 | command: frontend --port_in 61973 --port_out 54596 --port_ctrl 57120 --parallel_backend 6 | process 7 | ports: 8 | - 8800:8800 9 | sentsplit: 10 | image: hub-preprocessor:latest-sentsplit 11 | command: --port_in 54596 --port_out 60639 --socket_in PULL_CONNECT --socket_out 12 | PUB_BIND --port_ctrl 56881 --parallel_backend process --num_parallel 2 --yaml_path 13 | jsonmode.yml 14 | deploy: 15 | replicas: 2 16 | 
textbyte: 17 | image: hub-encoder:latest-textbyte 18 | command: --port_in 60639 --port_out 58737 --socket_in SUB_CONNECT --port_ctrl 19 | 54010 --parallel_backend process --num_parallel 2 --yaml_path max256.yml 20 | deploy: 21 | replicas: 2 22 | keyword: 23 | image: hub-indexer:latest-keyword 24 | command: --port_in 58737 --port_out 61340 --socket_in PULL_CONNECT --socket_out 25 | PUSH_CONNECT --port_ctrl 64855 --parallel_backend process --num_parallel 2 --yaml_path 26 | base.yml 27 | deploy: 28 | replicas: 2 29 | lvdb: 30 | image: hub-indexer:latest-lvdb 31 | command: --port_in 60639 --port_out 61340 --socket_in SUB_CONNECT --socket_out 32 | PUSH_CONNECT --port_ctrl 54746 --parallel_backend process --num_parallel 2 --yaml_path 33 | base.yml 34 | deploy: 35 | replicas: 2 36 | basereducerouter: 37 | image: gnes/gnes:latest-alpine 38 | command: route --port_in 61340 --port_out 61973 --socket_out PUSH_CONNECT --port_ctrl 39 | 57894 --parallel_backend process --yaml_path BaseReduceRouter --num_part 2 -------------------------------------------------------------------------------- /flows/factchecking/index/flow.py: -------------------------------------------------------------------------------- 1 | from koursaros.gnes_addons import Flow 2 | 3 | flow = ( 4 | Flow(check_version=True) 5 | .add_client(name='postgres', yaml_path='wikititles.yml') 6 | .add_preprocessor(name='sentsplit', replicas=2, storage='1Gi', yaml_path='jsonmode.yml') 7 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2, yaml_path='max256.yml') 8 | .add_indexer(name='keyword', replicas=2, yaml_path='base.yml') 9 | .add_indexer(name='lvdb', recv_from='sentsplit', replicas=2, yaml_path='base.yml') 10 | .add_router(name='basereducerouter', num_part=2, recv_from=['keyword', 'lvdb'], yaml_path='BaseReduceRouter') 11 | ) 12 | 13 | # checkout how the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS) 14 | # funny! 
15 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Deployed flow! -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessor .encoder .indexer .router }} 4 | {{- range . 
}} 5 | --- 6 | {{ include "statefulset" .}} 7 | --- 8 | {{ include "service" .}} 9 | {{ end }} 10 | {{ end }} 11 | {{ end }} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "service" -}} 3 | {{- $name := printf "%s-%s" .app .model -}} 4 | apiVersion: v1 5 | kind: Service 6 | metadata: 7 | name: {{ $name }} 8 | spec: 9 | selector: 10 | app: {{ $name }} 11 | clusterIP: None 12 | ports: 13 | {{- if .port_in }} 14 | - name: in 15 | port: {{ .port_in }} 16 | protocol: TCP 17 | {{- end }} 18 | {{- if .port_out }} 19 | - name: out 20 | port: {{ .port_out }} 21 | protocol: TCP 22 | {{- end -}} 23 | {{- if .grpc_port }} 24 | - name: grpc 25 | port: {{ .grpc_port }} 26 | protocol: TCP 27 | {{- end -}} 28 | {{- if .ctrl_port }} 29 | - name: ctrl 30 | port: {{ .ctrl_port }} 31 | protocol: TCP 32 | {{- end -}} 33 | {{ if .load_balancer }} 34 | type: LoadBalancer 35 | {{ end }} 36 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "statefulset" -}} 3 | {{- $name := printf "%s-%s" .app .model -}} 4 | apiVersion: apps/v1 5 | kind: StatefulSet 6 | metadata: 7 | name: {{ $name }} 8 | spec: 9 | replicas: {{ .replicas }} 10 | selector: 11 | matchLabels: 12 | app: {{ $name }} 13 | volumeClaimTemplates: 14 | - metadata: 15 | name: {{ $name }} 16 | spec: 17 | accessModes: 18 | - ReadWriteOnce 19 | {{- if .storage }} 20 | resources: 21 | requests: 22 | storage: {{ .storage }} 23 | {{- end }} 24 | template: 25 | metadata: 26 | labels: 27 | app: {{ $name }} 28 | spec: 29 | containers: 30 | - name: {{ $name }} 31 | image: {{ .image }} 32 | args: 33 | {{- range .command }} 34 | - {{ . 
| quote }} 35 | {{- end }} 36 | imagePullPolicy: null 37 | ports: 38 | {{- if .port_in }} 39 | - name: in 40 | containerPort: {{ .port_in }} 41 | protocol: TCP 42 | {{- end }} 43 | {{- if .port_out }} 44 | - name: out 45 | containerPort: {{ .port_out }} 46 | protocol: TCP 47 | {{- end }} 48 | {{- if .grpc_port }} 49 | - name: grpc 50 | containerPort: {{ .grpc_port }} 51 | protocol: TCP 52 | {{- end }} 53 | {{- if .grpc_port }} 54 | - name: ctrl 55 | containerPort: {{ .port_ctrl }} 56 | protocol: TCP 57 | {{- end }} 58 | resources: 59 | requests: 60 | {{- if .cpu }} 61 | cpu: {{ .cpu }} 62 | {{- end }} 63 | {{- if .memory }} 64 | memory: {{ .memory }} 65 | {{- end }} 66 | 67 | {{- if .resources -}} 68 | {{- toYaml .resources | nindent 8 -}} 69 | {{- end -}} 70 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/values.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | frontend: 3 | - name: Frontend0 4 | app: frontend 5 | model: base 6 | port_in: 61973 7 | port_out: 54596 8 | ctrl_port: 9 | grpc_port: 8800 10 | command: 11 | - frontend 12 | - --port_in 13 | - '61973' 14 | - --port_out 15 | - '54596' 16 | - --port_ctrl 17 | - '57120' 18 | - --parallel_backend 19 | - process 20 | replicas: 1 21 | storage: 500Mi 22 | memory: 500Mi 23 | cpu: 300m 24 | image: gnes/gnes:latest-alpine 25 | preprocessor: 26 | - name: sentsplit 27 | app: preprocessor 28 | model: sentsplit 29 | port_in: 54596 30 | port_out: 60639 31 | ctrl_port: 32 | grpc_port: 33 | command: 34 | - --port_in 35 | - '54596' 36 | - --port_out 37 | - '60639' 38 | - --socket_in 39 | - PULL_CONNECT 40 | - --socket_out 41 | - PUB_BIND 42 | - --port_ctrl 43 | - '56881' 44 | - --parallel_backend 45 | - process 46 | - --num_parallel 47 | - '2' 48 | - --yaml_path 49 | - jsonmode.yml 50 | replicas: 2 51 | storage: 1Gi 52 | memory: 1Gi 53 | cpu: 1Gi 54 | image: hub-preprocessor:latest-sentsplit 
55 | encoder: 56 | - name: textbyte 57 | app: encoder 58 | model: textbyte 59 | port_in: 60639 60 | port_out: 58737 61 | ctrl_port: 62 | grpc_port: 63 | command: 64 | - --port_in 65 | - '60639' 66 | - --port_out 67 | - '58737' 68 | - --socket_in 69 | - SUB_CONNECT 70 | - --port_ctrl 71 | - '54010' 72 | - --parallel_backend 73 | - process 74 | - --num_parallel 75 | - '2' 76 | - --yaml_path 77 | - max256.yml 78 | replicas: 2 79 | storage: 500Mi 80 | memory: 500Mi 81 | cpu: 300m 82 | image: hub-encoder:latest-textbyte 83 | indexer: 84 | - name: keyword 85 | app: indexer 86 | model: keyword 87 | port_in: 58737 88 | port_out: 61340 89 | ctrl_port: 90 | grpc_port: 91 | command: 92 | - --port_in 93 | - '58737' 94 | - --port_out 95 | - '61340' 96 | - --socket_in 97 | - PULL_CONNECT 98 | - --socket_out 99 | - PUSH_CONNECT 100 | - --port_ctrl 101 | - '64855' 102 | - --parallel_backend 103 | - process 104 | - --num_parallel 105 | - '2' 106 | - --yaml_path 107 | - base.yml 108 | replicas: 2 109 | storage: 500Mi 110 | memory: 500Mi 111 | cpu: 300m 112 | image: hub-indexer:latest-keyword 113 | - name: lvdb 114 | app: indexer 115 | model: lvdb 116 | port_in: 60639 117 | port_out: 61340 118 | ctrl_port: 119 | grpc_port: 120 | command: 121 | - --port_in 122 | - '60639' 123 | - --port_out 124 | - '61340' 125 | - --socket_in 126 | - SUB_CONNECT 127 | - --socket_out 128 | - PUSH_CONNECT 129 | - --port_ctrl 130 | - '54746' 131 | - --parallel_backend 132 | - process 133 | - --num_parallel 134 | - '2' 135 | - --yaml_path 136 | - base.yml 137 | replicas: 2 138 | storage: 500Mi 139 | memory: 500Mi 140 | cpu: 300m 141 | image: hub-indexer:latest-lvdb 142 | router: 143 | - name: basereducerouter 144 | app: router 145 | model: basereducerouter 146 | port_in: 61340 147 | port_out: 61973 148 | ctrl_port: 149 | grpc_port: 150 | command: 151 | - route 152 | - --port_in 153 | - '61340' 154 | - --port_out 155 | - '61973' 156 | - --socket_out 157 | - PUSH_CONNECT 158 | - --port_ctrl 159 | - '57894' 
160 | - --parallel_backend 161 | - process 162 | - --yaml_path 163 | - BaseReduceRouter 164 | - --num_part 165 | - '2' 166 | replicas: 1 167 | storage: 500Mi 168 | memory: 500Mi 169 | cpu: 300m 170 | image: gnes/gnes:latest-alpine -------------------------------------------------------------------------------- /flows/factchecking/query/flow.py: -------------------------------------------------------------------------------- 1 | from koursaros.gnes_addons import Flow 2 | 3 | flow = ( 4 | Flow(check_version=True) 5 | .add_client(name='postgres', yaml_path='clients/postgres/wikititles.yml') 6 | .add_preprocessor(name='sentsplit', replicas=2, 7 | yaml_path='services/preprocessors/sentsplit/jsonmode.yml') 8 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2, 9 | yaml_path='services/encoders/textbyte/max256.yml') 10 | .add_indexer(name='keyword', replicas=2, 11 | yaml_path='services/indexers/keyword/base.yml') 12 | .add_indexer(name='lvdb', replicas=2, yaml_path='services/indexers/lvdb/base.yml') 13 | .add_encoder(name='robertainfer', replicas=2, 14 | yaml_path='services/encoders/robertainfer/dim64.yml') 15 | .add_router(name='reduce', num_part=2, yaml_path='BaseReduceRouter') 16 | ) 17 | 18 | 19 | # checkout how the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS) 20 | # funny! 21 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | raise NotImplementedError 2 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessors .encoders .indexers .routers }} 4 | {{- range . 
}} 5 | --- 6 | {{ include "statefulset" .}} 7 | --- 8 | {{ include "service" .}} 9 | {{ end }} 10 | {{ end }} 11 | {{ end }} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "service" -}} 3 | apiVersion: v1 4 | kind: Service 5 | spec: 6 | selector: 7 | app: {{ .name }} 8 | clusterIP: None 9 | ports: 10 | {{- if .port_in }} 11 | - name: in 12 | port: {{ .port_in }} 13 | protocol: TCP 14 | {{- end }} 15 | {{- if .port_out }} 16 | - name: out 17 | port: {{ .port_out }} 18 | protocol: TCP 19 | {{- end -}} 20 | {{- if .grpc_port }} 21 | - name: grpc 22 | port: {{ .grpc_port }} 23 | protocol: TCP 24 | {{- end -}} 25 | {{- if .grpc_port }} 26 | - name: ctrl 27 | port: {{ .port_ctrl }} 28 | protocol: TCP 29 | {{- end -}} 30 | {{ if .load_balancer }} 31 | type: LoadBalancer 32 | {{ end }} 33 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "statefulset" -}} 3 | apiVersion: apps/v1 4 | kind: StatefulSet 5 | spec: 6 | replicas: {{ .replicas }} 7 | selector: 8 | matchLabels: 9 | app: {{ .name }} 10 | volumeClaimTemplates: 11 | accessModes: 'ReadWriteOnce' 12 | resources: 13 | requests: 14 | storage: {{ .storage }} 15 | template: 16 | spec: 17 | containers: 18 | - name: {{ .name }} 19 | image: {{ .image }} 20 | args: {{ .command }} 21 | imagePullPolicy: null 22 | ports: 23 | {{- if .port_in }} 24 | - name: in 25 | containerPort: {{ .port_in }} 26 | protocol: TCP 27 | {{- end }} 28 | {{- if .port_out }} 29 | - name: out 30 | containerPort: {{ .port_out }} 31 | protocol: TCP 32 | {{- end -}} 33 | {{- if .grpc_port }} 34 | - name: grpc 35 | containerPort: {{ .grpc_port }} 36 | protocol: TCP 37 | 
{{- end -}} 38 | {{- if .grpc_port }} 39 | - name: ctrl 40 | containerPort: {{ .port_ctrl }} 41 | protocol: TCP 42 | {{- end -}} 43 | resources: 44 | requests: 45 | cpu: {{ .cpu }} 46 | memory: {{ .memory }} 47 | 48 | {{- if .resources -}} 49 | {{- toYaml .resources | nindent 8 -}} 50 | {{- end -}} 51 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/values.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | frontends: 3 | - name: Frontend0 4 | port_in: 63152 5 | port_out: 49972 6 | ctrl_port: 7 | grpc_port: 8800 8 | command: frontend --port_in 63152 --port_out 49972 --port_ctrl 55922 --parallel_backend 9 | process 10 | replicas: 1 11 | storage: 12 | memory: 13 | cpu: 14 | image: gnes-frontend:Frontend0 15 | preprocessors: 16 | - name: sent_split 17 | port_in: 49972 18 | port_out: 53012 19 | ctrl_port: 20 | grpc_port: 21 | command: preprocess --port_in 49972 --port_out 53012 --socket_in PULL_CONNECT 22 | --port_ctrl 54583 --parallel_backend process --yaml_path services/preprocessors/sent_split/json_mode.yml 23 | replicas: 2 24 | storage: 25 | memory: 26 | cpu: 27 | image: gnes-preprocessor:sent_split 28 | encoders: 29 | - name: text_byte 30 | port_in: 53012 31 | port_out: 54139 32 | ctrl_port: 33 | grpc_port: 34 | command: encode --port_in 53012 --port_out 54139 --socket_in PULL_CONNECT --port_ctrl 35 | 51629 --parallel_backend process --yaml_path services/encoders/text_byte/max_256.yml 36 | replicas: 2 37 | storage: 38 | memory: 39 | cpu: 40 | image: gnes-encoder:text_byte 41 | - name: roberta_infer 42 | port_in: 55961 43 | port_out: 52539 44 | ctrl_port: 45 | grpc_port: 46 | command: encode --port_in 55961 --port_out 52539 --socket_in PULL_CONNECT --port_ctrl 47 | 52568 --parallel_backend process --yaml_path services/encoders/roberta_infer/dim_64.yml 48 | replicas: 2 49 | storage: 50 | memory: 51 | cpu: 52 | image: 
gnes-encoder:roberta_infer 53 | indexers: 54 | - name: keyword 55 | port_in: 54139 56 | port_out: 60943 57 | ctrl_port: 58 | grpc_port: 59 | command: index --port_in 54139 --port_out 60943 --socket_in PULL_CONNECT --port_ctrl 60 | 63670 --parallel_backend process --yaml_path services/indexers/keyword/base.yml 61 | replicas: 2 62 | storage: 63 | memory: 64 | cpu: 65 | image: gnes-indexer:keyword 66 | - name: lvdb 67 | port_in: 60943 68 | port_out: 55961 69 | ctrl_port: 70 | grpc_port: 71 | command: index --port_in 60943 --port_out 55961 --socket_in PULL_CONNECT --port_ctrl 72 | 55890 --parallel_backend process --yaml_path services/indexers/lvdb/base.yml 73 | replicas: 2 74 | storage: 75 | memory: 76 | cpu: 77 | image: gnes-indexer:lvdb 78 | routers: 79 | - name: Reduce 80 | port_in: 52539 81 | port_out: 63152 82 | ctrl_port: 83 | grpc_port: 84 | command: route --port_in 52539 --port_out 63152 --socket_in PULL_CONNECT --socket_out 85 | PUSH_CONNECT --port_ctrl 50250 --parallel_backend process --yaml_path BaseReduceRouter 86 | --num_part 2 87 | replicas: 1 88 | storage: 89 | memory: 90 | cpu: 91 | image: gnes-router:Reduce -------------------------------------------------------------------------------- /flows/factchecking/train/train-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | Frontend00: 4 | image: gnes/gnes:latest-alpine 5 | command: frontend --grpc_port 5566 --port_out 62401 --socket_out PUSH_BIND --port_in 6 | 60753 --socket_in PULL_BIND --host_in Encoder20 --host_out Preprocessor10 7 | ports: 8 | - 5566:5566 9 | Preprocessor10: 10 | image: services/preprocessors:word-split-preprocessor 11 | command: preprocess --port_in 62401 --socket_in PULL_CONNECT --port_out 54470 12 | --socket_out PUSH_CONNECT --yaml_path SentSplitPreprocessor 13 | --host_in Frontend00 --host_out Encoder20 14 | deploy: 15 | replicas: 3 16 | restart_policy: 17 | condition: on-failure 18 | max_attempts: 3 19 
| Encoder20: 20 | image: services/encoders:siamese-bert 21 | command: --port_in 54470 --socket_in PULL_BIND --port_out 60753 --socket_out PUSH_CONNECT 22 | --host_out Frontend00 --host_in Preprocessor10 -------------------------------------------------------------------------------- /flows/yc_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/yc_demo/.DS_Store -------------------------------------------------------------------------------- /flows/yc_demo/docker-compose-temp.yml: -------------------------------------------------------------------------------- 1 | services: 2 | block4: 3 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path block_train.yml 4 | --host_in router3 --port_in 58842 --port_out 55503 5 | ports: [55503:55503] 6 | frontend2: 7 | command: --socket_in PULL_BIND --socket_out PUSH_BIND --port_in 64750 --port_out 8 | 56531 9 | ports: [64750:64750, 56531:56531] 10 | http1: 11 | command: --socket_in RPC_BIND --socket_out RPC_CONNECT --port_in 61501 --host_out 12 | frontend2 --port_out 64750 13 | ports: [61501:61501] 14 | keyword7: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path 15 | base.yml --host_in textbyte6 --port_in 59483 --host_out rerank9 --port_out 64772} 16 | rerank9: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path 17 | base.yml --host_in router8 --port_in 56224 --host_out frontend2 --port_out 64750} 18 | router3: 19 | command: --socket_in PULL_CONNECT --socket_out PUB_BIND --yaml_path BaseRouter 20 | --host_in frontend2 --port_in 56531 --port_out 58842 21 | ports: [58842:58842] 22 | router8: 23 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path BaseRouter 24 | --host_in router3 --port_in 58842 --port_out 56224 25 | ports: [56224:56224] 26 | textbyte6: 27 | command: --socket_in PULL_CONNECT --socket_out 
PUSH_BIND --yaml_path max1024.yml 28 | --host_in unary5 --port_in 64036 --port_out 59483 29 | ports: [59483:59483] 30 | unary5: 31 | command: doc_type=1 --socket_in PULL_CONNECT --socket_out PUSH_BIND --yaml_path 32 | text.yml --host_in block4 --port_in 55503 --port_out 64036 33 | ports: [64036:64036] 34 | version: 3.4 35 | -------------------------------------------------------------------------------- /flows/yc_demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | http: 4 | image: hub-httpclient:latest-http 5 | command: --grpc_host Frontend0 --start_doc_id 1 6 | ports: 7 | - 80:80 8 | Frontend0: 9 | image: gnes/gnes:latest-alpine 10 | command: frontend --port_in 57105 --port_out 65502 --port_ctrl 55166 --parallel_backend 11 | process 12 | Router0: 13 | image: gnes/gnes:latest-alpine 14 | command: route --port_in 65502 --port_out 58609 --socket_in PULL_CONNECT --socket_out 15 | PUB_BIND --port_ctrl 49407 --parallel_backend process --yaml_path BaseRouter 16 | --host_in Frontend0 17 | block: 18 | image: hub-router:latest-block 19 | command: --port_in 58609 --port_out 53283 --socket_in SUB_CONNECT --port_ctrl 20 | 52423 --parallel_backend process --yaml_path block_train.yml --host_in Router0 21 | unary: 22 | image: hub-preprocessor:latest-unary 23 | command: --port_in 53283 --port_out 51714 --socket_in PULL_CONNECT --port_ctrl 24 | 55377 --parallel_backend process --yaml_path text.yml --host_in block 25 | textbyte: 26 | image: hub-encoder:latest-textbyte 27 | command: --port_in 51714 --port_out 62690 --socket_in PULL_CONNECT --port_ctrl 28 | 57360 --parallel_backend process --yaml_path max1024.yml --host_in unary 29 | # --socket_out PUB_BIND # FOR INDEXING 30 | # whoosh: 31 | # image: hub-indexer:latest-whoosh 32 | # command: --port_in 62690 --port_out 57105 --socket_in SUB_CONNECT --port_ctrl 33 | # 60258 --parallel_backend process --yaml_path base.yml --host_in textbyte 34 | # 
from koursaros.gnes_addons import Flow


# yc demo flow: http client -> frontend -> routers -> preprocessor ->
# encoder -> two chained indexers -> rerank router -> back to frontend.
flow = (
    Flow(with_frontend=False)
    .add_http_client(name='http')
    .add_frontend(copy_flow=False)
    .add_router(yaml_path='BaseRouter')
    .add_router(name='block', yaml_path='block_train.yml')
    .add_preprocessor(name='unary', yaml_path='text.yml', doc_type=1)
    .add_encoder(name='textbyte', yaml_path='max1024.yml')
    .add_indexer(name='whoosh', yaml_path='base.yml')
    .add_indexer(name='simple_dict', yaml_path='base.yml')
    .add_router(yaml_path='BaseRouter', recv_from=['Router0'])
    # Fixed: rerank previously received from 'rocksdb', a service that is not
    # defined anywhere in this flow — the key-value indexer above is named
    # 'simple_dict' (the hand-written docker-compose.yml calls the service
    # 'rocksdb' but builds it from the hub-indexer:latest-simple_dict image).
    .add_router(name='rerank', yaml_path='base.yml', recv_from=['simple_dict', 'Router1'])
)
{{- define "statefulset" -}}
{{- $name := printf "%s-%s" .app .model -}}
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ $name }}
spec:
  replicas: {{ .replicas }}
  selector:
    matchLabels:
      app: {{ $name }}
  volumeClaimTemplates:
  - metadata:
      name: {{ $name }}
    spec:
      accessModes:
      - ReadWriteOnce
      {{- if .storage }}
      resources:
        requests:
          storage: {{ .storage }}
      {{- end }}
  template:
    metadata:
      labels:
        app: {{ $name }}
    spec:
      containers:
      - name: {{ $name }}
        image: {{ .image }}
        args:
        {{- range .command }}
        - {{ . | quote }}
        {{- end }}
        imagePullPolicy: null
        ports:
        {{- if .port_in }}
        - name: in
          containerPort: {{ .port_in }}
          protocol: TCP
        {{- end }}
        {{- if .port_out }}
        - name: out
          containerPort: {{ .port_out }}
          protocol: TCP
        {{- end }}
        {{- if .grpc_port }}
        - name: grpc
          containerPort: {{ .grpc_port }}
          protocol: TCP
        {{- end }}
        {{- /* fixed: guard was `.grpc_port` (copy-paste from the grpc block
               above), so the ctrl port was only emitted when a grpc port was
               set. NOTE(review): values.yaml spells this key `ctrl_port`
               while the container reads `.port_ctrl` — confirm which key is
               canonical. */}}
        {{- if .port_ctrl }}
        - name: ctrl
          containerPort: {{ .port_ctrl }}
          protocol: TCP
        {{- end }}
        resources:
          requests:
            {{- if .cpu }}
            cpu: {{ .cpu }}
            {{- end }}
            {{- if .memory }}
            memory: {{ .memory }}
            {{- end }}
{{- if .resources -}}
{{- toYaml .resources | nindent 8 -}}
{{- end -}}
{{- end -}}
| - '49407' 59 | - --parallel_backend 60 | - process 61 | - --yaml_path 62 | - BaseRouter 63 | replicas: 1 64 | storage: 500Mi 65 | memory: 500Mi 66 | cpu: 300m 67 | image: gnes/gnes:latest-alpine 68 | - name: block 69 | app: router 70 | model: block 71 | port_in: 58609 72 | port_out: 53283 73 | ctrl_port: 74 | grpc_port: 75 | command: 76 | - --port_in 77 | - '58609' 78 | - --port_out 79 | - '53283' 80 | - --socket_in 81 | - SUB_CONNECT 82 | - --port_ctrl 83 | - '52423' 84 | - --parallel_backend 85 | - process 86 | - --yaml_path 87 | - block_train.yml 88 | replicas: 1 89 | storage: 500Mi 90 | memory: 500Mi 91 | cpu: 300m 92 | image: hub-router:latest-block 93 | - name: Router1 94 | app: router 95 | model: base 96 | port_in: 58609 97 | port_out: 62155 98 | ctrl_port: 99 | grpc_port: 100 | command: 101 | - route 102 | - --port_in 103 | - '58609' 104 | - --port_out 105 | - '62155' 106 | - --socket_in 107 | - SUB_CONNECT 108 | - --socket_out 109 | - PUSH_CONNECT 110 | - --port_ctrl 111 | - '50381' 112 | - --parallel_backend 113 | - process 114 | - --yaml_path 115 | - BaseRouter 116 | replicas: 1 117 | storage: 500Mi 118 | memory: 500Mi 119 | cpu: 300m 120 | image: gnes/gnes:latest-alpine 121 | - name: rerank 122 | app: router 123 | model: rerank 124 | port_in: 62155 125 | port_out: 57105 126 | ctrl_port: 127 | grpc_port: 128 | command: 129 | - --port_in 130 | - '62155' 131 | - --port_out 132 | - '57105' 133 | - --socket_out 134 | - PUSH_CONNECT 135 | - --port_ctrl 136 | - '56641' 137 | - --parallel_backend 138 | - process 139 | - --yaml_path 140 | - base.yml 141 | replicas: 1 142 | storage: 500Mi 143 | memory: 500Mi 144 | cpu: 300m 145 | image: hub-router:latest-rerank 146 | preprocessor: 147 | - name: unary 148 | app: preprocessor 149 | model: unary 150 | port_in: 53283 151 | port_out: 51714 152 | ctrl_port: 153 | grpc_port: 154 | command: 155 | - --port_in 156 | - '53283' 157 | - --port_out 158 | - '51714' 159 | - --socket_in 160 | - PULL_CONNECT 161 | - --port_ctrl 
162 | - '55377' 163 | - --parallel_backend 164 | - process 165 | - --yaml_path 166 | - text.yml 167 | replicas: 1 168 | storage: 500Mi 169 | memory: 500Mi 170 | cpu: 300m 171 | image: hub-preprocessor:latest-unary 172 | encoder: 173 | - name: textbyte 174 | app: encoder 175 | model: textbyte 176 | port_in: 51714 177 | port_out: 62690 178 | ctrl_port: 179 | grpc_port: 180 | command: 181 | - --port_in 182 | - '51714' 183 | - --port_out 184 | - '62690' 185 | - --socket_in 186 | - PULL_CONNECT 187 | - --port_ctrl 188 | - '57360' 189 | - --parallel_backend 190 | - process 191 | - --yaml_path 192 | - max1024.yml 193 | replicas: 1 194 | storage: 500Mi 195 | memory: 500Mi 196 | cpu: 300m 197 | image: hub-encoder:latest-textbyte 198 | indexer: 199 | - name: keyword 200 | app: indexer 201 | model: keyword 202 | port_in: 62690 203 | port_out: 62155 204 | ctrl_port: 205 | grpc_port: 206 | command: 207 | - --port_in 208 | - '62690' 209 | - --port_out 210 | - '62155' 211 | - --socket_in 212 | - PULL_CONNECT 213 | - --socket_out 214 | - PUSH_CONNECT 215 | - --port_ctrl 216 | - '60258' 217 | - --parallel_backend 218 | - process 219 | - --yaml_path 220 | - base.yml 221 | replicas: 1 222 | storage: 500Mi 223 | memory: 500Mi 224 | cpu: 300m 225 | image: hub-indexer:latest-keyword -------------------------------------------------------------------------------- /flows/yc_demo/index.k: -------------------------------------------------------------------------------- 1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD 2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 | 3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend 4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route 5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: | 6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: | 7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUB: | 8 | 7 | indexer | whoosh | 1 | base.yml | SUB:6 | PUSH:2 | 9 | 8 | indexer | rocksdb | 1 | base.yml | SUB:6 | PUSH:2 | 10 | 
-------------------------------------------------------------------------------- /flows/yc_demo/query.k: -------------------------------------------------------------------------------- 1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD 2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 | 3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend 4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route 5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: | 6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: | 7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUSH: | 8 | 7 | indexer | whoosh | 1 | base.yml | PULL:6 | PUSH: | 9 | 8 | indexer | rocksdb | 1 | base.yml | PULL:7 | PUB:9 | 10 | 9 | router | rerank | 1 | base.yml | SUB: | PUSH:2 | 11 | 10| router | block | 1 | block_query.yml | SUB:3 | PUB:9 | 12 | -------------------------------------------------------------------------------- /koursaros/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/__init__.py -------------------------------------------------------------------------------- /koursaros/chart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /koursaros/chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /koursaros/chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Deployed flow! -------------------------------------------------------------------------------- /koursaros/chart/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessor .encoder .indexer .router }} 4 | {{- range . 
{{- define "statefulset" -}}
{{- $name := printf "%s-%s" .app .model -}}
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ $name }}
spec:
  replicas: {{ .replicas }}
  selector:
    matchLabels:
      app: {{ $name }}
  volumeClaimTemplates:
  - metadata:
      name: {{ $name }}
    spec:
      accessModes:
      - ReadWriteOnce
      {{- if .storage }}
      resources:
        requests:
          storage: {{ .storage }}
      {{- end }}
  template:
    metadata:
      labels:
        app: {{ $name }}
    spec:
      containers:
      - name: {{ $name }}
        image: {{ .image }}
        args:
        {{- range .command }}
        - {{ . | quote }}
        {{- end }}
        imagePullPolicy: null
        ports:
        {{- if .port_in }}
        - name: in
          containerPort: {{ .port_in }}
          protocol: TCP
        {{- end }}
        {{- if .port_out }}
        - name: out
          containerPort: {{ .port_out }}
          protocol: TCP
        {{- end }}
        {{- if .grpc_port }}
        - name: grpc
          containerPort: {{ .grpc_port }}
          protocol: TCP
        {{- end }}
        {{- /* fixed: guard was `.grpc_port` (copy-paste from the grpc block
               above), so the ctrl port was only emitted when a grpc port was
               set. NOTE(review): values.yaml spells this key `ctrl_port`
               while the container reads `.port_ctrl` — confirm which key is
               canonical. */}}
        {{- if .port_ctrl }}
        - name: ctrl
          containerPort: {{ .port_ctrl }}
          protocol: TCP
        {{- end }}
        resources:
          requests:
            {{- if .cpu }}
            cpu: {{ .cpu }}
            {{- end }}
            {{- if .memory }}
            memory: {{ .memory }}
            {{- end }}
{{- if .resources -}}
{{- toYaml .resources | nindent 8 -}}
{{- end -}}
{{- end -}}
from .manager import AppManager
from .deploy import deploy
from .test import test
from .show import show
from .build import build
import click


@click.group()
@click.pass_context
def kctl(ctx):
    """
    kctl controls the \033[1;3;4;34mKoursaros\033[0m platform.
    Find more information at: https://github.com/koursaros-ai/koursaros
    """
    # Shared application state, handed to every subcommand via @click.pass_obj.
    ctx.obj = AppManager()


# Register all top-level subcommands on the root group.
for _command in (deploy, test, show, build):
    kctl.add_command(_command)


def main():
    # Use the package name (rather than sys.argv[0]) as the program name
    # shown in --help output.
    kctl(prog_name=__package__)


if __name__ == "__main__":
    main()
""" 19 | 20 | if push: 21 | if creds is None: 22 | raise ValueError('--creds repository must be specified if pushing') 23 | 24 | hub_creds = get_creds(creds).dockerhub 25 | app_manager.call('docker login -u %s -p %s' % ( 26 | hub_creds.username, hub_creds.password), shell=True) 27 | 28 | # app_manager.call('eval $(minikube docker-env)', shell=True) 29 | 30 | _flow = app_manager.get_flow(flow_path) 31 | 32 | for service in _flow.services.values(): 33 | if '/' not in service['image']: 34 | path = str(app_manager.find_model(service['app'], service['model'])) 35 | tag = service['image'] 36 | app_manager.logger.critical('Building %s from %s...' % (tag, path)) 37 | cache = '--no-cache ' if service.get('name', None) in no_caches else '' 38 | _build = 'docker build ' + cache + '-t %s %s' % (tag, path) 39 | app_manager.call(_build, shell=True) 40 | 41 | if push: 42 | app_manager.logger.critical('Pushing %s...' % tag) 43 | app_manager.call('docker push %s/%s' % (push, tag), shell=True) 44 | 45 | """save swarm yaml""" 46 | _flow.swarm() 47 | # app_manager.logger.critical('Saved swarm yaml to %s' % str(out_path)) 48 | 49 | """save helm chart""" 50 | # out_path = _flow.path.parent.joinpath('helm') 51 | # rmtree(str(out_path), ignore_errors=True) 52 | # copytree(str(app_manager.pkg_root.joinpath('chart')), str(out_path)) 53 | # _flow.path.parent.joinpath('helm/values.yaml').write_text(helm_yaml) 54 | # app_manager.logger.critical('Saved helm chart to %s' % str(out_path)) -------------------------------------------------------------------------------- /koursaros/cli/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import click 3 | from tqdm import tqdm 4 | import time 5 | import importlib.util 6 | 7 | 8 | @click.group() 9 | def deploy(): 10 | """Deploy gnes services.""" 11 | 12 | 13 | @deploy.group() 14 | def flow(): 15 | """Deploy a pipeline with compose or k8s. 
""" 16 | 17 | 18 | deploy.add_command(flow) 19 | 20 | 21 | @flow.command() 22 | @click.argument('flow_path') 23 | @click.pass_obj 24 | def compose(app_manager, flow_path): 25 | path = app_manager.get_flow(flow_path).path.parent.joinpath('docker-compose.yml') 26 | down = 'docker-compose -f %s down' % str(path) 27 | app_manager.call(down, shell=True) 28 | up = 'docker-compose -f %s up' % str(path) 29 | app_manager.call(up, shell=True) 30 | 31 | 32 | @flow.command() 33 | @click.argument('flow_name') 34 | @click.pass_obj 35 | def swarm(app_manager, flow_name): 36 | path = app_manager.get_flow(flow_name).path.parent.joinpath('docker-compose.yml') 37 | rm = 'docker stack rm %s' % flow_name 38 | app_manager.call(rm, shell=True) 39 | app_manager.logger.critical('Waiting for docker network resources...') 40 | [time.sleep(0.15) for _ in tqdm(range(100))] 41 | stack = 'docker stack deploy --compose-file %s %s' % (str(path), flow_name) 42 | app_manager.call(stack, shell=True) 43 | 44 | 45 | @flow.command() 46 | @click.argument('flow_name') 47 | @click.option('-d', '--dryrun', is_flag=True) 48 | @click.pass_obj 49 | def k8s(app_manager, flow_name, dryrun): 50 | path = app_manager.get_flow(flow_name).path.parent.joinpath('helm') 51 | purge = 'helm delete --purge $(helm ls --all --short)' 52 | app_manager.call(purge, shell=True) 53 | install = 'helm install ' + ('--dry-run --debug ' if dryrun else '') + str(path) 54 | app_manager.call(install, shell=True) 55 | 56 | 57 | @deploy.command(context_settings=dict( 58 | ignore_unknown_options=True, 59 | allow_extra_args=True)) 60 | @click.argument('client_name') 61 | @click.pass_context 62 | def client(ctx, client_name): 63 | """Deploy a client. 
""" 64 | app_manager = ctx.obj 65 | path = app_manager.find_model('client', client_name).joinpath('client.py') 66 | if not path.exists(): 67 | raise FileNotFoundError('Could not find %s' % path) 68 | spec = importlib.util.spec_from_file_location(client_name, path) 69 | module = importlib.util.module_from_spec(spec) 70 | spec.loader.exec_module(module) 71 | module.Client(*ctx.args).run() 72 | 73 | 74 | -------------------------------------------------------------------------------- /koursaros/cli/manager.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from gnes.helper import set_logger 4 | from importlib import machinery 5 | from koursaros.flow import Flow 6 | from pathlib import Path 7 | from typing import List 8 | import subprocess 9 | import git 10 | import os 11 | 12 | 13 | class AppManager: 14 | """Manager that keeps track of all of the koursaros 15 | paths and packages. Passed around at runtime to make 16 | things more efficient. 17 | 18 | :param dev: run on local koursaros repo 19 | """ 20 | 21 | def __init__(self): 22 | self.git_root = Path(git.Repo( 23 | '.', search_parent_directories=True).working_tree_dir) 24 | self.pkg_root = Path(__file__).parent.parent 25 | 26 | self.logger = set_logger('kctl') 27 | self.cache = self.git_root.joinpath('.k') 28 | self.cache.mkdir(exist_ok=True) 29 | 30 | def call(self, cmd: List[str], shell=False): 31 | string = cmd if shell else ' '.join(cmd) 32 | self.logger.critical('subprocess.call: "%s"' % string) 33 | subprocess.call(cmd, shell=shell) 34 | 35 | @staticmethod 36 | def check_exists(path: 'Path'): 37 | if not path.exists(): 38 | raise FileNotFoundError(path.absolute()) 39 | 40 | def find_model(self, app: str, model: str) -> 'Path': 41 | path = self.pkg_root.joinpath('hub', app, model) 42 | self.check_exists(path) 43 | return path 44 | 45 | def get_flow(self, path) -> 'Flow': 46 | path = Path(path) 47 | self.check_exists(path) 48 | return Flow(path) 49 | 
import webbrowser
import click


@click.group()
def show():
    """Show gnes architecture."""


@show.command()
@click.argument('flow_path')
@click.pass_obj
def flow(app_manager, flow_path):
    """Open the flow's architecture diagram in a browser.

    Fixed: the docstring previously read "Deploy a pipeline with compose or
    k8s." — copy-pasted from the deploy command; click displays it as the
    help text for `kctl show flow`.

    :param app_manager: AppManager injected by the kctl root group
    :param flow_path: path to the flow definition (.k) file
    """
    url = app_manager.get_flow(flow_path).mermaid_url

    try:
        webbrowser.open_new_tab(url)
    except webbrowser.Error as ex:
        # Headless / no-browser environments: log the URL instead of failing.
        app_manager.logger.critical(
            '%s\nCould not open browser... Please visit:\n%s' % (ex, url))
def decorator_group(decorators):
    """Bundle several decorators into a single one.

    :param decorators: iterable of decorators, applied in iteration order
        (so the *last* decorator ends up outermost)
    :return: one decorator equivalent to applying each in turn

    Example:
        deploy_options = decorator_group([
            click.option('-c', '--connection', required=True),
            click.option('-r', '--rebind', is_flag=True),
            click.option('-d', '--debug', is_flag=True),
        ])
    """
    def apply_all(func):
        for wrap in decorators:
            func = wrap(func)
        return func
    return apply_all
class Flow:
    """Parsed representation of a .k flow file.

    Each non-comment line describes one service (see parse_line for the
    column format); services are wired together through bound-port ids.
    """

    def __init__(self, path: 'Path'):
        self.services = dict()
        # bound-port id -> ids of services connecting in ('ins') / out ('outs')
        self.ports = defaultdict(
            lambda: {'ins': set(), 'outs': set()})
        self.path = path
        self.lines = []
        # Shuffled pool of unique local port numbers handed out per service.
        self.p = list(range(53001, 65001))
        random.shuffle(self.p)

        with Path(path).open() as fh:
            for line in fh:
                self.add_line(line)

    def add_line(self, line: str):
        """Parse one .k line and register its service ('#' comment lines are skipped)."""
        if not line.strip().startswith('#'):
            self.lines += [line]
            service = parse_line(line)
            self._add_service(service)

    def _add_service(self, s: dict):
        """Record the service, its port links, its display name and two local ports."""
        in_id = s['i'][1]
        if in_id:
            self.ports[in_id]['outs'].add(s['id'])

        out_id = s['o'][1]
        if out_id:
            self.ports[out_id]['ins'].add(s['id'])

        s['name'] = s['model'] + str(s['id']) if s['model'] else s['app'] + str(s['id'])
        s['local_in'] = self.p.pop()
        s['local_out'] = self.p.pop()
        self.services[s['id']] = s

    def swarm(self):
        """Write a docker-compose v3.4 file describing the flow.

        NOTE(review): the file is written to the current working directory,
        not next to self.path — confirm callers (kctl build) run from there.
        """
        y = {'version': '3.4', 'services': {}}
        for s in self.services.values():
            new = dict(volumes=['./.cache:/workspace'], image=s['image'])
            new['command'] = [s['command']] if s['command'] else []
            in_id = s['i'][1]
            out_id = s['o'][1]

            if s['app'] != 'httpclient':

                new['command'] += ['--socket_in', s['i'][0], '--socket_out', s['o'][0]]

                if s['yaml_path']:
                    new['command'] += ['--yaml_path', s['yaml_path']]

                # if connecting in: target the peer's bound out-port
                if in_id:
                    new['command'] += ['--host_in', self.services[in_id]['name']]
                    new['command'] += ['--port_in', self.services[in_id]['local_out']]
                # if binding in
                else:
                    new['command'] += ['--port_in', s['local_in']]

                # if connecting out: target the peer's bound in-port
                if out_id:
                    new['command'] += ['--host_out', self.services[out_id]['name']]
                    new['command'] += ['--port_out', self.services[out_id]['local_in']]
                # if binding out
                else:
                    new['command'] += ['--port_out', s['local_out']]

            else:
                # http clients only need the frontend's grpc host
                new['ports'] = ['80:80']
                new['command'] += ['--grpc_host', self.services[out_id]['name']]

            new['command'] = ' '.join([str(x) for x in new['command']])
            y['services'][s['name']] = new

        # Fixed: was `YAML().dump(y, open('docker-compose.yml', 'w'))`, which
        # leaked the file handle and relied on GC for the final flush.
        with open('docker-compose.yml', 'w') as out:
            YAML().dump(y, out)

    @property
    def mermaid_url(self):
        """URL of a mermaid live-editor view of the flow graph (base64 payload)."""
        app_colors = dict(
            httpclient=('#FFE0E0', '#000', '1px'),
            frontend=('#FFE0E0', '#000', '1px'),
            router=('#C9E8D2', '#000', '1px'),
            encoder=('#FFDAAF', '#000', '1px'),
            preprocessor=('#CED7EF', '#000', '1px'),
            indexer=('#FFFBC1', '#000', '1px'),
        )

        lines = ['graph TD']
        for cls, fmt in app_colors.items():
            lines += ['classDef {} fill:{},stroke:{},stroke-width:{};'.format(cls, *fmt)]

        def edge(left_s, right_s):
            # One mermaid edge labelled with the out/in socket types.
            return ['{ln}--{lt}-{rt}-->{rn}'.format(
                ln=left_s['name'],
                lt=left_s['o'][0],
                rt=right_s['i'][0],
                rn=right_s['name']
            )]

        for bound_id, port in self.ports.items():
            bound_s = self.services[bound_id]
            # lines += ['subgraph %s' % bound_s['name']]

            for conn_id in port['ins']:
                conn_s = self.services[conn_id]
                lines += edge(conn_s, bound_s)

            for conn_id in port['outs']:
                conn_s = self.services[conn_id]
                lines += edge(bound_s, conn_s)

            # lines += ['end']

        for s in self.services.values():
            lines += ['class {} {};'.format(s['name'], s['app'])]

        return 'https://mermaidjs.github.io/mermaid-live-editor/#/view/' + b64encode('\n'.join(lines).encode()).decode()
import os

# Supported serialization formats for rows streamed downstream.
VALID_MODES = ['json', 'raw']


class PostgresClient(CLIClient):
    """CLI client that streams rows from a postgres table as byte payloads.

    Expects ``args`` to carry: ``creds`` (credential repo spec), ``table``,
    ``id_column``, ``data_columns``, ``send_type`` ('json' or 'raw') and
    ``limit`` (-1 for unlimited).
    """

    @property
    def bytes_generator(self):
        """Yield one encoded payload per table row, ordered by the id column.

        Raises ValueError (logged, not propagated) when ``send_type`` is
        unknown or the id column is not a 1-based incremental id.
        """
        try:
            args = self.args
            creds = get_creds(args.creds)

            psql = creds.postgres
            # libpq reads its SSL configuration from these env vars.
            os.environ['PGSSLMODE'] = psql.sslmode
            os.environ['PGSSLROOTCERT'] = psql.sslrootcert.path

            columns = ', '.join([args.id_column] + args.data_columns)
            query = '''SELECT %s FROM %s''' % (columns, args.table)
            query += ' ORDER BY %s ASC' % args.id_column
            query += ' LIMIT %d' % args.limit if args.limit > 0 else ''

            # Validate the mode up front, before opening a connection.
            if args.send_type not in VALID_MODES:
                raise ValueError('"mode" parameter must be one of %s' % VALID_MODES)

            connection = psycopg2.connect(user=psql.username,
                                          password=psql.password,
                                          host=psql.host,
                                          port=psql.port,
                                          dbname=psql.dbname)
            cursor = connection.cursor()
            cursor.execute(query)

            for i, (_id, *row) in enumerate(cursor):
                msg_id = i + 1
                if msg_id != _id:
                    raise ValueError(
                        '"%s" column must be an incremental id starting from 1. '
                        'Got id %s for row %s' % (args.id_column, _id, msg_id))

                if args.send_type == 'json':
                    # BUG FIX: the original did json.dumps(zip(columns, row)) —
                    # a zip object is not JSON-serializable, and `columns` is
                    # the joined *string*, so zip would pair characters with
                    # values. Build a real dict keyed by the data column names.
                    yield json.dumps(dict(zip(args.data_columns, row))).encode()
                elif args.send_type == 'raw':
                    yield ''.join(row).encode()

        except Exception:
            # Narrowed from a bare `except:` so GeneratorExit/KeyboardInterrupt
            # are no longer swallowed; log the full traceback for diagnosis.
            self.logger.error(traceback.format_exc())

    def query_callback(self, req, resp):
        # Hook invoked by CLIClient for query responses; currently log-only.
        self.logger.info(req, resp)


if __name__ == '__main__':
    parser = set_client_cli_parser()
    parser.add_argument('--limit', type=int, help='number of postgres rows (-1 for unlimited)')
    cred_repo_help = 'cred repo set up according to git:koursaros-ai/koursaros.credentials spec'
    parser.add_argument('--creds', type=str, required=True, help=cred_repo_help)
    parser.add_argument('--yaml_path', type=str)
    cli_args = parser.parse_args()
    # Merge yaml `parameters` into the parsed CLI namespace.
    yaml = TrainableBase.load_yaml(cli_args.yaml_path)
    for k, v in yaml['parameters'].items():
        setattr(cli_args, k, v)
    PostgresClient(cli_args)
import requests
import pathlib
import csv
import json

HEADERS = {'Content-Type': 'application/json'}
MODES = ['index', 'train', 'query']


class Client:
    """Reads rows from a CSV file and POSTs them to a local gateway.

    `mode` selects both the endpoint (/index, /train or /query) and the
    per-row body builder (the method of the same name). Construction
    immediately streams the file.
    """

    def __init__(self, mode, path, limit=None):
        self.path = pathlib.Path(path)
        self.csv = csv.DictReader(self.path.open())
        self.mode = mode
        self.limit = limit  # max number of CSV rows to send; None = all
        if mode not in MODES:
            raise ValueError('%s is not valid. Please choose one of %s' % (mode, MODES))

        self.iter_csv(getattr(self, mode))

    def post(self, data):
        """POST `data` to the endpoint for the current mode; store the result."""
        print('Posting:', data)
        response = requests.post('http://localhost:80/%s' % self.mode, data=data, headers=HEADERS)
        res = json.loads(response.content)
        if 'res' in res:
            self.result = json.loads(res['res'][0])
        else:
            self.result = res
        print('Returned:', self.result)

    def iter_csv(self, get_body_from_row):
        """Build one body per CSV row (up to `limit`) and POST them joined by newlines.

        BUG FIX: the original appended the row *before* checking the limit and
        compared with strict '>', so up to limit + 2 rows were sent; the limit
        is now enforced before each append.
        """
        to_send = []
        for i, row in enumerate(self.csv):
            if self.limit is not None and i >= self.limit:
                break
            to_send.append(get_body_from_row(row))
        self.post('\n'.join(to_send).encode())

    def index(self, row):
        # Second CSV column is the document body; remaining columns ride along.
        body = list(row.values())[1]
        req = dict(data=body)
        req.update(row)
        return json.dumps(req, ensure_ascii=False)

    def train(self, row):
        return json.dumps(row, ensure_ascii=False)

    def query(self, row):
        # First CSV column is the query text.
        return list(row.values())[0]

    def query_one(self, text):
        """Run a single ad-hoc query string and return the top hit's text."""
        self.mode = 'query'
        self.post(text.encode())
        return self.text()

    def text(self):
        # Drill into the gateway response for the top-ranked chunk's text.
        return self.result['search']['topkResults'][0]['doc']['chunks'][0]['text']
com stk npv,banco latinoamericano come-e, 11 | 9,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc spons ads repr 0.10 ord cls a us0.00005, 12 | 10,whole foods market,whole foods markets inc div: 0.540, 13 | 11,walgreens boots alliance inc com,walgreens boots alli, 14 | 12,diageo plc new gb spon adr,diageo p l c spon adr new, 15 | 13,guggenheim bulletshares 2016,guggenheim bulletshares 2016 high yield, 16 | 14,vanguard small-cap index adm,vanguard small-cap index fund inst, 17 | 15,emerging markets,vanguard ftse emerging marke, 18 | 16,spdr s&p 500 etf iv,s&p 500 index spdr, 19 | 17,tegna inc com,tegna inc, 20 | 18,deere & company,deere co, 21 | 19,vanguard mid-cap index,vanguard mid-cap index fund institutional plus shares, 22 | 20,jpmorgan chase & co div: 1.760,jpmorgan chase & co, 23 | 21,american funds europacific growth fund - r6,af europac growth r6, 24 | 22,vanguard total bond market idx-adm,vang tot bd mk is pl, 25 | 23,unitedhealth gp inc div: 2.000,unitedhealth group incorporated, 26 | 24,american intl group inc warrant 01/19/2021,american intl gro 21 wtswarrants exp 01/19/21, 27 | 25,fifth street finance corp com,fifth street financial corp com, 28 | 26,ishares jpm embi global core,ishares jpm usd emrg mkt bnd etf, 29 | 27,metwest tot rtn bd m,metropolitan west tot ret bond, 30 | 28,exelixis inc com,exelixis inc, 31 | 29,glenmede large cap gwth,glenmede large cap growth, 32 | 30,af europac growth r6,american funds europacific growth r6, 33 | 31,dreamworks animation skg cl a,dreamworks animation skg inc cl a, 34 | 32,dfa us small cap value port instl,dfa u s small cap value cl i, 35 | 33,vanguard ltd-trm t/e adm,vanguard limited-term tax-exempt fund, 36 | 34,ishares trust msci united kingdom etf,ishares msci u k etf shs, 37 | 35,pimco total return cl a,pimco total return fund adm, 38 | 36,amg yacktman fund service class,amg yacktman service, 39 | 37,vanguard intermediate-term treasury fund admiral shares,vang intm treas adm, 40 | 38,pimco incm cl d,pimco 
fds income instl, 41 | 39,vanguard growth index fund investor shares,vang growth idx adm, 42 | 40,vanguard mid cap index,vanguard mid-cap index fund admiral shares, 43 | 41,vanguard value index fund institutional shares,vanguard value index inv, 44 | 42,vanguard target retirement 2060 fund,vanguard tgt rtrmnt 2060, 45 | 43,netflix inc,netflix inc., 46 | 44,cisco sys inc com,cisco sys inc, 47 | 45,pimco income fund cl p,pimco income fd i, 48 | 46,united parcel service-cl b,united parcel svc inc cl b, 49 | 47,michael kors holdings ltd com npv,michael kors hldgs ltd, 50 | 48,alaska air group inc com,alaska air group inc, 51 | 49,vanguard total bond market index adm,vanguard ttl bnd mrk indx inst, 52 | 50,vanguard intermediate-term investment-grade fund admiral shares,vanguard intermediate term inv gr fd inv cl, 53 | 51,oppenhmr develpng mkts y,oppenheimer developing markets cl y, 54 | 52,texas instrs incorporated,texas instruments inc, 55 | 53,sptn intl index fai,spartan intl index fid adv class, 56 | 54,heartland pmt sys inc com,heartland paymnt sys, 57 | 55,vanguard total bond market index fund institutional shares,vanguard total bond market index, 58 | 56,ariel fund inv,ariel fund, 59 | 57,flir sys inc,flir systems inc, 60 | 58,pimco income fund cl d,pimco income instl, 61 | 59,vanguard shortterm investgrade adm,vanguard shrt trm invmnt grd-inv, 62 | 60,bristol myers squibb company,bristol myers squibb co, 63 | 61,metlife inc,metlife incorporated div: 1.500, 64 | 62,nxp semiconductors n v com,nxp semiconductors nv, 65 | 63,novo nordisk a/s-adr nvorepstg 1/2 cl b sh,novo-nordisk a-s fadr 1 adr reps 1 ord shs, 66 | 64,vanguard total bond market index fund institutional shares,vanguard total bond market idx instl pls, 67 | 65,accenture ltd ord,accenture plc ireland, 68 | 66,pimco total ret fd instl,pimco total return fund cl p, 69 | 67,pimco income a,pimco incm inst cl, 70 | 68,t. rowe price institutional large cap growth,t. 
rowe price institutional large cap growth fund, 71 | 69,hsbc hldgs plc spons adr new,hsbc hldgs plc spon adr new, 72 | 70,carnival corp ord (panama),carnival corp com, 73 | 71,baidu inc spons ads repr 0.10 ord cls a us0.00005,baidu inc - spon adr, 74 | 72,vanguard mid-cap index fund institutional shares,vanguard mid cap index ins, 75 | 73,ishares iboxx $ invt grade corp bd,ishares iboxx ig corp bond, 76 | 74,ultimate software group inc,ultimate software gp, 77 | 75,pimco income fund cl d,pimco income a, 78 | 76,franklin income series cl a,franklin incm fd cl a, 79 | 77,ormat technologies inc,ormat technologies, 80 | 78,aqr mgd futures strat fd cl i,aqr aqr mngd futures strategy i, 81 | 79,ishares russell midcap growth,ishares russell midcap g etf div: 0.903, 82 | 80,vanguard target retirement 2045 fund,vang target ret 2045, 83 | 81,vanguard total intl stk,vanguard total intl etf, 84 | 82,sptn inter treas bnd investor class,sptn int tr idx adv, 85 | 83,artisan intl value fund inv,artisan international value, 86 | 84,"stag industrials, inc. 
com",stag industrial inc com, 87 | 85,parnassus endeavor fd,parnassus endeavor fund investor shares, 88 | 86,johnson & johnson div: 3.000,johnson & johnson jnj, 89 | 87,eaton vance floating rate fd cl a,eaton vance bond fund cl i, 90 | 88,vanguard total bond index adm,vanguard ttl bnd mrk indx inst, 91 | 89,pimco total ret fd instl,pimco total return fund - class r, 92 | 90,pimco total return fund cl d,pimco tot return adm, 93 | 91,tivo inc com,tidewater inc com new, 94 | 92,zimmer biomet hldgs,zimmer biomet holdings inc com, 95 | 93,ford mtr company del com par $0.01,ford mtr co del com par $0.01, 96 | 94,guggenheim bullet shrs 2018 hi y c bd etf,guggenheim bulletshares 2018, 97 | 95,apple inc com,apple incorporated, 98 | 96,ishares jpm usd emr etf,ishares jpm usd emrg mkt bnd e tf, 99 | 97,edison international cmn,edison intl, 100 | 98,conagra foods inc div: 1.000,conagra foods inc, 101 | 99,advanced micro devices,advanced micro devices inc, 102 | 100,american tower corporation reit,american tower reit inc (hldg co) shs, 103 | 101,vang sm cap idx adm,vanguard small cap index fund, 104 | 102,vanguard short-term bond index fund investor shares,vanguard short-term bond index, 105 | 103,"vanguard small cap index, adm",vnguard index trust small cap idx instl, 106 | 104,ishares jpm usd emrg mkt bnd e tf,ishares jpm embi global core, 107 | 105,blackrock strat income i,blackrock strategic income opptys investor cl a, 108 | 106,ishares russell midcap,ishares russell mid-cap etf, 109 | 107,ishares core msci emg mkts etf,harding loevner emerging mkts, 110 | 108,vanguard intl equity index fds ftse emerging mkts etf,vanguard ftse emerging mark etf iv, 111 | 109,american funds europacfic r5,american euro pac gr r5, 112 | 110,vanguard crsp us small cap ind ex,vanguard small cap etf, 113 | 111,delta air lines inc dela new,delta air lines inc. 
(de), 114 | 112,ishares 20+ yr treasu bond etf div: 3.107,ishares 20+ year treasury bo, 115 | 113,sptn glb xus idx fai,spartan global ex us index fid adv cl, 116 | 114,fidelity new insights i,fidelity advisor new insights fund cl i, 117 | 115,ishares tr nat amt free bd,ishares tr natl mun bd etf fd, 118 | 116,vanguard small cap index adm,vanguard small-cap index fund institutional shares, 119 | 117,first eagle global fd cl a,first eagle global class a, 120 | 118,t rowe price mid cap growth,mid-cap growth fund, 121 | 119,skyworks solutions,skyworks solutions inc com, 122 | 120,tile shop hldgs inc,tile shop hldg inc, 123 | 121,t. rowe price health sciences,t rowe price health science fund inc, 124 | 122,coca-cola co/the,coca-cola company, 125 | 123,atwood oceanics inc com,atwood oceanics inc., 126 | 124,dodge & cox funds income fund,dodge & cox income fund n/l, 127 | 125,vanguard small cap index fund,vnguard index trust small cap idx instl, 128 | 126,vanguard mid-cap value etf,vanguard mid cap value etf, 129 | 127,american funds euro pacific growth r6,am fnd europacfic grth r6, 130 | 128,wisdomtree intl smallcp dividend etf,wisdomtree tr intl smallcap divid fd isin #us9 sedol #b17fg17, 131 | 129,boeing company cmn,boeing company, 132 | 130,ishares msci emerging markets,harding loevner emerging mkts, 133 | 131,transocean ltd zug namen akt,transocean ltd ord, 134 | 132,vanguard small-cap index fund admiral shares,vanguard small-cap index adm, 135 | 133,pimco total ret fd instl,pimco total return fd cl c, 136 | 134,jpmorgan equity income fund cl a,jpmorgan us equity fund - class r6, 137 | 135,energy sector index spdr,energy sector spdr etf, 138 | 136,vanguard total internatlbnd etf iv,vanguard charlotte total intl bd index fd etf, 139 | 137,t rowe price international discovery fund,t. 
rowe price international discovery, 140 | 138,united sts stl cp (new),united states stl corp new, 141 | 139,coca cola co,coca-cola company, 142 | 140,spdr barclays high yield bond (jnk),spdr barclays capital high yield bond et, 143 | 141,vanguard smallcap index fund,vanguard small-cap index fund institutional shares, 144 | 142,fidelity select utilities portfolio,fid sel utilities, 145 | 143,select sector spdr trust the technology select sector spdr fund,technology sector sp etf, 146 | 144,guggenheim s&p 500 equal we cons etf,guggenheim s&p 500 equalwe cons etf, 147 | 145,visa inc class a shares,visa inc cl a div: 0.560, 148 | 146,skyworks solutions inc com,skyworks solutions inc, 149 | 147,alibaba group holding ltd spons ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord, 150 | 148,pimco income instl,pimco incm inst cl, 151 | 149,vanguard growth index inv,vanguard growth index fund admiral shares, 152 | 150,canadian natl railway co,canadian natl ry co f, 153 | 151,first tr exchange traded fd dow jones internet index fd,first tr exchange traded fd dow jones in, 154 | 152,vanguard total bond index adm,vanguard total bond market index i, 155 | 153,ishares gold tr,ishares gold tr ishares, 156 | 154,oppenheimer developing markets y fund,oppenheimer developing markets cl y, 157 | 155,vanguard total bond market index adm,vang tot bd mk is pl, 158 | 156,t. 
rowe price new income,guidemark core fixed income, 159 | 157,vanguard mid-cap index fund institutional shares,vang midcap idx inst, 160 | 158,mfs international new discovery r5,afs international growth & income fund cl f1, 161 | 159,fitbit inc,fitbit inc cl a, 162 | 160,vanguard growth index fund investor shares,vanguard growth index fd admiral share, 163 | 161,comcast corp (new) class a div: 1.100,comcast corp cl a, 164 | 162,invesco diversified dividend investor cl,fidelity advisor diversified international fund cl c, 165 | 163,blackrock high yld bd port cl k,blackrock high yield bond portfolio svc, 166 | 164,sina com ord (caym is),sina corporation com, 167 | 165,t.rowe price new horizons-t,new horizons fund, 168 | 166,vanguard value etf (vtv),vanguard value etf, 169 | 167,berkshire hathawayinc del cl b new,berkshire hathawayinc, 170 | 168,schlumberger limited com usd0.01,schlumberger ltd., 171 | 169,union pacific corp,union pac corp com, 172 | 170,alps etf tr alerian mlp,alps alerian mlp etf, 173 | 171,vanguard div growth fd investor shrs,vanguard dividend growth fund investor shares, 174 | 172,tyson foods inc-cl a tsn,tyson foods inc class a, 175 | 173,american mutual fund-a,american mutual fund cl a, 176 | 174,canadian national railway,canadian natl railway company com, 177 | 175,deutsche x-trackers msci eafe hedged equity etf,deutsche x-trackers msci eafe equity etf, 178 | 176,vanguard total bond index adm,vanguard total bond market idx instl pls, 179 | 177,western digital corp,western digital corp com, 180 | 178,ishares core msci emg mkts etf,harding loevner emerg mrkts port adv, 181 | 179,pimco high income fd com shs,pimco high income fund, 182 | 180,ishares inc core msci emerging mkts etf,harding loevner emerging mkts, 183 | 181,time warner inc com,time warner inc, 184 | 182,vanguard international growth fund admiral,vanguard international growth fund admiral shares, 185 | 183,ishares trust core msci total intl stk etf,ishares core msci ttl int stk, 186 | 
184,ishares iboxx invt gradebond etf,ishares iboxx $ invt grade corp bd, 187 | 185,interactive brokers group inc. com,interactive brokers class a, 188 | 186,adobe systems,adobe systems incorporated, 189 | 187,vang tot bd mkt adm,vanguard total bond market idx instl pls, 190 | 188,yandex nv com,yandex n.v. com usd0.01 cl a, 191 | 189,vanguard small cap index - a,vanguard small-cap index fund inst, 192 | 190,sirius xm hldgs inc com isin #us5 sedol #bgldk10,sirius xm hldgs inc com, 193 | 191,vanguard target retirement 203 5 fund,vang target ret 2035, 194 | 192,communications sales&leas inc div: 2.400,communications sales&leas inc com, 195 | 193,vanguard index fds vanguard total stk mkt etf,us total stock market, 196 | 194,t. rowe price equity income fund,t. rowe price equity income, 197 | 195,ishares tr core us aggt bd etf,ishares core us aggregate bond etf, 198 | 196,american funds europacific growthr3,american funds europacific growth fund, 199 | 197,lazard emerging mkts eqty port opn,lzrd emrg mkts eq o, 200 | 198,pimco income a,pimco income administrative, 201 | 199,american express co,american express company, 202 | 200,taser international,taser intl inc del com, 203 | 201,vanguard short term tax exempt fd investor shr,vanguard short-term tax-exempt fund investor shares, 204 | 202,ishares core msci emerging markets etf,harding loevner emerg mrkts port adv, 205 | 203,vanguard institutional index fund institutional shares,vanguard institl index, 206 | 204,trp real estate adv,t rowe price real estate fund adv cl, 207 | 205,jp morgan chase & co com,jpmorgan chase & co div: 1.760, 208 | 206,vanguard 500 index fund admira l,vanguard 500 index fund admiral class, 209 | 207,dollar gen corp new com,dollar general corp, 210 | 208,us silica holdings inc,u s silica hldgs inc com, 211 | 209,alphabet inc cap stk cl c cap stk cl c,alphabet inc cl c, 212 | 210,ishares msci usa min volility etf,proshares short vix short term etf, 213 | 211,fidelity low-priced stock,fid low priced 
stk, 214 | 212,vang st invstgrd inv,vanguard short-term invest-grade, 215 | 213,goldman sachs mangd futures strategy a,aqr aqr mngd futures strategy i, 216 | 214,select sector spdr trust health care select index,health care select spdr fund, 217 | 215,metropolitan west fds total ret cl i,metropolitan west tot ret bond, 218 | 216,chubb limited com,chubb ltd, 219 | 217,vanguard ftse emerg mkts etf,vanguard intl equity index fds ftse emerging mkts etf, 220 | 218,energy transfer partners un,cheniere energy partners lp com, 221 | 219,baron partners,baron partners fund, 222 | 220,prudential financial inc cmn,prudential finl inc, 223 | 221,t rowe price retirement 2050 fund,t. rowe price retirement 2050 fund, 224 | 222,templeton global bond class a,templeton global bd r, 225 | 223,dominion resources inc va new,dominion resources inc/va, 226 | 224,ishares tr core us aggt bd etf,ishares core u.s. aggregate, 227 | 225,citigroup inc,citigroup inc new div: 0.200, 228 | 226,invesco comstock fund cl a,invesco comstock y, 229 | 227,oppen developing mkts a,oppenheimer developing mkts fd cl a, 230 | 228,alphabet inc shs cl a,alphabet inc voting, 231 | 229,national grid new adr each repr 5 ord gbp0.11395,national grid plc new spon adr, 232 | 230,ishares russell 3000 index etf,ishares russell 3000 etf, 233 | 231,titan international inc com,titan international inc, 234 | 232,proshares tr ii ultra bloomberg crude oi,proshares ultra bloomberg crude oil etf, 235 | 233,ishares core msci emerging markets etf,harding loevner emerging mkts, 236 | 234,vang tot bd mkt adm,vanguard total bond market index i, 237 | 235,mfs global total return fund cl a,mfs global total return cl a, 238 | 236,metropolitan west total return m,metropolitan west tot ret bond, 239 | 237,blckrck inflation protect,blackrock inflation protected bond instl, 240 | 238,dfa real estate securities i,dfa real estate securities fund institutional class, 241 | 239,leucadia natl corp com,leucadia national co, 242 | 240,pimco 
income fd i,pimco income instl, 243 | 241,trp retirement 2045,t. rowe price retirement 2045, 244 | 242,wal-mart stores inc com isin #us9311421039 sedol #2936921,scana corp new com isin #us7 sedol #2545844, 245 | 243,oppenheimer developing market a,oppenheimer developing mkts fd cl a, 246 | 244,nuveen high yield muni bond fund cl i,nuveen high yield municipal bond a, 247 | 245,vanguard short term invt grade admiral,vanguard short-term investment-grade fund investor shares, 248 | 246,the growth fund of america,amer fds grwth fd amr a, 249 | 247,fireeye inc,fireeye inc com usd0.0001, 250 | 248,templeton global bond fund advisor class,templeton global bond fund adv cl, 251 | 249,ishares inc core msci emerging mkts etf,harding loevner emerg mrkts port adv, 252 | 250,silver wheaton corp. ads,silver wheaton corporation com npv isin #ca6 sedol #b058zx6, 253 | 251,dfa us small cap value prtf instl,dfa us sm cap value, 254 | 252,united states oil fund lp exchange-traded fund,united states oil fund lp unit, 255 | 253,pimco total return instl,fund: pimco total return admin, 256 | 254,tesla motors inc com,tesla motors inc., 257 | 255,schwab short term us treasury etf,schwab strategic tr short-term us treasury, 258 | 256,vanguard total bond index adm,vang tot bd mk is pl, 259 | 257,van small cap index admir,vang sm cap idx inst, 260 | 258,facebook incorporated class a,facebook inc cl a, 261 | 259,vanguard ext market index inst,vanguard extended market idx adm, 262 | 260,t. 
rowe price new horizons,new horizons fund, 263 | 261,vanguard total bond market index fund admiral shares,vanguard bond index total mkt investor, 264 | 262,lloyds banking group plc div: 0.129,lloyds banking group plc, 265 | 263,vbr:vanguard small-cap value etf,vanguard small cap valueetf iv, 266 | 264,constellation brands inc cl a,constellation brand class a, 267 | 265,jp morgan chase & co com,jp morgan chase & co, 268 | 266,pimco total return instl,pimco total return fund instl cl, 269 | 267,ishares gold etf,ishares gold trust com, 270 | 268,schwab intl core equity,schwab intl core eqty fd instl cl, 271 | 269,vanguard total bond market index-admiral,vang tot bd mkt inst, 272 | 270,pimco total return instl,pimco total return fund adm, 273 | 271,alibaba group hldg ltd sponsor,alibaba group hldg ltd adr, 274 | 272,ishares russell 1000 growth,russell 1000 growth (ishares), 275 | 273,walt disney co,disney, 276 | 274,vanguard s&p 500 etf (voo),vanguard index fds s&p 500 etf, 277 | 275,ishares msci eafe min volatility etf,ishares trust msci eafe min volatil etf, 278 | 276,kraft heinz co com,kraft heinz co div: 2.300, 279 | 277,metr w tot rtn bond cl m,metropolitan west tot ret bond, 280 | 278,berkshire hathaway cl-b new,berkshire hathaway inc., 281 | 279,momenta pharmaceuticals,momenta pharmaceuticals inc com, 282 | 280,colgate palmolive,colgate palmolive co com, 283 | 281,ishares inc core msci emerging mkts etf,ishares msci emerging markets, 284 | 282,powershares qqq etf,powershares qqq trust sr 1 etf, 285 | 283,vanguard small-cap index fund admiral,vanguard small-cap index fund institutional shares, 286 | 284,corning inc cm,corning inc, 287 | 285,vereit inc reit,vereit inc, 288 | 286,ishares core msci emerging div: 0.995,ishares core msci emg mkts etf, 289 | 287,american funds europacific growth-r6,american funds europacific growth fund class r-6, 290 | 288,national oilwell varco inc com,national-oilwell varco inc, 291 | 289,vanguard 500 index fund admiral 
shares,vnguard 500 index admiral shares, 292 | 290,facebook inc.,facebook inc class a, 293 | 291,whole foods market inc,whole foods markets inc div: 0.540, 294 | 292,bhp billiton plc - adr,bhp billiton plc spons adr each rep 2 ord usd0.50, 295 | 293,ishares russell 2000 growth etf iv,ishares russell 2000 grwth etf div: 1.243, 296 | 294,osterweis strategic income fund,professionally mgd ptfl osterweis strategic inc fd, 297 | 295,vanguard small cap value etf,vanguard small cap valueetf iv, 298 | 296,wa core plus bond i,western asset core plus bond fd cl fi, 299 | 297,kraft heinz co div: 2.300,kraft heinz co com, 300 | 298,nokia corp-spon adr,nokia corp cls a adr (finnish), 301 | 299,citrix systems inc.,citrix systems inc, 302 | 300,devry education group inc div: 0.360,devry education group, 303 | 301,vanguard gnma fund admiral shares,vanguard gnma fund investor share, 304 | 302,vanguard star investor class,vanguard star fund investor shares, 305 | 303,vanguard total bond market index adm,vanguard total bond market index fund investor shares, 306 | 304,ishares gold tr,ishares gold etf, 307 | 305,eaton corp plc com,eaton corp plc f, 308 | 306,pepsico inc com,pepsico inc cmn, 309 | 307,wal-mart stores inc com,wal-mart stores inc., 310 | 308,whole foods mkt inc com,whole foods market, 311 | 309,american capital world growth and income fd a,american capital world grth & inc a, 312 | 310,vanguard mid cap index fund - admiral,vanguard mid cap index fund admiral class, 313 | 311,fidelity corporate bond fund,baird core plus bond inst, 314 | 312,spartan extended mkt index fid adv class,spartan extended mkt index investor cl, 315 | 313,nxp semiconductors f,nxp semiconductors n v, 316 | 314,columbia dividend income,col dividend inc z, 317 | 315,vanguard sml-cap ind-adm,vanguard small-cap index fund admiral, 318 | 316,vanguard index 500 port,vanguard 500 index fund-inv, 319 | 317,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc sponsored adr repstg ord shares class a, 320 | 
318,delaware value fund institutional,delaware value cl a, 321 | 319,vanguard total bond market index fund admiral shares,vanguard total bond market index inv, 322 | 320,arista networks inc,arista networks inc com usd0.0001, 323 | 321,ishares 1-3 yr treasury bnd etf,ishares 1-3 yr treasury bnd et f, 324 | 322,united sts oil fd lp units,united states oil fund lp exchange-traded fund, 325 | 323,pimco income fund cl p,pimco fds income instl, 326 | 324,diageo plc fadr 1 adr reps 4 ord shs,diageo p l c spon adr new, 327 | 325,harbor capital appreciation instl,harbor capital appreciation, 328 | 326,price t rowe group inc com isin #us74144t1088 sedol #2702337,c h robinson worldwide inc com new isin #us8 sedol #2116228, 329 | 327,ishares s&p midcap fund,ishares core s&p mid capetf, 330 | 328,pimco total return cl a,fund: pimco total return admin, 331 | 329,johnson and johnson,johnson and johnson com, 332 | 330,ishares msci cda etf,ishares msci canada index, 333 | 331,carnival corp com,carnival corp f, 334 | 332,priceline group,priceline grp inc com new, 335 | 333,westport innovation f,westport innovations inc, 336 | 334,wisdomtree emerging markets high dividend fund etf,wisdomtree emrg mrkt hg div etf, 337 | 335,d.r. 
horton inc,d r horton co, 338 | 336,vanguard total bond market index adm,vanguard total bond market idx instl pls, 339 | 337,ishares 7-10 year treas bond etf,ishares barclays 7-10 year treasury bond, 340 | 338,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus shares, 341 | 339,vanguard sml-cap ind-adm,vanguard small-cap index fund institutional shares, 342 | 340,vanguard interm-term investment-grde adm,vanguard intermediate term inv gr fd inv cl, 343 | 341,teva pharmaceuticals adr,teva pharmaceuticals ind ltd israel adr, 344 | 342,dfa intl small cap value cl i,dfa intl small cap value port instl, 345 | 343,prudential financial inc div: 2.800,prudential finl inc, 346 | 344,af bond fd amer r6,the bond fund of america-a, 347 | 345,allergan plc,allergan plc f, 348 | 346,vale sa,vale s a adr, 349 | 347,vanguard short-term corporate bond etf,vanguard short-term corporate bond, 350 | 348,vanguard index fds vanguard small cap growth vipers formerly,vanguard small-cap grwth etf, 351 | 349,vanguard short-term bondetf,vanguard short term etf, 352 | 350,goldman sachs mgd futures strat a,aqr mgd futures strat fd cl i, 353 | 351,vanguard total bond index adm,vanguard total bond market index fund investor shares, 354 | 352,xinyuan real estate com,xinyuan real estate co ltd spon adr, 355 | 353,ishares core u.s. 
aggregate bond etf,ishares core total us bond market etf, 356 | 354,devon energy corp,devon energy corporation (new) cmn, 357 | 355,berkshire hathaway class b,berkshire hathawayinc, 358 | 356,vanguard reit index investor,vanguard reit index inv, 359 | 357,markel corp hldg co,markel corp, 360 | 358,vanguard high-yield corporate fund investor shares,vanguard high yield corp fund admiral share, 361 | 359,oneok inc new div: 2.460,oneok inc cm (new), 362 | 360,iron mtn inc new com div: 1.940,iron mtn inc reit, 363 | 361,howard hughes corp com,howard hughes corp, 364 | 362,af bond fd amer r6,the bond fund of america, 365 | 363,kraft heinz co,kraft heinz co div: 2.300, 366 | 364,costco wholesale crp del,costco wholesale co, 367 | 365,first trust amex biotechnology index fund,first trust nyse arca biotechnology index fund, 368 | 366,american tower corporation isin #us0 sedol #b7fbfl2,american tower corp reit, 369 | 367,the growth fund of america,the growth fund of america-529a (1005), 370 | 368,priceline group inc com,priceline group inc, 371 | 369,pimco total return fund instl cl,total return fund (pimco), 372 | 370,southwest gas corp.,southwest gas corp div: 1.800, 373 | 371,vanguard dividend growth fund investor shares,vanguard dividend growth fund, 374 | 372,j p morgan chase & co,jpmorgan chase & co div: 1.760, 375 | 373,merck & company inc new,merck & co inc new, 376 | 374,pimco incm cl d,pimco income fd i, 377 | 375,vanguard fixed income secs inter term invt grade fd admiral cl,vanguard intermediate term inv gr fd inv cl, 378 | 376,vanguard growth index fund investor shares,vanguard growth index admiral, 379 | 377,guggenheim bulletshares 2018,guggenheim bulletshares 2018 high yield corp bd, 380 | 378,vanguard total bond market div: 2.009,vanguard total bond mkt, 381 | 379,blackhawk network hldgs inc cl a,blackhawk netwk hldgs inc, 382 | 380,vanguard balanced index fd inv cl shrs,vanguard balanced index fund admiral shares, 383 | 381,vanguard mid-cap value index 
fund,vanguard mid-cap value etf, 384 | 382,ishares tr natl mun bd etf fd,ishares nationl amt freemuni etf, 385 | 383,bank amer corp com,bank of america corp., 386 | 384,russell 1000 growth (ishares),ishares russell 1000 grw etf div: 1.363, 387 | 385,american fd growth fd of america cl a,amer fds grwth fd amr a, 388 | 386,ishares us real estate etf,ishares u s real estate etf, 389 | 387,dfa emerging markets portfolio,dfa emrging markets, 390 | 388,alphabet inc-cl a,alphabet inc. class a, 391 | 389,raytheon co (new) div: 2.680,raytheon co com, 392 | 390,charles schwab new,schwab charles corp new, 393 | 391,alibaba group holding lt,alibaba group hldg ltd sponsor, 394 | 392,oakmark international i,oakmark fds oakmark intl, 395 | 393,mastercard incorporated cmn class a,mastercard inc-class a, 396 | 394,kimberly-clark corp,kimberly-clark corp., 397 | 395,ishares 7-10 yr treasry bd etf div: 1.973,ishares barclays 7-10 year treasury bond, 398 | 396,vanguard small-cap index fund admiral shares,vanguard small-cap index fund inst, 399 | 397,foot locker inc com isin #us9 sedol #2980906,foot locker inc com isin #us3448491049 sedol #2980906, 400 | 398,vanguard ttl bond mkt idx adm,vanguard total bond market index inv, 401 | 399,chesapeake energy corporation oklahoma,chesapeake energy corp, 402 | 400,charles schwab corporation cmn,schwab charles corp new, 403 | 401,vanguard intl equity index fds ftse all world ex usa small cap index fd etf shs,vanguard ftse all world ex us small cap etf, 404 | 402,chicago bridge & iron co nv,chicago bridge & iron company n.v. 
eur0.01 reg, 405 | 403,amer fds grwth fd amr a,american gr fd of america a, 406 | 404,amg managers real estate securities fund,amg managers real estate securities fd, 407 | 405,american new perspective class a,new perspective fund cl a, 408 | 406,lloyds banking group plc sp adr,lloyds banking group plc div: 0.129, 409 | 407,vanguard 500 idx adm,vanguard s&p 500 index - a, 410 | 408,vanguard total bond market index admiral,vanguard total bond market index fund institutional shares, 411 | 409,j p morgan chase & co,jp morgan chase & co, 412 | 410,halliburton co hldg,halliburton company, 413 | 411,guggenheim bulletshares 2018 high yield corp bd,claymore exchange traded fd trust guggenheim bltshrs 2018 high yld cp bd, 414 | 412,ishares russell 2000 value etf,ishares russell 2000 value etf iv, 415 | 413,pimco total return a,total return fund (pimco), 416 | 414,vanguard index fds vanguard reit etf formerly vanguard index tr to 05/24/01 reit viper shs,vanguard index fds vanguard reit etf formerly vanguard index, 417 | 415,powershares emrg mkts sovrgn dbt etf,powershares emerging markets sovereign d, 418 | 416,vang tot bd mkt adm,vanguard total bond market index, 419 | 417,arena pharmaceuticals inc com,arena pharmaceuticals, 420 | 418,nokia corp spon adr f1 adr rep 1 nokia corps,nokia corp sponsored adr, 421 | 419,vanguard value index fund admiral shares,vanguard value index fund institutional shares, 422 | 420,vanguard total int bd idx etf,vanguard charlotte total intl bd index fd etf, 423 | 421,pimco total ret fd instl,pimco total return r, 424 | 422,intl business machines,intl business mach, 425 | 423,harbor international fund,harbor international, 426 | 424,vanguard intl bond index etf,vanguard total internatlbnd etf iv, 427 | 425,vanguard extended market index institutional class,vang ext mkt idx ins, 428 | 426,caterpillar inc del,caterpillar inc, 429 | 427,visa inc com cl a,salesforce.com inc com, 430 | 428,pimco total return admin,total return fund (pimco), 431 | 
429,pepsico inc.,pepsico inc nc div: 2.810, 432 | 430,powershares fin pfd portfoli,powershares etf financial pfd portfolio, 433 | 431,fid sel biotech,fidelity select biotechnology, 434 | 432,arcelormittal sa luxembourg ny registry sh isin #us4 sedol #b295f26,arcelormittal sa (luxembourg), 435 | 433,realty incm corp reit,realty income corporation com, 436 | 434,vanguard total international stock index fund admiral shares,vanguard ttl intl stk ind adm, 437 | 435,c h robinson worldwide inc com new isin #us8 sedol #2116228,c.h. robinson worldwide inc, 438 | 436,sprint corp shs series -,sprint corp, 439 | 437,google inc cl c,alphabet inc cap stk cl c, 440 | 438,templeton glbal bond adv,templeton global bond adv, 441 | 439,apollo investments corp com,apollo invt corp com, 442 | 440,sptn us bond idx is,spartan us bond indx fidelity adv class, 443 | 441,ishares inc msci emrg mkts min volatility etf,ishares msci markets minvol etf, 444 | 442,vanguard total bond market index admiral,vanguard total bond market idx-adm, 445 | 443,southwest gas corp div: 1.800,southwest gas corp, 446 | 444,vanguard ftse developd mkt etf,vanguard ftse dev markets etf, 447 | 445,spdr nuveen barclays muni bond etf,spdr nuveen barclays muni, 448 | 446,dodge & cox interntl stock,dodge & cox international stock, 449 | 447,alexion pharms inc,alexion pharmaceuticals inc, 450 | 448,vanguard total bond market idx-adm,vanguard ttl bnd mrk indx inst, 451 | 449,general mtrs co,general motors co., 452 | 450,zoes kitchen inc com isin #us7 sedol #bl95n36,zoes kitchen inc com, 453 | 451,loomis sayles bond fund cl i,loomis sayles mlti-asset inc a, 454 | 452,athenahealth inc delaware,athenahealth inc, 455 | 453,ishares silver trust etf,ishares silver shares, 456 | 454,emerging markets,harding loevner emerging mkts, 457 | 455,markel corp hldg co,markel corp holding company, 458 | 456,ameriprise financial inc,ameriprise finl inc, 459 | 457,vanguard malvern fds etf,vanguard short term inflation protected, 460 | 
458,alibaba group hldg limited sponsored ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord, 461 | 459,american funds american hi inc tr r3,american high-income trust, 462 | 460,first eagle global i,first eagle global fund cl i, 463 | 461,van mid cap index adm m4940,vanguard mid cap index, 464 | 462,chesapeake energy corp,chesapeake energy corp com, 465 | 463,starbucks corp. cmn,starbucks corp washington div: 0.800, 466 | 464,mc donalds corp div: 3.560,mcdonalds corp, 467 | 465,american funds europacifc r3,american funds europacific r3, 468 | 466,block h & r inc,block h&r inc, 469 | 467,vanguard ftse developed mkts etf,vanguard ftse developed market etf, 470 | 468,vang tot bd mkt adm,vang tot bd mk is pl, 471 | 469,pimco investment grade corporate bond fund - class a,fidelity conservativ income bond fd cl i, 472 | 470,vanguard mid cap index fund - admiral,vanguard mid cap index fd, 473 | 471,vanguard intrmd-term bond index adm,vanguard inter-term bond index port inv, 474 | 472,infinera corporation com isin #us1 sedol #b1yb5y4,infinera corp com, 475 | 473,vanguard crsp us small cap index,vanguard small cap etf, 476 | 474,oneok partners lp lp,oneok partners l p unit ltd partnership, 477 | 475,american mutual fund,american fd american mutual fd cl f2, 478 | 476,american funds europacific growth-r5,american europacific growth, 479 | 477,vang tot bd mkt adm,vanguard ttl bnd mrk indx inst, 480 | 478,breitburn energy partners lp c,breitburn energy partners lp com, 481 | 479,new york community bancorp inc.,new york community, 482 | 480,pimco income instl,pimco income administrative, 483 | 481,select sector spdr trust technology select index,sector spdr tr shs ben int technology, 484 | 482,vanguard ttl bond mkt idx adm,vanguard bond index total mkt investor, 485 | 483,tmpl global bond a,templeton global bond fund r, 486 | 484,ishares 20+ year treasury bo,ishares 20+ year, 487 | 485,vang sm cap idx adm,vnguard index trust small cap idx instl, 488 | 486,linkedin corp 
class a,linkedin corp-a, 489 | 487,primecap odyssey stock,primecap odyssey stock fund, 490 | 488,time warner inc,time warner inc com new, 491 | 489,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus, 492 | 490,marathon pete corporation,loews corporation div: 0.250, 493 | 491,ishares core msci emerging etf,harding loevner emerging mkts, 494 | 492,vanguard short term tax exempt admiral share,vanguard short-term tax-exempt fund investor shares, 495 | 493,ford motor com,ford mtr co, 496 | 494,vanguard ftse all world ex us etf,vanguard ftse all-world ex-u, 497 | 495,vmware inc cl a com,vmware inc., 498 | 496,royal dutch shell plc spons adr a,royal dutch shell plc sponsored adr repstg a shs, 499 | 497,spdr nuveen barclays municipal bond etf,spdr nuveen barclays capital m div: 0.556, 500 | 498,united parcel service cl b,united parcel service inc cl b, 501 | 499,vang smcp gr idx adm,vanguard small cap growth index admiral, 502 | 500,novo-nordisk a s adr,novo-nordisk a s adr isin #us6 sedol #2651202, 503 | 501,fidelity mass muni income,nuveen equity premium income, 504 | 502,ishares russell mid-cap value etf,ishares russell mid cap value etf iv, 505 | 503,yamana gold inc cmn,yamana gold inc com, 506 | 504,vanguard total bond market idx-adm,vanguard total bond market idx instl pls, 507 | 505,otter tail corp com,otter tail corporation cmn, 508 | 506,sptn intl index ins,sptn intl index adv, 509 | 507,cnooc ltd. 
adr (sponsored),cnooc limited adr fsponsored adr 1 adr rep 100 cl h ord, 510 | 508,eaton vance global macro abs ret a,eaton vance global macro abslte rt cl a, 511 | 509,dfa u s small cap value cl i,dfa us small cap value prtf instl, 512 | 510,sears canada inc (canada),sears cda inc, 513 | 511,at&t inc com isin #us00206r1023 sedol #2831811,franklin res inc com isin #us8 sedol #2350684, 514 | 512,vanguard total international bond index etf,vanguard total international bond et, 515 | 513,wisdomtree japan hedged equity -,wisdomtree japan hedged eq, 516 | 514,templeton global bond fund advisor class,templeton glbal bond adv, 517 | 515,trp health sciences,t. rowe price health sciences fund, -------------------------------------------------------------------------------- /koursaros/hub/encoder/robertainfer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "encode"] -------------------------------------------------------------------------------- /koursaros/hub/encoder/robertainfer/dim64.yml: -------------------------------------------------------------------------------- 1 | !CharEmbeddingEncoder 2 | parameters: 3 | dim: 64 -------------------------------------------------------------------------------- /koursaros/hub/encoder/textbyte/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | RUN echo 'yo' 6 | 7 | ENTRYPOINT ["gnes", "encode", "--py_path", "textbyte.py"] -------------------------------------------------------------------------------- /koursaros/hub/encoder/textbyte/max1024.yml: -------------------------------------------------------------------------------- 1 | !TextByteEncoder 2 | parameters: 3 | max_seq_len: 1024 -------------------------------------------------------------------------------- 
class TextByteEncoder(BaseTextEncoder):
    """Encodes text as fixed-length vectors of raw UTF-8 byte values.

    Each sentence is UTF-8 encoded, truncated to ``max_seq_len`` bytes,
    right-padded with NUL bytes, and returned as a ``uint8`` array.
    Useful for text-search backends that index raw bytes.
    """
    is_trained = True

    def __init__(self, max_seq_len: int, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._msl = max_seq_len  # fixed output length in bytes

    def pad_and_vector(self, sent: str) -> np.ndarray:
        """Return *sent* as a uint8 vector of exactly ``max_seq_len`` bytes.

        Fix: truncating at a byte boundary can split a multibyte UTF-8
        character. The previous retry removed a fixed two bytes and
        re-decoded, which could itself raise ``UnicodeDecodeError`` for
        3- or 4-byte characters (uncaught, inside the ``except``). Decoding
        with ``errors='ignore'`` drops any trailing partial sequence safely.
        """
        raw = sent.encode()[:self._msl]
        # drop a trailing partial multibyte sequence left by the truncation;
        # the input is a valid str, so only the cut tail can be invalid
        raw = raw.decode('utf-8', errors='ignore').encode()
        padded = raw + b'\x00' * (self._msl - len(raw))
        return np.frombuffer(padded, dtype=np.uint8)

    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        """Encode a batch of sentences into a (len(text), max_seq_len) uint8 array."""
        return np.stack([self.pad_and_vector(sent) for sent in text])
./workspace 7 | WORKDIR /workspace 8 | 9 | ENTRYPOINT ["gnes", "client", "http"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/faisscpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/hub-indexer:latest-faiss-cpu 2 | 3 | ADD *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/faisscpu/base.yml: -------------------------------------------------------------------------------- 1 | !FaissIndexer 2 | parameters: 3 | data_path: /workspace 4 | index_key: HNSW32 5 | num_dim: 64 -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | RUN apk add gcc python3-dev musl-dev 4 | RUN pip install pyahocorasick 5 | 6 | ADD *.py *.yml ./ 7 | 8 | ENTRYPOINT ["gnes", "index", "--py_path", "keyword.py"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/base.yml: -------------------------------------------------------------------------------- 1 | !KeywordIndexer {} 2 | 3 | -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/keyword.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | from gnes.indexer.base import BaseChunkIndexer as BCI 6 | 7 | 8 | class KeywordIndexer(BCI): 9 | 10 | def __init__(self, *args, **kwargs): 11 | """ 12 | Initialize an indexer that implements the AhoCorasick Algorithm 13 | """ 14 | super().__init__(*args, **kwargs) 15 | import ahocorasick 16 | self._automaton = ahocorasick.Automaton() 17 | self.size = 0 18 
| 19 | def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs): 20 | if vectors.dtype != np.uint8: 21 | raise ValueError('vectors should be ndarray of uint8') 22 | 23 | for key, vector in zip(keys, vectors): 24 | self._automaton.add_word(self.decode_textbytes(vector), key) 25 | self.size += 1 26 | 27 | self.logger.error(list(self._automaton.keys())) 28 | 29 | def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]: 30 | if keys.dtype != np.uint8: 31 | raise ValueError('vectors should be ndarray of uint8') 32 | elif not self.size > 0: 33 | print('Warning: empty index queried') 34 | return [] 35 | 36 | self._automaton.make_automaton() 37 | 38 | ret = [] 39 | for key in keys: 40 | ret_i = defaultdict(int) 41 | for _, (doc_id, offset) in self._automaton.iter(self.decode_textbytes(key)): 42 | ret_i[(doc_id, offset)] += 1 43 | 44 | # _doc_id, _offset, _weight, _relevance 45 | results = [(*k, 1.0, v) for k, v in ret_i.items()] 46 | # topk by number of keyword matches 47 | ret.append(sorted(results, reverse=True, key=lambda x: x[-1])[:top_k]) 48 | 49 | return ret 50 | 51 | @staticmethod 52 | def decode_textbytes(vector: np.ndarray): 53 | return vector.tobytes().rstrip(b'\x00').decode() 54 | -------------------------------------------------------------------------------- /koursaros/hub/indexer/lvdb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN pip install plyvel>=1.0.5 --no-cache-dir --compile 4 | 5 | ADD *.py *.yml ./ 6 | 7 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/lvdb/base.yml: -------------------------------------------------------------------------------- 1 | !LVDBIndexer 2 | parameters: 3 | data_path: /workspace -------------------------------------------------------------------------------- 
/koursaros/hub/indexer/rocksdb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y python-dev librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libgflags-dev 5 | RUN pip install python-rocksdb --no-cache-dir --compile 6 | RUN apt-get install -y git 7 | RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp 8 | RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile 9 | 10 | ADD *.py *.yml ./ 11 | 12 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/rocksdb/base.yml: -------------------------------------------------------------------------------- 1 | !RocksDBIndexer 2 | parameters: 3 | data_path: /workspace -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y git 5 | RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp 6 | RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile 7 | 8 | ADD *.py *.yml ./ 9 | 10 | ENTRYPOINT ["gnes", "index", "--py_path", "simple_dict.py"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/base.yml: -------------------------------------------------------------------------------- 1 | !SimpleDictIndexer {} -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/simple_dict.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from google.protobuf.json_format import 
class SimpleDictIndexer(BDI):
    """In-memory doc indexer that stores each Document as a JSON string."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._content = {}  # doc key -> JSON-serialized gnes_pb2.Document

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Serialize and store docs under their keys (later adds overwrite)."""
        # routine trace output — demoted from error to debug level
        self.logger.debug(keys)
        self._content.update({k: MessageToJson(d) for (k, d) in zip(keys, docs)})

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Reconstruct the stored Documents for *keys*.

        Raises KeyError for keys that were never added (original behavior).
        """
        self.logger.debug(keys)
        return [Parse(self._content[k], gnes_pb2.Document()) for k in keys]
class WhooshIndexer(BCI):
    """Chunk indexer backed by a persistent Whoosh full-text index.

    Keys are (doc_id, offset) pairs; vectors are NUL-padded uint8 arrays
    of UTF-8 text (see the textbyte encoder). Queries are parsed against
    the stemmed ``body`` field and scored with TF-IDF.
    """

    def __init__(self, data_path, *args, **kwargs):
        super().__init__(*args, **kwargs)
        schema = Schema(doc_id=NUMERIC(stored=True),
                        offset=NUMERIC(stored=True),
                        body=TEXT(analyzer=StemmingAnalyzer()))
        if not os.path.exists(data_path):
            # makedirs: create intermediate directories too
            os.makedirs(data_path)
            # informational, not an error condition
            self.logger.warning('Please mount volume for persisting index.')
        # explicit existence check replaces the previous bare `except:`
        # around open_dir, which would mask unrelated failures
        if index.exists_in(data_path):
            self.ix = index.open_dir(data_path)
        else:
            self.logger.warning('Creating empty whoosh index')
            self.ix = index.create_in(data_path, schema)

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs):
        """Add (doc_id, offset) -> text documents; commits once per batch."""
        # routine trace (was an error-level 'Recieved...' dump)
        self.logger.info('received add index request for %d keys' % len(keys))
        if vectors.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        writer = self.ix.writer()
        for key, vector in zip(keys, vectors):
            writer.add_document(doc_id=key[0], offset=key[1],
                                body=self.decode_textbytes(vector))
        writer.commit()

    def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
        """Return per-query lists of (doc_id, offset, weight, relevance)."""
        if keys.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        ret = []
        qp = QueryParser("body", schema=self.ix.schema)
        with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            for key in keys:
                query = qp.parse(self.decode_textbytes(key))
                ret.append([(result['doc_id'], result['offset'], 1.0, 1.0)
                            for result in searcher.search(query, limit=top_k)])
        # result trace — demoted from error to debug level
        self.logger.debug(ret)
        return ret

    @staticmethod
    def decode_textbytes(vector: np.ndarray) -> str:
        """Strip NUL padding and decode the uint8 vector back to text."""
        return vector.tobytes().rstrip(b'\x00').decode()
class BlockRouter(BaseRouter):
    """Router that drops messages belonging to the configured runtimes.

    :param block: runtime names (e.g. ['train']) whose messages should be
        blocked instead of forwarded; None/empty means forward everything.
    """

    def __init__(self, block: List[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # fix: previous default was a mutable [] shared across instances
        self.block = block if block is not None else []

    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """Raise BlockMessage if the message's runtime is in the block list.

        :param msg: incoming message
        """
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')
        # routine trace — demoted from error to debug level
        self.logger.debug(runtime)

        if runtime in self.block:
            self.logger.info('Blocking %s msg...' % runtime)
            raise BlockMessage
class RerankRouter(BaseRouter):
    """Re-scores search results with a transformers sequence-classification model.

    In the 'train' runtime it fine-tunes the model on (Query, Candidate,
    Label) JSON triples from the training docs. In the 'search' runtime it
    pairs the query text with each candidate's first chunk, scores the
    pairs, and rewrites ``topk_results`` in ranked order. A search request
    and its response are paired by ``request_id`` through ``query_dict``:
    whichever arrives first is stashed and that message is blocked
    (``BlockMessage``) until its counterpart shows up.
    """

    def __init__(self, model_name: str = None, data_dir: str = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_name = model_name    # HF model id, e.g. 'bert-base-uncased'
        self.data_dir = data_dir        # cache dir for model weights/tokenizer
        self.max_grad_norm = 1.0        # gradient-clipping threshold for training
        self.lr = 1e-3                  # AdamW learning rate
        # request_id -> stashed query text (request seen first) or stashed
        # scored results (response seen first); see apply()
        self.query_dict = dict()

    def post_init(self):
        """Load model/tokenizer and set up the optimizer (runs after fork)."""
        model_config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.data_dir)
        model_config.num_labels = 1  # set up for regression
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): compares a torch.device against a str, so this branch
        # never fires — presumably meant `self.device.type == "cpu"`; confirm.
        if self.device == "cpu": self.logger.error("RUNING ON CPU")
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
                                                                               config=model_config,
                                                                               cache_dir=self.data_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.data_dir)
        self.rerank_model.to(self.device)

        self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
        self.scheduler = ConstantLRSchedule(self.optimizer)

    def get_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult') -> str:
        # identity key used when pairing results; doc_id doubles as the key
        return x.doc.doc_id

    def set_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult', k: str) -> None:
        x.doc.doc_id = k

    # @batching
    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """Train on labeled pairs or rerank search results, depending on runtime.

        Blocks (raises BlockMessage) when only one half of a request/response
        pair has arrived, or when the runtime is neither 'train' nor 'search'.
        :param msg: incoming message
        """

        all_scored_results = [sr for sr in msg.response.search.topk_results]
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')

        if runtime == 'train':  # training samples are given
            inputs = []
            labels = []
            for doc in msg.request.train.docs:
                # each training doc is a JSON blob: {'Query', 'Candidate', 'Label'}
                ex = json.loads(doc.raw_bytes)
                inputs.append(
                    self.tokenizer.encode_plus(ex['Query'], ex['Candidate'], add_special_tokens=True))
                labels.append(float(ex['Label']))

            labels = torch.tensor(labels, dtype=torch.float).to(self.device)

        elif runtime == 'search':
            if msg.WhichOneof('body') == 'request':
                self.logger.error('got request')
                if not msg.request.request_id in self.query_dict:
                    # first half of the pair: stash the query text and wait
                    self.query_dict[msg.request.request_id] = msg.request.search.query.raw_bytes.decode()
                    raise BlockMessage
                else:
                    # response arrived earlier: retrieve its stashed results
                    query = msg.request.search.query.raw_bytes.decode()
                    all_scored_results = self.query_dict[msg.request.request_id]
            else:
                self.logger.error('got response')
                if not msg.response.request_id in self.query_dict:
                    # first half of the pair: stash the scored results and wait
                    self.query_dict[msg.response.request_id] = all_scored_results
                    raise BlockMessage
                else:
                    # request arrived earlier: retrieve its stashed query text
                    query = self.query_dict[msg.response.request_id]
            # score (query, first-chunk text) pairs for every candidate
            inputs = [
                self.tokenizer.encode_plus(
                    query,
                    sr.doc.chunks[0].text,
                    add_special_tokens=True,
                ) for sr in all_scored_results]
            self.logger.error([sr.doc.chunks[0].text for sr in all_scored_results])
            labels = None  # inference mode: no loss/backprop below

        else:
            # any other runtime is simply dropped
            raise BlockMessage

        if len(inputs) == 0:
            print("Warning: empty input set, ignoring.")
            return

        # manual right-padding of input_ids/token_type_ids and the matching
        # attention mask, to the longest sequence in the batch
        max_len = max(len(t['input_ids']) for t in inputs)
        input_ids = [t['input_ids'] + [0] * (max_len - len(t['input_ids'])) for t in inputs]
        token_type_ids = [t['token_type_ids'] + [0] * (max_len - len(t['token_type_ids'])) for t in inputs]
        attention_mask = [[1] * len(t['input_ids']) + [0] * (max_len - len(t['input_ids'])) for t in inputs]

        input_ids = torch.tensor(input_ids).to(self.device)
        token_type_ids = torch.tensor(token_type_ids).to(self.device)
        attention_mask = torch.tensor(attention_mask).to(self.device)

        if labels is not None:
            # training step: single forward/backward pass with clipping
            loss = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                     labels=labels, attention_mask=attention_mask)[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.rerank_model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.rerank_model.zero_grad()
            msg.response.train.status = gnes_pb2.Response.Status.SUCCESS

        else:
            # inference: score candidates and rewrite topk_results in rank order
            with torch.no_grad():
                logits = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                           attention_mask=attention_mask)[0]
            scores = np.squeeze(logits.detach().cpu().numpy())
            if len(logits) == 1:
                # squeeze of a single row yields a 0-d array; re-wrap it
                scores = [scores]
            ranked_results = []
            for sr, score in zip(all_scored_results, scores):
                ranked_results.append((sr.doc, score))

            k = msg.response.search.top_k
            top_k = sorted(ranked_results, key=lambda x: x[1], reverse=True)[:k]

            msg.response.search.ClearField('topk_results')
            for doc, score in top_k:
                sr = msg.response.search.topk_results.add()
                sr.score.value = float(score)
                sr.doc.CopyFrom(doc)
msg.response.search.ClearField('topk_results') 128 | for doc, score in top_k: 129 | sr = msg.response.search.topk_results.add() 130 | sr.score.value = float(score) 131 | sr.doc.CopyFrom(doc) 132 | -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "route", "--py_path", "resp_req.py"] -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/base.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/router/resp_req/base.yml -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/resp_req.py: -------------------------------------------------------------------------------- 1 | from gnes.router.base import BaseRouter 2 | from gnes.proto import gnes_pb2 3 | 4 | class RespReqRouter(BaseRouter): 5 | def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs): 6 | """ 7 | Log the incoming message 8 | :param msg: incoming message 9 | """ 10 | 11 | runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body') 12 | print('recieved msg') 13 | print(msg) 14 | print(runtime) 15 | if runtime == 'index': 16 | req = gnes_pb2.Message() -------------------------------------------------------------------------------- /koursaros/hub/tests/sonnets_small.txt: -------------------------------------------------------------------------------- 1 | From fairest creatures we desire increase, 2 | That thereby beauty's rose might never die, 3 | But as the riper should by time decease, 4 | His tender heir might bear his memory: 5 | But thou contracted to thine own bright eyes, 6 | Feed'st thy light's flame 
with self-substantial fuel, 7 | Making a famine where abundance lies, 8 | Thy self thy foe, to thy sweet self too cruel: 9 | Thou that art now the world's fresh ornament, 10 | And only herald to the gaudy spring, 11 | Within thine own bud buriest thy content, 12 | And, tender churl, mak'st waste in niggarding: 13 | Pity the world, or else this glutton be, 14 | To eat the world's due, by the grave and thee. 15 | 16 | When forty winters shall besiege thy brow, 17 | And dig deep trenches in thy beauty's field, 18 | Thy youth's proud livery so gazed on now, 19 | Will be a totter'd weed of small worth held: 20 | Then being asked, where all thy beauty lies, 21 | Where all the treasure of thy lusty days; 22 | To say, within thine own deep sunken eyes, 23 | Were an all-eating shame, and thriftless praise. 24 | How much more praise deserv'd thy beauty's use, 25 | If thou couldst answer 'This fair child of mine 26 | Shall sum my count, and make my old excuse,' 27 | Proving his beauty by succession thine! 28 | This were to be new made when thou art old, 29 | And see thy blood warm when thou feel'st it cold. 30 | 31 | Look in thy glass and tell the face thou viewest 32 | Now is the time that face should form another; 33 | Whose fresh repair if now thou not renewest, 34 | Thou dost beguile the world, unbless some mother. 35 | For where is she so fair whose uneared womb 36 | Disdains the tillage of thy husbandry? 37 | Or who is he so fond will be the tomb 38 | Of his self-love, to stop posterity? 39 | Thou art thy mother's glass and she in thee 40 | Calls back the lovely April of her prime; 41 | So thou through windows of thine age shalt see, 42 | Despite of wrinkles, this thy golden time. 43 | But if thou live, remembered not to be, 44 | Die single and thine image dies with thee. 45 | 46 | Unthrifty loveliness, why dost thou spend 47 | Upon thy self thy beauty's legacy? 
48 | Nature's bequest gives nothing, but doth lend, 49 | And being frank she lends to those are free: 50 | Then, beauteous niggard, why dost thou abuse 51 | The bounteous largess given thee to give? 52 | Profitless usurer, why dost thou use 53 | So great a sum of sums, yet canst not live? 54 | For having traffic with thy self alone, 55 | Thou of thy self thy sweet self dost deceive: 56 | Then how when nature calls thee to be gone, 57 | What acceptable audit canst thou leave? 58 | Thy unused beauty must be tombed with thee, 59 | Which, used, lives th' executor to be. 60 | 61 | Those hours, that with gentle work did frame 62 | The lovely gaze where every eye doth dwell, 63 | Will play the tyrants to the very same 64 | And that unfair which fairly doth excel; 65 | For never-resting time leads summer on 66 | To hideous winter, and confounds him there; 67 | Sap checked with frost, and lusty leaves quite gone, 68 | Beauty o'er-snowed and bareness every where: 69 | Then were not summer's distillation left, 70 | A liquid prisoner pent in walls of glass, 71 | Beauty's effect with beauty were bereft, 72 | Nor it, nor no remembrance what it was: 73 | But flowers distilled, though they with winter meet, 74 | Leese but their show; their substance still lives sweet. 75 | 76 | Then let not winter's ragged hand deface, 77 | In thee thy summer, ere thou be distilled: 78 | Make sweet some vial; treasure thou some place 79 | With beauty's treasure ere it be self-killed. 80 | That use is not forbidden usury, 81 | Which happies those that pay the willing loan; 82 | That's for thy self to breed another thee, 83 | Or ten times happier, be it ten for one; 84 | Ten times thy self were happier than thou art, 85 | If ten of thine ten times refigured thee: 86 | Then what could death do if thou shouldst depart, 87 | Leaving thee living in posterity? 88 | Be not self-willed, for thou art much too fair 89 | To be death's conquest and make worms thine heir. 90 | 91 | Lo! 
in the orient when the gracious light 92 | Lifts up his burning head, each under eye 93 | Doth homage to his new-appearing sight, 94 | Serving with looks his sacred majesty; 95 | And having climbed the steep-up heavenly hill, 96 | Resembling strong youth in his middle age, 97 | Yet mortal looks adore his beauty still, 98 | Attending on his golden pilgrimage: 99 | But when from highmost pitch, with weary car, 100 | Like feeble age, he reeleth from the day, 101 | The eyes, 'fore duteous, now converted are 102 | From his low tract, and look another way: 103 | So thou, thyself outgoing in thy noon 104 | Unlooked on diest unless thou get a son. -------------------------------------------------------------------------------- /koursaros/hub/tests/test_block.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from gnes.cli.parser import set_router_parser, _set_client_parser 5 | from gnes.service.router import RouterService 6 | from gnes.service.base import SocketType 7 | from gnes.client.base import ZmqClient 8 | from gnes.proto import gnes_pb2 9 | 10 | 11 | class TestBlock(unittest.TestCase): 12 | 13 | def setUp(self): 14 | dirname = os.path.dirname(__file__) 15 | self.rerank_router_yaml = os.path.join(dirname, '../', 'router/block/block_train.yml') 16 | self.python_code = os.path.join(dirname, '../', 'router/block/block.py') 17 | 18 | 19 | self.args = set_router_parser().parse_args([ 20 | '--yaml_path', self.rerank_router_yaml, 21 | '--socket_out', str(SocketType.PUB_BIND), 22 | '--py_path', self.python_code 23 | ]) 24 | self.c_args = _set_client_parser().parse_args([ 25 | '--port_in', str(self.args.port_out), 26 | '--port_out', str(self.args.port_in), 27 | '--socket_in', str(SocketType.SUB_CONNECT) 28 | ]) 29 | 30 | def test_block_router(self): 31 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 32 | msg = gnes_pb2.Message() 33 | msg.request.train.docs.add() 34 | 
c1.send_message(msg) 35 | msg = gnes_pb2.Message() 36 | msg.request.index.docs.add() 37 | c1.send_message(msg) 38 | r = c1.recv_message() 39 | -------------------------------------------------------------------------------- /koursaros/hub/tests/test_keyword.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from gnes.proto import gnes_pb2 5 | from gnes.client.base import ZmqClient 6 | from gnes.service.base import SocketType 7 | from gnes.cli.parser import set_router_parser, _set_client_parser 8 | from gnes.service.indexer import IndexerService 9 | import numpy as np 10 | 11 | 12 | class TestKeyword(unittest.TestCase): 13 | 14 | def setUp(self): 15 | dirname = os.path.dirname(__file__) 16 | self.yaml = os.path.join(dirname, 'yaml', 'test-keyword.yml') 17 | self.python_code = os.path.join(dirname, '../', 'indexer/keyword/keyword.py') 18 | 19 | self.test_str = [] 20 | self.test_vec = [] 21 | self._msl = 512 22 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 23 | for line in f: 24 | line = line.strip() 25 | if line == '': continue 26 | self.test_vec.append(np.frombuffer( 27 | line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)), 28 | dtype=np.uint8 29 | )) 30 | self.test_str.append(line) 31 | 32 | def test_keyword(self): 33 | args = set_router_parser().parse_args([ 34 | '--yaml_path', self.yaml, 35 | '--socket_out', str(SocketType.PUB_BIND), 36 | '--py_path', self.python_code, 37 | ]) 38 | args.as_response = True 39 | c_args = _set_client_parser().parse_args([ 40 | '--port_in', str(args.port_out), 41 | '--port_out', str(args.port_in), 42 | '--socket_in', str(SocketType.SUB_CONNECT) 43 | ]) 44 | with IndexerService(args), ZmqClient(c_args) as c1: 45 | msg = gnes_pb2.Message() 46 | for i, vec in enumerate(self.test_vec): 47 | doc = msg.request.index.docs.add() 48 | doc.doc_id = i 49 | doc.raw_text = self.test_str[i] 50 | c = doc.chunks.add() 51 | c.doc_id = i 52 | c.offset 
= 0 53 | c.embedding.data = vec.tobytes() 54 | for d in vec.shape: 55 | c.embedding.shape.extend([d]) 56 | c.embedding.dtype = str(vec.dtype) 57 | c.text = self.test_str[i] 58 | c1.send_message(msg) 59 | 60 | r = c1.recv_message() 61 | self.assert_(r.response.index) 62 | 63 | for i, vec in enumerate(self.test_vec): 64 | msg = gnes_pb2.Message() 65 | msg.request.search.query.doc_id = 1 66 | msg.request.search.top_k = 1 67 | c = msg.request.search.query.chunks.add() 68 | c.doc_id = 1 69 | c.embedding.data = vec.tobytes() 70 | for d in vec.shape: 71 | c.embedding.shape.extend([d]) 72 | c.embedding.dtype = str(vec.dtype) 73 | c.offset = 0 74 | c.weight = 1 75 | c.text = self.test_str[i] 76 | c1.send_message(msg) 77 | r = c1.recv_message() 78 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 79 | 80 | def tearDown(self): 81 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/test_reranker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import json 4 | 5 | from gnes.proto import gnes_pb2 6 | from gnes.client.base import ZmqClient 7 | from gnes.service.base import SocketType 8 | from gnes.cli.parser import set_router_parser, _set_client_parser 9 | from gnes.service.router import RouterService 10 | 11 | 12 | class TestReranker(unittest.TestCase): 13 | 14 | def setUp(self): 15 | dirname = os.path.dirname(__file__) 16 | self.rerank_router_yaml = os.path.join(dirname, 'yaml', 'test-reranker.yml') 17 | self.python_code = os.path.join(dirname, '../', 'router/rerank/rerank.py') 18 | 19 | self.test_str = [] 20 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 21 | for line in f: 22 | line = line.strip() 23 | if line: 24 | self.test_str.append(line) 25 | 26 | self.args = set_router_parser().parse_args([ 27 | '--yaml_path', self.rerank_router_yaml, 28 | '--socket_out', str(SocketType.PUB_BIND), 29 | '--py_path', 
self.python_code 30 | ]) 31 | self.c_args = _set_client_parser().parse_args([ 32 | '--port_in', str(self.args.port_out), 33 | '--port_out', str(self.args.port_in), 34 | '--socket_in', str(SocketType.SUB_CONNECT) 35 | ]) 36 | 37 | # @unittest.skip('SKIPPING TRAIN TEST') 38 | def test_rerank_train(self): 39 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 40 | msg = gnes_pb2.Message() 41 | msg.response.search.ClearField('topk_results') 42 | msg.request.search.query.raw_text = 'This is a query' 43 | 44 | for i, line in enumerate(self.test_str[:5]): 45 | s = msg.response.search.topk_results.add() 46 | s.score.value = 0.1 47 | s.doc.doc_id = i 48 | s.doc.raw_text = line 49 | 50 | msg.envelope.num_part.extend([1]) 51 | msg.response.search.top_k = 5 52 | c1.send_message(msg) 53 | 54 | r = c1.recv_message() 55 | print(r) 56 | 57 | msg = gnes_pb2.Message() 58 | 59 | for i, line in enumerate(self.test_str): 60 | doc = msg.request.train.docs.add() 61 | doc.doc_id = i 62 | doc.raw_bytes = json.dumps({ 63 | 'Query' : 'test query', 64 | 'Candidate' : line, 65 | 'Label' : 1.0 if i % 2 == 0 else 0.0 66 | }).encode('utf-8') 67 | 68 | msg.envelope.num_part.extend([1]) 69 | c1.send_message(msg) 70 | r = c1.recv_message() 71 | print(r) 72 | 73 | @unittest.skip("SKIPPING QUERY TEST") 74 | def test_rerank(self): 75 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 76 | msg = gnes_pb2.Message() 77 | msg.response.search.ClearField('topk_results') 78 | msg.request.search.query.raw_text = 'This is a query' 79 | 80 | for i, line in enumerate(self.test_str): 81 | s = msg.response.search.topk_results.add() 82 | s.score.value = 0.1 83 | s.doc.doc_id = i 84 | s.doc.raw_text = line 85 | 86 | msg.envelope.num_part.extend([1]) 87 | msg.response.search.top_k = 5 88 | c1.send_message(msg) 89 | 90 | r = c1.recv_message() 91 | # import pdb 92 | # pdb.set_trace() 93 | self.assertSequenceEqual(r.envelope.num_part, [1]) 94 | self.assertEqual(len(r.response.search.topk_results), 
5) 95 | 96 | msg = gnes_pb2.Message() 97 | msg.response.search.ClearField('topk_results') 98 | 99 | for i, line in enumerate(self.test_str[:1]): 100 | s = msg.response.search.topk_results.add() 101 | s.score.value = 0.1 102 | s.doc.doc_id = i 103 | s.doc.raw_text = line 104 | 105 | msg.envelope.num_part.extend([1]) 106 | msg.response.search.top_k = 5 107 | c1.send_message(msg) 108 | 109 | r = c1.recv_message() 110 | self.assertSequenceEqual(r.envelope.num_part, [1]) 111 | self.assertEqual(len(r.response.search.topk_results), 1) 112 | 113 | def tearDown(self): 114 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/test_textbyte_encoder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from koursaros.hub.encoder.textbyte.textbyte import TextByteEncoder 3 | import pathlib 4 | import csv 5 | 6 | import numpy as np 7 | 8 | class TestTextByte(unittest.TestCase): 9 | 10 | def setUp(self) -> None: 11 | self.msl = 1024 12 | self.model = TextByteEncoder(self.msl) 13 | self.path = pathlib.Path('reviews_sample.csv') 14 | self.csv = csv.DictReader(self.path.open()) 15 | 16 | def test_textbyte(self): 17 | to_encode = [] 18 | for row in self.csv: 19 | to_encode.append(list(row.values())[1]) 20 | vectors = self.model.encode(to_encode) 21 | for vec in vectors: 22 | self.assertEqual(len(vec), self.msl) 23 | for vector in vectors: 24 | self.decode_textbytes(vector) 25 | self.decode_textbytes(vectors) 26 | 27 | @staticmethod 28 | def decode_textbytes(vector: np.ndarray): 29 | return vector.tobytes().rstrip(b'\x00').decode() -------------------------------------------------------------------------------- /koursaros/hub/tests/test_whoosh.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest import SkipTest 4 | 5 | from gnes.proto import gnes_pb2 6 | from gnes.client.base import ZmqClient 7 
| from gnes.service.base import SocketType 8 | from gnes.cli.parser import set_router_parser, _set_client_parser 9 | from gnes.service.indexer import IndexerService 10 | import numpy as np 11 | 12 | 13 | class TestWhoosh(unittest.TestCase): 14 | 15 | def setUp(self): 16 | dirname = os.path.dirname(__file__) 17 | self.yaml = os.path.join(dirname, 'yaml', 'test-whoosh.yml') 18 | self.yaml_joint = os.path.join(dirname, 'yaml', 'test-joint.yml') 19 | self.python_code = os.path.join(dirname, '../', 'indexer/whoosh/whoosh.py') 20 | 21 | self.test_str = [] 22 | self.test_vec = [] 23 | self._msl = 512 24 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 25 | for line in f: 26 | line = line.strip() 27 | if line == '': continue 28 | self.test_vec.append(np.frombuffer( 29 | line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)), 30 | dtype=np.uint8 31 | )) 32 | self.test_str.append(line) 33 | 34 | def test_whoosh(self): 35 | args = set_router_parser().parse_args([ 36 | '--yaml_path', self.yaml, 37 | '--socket_out', str(SocketType.PUB_BIND), 38 | '--py_path', self.python_code, 39 | ]) 40 | args.as_response = True 41 | c_args = _set_client_parser().parse_args([ 42 | '--port_in', str(args.port_out), 43 | '--port_out', str(args.port_in), 44 | '--socket_in', str(SocketType.SUB_CONNECT) 45 | ]) 46 | with IndexerService(args), ZmqClient(c_args) as c1: 47 | msg = gnes_pb2.Message() 48 | for i, vec in enumerate(self.test_vec): 49 | doc = msg.request.index.docs.add() 50 | doc.doc_id = i 51 | doc.raw_text = self.test_str[i] 52 | c = doc.chunks.add() 53 | c.doc_id = i 54 | c.offset = 0 55 | c.embedding.data = vec.tobytes() 56 | for d in vec.shape: 57 | c.embedding.shape.extend([d]) 58 | c.embedding.dtype = str(vec.dtype) 59 | c.text = self.test_str[i] 60 | c1.send_message(msg) 61 | 62 | r = c1.recv_message() 63 | self.assert_(r.response.index) 64 | 65 | for i, vec in enumerate(self.test_vec): 66 | msg = gnes_pb2.Message() 67 | msg.request.search.query.doc_id = 1 68 | 
msg.request.search.top_k = 1 69 | c = msg.request.search.query.chunks.add() 70 | c.doc_id = 1 71 | c.embedding.data = vec.tobytes() 72 | for d in vec.shape: 73 | c.embedding.shape.extend([d]) 74 | c.embedding.dtype = str(vec.dtype) 75 | c.offset = 0 76 | c.weight = 1 77 | c.text = self.test_str[i] 78 | c1.send_message(msg) 79 | r = c1.recv_message() 80 | import pdb 81 | pdb.set_trace() 82 | try: 83 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 84 | except: 85 | pass 86 | 87 | @SkipTest 88 | def test_joint(self): 89 | args = set_router_parser().parse_args([ 90 | '--yaml_path', self.yaml_joint, 91 | '--socket_out', str(SocketType.PUB_BIND), 92 | '--py_path', self.python_code, 93 | ]) 94 | args.as_response = True 95 | c_args = _set_client_parser().parse_args([ 96 | '--port_in', str(args.port_out), 97 | '--port_out', str(args.port_in), 98 | '--socket_in', str(SocketType.SUB_CONNECT) 99 | ]) 100 | with IndexerService(args), ZmqClient(c_args) as c1: 101 | msg = gnes_pb2.Message() 102 | for i, vec in enumerate(self.test_vec): 103 | doc = msg.request.index.docs.add() 104 | doc.doc_id = i 105 | doc.raw_text = self.test_str[i] 106 | c = doc.chunks.add() 107 | c.doc_id = i 108 | c.offset = 0 109 | c.embedding.data = vec.tobytes() 110 | for d in vec.shape: 111 | c.embedding.shape.extend([d]) 112 | c.embedding.dtype = str(vec.dtype) 113 | c.text = self.test_str[i] 114 | c1.send_message(msg) 115 | 116 | r = c1.recv_message() 117 | self.assert_(r.response.index) 118 | 119 | for i, vec in enumerate(self.test_vec): 120 | msg = gnes_pb2.Message() 121 | msg.request.search.query.doc_id = 1 122 | msg.request.search.top_k = 1 123 | c = msg.request.search.query.chunks.add() 124 | c.doc_id = 1 125 | c.embedding.data = vec.tobytes() 126 | for d in vec.shape: 127 | c.embedding.shape.extend([d]) 128 | c.embedding.dtype = str(vec.dtype) 129 | c.offset = 0 130 | c.weight = 1 131 | c.text = self.test_str[i] 132 | c1.send_message(msg) 133 | r = c1.recv_message() 134 | try: 
135 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 136 | except: 137 | pass 138 | 139 | def tearDown(self): 140 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-joint.yml: -------------------------------------------------------------------------------- 1 | !JointIndexer 2 | components: 3 | - !RocksDBIndexer 4 | parameters: 5 | data_path: ./idx.doc_content 6 | - !WhooshIndexer 7 | parameters: 8 | data_path: ./idx.whoosh -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-keyword.yml: -------------------------------------------------------------------------------- 1 | !KeywordIndexer {} -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-reranker.yml: -------------------------------------------------------------------------------- 1 | !RerankRouter 2 | parameters: 3 | model_name: bert-base-uncased -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-whoosh.yml: -------------------------------------------------------------------------------- 1 | !WhooshIndexer 2 | parameters: 3 | data_path: ./idx.doc_content -------------------------------------------------------------------------------- /koursaros/repo_creds/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | This module allows you to pull secure credentials into your python 5 | script. It assumes that you create a private git repository with 6 | your credentials in them prior to using get_creds(). 
7 | 8 | ## At a glance 9 | 10 | You can create a repository that looks like this: 11 | 12 | ``` 13 | creds 14 | ├── creds.yaml 15 | ├── google 16 | │ └── bluehat.json 17 | └── postgres 18 | └── postgres.pem 19 | ``` 20 | 21 | And a `creds.yaml` that looks like this: 22 | ```yaml 23 | creds: 24 | postgres: 25 | host: !!str 12.345.678.910 26 | username: !!str postgres 27 | password: !!str my_password 28 | replicas: !!int 5 29 | dbname: !!str fever 30 | sslmode: !!str verify-ca 31 | sslrootcert: !file postgres/postgres.pem 32 | google: 33 | app_creds: !file google/bluehat.json 34 | ``` 35 | 36 | Let's say the repo you make is `madhatter/creds`, my username is `alice` and password is `cheshire`. 37 | You can get your credentials in a python script by doing the following: 38 | ```python 39 | from koursaros.credentials import get_creds 40 | from sys import argv 41 | 42 | # retrieve repo creds by adding login to script 43 | creds = get_creds('alice:cheshire@madhatter/creds') 44 | # or with cmd line args 45 | creds = get_creds(argv[1]) 46 | # NOTE: you don't need to log in if your git credentials are stored locally 47 | 48 | 49 | # the !! denotes native python types. You can access them like: 50 | creds.postgres.password # my_password 51 | creds.postgres.replicas # 5 52 | 53 | # the special !file tag means that it is a file. You can access 54 | # three attributes from file objects (path, bytes, text): 55 | creds.google.app_creds.path # '/absolute/path/to/google/app_creds/bluehat.json' 56 | creds.google.app_creds.bytes # b'{"client_id": "293480342342034"}' 57 | creds.google.app_creds.text # '{"client_id": "293480342342034"}' 58 | ``` 59 | 60 | ## How it works 61 | The `get_creds()` function clones the specified repo and caches it to the koursaros.credentials 62 | directory. If the creds repo already exists, the repo is git pulled. 
-------------------------------------------------------------------------------- /koursaros/repo_creds/__init__.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | from box import Box 4 | import git 5 | 6 | 7 | DIR = Path(__file__).parent.absolute() 8 | 9 | 10 | class FileCred(yaml.YAMLObject): 11 | yaml_loader = yaml.SafeLoader 12 | yaml_tag = '!file' 13 | 14 | def __init__(self, relative_path): 15 | path = self.repo_path.joinpath(relative_path) 16 | self.bytes = path.read_bytes() 17 | self.text = path.read_text() 18 | self.path = str(path) 19 | 20 | @classmethod 21 | def from_yaml(cls, loader, node): 22 | return cls(node.value) 23 | 24 | @classmethod 25 | def set_repo_path(cls, repo_path): 26 | cls.repo_path = repo_path 27 | 28 | 29 | def get_creds(git_dsn): 30 | login, repo = git_dsn.split('@') 31 | login += '@' 32 | repo_path = DIR.joinpath(repo) 33 | repo_path.parent.mkdir(exist_ok=True) 34 | FileCred.set_repo_path(repo_path) 35 | 36 | if repo_path.exists(): 37 | g = git.Git(repo_path) 38 | g.pull() 39 | else: 40 | g = git.Git(repo_path.parent) 41 | g.clone("https://%sgithub.com/%s" % (login, repo)) 42 | 43 | creds = yaml.safe_load(repo_path.joinpath('creds.yaml').read_text()) 44 | return Box(creds['creds']) 45 | 46 | 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | distro==1.4.0 2 | python-box 3 | tqdm 4 | torch 5 | transformers 6 | termcolor -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | setup( 5 | name='koursaros', 6 | packages=find_packages(), 7 | include_package_data=True, 8 | version='0.0.1', 9 | license='MIT', 10 | 
description='Koursaros is a distributed, cloud-' 11 | 'native platform for developing and deploying ' 12 | 'automated information retrieval and inference applications.', 13 | long_description=Path('README.md').read_text(), 14 | author='Koursaros', 15 | author_email='cole.thienes@gmail.com', 16 | url='https://github.com/koursaros-ai/koursaros', 17 | # download_url='https://github.com/koursaros-ai/koursaros/archive/0.0.1.tar.gz', 18 | keywords=['koursaros', 'distributed', 'cloud-native', 'neural', 'inference'], 19 | install_requires=[ 20 | 'PyYAML', 'gitpython', 'python-box', 'gnes', 'tqdm', 'tabulate', 'click'], 21 | entry_points={'console_scripts': ['kctl=koursaros.cli.__main__:main']}, 22 | classifiers=[ 23 | 'Intended Audience :: Education', 24 | 'Intended Audience :: Science/Research', 25 | 'Intended Audience :: Developers', 26 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 27 | 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 28 | 'Topic :: Scientific/Engineering', 29 | 'Topic :: Scientific/Engineering :: Mathematics', 30 | 'Topic :: Software Development', 31 | 'Topic :: Software Development :: Libraries', 32 | 'Topic :: Software Development :: Libraries :: Python Modules', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tutorials/deploy_custom_model.md: -------------------------------------------------------------------------------- 1 | # Training + Deploying a Custom Transformer Model in 5 Minutes 2 | 3 | ## Training Sentence Classification or Regression 4 | 5 | Make sure you've installed the koursaros training package. 6 | 7 | Create a .yaml file for your model in the /services directory. 
Your project should look like:

```
|-bases/
|-pipelines/
|---my_pipeline.yaml
|-services/
|---[name].yaml
```

For loading mnli from a postgres table, the yaml file should look like this:
```yaml
service:
  base:
    repo: gs://
  task:
    labels: # if classification, else nothing
      - neutral
      - contradiction
      - entailment
  training:
    checkpoint: bert-base-uncased # see transformers for options, or use custom filename
    epochs: 3
    learning_rate: 1e-05
```

### Loading data from postgresql

For loading training data from postgres (recommended), add this to the service yaml. Adjust the schema and tables to point to your train / test data.
```yaml
data:
  source: postgres
  schema: mnli
  train: train_set
  test: test_set
```

And adjust your environment variables accordingly:
```bash
export PGHOST=
export PGUSER=
export PGPASS=
export PGDBNAME=
# for ssl
export PGSSLMODE=verify-ca
export PGSSLROOTCERT=
```

### Loading data from tsv / excel

```yaml
data:
  source: tsv
  train: train_set.tsv
  test: test_set.tsv
```

---

**NOTE**

The format for tables or TSV files for training should be ``

---

### Run training and push model to bucket

Run `kctl train services/mnli.yaml`. The model will be cached locally, unless you specify a google storage bucket to upload to for deployment. Read about authentication in the google cloud storage API.
75 | 76 | ## Deploying 77 | 78 | 79 | 80 | ### Set up App 81 | 82 | ## -------------------------------------------------------------------------------- /tutorials/fact_check.md: -------------------------------------------------------------------------------- 1 | # Creating a SoTA Production Fact Checker from Wikipedia 2 | 3 | ## Create App 4 | ## Train or Download Pretrained Models 5 | ## Dump Wikipedia to Elastic Search 6 | ## Benchmark -------------------------------------------------------------------------------- /utils/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from koursaros.modeling.models import MODELS 2 | from koursaros.yamls import Yaml 3 | from kctl.logger import set_logger 4 | 5 | logger = set_logger('MODELS') 6 | 7 | def model_filename_resolver(name): 8 | if name.split('.')[-1] == 'yaml': 9 | return name 10 | return f'./services/{name}.yaml' 11 | 12 | def model_from_yaml(filename, **kwargs): 13 | config = Yaml(filename) 14 | return model_from_config(config, **kwargs) 15 | 16 | def model_from_config(config, training=False): 17 | for model_class in MODELS: 18 | if config.arch in model_class.architectures(): 19 | model = model_class(config, training) 20 | logger.info('Loaded model {}'.format(config.arch)) 21 | return model 22 | logger.error('Unsupported model architecture {}'.format(config.arch)) 23 | raise NotImplementedError() 24 | -------------------------------------------------------------------------------- /utils/modeling/data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | def get_rows_from_tsv(fname): 4 | if fname.split('.')[-1] == 'tsv': 5 | delimiter = '\t' 6 | else: 7 | delimiter = ',' 8 | with open(fname) as file: 9 | return csv.reader(file, delimiter=delimiter) 10 | 11 | def select_all(schema, table, random=True): 12 | query = f'select * from {schema}.{table}' 13 | if random: 14 | query += ' order by random()' 15 | 
return query -------------------------------------------------------------------------------- /utils/modeling/migrating.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import transformers 3 | from fairseq.models import roberta 4 | from fairseq.data.data_utils import collate_tokens 5 | import time 6 | import torch.nn.functional as F 7 | import torch.hub 8 | 9 | # def roberta_to_transformer(path_to_roberta, path_to_yaml): 10 | # model = RobertaModel.from_pretrained(path_to_roberta, checkpoint_file='model.pt') 11 | # model.eval() 12 | 13 | MAX_LENGTH = 256 14 | PAD = True 15 | 16 | def predict_transformers(model, tokenizer): 17 | def predict_fn(*args): 18 | inputs = time_fn(transformers_encode_batch, tokenizer, *args) 19 | inputs_dict = { 20 | 'input_ids': inputs[0], 21 | 'attention_mask': inputs[1], 22 | 'token_type_ids': inputs[2] 23 | } 24 | outputs = model(**inputs_dict) 25 | logits = outputs[0] 26 | preds = F.log_softmax(logits, dim=-1) 27 | return preds.tolist() 28 | return predict_fn 29 | 30 | 31 | def predict_roberta(model): 32 | def pred_fn(*args): 33 | batch = time_fn(collate_tokens, [model.encode(*arg)[:MAX_LENGTH] for arg in zip(*args)], pad_idx=1) 34 | labels = model.predict('mnli', *batch).tolist() 35 | return labels 36 | return pred_fn 37 | 38 | 39 | def benchmark(pred_fn, n): 40 | args = 'All work and no play.', 'Make jack a very dull boy.' 
def benchmark_mnli(samples):
    """Benchmark transformers vs. torch.hub RoBERTa-large-MNLI inference.

    Downloads both model variants, builds a prediction function for each,
    and times `samples` prediction calls per framework.

    :param samples: number of prediction calls to time per framework
    """
    torch_hub_model = time_fn(torch.hub.load, 'pytorch/fairseq', 'roberta.large.mnli')
    try:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli')
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. A corrupt cache raises an ordinary Exception,
    # so catch that and retry with a forced re-download.
    except Exception:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli', force_download=True)
    transformers_tokenizer = time_fn(transformers.RobertaTokenizer.from_pretrained, 'roberta-large-mnli')
    pred_functions = {
        'transformers': predict_transformers(transformers_model, transformers_tokenizer),
        'torch_hub': predict_roberta(torch_hub_model)
    }
    for framework, pred_fn in pred_functions.items():
        print(f'Benchmarking {framework} with {samples} samples')
        time_fn(benchmark, pred_fn, samples)
def transformers_encode_batch(tokenizer, *args):
    """Tokenize aligned argument sequences into batched model inputs.

    :param tokenizer: a HuggingFace tokenizer
    :param args: one or more aligned sequences (e.g. premises, hypotheses)
    :return: (input_ids, attention_masks, token_type_ids) lists, one entry
        per sample
    """
    # Convert each aligned sample into a single feature triple.
    features = [transformer_to_features(tokenizer, *sample) for sample in zip(*args)]
    batch_ids = [ids for ids, _, _ in features]
    batch_masks = [mask for _, mask, _ in features]
    batch_segments = [seg for _, _, seg in features]
    return batch_ids, batch_masks, batch_segments
    def get_data(self):
        """
        Get training data based on yaml config and connection.

        Reads ``self.config.training.data`` and loads the train/test splits
        either from Postgres (via a fresh ``Conn``) or from delimited
        tsv/csv files on disk.

        :return: tuple of (train_rows, test_rows) iterables
        """
        data = self.config.training.data
        if data.source == 'postgres':
            # Pull both splits from the configured schema. Rows come back
            # shuffled because select_all() defaults to ORDER BY random().
            p = Conn()
            query_fn = p.query
            return query_fn(select_all(data.schema, data.train)), \
                   query_fn(select_all(data.schema, data.test))
        else:
            # Otherwise treat data.train / data.test as local file paths.
            return get_rows_from_tsv(data.train), get_rows_from_tsv(data.test)
    def save_model(self):
        """Persist the trained model to disk; subclasses must implement."""
        # append hash of yaml to model checkpoint
        raise NotImplementedError()

    @staticmethod
    def architectures():
        """Return the list of architecture names this class supports."""
        raise NotImplementedError()

    def getInputProto(self):
        # NOTE(review): presumably returns the protobuf message type for
        # this model's inputs — confirm against callers.
        raise NotImplementedError()

    def getOutputProto(self):
        # NOTE(review): presumably returns the protobuf message type for
        # this model's outputs — confirm against callers.
        raise NotImplementedError()
25 | The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the 26 | remainder of the story. 1883 Western Siberia, 27 | a young Grigori Rasputin is asked by his father and a group of men to perform magic. 28 | Rasputin has a vision and denounces one of the men as a horse thief. Although his 29 | father initially slaps him for making such an accusation, Rasputin watches as the 30 | man is chased outside and beaten. Twenty years later, Rasputin sees a vision of 31 | the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, 32 | with people, even a bishop, begging for his blessing. """ 33 | 34 | MODEL_CLASSES = { 35 | 'gpt2': (GPT2LMHeadModel, GPT2Tokenizer), 36 | 'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 37 | 'xlnet-gen': (XLNetLMHeadModel, XLNetTokenizer), 38 | 'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer), 39 | 'xlm-gen': (XLMWithLMHeadModel, XLMTokenizer), 40 | } 41 | 42 | class GenerativeTransformer(Model): 43 | 44 | def __init__(self, *args): 45 | super().__init__(*args) 46 | model_class, tokenizer_class = MODEL_CLASSES[self.config.base] 47 | self.model = model_class.from_pretrained(self.config.checkpoint) 48 | self.tokenizer = tokenizer_class.from_pretraiend(self.config.checkpoint) 49 | 50 | def set_seed(self, args): 51 | np.random.seed(args.seed) 52 | torch.manual_seed(args.seed) 53 | if args.n_gpu > 0: 54 | torch.cuda.manual_seed_all(args.seed) 55 | 56 | def top_k_top_p_filtering(self, logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 57 | """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 58 | Args: 59 | logits: logits distribution shape (vocabulary size) 60 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 61 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 62 | Nucleus filtering is described in Holtzman et al. 
    def sample_sequence(self, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.9, is_xlnet=False,
                        xlm_lang=None, device='cpu'):
        """
        Autoregressively sample `length` tokens conditioned on `context`.

        :param length: number of new tokens to generate
        :param context: list of context token ids
        :param num_samples: number of parallel sequences to sample
        :param temperature: logit temperature (1 = unmodified)
        :param top_k: keep only the k highest-probability tokens (0 = off)
        :param top_p: nucleus-sampling cumulative-probability cutoff
        :param is_xlnet: use XLNet's masked-token prediction inputs
        :param xlm_lang: optional XLM language id added as `langs` input
        :param device: torch device string for the generated tensors
        :return: tensor of shape (num_samples, len(context) + length)
        """
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0).repeat(num_samples, 1)
        generated = context
        with torch.no_grad():
            for _ in trange(length):

                inputs = {'input_ids': generated}
                if is_xlnet:
                    # XLNet is a direct (predict same token, not next token) and bi-directional model by default
                    # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
                    input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
                    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float,
                                            device=device)
                    perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
                    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
                    target_mapping[0, 0, -1] = 1.0  # predict last token
                    inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

                if xlm_lang is not None:
                    # XLM models take a per-position language-id tensor.
                    inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

                outputs = self.model(
                    **inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
                # Take the logits for the final position and rescale by temperature.
                next_token_logits = outputs[0][0, -1, :] / temperature
                filtered_logits = self.top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                # Sample one token from the filtered distribution and append it.
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
        return generated
# Maps architecture name -> (config class, sequence-classification model,
# tokenizer class) from the transformers library.
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}
| self.model.eval() 58 | # self.trace_model() 59 | if self.config.task == 'classification': 60 | self.best_checkpoint_metric = 'acc' 61 | elif self.config.task == 'regression': 62 | self.best_checkpoint_metric = 'loss' 63 | 64 | def inputs_from_batch(self, batch): 65 | inputs = {'input_ids': batch[0], 66 | 'attention_mask': batch[1]} 67 | if self.config.arch != 'distilbert': 68 | inputs['token_type_ids'] = batch[2] if self.config.arch in ['bert', 69 | 'xlnet'] else None 70 | if len(batch) > 3: 71 | inputs['labels'] = batch[3] 72 | return inputs 73 | 74 | def tuple_inputs(self, inputs): 75 | return ( 76 | inputs['input_ids'], 77 | inputs['attention_mask'], 78 | inputs['token_type_ids'] 79 | ) 80 | 81 | def trace_model(self): 82 | examples = [ 83 | InputExample( 84 | guid=1, 85 | text_a="Once upon a time there was a boy", 86 | text_b="He liked to write code all day long" 87 | ) 88 | ] 89 | features = [self.example_to_feature(example) for example in examples] 90 | all_inputs = self.features_to_inputs(features, True) 91 | inputs = self.inputs_from_batch(all_inputs) 92 | self.model = torch.jit.trace(self.model, self.tuple_inputs(inputs)) 93 | 94 | def train(self, force_build_features=False): 95 | return self.do_train(force_build_features=force_build_features) 96 | # except: 97 | # logger.warning('Error during training, decrease batch size and try again') 98 | # raise SystemError() 99 | # self.batch_size = self.batch_size // 2 # back off batch_size 100 | # return self.train(force_build_features=True) 101 | 102 | def do_train(self, force_build_features=False): 103 | ### In Transformers, optimizer and schedules are splitted and instantiated like this: 104 | 105 | tb_writer = SummaryWriter() 106 | 107 | train_dataset, test_dataset = self.get_data() 108 | train_dataset = self.load_and_cache_examples(train_dataset, force_build_features=force_build_features) 109 | epochs = int(self.config.training.epochs) 110 | optimizer = AdamW(self.model.parameters(), 
lr=float(self.config.training.learning_rate), 111 | correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False 112 | num_warmup_steps = int(0.06 * len(train_dataset)) 113 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, 114 | t_total=(self.config.training.epochs * len(train_dataset) / self.batch_size)) 115 | 116 | train_sampler = RandomSampler(train_dataset) 117 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size) 118 | 119 | t_total = len(train_dataloader) 120 | 121 | # Prepare optimizer and schedule (linear warmup and decay) 122 | no_decay = ['bias', 'LayerNorm.weight'] 123 | optimizer_grouped_parameters = [ 124 | {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 125 | 'weight_decay': self.weight_decay}, 126 | {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 127 | 'weight_decay': 0.0} 128 | ] 129 | 130 | try: 131 | from apex import amp 132 | model, optimizer = amp.initialize(self.model, optimizer) 133 | self.fp16 = True 134 | except ImportError: 135 | logger.warning("Please install fp16 from https://github.com/NVIDIA/apex for better performance") 136 | self.fp16 = False 137 | 138 | # Train! 139 | logger.info("***** Running training *****") 140 | logger.info(" Num examples = %d" % len(train_dataset)) 141 | logger.info(" Num Epochs = %d" % epochs) 142 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d" % 143 | self.batch_size * ( 144 | torch.distributed.get_world_size() if self.local_rank != -1 else 1)) 145 | logger.info(" Total optimization steps = %d" % t_total) 146 | 147 | if not 'eval_freq' in self.config.training: 148 | self.eval_freq = 2 149 | else: 150 | self.eval_freq = self.config.training.eval_freq 151 | 152 | self.eval_and_save_every = len(train_dataset) // self.batch_size // self.eval_freq 153 | 154 | global_step = 0 155 | tr_loss, logging_loss = 0.0, 0.0 156 | self.model.zero_grad() 157 | label_count = [0] * len(self.config.labels) 158 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.local_rank not in [-1, 0]) 159 | num_correct = 0 160 | prev_best = None 161 | for step, batch in enumerate(epoch_iterator): 162 | self.model.train() 163 | correct_labels = batch[3] 164 | batch = tuple(t.to(self.device) for t in batch) 165 | 166 | inputs = self.inputs_from_batch(batch) 167 | outputs = self.model(**inputs) 168 | loss = outputs[0] # model outputs are always tuple in transformers (see doc) 169 | logits = outputs[1] 170 | preds = logits.detach().cpu().numpy() 171 | preds = np.argmax(preds, axis=1) 172 | for pred in preds: 173 | label_count[pred] += 1 174 | num_correct += np.sum(preds == correct_labels.detach().cpu().numpy()) 175 | if step > 0: 176 | epoch_iterator.set_description("Accuracy: %.2f Label Counts: %s" 177 | % (num_correct / (step*self.batch_size), label_count)) 178 | epoch_iterator.refresh() # to show immediately the update 179 | 180 | if self.n_gpu > 1: 181 | loss = loss.mean() # mean() to average on multi-gpu parallel training 182 | 183 | if self.fp16: 184 | with amp.scale_loss(loss, optimizer) as scaled_loss: 185 | scaled_loss.backward() 186 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm) 187 | else: 188 | loss.backward() 189 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) 190 | 191 | tr_loss += loss.item() 192 | 
if (step + 1) % self.gradient_accumulation_steps == 0: 193 | optimizer.step() 194 | scheduler.step() # Update learning rate schedule 195 | self.model.zero_grad() 196 | global_step += 1 197 | 198 | if self.local_rank in [-1, 0] and global_step % self.eval_and_save_every == 0: 199 | # Log metrics 200 | if self.local_rank == -1 and self.evaluate_during_training: 201 | results = self.evaluate(test_dataset) 202 | for key, value in results.items(): 203 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 204 | tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) 205 | tb_writer.add_scalar('loss', (tr_loss - logging_loss) / self.eval_and_save_every, global_step) 206 | logging_loss = tr_loss 207 | if prev_best is None or results[self.best_checkpoint_metric] > prev_best: 208 | prev_best = results[self.best_checkpoint_metric] 209 | self.save_model() 210 | 211 | if self.local_rank in [-1, 0]: 212 | tb_writer.close() 213 | 214 | result = self.evaluate(test_dataset) 215 | if prev_best is None or result[self.best_checkpoint_metric] > prev_best: 216 | self.save_model() 217 | 218 | return global_step, tr_loss / global_step 219 | 220 | def save_model(self): 221 | # Save model checkpoint 222 | model_to_save = self.model.module if hasattr(self.model, 223 | 'module') else self.model 224 | model_to_save.save_pretrained(self.ckpt_dir) 225 | self.tokenizer.save_pretrained(self.ckpt_dir) 226 | 227 | def evaluate(self, test_dataset): 228 | eval_dataset = self.load_and_cache_examples(test_dataset, evaluate=True) 229 | eval_output_dir = os.path.join(self.data_dir, 'eval') 230 | 231 | if not os.path.exists(eval_output_dir) and self.local_rank in [-1, 0]: 232 | os.makedirs(eval_output_dir) 233 | 234 | # Note that DistributedSampler samples randomly 235 | eval_sampler = SequentialSampler(eval_dataset) if self.local_rank == -1 else DistributedSampler( 236 | eval_dataset) 237 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.batch_size) 238 
| 239 | # Eval! 240 | logger.info("***** Running evaluation *****") 241 | logger.info(" Num examples = %d" % len(eval_dataset)) 242 | logger.info(" Batch size = %d" % self.batch_size) 243 | eval_loss = 0.0 244 | nb_eval_steps = 0 245 | preds = None 246 | out_label_ids = None 247 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 248 | self.model.eval() 249 | batch = tuple(t.to(self.device) for t in batch) 250 | 251 | with torch.no_grad(): 252 | inputs = {'input_ids': batch[0], 253 | 'attention_mask': batch[1], 254 | 'labels': batch[3]} 255 | if self.config.arch != 'distilbert': 256 | inputs['token_type_ids'] = batch[2] if self.config.arch in ['bert', 257 | 'xlnet'] else None 258 | outputs = self.model(**inputs) 259 | tmp_eval_loss, logits = outputs[:2] 260 | 261 | eval_loss += tmp_eval_loss.mean().item() 262 | nb_eval_steps += 1 263 | if preds is None: 264 | preds = logits.detach().cpu().numpy() 265 | out_label_ids = inputs['labels'].detach().cpu().numpy() 266 | else: 267 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 268 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 269 | 270 | eval_loss = eval_loss / nb_eval_steps 271 | result = { 272 | "loss": eval_loss 273 | } 274 | if self.config.task == "classification": 275 | preds = np.argmax(preds, axis=1) 276 | result['acc'] = np.sum(preds == out_label_ids) / len(preds) 277 | elif self.config.task == "regression": 278 | preds = np.squeeze(preds) 279 | 280 | output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") 281 | with open(output_eval_file, "w") as writer: 282 | logger.info("***** Eval results *****") 283 | for key in sorted(result.keys()): 284 | logger.info(" %s = %s" % (key, str(result[key]))) 285 | writer.write("%s = %s\n" % (key, str(result[key]))) 286 | 287 | return result 288 | 289 | def example_to_feature(self, example): 290 | inputs = self.tokenizer.encode_plus( 291 | example.text_a, 292 | example.text_b, 293 | 
add_special_tokens=True, 294 | max_length=self.max_length, 295 | truncate_first_sequence=True # We're truncating the first sequence in priority 296 | ) 297 | input_ids, token_type_ids = inputs["input_ids"][:self.max_length], \ 298 | inputs["token_type_ids"][:self.max_length] 299 | 300 | attention_mask = [1] * len(input_ids) 301 | 302 | # Zero-pad up to the sequence length. 303 | if self.pad: 304 | padding_length = self.max_length - len(input_ids) 305 | if self.pad_on_left: 306 | input_ids = ([self.pad_token] * padding_length) + input_ids 307 | attention_mask = ([0] * padding_length) + attention_mask 308 | token_type_ids = ([self.pad_token_segment_id] * padding_length) + token_type_ids 309 | else: 310 | input_ids = input_ids + ([self.pad_token] * padding_length) 311 | attention_mask = attention_mask + ([0] * padding_length) 312 | token_type_ids = token_type_ids + ([self.pad_token_segment_id] * padding_length) 313 | 314 | if example.label is not None: 315 | if self.config.task == "classification": 316 | if example.label in self.label_map: 317 | label = self.label_map[example.label] 318 | else: 319 | logger.warning("UNKNOWN LABEL %s, ignoring" % example.label) 320 | return 321 | elif self.config.task == "regression": 322 | label = float(example.label) 323 | else: 324 | logger.error("Only supported tasks are classification and regression") 325 | raise NotImplementedError() 326 | else: 327 | label = None 328 | 329 | return InputFeatures(input_ids=input_ids, 330 | attention_mask=attention_mask, 331 | token_type_ids=token_type_ids, 332 | label=label) 333 | 334 | def features_to_inputs(self, features, inference): 335 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(self.device) 336 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long).to(self.device) 337 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long).to(self.device) 338 | if not inference: 339 | if 
self.config.task == "classification": 340 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long).to(self.device) 341 | elif self.config.task == "regression": 342 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float).to(self.device) 343 | else: 344 | raise NotImplementedError() 345 | return all_input_ids, all_attention_mask, all_token_type_ids, all_labels 346 | else: 347 | return all_input_ids, all_attention_mask, all_token_type_ids 348 | 349 | 350 | def load_and_cache_examples(self, data, evaluate=False, force_build_features=False): 351 | if self.local_rank not in [-1, 0] and not evaluate: 352 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 353 | 354 | cached_features_file = os.path.join(self.data_dir, 'features' if not evaluate else 'eval-features') 355 | if os.path.exists(os.path.join(cached_features_file)) and not force_build_features: 356 | logger.info("Loading features from cached file %s" % cached_features_file) 357 | features = torch.load(cached_features_file) 358 | else: 359 | logger.info("Creating features from dataset file at %s" % cached_features_file) 360 | 361 | examples = [ 362 | InputExample(guid=i, 363 | text_a=ex[0], 364 | text_b=ex[1] if len(ex) == 3 else None, 365 | label=ex[-1]) for i, ex in enumerate(data) 366 | ] 367 | 368 | features = [] 369 | for (ex_index, example) in enumerate(examples): 370 | if ex_index % 10000 == 0: 371 | logger.info("Writing example %d" % (ex_index)) 372 | features.append(self.example_to_feature(example)) 373 | 374 | if self.local_rank in [-1, 0]: 375 | logger.info("Saving features into cached file %s" % cached_features_file) 376 | torch.save(features, cached_features_file) 377 | 378 | if self.local_rank == 0 and not evaluate: 379 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 380 | 381 | 
# Convert to Tensors and build dataset 382 | dataset = TensorDataset(*self.features_to_inputs(features, False)) 383 | return dataset 384 | 385 | def pred_from_output(self, outputs): 386 | logits = outputs[0] 387 | preds = logits.detach().cpu().numpy() 388 | if self.config.task == 'classification': 389 | preds = np.argmax(preds, axis=1) 390 | return [self.config.labels[int(pred)] for pred in preds] 391 | elif self.config.task == 'regression': 392 | return np.squeeze(preds) 393 | else: 394 | raise NotImplementedError() 395 | 396 | def run(self, *args): 397 | examples = [ 398 | InputExample( 399 | guid=str(i), 400 | text_a=arg[0], 401 | text_b=None if len(arg) < 2 else arg[1] 402 | ) for i, arg in enumerate(zip(*args)) 403 | ] 404 | features = [self.example_to_feature(example) for example in examples] 405 | all_inputs = self.features_to_inputs(features, True) 406 | inputs = self.inputs_from_batch(all_inputs) 407 | outputs = self.model(*self.tuple_inputs(inputs)) 408 | return self.pred_from_output(outputs) 409 | 410 | def multi_gpu_training(self): 411 | # multi-gpu training (should be after apex fp16 initialization) 412 | if self.n_gpu > 1: 413 | model = torch.nn.DataParallel(self.model) 414 | # Distributed training (should be after apex fp16 initialization) 415 | if self.local_rank != -1: 416 | model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.local_rank], 417 | output_device=self.local_rank, 418 | find_unused_parameters=True) 419 | 420 | @staticmethod 421 | def architectures(): 422 | return list(MODEL_CLASSES.keys()) 423 | -------------------------------------------------------------------------------- /utils/predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/predictor/__init__.py -------------------------------------------------------------------------------- 
def predict(model_file, data_source, data_target, truncate=False):
    """
    Run a model over a data source and write (id, prediction) pairs out.

    :param model_file: path to the model yaml config
    :param data_source: a .tsv/.csv path, or a "schema.table" name
    :param data_target: output .tsv/.csv path or destination table
    :param truncate: when writing to a table, truncate it first
    """
    model = model_from_yaml(model_file)
    extension = data_source.split('.')[-1]
    if extension in ['tsv', 'csv']:
        rows = get_rows_from_tsv(data_source)
        # Fix: csv.writer requires a single-character delimiter; this
        # previously passed the literal string 'csv' for comma output,
        # which raises TypeError on the first write.
        delimiter = '\t' if extension == 'tsv' else ','
        # Touch/empty the output file without leaking the handle.
        with open(data_target, 'w+'):
            pass

        def write_fn(buffer):
            # Close the handle after each flush instead of leaking it.
            with open(data_target, 'a') as file:
                writer = csv.writer(file, delimiter=delimiter)
                for row in buffer:
                    writer.writerow(row)

    else:
        p = Conn()
        query_fn = p.query
        schema, table = data_source.split('.')
        if truncate:
            p.execute(f'''truncate table {data_target}''')
        rows = query_fn(select_all(schema, table, random=False))

        def write_fn(buffer):
            p.insert(data_target, buffer)
            p.commit()

    buffer = []
    i = 0
    start = time.time()
    for step, batch in enumerate(batch_list(rows, BATCH_SIZE)):
        # Each batch row is (*model_inputs, id); split inputs from ids.
        transposed = tuple(zip(*batch))
        inputs = transposed[:-1]
        ids = transposed[-1]
        buffer.extend(zip(ids, model.run(*inputs)))
        i += BATCH_SIZE
        if i > 500:
            total = step * BATCH_SIZE
            print('dumping example {}, rate: {} per second'.format(total, total / (time.time() - start)))
            write_fn(buffer)
            buffer = []
            i = 0

    if len(buffer) > 0:
        write_fn(buffer)
predict(model_file, data_source, data_target, truncate=truncate) -------------------------------------------------------------------------------- /utils/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from koursaros.modeling import model_from_yaml 2 | 3 | def train(file): 4 | model = model_from_yaml(file, training=True) 5 | model.train() -------------------------------------------------------------------------------- /utils/trainer/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from . import train 3 | 4 | if __name__ == '__main__': 5 | filename = sys.argv[1] 6 | train(filename) -------------------------------------------------------------------------------- /utils/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils/bucket/__init__.py: -------------------------------------------------------------------------------- 1 | from google.cloud import storage 2 | import tarfile 3 | import os 4 | from pathlib import Path 5 | 6 | def download_blob(bucket_name, source_blob_name, destination_file_name): 7 | """Downloads a blob from the bucket.""" 8 | storage_client = storage.Client() 9 | bucket = storage_client.get_bucket(bucket_name) 10 | blob = bucket.blob(source_blob_name) 11 | 12 | blob.download_to_filename(destination_file_name) 13 | 14 | print('Blob {} downloaded to {}.'.format( 15 | source_blob_name, 16 | destination_file_name)) 17 | 18 | def bucket_contains(filename): 19 | storage_client = storage.Client() 20 | blobs = storage_client.list_blobs("poloma-models") 21 | for blob in blobs: 22 | if blob == filename: return True 23 | return False 24 | 25 | def download_and_unzip(bucket_name, 
# -------- /utils/utils/cuda/apex.sh --------
# #!/usr/bin/env bash
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex

# -------- /utils/utils/database/__init__.py --------
from .psql import *

# -------- /utils/utils/database/psql.py --------
from psycopg2 import extensions, extras
import os


def is_nested(nested):
    """Raise ValueError unless every top-level element of *nested* is a
    list or tuple (i.e. *nested* looks like rows of values)."""
    if any(not isinstance(i, (list, tuple)) for i in nested):
        raise ValueError('Hey dumbass - you can only dump nested lists/tuples.')


class Conn(extensions.connection):
    """psycopg2 connection with defaults pulled from the environment
    (PGHOST / PGUSER / PGPASS / PGDBNAME, plus optional PGSSLMODE and
    PGSSLROOTCERT) and small convenience query helpers."""

    def __init__(self, host=None, user=None, password=None, dbname=None,
                 sslmode=None, cert_path=None):
        # psycopg2/libpq reads these env vars when opening the connection
        if sslmode:
            os.environ['PGSSLMODE'] = sslmode
        if cert_path:
            os.environ['PGSSLROOTCERT'] = cert_path
        if not host:
            host = os.environ.get('PGHOST')
        if not user:
            user = os.environ.get('PGUSER')
        if not password:
            password = os.environ.get('PGPASS')
        if not dbname:
            dbname = os.environ.get('PGDBNAME')
        dsn = f"dbname='{dbname}' user='{user}' host='{host}' password='{password}'"
        super(Conn, self).__init__(dsn=dsn)

    def _set_columns(self, cur):
        # remember the column names of the last executed query
        self.columns = [desc.name for desc in cur.description]

    def execute(self, query):
        """Execute *query*, discarding any result (caller must commit)."""
        cur = self.cursor()
        cur.execute(query)

    def iter_rows(self, query):
        """Execute *query* and return the live cursor for row iteration."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        return cur

    def iter_chunk(self, query, chunksize):
        """Execute *query* and yield lists of up to *chunksize* rows."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        chunk = cur.fetchmany(chunksize)
        while chunk:
            yield chunk
            chunk = cur.fetchmany(chunksize)

    def query(self, query):
        """Execute *query* and return all rows."""
        cur = self.cursor()
        cur.execute(query)
        fetched = cur.fetchall()
        self._set_columns(cur)
        return fetched

    def insert(self, table, nested):
        """Bulk-insert *nested* (an iterable of row tuples) into *table*.

        NOTE(review): *table* is interpolated as an identifier and cannot be
        parameterized — it must come from trusted code, never user input.
        """
        is_nested(nested)
        cur = self.cursor()
        template = f'INSERT INTO {table} VALUES %s'
        extras.execute_values(cur, template, nested)

    def table_exists(self, schema, table):
        """Return True iff *schema*.*table* exists."""
        # SECURITY FIX: values are now bound as parameters instead of being
        # interpolated into the SQL string (f-string), closing an injection hole.
        query = '''
            SELECT EXISTS (
                SELECT
                FROM information_schema.tables
                WHERE table_schema = %s
                AND table_name = %s
            );
        '''
        cur = self.cursor()
        cur.execute(query, (schema, table))
        return cur.fetchone()[0]

    def database_exists(self, database):
        """Return True iff a database named *database* exists."""
        # SECURITY FIX: parameterized (was f-string interpolation)
        query = '''
            SELECT EXISTS (
                SELECT
                FROM pg_database
                WHERE datname = %s
            )
        '''
        cur = self.cursor()
        cur.execute(query, (database,))
        return cur.fetchone()[0]

    def create_database(self, database):
        """Create *database* by running ``createdb`` ON THE SERVER HOST via
        COPY ... TO PROGRAM (requires superuser / pg_execute_server_program).

        NOTE(review): the database name is shell-interpolated and cannot be
        parameterized here — only call with trusted, validated names.
        """
        query = f'''
        COPY (SELECT 1) TO PROGRAM 'createdb {database}';
        '''
        cur = self.cursor()
        cur.execute(query)
# -------- /utils/utils/misc/__init__.py --------

import os

# ANSI escape template: BOLD.format(text) renders text in bold on a terminal
BOLD = '\033[1m{}\033[0m'


def gb_free_space():
    """Return the free space, in gigabytes, of the filesystem holding the cwd."""
    statvfs = os.statvfs(os.getcwd())
    return statvfs.f_frsize * statvfs.f_bfree / 1e+9  # actual number of free bytes


def batch_fn(batch_size, call_fn, items):
    """Yield ``(call_fn(chunk), chunk)`` for successive chunks of *items* of
    size *batch_size*; the final chunk may be shorter."""
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:
            yield call_fn(buffer), buffer
            buffer = []
    if buffer:  # flush the short trailing chunk, if any
        yield call_fn(buffer), buffer


def batch_list(arr, n):
    """Yield successive chunks of *arr* of length *n*; the final chunk may be
    shorter."""
    buffer = []
    for item in arr:
        buffer.append(item)
        if len(buffer) == n:
            yield buffer
            buffer = []
    if buffer:  # flush the short trailing chunk, if any
        yield buffer

# -------- /utils/utils/misc/tree.sh --------
# #!/usr/bin/env bash
# ls -R | grep ":$" | sed -e 's/:$//' -e 's/[^-][^\/]*\//--/g' -e 's/^/ /' -e 's/-/|/'

# -------- /utils/yamls.py --------
from hashlib import md5
from enum import Enum


class YamlType(Enum):
    """The three recognized top-level yaml document kinds."""
    BASE = 0
    PIPELINE = 1
    SERVICE = 2


def Yaml(path):
    """
    Pseudo-class for managing a yaml file as a python object (a Box whose
    attributes are the yaml's contents, tagged with path/text/type/hash).

    :param path: path to .yaml file
    :raises ValueError: if no YamlType key is present at the yaml's top level
    """
    # imported here so this module stays importable when yaml/box are absent
    from yaml import safe_load
    from box import Box

    # BUG FIX: the original `open(path).read()` never closed the file handle
    with open(path) as f:
        __text__ = f.read()
    parsed = safe_load(__text__)

    __type__ = None
    for yaml_type in YamlType:
        # NOTE(review): no break — if several type keys are present the
        # LAST matching YamlType wins, as in the original
        if yaml_type.name.lower() in parsed:
            __type__ = yaml_type

    if __type__ is None:
        raise ValueError('Invalid yaml type for %s' % path)

    box = Box(parsed[__type__.name.lower()])
    box.__path__ = path
    box.__text__ = __text__
    box.__type__ = __type__
    box.hash = md5(__text__.encode()).hexdigest()
    return box