├── .github ├── koursaros.jpg ├── logo.svg └── overview.svg ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── flows ├── .DS_Store ├── factchecking │ ├── index │ │ ├── docker-compose.yml │ │ ├── flow.py │ │ └── helm │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── main.yaml │ │ │ ├── service.yaml │ │ │ └── statefulset.yaml │ │ │ └── values.yaml │ ├── query │ │ ├── flow.py │ │ └── helm │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── main.yaml │ │ │ ├── service.yaml │ │ │ └── statefulset.yaml │ │ │ └── values.yaml │ └── train │ │ └── train-compose.yml └── yc_demo │ ├── .DS_Store │ ├── docker-compose-temp.yml │ ├── docker-compose.yml │ ├── flow.py │ ├── helm │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── main.yaml │ │ ├── service.yaml │ │ └── statefulset.yaml │ └── values.yaml │ ├── index.k │ └── query.k ├── koursaros ├── __init__.py ├── chart │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── main.yaml │ │ ├── service.yaml │ │ └── statefulset.yaml │ └── values.yaml ├── cli │ ├── __init__.py │ ├── __main__.py │ ├── build │ │ └── __init__.py │ ├── deploy │ │ └── __init__.py │ ├── manager.py │ ├── show │ │ └── __init__.py │ ├── test │ │ └── __init__.py │ └── utils.py ├── flow │ └── __init__.py ├── hub │ ├── client │ │ ├── .DS_Store │ │ ├── postgres │ │ │ ├── Dockerfile │ │ │ ├── postgres.py │ │ │ ├── testrerank.yml │ │ │ └── wikititles.yml │ │ └── sheet │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ ├── client.py │ │ │ └── test.csv │ ├── encoder │ │ ├── robertainfer │ │ │ ├── Dockerfile │ │ │ └── dim64.yml │ │ └── textbyte │ │ │ ├── Dockerfile │ │ │ ├── max1024.yml │ │ │ ├── max256.yml │ │ │ └── textbyte.py │ ├── httpclient │ │ └── http │ │ │ └── Dockerfile │ ├── indexer │ │ ├── faisscpu │ │ │ ├── Dockerfile │ │ │ └── base.yml │ │ ├── keyword │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── keyword.py │ │ ├── lvdb │ │ 
│ ├── Dockerfile │ │ │ └── base.yml │ │ ├── rocksdb │ │ │ ├── Dockerfile │ │ │ └── base.yml │ │ ├── simple_dict │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── simple_dict.py │ │ └── whoosh │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── whoosh.py │ ├── preprocessor │ │ ├── sentsplit │ │ │ ├── Dockerfile │ │ │ └── jsonmode.yml │ │ └── unary │ │ │ ├── Dockerfile │ │ │ └── text.yml │ ├── router │ │ ├── block │ │ │ ├── Dockerfile │ │ │ ├── block.py │ │ │ ├── block_query.yml │ │ │ └── block_train.yml │ │ ├── log │ │ │ ├── Dockerfile │ │ │ └── log.py │ │ ├── rerank │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── rerank.py │ │ └── resp_req │ │ │ ├── Dockerfile │ │ │ ├── base.yml │ │ │ └── resp_req.py │ └── tests │ │ ├── reviews_sample.csv │ │ ├── sonnets_small.txt │ │ ├── test_block.py │ │ ├── test_keyword.py │ │ ├── test_reranker.py │ │ ├── test_textbyte_encoder.py │ │ ├── test_whoosh.py │ │ └── yaml │ │ ├── test-joint.yml │ │ ├── test-keyword.yml │ │ ├── test-reranker.yml │ │ └── test-whoosh.yml └── repo_creds │ ├── README.md │ └── __init__.py ├── requirements.txt ├── setup.py ├── tutorials ├── deploy_custom_model.md └── fact_check.md └── utils ├── modeling ├── __init__.py ├── data.py ├── migrating.py ├── model.py └── models │ ├── __init__.py │ ├── generative_transformer.py │ └── transformer_model.py ├── predictor ├── __init__.py └── __main__.py ├── trainer ├── __init__.py └── __main__.py ├── utils ├── __init__.py ├── bucket │ └── __init__.py ├── cuda │ ├── __init__.py │ └── apex.sh ├── database │ ├── __init__.py │ └── psql.py └── misc │ ├── __init__.py │ └── tree.sh └── yamls.py /.github/koursaros.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/.github/koursaros.jpg -------------------------------------------------------------------------------- /.github/logo.svg: 
-------------------------------------------------------------------------------- 1 | Asset 1Koursaros -------------------------------------------------------------------------------- /.github/overview.svg: -------------------------------------------------------------------------------- 1 | Asset 21234ServiceStub -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a fact-checking 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | db.sqlite3 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # Jetbrains 106 | .idea 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 koursaros-ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include koursaros * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Koursaros 2 | 3 |
4 | 5 |

6 | 7 | PyPI - License 8 | 9 |

10 | 11 |

12 | Blog • 13 | Highlights • 14 | Overview • 15 | Install • 16 | Getting Started • 17 | Documentation • 18 | Tutorials • 19 | Contributing 20 |

21 | 22 | Koursaros is a distributed cloud platform for developing and deploying neural search and inference applications. 23 | 24 | Koursaros leverages a general-purpose microservice architecture to enable low-latency, scalable deep neural network training and can be directly deployed to kubernetes for production. 25 | 26 | ## Description 27 | This is page is a work in progress. 28 | 29 | ## Results 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 41 | 42 | 43 | 44 |
BenchmarkLabel AccuracyPaperModels
fever.ai 40 | 0.7396 (2nd)An Automated Fact Checker in Era of Fake Newscoming soon
45 | 46 | ## Install 47 | ### Requirements 48 | You need Python 3.6 or later to run Koursaros. 49 | 50 | ### Stable Version 51 | #### Installing via pip 52 | We recommend installing Koursaros via pip: 53 | ``` 54 | pip3 install koursaros 55 | ``` 56 | Installation will use Python wheels from PyPI, available for OSX, Linux, and Windows. 57 | 58 | ### Latest Version 59 | ### Installing via pip-git 60 | You can install the latest version from Git: 61 | ``` 62 | pip3 install git+https://git@github.com/koursaros-ai/koursaros.git 63 | ``` 64 | 65 | ## Getting Started 66 | ### Creating a pipeline 67 | ``` 68 | kctl deploy app 69 | ``` 70 | 71 | 72 | ## Tutorials 73 | - Use Koursaros to get SoTA results in dev environment on the fever.ai benchmark using pretrained models. 74 | - Training custom models and deploying them as stubs 75 | - Training Elastic Search BM25 algorithm using Ax Bayesian Optimizer (coming soon) 76 | - Deploying fever.ai pipeline to production (Coming Soon) 77 | -------------------------------------------------------------------------------- /flows/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/.DS_Store -------------------------------------------------------------------------------- /flows/factchecking/index/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | Frontend0: 4 | image: gnes/gnes:latest-alpine 5 | command: frontend --port_in 61973 --port_out 54596 --port_ctrl 57120 --parallel_backend 6 | process 7 | ports: 8 | - 8800:8800 9 | sentsplit: 10 | image: hub-preprocessor:latest-sentsplit 11 | command: --port_in 54596 --port_out 60639 --socket_in PULL_CONNECT --socket_out 12 | PUB_BIND --port_ctrl 56881 --parallel_backend process --num_parallel 2 --yaml_path 13 | jsonmode.yml 14 | deploy: 15 | replicas: 2 16 | 
textbyte: 17 | image: hub-encoder:latest-textbyte 18 | command: --port_in 60639 --port_out 58737 --socket_in SUB_CONNECT --port_ctrl 19 | 54010 --parallel_backend process --num_parallel 2 --yaml_path max256.yml 20 | deploy: 21 | replicas: 2 22 | keyword: 23 | image: hub-indexer:latest-keyword 24 | command: --port_in 58737 --port_out 61340 --socket_in PULL_CONNECT --socket_out 25 | PUSH_CONNECT --port_ctrl 64855 --parallel_backend process --num_parallel 2 --yaml_path 26 | base.yml 27 | deploy: 28 | replicas: 2 29 | lvdb: 30 | image: hub-indexer:latest-lvdb 31 | command: --port_in 60639 --port_out 61340 --socket_in SUB_CONNECT --socket_out 32 | PUSH_CONNECT --port_ctrl 54746 --parallel_backend process --num_parallel 2 --yaml_path 33 | base.yml 34 | deploy: 35 | replicas: 2 36 | basereducerouter: 37 | image: gnes/gnes:latest-alpine 38 | command: route --port_in 61340 --port_out 61973 --socket_out PUSH_CONNECT --port_ctrl 39 | 57894 --parallel_backend process --yaml_path BaseReduceRouter --num_part 2 -------------------------------------------------------------------------------- /flows/factchecking/index/flow.py: -------------------------------------------------------------------------------- 1 | from koursaros.gnes_addons import Flow 2 | 3 | flow = ( 4 | Flow(check_version=True) 5 | .add_client(name='postgres', yaml_path='wikititles.yml') 6 | .add_preprocessor(name='sentsplit', replicas=2, storage='1Gi', yaml_path='jsonmode.yml') 7 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2, yaml_path='max256.yml') 8 | .add_indexer(name='keyword', replicas=2, yaml_path='base.yml') 9 | .add_indexer(name='lvdb', recv_from='sentsplit', replicas=2, yaml_path='base.yml') 10 | .add_router(name='basereducerouter', num_part=2, recv_from=['keyword', 'lvdb'], yaml_path='BaseReduceRouter') 11 | ) 12 | 13 | # checkout how the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS) 14 | # funny! 
15 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Deployed flow! -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessor .encoder .indexer .router }} 4 | {{- range . 
}} 5 | --- 6 | {{ include "statefulset" .}} 7 | --- 8 | {{ include "service" .}} 9 | {{ end }} 10 | {{ end }} 11 | {{ end }} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "service" -}} 3 | {{- $name := printf "%s-%s" .app .model -}} 4 | apiVersion: v1 5 | kind: Service 6 | metadata: 7 | name: {{ $name }} 8 | spec: 9 | selector: 10 | app: {{ $name }} 11 | clusterIP: None 12 | ports: 13 | {{- if .port_in }} 14 | - name: in 15 | port: {{ .port_in }} 16 | protocol: TCP 17 | {{- end }} 18 | {{- if .port_out }} 19 | - name: out 20 | port: {{ .port_out }} 21 | protocol: TCP 22 | {{- end -}} 23 | {{- if .grpc_port }} 24 | - name: grpc 25 | port: {{ .grpc_port }} 26 | protocol: TCP 27 | {{- end -}} 28 | {{- if .ctrl_port }} 29 | - name: ctrl 30 | port: {{ .ctrl_port }} 31 | protocol: TCP 32 | {{- end -}} 33 | {{ if .load_balancer }} 34 | type: LoadBalancer 35 | {{ end }} 36 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "statefulset" -}} 3 | {{- $name := printf "%s-%s" .app .model -}} 4 | apiVersion: apps/v1 5 | kind: StatefulSet 6 | metadata: 7 | name: {{ $name }} 8 | spec: 9 | replicas: {{ .replicas }} 10 | selector: 11 | matchLabels: 12 | app: {{ $name }} 13 | volumeClaimTemplates: 14 | - metadata: 15 | name: {{ $name }} 16 | spec: 17 | accessModes: 18 | - ReadWriteOnce 19 | {{- if .storage }} 20 | resources: 21 | requests: 22 | storage: {{ .storage }} 23 | {{- end }} 24 | template: 25 | metadata: 26 | labels: 27 | app: {{ $name }} 28 | spec: 29 | containers: 30 | - name: {{ $name }} 31 | image: {{ .image }} 32 | args: 33 | {{- range .command }} 34 | - {{ . 
| quote }} 35 | {{- end }} 36 | imagePullPolicy: null 37 | ports: 38 | {{- if .port_in }} 39 | - name: in 40 | containerPort: {{ .port_in }} 41 | protocol: TCP 42 | {{- end }} 43 | {{- if .port_out }} 44 | - name: out 45 | containerPort: {{ .port_out }} 46 | protocol: TCP 47 | {{- end }} 48 | {{- if .grpc_port }} 49 | - name: grpc 50 | containerPort: {{ .grpc_port }} 51 | protocol: TCP 52 | {{- end }} 53 | {{- if .grpc_port }} 54 | - name: ctrl 55 | containerPort: {{ .port_ctrl }} 56 | protocol: TCP 57 | {{- end }} 58 | resources: 59 | requests: 60 | {{- if .cpu }} 61 | cpu: {{ .cpu }} 62 | {{- end }} 63 | {{- if .memory }} 64 | memory: {{ .memory }} 65 | {{- end }} 66 | 67 | {{- if .resources -}} 68 | {{- toYaml .resources | nindent 8 -}} 69 | {{- end -}} 70 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/index/helm/values.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | frontend: 3 | - name: Frontend0 4 | app: frontend 5 | model: base 6 | port_in: 61973 7 | port_out: 54596 8 | ctrl_port: 9 | grpc_port: 8800 10 | command: 11 | - frontend 12 | - --port_in 13 | - '61973' 14 | - --port_out 15 | - '54596' 16 | - --port_ctrl 17 | - '57120' 18 | - --parallel_backend 19 | - process 20 | replicas: 1 21 | storage: 500Mi 22 | memory: 500Mi 23 | cpu: 300m 24 | image: gnes/gnes:latest-alpine 25 | preprocessor: 26 | - name: sentsplit 27 | app: preprocessor 28 | model: sentsplit 29 | port_in: 54596 30 | port_out: 60639 31 | ctrl_port: 32 | grpc_port: 33 | command: 34 | - --port_in 35 | - '54596' 36 | - --port_out 37 | - '60639' 38 | - --socket_in 39 | - PULL_CONNECT 40 | - --socket_out 41 | - PUB_BIND 42 | - --port_ctrl 43 | - '56881' 44 | - --parallel_backend 45 | - process 46 | - --num_parallel 47 | - '2' 48 | - --yaml_path 49 | - jsonmode.yml 50 | replicas: 2 51 | storage: 1Gi 52 | memory: 1Gi 53 | cpu: 1Gi 54 | image: hub-preprocessor:latest-sentsplit 
55 | encoder: 56 | - name: textbyte 57 | app: encoder 58 | model: textbyte 59 | port_in: 60639 60 | port_out: 58737 61 | ctrl_port: 62 | grpc_port: 63 | command: 64 | - --port_in 65 | - '60639' 66 | - --port_out 67 | - '58737' 68 | - --socket_in 69 | - SUB_CONNECT 70 | - --port_ctrl 71 | - '54010' 72 | - --parallel_backend 73 | - process 74 | - --num_parallel 75 | - '2' 76 | - --yaml_path 77 | - max256.yml 78 | replicas: 2 79 | storage: 500Mi 80 | memory: 500Mi 81 | cpu: 300m 82 | image: hub-encoder:latest-textbyte 83 | indexer: 84 | - name: keyword 85 | app: indexer 86 | model: keyword 87 | port_in: 58737 88 | port_out: 61340 89 | ctrl_port: 90 | grpc_port: 91 | command: 92 | - --port_in 93 | - '58737' 94 | - --port_out 95 | - '61340' 96 | - --socket_in 97 | - PULL_CONNECT 98 | - --socket_out 99 | - PUSH_CONNECT 100 | - --port_ctrl 101 | - '64855' 102 | - --parallel_backend 103 | - process 104 | - --num_parallel 105 | - '2' 106 | - --yaml_path 107 | - base.yml 108 | replicas: 2 109 | storage: 500Mi 110 | memory: 500Mi 111 | cpu: 300m 112 | image: hub-indexer:latest-keyword 113 | - name: lvdb 114 | app: indexer 115 | model: lvdb 116 | port_in: 60639 117 | port_out: 61340 118 | ctrl_port: 119 | grpc_port: 120 | command: 121 | - --port_in 122 | - '60639' 123 | - --port_out 124 | - '61340' 125 | - --socket_in 126 | - SUB_CONNECT 127 | - --socket_out 128 | - PUSH_CONNECT 129 | - --port_ctrl 130 | - '54746' 131 | - --parallel_backend 132 | - process 133 | - --num_parallel 134 | - '2' 135 | - --yaml_path 136 | - base.yml 137 | replicas: 2 138 | storage: 500Mi 139 | memory: 500Mi 140 | cpu: 300m 141 | image: hub-indexer:latest-lvdb 142 | router: 143 | - name: basereducerouter 144 | app: router 145 | model: basereducerouter 146 | port_in: 61340 147 | port_out: 61973 148 | ctrl_port: 149 | grpc_port: 150 | command: 151 | - route 152 | - --port_in 153 | - '61340' 154 | - --port_out 155 | - '61973' 156 | - --socket_out 157 | - PUSH_CONNECT 158 | - --port_ctrl 159 | - '57894' 
160 | - --parallel_backend 161 | - process 162 | - --yaml_path 163 | - BaseReduceRouter 164 | - --num_part 165 | - '2' 166 | replicas: 1 167 | storage: 500Mi 168 | memory: 500Mi 169 | cpu: 300m 170 | image: gnes/gnes:latest-alpine -------------------------------------------------------------------------------- /flows/factchecking/query/flow.py: -------------------------------------------------------------------------------- 1 | from koursaros.gnes_addons import Flow 2 | 3 | flow = ( 4 | Flow(check_version=True) 5 | .add_client(name='postgres', yaml_path='clients/postgres/wikititles.yml') 6 | .add_preprocessor(name='sentsplit', replicas=2, 7 | yaml_path='services/preprocessors/sentsplit/jsonmode.yml') 8 | .add_encoder(name='textbyte', recv_from='sentsplit', replicas=2, 9 | yaml_path='services/encoders/textbyte/max256.yml') 10 | .add_indexer(name='keyword', replicas=2, 11 | yaml_path='services/indexers/keyword/base.yml') 12 | .add_indexer(name='lvdb', replicas=2, yaml_path='services/indexers/lvdb/base.yml') 13 | .add_encoder(name='robertainfer', replicas=2, 14 | yaml_path='services/encoders/robertainfer/dim64.yml') 15 | .add_router(name='reduce', num_part=2, yaml_path='BaseReduceRouter') 16 | ) 17 | 18 | 19 | # checkout how the flow looks like (...and post it on Twitter, but hey what do I know about promoting OSS) 20 | # funny! 21 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | raise NotImplementedError 2 | -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessors .encoders .indexers .routers }} 4 | {{- range . 
}} 5 | --- 6 | {{ include "statefulset" .}} 7 | --- 8 | {{ include "service" .}} 9 | {{ end }} 10 | {{ end }} 11 | {{ end }} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "service" -}} 3 | apiVersion: v1 4 | kind: Service 5 | spec: 6 | selector: 7 | app: {{ .name }} 8 | clusterIP: None 9 | ports: 10 | {{- if .port_in }} 11 | - name: in 12 | port: {{ .port_in }} 13 | protocol: TCP 14 | {{- end }} 15 | {{- if .port_out }} 16 | - name: out 17 | port: {{ .port_out }} 18 | protocol: TCP 19 | {{- end -}} 20 | {{- if .grpc_port }} 21 | - name: grpc 22 | port: {{ .grpc_port }} 23 | protocol: TCP 24 | {{- end -}} 25 | {{- if .grpc_port }} 26 | - name: ctrl 27 | port: {{ .port_ctrl }} 28 | protocol: TCP 29 | {{- end -}} 30 | {{ if .load_balancer }} 31 | type: LoadBalancer 32 | {{ end }} 33 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- define "statefulset" -}} 3 | apiVersion: apps/v1 4 | kind: StatefulSet 5 | spec: 6 | replicas: {{ .replicas }} 7 | selector: 8 | matchLabels: 9 | app: {{ .name }} 10 | volumeClaimTemplates: 11 | accessModes: 'ReadWriteOnce' 12 | resources: 13 | requests: 14 | storage: {{ .storage }} 15 | template: 16 | spec: 17 | containers: 18 | - name: {{ .name }} 19 | image: {{ .image }} 20 | args: {{ .command }} 21 | imagePullPolicy: null 22 | ports: 23 | {{- if .port_in }} 24 | - name: in 25 | containerPort: {{ .port_in }} 26 | protocol: TCP 27 | {{- end }} 28 | {{- if .port_out }} 29 | - name: out 30 | containerPort: {{ .port_out }} 31 | protocol: TCP 32 | {{- end -}} 33 | {{- if .grpc_port }} 34 | - name: grpc 35 | containerPort: {{ .grpc_port }} 36 | protocol: TCP 37 | 
{{- end -}} 38 | {{- if .grpc_port }} 39 | - name: ctrl 40 | containerPort: {{ .port_ctrl }} 41 | protocol: TCP 42 | {{- end -}} 43 | resources: 44 | requests: 45 | cpu: {{ .cpu }} 46 | memory: {{ .memory }} 47 | 48 | {{- if .resources -}} 49 | {{- toYaml .resources | nindent 8 -}} 50 | {{- end -}} 51 | {{- end -}} -------------------------------------------------------------------------------- /flows/factchecking/query/helm/values.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | frontends: 3 | - name: Frontend0 4 | port_in: 63152 5 | port_out: 49972 6 | ctrl_port: 7 | grpc_port: 8800 8 | command: frontend --port_in 63152 --port_out 49972 --port_ctrl 55922 --parallel_backend 9 | process 10 | replicas: 1 11 | storage: 12 | memory: 13 | cpu: 14 | image: gnes-frontend:Frontend0 15 | preprocessors: 16 | - name: sent_split 17 | port_in: 49972 18 | port_out: 53012 19 | ctrl_port: 20 | grpc_port: 21 | command: preprocess --port_in 49972 --port_out 53012 --socket_in PULL_CONNECT 22 | --port_ctrl 54583 --parallel_backend process --yaml_path services/preprocessors/sent_split/json_mode.yml 23 | replicas: 2 24 | storage: 25 | memory: 26 | cpu: 27 | image: gnes-preprocessor:sent_split 28 | encoders: 29 | - name: text_byte 30 | port_in: 53012 31 | port_out: 54139 32 | ctrl_port: 33 | grpc_port: 34 | command: encode --port_in 53012 --port_out 54139 --socket_in PULL_CONNECT --port_ctrl 35 | 51629 --parallel_backend process --yaml_path services/encoders/text_byte/max_256.yml 36 | replicas: 2 37 | storage: 38 | memory: 39 | cpu: 40 | image: gnes-encoder:text_byte 41 | - name: roberta_infer 42 | port_in: 55961 43 | port_out: 52539 44 | ctrl_port: 45 | grpc_port: 46 | command: encode --port_in 55961 --port_out 52539 --socket_in PULL_CONNECT --port_ctrl 47 | 52568 --parallel_backend process --yaml_path services/encoders/roberta_infer/dim_64.yml 48 | replicas: 2 49 | storage: 50 | memory: 51 | cpu: 52 | image: 
gnes-encoder:roberta_infer 53 | indexers: 54 | - name: keyword 55 | port_in: 54139 56 | port_out: 60943 57 | ctrl_port: 58 | grpc_port: 59 | command: index --port_in 54139 --port_out 60943 --socket_in PULL_CONNECT --port_ctrl 60 | 63670 --parallel_backend process --yaml_path services/indexers/keyword/base.yml 61 | replicas: 2 62 | storage: 63 | memory: 64 | cpu: 65 | image: gnes-indexer:keyword 66 | - name: lvdb 67 | port_in: 60943 68 | port_out: 55961 69 | ctrl_port: 70 | grpc_port: 71 | command: index --port_in 60943 --port_out 55961 --socket_in PULL_CONNECT --port_ctrl 72 | 55890 --parallel_backend process --yaml_path services/indexers/lvdb/base.yml 73 | replicas: 2 74 | storage: 75 | memory: 76 | cpu: 77 | image: gnes-indexer:lvdb 78 | routers: 79 | - name: Reduce 80 | port_in: 52539 81 | port_out: 63152 82 | ctrl_port: 83 | grpc_port: 84 | command: route --port_in 52539 --port_out 63152 --socket_in PULL_CONNECT --socket_out 85 | PUSH_CONNECT --port_ctrl 50250 --parallel_backend process --yaml_path BaseReduceRouter 86 | --num_part 2 87 | replicas: 1 88 | storage: 89 | memory: 90 | cpu: 91 | image: gnes-router:Reduce -------------------------------------------------------------------------------- /flows/factchecking/train/train-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | Frontend00: 4 | image: gnes/gnes:latest-alpine 5 | command: frontend --grpc_port 5566 --port_out 62401 --socket_out PUSH_BIND --port_in 6 | 60753 --socket_in PULL_BIND --host_in Encoder20 --host_out Preprocessor10 7 | ports: 8 | - 5566:5566 9 | Preprocessor10: 10 | image: services/preprocessors:word-split-preprocessor 11 | command: preprocess --port_in 62401 --socket_in PULL_CONNECT --port_out 54470 12 | --socket_out PUSH_CONNECT --yaml_path SentSplitPreprocessor 13 | --host_in Frontend00 --host_out Encoder20 14 | deploy: 15 | replicas: 3 16 | restart_policy: 17 | condition: on-failure 18 | max_attempts: 3 19 
| Encoder20: 20 | image: services/encoders:siamese-bert 21 | command: --port_in 54470 --socket_in PULL_BIND --port_out 60753 --socket_out PUSH_CONNECT 22 | --host_out Frontend00 --host_in Preprocessor10 -------------------------------------------------------------------------------- /flows/yc_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/flows/yc_demo/.DS_Store -------------------------------------------------------------------------------- /flows/yc_demo/docker-compose-temp.yml: -------------------------------------------------------------------------------- 1 | services: 2 | block4: 3 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path block_train.yml 4 | --host_in router3 --port_in 58842 --port_out 55503 5 | ports: [55503:55503] 6 | frontend2: 7 | command: --socket_in PULL_BIND --socket_out PUSH_BIND --port_in 64750 --port_out 8 | 56531 9 | ports: [64750:64750, 56531:56531] 10 | http1: 11 | command: --socket_in RPC_BIND --socket_out RPC_CONNECT --port_in 61501 --host_out 12 | frontend2 --port_out 64750 13 | ports: [61501:61501] 14 | keyword7: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path 15 | base.yml --host_in textbyte6 --port_in 59483 --host_out rerank9 --port_out 64772} 16 | rerank9: {command: --socket_in PULL_CONNECT --socket_out PUSH_CONNECT --yaml_path 17 | base.yml --host_in router8 --port_in 56224 --host_out frontend2 --port_out 64750} 18 | router3: 19 | command: --socket_in PULL_CONNECT --socket_out PUB_BIND --yaml_path BaseRouter 20 | --host_in frontend2 --port_in 56531 --port_out 58842 21 | ports: [58842:58842] 22 | router8: 23 | command: --socket_in SUB_CONNECT --socket_out PUSH_BIND --yaml_path BaseRouter 24 | --host_in router3 --port_in 58842 --port_out 56224 25 | ports: [56224:56224] 26 | textbyte6: 27 | command: --socket_in PULL_CONNECT --socket_out 
PUSH_BIND --yaml_path max1024.yml 28 | --host_in unary5 --port_in 64036 --port_out 59483 29 | ports: [59483:59483] 30 | unary5: 31 | command: doc_type=1 --socket_in PULL_CONNECT --socket_out PUSH_BIND --yaml_path 32 | text.yml --host_in block4 --port_in 55503 --port_out 64036 33 | ports: [64036:64036] 34 | version: 3.4 35 | -------------------------------------------------------------------------------- /flows/yc_demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | http: 4 | image: hub-httpclient:latest-http 5 | command: --grpc_host Frontend0 --start_doc_id 1 6 | ports: 7 | - 80:80 8 | Frontend0: 9 | image: gnes/gnes:latest-alpine 10 | command: frontend --port_in 57105 --port_out 65502 --port_ctrl 55166 --parallel_backend 11 | process 12 | Router0: 13 | image: gnes/gnes:latest-alpine 14 | command: route --port_in 65502 --port_out 58609 --socket_in PULL_CONNECT --socket_out 15 | PUB_BIND --port_ctrl 49407 --parallel_backend process --yaml_path BaseRouter 16 | --host_in Frontend0 17 | block: 18 | image: hub-router:latest-block 19 | command: --port_in 58609 --port_out 53283 --socket_in SUB_CONNECT --port_ctrl 20 | 52423 --parallel_backend process --yaml_path block_train.yml --host_in Router0 21 | unary: 22 | image: hub-preprocessor:latest-unary 23 | command: --port_in 53283 --port_out 51714 --socket_in PULL_CONNECT --port_ctrl 24 | 55377 --parallel_backend process --yaml_path text.yml --host_in block 25 | textbyte: 26 | image: hub-encoder:latest-textbyte 27 | command: --port_in 51714 --port_out 62690 --socket_in PULL_CONNECT --port_ctrl 28 | 57360 --parallel_backend process --yaml_path max1024.yml --host_in unary 29 | # --socket_out PUB_BIND # FOR INDEXING 30 | # whoosh: 31 | # image: hub-indexer:latest-whoosh 32 | # command: --port_in 62690 --port_out 57105 --socket_in SUB_CONNECT --port_ctrl 33 | # 60258 --parallel_backend process --yaml_path base.yml --host_in textbyte 34 | # 
from koursaros.gnes_addons import Flow


# yc demo flow: http client -> frontend -> routers -> preprocessor ->
# encoder -> two chained indexers -> rerank router -> back to frontend.
flow = (
    Flow(with_frontend=False)
    .add_http_client(name='http')
    .add_frontend(copy_flow=False)
    .add_router(yaml_path='BaseRouter')
    .add_router(name='block', yaml_path='block_train.yml')
    .add_preprocessor(name='unary', yaml_path='text.yml', doc_type=1)
    .add_encoder(name='textbyte', yaml_path='max1024.yml')
    .add_indexer(name='whoosh', yaml_path='base.yml')
    .add_indexer(name='simple_dict', yaml_path='base.yml')
    .add_router(yaml_path='BaseRouter', recv_from=['Router0'])
    # Fixed: rerank previously received from 'rocksdb', a service that is not
    # defined anywhere in this flow — the key-value indexer above is named
    # 'simple_dict' (the hand-written docker-compose.yml calls the service
    # 'rocksdb' but builds it from the hub-indexer:latest-simple_dict image).
    .add_router(name='rerank', yaml_path='base.yml', recv_from=['simple_dict', 'Router1'])
)
{{- define "statefulset" -}}
{{- $name := printf "%s-%s" .app .model -}}
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ $name }}
spec:
  replicas: {{ .replicas }}
  selector:
    matchLabels:
      app: {{ $name }}
  volumeClaimTemplates:
  - metadata:
      name: {{ $name }}
    spec:
      accessModes:
      - ReadWriteOnce
      {{- if .storage }}
      resources:
        requests:
          storage: {{ .storage }}
      {{- end }}
  template:
    metadata:
      labels:
        app: {{ $name }}
    spec:
      containers:
      - name: {{ $name }}
        image: {{ .image }}
        args:
        {{- range .command }}
        - {{ . | quote }}
        {{- end }}
        imagePullPolicy: null
        ports:
        {{- if .port_in }}
        - name: in
          containerPort: {{ .port_in }}
          protocol: TCP
        {{- end }}
        {{- if .port_out }}
        - name: out
          containerPort: {{ .port_out }}
          protocol: TCP
        {{- end }}
        {{- if .grpc_port }}
        - name: grpc
          containerPort: {{ .grpc_port }}
          protocol: TCP
        {{- end }}
        {{- /* fixed: guard was `.grpc_port` (copy-paste from the grpc block
               above), so the ctrl port was only emitted when a grpc port was
               set. NOTE(review): values.yaml spells this key `ctrl_port`
               while the container reads `.port_ctrl` — confirm which key is
               canonical. */}}
        {{- if .port_ctrl }}
        - name: ctrl
          containerPort: {{ .port_ctrl }}
          protocol: TCP
        {{- end }}
        resources:
          requests:
            {{- if .cpu }}
            cpu: {{ .cpu }}
            {{- end }}
            {{- if .memory }}
            memory: {{ .memory }}
            {{- end }}
{{- if .resources -}}
{{- toYaml .resources | nindent 8 -}}
{{- end -}}
{{- end -}}
| - '49407' 59 | - --parallel_backend 60 | - process 61 | - --yaml_path 62 | - BaseRouter 63 | replicas: 1 64 | storage: 500Mi 65 | memory: 500Mi 66 | cpu: 300m 67 | image: gnes/gnes:latest-alpine 68 | - name: block 69 | app: router 70 | model: block 71 | port_in: 58609 72 | port_out: 53283 73 | ctrl_port: 74 | grpc_port: 75 | command: 76 | - --port_in 77 | - '58609' 78 | - --port_out 79 | - '53283' 80 | - --socket_in 81 | - SUB_CONNECT 82 | - --port_ctrl 83 | - '52423' 84 | - --parallel_backend 85 | - process 86 | - --yaml_path 87 | - block_train.yml 88 | replicas: 1 89 | storage: 500Mi 90 | memory: 500Mi 91 | cpu: 300m 92 | image: hub-router:latest-block 93 | - name: Router1 94 | app: router 95 | model: base 96 | port_in: 58609 97 | port_out: 62155 98 | ctrl_port: 99 | grpc_port: 100 | command: 101 | - route 102 | - --port_in 103 | - '58609' 104 | - --port_out 105 | - '62155' 106 | - --socket_in 107 | - SUB_CONNECT 108 | - --socket_out 109 | - PUSH_CONNECT 110 | - --port_ctrl 111 | - '50381' 112 | - --parallel_backend 113 | - process 114 | - --yaml_path 115 | - BaseRouter 116 | replicas: 1 117 | storage: 500Mi 118 | memory: 500Mi 119 | cpu: 300m 120 | image: gnes/gnes:latest-alpine 121 | - name: rerank 122 | app: router 123 | model: rerank 124 | port_in: 62155 125 | port_out: 57105 126 | ctrl_port: 127 | grpc_port: 128 | command: 129 | - --port_in 130 | - '62155' 131 | - --port_out 132 | - '57105' 133 | - --socket_out 134 | - PUSH_CONNECT 135 | - --port_ctrl 136 | - '56641' 137 | - --parallel_backend 138 | - process 139 | - --yaml_path 140 | - base.yml 141 | replicas: 1 142 | storage: 500Mi 143 | memory: 500Mi 144 | cpu: 300m 145 | image: hub-router:latest-rerank 146 | preprocessor: 147 | - name: unary 148 | app: preprocessor 149 | model: unary 150 | port_in: 53283 151 | port_out: 51714 152 | ctrl_port: 153 | grpc_port: 154 | command: 155 | - --port_in 156 | - '53283' 157 | - --port_out 158 | - '51714' 159 | - --socket_in 160 | - PULL_CONNECT 161 | - --port_ctrl 
162 | - '55377' 163 | - --parallel_backend 164 | - process 165 | - --yaml_path 166 | - text.yml 167 | replicas: 1 168 | storage: 500Mi 169 | memory: 500Mi 170 | cpu: 300m 171 | image: hub-preprocessor:latest-unary 172 | encoder: 173 | - name: textbyte 174 | app: encoder 175 | model: textbyte 176 | port_in: 51714 177 | port_out: 62690 178 | ctrl_port: 179 | grpc_port: 180 | command: 181 | - --port_in 182 | - '51714' 183 | - --port_out 184 | - '62690' 185 | - --socket_in 186 | - PULL_CONNECT 187 | - --port_ctrl 188 | - '57360' 189 | - --parallel_backend 190 | - process 191 | - --yaml_path 192 | - max1024.yml 193 | replicas: 1 194 | storage: 500Mi 195 | memory: 500Mi 196 | cpu: 300m 197 | image: hub-encoder:latest-textbyte 198 | indexer: 199 | - name: keyword 200 | app: indexer 201 | model: keyword 202 | port_in: 62690 203 | port_out: 62155 204 | ctrl_port: 205 | grpc_port: 206 | command: 207 | - --port_in 208 | - '62690' 209 | - --port_out 210 | - '62155' 211 | - --socket_in 212 | - PULL_CONNECT 213 | - --socket_out 214 | - PUSH_CONNECT 215 | - --port_ctrl 216 | - '60258' 217 | - --parallel_backend 218 | - process 219 | - --yaml_path 220 | - base.yml 221 | replicas: 1 222 | storage: 500Mi 223 | memory: 500Mi 224 | cpu: 300m 225 | image: hub-indexer:latest-keyword -------------------------------------------------------------------------------- /flows/yc_demo/index.k: -------------------------------------------------------------------------------- 1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD 2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 | 3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend 4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route 5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: | 6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: | 7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUB: | 8 | 7 | indexer | whoosh | 1 | base.yml | SUB:6 | PUSH:2 | 9 | 8 | indexer | rocksdb | 1 | base.yml | SUB:6 | PUSH:2 | 10 | 
-------------------------------------------------------------------------------- /flows/yc_demo/query.k: -------------------------------------------------------------------------------- 1 | # | APP | MODEL | REPS | YAML_PATH | IN | OUT | CMD 2 | 1 | httpclient | http | 1 | | RPC: | RPC:2 | 3 | 2 | frontend | | 1 | | PULL: | PUSH: | frontend 4 | 3 | router | | 1 | BaseRouter | PULL:2 | PUB: | route 5 | 4 | router | block | 1 | block_train.yml | SUB:3 | PUSH: | 6 | 5 | preprocessor| unary | 1 | text.yml | PULL:4 | PUSH: | 7 | 6 | encoder | textbyte | 1 | max1024.yml | PULL:5 | PUSH: | 8 | 7 | indexer | whoosh | 1 | base.yml | PULL:6 | PUSH: | 9 | 8 | indexer | rocksdb | 1 | base.yml | PULL:7 | PUB:9 | 10 | 9 | router | rerank | 1 | base.yml | SUB: | PUSH:2 | 11 | 10| router | block | 1 | block_query.yml | SUB:3 | PUB:9 | 12 | -------------------------------------------------------------------------------- /koursaros/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/__init__.py -------------------------------------------------------------------------------- /koursaros/chart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /koursaros/chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for GNES 4 | name: gnes 5 | version: 0.1.0 6 | -------------------------------------------------------------------------------- /koursaros/chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Deployed flow! -------------------------------------------------------------------------------- /koursaros/chart/templates/main.yaml: -------------------------------------------------------------------------------- 1 | 2 | {{- with .Values.services }} 3 | {{- range list .frontend .preprocessor .encoder .indexer .router }} 4 | {{- range . 
{{- define "statefulset" -}}
{{- $name := printf "%s-%s" .app .model -}}
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ $name }}
spec:
  replicas: {{ .replicas }}
  selector:
    matchLabels:
      app: {{ $name }}
  volumeClaimTemplates:
  - metadata:
      name: {{ $name }}
    spec:
      accessModes:
      - ReadWriteOnce
      {{- if .storage }}
      resources:
        requests:
          storage: {{ .storage }}
      {{- end }}
  template:
    metadata:
      labels:
        app: {{ $name }}
    spec:
      containers:
      - name: {{ $name }}
        image: {{ .image }}
        args:
        {{- range .command }}
        - {{ . | quote }}
        {{- end }}
        imagePullPolicy: null
        ports:
        {{- if .port_in }}
        - name: in
          containerPort: {{ .port_in }}
          protocol: TCP
        {{- end }}
        {{- if .port_out }}
        - name: out
          containerPort: {{ .port_out }}
          protocol: TCP
        {{- end }}
        {{- if .grpc_port }}
        - name: grpc
          containerPort: {{ .grpc_port }}
          protocol: TCP
        {{- end }}
        {{- /* fixed: guard was `.grpc_port` (copy-paste from the grpc block
               above), so the ctrl port was only emitted when a grpc port was
               set. NOTE(review): values.yaml spells this key `ctrl_port`
               while the container reads `.port_ctrl` — confirm which key is
               canonical. */}}
        {{- if .port_ctrl }}
        - name: ctrl
          containerPort: {{ .port_ctrl }}
          protocol: TCP
        {{- end }}
        resources:
          requests:
            {{- if .cpu }}
            cpu: {{ .cpu }}
            {{- end }}
            {{- if .memory }}
            memory: {{ .memory }}
            {{- end }}
{{- if .resources -}}
{{- toYaml .resources | nindent 8 -}}
{{- end -}}
{{- end -}}
from .manager import AppManager
from .deploy import deploy
from .test import test
from .show import show
from .build import build
import click


@click.group()
@click.pass_context
def kctl(ctx):
    """
    kctl controls the \033[1;3;4;34mKoursaros\033[0m platform.
    Find more information at: https://github.com/koursaros-ai/koursaros
    """
    # Shared application state, handed to every subcommand via @click.pass_obj.
    ctx.obj = AppManager()


# Register all top-level subcommands on the root group.
for _command in (deploy, test, show, build):
    kctl.add_command(_command)


def main():
    # Use the package name (rather than sys.argv[0]) as the program name
    # shown in --help output.
    kctl(prog_name=__package__)


if __name__ == "__main__":
    main()
""" 19 | 20 | if push: 21 | if creds is None: 22 | raise ValueError('--creds repository must be specified if pushing') 23 | 24 | hub_creds = get_creds(creds).dockerhub 25 | app_manager.call('docker login -u %s -p %s' % ( 26 | hub_creds.username, hub_creds.password), shell=True) 27 | 28 | # app_manager.call('eval $(minikube docker-env)', shell=True) 29 | 30 | _flow = app_manager.get_flow(flow_path) 31 | 32 | for service in _flow.services.values(): 33 | if '/' not in service['image']: 34 | path = str(app_manager.find_model(service['app'], service['model'])) 35 | tag = service['image'] 36 | app_manager.logger.critical('Building %s from %s...' % (tag, path)) 37 | cache = '--no-cache ' if service.get('name', None) in no_caches else '' 38 | _build = 'docker build ' + cache + '-t %s %s' % (tag, path) 39 | app_manager.call(_build, shell=True) 40 | 41 | if push: 42 | app_manager.logger.critical('Pushing %s...' % tag) 43 | app_manager.call('docker push %s/%s' % (push, tag), shell=True) 44 | 45 | """save swarm yaml""" 46 | _flow.swarm() 47 | # app_manager.logger.critical('Saved swarm yaml to %s' % str(out_path)) 48 | 49 | """save helm chart""" 50 | # out_path = _flow.path.parent.joinpath('helm') 51 | # rmtree(str(out_path), ignore_errors=True) 52 | # copytree(str(app_manager.pkg_root.joinpath('chart')), str(out_path)) 53 | # _flow.path.parent.joinpath('helm/values.yaml').write_text(helm_yaml) 54 | # app_manager.logger.critical('Saved helm chart to %s' % str(out_path)) -------------------------------------------------------------------------------- /koursaros/cli/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import click 3 | from tqdm import tqdm 4 | import time 5 | import importlib.util 6 | 7 | 8 | @click.group() 9 | def deploy(): 10 | """Deploy gnes services.""" 11 | 12 | 13 | @deploy.group() 14 | def flow(): 15 | """Deploy a pipeline with compose or k8s. 
""" 16 | 17 | 18 | deploy.add_command(flow) 19 | 20 | 21 | @flow.command() 22 | @click.argument('flow_path') 23 | @click.pass_obj 24 | def compose(app_manager, flow_path): 25 | path = app_manager.get_flow(flow_path).path.parent.joinpath('docker-compose.yml') 26 | down = 'docker-compose -f %s down' % str(path) 27 | app_manager.call(down, shell=True) 28 | up = 'docker-compose -f %s up' % str(path) 29 | app_manager.call(up, shell=True) 30 | 31 | 32 | @flow.command() 33 | @click.argument('flow_name') 34 | @click.pass_obj 35 | def swarm(app_manager, flow_name): 36 | path = app_manager.get_flow(flow_name).path.parent.joinpath('docker-compose.yml') 37 | rm = 'docker stack rm %s' % flow_name 38 | app_manager.call(rm, shell=True) 39 | app_manager.logger.critical('Waiting for docker network resources...') 40 | [time.sleep(0.15) for _ in tqdm(range(100))] 41 | stack = 'docker stack deploy --compose-file %s %s' % (str(path), flow_name) 42 | app_manager.call(stack, shell=True) 43 | 44 | 45 | @flow.command() 46 | @click.argument('flow_name') 47 | @click.option('-d', '--dryrun', is_flag=True) 48 | @click.pass_obj 49 | def k8s(app_manager, flow_name, dryrun): 50 | path = app_manager.get_flow(flow_name).path.parent.joinpath('helm') 51 | purge = 'helm delete --purge $(helm ls --all --short)' 52 | app_manager.call(purge, shell=True) 53 | install = 'helm install ' + ('--dry-run --debug ' if dryrun else '') + str(path) 54 | app_manager.call(install, shell=True) 55 | 56 | 57 | @deploy.command(context_settings=dict( 58 | ignore_unknown_options=True, 59 | allow_extra_args=True)) 60 | @click.argument('client_name') 61 | @click.pass_context 62 | def client(ctx, client_name): 63 | """Deploy a client. 
""" 64 | app_manager = ctx.obj 65 | path = app_manager.find_model('client', client_name).joinpath('client.py') 66 | if not path.exists(): 67 | raise FileNotFoundError('Could not find %s' % path) 68 | spec = importlib.util.spec_from_file_location(client_name, path) 69 | module = importlib.util.module_from_spec(spec) 70 | spec.loader.exec_module(module) 71 | module.Client(*ctx.args).run() 72 | 73 | 74 | -------------------------------------------------------------------------------- /koursaros/cli/manager.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from gnes.helper import set_logger 4 | from importlib import machinery 5 | from koursaros.flow import Flow 6 | from pathlib import Path 7 | from typing import List 8 | import subprocess 9 | import git 10 | import os 11 | 12 | 13 | class AppManager: 14 | """Manager that keeps track of all of the koursaros 15 | paths and packages. Passed around at runtime to make 16 | things more efficient. 17 | 18 | :param dev: run on local koursaros repo 19 | """ 20 | 21 | def __init__(self): 22 | self.git_root = Path(git.Repo( 23 | '.', search_parent_directories=True).working_tree_dir) 24 | self.pkg_root = Path(__file__).parent.parent 25 | 26 | self.logger = set_logger('kctl') 27 | self.cache = self.git_root.joinpath('.k') 28 | self.cache.mkdir(exist_ok=True) 29 | 30 | def call(self, cmd: List[str], shell=False): 31 | string = cmd if shell else ' '.join(cmd) 32 | self.logger.critical('subprocess.call: "%s"' % string) 33 | subprocess.call(cmd, shell=shell) 34 | 35 | @staticmethod 36 | def check_exists(path: 'Path'): 37 | if not path.exists(): 38 | raise FileNotFoundError(path.absolute()) 39 | 40 | def find_model(self, app: str, model: str) -> 'Path': 41 | path = self.pkg_root.joinpath('hub', app, model) 42 | self.check_exists(path) 43 | return path 44 | 45 | def get_flow(self, path) -> 'Flow': 46 | path = Path(path) 47 | self.check_exists(path) 48 | return Flow(path) 49 | 
import webbrowser
import click


@click.group()
def show():
    """Show gnes architecture."""


@show.command()
@click.argument('flow_path')
@click.pass_obj
def flow(app_manager, flow_path):
    """Open the flow's architecture diagram in a browser.

    Fixed: the docstring previously read "Deploy a pipeline with compose or
    k8s." — copy-pasted from the deploy command; click displays it as the
    help text for `kctl show flow`.

    :param app_manager: AppManager injected by the kctl root group
    :param flow_path: path to the flow definition (.k) file
    """
    url = app_manager.get_flow(flow_path).mermaid_url

    try:
        webbrowser.open_new_tab(url)
    except webbrowser.Error as ex:
        # Headless / no-browser environments: log the URL instead of failing.
        app_manager.logger.critical(
            '%s\nCould not open browser... Please visit:\n%s' % (ex, url))
def decorator_group(decorators):
    """Bundle several decorators into a single one.

    :param decorators: iterable of decorators, applied in iteration order
        (so the *last* decorator ends up outermost)
    :return: one decorator equivalent to applying each in turn

    Example:
        deploy_options = decorator_group([
            click.option('-c', '--connection', required=True),
            click.option('-r', '--rebind', is_flag=True),
            click.option('-d', '--debug', is_flag=True),
        ])
    """
    def apply_all(func):
        for wrap in decorators:
            func = wrap(func)
        return func
    return apply_all
class Flow:
    """Parsed representation of a .k flow file.

    Each non-comment line describes one service (see parse_line for the
    column format); services are wired together through bound-port ids.
    """

    def __init__(self, path: 'Path'):
        self.services = dict()
        # bound-port id -> ids of services connecting in ('ins') / out ('outs')
        self.ports = defaultdict(
            lambda: {'ins': set(), 'outs': set()})
        self.path = path
        self.lines = []
        # Shuffled pool of unique local port numbers handed out per service.
        self.p = list(range(53001, 65001))
        random.shuffle(self.p)

        with Path(path).open() as fh:
            for line in fh:
                self.add_line(line)

    def add_line(self, line: str):
        """Parse one .k line and register its service ('#' comment lines are skipped)."""
        if not line.strip().startswith('#'):
            self.lines += [line]
            service = parse_line(line)
            self._add_service(service)

    def _add_service(self, s: dict):
        """Record the service, its port links, its display name and two local ports."""
        in_id = s['i'][1]
        if in_id:
            self.ports[in_id]['outs'].add(s['id'])

        out_id = s['o'][1]
        if out_id:
            self.ports[out_id]['ins'].add(s['id'])

        s['name'] = s['model'] + str(s['id']) if s['model'] else s['app'] + str(s['id'])
        s['local_in'] = self.p.pop()
        s['local_out'] = self.p.pop()
        self.services[s['id']] = s

    def swarm(self):
        """Write a docker-compose v3.4 file describing the flow.

        NOTE(review): the file is written to the current working directory,
        not next to self.path — confirm callers (kctl build) run from there.
        """
        y = {'version': '3.4', 'services': {}}
        for s in self.services.values():
            new = dict(volumes=['./.cache:/workspace'], image=s['image'])
            new['command'] = [s['command']] if s['command'] else []
            in_id = s['i'][1]
            out_id = s['o'][1]

            if s['app'] != 'httpclient':

                new['command'] += ['--socket_in', s['i'][0], '--socket_out', s['o'][0]]

                if s['yaml_path']:
                    new['command'] += ['--yaml_path', s['yaml_path']]

                # if connecting in: target the peer's bound out-port
                if in_id:
                    new['command'] += ['--host_in', self.services[in_id]['name']]
                    new['command'] += ['--port_in', self.services[in_id]['local_out']]
                # if binding in
                else:
                    new['command'] += ['--port_in', s['local_in']]

                # if connecting out: target the peer's bound in-port
                if out_id:
                    new['command'] += ['--host_out', self.services[out_id]['name']]
                    new['command'] += ['--port_out', self.services[out_id]['local_in']]
                # if binding out
                else:
                    new['command'] += ['--port_out', s['local_out']]

            else:
                # http clients only need the frontend's grpc host
                new['ports'] = ['80:80']
                new['command'] += ['--grpc_host', self.services[out_id]['name']]

            new['command'] = ' '.join([str(x) for x in new['command']])
            y['services'][s['name']] = new

        # Fixed: was `YAML().dump(y, open('docker-compose.yml', 'w'))`, which
        # leaked the file handle and relied on GC for the final flush.
        with open('docker-compose.yml', 'w') as out:
            YAML().dump(y, out)

    @property
    def mermaid_url(self):
        """URL of a mermaid live-editor view of the flow graph (base64 payload)."""
        app_colors = dict(
            httpclient=('#FFE0E0', '#000', '1px'),
            frontend=('#FFE0E0', '#000', '1px'),
            router=('#C9E8D2', '#000', '1px'),
            encoder=('#FFDAAF', '#000', '1px'),
            preprocessor=('#CED7EF', '#000', '1px'),
            indexer=('#FFFBC1', '#000', '1px'),
        )

        lines = ['graph TD']
        for cls, fmt in app_colors.items():
            lines += ['classDef {} fill:{},stroke:{},stroke-width:{};'.format(cls, *fmt)]

        def edge(left_s, right_s):
            # One mermaid edge labelled with the out/in socket types.
            return ['{ln}--{lt}-{rt}-->{rn}'.format(
                ln=left_s['name'],
                lt=left_s['o'][0],
                rt=right_s['i'][0],
                rn=right_s['name']
            )]

        for bound_id, port in self.ports.items():
            bound_s = self.services[bound_id]
            # lines += ['subgraph %s' % bound_s['name']]

            for conn_id in port['ins']:
                conn_s = self.services[conn_id]
                lines += edge(conn_s, bound_s)

            for conn_id in port['outs']:
                conn_s = self.services[conn_id]
                lines += edge(bound_s, conn_s)

            # lines += ['end']

        for s in self.services.values():
            lines += ['class {} {};'.format(s['name'], s['app'])]

        return 'https://mermaidjs.github.io/mermaid-live-editor/#/view/' + b64encode('\n'.join(lines).encode()).decode()
import os

# Supported serialization formats for rows streamed downstream.
VALID_MODES = ['json', 'raw']


class PostgresClient(CLIClient):
    """CLI client that streams rows from a postgres table as byte payloads.

    Expects ``args`` to carry: ``creds`` (credential repo spec), ``table``,
    ``id_column``, ``data_columns``, ``send_type`` ('json' or 'raw') and
    ``limit`` (-1 for unlimited).
    """

    @property
    def bytes_generator(self):
        """Yield one encoded payload per table row, ordered by the id column.

        Raises ValueError (logged, not propagated) when ``send_type`` is
        unknown or the id column is not a 1-based incremental id.
        """
        try:
            args = self.args
            creds = get_creds(args.creds)

            psql = creds.postgres
            # libpq reads its SSL configuration from these env vars.
            os.environ['PGSSLMODE'] = psql.sslmode
            os.environ['PGSSLROOTCERT'] = psql.sslrootcert.path

            columns = ', '.join([args.id_column] + args.data_columns)
            query = '''SELECT %s FROM %s''' % (columns, args.table)
            query += ' ORDER BY %s ASC' % args.id_column
            query += ' LIMIT %d' % args.limit if args.limit > 0 else ''

            # Validate the mode up front, before opening a connection.
            if args.send_type not in VALID_MODES:
                raise ValueError('"mode" parameter must be one of %s' % VALID_MODES)

            connection = psycopg2.connect(user=psql.username,
                                          password=psql.password,
                                          host=psql.host,
                                          port=psql.port,
                                          dbname=psql.dbname)
            cursor = connection.cursor()
            cursor.execute(query)

            for i, (_id, *row) in enumerate(cursor):
                msg_id = i + 1
                if msg_id != _id:
                    raise ValueError(
                        '"%s" column must be an incremental id starting from 1. '
                        'Got id %s for row %s' % (args.id_column, _id, msg_id))

                if args.send_type == 'json':
                    # BUG FIX: the original did json.dumps(zip(columns, row)) —
                    # a zip object is not JSON-serializable, and `columns` is
                    # the joined *string*, so zip would pair characters with
                    # values. Build a real dict keyed by the data column names.
                    yield json.dumps(dict(zip(args.data_columns, row))).encode()
                elif args.send_type == 'raw':
                    yield ''.join(row).encode()

        except Exception:
            # Narrowed from a bare `except:` so GeneratorExit/KeyboardInterrupt
            # are no longer swallowed; log the full traceback for diagnosis.
            self.logger.error(traceback.format_exc())

    def query_callback(self, req, resp):
        # Hook invoked by CLIClient for query responses; currently log-only.
        self.logger.info(req, resp)


if __name__ == '__main__':
    parser = set_client_cli_parser()
    parser.add_argument('--limit', type=int, help='number of postgres rows (-1 for unlimited)')
    cred_repo_help = 'cred repo set up according to git:koursaros-ai/koursaros.credentials spec'
    parser.add_argument('--creds', type=str, required=True, help=cred_repo_help)
    parser.add_argument('--yaml_path', type=str)
    cli_args = parser.parse_args()
    # Merge yaml `parameters` into the parsed CLI namespace.
    yaml = TrainableBase.load_yaml(cli_args.yaml_path)
    for k, v in yaml['parameters'].items():
        setattr(cli_args, k, v)
    PostgresClient(cli_args)
import requests
import pathlib
import csv
import json

HEADERS = {'Content-Type': 'application/json'}
MODES = ['index', 'train', 'query']


class Client:
    """Reads rows from a CSV file and POSTs them to a local gateway.

    `mode` selects both the endpoint (/index, /train or /query) and the
    per-row body builder (the method of the same name). Construction
    immediately streams the file.
    """

    def __init__(self, mode, path, limit=None):
        self.path = pathlib.Path(path)
        self.csv = csv.DictReader(self.path.open())
        self.mode = mode
        self.limit = limit  # max number of CSV rows to send; None = all
        if mode not in MODES:
            raise ValueError('%s is not valid. Please choose one of %s' % (mode, MODES))

        self.iter_csv(getattr(self, mode))

    def post(self, data):
        """POST `data` to the endpoint for the current mode; store the result."""
        print('Posting:', data)
        response = requests.post('http://localhost:80/%s' % self.mode, data=data, headers=HEADERS)
        res = json.loads(response.content)
        if 'res' in res:
            self.result = json.loads(res['res'][0])
        else:
            self.result = res
        print('Returned:', self.result)

    def iter_csv(self, get_body_from_row):
        """Build one body per CSV row (up to `limit`) and POST them joined by newlines.

        BUG FIX: the original appended the row *before* checking the limit and
        compared with strict '>', so up to limit + 2 rows were sent; the limit
        is now enforced before each append.
        """
        to_send = []
        for i, row in enumerate(self.csv):
            if self.limit is not None and i >= self.limit:
                break
            to_send.append(get_body_from_row(row))
        self.post('\n'.join(to_send).encode())

    def index(self, row):
        # Second CSV column is the document body; remaining columns ride along.
        body = list(row.values())[1]
        req = dict(data=body)
        req.update(row)
        return json.dumps(req, ensure_ascii=False)

    def train(self, row):
        return json.dumps(row, ensure_ascii=False)

    def query(self, row):
        # First CSV column is the query text.
        return list(row.values())[0]

    def query_one(self, text):
        """Run a single ad-hoc query string and return the top hit's text."""
        self.mode = 'query'
        self.post(text.encode())
        return self.text()

    def text(self):
        # Drill into the gateway response for the top-ranked chunk's text.
        return self.result['search']['topkResults'][0]['doc']['chunks'][0]['text']
com stk npv,banco latinoamericano come-e, 11 | 9,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc spons ads repr 0.10 ord cls a us0.00005, 12 | 10,whole foods market,whole foods markets inc div: 0.540, 13 | 11,walgreens boots alliance inc com,walgreens boots alli, 14 | 12,diageo plc new gb spon adr,diageo p l c spon adr new, 15 | 13,guggenheim bulletshares 2016,guggenheim bulletshares 2016 high yield, 16 | 14,vanguard small-cap index adm,vanguard small-cap index fund inst, 17 | 15,emerging markets,vanguard ftse emerging marke, 18 | 16,spdr s&p 500 etf iv,s&p 500 index spdr, 19 | 17,tegna inc com,tegna inc, 20 | 18,deere & company,deere co, 21 | 19,vanguard mid-cap index,vanguard mid-cap index fund institutional plus shares, 22 | 20,jpmorgan chase & co div: 1.760,jpmorgan chase & co, 23 | 21,american funds europacific growth fund - r6,af europac growth r6, 24 | 22,vanguard total bond market idx-adm,vang tot bd mk is pl, 25 | 23,unitedhealth gp inc div: 2.000,unitedhealth group incorporated, 26 | 24,american intl group inc warrant 01/19/2021,american intl gro 21 wtswarrants exp 01/19/21, 27 | 25,fifth street finance corp com,fifth street financial corp com, 28 | 26,ishares jpm embi global core,ishares jpm usd emrg mkt bnd etf, 29 | 27,metwest tot rtn bd m,metropolitan west tot ret bond, 30 | 28,exelixis inc com,exelixis inc, 31 | 29,glenmede large cap gwth,glenmede large cap growth, 32 | 30,af europac growth r6,american funds europacific growth r6, 33 | 31,dreamworks animation skg cl a,dreamworks animation skg inc cl a, 34 | 32,dfa us small cap value port instl,dfa u s small cap value cl i, 35 | 33,vanguard ltd-trm t/e adm,vanguard limited-term tax-exempt fund, 36 | 34,ishares trust msci united kingdom etf,ishares msci u k etf shs, 37 | 35,pimco total return cl a,pimco total return fund adm, 38 | 36,amg yacktman fund service class,amg yacktman service, 39 | 37,vanguard intermediate-term treasury fund admiral shares,vang intm treas adm, 40 | 38,pimco incm cl d,pimco 
fds income instl, 41 | 39,vanguard growth index fund investor shares,vang growth idx adm, 42 | 40,vanguard mid cap index,vanguard mid-cap index fund admiral shares, 43 | 41,vanguard value index fund institutional shares,vanguard value index inv, 44 | 42,vanguard target retirement 2060 fund,vanguard tgt rtrmnt 2060, 45 | 43,netflix inc,netflix inc., 46 | 44,cisco sys inc com,cisco sys inc, 47 | 45,pimco income fund cl p,pimco income fd i, 48 | 46,united parcel service-cl b,united parcel svc inc cl b, 49 | 47,michael kors holdings ltd com npv,michael kors hldgs ltd, 50 | 48,alaska air group inc com,alaska air group inc, 51 | 49,vanguard total bond market index adm,vanguard ttl bnd mrk indx inst, 52 | 50,vanguard intermediate-term investment-grade fund admiral shares,vanguard intermediate term inv gr fd inv cl, 53 | 51,oppenhmr develpng mkts y,oppenheimer developing markets cl y, 54 | 52,texas instrs incorporated,texas instruments inc, 55 | 53,sptn intl index fai,spartan intl index fid adv class, 56 | 54,heartland pmt sys inc com,heartland paymnt sys, 57 | 55,vanguard total bond market index fund institutional shares,vanguard total bond market index, 58 | 56,ariel fund inv,ariel fund, 59 | 57,flir sys inc,flir systems inc, 60 | 58,pimco income fund cl d,pimco income instl, 61 | 59,vanguard shortterm investgrade adm,vanguard shrt trm invmnt grd-inv, 62 | 60,bristol myers squibb company,bristol myers squibb co, 63 | 61,metlife inc,metlife incorporated div: 1.500, 64 | 62,nxp semiconductors n v com,nxp semiconductors nv, 65 | 63,novo nordisk a/s-adr nvorepstg 1/2 cl b sh,novo-nordisk a-s fadr 1 adr reps 1 ord shs, 66 | 64,vanguard total bond market index fund institutional shares,vanguard total bond market idx instl pls, 67 | 65,accenture ltd ord,accenture plc ireland, 68 | 66,pimco total ret fd instl,pimco total return fund cl p, 69 | 67,pimco income a,pimco incm inst cl, 70 | 68,t. rowe price institutional large cap growth,t. 
rowe price institutional large cap growth fund, 71 | 69,hsbc hldgs plc spons adr new,hsbc hldgs plc spon adr new, 72 | 70,carnival corp ord (panama),carnival corp com, 73 | 71,baidu inc spons ads repr 0.10 ord cls a us0.00005,baidu inc - spon adr, 74 | 72,vanguard mid-cap index fund institutional shares,vanguard mid cap index ins, 75 | 73,ishares iboxx $ invt grade corp bd,ishares iboxx ig corp bond, 76 | 74,ultimate software group inc,ultimate software gp, 77 | 75,pimco income fund cl d,pimco income a, 78 | 76,franklin income series cl a,franklin incm fd cl a, 79 | 77,ormat technologies inc,ormat technologies, 80 | 78,aqr mgd futures strat fd cl i,aqr aqr mngd futures strategy i, 81 | 79,ishares russell midcap growth,ishares russell midcap g etf div: 0.903, 82 | 80,vanguard target retirement 2045 fund,vang target ret 2045, 83 | 81,vanguard total intl stk,vanguard total intl etf, 84 | 82,sptn inter treas bnd investor class,sptn int tr idx adv, 85 | 83,artisan intl value fund inv,artisan international value, 86 | 84,"stag industrials, inc. 
com",stag industrial inc com, 87 | 85,parnassus endeavor fd,parnassus endeavor fund investor shares, 88 | 86,johnson & johnson div: 3.000,johnson & johnson jnj, 89 | 87,eaton vance floating rate fd cl a,eaton vance bond fund cl i, 90 | 88,vanguard total bond index adm,vanguard ttl bnd mrk indx inst, 91 | 89,pimco total ret fd instl,pimco total return fund - class r, 92 | 90,pimco total return fund cl d,pimco tot return adm, 93 | 91,tivo inc com,tidewater inc com new, 94 | 92,zimmer biomet hldgs,zimmer biomet holdings inc com, 95 | 93,ford mtr company del com par $0.01,ford mtr co del com par $0.01, 96 | 94,guggenheim bullet shrs 2018 hi y c bd etf,guggenheim bulletshares 2018, 97 | 95,apple inc com,apple incorporated, 98 | 96,ishares jpm usd emr etf,ishares jpm usd emrg mkt bnd e tf, 99 | 97,edison international cmn,edison intl, 100 | 98,conagra foods inc div: 1.000,conagra foods inc, 101 | 99,advanced micro devices,advanced micro devices inc, 102 | 100,american tower corporation reit,american tower reit inc (hldg co) shs, 103 | 101,vang sm cap idx adm,vanguard small cap index fund, 104 | 102,vanguard short-term bond index fund investor shares,vanguard short-term bond index, 105 | 103,"vanguard small cap index, adm",vnguard index trust small cap idx instl, 106 | 104,ishares jpm usd emrg mkt bnd e tf,ishares jpm embi global core, 107 | 105,blackrock strat income i,blackrock strategic income opptys investor cl a, 108 | 106,ishares russell midcap,ishares russell mid-cap etf, 109 | 107,ishares core msci emg mkts etf,harding loevner emerging mkts, 110 | 108,vanguard intl equity index fds ftse emerging mkts etf,vanguard ftse emerging mark etf iv, 111 | 109,american funds europacfic r5,american euro pac gr r5, 112 | 110,vanguard crsp us small cap ind ex,vanguard small cap etf, 113 | 111,delta air lines inc dela new,delta air lines inc. 
(de), 114 | 112,ishares 20+ yr treasu bond etf div: 3.107,ishares 20+ year treasury bo, 115 | 113,sptn glb xus idx fai,spartan global ex us index fid adv cl, 116 | 114,fidelity new insights i,fidelity advisor new insights fund cl i, 117 | 115,ishares tr nat amt free bd,ishares tr natl mun bd etf fd, 118 | 116,vanguard small cap index adm,vanguard small-cap index fund institutional shares, 119 | 117,first eagle global fd cl a,first eagle global class a, 120 | 118,t rowe price mid cap growth,mid-cap growth fund, 121 | 119,skyworks solutions,skyworks solutions inc com, 122 | 120,tile shop hldgs inc,tile shop hldg inc, 123 | 121,t. rowe price health sciences,t rowe price health science fund inc, 124 | 122,coca-cola co/the,coca-cola company, 125 | 123,atwood oceanics inc com,atwood oceanics inc., 126 | 124,dodge & cox funds income fund,dodge & cox income fund n/l, 127 | 125,vanguard small cap index fund,vnguard index trust small cap idx instl, 128 | 126,vanguard mid-cap value etf,vanguard mid cap value etf, 129 | 127,american funds euro pacific growth r6,am fnd europacfic grth r6, 130 | 128,wisdomtree intl smallcp dividend etf,wisdomtree tr intl smallcap divid fd isin #us9 sedol #b17fg17, 131 | 129,boeing company cmn,boeing company, 132 | 130,ishares msci emerging markets,harding loevner emerging mkts, 133 | 131,transocean ltd zug namen akt,transocean ltd ord, 134 | 132,vanguard small-cap index fund admiral shares,vanguard small-cap index adm, 135 | 133,pimco total ret fd instl,pimco total return fd cl c, 136 | 134,jpmorgan equity income fund cl a,jpmorgan us equity fund - class r6, 137 | 135,energy sector index spdr,energy sector spdr etf, 138 | 136,vanguard total internatlbnd etf iv,vanguard charlotte total intl bd index fd etf, 139 | 137,t rowe price international discovery fund,t. 
rowe price international discovery, 140 | 138,united sts stl cp (new),united states stl corp new, 141 | 139,coca cola co,coca-cola company, 142 | 140,spdr barclays high yield bond (jnk),spdr barclays capital high yield bond et, 143 | 141,vanguard smallcap index fund,vanguard small-cap index fund institutional shares, 144 | 142,fidelity select utilities portfolio,fid sel utilities, 145 | 143,select sector spdr trust the technology select sector spdr fund,technology sector sp etf, 146 | 144,guggenheim s&p 500 equal we cons etf,guggenheim s&p 500 equalwe cons etf, 147 | 145,visa inc class a shares,visa inc cl a div: 0.560, 148 | 146,skyworks solutions inc com,skyworks solutions inc, 149 | 147,alibaba group holding ltd spons ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord, 150 | 148,pimco income instl,pimco incm inst cl, 151 | 149,vanguard growth index inv,vanguard growth index fund admiral shares, 152 | 150,canadian natl railway co,canadian natl ry co f, 153 | 151,first tr exchange traded fd dow jones internet index fd,first tr exchange traded fd dow jones in, 154 | 152,vanguard total bond index adm,vanguard total bond market index i, 155 | 153,ishares gold tr,ishares gold tr ishares, 156 | 154,oppenheimer developing markets y fund,oppenheimer developing markets cl y, 157 | 155,vanguard total bond market index adm,vang tot bd mk is pl, 158 | 156,t. 
rowe price new income,guidemark core fixed income, 159 | 157,vanguard mid-cap index fund institutional shares,vang midcap idx inst, 160 | 158,mfs international new discovery r5,afs international growth & income fund cl f1, 161 | 159,fitbit inc,fitbit inc cl a, 162 | 160,vanguard growth index fund investor shares,vanguard growth index fd admiral share, 163 | 161,comcast corp (new) class a div: 1.100,comcast corp cl a, 164 | 162,invesco diversified dividend investor cl,fidelity advisor diversified international fund cl c, 165 | 163,blackrock high yld bd port cl k,blackrock high yield bond portfolio svc, 166 | 164,sina com ord (caym is),sina corporation com, 167 | 165,t.rowe price new horizons-t,new horizons fund, 168 | 166,vanguard value etf (vtv),vanguard value etf, 169 | 167,berkshire hathawayinc del cl b new,berkshire hathawayinc, 170 | 168,schlumberger limited com usd0.01,schlumberger ltd., 171 | 169,union pacific corp,union pac corp com, 172 | 170,alps etf tr alerian mlp,alps alerian mlp etf, 173 | 171,vanguard div growth fd investor shrs,vanguard dividend growth fund investor shares, 174 | 172,tyson foods inc-cl a tsn,tyson foods inc class a, 175 | 173,american mutual fund-a,american mutual fund cl a, 176 | 174,canadian national railway,canadian natl railway company com, 177 | 175,deutsche x-trackers msci eafe hedged equity etf,deutsche x-trackers msci eafe equity etf, 178 | 176,vanguard total bond index adm,vanguard total bond market idx instl pls, 179 | 177,western digital corp,western digital corp com, 180 | 178,ishares core msci emg mkts etf,harding loevner emerg mrkts port adv, 181 | 179,pimco high income fd com shs,pimco high income fund, 182 | 180,ishares inc core msci emerging mkts etf,harding loevner emerging mkts, 183 | 181,time warner inc com,time warner inc, 184 | 182,vanguard international growth fund admiral,vanguard international growth fund admiral shares, 185 | 183,ishares trust core msci total intl stk etf,ishares core msci ttl int stk, 186 | 
184,ishares iboxx invt gradebond etf,ishares iboxx $ invt grade corp bd, 187 | 185,interactive brokers group inc. com,interactive brokers class a, 188 | 186,adobe systems,adobe systems incorporated, 189 | 187,vang tot bd mkt adm,vanguard total bond market idx instl pls, 190 | 188,yandex nv com,yandex n.v. com usd0.01 cl a, 191 | 189,vanguard small cap index - a,vanguard small-cap index fund inst, 192 | 190,sirius xm hldgs inc com isin #us5 sedol #bgldk10,sirius xm hldgs inc com, 193 | 191,vanguard target retirement 203 5 fund,vang target ret 2035, 194 | 192,communications sales&leas inc div: 2.400,communications sales&leas inc com, 195 | 193,vanguard index fds vanguard total stk mkt etf,us total stock market, 196 | 194,t. rowe price equity income fund,t. rowe price equity income, 197 | 195,ishares tr core us aggt bd etf,ishares core us aggregate bond etf, 198 | 196,american funds europacific growthr3,american funds europacific growth fund, 199 | 197,lazard emerging mkts eqty port opn,lzrd emrg mkts eq o, 200 | 198,pimco income a,pimco income administrative, 201 | 199,american express co,american express company, 202 | 200,taser international,taser intl inc del com, 203 | 201,vanguard short term tax exempt fd investor shr,vanguard short-term tax-exempt fund investor shares, 204 | 202,ishares core msci emerging markets etf,harding loevner emerg mrkts port adv, 205 | 203,vanguard institutional index fund institutional shares,vanguard institl index, 206 | 204,trp real estate adv,t rowe price real estate fund adv cl, 207 | 205,jp morgan chase & co com,jpmorgan chase & co div: 1.760, 208 | 206,vanguard 500 index fund admira l,vanguard 500 index fund admiral class, 209 | 207,dollar gen corp new com,dollar general corp, 210 | 208,us silica holdings inc,u s silica hldgs inc com, 211 | 209,alphabet inc cap stk cl c cap stk cl c,alphabet inc cl c, 212 | 210,ishares msci usa min volility etf,proshares short vix short term etf, 213 | 211,fidelity low-priced stock,fid low priced 
stk, 214 | 212,vang st invstgrd inv,vanguard short-term invest-grade, 215 | 213,goldman sachs mangd futures strategy a,aqr aqr mngd futures strategy i, 216 | 214,select sector spdr trust health care select index,health care select spdr fund, 217 | 215,metropolitan west fds total ret cl i,metropolitan west tot ret bond, 218 | 216,chubb limited com,chubb ltd, 219 | 217,vanguard ftse emerg mkts etf,vanguard intl equity index fds ftse emerging mkts etf, 220 | 218,energy transfer partners un,cheniere energy partners lp com, 221 | 219,baron partners,baron partners fund, 222 | 220,prudential financial inc cmn,prudential finl inc, 223 | 221,t rowe price retirement 2050 fund,t. rowe price retirement 2050 fund, 224 | 222,templeton global bond class a,templeton global bd r, 225 | 223,dominion resources inc va new,dominion resources inc/va, 226 | 224,ishares tr core us aggt bd etf,ishares core u.s. aggregate, 227 | 225,citigroup inc,citigroup inc new div: 0.200, 228 | 226,invesco comstock fund cl a,invesco comstock y, 229 | 227,oppen developing mkts a,oppenheimer developing mkts fd cl a, 230 | 228,alphabet inc shs cl a,alphabet inc voting, 231 | 229,national grid new adr each repr 5 ord gbp0.11395,national grid plc new spon adr, 232 | 230,ishares russell 3000 index etf,ishares russell 3000 etf, 233 | 231,titan international inc com,titan international inc, 234 | 232,proshares tr ii ultra bloomberg crude oi,proshares ultra bloomberg crude oil etf, 235 | 233,ishares core msci emerging markets etf,harding loevner emerging mkts, 236 | 234,vang tot bd mkt adm,vanguard total bond market index i, 237 | 235,mfs global total return fund cl a,mfs global total return cl a, 238 | 236,metropolitan west total return m,metropolitan west tot ret bond, 239 | 237,blckrck inflation protect,blackrock inflation protected bond instl, 240 | 238,dfa real estate securities i,dfa real estate securities fund institutional class, 241 | 239,leucadia natl corp com,leucadia national co, 242 | 240,pimco 
income fd i,pimco income instl, 243 | 241,trp retirement 2045,t. rowe price retirement 2045, 244 | 242,wal-mart stores inc com isin #us9311421039 sedol #2936921,scana corp new com isin #us7 sedol #2545844, 245 | 243,oppenheimer developing market a,oppenheimer developing mkts fd cl a, 246 | 244,nuveen high yield muni bond fund cl i,nuveen high yield municipal bond a, 247 | 245,vanguard short term invt grade admiral,vanguard short-term investment-grade fund investor shares, 248 | 246,the growth fund of america,amer fds grwth fd amr a, 249 | 247,fireeye inc,fireeye inc com usd0.0001, 250 | 248,templeton global bond fund advisor class,templeton global bond fund adv cl, 251 | 249,ishares inc core msci emerging mkts etf,harding loevner emerg mrkts port adv, 252 | 250,silver wheaton corp. ads,silver wheaton corporation com npv isin #ca6 sedol #b058zx6, 253 | 251,dfa us small cap value prtf instl,dfa us sm cap value, 254 | 252,united states oil fund lp exchange-traded fund,united states oil fund lp unit, 255 | 253,pimco total return instl,fund: pimco total return admin, 256 | 254,tesla motors inc com,tesla motors inc., 257 | 255,schwab short term us treasury etf,schwab strategic tr short-term us treasury, 258 | 256,vanguard total bond index adm,vang tot bd mk is pl, 259 | 257,van small cap index admir,vang sm cap idx inst, 260 | 258,facebook incorporated class a,facebook inc cl a, 261 | 259,vanguard ext market index inst,vanguard extended market idx adm, 262 | 260,t. 
rowe price new horizons,new horizons fund, 263 | 261,vanguard total bond market index fund admiral shares,vanguard bond index total mkt investor, 264 | 262,lloyds banking group plc div: 0.129,lloyds banking group plc, 265 | 263,vbr:vanguard small-cap value etf,vanguard small cap valueetf iv, 266 | 264,constellation brands inc cl a,constellation brand class a, 267 | 265,jp morgan chase & co com,jp morgan chase & co, 268 | 266,pimco total return instl,pimco total return fund instl cl, 269 | 267,ishares gold etf,ishares gold trust com, 270 | 268,schwab intl core equity,schwab intl core eqty fd instl cl, 271 | 269,vanguard total bond market index-admiral,vang tot bd mkt inst, 272 | 270,pimco total return instl,pimco total return fund adm, 273 | 271,alibaba group hldg ltd sponsor,alibaba group hldg ltd adr, 274 | 272,ishares russell 1000 growth,russell 1000 growth (ishares), 275 | 273,walt disney co,disney, 276 | 274,vanguard s&p 500 etf (voo),vanguard index fds s&p 500 etf, 277 | 275,ishares msci eafe min volatility etf,ishares trust msci eafe min volatil etf, 278 | 276,kraft heinz co com,kraft heinz co div: 2.300, 279 | 277,metr w tot rtn bond cl m,metropolitan west tot ret bond, 280 | 278,berkshire hathaway cl-b new,berkshire hathaway inc., 281 | 279,momenta pharmaceuticals,momenta pharmaceuticals inc com, 282 | 280,colgate palmolive,colgate palmolive co com, 283 | 281,ishares inc core msci emerging mkts etf,ishares msci emerging markets, 284 | 282,powershares qqq etf,powershares qqq trust sr 1 etf, 285 | 283,vanguard small-cap index fund admiral,vanguard small-cap index fund institutional shares, 286 | 284,corning inc cm,corning inc, 287 | 285,vereit inc reit,vereit inc, 288 | 286,ishares core msci emerging div: 0.995,ishares core msci emg mkts etf, 289 | 287,american funds europacific growth-r6,american funds europacific growth fund class r-6, 290 | 288,national oilwell varco inc com,national-oilwell varco inc, 291 | 289,vanguard 500 index fund admiral 
shares,vnguard 500 index admiral shares, 292 | 290,facebook inc.,facebook inc class a, 293 | 291,whole foods market inc,whole foods markets inc div: 0.540, 294 | 292,bhp billiton plc - adr,bhp billiton plc spons adr each rep 2 ord usd0.50, 295 | 293,ishares russell 2000 growth etf iv,ishares russell 2000 grwth etf div: 1.243, 296 | 294,osterweis strategic income fund,professionally mgd ptfl osterweis strategic inc fd, 297 | 295,vanguard small cap value etf,vanguard small cap valueetf iv, 298 | 296,wa core plus bond i,western asset core plus bond fd cl fi, 299 | 297,kraft heinz co div: 2.300,kraft heinz co com, 300 | 298,nokia corp-spon adr,nokia corp cls a adr (finnish), 301 | 299,citrix systems inc.,citrix systems inc, 302 | 300,devry education group inc div: 0.360,devry education group, 303 | 301,vanguard gnma fund admiral shares,vanguard gnma fund investor share, 304 | 302,vanguard star investor class,vanguard star fund investor shares, 305 | 303,vanguard total bond market index adm,vanguard total bond market index fund investor shares, 306 | 304,ishares gold tr,ishares gold etf, 307 | 305,eaton corp plc com,eaton corp plc f, 308 | 306,pepsico inc com,pepsico inc cmn, 309 | 307,wal-mart stores inc com,wal-mart stores inc., 310 | 308,whole foods mkt inc com,whole foods market, 311 | 309,american capital world growth and income fd a,american capital world grth & inc a, 312 | 310,vanguard mid cap index fund - admiral,vanguard mid cap index fund admiral class, 313 | 311,fidelity corporate bond fund,baird core plus bond inst, 314 | 312,spartan extended mkt index fid adv class,spartan extended mkt index investor cl, 315 | 313,nxp semiconductors f,nxp semiconductors n v, 316 | 314,columbia dividend income,col dividend inc z, 317 | 315,vanguard sml-cap ind-adm,vanguard small-cap index fund admiral, 318 | 316,vanguard index 500 port,vanguard 500 index fund-inv, 319 | 317,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc sponsored adr repstg ord shares class a, 320 | 
318,delaware value fund institutional,delaware value cl a, 321 | 319,vanguard total bond market index fund admiral shares,vanguard total bond market index inv, 322 | 320,arista networks inc,arista networks inc com usd0.0001, 323 | 321,ishares 1-3 yr treasury bnd etf,ishares 1-3 yr treasury bnd et f, 324 | 322,united sts oil fd lp units,united states oil fund lp exchange-traded fund, 325 | 323,pimco income fund cl p,pimco fds income instl, 326 | 324,diageo plc fadr 1 adr reps 4 ord shs,diageo p l c spon adr new, 327 | 325,harbor capital appreciation instl,harbor capital appreciation, 328 | 326,price t rowe group inc com isin #us74144t1088 sedol #2702337,c h robinson worldwide inc com new isin #us8 sedol #2116228, 329 | 327,ishares s&p midcap fund,ishares core s&p mid capetf, 330 | 328,pimco total return cl a,fund: pimco total return admin, 331 | 329,johnson and johnson,johnson and johnson com, 332 | 330,ishares msci cda etf,ishares msci canada index, 333 | 331,carnival corp com,carnival corp f, 334 | 332,priceline group,priceline grp inc com new, 335 | 333,westport innovation f,westport innovations inc, 336 | 334,wisdomtree emerging markets high dividend fund etf,wisdomtree emrg mrkt hg div etf, 337 | 335,d.r. 
horton inc,d r horton co, 338 | 336,vanguard total bond market index adm,vanguard total bond market idx instl pls, 339 | 337,ishares 7-10 year treas bond etf,ishares barclays 7-10 year treasury bond, 340 | 338,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus shares, 341 | 339,vanguard sml-cap ind-adm,vanguard small-cap index fund institutional shares, 342 | 340,vanguard interm-term investment-grde adm,vanguard intermediate term inv gr fd inv cl, 343 | 341,teva pharmaceuticals adr,teva pharmaceuticals ind ltd israel adr, 344 | 342,dfa intl small cap value cl i,dfa intl small cap value port instl, 345 | 343,prudential financial inc div: 2.800,prudential finl inc, 346 | 344,af bond fd amer r6,the bond fund of america-a, 347 | 345,allergan plc,allergan plc f, 348 | 346,vale sa,vale s a adr, 349 | 347,vanguard short-term corporate bond etf,vanguard short-term corporate bond, 350 | 348,vanguard index fds vanguard small cap growth vipers formerly,vanguard small-cap grwth etf, 351 | 349,vanguard short-term bondetf,vanguard short term etf, 352 | 350,goldman sachs mgd futures strat a,aqr mgd futures strat fd cl i, 353 | 351,vanguard total bond index adm,vanguard total bond market index fund investor shares, 354 | 352,xinyuan real estate com,xinyuan real estate co ltd spon adr, 355 | 353,ishares core u.s. 
aggregate bond etf,ishares core total us bond market etf, 356 | 354,devon energy corp,devon energy corporation (new) cmn, 357 | 355,berkshire hathaway class b,berkshire hathawayinc, 358 | 356,vanguard reit index investor,vanguard reit index inv, 359 | 357,markel corp hldg co,markel corp, 360 | 358,vanguard high-yield corporate fund investor shares,vanguard high yield corp fund admiral share, 361 | 359,oneok inc new div: 2.460,oneok inc cm (new), 362 | 360,iron mtn inc new com div: 1.940,iron mtn inc reit, 363 | 361,howard hughes corp com,howard hughes corp, 364 | 362,af bond fd amer r6,the bond fund of america, 365 | 363,kraft heinz co,kraft heinz co div: 2.300, 366 | 364,costco wholesale crp del,costco wholesale co, 367 | 365,first trust amex biotechnology index fund,first trust nyse arca biotechnology index fund, 368 | 366,american tower corporation isin #us0 sedol #b7fbfl2,american tower corp reit, 369 | 367,the growth fund of america,the growth fund of america-529a (1005), 370 | 368,priceline group inc com,priceline group inc, 371 | 369,pimco total return fund instl cl,total return fund (pimco), 372 | 370,southwest gas corp.,southwest gas corp div: 1.800, 373 | 371,vanguard dividend growth fund investor shares,vanguard dividend growth fund, 374 | 372,j p morgan chase & co,jpmorgan chase & co div: 1.760, 375 | 373,merck & company inc new,merck & co inc new, 376 | 374,pimco incm cl d,pimco income fd i, 377 | 375,vanguard fixed income secs inter term invt grade fd admiral cl,vanguard intermediate term inv gr fd inv cl, 378 | 376,vanguard growth index fund investor shares,vanguard growth index admiral, 379 | 377,guggenheim bulletshares 2018,guggenheim bulletshares 2018 high yield corp bd, 380 | 378,vanguard total bond market div: 2.009,vanguard total bond mkt, 381 | 379,blackhawk network hldgs inc cl a,blackhawk netwk hldgs inc, 382 | 380,vanguard balanced index fd inv cl shrs,vanguard balanced index fund admiral shares, 383 | 381,vanguard mid-cap value index 
fund,vanguard mid-cap value etf, 384 | 382,ishares tr natl mun bd etf fd,ishares nationl amt freemuni etf, 385 | 383,bank amer corp com,bank of america corp., 386 | 384,russell 1000 growth (ishares),ishares russell 1000 grw etf div: 1.363, 387 | 385,american fd growth fd of america cl a,amer fds grwth fd amr a, 388 | 386,ishares us real estate etf,ishares u s real estate etf, 389 | 387,dfa emerging markets portfolio,dfa emrging markets, 390 | 388,alphabet inc-cl a,alphabet inc. class a, 391 | 389,raytheon co (new) div: 2.680,raytheon co com, 392 | 390,charles schwab new,schwab charles corp new, 393 | 391,alibaba group holding lt,alibaba group hldg ltd sponsor, 394 | 392,oakmark international i,oakmark fds oakmark intl, 395 | 393,mastercard incorporated cmn class a,mastercard inc-class a, 396 | 394,kimberly-clark corp,kimberly-clark corp., 397 | 395,ishares 7-10 yr treasry bd etf div: 1.973,ishares barclays 7-10 year treasury bond, 398 | 396,vanguard small-cap index fund admiral shares,vanguard small-cap index fund inst, 399 | 397,foot locker inc com isin #us9 sedol #2980906,foot locker inc com isin #us3448491049 sedol #2980906, 400 | 398,vanguard ttl bond mkt idx adm,vanguard total bond market index inv, 401 | 399,chesapeake energy corporation oklahoma,chesapeake energy corp, 402 | 400,charles schwab corporation cmn,schwab charles corp new, 403 | 401,vanguard intl equity index fds ftse all world ex usa small cap index fd etf shs,vanguard ftse all world ex us small cap etf, 404 | 402,chicago bridge & iron co nv,chicago bridge & iron company n.v. 
eur0.01 reg, 405 | 403,amer fds grwth fd amr a,american gr fd of america a, 406 | 404,amg managers real estate securities fund,amg managers real estate securities fd, 407 | 405,american new perspective class a,new perspective fund cl a, 408 | 406,lloyds banking group plc sp adr,lloyds banking group plc div: 0.129, 409 | 407,vanguard 500 idx adm,vanguard s&p 500 index - a, 410 | 408,vanguard total bond market index admiral,vanguard total bond market index fund institutional shares, 411 | 409,j p morgan chase & co,jp morgan chase & co, 412 | 410,halliburton co hldg,halliburton company, 413 | 411,guggenheim bulletshares 2018 high yield corp bd,claymore exchange traded fd trust guggenheim bltshrs 2018 high yld cp bd, 414 | 412,ishares russell 2000 value etf,ishares russell 2000 value etf iv, 415 | 413,pimco total return a,total return fund (pimco), 416 | 414,vanguard index fds vanguard reit etf formerly vanguard index tr to 05/24/01 reit viper shs,vanguard index fds vanguard reit etf formerly vanguard index, 417 | 415,powershares emrg mkts sovrgn dbt etf,powershares emerging markets sovereign d, 418 | 416,vang tot bd mkt adm,vanguard total bond market index, 419 | 417,arena pharmaceuticals inc com,arena pharmaceuticals, 420 | 418,nokia corp spon adr f1 adr rep 1 nokia corps,nokia corp sponsored adr, 421 | 419,vanguard value index fund admiral shares,vanguard value index fund institutional shares, 422 | 420,vanguard total int bd idx etf,vanguard charlotte total intl bd index fd etf, 423 | 421,pimco total ret fd instl,pimco total return r, 424 | 422,intl business machines,intl business mach, 425 | 423,harbor international fund,harbor international, 426 | 424,vanguard intl bond index etf,vanguard total internatlbnd etf iv, 427 | 425,vanguard extended market index institutional class,vang ext mkt idx ins, 428 | 426,caterpillar inc del,caterpillar inc, 429 | 427,visa inc com cl a,salesforce.com inc com, 430 | 428,pimco total return admin,total return fund (pimco), 431 | 
429,pepsico inc.,pepsico inc nc div: 2.810, 432 | 430,powershares fin pfd portfoli,powershares etf financial pfd portfolio, 433 | 431,fid sel biotech,fidelity select biotechnology, 434 | 432,arcelormittal sa luxembourg ny registry sh isin #us4 sedol #b295f26,arcelormittal sa (luxembourg), 435 | 433,realty incm corp reit,realty income corporation com, 436 | 434,vanguard total international stock index fund admiral shares,vanguard ttl intl stk ind adm, 437 | 435,c h robinson worldwide inc com new isin #us8 sedol #2116228,c.h. robinson worldwide inc, 438 | 436,sprint corp shs series -,sprint corp, 439 | 437,google inc cl c,alphabet inc cap stk cl c, 440 | 438,templeton glbal bond adv,templeton global bond adv, 441 | 439,apollo investments corp com,apollo invt corp com, 442 | 440,sptn us bond idx is,spartan us bond indx fidelity adv class, 443 | 441,ishares inc msci emrg mkts min volatility etf,ishares msci markets minvol etf, 444 | 442,vanguard total bond market index admiral,vanguard total bond market idx-adm, 445 | 443,southwest gas corp div: 1.800,southwest gas corp, 446 | 444,vanguard ftse developd mkt etf,vanguard ftse dev markets etf, 447 | 445,spdr nuveen barclays muni bond etf,spdr nuveen barclays muni, 448 | 446,dodge & cox interntl stock,dodge & cox international stock, 449 | 447,alexion pharms inc,alexion pharmaceuticals inc, 450 | 448,vanguard total bond market idx-adm,vanguard ttl bnd mrk indx inst, 451 | 449,general mtrs co,general motors co., 452 | 450,zoes kitchen inc com isin #us7 sedol #bl95n36,zoes kitchen inc com, 453 | 451,loomis sayles bond fund cl i,loomis sayles mlti-asset inc a, 454 | 452,athenahealth inc delaware,athenahealth inc, 455 | 453,ishares silver trust etf,ishares silver shares, 456 | 454,emerging markets,harding loevner emerging mkts, 457 | 455,markel corp hldg co,markel corp holding company, 458 | 456,ameriprise financial inc,ameriprise finl inc, 459 | 457,vanguard malvern fds etf,vanguard short term inflation protected, 460 | 
458,alibaba group hldg limited sponsored ads,alibaba group hldg adr fsponsored adr 1 adr reps 1 ord, 461 | 459,american funds american hi inc tr r3,american high-income trust, 462 | 460,first eagle global i,first eagle global fund cl i, 463 | 461,van mid cap index adm m4940,vanguard mid cap index, 464 | 462,chesapeake energy corp,chesapeake energy corp com, 465 | 463,starbucks corp. cmn,starbucks corp washington div: 0.800, 466 | 464,mc donalds corp div: 3.560,mcdonalds corp, 467 | 465,american funds europacifc r3,american funds europacific r3, 468 | 466,block h & r inc,block h&r inc, 469 | 467,vanguard ftse developed mkts etf,vanguard ftse developed market etf, 470 | 468,vang tot bd mkt adm,vang tot bd mk is pl, 471 | 469,pimco investment grade corporate bond fund - class a,fidelity conservativ income bond fd cl i, 472 | 470,vanguard mid cap index fund - admiral,vanguard mid cap index fd, 473 | 471,vanguard intrmd-term bond index adm,vanguard inter-term bond index port inv, 474 | 472,infinera corporation com isin #us1 sedol #b1yb5y4,infinera corp com, 475 | 473,vanguard crsp us small cap index,vanguard small cap etf, 476 | 474,oneok partners lp lp,oneok partners l p unit ltd partnership, 477 | 475,american mutual fund,american fd american mutual fd cl f2, 478 | 476,american funds europacific growth-r5,american europacific growth, 479 | 477,vang tot bd mkt adm,vanguard ttl bnd mrk indx inst, 480 | 478,breitburn energy partners lp c,breitburn energy partners lp com, 481 | 479,new york community bancorp inc.,new york community, 482 | 480,pimco income instl,pimco income administrative, 483 | 481,select sector spdr trust technology select index,sector spdr tr shs ben int technology, 484 | 482,vanguard ttl bond mkt idx adm,vanguard bond index total mkt investor, 485 | 483,tmpl global bond a,templeton global bond fund r, 486 | 484,ishares 20+ year treasury bo,ishares 20+ year, 487 | 485,vang sm cap idx adm,vnguard index trust small cap idx instl, 488 | 486,linkedin corp 
class a,linkedin corp-a, 489 | 487,primecap odyssey stock,primecap odyssey stock fund, 490 | 488,time warner inc,time warner inc com new, 491 | 489,vanguard total bond market index-admiral,vanguard total bond market index fund institutional plus, 492 | 490,marathon pete corporation,loews corporation div: 0.250, 493 | 491,ishares core msci emerging etf,harding loevner emerging mkts, 494 | 492,vanguard short term tax exempt admiral share,vanguard short-term tax-exempt fund investor shares, 495 | 493,ford motor com,ford mtr co, 496 | 494,vanguard ftse all world ex us etf,vanguard ftse all-world ex-u, 497 | 495,vmware inc cl a com,vmware inc., 498 | 496,royal dutch shell plc spons adr a,royal dutch shell plc sponsored adr repstg a shs, 499 | 497,spdr nuveen barclays municipal bond etf,spdr nuveen barclays capital m div: 0.556, 500 | 498,united parcel service cl b,united parcel service inc cl b, 501 | 499,vang smcp gr idx adm,vanguard small cap growth index admiral, 502 | 500,novo-nordisk a s adr,novo-nordisk a s adr isin #us6 sedol #2651202, 503 | 501,fidelity mass muni income,nuveen equity premium income, 504 | 502,ishares russell mid-cap value etf,ishares russell mid cap value etf iv, 505 | 503,yamana gold inc cmn,yamana gold inc com, 506 | 504,vanguard total bond market idx-adm,vanguard total bond market idx instl pls, 507 | 505,otter tail corp com,otter tail corporation cmn, 508 | 506,sptn intl index ins,sptn intl index adv, 509 | 507,cnooc ltd. 
adr (sponsored),cnooc limited adr fsponsored adr 1 adr rep 100 cl h ord, 510 | 508,eaton vance global macro abs ret a,eaton vance global macro abslte rt cl a, 511 | 509,dfa u s small cap value cl i,dfa us small cap value prtf instl, 512 | 510,sears canada inc (canada),sears cda inc, 513 | 511,at&t inc com isin #us00206r1023 sedol #2831811,franklin res inc com isin #us8 sedol #2350684, 514 | 512,vanguard total international bond index etf,vanguard total international bond et, 515 | 513,wisdomtree japan hedged equity -,wisdomtree japan hedged eq, 516 | 514,templeton global bond fund advisor class,templeton glbal bond adv, 517 | 515,trp health sciences,t. rowe price health sciences fund, -------------------------------------------------------------------------------- /koursaros/hub/encoder/robertainfer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "encode"] -------------------------------------------------------------------------------- /koursaros/hub/encoder/robertainfer/dim64.yml: -------------------------------------------------------------------------------- 1 | !CharEmbeddingEncoder 2 | parameters: 3 | dim: 64 -------------------------------------------------------------------------------- /koursaros/hub/encoder/textbyte/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | RUN echo 'yo' 6 | 7 | ENTRYPOINT ["gnes", "encode", "--py_path", "textbyte.py"] -------------------------------------------------------------------------------- /koursaros/hub/encoder/textbyte/max1024.yml: -------------------------------------------------------------------------------- 1 | !TextByteEncoder 2 | parameters: 3 | max_seq_len: 1024 -------------------------------------------------------------------------------- 
class TextByteEncoder(BaseTextEncoder):
    """Encodes text as fixed-length vectors of raw UTF-8 byte values.

    Each sentence is UTF-8 encoded, truncated to ``max_seq_len`` bytes,
    right-padded with NUL bytes, and returned as a ``uint8`` array.
    Useful for text-search backends that index raw bytes.
    """
    is_trained = True

    def __init__(self, max_seq_len: int, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._msl = max_seq_len  # fixed output length in bytes

    def pad_and_vector(self, sent: str) -> np.ndarray:
        """Return *sent* as a uint8 vector of exactly ``max_seq_len`` bytes.

        Fix: truncating at a byte boundary can split a multibyte UTF-8
        character. The previous retry removed a fixed two bytes and
        re-decoded, which could itself raise ``UnicodeDecodeError`` for
        3- or 4-byte characters (uncaught, inside the ``except``). Decoding
        with ``errors='ignore'`` drops any trailing partial sequence safely.
        """
        raw = sent.encode()[:self._msl]
        # drop a trailing partial multibyte sequence left by the truncation;
        # the input is a valid str, so only the cut tail can be invalid
        raw = raw.decode('utf-8', errors='ignore').encode()
        padded = raw + b'\x00' * (self._msl - len(raw))
        return np.frombuffer(padded, dtype=np.uint8)

    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        """Encode a batch of sentences into a (len(text), max_seq_len) uint8 array."""
        return np.stack([self.pad_and_vector(sent) for sent in text])
./workspace 7 | WORKDIR /workspace 8 | 9 | ENTRYPOINT ["gnes", "client", "http"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/faisscpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/hub-indexer:latest-faiss-cpu 2 | 3 | ADD *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/faisscpu/base.yml: -------------------------------------------------------------------------------- 1 | !FaissIndexer 2 | parameters: 3 | data_path: /workspace 4 | index_key: HNSW32 5 | num_dim: 64 -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | RUN apk add gcc python3-dev musl-dev 4 | RUN pip install pyahocorasick 5 | 6 | ADD *.py *.yml ./ 7 | 8 | ENTRYPOINT ["gnes", "index", "--py_path", "keyword.py"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/base.yml: -------------------------------------------------------------------------------- 1 | !KeywordIndexer {} 2 | 3 | -------------------------------------------------------------------------------- /koursaros/hub/indexer/keyword/keyword.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | from gnes.indexer.base import BaseChunkIndexer as BCI 6 | 7 | 8 | class KeywordIndexer(BCI): 9 | 10 | def __init__(self, *args, **kwargs): 11 | """ 12 | Initialize an indexer that implements the AhoCorasick Algorithm 13 | """ 14 | super().__init__(*args, **kwargs) 15 | import ahocorasick 16 | self._automaton = ahocorasick.Automaton() 17 | self.size = 0 18 
| 19 | def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs): 20 | if vectors.dtype != np.uint8: 21 | raise ValueError('vectors should be ndarray of uint8') 22 | 23 | for key, vector in zip(keys, vectors): 24 | self._automaton.add_word(self.decode_textbytes(vector), key) 25 | self.size += 1 26 | 27 | self.logger.error(list(self._automaton.keys())) 28 | 29 | def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]: 30 | if keys.dtype != np.uint8: 31 | raise ValueError('vectors should be ndarray of uint8') 32 | elif not self.size > 0: 33 | print('Warning: empty index queried') 34 | return [] 35 | 36 | self._automaton.make_automaton() 37 | 38 | ret = [] 39 | for key in keys: 40 | ret_i = defaultdict(int) 41 | for _, (doc_id, offset) in self._automaton.iter(self.decode_textbytes(key)): 42 | ret_i[(doc_id, offset)] += 1 43 | 44 | # _doc_id, _offset, _weight, _relevance 45 | results = [(*k, 1.0, v) for k, v in ret_i.items()] 46 | # topk by number of keyword matches 47 | ret.append(sorted(results, reverse=True, key=lambda x: x[-1])[:top_k]) 48 | 49 | return ret 50 | 51 | @staticmethod 52 | def decode_textbytes(vector: np.ndarray): 53 | return vector.tobytes().rstrip(b'\x00').decode() 54 | -------------------------------------------------------------------------------- /koursaros/hub/indexer/lvdb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN pip install plyvel>=1.0.5 --no-cache-dir --compile 4 | 5 | ADD *.py *.yml ./ 6 | 7 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/lvdb/base.yml: -------------------------------------------------------------------------------- 1 | !LVDBIndexer 2 | parameters: 3 | data_path: /workspace -------------------------------------------------------------------------------- 
/koursaros/hub/indexer/rocksdb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y python-dev librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libgflags-dev 5 | RUN pip install python-rocksdb --no-cache-dir --compile 6 | RUN apt-get install -y git 7 | RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp 8 | RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile 9 | 10 | ADD *.py *.yml ./ 11 | 12 | ENTRYPOINT ["gnes", "index"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/rocksdb/base.yml: -------------------------------------------------------------------------------- 1 | !RocksDBIndexer 2 | parameters: 3 | data_path: /workspace -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-buster 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y git 5 | RUN pip install grpcio pyzmq protobuf ruamel.yaml ruamel.yaml.clib aiohttp 6 | RUN pip install --upgrade git+https://github.com/colethienes/gnes.git --no-cache-dir --compile 7 | 8 | ADD *.py *.yml ./ 9 | 10 | ENTRYPOINT ["gnes", "index", "--py_path", "simple_dict.py"] -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/base.yml: -------------------------------------------------------------------------------- 1 | !SimpleDictIndexer {} -------------------------------------------------------------------------------- /koursaros/hub/indexer/simple_dict/simple_dict.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from google.protobuf.json_format import 
class SimpleDictIndexer(BDI):
    """In-memory doc indexer that stores each Document as a JSON string."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._content = {}  # doc key -> JSON-serialized gnes_pb2.Document

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Serialize and store docs under their keys (later adds overwrite)."""
        # routine trace output — demoted from error to debug level
        self.logger.debug(keys)
        self._content.update({k: MessageToJson(d) for (k, d) in zip(keys, docs)})

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Reconstruct the stored Documents for *keys*.

        Raises KeyError for keys that were never added (original behavior).
        """
        self.logger.debug(keys)
        return [Parse(self._content[k], gnes_pb2.Document()) for k in keys]
class WhooshIndexer(BCI):
    """Chunk indexer backed by a persistent Whoosh full-text index.

    Keys are (doc_id, offset) pairs; vectors are NUL-padded uint8 arrays
    of UTF-8 text (see the textbyte encoder). Queries are parsed against
    the stemmed ``body`` field and scored with TF-IDF.
    """

    def __init__(self, data_path, *args, **kwargs):
        super().__init__(*args, **kwargs)
        schema = Schema(doc_id=NUMERIC(stored=True),
                        offset=NUMERIC(stored=True),
                        body=TEXT(analyzer=StemmingAnalyzer()))
        if not os.path.exists(data_path):
            # makedirs: create intermediate directories too
            os.makedirs(data_path)
            # informational, not an error condition
            self.logger.warning('Please mount volume for persisting index.')
        # explicit existence check replaces the previous bare `except:`
        # around open_dir, which would mask unrelated failures
        if index.exists_in(data_path):
            self.ix = index.open_dir(data_path)
        else:
            self.logger.warning('Creating empty whoosh index')
            self.ix = index.create_in(data_path, schema)

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, _, *args, **kwargs):
        """Add (doc_id, offset) -> text documents; commits once per batch."""
        # routine trace (was an error-level 'Recieved...' dump)
        self.logger.info('received add index request for %d keys' % len(keys))
        if vectors.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        writer = self.ix.writer()
        for key, vector in zip(keys, vectors):
            writer.add_document(doc_id=key[0], offset=key[1],
                                body=self.decode_textbytes(vector))
        writer.commit()

    def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tuple]]:
        """Return per-query lists of (doc_id, offset, weight, relevance)."""
        if keys.dtype != np.uint8:
            raise ValueError('vectors should be ndarray of uint8')

        ret = []
        qp = QueryParser("body", schema=self.ix.schema)
        with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            for key in keys:
                query = qp.parse(self.decode_textbytes(key))
                ret.append([(result['doc_id'], result['offset'], 1.0, 1.0)
                            for result in searcher.search(query, limit=top_k)])
        # result trace — demoted from error to debug level
        self.logger.debug(ret)
        return ret

    @staticmethod
    def decode_textbytes(vector: np.ndarray) -> str:
        """Strip NUL padding and decode the uint8 vector back to text."""
        return vector.tobytes().rstrip(b'\x00').decode()
class BlockRouter(BaseRouter):
    """Router that drops messages belonging to the configured runtimes.

    :param block: runtime names (e.g. ['train']) whose messages should be
        blocked instead of forwarded; None/empty means forward everything.
    """

    def __init__(self, block: List[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # fix: previous default was a mutable [] shared across instances
        self.block = block if block is not None else []

    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """Raise BlockMessage if the message's runtime is in the block list.

        :param msg: incoming message
        """
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')
        # routine trace — demoted from error to debug level
        self.logger.debug(runtime)

        if runtime in self.block:
            self.logger.info('Blocking %s msg...' % runtime)
            raise BlockMessage
class RerankRouter(BaseRouter):
    """Re-scores search results with a transformers sequence-classification model.

    In the 'train' runtime it fine-tunes the model on (Query, Candidate,
    Label) JSON triples from the training docs. In the 'search' runtime it
    pairs the query text with each candidate's first chunk, scores the
    pairs, and rewrites ``topk_results`` in ranked order. A search request
    and its response are paired by ``request_id`` through ``query_dict``:
    whichever arrives first is stashed and that message is blocked
    (``BlockMessage``) until its counterpart shows up.
    """

    def __init__(self, model_name: str = None, data_dir: str = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_name = model_name    # HF model id, e.g. 'bert-base-uncased'
        self.data_dir = data_dir        # cache dir for model weights/tokenizer
        self.max_grad_norm = 1.0        # gradient-clipping threshold for training
        self.lr = 1e-3                  # AdamW learning rate
        # request_id -> stashed query text (request seen first) or stashed
        # scored results (response seen first); see apply()
        self.query_dict = dict()

    def post_init(self):
        """Load model/tokenizer and set up the optimizer (runs after fork)."""
        model_config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.data_dir)
        model_config.num_labels = 1  # set up for regression
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): compares a torch.device against a str, so this branch
        # never fires — presumably meant `self.device.type == "cpu"`; confirm.
        if self.device == "cpu": self.logger.error("RUNING ON CPU")
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
                                                                               config=model_config,
                                                                               cache_dir=self.data_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.data_dir)
        self.rerank_model.to(self.device)

        self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
        self.scheduler = ConstantLRSchedule(self.optimizer)

    def get_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult') -> str:
        # identity key used when pairing results; doc_id doubles as the key
        return x.doc.doc_id

    def set_key(self, x: 'gnes_pb2.Response.QueryResponse.ScoredResult', k: str) -> None:
        x.doc.doc_id = k

    # @batching
    def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs):
        """Train on labeled pairs or rerank search results, depending on runtime.

        Blocks (raises BlockMessage) when only one half of a request/response
        pair has arrived, or when the runtime is neither 'train' nor 'search'.
        :param msg: incoming message
        """

        all_scored_results = [sr for sr in msg.response.search.topk_results]
        runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body')

        if runtime == 'train':  # training samples are given
            inputs = []
            labels = []
            for doc in msg.request.train.docs:
                # each training doc is a JSON blob: {'Query', 'Candidate', 'Label'}
                ex = json.loads(doc.raw_bytes)
                inputs.append(
                    self.tokenizer.encode_plus(ex['Query'], ex['Candidate'], add_special_tokens=True))
                labels.append(float(ex['Label']))

            labels = torch.tensor(labels, dtype=torch.float).to(self.device)

        elif runtime == 'search':
            if msg.WhichOneof('body') == 'request':
                self.logger.error('got request')
                if not msg.request.request_id in self.query_dict:
                    # first half of the pair: stash the query text and wait
                    self.query_dict[msg.request.request_id] = msg.request.search.query.raw_bytes.decode()
                    raise BlockMessage
                else:
                    # response arrived earlier: retrieve its stashed results
                    query = msg.request.search.query.raw_bytes.decode()
                    all_scored_results = self.query_dict[msg.request.request_id]
            else:
                self.logger.error('got response')
                if not msg.response.request_id in self.query_dict:
                    # first half of the pair: stash the scored results and wait
                    self.query_dict[msg.response.request_id] = all_scored_results
                    raise BlockMessage
                else:
                    # request arrived earlier: retrieve its stashed query text
                    query = self.query_dict[msg.response.request_id]
            # score (query, first-chunk text) pairs for every candidate
            inputs = [
                self.tokenizer.encode_plus(
                    query,
                    sr.doc.chunks[0].text,
                    add_special_tokens=True,
                ) for sr in all_scored_results]
            self.logger.error([sr.doc.chunks[0].text for sr in all_scored_results])
            labels = None  # inference mode: no loss/backprop below

        else:
            # any other runtime is simply dropped
            raise BlockMessage

        if len(inputs) == 0:
            print("Warning: empty input set, ignoring.")
            return

        # manual right-padding of input_ids/token_type_ids and the matching
        # attention mask, to the longest sequence in the batch
        max_len = max(len(t['input_ids']) for t in inputs)
        input_ids = [t['input_ids'] + [0] * (max_len - len(t['input_ids'])) for t in inputs]
        token_type_ids = [t['token_type_ids'] + [0] * (max_len - len(t['token_type_ids'])) for t in inputs]
        attention_mask = [[1] * len(t['input_ids']) + [0] * (max_len - len(t['input_ids'])) for t in inputs]

        input_ids = torch.tensor(input_ids).to(self.device)
        token_type_ids = torch.tensor(token_type_ids).to(self.device)
        attention_mask = torch.tensor(attention_mask).to(self.device)

        if labels is not None:
            # training step: single forward/backward pass with clipping
            loss = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                     labels=labels, attention_mask=attention_mask)[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.rerank_model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.rerank_model.zero_grad()
            msg.response.train.status = gnes_pb2.Response.Status.SUCCESS

        else:
            # inference: score candidates and rewrite topk_results in rank order
            with torch.no_grad():
                logits = self.rerank_model(input_ids, token_type_ids=token_type_ids,
                                           attention_mask=attention_mask)[0]
            scores = np.squeeze(logits.detach().cpu().numpy())
            if len(logits) == 1:
                # squeeze of a single row yields a 0-d array; re-wrap it
                scores = [scores]
            ranked_results = []
            for sr, score in zip(all_scored_results, scores):
                ranked_results.append((sr.doc, score))

            k = msg.response.search.top_k
            top_k = sorted(ranked_results, key=lambda x: x[1], reverse=True)[:k]

            msg.response.search.ClearField('topk_results')
            for doc, score in top_k:
                sr = msg.response.search.topk_results.add()
                sr.score.value = float(score)
                sr.doc.CopyFrom(doc)
msg.response.search.ClearField('topk_results') 128 | for doc, score in top_k: 129 | sr = msg.response.search.topk_results.add() 130 | sr.score.value = float(score) 131 | sr.doc.CopyFrom(doc) 132 | -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gnes/gnes:latest-alpine 2 | 3 | ADD *.py *.yml ./ 4 | 5 | ENTRYPOINT ["gnes", "route", "--py_path", "resp_req.py"] -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/base.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/koursaros/hub/router/resp_req/base.yml -------------------------------------------------------------------------------- /koursaros/hub/router/resp_req/resp_req.py: -------------------------------------------------------------------------------- 1 | from gnes.router.base import BaseRouter 2 | from gnes.proto import gnes_pb2 3 | 4 | class RespReqRouter(BaseRouter): 5 | def apply(self, msg: 'gnes_pb2.Message', *args, **kwargs): 6 | """ 7 | Log the incoming message 8 | :param msg: incoming message 9 | """ 10 | 11 | runtime = getattr(msg, msg.WhichOneof('body')).WhichOneof('body') 12 | print('recieved msg') 13 | print(msg) 14 | print(runtime) 15 | if runtime == 'index': 16 | req = gnes_pb2.Message() -------------------------------------------------------------------------------- /koursaros/hub/tests/sonnets_small.txt: -------------------------------------------------------------------------------- 1 | From fairest creatures we desire increase, 2 | That thereby beauty's rose might never die, 3 | But as the riper should by time decease, 4 | His tender heir might bear his memory: 5 | But thou contracted to thine own bright eyes, 6 | Feed'st thy light's flame 
with self-substantial fuel, 7 | Making a famine where abundance lies, 8 | Thy self thy foe, to thy sweet self too cruel: 9 | Thou that art now the world's fresh ornament, 10 | And only herald to the gaudy spring, 11 | Within thine own bud buriest thy content, 12 | And, tender churl, mak'st waste in niggarding: 13 | Pity the world, or else this glutton be, 14 | To eat the world's due, by the grave and thee. 15 | 16 | When forty winters shall besiege thy brow, 17 | And dig deep trenches in thy beauty's field, 18 | Thy youth's proud livery so gazed on now, 19 | Will be a totter'd weed of small worth held: 20 | Then being asked, where all thy beauty lies, 21 | Where all the treasure of thy lusty days; 22 | To say, within thine own deep sunken eyes, 23 | Were an all-eating shame, and thriftless praise. 24 | How much more praise deserv'd thy beauty's use, 25 | If thou couldst answer 'This fair child of mine 26 | Shall sum my count, and make my old excuse,' 27 | Proving his beauty by succession thine! 28 | This were to be new made when thou art old, 29 | And see thy blood warm when thou feel'st it cold. 30 | 31 | Look in thy glass and tell the face thou viewest 32 | Now is the time that face should form another; 33 | Whose fresh repair if now thou not renewest, 34 | Thou dost beguile the world, unbless some mother. 35 | For where is she so fair whose uneared womb 36 | Disdains the tillage of thy husbandry? 37 | Or who is he so fond will be the tomb 38 | Of his self-love, to stop posterity? 39 | Thou art thy mother's glass and she in thee 40 | Calls back the lovely April of her prime; 41 | So thou through windows of thine age shalt see, 42 | Despite of wrinkles, this thy golden time. 43 | But if thou live, remembered not to be, 44 | Die single and thine image dies with thee. 45 | 46 | Unthrifty loveliness, why dost thou spend 47 | Upon thy self thy beauty's legacy? 
48 | Nature's bequest gives nothing, but doth lend, 49 | And being frank she lends to those are free: 50 | Then, beauteous niggard, why dost thou abuse 51 | The bounteous largess given thee to give? 52 | Profitless usurer, why dost thou use 53 | So great a sum of sums, yet canst not live? 54 | For having traffic with thy self alone, 55 | Thou of thy self thy sweet self dost deceive: 56 | Then how when nature calls thee to be gone, 57 | What acceptable audit canst thou leave? 58 | Thy unused beauty must be tombed with thee, 59 | Which, used, lives th' executor to be. 60 | 61 | Those hours, that with gentle work did frame 62 | The lovely gaze where every eye doth dwell, 63 | Will play the tyrants to the very same 64 | And that unfair which fairly doth excel; 65 | For never-resting time leads summer on 66 | To hideous winter, and confounds him there; 67 | Sap checked with frost, and lusty leaves quite gone, 68 | Beauty o'er-snowed and bareness every where: 69 | Then were not summer's distillation left, 70 | A liquid prisoner pent in walls of glass, 71 | Beauty's effect with beauty were bereft, 72 | Nor it, nor no remembrance what it was: 73 | But flowers distilled, though they with winter meet, 74 | Leese but their show; their substance still lives sweet. 75 | 76 | Then let not winter's ragged hand deface, 77 | In thee thy summer, ere thou be distilled: 78 | Make sweet some vial; treasure thou some place 79 | With beauty's treasure ere it be self-killed. 80 | That use is not forbidden usury, 81 | Which happies those that pay the willing loan; 82 | That's for thy self to breed another thee, 83 | Or ten times happier, be it ten for one; 84 | Ten times thy self were happier than thou art, 85 | If ten of thine ten times refigured thee: 86 | Then what could death do if thou shouldst depart, 87 | Leaving thee living in posterity? 88 | Be not self-willed, for thou art much too fair 89 | To be death's conquest and make worms thine heir. 90 | 91 | Lo! 
in the orient when the gracious light 92 | Lifts up his burning head, each under eye 93 | Doth homage to his new-appearing sight, 94 | Serving with looks his sacred majesty; 95 | And having climbed the steep-up heavenly hill, 96 | Resembling strong youth in his middle age, 97 | Yet mortal looks adore his beauty still, 98 | Attending on his golden pilgrimage: 99 | But when from highmost pitch, with weary car, 100 | Like feeble age, he reeleth from the day, 101 | The eyes, 'fore duteous, now converted are 102 | From his low tract, and look another way: 103 | So thou, thyself outgoing in thy noon 104 | Unlooked on diest unless thou get a son. -------------------------------------------------------------------------------- /koursaros/hub/tests/test_block.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from gnes.cli.parser import set_router_parser, _set_client_parser 5 | from gnes.service.router import RouterService 6 | from gnes.service.base import SocketType 7 | from gnes.client.base import ZmqClient 8 | from gnes.proto import gnes_pb2 9 | 10 | 11 | class TestBlock(unittest.TestCase): 12 | 13 | def setUp(self): 14 | dirname = os.path.dirname(__file__) 15 | self.rerank_router_yaml = os.path.join(dirname, '../', 'router/block/block_train.yml') 16 | self.python_code = os.path.join(dirname, '../', 'router/block/block.py') 17 | 18 | 19 | self.args = set_router_parser().parse_args([ 20 | '--yaml_path', self.rerank_router_yaml, 21 | '--socket_out', str(SocketType.PUB_BIND), 22 | '--py_path', self.python_code 23 | ]) 24 | self.c_args = _set_client_parser().parse_args([ 25 | '--port_in', str(self.args.port_out), 26 | '--port_out', str(self.args.port_in), 27 | '--socket_in', str(SocketType.SUB_CONNECT) 28 | ]) 29 | 30 | def test_block_router(self): 31 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 32 | msg = gnes_pb2.Message() 33 | msg.request.train.docs.add() 34 | 
c1.send_message(msg) 35 | msg = gnes_pb2.Message() 36 | msg.request.index.docs.add() 37 | c1.send_message(msg) 38 | r = c1.recv_message() 39 | -------------------------------------------------------------------------------- /koursaros/hub/tests/test_keyword.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from gnes.proto import gnes_pb2 5 | from gnes.client.base import ZmqClient 6 | from gnes.service.base import SocketType 7 | from gnes.cli.parser import set_router_parser, _set_client_parser 8 | from gnes.service.indexer import IndexerService 9 | import numpy as np 10 | 11 | 12 | class TestKeyword(unittest.TestCase): 13 | 14 | def setUp(self): 15 | dirname = os.path.dirname(__file__) 16 | self.yaml = os.path.join(dirname, 'yaml', 'test-keyword.yml') 17 | self.python_code = os.path.join(dirname, '../', 'indexer/keyword/keyword.py') 18 | 19 | self.test_str = [] 20 | self.test_vec = [] 21 | self._msl = 512 22 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 23 | for line in f: 24 | line = line.strip() 25 | if line == '': continue 26 | self.test_vec.append(np.frombuffer( 27 | line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)), 28 | dtype=np.uint8 29 | )) 30 | self.test_str.append(line) 31 | 32 | def test_keyword(self): 33 | args = set_router_parser().parse_args([ 34 | '--yaml_path', self.yaml, 35 | '--socket_out', str(SocketType.PUB_BIND), 36 | '--py_path', self.python_code, 37 | ]) 38 | args.as_response = True 39 | c_args = _set_client_parser().parse_args([ 40 | '--port_in', str(args.port_out), 41 | '--port_out', str(args.port_in), 42 | '--socket_in', str(SocketType.SUB_CONNECT) 43 | ]) 44 | with IndexerService(args), ZmqClient(c_args) as c1: 45 | msg = gnes_pb2.Message() 46 | for i, vec in enumerate(self.test_vec): 47 | doc = msg.request.index.docs.add() 48 | doc.doc_id = i 49 | doc.raw_text = self.test_str[i] 50 | c = doc.chunks.add() 51 | c.doc_id = i 52 | c.offset 
= 0 53 | c.embedding.data = vec.tobytes() 54 | for d in vec.shape: 55 | c.embedding.shape.extend([d]) 56 | c.embedding.dtype = str(vec.dtype) 57 | c.text = self.test_str[i] 58 | c1.send_message(msg) 59 | 60 | r = c1.recv_message() 61 | self.assert_(r.response.index) 62 | 63 | for i, vec in enumerate(self.test_vec): 64 | msg = gnes_pb2.Message() 65 | msg.request.search.query.doc_id = 1 66 | msg.request.search.top_k = 1 67 | c = msg.request.search.query.chunks.add() 68 | c.doc_id = 1 69 | c.embedding.data = vec.tobytes() 70 | for d in vec.shape: 71 | c.embedding.shape.extend([d]) 72 | c.embedding.dtype = str(vec.dtype) 73 | c.offset = 0 74 | c.weight = 1 75 | c.text = self.test_str[i] 76 | c1.send_message(msg) 77 | r = c1.recv_message() 78 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 79 | 80 | def tearDown(self): 81 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/test_reranker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import json 4 | 5 | from gnes.proto import gnes_pb2 6 | from gnes.client.base import ZmqClient 7 | from gnes.service.base import SocketType 8 | from gnes.cli.parser import set_router_parser, _set_client_parser 9 | from gnes.service.router import RouterService 10 | 11 | 12 | class TestReranker(unittest.TestCase): 13 | 14 | def setUp(self): 15 | dirname = os.path.dirname(__file__) 16 | self.rerank_router_yaml = os.path.join(dirname, 'yaml', 'test-reranker.yml') 17 | self.python_code = os.path.join(dirname, '../', 'router/rerank/rerank.py') 18 | 19 | self.test_str = [] 20 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 21 | for line in f: 22 | line = line.strip() 23 | if line: 24 | self.test_str.append(line) 25 | 26 | self.args = set_router_parser().parse_args([ 27 | '--yaml_path', self.rerank_router_yaml, 28 | '--socket_out', str(SocketType.PUB_BIND), 29 | '--py_path', 
self.python_code 30 | ]) 31 | self.c_args = _set_client_parser().parse_args([ 32 | '--port_in', str(self.args.port_out), 33 | '--port_out', str(self.args.port_in), 34 | '--socket_in', str(SocketType.SUB_CONNECT) 35 | ]) 36 | 37 | # @unittest.skip('SKIPPING TRAIN TEST') 38 | def test_rerank_train(self): 39 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 40 | msg = gnes_pb2.Message() 41 | msg.response.search.ClearField('topk_results') 42 | msg.request.search.query.raw_text = 'This is a query' 43 | 44 | for i, line in enumerate(self.test_str[:5]): 45 | s = msg.response.search.topk_results.add() 46 | s.score.value = 0.1 47 | s.doc.doc_id = i 48 | s.doc.raw_text = line 49 | 50 | msg.envelope.num_part.extend([1]) 51 | msg.response.search.top_k = 5 52 | c1.send_message(msg) 53 | 54 | r = c1.recv_message() 55 | print(r) 56 | 57 | msg = gnes_pb2.Message() 58 | 59 | for i, line in enumerate(self.test_str): 60 | doc = msg.request.train.docs.add() 61 | doc.doc_id = i 62 | doc.raw_bytes = json.dumps({ 63 | 'Query' : 'test query', 64 | 'Candidate' : line, 65 | 'Label' : 1.0 if i % 2 == 0 else 0.0 66 | }).encode('utf-8') 67 | 68 | msg.envelope.num_part.extend([1]) 69 | c1.send_message(msg) 70 | r = c1.recv_message() 71 | print(r) 72 | 73 | @unittest.skip("SKIPPING QUERY TEST") 74 | def test_rerank(self): 75 | with RouterService(self.args), ZmqClient(self.c_args) as c1: 76 | msg = gnes_pb2.Message() 77 | msg.response.search.ClearField('topk_results') 78 | msg.request.search.query.raw_text = 'This is a query' 79 | 80 | for i, line in enumerate(self.test_str): 81 | s = msg.response.search.topk_results.add() 82 | s.score.value = 0.1 83 | s.doc.doc_id = i 84 | s.doc.raw_text = line 85 | 86 | msg.envelope.num_part.extend([1]) 87 | msg.response.search.top_k = 5 88 | c1.send_message(msg) 89 | 90 | r = c1.recv_message() 91 | # import pdb 92 | # pdb.set_trace() 93 | self.assertSequenceEqual(r.envelope.num_part, [1]) 94 | self.assertEqual(len(r.response.search.topk_results), 
5) 95 | 96 | msg = gnes_pb2.Message() 97 | msg.response.search.ClearField('topk_results') 98 | 99 | for i, line in enumerate(self.test_str[:1]): 100 | s = msg.response.search.topk_results.add() 101 | s.score.value = 0.1 102 | s.doc.doc_id = i 103 | s.doc.raw_text = line 104 | 105 | msg.envelope.num_part.extend([1]) 106 | msg.response.search.top_k = 5 107 | c1.send_message(msg) 108 | 109 | r = c1.recv_message() 110 | self.assertSequenceEqual(r.envelope.num_part, [1]) 111 | self.assertEqual(len(r.response.search.topk_results), 1) 112 | 113 | def tearDown(self): 114 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/test_textbyte_encoder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from koursaros.hub.encoder.textbyte.textbyte import TextByteEncoder 3 | import pathlib 4 | import csv 5 | 6 | import numpy as np 7 | 8 | class TestTextByte(unittest.TestCase): 9 | 10 | def setUp(self) -> None: 11 | self.msl = 1024 12 | self.model = TextByteEncoder(self.msl) 13 | self.path = pathlib.Path('reviews_sample.csv') 14 | self.csv = csv.DictReader(self.path.open()) 15 | 16 | def test_textbyte(self): 17 | to_encode = [] 18 | for row in self.csv: 19 | to_encode.append(list(row.values())[1]) 20 | vectors = self.model.encode(to_encode) 21 | for vec in vectors: 22 | self.assertEqual(len(vec), self.msl) 23 | for vector in vectors: 24 | self.decode_textbytes(vector) 25 | self.decode_textbytes(vectors) 26 | 27 | @staticmethod 28 | def decode_textbytes(vector: np.ndarray): 29 | return vector.tobytes().rstrip(b'\x00').decode() -------------------------------------------------------------------------------- /koursaros/hub/tests/test_whoosh.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest import SkipTest 4 | 5 | from gnes.proto import gnes_pb2 6 | from gnes.client.base import ZmqClient 7 
| from gnes.service.base import SocketType 8 | from gnes.cli.parser import set_router_parser, _set_client_parser 9 | from gnes.service.indexer import IndexerService 10 | import numpy as np 11 | 12 | 13 | class TestWhoosh(unittest.TestCase): 14 | 15 | def setUp(self): 16 | dirname = os.path.dirname(__file__) 17 | self.yaml = os.path.join(dirname, 'yaml', 'test-whoosh.yml') 18 | self.yaml_joint = os.path.join(dirname, 'yaml', 'test-joint.yml') 19 | self.python_code = os.path.join(dirname, '../', 'indexer/whoosh/whoosh.py') 20 | 21 | self.test_str = [] 22 | self.test_vec = [] 23 | self._msl = 512 24 | with open(os.path.join(dirname, 'sonnets_small.txt')) as f: 25 | for line in f: 26 | line = line.strip() 27 | if line == '': continue 28 | self.test_vec.append(np.frombuffer( 29 | line.encode()[:self._msl] + b'\x00' * (self._msl - len(line)), 30 | dtype=np.uint8 31 | )) 32 | self.test_str.append(line) 33 | 34 | def test_whoosh(self): 35 | args = set_router_parser().parse_args([ 36 | '--yaml_path', self.yaml, 37 | '--socket_out', str(SocketType.PUB_BIND), 38 | '--py_path', self.python_code, 39 | ]) 40 | args.as_response = True 41 | c_args = _set_client_parser().parse_args([ 42 | '--port_in', str(args.port_out), 43 | '--port_out', str(args.port_in), 44 | '--socket_in', str(SocketType.SUB_CONNECT) 45 | ]) 46 | with IndexerService(args), ZmqClient(c_args) as c1: 47 | msg = gnes_pb2.Message() 48 | for i, vec in enumerate(self.test_vec): 49 | doc = msg.request.index.docs.add() 50 | doc.doc_id = i 51 | doc.raw_text = self.test_str[i] 52 | c = doc.chunks.add() 53 | c.doc_id = i 54 | c.offset = 0 55 | c.embedding.data = vec.tobytes() 56 | for d in vec.shape: 57 | c.embedding.shape.extend([d]) 58 | c.embedding.dtype = str(vec.dtype) 59 | c.text = self.test_str[i] 60 | c1.send_message(msg) 61 | 62 | r = c1.recv_message() 63 | self.assert_(r.response.index) 64 | 65 | for i, vec in enumerate(self.test_vec): 66 | msg = gnes_pb2.Message() 67 | msg.request.search.query.doc_id = 1 68 | 
msg.request.search.top_k = 1 69 | c = msg.request.search.query.chunks.add() 70 | c.doc_id = 1 71 | c.embedding.data = vec.tobytes() 72 | for d in vec.shape: 73 | c.embedding.shape.extend([d]) 74 | c.embedding.dtype = str(vec.dtype) 75 | c.offset = 0 76 | c.weight = 1 77 | c.text = self.test_str[i] 78 | c1.send_message(msg) 79 | r = c1.recv_message() 80 | import pdb 81 | pdb.set_trace() 82 | try: 83 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 84 | except: 85 | pass 86 | 87 | @SkipTest 88 | def test_joint(self): 89 | args = set_router_parser().parse_args([ 90 | '--yaml_path', self.yaml_joint, 91 | '--socket_out', str(SocketType.PUB_BIND), 92 | '--py_path', self.python_code, 93 | ]) 94 | args.as_response = True 95 | c_args = _set_client_parser().parse_args([ 96 | '--port_in', str(args.port_out), 97 | '--port_out', str(args.port_in), 98 | '--socket_in', str(SocketType.SUB_CONNECT) 99 | ]) 100 | with IndexerService(args), ZmqClient(c_args) as c1: 101 | msg = gnes_pb2.Message() 102 | for i, vec in enumerate(self.test_vec): 103 | doc = msg.request.index.docs.add() 104 | doc.doc_id = i 105 | doc.raw_text = self.test_str[i] 106 | c = doc.chunks.add() 107 | c.doc_id = i 108 | c.offset = 0 109 | c.embedding.data = vec.tobytes() 110 | for d in vec.shape: 111 | c.embedding.shape.extend([d]) 112 | c.embedding.dtype = str(vec.dtype) 113 | c.text = self.test_str[i] 114 | c1.send_message(msg) 115 | 116 | r = c1.recv_message() 117 | self.assert_(r.response.index) 118 | 119 | for i, vec in enumerate(self.test_vec): 120 | msg = gnes_pb2.Message() 121 | msg.request.search.query.doc_id = 1 122 | msg.request.search.top_k = 1 123 | c = msg.request.search.query.chunks.add() 124 | c.doc_id = 1 125 | c.embedding.data = vec.tobytes() 126 | for d in vec.shape: 127 | c.embedding.shape.extend([d]) 128 | c.embedding.dtype = str(vec.dtype) 129 | c.offset = 0 130 | c.weight = 1 131 | c.text = self.test_str[i] 132 | c1.send_message(msg) 133 | r = c1.recv_message() 134 | try: 
135 | self.assert_(r.response.search.topk_results[0].chunk.doc_id == i) 136 | except: 137 | pass 138 | 139 | def tearDown(self): 140 | pass -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-joint.yml: -------------------------------------------------------------------------------- 1 | !JointIndexer 2 | components: 3 | - !RocksDBIndexer 4 | parameters: 5 | data_path: ./idx.doc_content 6 | - !WhooshIndexer 7 | parameters: 8 | data_path: ./idx.whoosh -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-keyword.yml: -------------------------------------------------------------------------------- 1 | !KeywordIndexer {} -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-reranker.yml: -------------------------------------------------------------------------------- 1 | !RerankRouter 2 | parameters: 3 | model_name: bert-base-uncased -------------------------------------------------------------------------------- /koursaros/hub/tests/yaml/test-whoosh.yml: -------------------------------------------------------------------------------- 1 | !WhooshIndexer 2 | parameters: 3 | data_path: ./idx.doc_content -------------------------------------------------------------------------------- /koursaros/repo_creds/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | This module allows you to pull secure credentials into your python 5 | script. It assumes that you create a private git repository with 6 | your credentials in them prior to using get_creds(). 
7 | 8 | ## At a glance 9 | 10 | You can create a repository that looks like this: 11 | 12 | ``` 13 | creds 14 | ├── creds.yaml 15 | ├── google 16 | │ └── bluehat.json 17 | └── postgres 18 | └── postgres.pem 19 | ``` 20 | 21 | And a `creds.yaml` that looks like this: 22 | ```yaml 23 | creds: 24 | postgres: 25 | host: !!str 12.345.678.910 26 | username: !!str postgres 27 | password: !!str my_password 28 | replicas: !!int 5 29 | dbname: !!str fever 30 | sslmode: !!str verify-ca 31 | sslrootcert: !file postgres/postgres.pem 32 | google: 33 | app_creds: !file google/bluehat.json 34 | ``` 35 | 36 | Let's say the repo you make is `madhatter/creds`, my username is `alice` and password is `cheshire`. 37 | You can get your credentials in a python script by doing the following: 38 | ```python 39 | from koursaros.credentials import get_creds 40 | from sys import argv 41 | 42 | # retrieve repo creds by adding login to script 43 | creds = get_creds('alice:cheshire@madhatter/creds') 44 | # or with cmd line args 45 | creds = get_creds(argv[1]) 46 | # NOTE: you don't need to log in if your git credentials are stored locally 47 | 48 | 49 | # the !! denotes native python types. You can access them like: 50 | creds.postgres.password # my_password 51 | creds.postgres.replicas # 5 52 | 53 | # the special !file tag means that it is a file. You can access 54 | # three attributes from file objects (path, bytes, text): 55 | creds.google.app_creds.path # '/absolute/path/to/google/app_creds/bluehat.json' 56 | creds.google.app_creds.bytes # b'{"client_id": "293480342342034"}' 57 | creds.google.app_creds.text # '{"client_id": "293480342342034"}' 58 | ``` 59 | 60 | ## How it works 61 | The `get_creds()` function clones the specified repo and caches it to the koursaros.credentials 62 | directory. If the creds repo already exists, the repo is git pulled. 
-------------------------------------------------------------------------------- /koursaros/repo_creds/__init__.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | from box import Box 4 | import git 5 | 6 | 7 | DIR = Path(__file__).parent.absolute() 8 | 9 | 10 | class FileCred(yaml.YAMLObject): 11 | yaml_loader = yaml.SafeLoader 12 | yaml_tag = '!file' 13 | 14 | def __init__(self, relative_path): 15 | path = self.repo_path.joinpath(relative_path) 16 | self.bytes = path.read_bytes() 17 | self.text = path.read_text() 18 | self.path = str(path) 19 | 20 | @classmethod 21 | def from_yaml(cls, loader, node): 22 | return cls(node.value) 23 | 24 | @classmethod 25 | def set_repo_path(cls, repo_path): 26 | cls.repo_path = repo_path 27 | 28 | 29 | def get_creds(git_dsn): 30 | login, repo = git_dsn.split('@') 31 | login += '@' 32 | repo_path = DIR.joinpath(repo) 33 | repo_path.parent.mkdir(exist_ok=True) 34 | FileCred.set_repo_path(repo_path) 35 | 36 | if repo_path.exists(): 37 | g = git.Git(repo_path) 38 | g.pull() 39 | else: 40 | g = git.Git(repo_path.parent) 41 | g.clone("https://%sgithub.com/%s" % (login, repo)) 42 | 43 | creds = yaml.safe_load(repo_path.joinpath('creds.yaml').read_text()) 44 | return Box(creds['creds']) 45 | 46 | 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | distro==1.4.0 2 | python-box 3 | tqdm 4 | torch 5 | transformers 6 | termcolor -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | setup( 5 | name='koursaros', 6 | packages=find_packages(), 7 | include_package_data=True, 8 | version='0.0.1', 9 | license='MIT', 10 | 
description='Koursaros is a distributed, cloud-' 11 | 'native platform for developing and deploying ' 12 | 'automated information retrieval and inference applications.', 13 | long_description=Path('README.md').read_text(), 14 | author='Koursaros', 15 | author_email='cole.thienes@gmail.com', 16 | url='https://github.com/koursaros-ai/koursaros', 17 | # download_url='https://github.com/koursaros-ai/koursaros/archive/0.0.1.tar.gz', 18 | keywords=['koursaros', 'distributed', 'cloud-native', 'neural', 'inference'], 19 | install_requires=[ 20 | 'PyYAML', 'gitpython', 'python-box', 'gnes', 'tqdm', 'tabulate', 'click'], 21 | entry_points={'console_scripts': ['kctl=koursaros.cli.__main__:main']}, 22 | classifiers=[ 23 | 'Intended Audience :: Education', 24 | 'Intended Audience :: Science/Research', 25 | 'Intended Audience :: Developers', 26 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 27 | 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 28 | 'Topic :: Scientific/Engineering', 29 | 'Topic :: Scientific/Engineering :: Mathematics', 30 | 'Topic :: Software Development', 31 | 'Topic :: Software Development :: Libraries', 32 | 'Topic :: Software Development :: Libraries :: Python Modules', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tutorials/deploy_custom_model.md: -------------------------------------------------------------------------------- 1 | # Training + Deploying a Custom Transformer Model in 5 Minutes 2 | 3 | ## Training Sentence Classification or Regression 4 | 5 | Make sure you've installed the koursaros training package. 6 | 7 | Create a .yaml file for your model in the /services directory. 
Your project should look like:

```
|-bases/
|-pipelines/
|---my_pipeline.yaml
|-services/
|---[name].yaml
```

For loading mnli from a postgres table, the yaml file should look like this:
```yaml
service:
  base:
    repo: gs://
  task:
    labels: # if classification, else nothing
      - neutral
      - contradiction
      - entailment
  training:
    checkpoint: bert-base-uncased # see transformers for options, or use custom filename
    epochs: 3
    learning_rate: 1e-05
```

### Loading data from postgresql

For loading training data from postgres (recommended), add this to the service yaml. Adjust the schema and tables to point to your train / test data.
```yaml
data:
  source: postgres
  schema: mnli
  train: train_set
  test: test_set
```

And adjust your environment variables accordingly:
```bash
export PGHOST=
export PGUSER=
export PGPASS=
export PGDBNAME=
# for ssl
export PGSSLMODE=verify-ca
export PGSSLROOTCERT=
```

### Loading data from tsv / excel

```yaml
data:
  source: tsv
  train: train_set.tsv
  test: test_set.tsv
```

---

**NOTE**

The format for tables or TSV files for training should be ``

---

### Run training and push model to bucket

Run `kctl train services/mnli.yaml`. The model will be cached locally, unless you specify a google storage bucket to upload to for deployment. Read about authentication in the google cloud storage API.
75 | 76 | ## Deploying 77 | 78 | 79 | 80 | ### Set up App 81 | 82 | ## -------------------------------------------------------------------------------- /tutorials/fact_check.md: -------------------------------------------------------------------------------- 1 | # Creating a SoTA Production Fact Checker from Wikipedia 2 | 3 | ## Create App 4 | ## Train or Download Pretrained Models 5 | ## Dump Wikipedia to Elastic Search 6 | ## Benchmark -------------------------------------------------------------------------------- /utils/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from koursaros.modeling.models import MODELS 2 | from koursaros.yamls import Yaml 3 | from kctl.logger import set_logger 4 | 5 | logger = set_logger('MODELS') 6 | 7 | def model_filename_resolver(name): 8 | if name.split('.')[-1] == 'yaml': 9 | return name 10 | return f'./services/{name}.yaml' 11 | 12 | def model_from_yaml(filename, **kwargs): 13 | config = Yaml(filename) 14 | return model_from_config(config, **kwargs) 15 | 16 | def model_from_config(config, training=False): 17 | for model_class in MODELS: 18 | if config.arch in model_class.architectures(): 19 | model = model_class(config, training) 20 | logger.info('Loaded model {}'.format(config.arch)) 21 | return model 22 | logger.error('Unsupported model architecture {}'.format(config.arch)) 23 | raise NotImplementedError() 24 | -------------------------------------------------------------------------------- /utils/modeling/data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | def get_rows_from_tsv(fname): 4 | if fname.split('.')[-1] == 'tsv': 5 | delimiter = '\t' 6 | else: 7 | delimiter = ',' 8 | with open(fname) as file: 9 | return csv.reader(file, delimiter=delimiter) 10 | 11 | def select_all(schema, table, random=True): 12 | query = f'select * from {schema}.{table}' 13 | if random: 14 | query += ' order by random()' 15 | 
return query -------------------------------------------------------------------------------- /utils/modeling/migrating.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import transformers 3 | from fairseq.models import roberta 4 | from fairseq.data.data_utils import collate_tokens 5 | import time 6 | import torch.nn.functional as F 7 | import torch.hub 8 | 9 | # def roberta_to_transformer(path_to_roberta, path_to_yaml): 10 | # model = RobertaModel.from_pretrained(path_to_roberta, checkpoint_file='model.pt') 11 | # model.eval() 12 | 13 | MAX_LENGTH = 256 14 | PAD = True 15 | 16 | def predict_transformers(model, tokenizer): 17 | def predict_fn(*args): 18 | inputs = time_fn(transformers_encode_batch, tokenizer, *args) 19 | inputs_dict = { 20 | 'input_ids': inputs[0], 21 | 'attention_mask': inputs[1], 22 | 'token_type_ids': inputs[2] 23 | } 24 | outputs = model(**inputs_dict) 25 | logits = outputs[0] 26 | preds = F.log_softmax(logits, dim=-1) 27 | return preds.tolist() 28 | return predict_fn 29 | 30 | 31 | def predict_roberta(model): 32 | def pred_fn(*args): 33 | batch = time_fn(collate_tokens, [model.encode(*arg)[:MAX_LENGTH] for arg in zip(*args)], pad_idx=1) 34 | labels = model.predict('mnli', *batch).tolist() 35 | return labels 36 | return pred_fn 37 | 38 | 39 | def benchmark(pred_fn, n): 40 | args = 'All work and no play.', 'Make jack a very dull boy.' 
def benchmark_mnli(samples):
    """Benchmark transformers vs. torch.hub RoBERTa-large-MNLI inference.

    Downloads both model variants, builds a prediction function for each,
    and times `samples` prediction calls per framework.

    :param samples: number of prediction calls to time per framework
    """
    torch_hub_model = time_fn(torch.hub.load, 'pytorch/fairseq', 'roberta.large.mnli')
    try:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli')
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. A corrupt cache raises an ordinary Exception,
    # so catch that and retry with a forced re-download.
    except Exception:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli', force_download=True)
    transformers_tokenizer = time_fn(transformers.RobertaTokenizer.from_pretrained, 'roberta-large-mnli')
    pred_functions = {
        'transformers': predict_transformers(transformers_model, transformers_tokenizer),
        'torch_hub': predict_roberta(torch_hub_model)
    }
    for framework, pred_fn in pred_functions.items():
        print(f'Benchmarking {framework} with {samples} samples')
        time_fn(benchmark, pred_fn, samples)
def transformers_encode_batch(tokenizer, *args):
    """Tokenize aligned argument sequences into batched model inputs.

    :param tokenizer: a HuggingFace tokenizer
    :param args: one or more aligned sequences (e.g. premises, hypotheses)
    :return: (input_ids, attention_masks, token_type_ids) lists, one entry
        per sample
    """
    # Convert each aligned sample into a single feature triple.
    features = [transformer_to_features(tokenizer, *sample) for sample in zip(*args)]
    batch_ids = [ids for ids, _, _ in features]
    batch_masks = [mask for _, mask, _ in features]
    batch_segments = [seg for _, _, seg in features]
    return batch_ids, batch_masks, batch_segments
    def get_data(self):
        """
        Get training data based on yaml config and connection.

        Reads ``self.config.training.data`` and loads the train/test splits
        either from Postgres (via a fresh ``Conn``) or from delimited
        tsv/csv files on disk.

        :return: tuple of (train_rows, test_rows) iterables
        """
        data = self.config.training.data
        if data.source == 'postgres':
            # Pull both splits from the configured schema. Rows come back
            # shuffled because select_all() defaults to ORDER BY random().
            p = Conn()
            query_fn = p.query
            return query_fn(select_all(data.schema, data.train)), \
                   query_fn(select_all(data.schema, data.test))
        else:
            # Otherwise treat data.train / data.test as local file paths.
            return get_rows_from_tsv(data.train), get_rows_from_tsv(data.test)
    def save_model(self):
        """Persist the trained model to disk; subclasses must implement."""
        # append hash of yaml to model checkpoint
        raise NotImplementedError()

    @staticmethod
    def architectures():
        """Return the list of architecture names this class supports."""
        raise NotImplementedError()

    def getInputProto(self):
        # NOTE(review): presumably returns the protobuf message type for
        # this model's inputs — confirm against callers.
        raise NotImplementedError()

    def getOutputProto(self):
        # NOTE(review): presumably returns the protobuf message type for
        # this model's outputs — confirm against callers.
        raise NotImplementedError()
25 | The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the 26 | remainder of the story. 1883 Western Siberia, 27 | a young Grigori Rasputin is asked by his father and a group of men to perform magic. 28 | Rasputin has a vision and denounces one of the men as a horse thief. Although his 29 | father initially slaps him for making such an accusation, Rasputin watches as the 30 | man is chased outside and beaten. Twenty years later, Rasputin sees a vision of 31 | the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, 32 | with people, even a bishop, begging for his blessing. """ 33 | 34 | MODEL_CLASSES = { 35 | 'gpt2': (GPT2LMHeadModel, GPT2Tokenizer), 36 | 'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 37 | 'xlnet-gen': (XLNetLMHeadModel, XLNetTokenizer), 38 | 'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer), 39 | 'xlm-gen': (XLMWithLMHeadModel, XLMTokenizer), 40 | } 41 | 42 | class GenerativeTransformer(Model): 43 | 44 | def __init__(self, *args): 45 | super().__init__(*args) 46 | model_class, tokenizer_class = MODEL_CLASSES[self.config.base] 47 | self.model = model_class.from_pretrained(self.config.checkpoint) 48 | self.tokenizer = tokenizer_class.from_pretraiend(self.config.checkpoint) 49 | 50 | def set_seed(self, args): 51 | np.random.seed(args.seed) 52 | torch.manual_seed(args.seed) 53 | if args.n_gpu > 0: 54 | torch.cuda.manual_seed_all(args.seed) 55 | 56 | def top_k_top_p_filtering(self, logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 57 | """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 58 | Args: 59 | logits: logits distribution shape (vocabulary size) 60 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 61 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 62 | Nucleus filtering is described in Holtzman et al. 
    def sample_sequence(self, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.9, is_xlnet=False,
                        xlm_lang=None, device='cpu'):
        """
        Autoregressively sample `length` tokens conditioned on `context`.

        :param length: number of new tokens to generate
        :param context: list of context token ids
        :param num_samples: number of parallel sequences to sample
        :param temperature: logit temperature (1 = unmodified)
        :param top_k: keep only the k highest-probability tokens (0 = off)
        :param top_p: nucleus-sampling cumulative-probability cutoff
        :param is_xlnet: use XLNet's masked-token prediction inputs
        :param xlm_lang: optional XLM language id added as `langs` input
        :param device: torch device string for the generated tensors
        :return: tensor of shape (num_samples, len(context) + length)
        """
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0).repeat(num_samples, 1)
        generated = context
        with torch.no_grad():
            for _ in trange(length):

                inputs = {'input_ids': generated}
                if is_xlnet:
                    # XLNet is a direct (predict same token, not next token) and bi-directional model by default
                    # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
                    input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
                    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float,
                                            device=device)
                    perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
                    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
                    target_mapping[0, 0, -1] = 1.0  # predict last token
                    inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

                if xlm_lang is not None:
                    # XLM models take a per-position language-id tensor.
                    inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

                outputs = self.model(
                    **inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
                # Take the logits for the final position and rescale by temperature.
                next_token_logits = outputs[0][0, -1, :] / temperature
                filtered_logits = self.top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                # Sample one token from the filtered distribution and append it.
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
        return generated
# Maps architecture name -> (config class, sequence-classification model,
# tokenizer class) from the transformers library.
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}
| self.model.eval() 58 | # self.trace_model() 59 | if self.config.task == 'classification': 60 | self.best_checkpoint_metric = 'acc' 61 | elif self.config.task == 'regression': 62 | self.best_checkpoint_metric = 'loss' 63 | 64 | def inputs_from_batch(self, batch): 65 | inputs = {'input_ids': batch[0], 66 | 'attention_mask': batch[1]} 67 | if self.config.arch != 'distilbert': 68 | inputs['token_type_ids'] = batch[2] if self.config.arch in ['bert', 69 | 'xlnet'] else None 70 | if len(batch) > 3: 71 | inputs['labels'] = batch[3] 72 | return inputs 73 | 74 | def tuple_inputs(self, inputs): 75 | return ( 76 | inputs['input_ids'], 77 | inputs['attention_mask'], 78 | inputs['token_type_ids'] 79 | ) 80 | 81 | def trace_model(self): 82 | examples = [ 83 | InputExample( 84 | guid=1, 85 | text_a="Once upon a time there was a boy", 86 | text_b="He liked to write code all day long" 87 | ) 88 | ] 89 | features = [self.example_to_feature(example) for example in examples] 90 | all_inputs = self.features_to_inputs(features, True) 91 | inputs = self.inputs_from_batch(all_inputs) 92 | self.model = torch.jit.trace(self.model, self.tuple_inputs(inputs)) 93 | 94 | def train(self, force_build_features=False): 95 | return self.do_train(force_build_features=force_build_features) 96 | # except: 97 | # logger.warning('Error during training, decrease batch size and try again') 98 | # raise SystemError() 99 | # self.batch_size = self.batch_size // 2 # back off batch_size 100 | # return self.train(force_build_features=True) 101 | 102 | def do_train(self, force_build_features=False): 103 | ### In Transformers, optimizer and schedules are splitted and instantiated like this: 104 | 105 | tb_writer = SummaryWriter() 106 | 107 | train_dataset, test_dataset = self.get_data() 108 | train_dataset = self.load_and_cache_examples(train_dataset, force_build_features=force_build_features) 109 | epochs = int(self.config.training.epochs) 110 | optimizer = AdamW(self.model.parameters(), 
lr=float(self.config.training.learning_rate), 111 | correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False 112 | num_warmup_steps = int(0.06 * len(train_dataset)) 113 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, 114 | t_total=(self.config.training.epochs * len(train_dataset) / self.batch_size)) 115 | 116 | train_sampler = RandomSampler(train_dataset) 117 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size) 118 | 119 | t_total = len(train_dataloader) 120 | 121 | # Prepare optimizer and schedule (linear warmup and decay) 122 | no_decay = ['bias', 'LayerNorm.weight'] 123 | optimizer_grouped_parameters = [ 124 | {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 125 | 'weight_decay': self.weight_decay}, 126 | {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 127 | 'weight_decay': 0.0} 128 | ] 129 | 130 | try: 131 | from apex import amp 132 | model, optimizer = amp.initialize(self.model, optimizer) 133 | self.fp16 = True 134 | except ImportError: 135 | logger.warning("Please install fp16 from https://github.com/NVIDIA/apex for better performance") 136 | self.fp16 = False 137 | 138 | # Train! 139 | logger.info("***** Running training *****") 140 | logger.info(" Num examples = %d" % len(train_dataset)) 141 | logger.info(" Num Epochs = %d" % epochs) 142 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d" % 143 | self.batch_size * ( 144 | torch.distributed.get_world_size() if self.local_rank != -1 else 1)) 145 | logger.info(" Total optimization steps = %d" % t_total) 146 | 147 | if not 'eval_freq' in self.config.training: 148 | self.eval_freq = 2 149 | else: 150 | self.eval_freq = self.config.training.eval_freq 151 | 152 | self.eval_and_save_every = len(train_dataset) // self.batch_size // self.eval_freq 153 | 154 | global_step = 0 155 | tr_loss, logging_loss = 0.0, 0.0 156 | self.model.zero_grad() 157 | label_count = [0] * len(self.config.labels) 158 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.local_rank not in [-1, 0]) 159 | num_correct = 0 160 | prev_best = None 161 | for step, batch in enumerate(epoch_iterator): 162 | self.model.train() 163 | correct_labels = batch[3] 164 | batch = tuple(t.to(self.device) for t in batch) 165 | 166 | inputs = self.inputs_from_batch(batch) 167 | outputs = self.model(**inputs) 168 | loss = outputs[0] # model outputs are always tuple in transformers (see doc) 169 | logits = outputs[1] 170 | preds = logits.detach().cpu().numpy() 171 | preds = np.argmax(preds, axis=1) 172 | for pred in preds: 173 | label_count[pred] += 1 174 | num_correct += np.sum(preds == correct_labels.detach().cpu().numpy()) 175 | if step > 0: 176 | epoch_iterator.set_description("Accuracy: %.2f Label Counts: %s" 177 | % (num_correct / (step*self.batch_size), label_count)) 178 | epoch_iterator.refresh() # to show immediately the update 179 | 180 | if self.n_gpu > 1: 181 | loss = loss.mean() # mean() to average on multi-gpu parallel training 182 | 183 | if self.fp16: 184 | with amp.scale_loss(loss, optimizer) as scaled_loss: 185 | scaled_loss.backward() 186 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm) 187 | else: 188 | loss.backward() 189 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) 190 | 191 | tr_loss += loss.item() 192 | 
if (step + 1) % self.gradient_accumulation_steps == 0: 193 | optimizer.step() 194 | scheduler.step() # Update learning rate schedule 195 | self.model.zero_grad() 196 | global_step += 1 197 | 198 | if self.local_rank in [-1, 0] and global_step % self.eval_and_save_every == 0: 199 | # Log metrics 200 | if self.local_rank == -1 and self.evaluate_during_training: 201 | results = self.evaluate(test_dataset) 202 | for key, value in results.items(): 203 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 204 | tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) 205 | tb_writer.add_scalar('loss', (tr_loss - logging_loss) / self.eval_and_save_every, global_step) 206 | logging_loss = tr_loss 207 | if prev_best is None or results[self.best_checkpoint_metric] > prev_best: 208 | prev_best = results[self.best_checkpoint_metric] 209 | self.save_model() 210 | 211 | if self.local_rank in [-1, 0]: 212 | tb_writer.close() 213 | 214 | result = self.evaluate(test_dataset) 215 | if prev_best is None or result[self.best_checkpoint_metric] > prev_best: 216 | self.save_model() 217 | 218 | return global_step, tr_loss / global_step 219 | 220 | def save_model(self): 221 | # Save model checkpoint 222 | model_to_save = self.model.module if hasattr(self.model, 223 | 'module') else self.model 224 | model_to_save.save_pretrained(self.ckpt_dir) 225 | self.tokenizer.save_pretrained(self.ckpt_dir) 226 | 227 | def evaluate(self, test_dataset): 228 | eval_dataset = self.load_and_cache_examples(test_dataset, evaluate=True) 229 | eval_output_dir = os.path.join(self.data_dir, 'eval') 230 | 231 | if not os.path.exists(eval_output_dir) and self.local_rank in [-1, 0]: 232 | os.makedirs(eval_output_dir) 233 | 234 | # Note that DistributedSampler samples randomly 235 | eval_sampler = SequentialSampler(eval_dataset) if self.local_rank == -1 else DistributedSampler( 236 | eval_dataset) 237 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.batch_size) 238 
| 239 | # Eval! 240 | logger.info("***** Running evaluation *****") 241 | logger.info(" Num examples = %d" % len(eval_dataset)) 242 | logger.info(" Batch size = %d" % self.batch_size) 243 | eval_loss = 0.0 244 | nb_eval_steps = 0 245 | preds = None 246 | out_label_ids = None 247 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 248 | self.model.eval() 249 | batch = tuple(t.to(self.device) for t in batch) 250 | 251 | with torch.no_grad(): 252 | inputs = {'input_ids': batch[0], 253 | 'attention_mask': batch[1], 254 | 'labels': batch[3]} 255 | if self.config.arch != 'distilbert': 256 | inputs['token_type_ids'] = batch[2] if self.config.arch in ['bert', 257 | 'xlnet'] else None 258 | outputs = self.model(**inputs) 259 | tmp_eval_loss, logits = outputs[:2] 260 | 261 | eval_loss += tmp_eval_loss.mean().item() 262 | nb_eval_steps += 1 263 | if preds is None: 264 | preds = logits.detach().cpu().numpy() 265 | out_label_ids = inputs['labels'].detach().cpu().numpy() 266 | else: 267 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 268 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 269 | 270 | eval_loss = eval_loss / nb_eval_steps 271 | result = { 272 | "loss": eval_loss 273 | } 274 | if self.config.task == "classification": 275 | preds = np.argmax(preds, axis=1) 276 | result['acc'] = np.sum(preds == out_label_ids) / len(preds) 277 | elif self.config.task == "regression": 278 | preds = np.squeeze(preds) 279 | 280 | output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") 281 | with open(output_eval_file, "w") as writer: 282 | logger.info("***** Eval results *****") 283 | for key in sorted(result.keys()): 284 | logger.info(" %s = %s" % (key, str(result[key]))) 285 | writer.write("%s = %s\n" % (key, str(result[key]))) 286 | 287 | return result 288 | 289 | def example_to_feature(self, example): 290 | inputs = self.tokenizer.encode_plus( 291 | example.text_a, 292 | example.text_b, 293 | 
add_special_tokens=True, 294 | max_length=self.max_length, 295 | truncate_first_sequence=True # We're truncating the first sequence in priority 296 | ) 297 | input_ids, token_type_ids = inputs["input_ids"][:self.max_length], \ 298 | inputs["token_type_ids"][:self.max_length] 299 | 300 | attention_mask = [1] * len(input_ids) 301 | 302 | # Zero-pad up to the sequence length. 303 | if self.pad: 304 | padding_length = self.max_length - len(input_ids) 305 | if self.pad_on_left: 306 | input_ids = ([self.pad_token] * padding_length) + input_ids 307 | attention_mask = ([0] * padding_length) + attention_mask 308 | token_type_ids = ([self.pad_token_segment_id] * padding_length) + token_type_ids 309 | else: 310 | input_ids = input_ids + ([self.pad_token] * padding_length) 311 | attention_mask = attention_mask + ([0] * padding_length) 312 | token_type_ids = token_type_ids + ([self.pad_token_segment_id] * padding_length) 313 | 314 | if example.label is not None: 315 | if self.config.task == "classification": 316 | if example.label in self.label_map: 317 | label = self.label_map[example.label] 318 | else: 319 | logger.warning("UNKNOWN LABEL %s, ignoring" % example.label) 320 | return 321 | elif self.config.task == "regression": 322 | label = float(example.label) 323 | else: 324 | logger.error("Only supported tasks are classification and regression") 325 | raise NotImplementedError() 326 | else: 327 | label = None 328 | 329 | return InputFeatures(input_ids=input_ids, 330 | attention_mask=attention_mask, 331 | token_type_ids=token_type_ids, 332 | label=label) 333 | 334 | def features_to_inputs(self, features, inference): 335 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(self.device) 336 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long).to(self.device) 337 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long).to(self.device) 338 | if not inference: 339 | if 
self.config.task == "classification": 340 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long).to(self.device) 341 | elif self.config.task == "regression": 342 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float).to(self.device) 343 | else: 344 | raise NotImplementedError() 345 | return all_input_ids, all_attention_mask, all_token_type_ids, all_labels 346 | else: 347 | return all_input_ids, all_attention_mask, all_token_type_ids 348 | 349 | 350 | def load_and_cache_examples(self, data, evaluate=False, force_build_features=False): 351 | if self.local_rank not in [-1, 0] and not evaluate: 352 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 353 | 354 | cached_features_file = os.path.join(self.data_dir, 'features' if not evaluate else 'eval-features') 355 | if os.path.exists(os.path.join(cached_features_file)) and not force_build_features: 356 | logger.info("Loading features from cached file %s" % cached_features_file) 357 | features = torch.load(cached_features_file) 358 | else: 359 | logger.info("Creating features from dataset file at %s" % cached_features_file) 360 | 361 | examples = [ 362 | InputExample(guid=i, 363 | text_a=ex[0], 364 | text_b=ex[1] if len(ex) == 3 else None, 365 | label=ex[-1]) for i, ex in enumerate(data) 366 | ] 367 | 368 | features = [] 369 | for (ex_index, example) in enumerate(examples): 370 | if ex_index % 10000 == 0: 371 | logger.info("Writing example %d" % (ex_index)) 372 | features.append(self.example_to_feature(example)) 373 | 374 | if self.local_rank in [-1, 0]: 375 | logger.info("Saving features into cached file %s" % cached_features_file) 376 | torch.save(features, cached_features_file) 377 | 378 | if self.local_rank == 0 and not evaluate: 379 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 380 | 381 | 
# Convert to Tensors and build dataset 382 | dataset = TensorDataset(*self.features_to_inputs(features, False)) 383 | return dataset 384 | 385 | def pred_from_output(self, outputs): 386 | logits = outputs[0] 387 | preds = logits.detach().cpu().numpy() 388 | if self.config.task == 'classification': 389 | preds = np.argmax(preds, axis=1) 390 | return [self.config.labels[int(pred)] for pred in preds] 391 | elif self.config.task == 'regression': 392 | return np.squeeze(preds) 393 | else: 394 | raise NotImplementedError() 395 | 396 | def run(self, *args): 397 | examples = [ 398 | InputExample( 399 | guid=str(i), 400 | text_a=arg[0], 401 | text_b=None if len(arg) < 2 else arg[1] 402 | ) for i, arg in enumerate(zip(*args)) 403 | ] 404 | features = [self.example_to_feature(example) for example in examples] 405 | all_inputs = self.features_to_inputs(features, True) 406 | inputs = self.inputs_from_batch(all_inputs) 407 | outputs = self.model(*self.tuple_inputs(inputs)) 408 | return self.pred_from_output(outputs) 409 | 410 | def multi_gpu_training(self): 411 | # multi-gpu training (should be after apex fp16 initialization) 412 | if self.n_gpu > 1: 413 | model = torch.nn.DataParallel(self.model) 414 | # Distributed training (should be after apex fp16 initialization) 415 | if self.local_rank != -1: 416 | model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.local_rank], 417 | output_device=self.local_rank, 418 | find_unused_parameters=True) 419 | 420 | @staticmethod 421 | def architectures(): 422 | return list(MODEL_CLASSES.keys()) 423 | -------------------------------------------------------------------------------- /utils/predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/predictor/__init__.py -------------------------------------------------------------------------------- 
def predict(model_file, data_source, data_target, truncate=False):
    """
    Run a model over a data source and write (id, prediction) pairs out.

    :param model_file: path to the model yaml config
    :param data_source: a .tsv/.csv path, or a "schema.table" name
    :param data_target: output .tsv/.csv path or destination table
    :param truncate: when writing to a table, truncate it first
    """
    model = model_from_yaml(model_file)
    extension = data_source.split('.')[-1]
    if extension in ['tsv', 'csv']:
        rows = get_rows_from_tsv(data_source)
        # Fix: csv.writer requires a single-character delimiter; this
        # previously passed the literal string 'csv' for comma output,
        # which raises TypeError on the first write.
        delimiter = '\t' if extension == 'tsv' else ','
        # Touch/empty the output file without leaking the handle.
        with open(data_target, 'w+'):
            pass

        def write_fn(buffer):
            # Close the handle after each flush instead of leaking it.
            with open(data_target, 'a') as file:
                writer = csv.writer(file, delimiter=delimiter)
                for row in buffer:
                    writer.writerow(row)

    else:
        p = Conn()
        query_fn = p.query
        schema, table = data_source.split('.')
        if truncate:
            p.execute(f'''truncate table {data_target}''')
        rows = query_fn(select_all(schema, table, random=False))

        def write_fn(buffer):
            p.insert(data_target, buffer)
            p.commit()

    buffer = []
    i = 0
    start = time.time()
    for step, batch in enumerate(batch_list(rows, BATCH_SIZE)):
        # Each batch row is (*model_inputs, id); split inputs from ids.
        transposed = tuple(zip(*batch))
        inputs = transposed[:-1]
        ids = transposed[-1]
        buffer.extend(zip(ids, model.run(*inputs)))
        i += BATCH_SIZE
        if i > 500:
            total = step * BATCH_SIZE
            print('dumping example {}, rate: {} per second'.format(total, total / (time.time() - start)))
            write_fn(buffer)
            buffer = []
            i = 0

    if len(buffer) > 0:
        write_fn(buffer)
predict(model_file, data_source, data_target, truncate=truncate) -------------------------------------------------------------------------------- /utils/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from koursaros.modeling import model_from_yaml 2 | 3 | def train(file): 4 | model = model_from_yaml(file, training=True) 5 | model.train() -------------------------------------------------------------------------------- /utils/trainer/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from . import train 3 | 4 | if __name__ == '__main__': 5 | filename = sys.argv[1] 6 | train(filename) -------------------------------------------------------------------------------- /utils/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koursaros-ai/microservices/9613595ba62d00cb918feafa329834634bb76dc4/utils/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils/bucket/__init__.py: -------------------------------------------------------------------------------- 1 | from google.cloud import storage 2 | import tarfile 3 | import os 4 | from pathlib import Path 5 | 6 | def download_blob(bucket_name, source_blob_name, destination_file_name): 7 | """Downloads a blob from the bucket.""" 8 | storage_client = storage.Client() 9 | bucket = storage_client.get_bucket(bucket_name) 10 | blob = bucket.blob(source_blob_name) 11 | 12 | blob.download_to_filename(destination_file_name) 13 | 14 | print('Blob {} downloaded to {}.'.format( 15 | source_blob_name, 16 | destination_file_name)) 17 | 18 | def bucket_contains(filename): 19 | storage_client = storage.Client() 20 | blobs = storage_client.list_blobs("poloma-models") 21 | for blob in blobs: 22 | if blob == filename: return True 23 | return False 24 | 25 | def download_and_unzip(bucket_name, 
# -------- /utils/utils/cuda/apex.sh --------
# #!/usr/bin/env bash
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex

# -------- /utils/utils/database/__init__.py --------
from .psql import *

# -------- /utils/utils/database/psql.py --------
from psycopg2 import extensions, extras
import os


def is_nested(nested):
    """Raise ValueError unless every top-level element of *nested* is a
    list or tuple (i.e. *nested* looks like rows of values)."""
    if any(not isinstance(i, (list, tuple)) for i in nested):
        raise ValueError('Hey dumbass - you can only dump nested lists/tuples.')


class Conn(extensions.connection):
    """psycopg2 connection with defaults pulled from the environment
    (PGHOST / PGUSER / PGPASS / PGDBNAME, plus optional PGSSLMODE and
    PGSSLROOTCERT) and small convenience query helpers."""

    def __init__(self, host=None, user=None, password=None, dbname=None,
                 sslmode=None, cert_path=None):
        # psycopg2/libpq reads these env vars when opening the connection
        if sslmode:
            os.environ['PGSSLMODE'] = sslmode
        if cert_path:
            os.environ['PGSSLROOTCERT'] = cert_path
        if not host:
            host = os.environ.get('PGHOST')
        if not user:
            user = os.environ.get('PGUSER')
        if not password:
            password = os.environ.get('PGPASS')
        if not dbname:
            dbname = os.environ.get('PGDBNAME')
        dsn = f"dbname='{dbname}' user='{user}' host='{host}' password='{password}'"
        super(Conn, self).__init__(dsn=dsn)

    def _set_columns(self, cur):
        # remember the column names of the last executed query
        self.columns = [desc.name for desc in cur.description]

    def execute(self, query):
        """Execute *query*, discarding any result (caller must commit)."""
        cur = self.cursor()
        cur.execute(query)

    def iter_rows(self, query):
        """Execute *query* and return the live cursor for row iteration."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        return cur

    def iter_chunk(self, query, chunksize):
        """Execute *query* and yield lists of up to *chunksize* rows."""
        cur = self.cursor()
        cur.execute(query)
        self._set_columns(cur)
        chunk = cur.fetchmany(chunksize)
        while chunk:
            yield chunk
            chunk = cur.fetchmany(chunksize)

    def query(self, query):
        """Execute *query* and return all rows."""
        cur = self.cursor()
        cur.execute(query)
        fetched = cur.fetchall()
        self._set_columns(cur)
        return fetched

    def insert(self, table, nested):
        """Bulk-insert *nested* (an iterable of row tuples) into *table*.

        NOTE(review): *table* is interpolated as an identifier and cannot be
        parameterized — it must come from trusted code, never user input.
        """
        is_nested(nested)
        cur = self.cursor()
        template = f'INSERT INTO {table} VALUES %s'
        extras.execute_values(cur, template, nested)

    def table_exists(self, schema, table):
        """Return True iff *schema*.*table* exists."""
        # SECURITY FIX: values are now bound as parameters instead of being
        # interpolated into the SQL string (f-string), closing an injection hole.
        query = '''
            SELECT EXISTS (
                SELECT
                FROM information_schema.tables
                WHERE table_schema = %s
                AND table_name = %s
            );
        '''
        cur = self.cursor()
        cur.execute(query, (schema, table))
        return cur.fetchone()[0]

    def database_exists(self, database):
        """Return True iff a database named *database* exists."""
        # SECURITY FIX: parameterized (was f-string interpolation)
        query = '''
            SELECT EXISTS (
                SELECT
                FROM pg_database
                WHERE datname = %s
            )
        '''
        cur = self.cursor()
        cur.execute(query, (database,))
        return cur.fetchone()[0]

    def create_database(self, database):
        """Create *database* by running ``createdb`` ON THE SERVER HOST via
        COPY ... TO PROGRAM (requires superuser / pg_execute_server_program).

        NOTE(review): the database name is shell-interpolated and cannot be
        parameterized here — only call with trusted, validated names.
        """
        query = f'''
        COPY (SELECT 1) TO PROGRAM 'createdb {database}';
        '''
        cur = self.cursor()
        cur.execute(query)
# -------- /utils/utils/misc/__init__.py --------

import os

# ANSI escape template: BOLD.format(text) renders text in bold on a terminal
BOLD = '\033[1m{}\033[0m'


def gb_free_space():
    """Return the free space, in gigabytes, of the filesystem holding the cwd."""
    statvfs = os.statvfs(os.getcwd())
    return statvfs.f_frsize * statvfs.f_bfree / 1e+9  # actual number of free bytes


def batch_fn(batch_size, call_fn, items):
    """Yield ``(call_fn(chunk), chunk)`` for successive chunks of *items* of
    size *batch_size*; the final chunk may be shorter."""
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:
            yield call_fn(buffer), buffer
            buffer = []
    if buffer:  # flush the short trailing chunk, if any
        yield call_fn(buffer), buffer


def batch_list(arr, n):
    """Yield successive chunks of *arr* of length *n*; the final chunk may be
    shorter."""
    buffer = []
    for item in arr:
        buffer.append(item)
        if len(buffer) == n:
            yield buffer
            buffer = []
    if buffer:  # flush the short trailing chunk, if any
        yield buffer

# -------- /utils/utils/misc/tree.sh --------
# #!/usr/bin/env bash
# ls -R | grep ":$" | sed -e 's/:$//' -e 's/[^-][^\/]*\//--/g' -e 's/^/ /' -e 's/-/|/'

# -------- /utils/yamls.py --------
from hashlib import md5
from enum import Enum


class YamlType(Enum):
    """The three recognized top-level yaml document kinds."""
    BASE = 0
    PIPELINE = 1
    SERVICE = 2


def Yaml(path):
    """
    Pseudo-class for managing a yaml file as a python object (a Box whose
    attributes are the yaml's contents, tagged with path/text/type/hash).

    :param path: path to .yaml file
    :raises ValueError: if no YamlType key is present at the yaml's top level
    """
    # imported here so this module stays importable when yaml/box are absent
    from yaml import safe_load
    from box import Box

    # BUG FIX: the original `open(path).read()` never closed the file handle
    with open(path) as f:
        __text__ = f.read()
    parsed = safe_load(__text__)

    __type__ = None
    for yaml_type in YamlType:
        # NOTE(review): no break — if several type keys are present the
        # LAST matching YamlType wins, as in the original
        if yaml_type.name.lower() in parsed:
            __type__ = yaml_type

    if __type__ is None:
        raise ValueError('Invalid yaml type for %s' % path)

    box = Box(parsed[__type__.name.lower()])
    box.__path__ = path
    box.__text__ = __text__
    box.__type__ = __type__
    box.hash = md5(__text__.encode()).hexdigest()
    return box