├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── documentation.md
│ ├── feature_request.md
│ └── usability-improvement.md
└── dependabot.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── architecture.svg
├── header.png
├── publish.sh
├── refinery.gif
├── refinery
├── __init__.py
├── cli.py
├── docker-compose.tmpl
├── kratos
│ ├── identity.schema.json
│ └── kratos.yml
├── oathkeeper
│ ├── access-rules.yml
│ └── oathkeeper.yml
├── settings.json
└── versions.json
├── requirements.txt
├── setup.py
├── start
├── start.bat
├── stop
├── stop.bat
├── update
└── update.bat
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG] - Your title"
5 | labels: bug
6 |
7 | ---
8 |
9 | **Describe the bug**
10 | A clear and concise description of what the bug is.
11 |
12 | **To Reproduce**
13 | Steps to reproduce the behavior:
14 | 1. Go to '...'
15 | 2. Click on '....'
16 | 3. Scroll down to '....'
17 | 4. See error
18 |
19 | **Expected behavior**
20 | A clear and concise description of what you expected to happen.
21 |
22 | **Screenshots**
23 | If applicable, add screenshots to help explain your problem.
24 |
25 | **Desktop (please complete the following information):**
26 | - OS: [e.g. iOS]
27 | - Browser [e.g. chrome, safari]
28 | - Version [e.g. 22]
29 |
30 | **Additional context**
31 | Add any other context about the problem here.
32 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation
3 | about: Help us improve the documentation of refinery
4 | title: "[DOC] - Your description"
5 | labels: documentation
6 |
7 | ---
8 |
9 | Please pick one of the following options:
10 |
11 | **What is missing in the docs?**
12 | What were you trying to do that was not covered by the documentation?
13 |
14 | _OR_
15 |
16 | **Which page is this issue related to?**
17 | If you find an issue with our documentation, what would you like to have improved/changed?
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 |
7 | ---
8 |
9 | **Is your feature request related to a problem? Please describe.**
10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 |
12 | **Describe the solution you'd like**
13 | A clear and concise description of what you want to happen.
14 |
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 |
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/usability-improvement.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Usability improvement
3 | about: Help us improve our workflow
4 | title: "[UX] - Your title"
5 | labels: usability
6 |
7 | ---
8 |
9 | **What would you improve?**
10 | List some screenshots here or describe what you're missing.
11 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
4 | - package-ecosystem: "pip"
5 | directory: "/"
6 | schedule:
7 | interval: "daily"
8 | # default is / which breaks drone
9 | pull-request-branch-name:
10 | separator: "-"
11 | # not created automatically for version updates so only security ones are created
12 | # https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file
13 | open-pull-requests-limit: 0
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # generated by start script
2 | # oathkeeper/*
3 | # kratos/kratos.yml
4 | refinery/docker-compose.yml
5 | refinery/docker-compose.yml.bak
6 | refinery/oathkeeper/jwks.json
7 | refinery/postgres-data
8 | refinery/minio-data
9 | refinery/qdrant-data
10 | refinery/qdrant-storage
11 | refinery/qdrant-volume
12 | refinery/graphql-sqlite
13 | refinery/kratos-sqlite
14 | refinery/backup
15 | refinery/config
16 |
17 | .DS_Store
18 |
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 |
24 | # C extensions
25 | *.so
26 |
27 | # Distribution / packaging
28 | .Python
29 | build/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | .eggs/
35 | lib/
36 | lib64/
37 | parts/
38 | sdist/
39 | var/
40 | wheels/
41 | pip-wheel-metadata/
42 | share/python-wheels/
43 | *.egg-info/
44 | .installed.cfg
45 | *.egg
46 | MANIFEST
47 |
48 | # PyInstaller
49 | # Usually these files are written by a python script from a template
50 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
51 | *.manifest
52 | *.spec
53 |
54 | # Installer logs
55 | pip-log.txt
56 | pip-delete-this-directory.txt
57 |
58 | # Unit test / coverage reports
59 | htmlcov/
60 | .tox/
61 | .nox/
62 | .coverage
63 | .coverage.*
64 | .cache
65 | nosetests.xml
66 | coverage.xml
67 | *.cover
68 | *.py,cover
69 | .hypothesis/
70 | .pytest_cache/
71 |
72 | # Translations
73 | *.mo
74 | *.pot
75 |
76 | # Django stuff:
77 | *.log
78 | local_settings.py
79 | db.sqlite3
80 | db.sqlite3-journal
81 |
82 | # Flask stuff:
83 | instance/
84 | .webassets-cache
85 |
86 | # Scrapy stuff:
87 | .scrapy
88 |
89 | # Sphinx documentation
90 | docs/_build/
91 |
92 | # PyBuilder
93 | target/
94 |
95 | # Jupyter Notebook
96 | .ipynb_checkpoints
97 |
98 | # IPython
99 | profile_default/
100 | ipython_config.py
101 |
102 | # pyenv
103 | .python-version
104 |
105 | # pipenv
106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
109 | # install all needed dependencies.
110 | #Pipfile.lock
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | info@kern.ai.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2024 Kern AI GmbH
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 |
4 |
The data scientist's open-source choice to scale, assess and maintain natural language data.
5 |
Treat training data like a software artifact.
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | Does one of these scenarios sound familiar to you?
23 |
24 | - You are working on your own side-project in NLP, but you don't have enough labeled data to train a good model.
25 | - You are working in a team and already have some labeled data, but your training data is just stored in a spreadsheet or some TXT-file, and you have no idea how _good_ it actually is.
26 | - You are working in a team about to start a new project with limited resources (annotators, budget, time), and now want to understand how you can best make use of them.
27 |
28 | If so, you are one of the people we've built refinery for. refinery helps you to build better NLP models in a data-centric approach. Semi-automate your labeling, find low-quality subsets in your training data, and monitor your data in one place.
29 |
30 | _refinery_ doesn't get rid of manual labeling, but it makes sure that your valuable time is spent well. Also, the makers of _refinery_ currently work on integrations to other labeling tools, such that you can easily switch between different choices.
31 |
32 | 
33 |
34 | > **_DEMO:_** You can interact with the application in a (mostly read-only) online playground. Check it out [here](https://demo.kern.ai).
35 |
36 | _refinery_ is a multi-repository project; you can find all integrated services in the architecture below. The app builds on top of [🤗 Hugging Face](https://www.huggingface.co) and [spaCy](https://spacy.io/) to leverage pre-built language models for your NLP tasks, as well as [qdrant](https://github.com/qdrant/qdrant) for neural search.
37 |
38 | ## Table of contents
39 |
40 | - [🧑💻 Why _refinery_?](#-why-refinery)
41 | - [Enabling ideas of one-person-armies](#enabling-ideas-of-one-person-armies)
42 | - [Extending your existing labeling approach](#extending-your-existing-labeling-approach)
43 | - [Put structure into unstructured data](#put-structure-into-unstructured-data)
44 | - [Pushing collaboration](#pushing-collaboration)
45 | - [Open-source, and treating training data as a software artifact](#open-source-and-treating-training-data-as-a-software-artifact)
46 | - [Integrations](#integrations)
47 | - [Your benefits](#your-benefits)
48 | - [How does Kern AI make money, if refinery is open-source?](#how-does-kern-ai-make-money-if-refinery-is-open-source)
49 | - [🤓 Features](#-features)
50 | - [(Semi-)automated labeling workflow for NLP tasks](#semi-automated-labeling-workflow-for-nlp-tasks)
51 | - [Extensive data management and monitoring](#extensive-data-management-and-monitoring)
52 | - [Team workspaces in the managed version](#team-workspaces-in-the-managed-version)
53 | - [☕ Installation](#-installation)
54 | - [From pip](#from-pip)
55 | - [From repository](#from-repository)
56 | - [Persisting data](#persisting-data)
57 | - [📘 Documentation and tutorials](#-documentation-and-tutorials)
58 | - [😵💫 Need help?](#-need-help)
59 | - [🪢 Community and contact](#-community-and-contact)
60 | - [🙌 Contributing](#-contributing)
61 | - [❓ FAQ](#-faq)
62 | - [Concept questions](#concept-questions)
63 | - [Technical questions](#technical-questions)
64 | - [Service and hosting questions](#service-and-hosting-questions)
65 | - [🐍 Python SDK](#-python-sdk)
66 | - [🏠 Architecture](#-architecture)
67 | - [🏫 Glossary](#-glossary)
68 | - [👩💻👨💻 Team and contributors](#-team-and-contributors)
69 | - [🌟 Star History](#-star-history)
70 | - [📃 License](#-license)
71 |
72 | ## 🧑💻 Why _refinery_?
73 |
74 | There are already many other tools available to build training data. Why did we decide to build _yet another one_?
75 |
76 | ### Enabling ideas of one-person-armies
77 |
78 | We believe that developers can have crazy ideas, and we want to lower the barrier for them to go for that idea. _refinery_ is designed to build labeled training data much faster, so that it takes you very little time to prototype an idea. We've received much love for exactly that, so make sure to give it a try for your next project.
79 |
80 | ### Extending your existing labeling approach
81 |
82 | _refinery_ is more than a labeling tool. It has a built-in labeling editor, but its main advantages come with automation and data management. You can integrate any kind of heuristic to label what is possible automatically, and then focus on headache-causing subsets afterwards. Whether you do the labeling in _refinery_ or any other tool (even crowd labeled) doesn't matter!
83 |
84 | ### Put structure into unstructured data
85 |
86 | _refinery_ is the tool that brings new perspectives into your data. You're working on multilingual, human-written texts? Via our integration to [bricks](https://github.com/code-kern-ai/bricks), you can easily enrich your texts with metadata such as the detected language, sentence complexity and many more. You can use this both to analyze your data, but also to orchestrate your labeling workflow.
87 |
88 | ### Pushing collaboration
89 |
90 | While doing so, we aim to improve the collaboration between engineers and subject matter experts (SMEs). In the past, we've seen how our application was being used in meetings to discuss label patterns in the form of labeling functions and distant supervisors. We believe that data-centric AI is the best way to leverage collaboration.
91 |
92 | ### Open-source, and treating training data as a software artifact
93 |
94 | We hate the idea that there are still use cases in which the training data is just a plain CSV-file. That is okay if you _really_ just want to quickly prototype something with a few records at hand, but any serious software should be maintainable. We believe an open-source solution for training data management is what's needed here. _refinery_ is the tool helping you to document your data. That's how you treat training data as a software artifact.
95 |
96 | ### Integrations
97 |
98 | Lastly, _refinery_ supports [SDK actions](https://github.com/code-kern-ai/refinery-python) like pulling and pushing data. Data-centric AI redefines labeling to be more than a one-time job by giving it an iterative workflow, so we aim to give you more power every day by providing end-to-end capabilities, growing the large-scale availability of high-quality training data. Use our SDK to program integrations with your existing landscapes.
99 |
100 | ## Your benefits
101 |
102 | You can automate tons of repetitive tasks, gain better insights into the data labeling workflow, receive an implicit documentation for your training data, and can ultimately build better models in less time.
103 |
104 | Our goal is to make training data building feel more like a programmatic and enjoyable task, instead of something tedious and repetitive. _refinery_ is our contribution to this goal. And we're constantly aiming to improve this contribution.
105 |
106 | If you like what we're working on, please leave a ⭐!
107 |
108 | ## How does Kern AI make money, if refinery is open-source?
109 |
110 | You won't believe how often we get that question - and it is a fair one 🙂 Put short, the open-source version of _refinery_ is currently a single-user version, and you can get access to a multi-user environment with our commercial options. Additionally, we have commercial products on top of _refinery_, e.g. to use the _refinery_ automations as an actual realtime prediction API.
111 |
112 | Generally, we are passionate about open-source and want to contribute as much as possible.
113 |
114 | ## 🤓 Features
115 |
116 | For a detailed overview of features, please look into our [docs](https://docs.kern.ai).
117 |
118 | ### (Semi-)automated labeling workflow for NLP tasks
119 |
120 | - Both manual and programmatic for classifications and span-labeling
121 | - Integration with state-of-the-art libraries and frameworks
122 | - Creation and management of lookup lists/knowledge bases to support during labeling
123 | - Neural search-based retrieval of similar records and outliers
124 | - Sliceable labeling sessions to drill-down on specific subsets
125 | - Multiple labeling tasks possible per project
126 | - Rich library of ready-made automations in our open-source [bricks](https://github.com/code-kern-ai/bricks) library
127 |
128 | ### Extensive data management and monitoring
129 |
130 | - Best-in-class data management capabilities via our databrowser. Filter, sort and search your data e.g. by confidence, heuristic overlap, user, note, etc.
131 | - Integration with [🤗 Hugging Face](https://www.huggingface.co) to automatically create document- and token-level embeddings
132 | - JSON-based data model for up- and downloads
133 | - Overview of project metrics like confidence and label distributions and confusion matrix
134 | - Data accessible and extendable via our [Python SDK](https://github.com/code-kern-ai/refinery-python)
135 | - Attribute modifications to extend your attributes (e.g. with sentence complexity metrics) in-place
136 | - Again, you can use [bricks](https://github.com/code-kern-ai/bricks) to enrich your data with metadata
137 |
138 | ### Team workspaces in the [managed version](https://www.kern.ai/pricing)
139 |
140 | - Allow multiple users to label your data with role-based access and minimized labeling views
141 | - Integrate crowd labeling workflows
142 | - Automated calculation of inter-annotator agreements
143 |
144 | ## ☕ Installation
145 |
146 | ### From pip
147 |
148 | ```
149 | pip install kern-refinery
150 | ```
151 |
152 | Once the library is installed, go to the directory where you want to store the data and run `refinery start`. This will automatically `git clone` this repository first if you haven't done so yet. To stop the server, run `refinery stop`.
153 |
154 | ### From repository
155 |
156 | **TL;DR:**
157 |
158 | ```
159 | $ git clone https://github.com/code-kern-ai/refinery.git
160 | $ cd refinery
161 | ```
162 |
163 | If you're on Mac/Linux:
164 |
165 | ```
166 | $ ./start
167 | ```
168 |
169 | If you're on Windows:
170 |
171 | ```
172 | $ start.bat
173 | ```
174 |
175 | To stop, type `./stop` (Mac/Linux) or `stop.bat`.
176 |
177 | _refinery_ consists of multiple services that need to be run together. To do so, we've set up a setup file, which will automatically pull and connect the respective services for you. The file is part of this repository, so you can just clone it and run `./start` (Mac/Linux) or `start.bat` (Windows) in the repository. After some minutes (now is a good time to grab a coffee ☕), the setup is done and you can access `http://localhost:4455` in your browser. To stop the server, run `./stop` (Mac/Linux) or `stop.bat` (Windows).
178 |
179 | **You're ready to start! 🙌 🎉**
180 |
181 | If you run into any issues during installation, please don't hesitate to reach out to us (see community section below).
182 |
183 | ### Persisting data
184 |
185 | By default, we store the data to the directory `refinery/postgres-data`. If you want to change that path, you need to modify the variable `LOCAL_VOLUME` of the `start` script of your operating system. To remove data, simply delete the volume folder. **Make sure to delete only if you don't need the data any longer - this is irreversible!**
186 |
187 | ## 📘 Documentation and tutorials
188 |
189 | The best way to start with _refinery_ is our [**quick start**](https://docs.kern.ai/refinery/quickstart).
190 |
191 | You can find extensive guides in our [docs](https://docs.kern.ai) and [tutorials](https://www.youtube.com/@kern_ai/videos) on our YouTube channel. We've also prepared a [repository with sample projects](https://github.com/code-kern-ai/sample-projects) which you can clone.
192 |
193 | If you need help writing your first labeling functions, look into our open-source content library [bricks](https://github.com/code-kern-ai/bricks).
194 |
195 | You can find our changelog [here](https://changelog.kern.ai).
196 |
197 | ## 😵‍💫 Need help?
198 |
199 | No worries, we've got you. If you have questions, reach out to us on [Discord](https://discord.gg/qf4rGCEphW), or [open a ticket](https://github.com/code-kern-ai/refinery/discussions/categories/q-a) in the "q&a" category of our forum.
200 |
201 | ## 🪢 Community and contact
202 |
203 | Feel free to join our [Discord](https://discord.gg/qf4rGCEphW), where we'll happily help you building your training data:
204 |
205 | We send out a (mostly) weekly newsletter about recent findings in data-centric AI, product highlights in development and more. You can subscribe to the newsletter [here](https://www.kern.ai/newsletter).
206 |
207 | Also, you can follow us on [Twitter](https://twitter.com/MeetKern) and [LinkedIn](https://www.linkedin.com/company/kern-ai).
208 |
209 | ## 🙌 Contributing
210 |
211 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. You can do so by providing feedback about [desired features and bugs](https://github.com/code-kern-ai/refinery/issues) you might detect.
212 |
213 | If you actively want to participate in extending the code base, reach out to us. We'll explain to you how the architecture is set up, so you can customize the application as you desire.
214 |
215 | ## ❓ FAQ
216 |
217 | ### Concept questions
218 |
219 |
220 | What is a heuristic?
221 | Heuristics are the ingredients for scaling your data labeling. They don't have to be 100% accurate, heuristics can be e.g. simple Python functions expressing some domain knowledge. When you add and run several of these heuristics, you create what is called a noisy label matrix, that is matched against the reference data that you manually labeled. This allows us to analyze correlations, conflicts, overlaps, the number of hits for a data set, and the accuracy of each heuristic.
222 |
223 |
224 |
225 | How can I build an active learning model?
226 | We use pre-trained models to create embeddings in the first place. Once this is done, the embeddings are available in the application (both for building active learning heuristics and neural search). In our active learning IDE, you can then build a simple classification or extraction head on top of the embedding, and we'll then manage the execution in a containerized environment.
227 |
228 |
229 |
230 | How do I know whether my heuristic is good?
231 | A heuristic can be “good” with respect to both coverage and precision. For coverage there basically is no limitation at all, for precision we generally recommend some value above 70%, depending on how many heuristics you have. The more heuristics you have, the more overlaps and conflicts will be given, the better weak supervision can work.
232 |
233 |
234 |
235 | I have less than 1,000 records - Do I need this?
236 | You can definitely use the system for smaller datasets, too! It not only shines via programmatic labeling, but also has a simple and beautiful UI. Go for it 😁
237 |
238 |
239 | ### Technical questions
240 |
241 |
242 | Help!! I forgot my password!
243 | No worries, you can send a reset link even on your local machine. However, the link isn't sent to your email, but to the mailhog. Access it via http://localhost:4436.
244 |
245 |
246 |
247 | I want to install a library for my labeling function
248 | For this, we need to change the requirements.txt of the lf-exec-env, the containerized execution environment for your labeling functions. Please just open an issue, and we'll integrate your library as soon as possible.
249 |
250 |
251 |
252 | Which data formats are supported?
253 | We’ve structured our data formats around JSON, so you can upload most file types natively. This includes spreadsheets, text files, CSV data, generic JSON and many more.
254 |
255 |
256 |
257 | How can I upload data?
258 | We use pandas internally for matching your data to our JSON-based data model. You can upload the data via our UI, or via our Python SDK.
259 |
260 |
261 |
262 | How can I download data, and what format does it have?
263 | You can download your data in our UI or via the Python SDK, where we also provide e.g. adapters to Rasa. The export looks something like this:
264 |
265 | [
266 | {
267 | "running_id": "0",
268 | "headline": "T. Rowe Price (TROW) Dips More Than Broader Markets",
269 | "date": "Jun-30-22 06:00PM\u00a0\u00a0",
270 | "headline__sentiment__MANUAL": null,
271 | "headline__sentiment__WEAK_SUPERVISION": "NEGATIVE",
272 | "headline__sentiment__WEAK_SUPERVISION__confidence": 0.62,
273 | "headline__entities__MANUAL": null,
274 | "headline__entities__WEAK_SUPERVISION": [
275 | "STOCK", "STOCK", "STOCK", "STOCK", "STOCK", "STOCK", "O", "O", "O", "O", "O"
276 | ],
277 | "headline__entities__WEAK_SUPERVISION__confidence": [
278 | 0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.00, 0.00, 0.00, 0.00, 0.00
279 | ]
280 | }
281 | ]
282 |
283 |
284 |
285 | ### Service and hosting questions
286 |
287 |
288 | Are there options for an enterprise on-prem solution?
289 | If you're interested in running the multi-user version on your premises, please reach out to us. We can help you to set up the deployment and prepare your project(s) e.g. with workshops.
290 |
291 |
292 |
293 | I don't want to label myself. What are my options?
294 | Do you want to outsource your labeling, and let your engineers use _refinery_ as a mission control for your training data? Reach out to us, so we can discuss how we can help you with your use case.
295 |
296 |
297 |
298 | How can I reach support?
299 | In our open-source solution, you can reach out to us via Discord. For our managed version, you have an in-app chat to directly contact our support team.
300 |
301 |
302 | ## 🐍 Python SDK
303 |
304 | You can extend your projects by using our [Python SDK](https://github.com/code-kern-ai/refinery-python). With it, you can easily export labeled data of your current project and import new files both programmatically and via CLI (`rsdk pull` and `rsdk push`). It also comes with adapters, e.g. to [Rasa](https://github.com/RasaHQ/rasa).
305 |
306 | ## 🏠 Architecture
307 |
308 | Our architecture follows some main patterns:
309 |
310 | - Shared service database to efficiently transfer large data loads. To avoid redundant code in the services, we use submodules to share the data model
311 | - Containerized function execution for labeling functions, active learning and the record ide
312 | - Machine learning logic is implemented in stand-alone libraries (e.g. [sequence-learn](https://github.com/code-kern-ai/sequence-learn))
313 |
314 |
315 |
316 | 
317 |
318 | Some edges are not displayed for simplicity's sake.
319 |
320 | The colors of the edges have no implicit meaning, and are only used for better readability.
321 |
322 |
323 |
324 | **Service overview (maintained by Kern AI)**
325 | | Service | Description |
326 | |--- |--- |
327 | | [ml-exec-env](https://github.com/code-kern-ai/refinery-ml-exec-env) | Execution environment for the active learning module. Containerized function as a service to build active learning models using scikit-learn and sequence-learn. |
328 | | [embedder](https://github.com/code-kern-ai/refinery-embedder) | Embedder for _refinery_. Manages the creation of document- and token-level embeddings using the embedders library. |
329 | | [weak-supervisor](https://github.com/code-kern-ai/refinery-weak-supervisor) | Weak supervision for _refinery_. Manages the integration of heuristics such as labeling functions, active learners or zero-shot classifiers. Uses the weak-nlp library for the actual integration logic and algorithms. |
330 | | [record-ide-env](https://github.com/code-kern-ai/refinery-record-ide-env) | Execution environment for the record IDE. Containerized function as a service to build record-specific "quick-and-dirty" code snippets for exploration and debugging. |
331 | | [config](https://github.com/code-kern-ai/refinery-config) | Configuration of _refinery_. Amongst others, this manages endpoints and available language models for spaCy. |
332 | | [tokenizer](https://github.com/code-kern-ai/refinery-tokenizer) | Tokenizer for _refinery_. Manages the creation and storage of spaCy tokens for text-based record attributes and supports multiple language models. |
333 | | [gateway](https://github.com/code-kern-ai/refinery-gateway) | Gateway for _refinery_. Manages incoming requests and holds the workflow logic. To interact with the gateway, the UI or Python SDK can be used. |
334 | | [authorizer](https://github.com/code-kern-ai/refinery-authorizer) | Evaluates whether a user has access to certain resources. |
335 | | [websocket](https://github.com/code-kern-ai/refinery-websocket) | Websocket module for refinery. Enables asynchronous notifications inside the application. |
336 | | [lf-exec-env](https://github.com/code-kern-ai/refinery-lf-exec-env) | Execution environment for labeling functions. Containerized function as a service to execute user-defined Python scripts. |
337 | | [ac-exec-env](https://github.com/code-kern-ai/refinery-ac-exec-env) | Execution environment for attribute calculation. Containerized function as a service to generate new attributes using Python scripts. |
338 | | [updater](https://github.com/code-kern-ai/refinery-updater) | Updater for _refinery_. Manages migration logic to new versions if required. |
339 | | [neural-search](https://github.com/code-kern-ai/refinery-neural-search) | Neural search for _refinery_. Manages similarity search powered by Qdrant and outlier detection, both based on vector representations of the project records. |
340 | | [zero-shot](https://github.com/code-kern-ai/refinery-zero-shot) | Zero-shot module for _refinery_. Enables the integration of 🤗 Hugging Face zero-shot classifiers as an off-the-shelf no-code heuristic. |
341 | | [entry](https://github.com/code-kern-ai/refinery-entry) | Login and registration screen for refinery. Implemented via Ory Kratos. |
342 | | [ui](https://github.com/code-kern-ai/refinery-ui) | UI for _refinery_. Used to interact with the whole system; to find out how to best work with the system, check out our docs. |
343 | | [doc-ock](https://github.com/code-kern-ai/refinery-doc-ock) | Usage statistics collection for _refinery_. If users allow it, this collects product insight data used to optimize the user experience. |
344 | | [gateway-proxy](https://github.com/code-kern-ai/refinery-gateway-proxy) | Gateway proxy for _refinery_. Manages incoming requests and forwards them to the gateway. Used by the Python SDK. |
345 | | [parent-images](https://github.com/code-kern-ai/refinery-exec-env-parent-image) | Shared images used by _refinery_. Used to reduce the required space for _refinery_. _Not yet listed in architecture diagram_ |
346 | | [ac-exec-env](https://github.com/code-kern-ai/refinery-ac-exec-env) | Execution environment for attribute calculation in _refinery_. Containerized function as a service to build custom attributes derived from the original data. _Not yet listed in architecture diagram_ |
347 | | [alfred](https://github.com/code-kern-ai/alfred) | Controls the start process of the _refinery_ app. Named after Batman's butler Alfred. _Not yet listed in architecture diagram_|
348 |
349 | **Service overview (open-source 3rd party)**
350 | | Service | Description |
351 | |--- |--- |
352 | | [qdrant/qdrant](https://github.com/qdrant/qdrant) | Qdrant - Vector Search Engine for the next generation of AI applications |
353 | | [postgres/postgres](https://github.com/postgres/postgres) | PostgreSQL: The World's Most Advanced Open Source Relational Database |
354 | | [minio/minio](https://github.com/minio/minio) | Multi-Cloud ☁️ Object Storage |
355 | | [mailhog/MailHog](https://github.com/mailhog/MailHog) | Web and API based SMTP testing |
356 | | [ory/kratos](https://github.com/ory/kratos) | Next-gen identity server (think Auth0, Okta, Firebase) with Ory-hardened authentication, MFA, FIDO2, TOTP, WebAuthn, profile management, identity schemas, social sign in, registration, account recovery, passwordless. Golang, headless, API-only - without templating or theming headaches. Available as a cloud service. |
357 | | [ory/oathkeeper](https://github.com/ory/oathkeeper) | A cloud native Identity & Access Proxy / API (IAP) and Access Control Decision API that authenticates, authorizes, and mutates incoming HTTP(s) requests. Inspired by the BeyondCorp / Zero Trust white paper. Written in Go. |
358 |
359 | **Integrations overview (maintained by Kern AI)**
360 | | Integration | Description |
361 | |--- |--- |
362 | | [refinery-python](https://github.com/code-kern-ai/refinery-python) | Official Python SDK for Kern AI refinery. |
363 | | [sequence-learn](https://github.com/code-kern-ai/sequence-learn) | With sequence-learn, you can build models for named entity recognition as quickly as if you were building a sklearn classifier. |
364 | | [embedders](https://github.com/code-kern-ai/embedders) | With embedders, you can easily convert your texts into sentence- or token-level embeddings within a few lines of code. Use cases for this include similarity search between texts, information extraction such as named entity recognition, or basic text classification. Integrates 🤗 Hugging Face transformer models |
365 | | [weak-nlp](https://github.com/code-kern-ai/weak-nlp) | With weak-nlp, you can integrate heuristics like labeling functions and active learners based on weak supervision. Automate data labeling and improve label quality. |
366 |
367 | **Integrations overview (open-source 3rd party)**
368 | | Integration | Description |
369 | |--- |--- |
370 | | [huggingface/transformers](https://github.com/huggingface/transformers) | 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX. |
371 | | [scikit-learn/scikit-learn](https://github.com/scikit-learn/scikit-learn) | scikit-learn: machine learning in Python |
372 | | [explosion/spaCy](https://github.com/explosion/spaCy) | 💫 Industrial-strength Natural Language Processing (NLP) in Python |
373 |
374 | **Submodules overview**
375 |
376 | Not listed in the architecture, but for internal code management, we apply git submodules.
377 | | Submodule | Description |
378 | |--- |--- |
379 | | [submodule-model](https://github.com/code-kern-ai/refinery-submodule-model) | Data model for refinery. Manages entities and their access for multiple services, e.g. the gateway. |
380 | | [submodule-s3](https://github.com/code-kern-ai/refinery-submodule-s3) | S3 related AWS and Minio logic. |
381 |
382 | ## 🏫 Glossary
383 |
384 | | Term | Meaning |
385 | | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
386 | | Weak supervision | Technique/methodology to integrate different kinds of noisy and imperfect heuristics like labeling functions. It can be used not only to automate data labeling, but generally as an approach to improve your existing label quality. |
387 | | Neural search | Embedding-based approach to retrieve information; instead of telling a machine a set of constraints, neural search analyzes the vector space of data (encoded via e.g. pre-trained neural networks). Can be used e.g. to find nearest neighbors. |
388 | | Active learning | As data is labeled manually, a model is trained continuously to support the annotator. Can be used e.g. stand-alone, or as a heuristic for weak supervision. |
389 | | Vector encoding (embedding) | Using pre-trained models such as transformers from [🤗 Hugging Face](https://www.huggingface.co), texts can be transformed into vector space. This is both helpful for neural search and active learning (in the latter case, simple classifiers can be applied on top of the embedding, which enables fast re-training on the vector representations). |
390 |
391 | Missing anything in the glossary? [Add the term](https://github.com/code-kern-ai/refinery/issues) in an issue with the tag "enhancement".
392 |
393 |
394 |
395 | ## 👩‍💻👨‍💻 Team and contributors
396 |
397 |
533 |
534 | ## 🌟 Star History
535 |
536 | [](https://star-history.com/#code-kern-ai/refinery&Date)
537 |
538 | ## 📃 License
539 |
540 | _refinery_ is licensed under the Apache License, Version 2.0. View a copy of the [License file](LICENSE).
541 |
--------------------------------------------------------------------------------
/header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-kern-ai/refinery/7972dc9878ba470f5ca626520a6599bdfffc7f85/header.png
--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build and publish the kern-refinery package to PyPI.
# Abort on the first failing command (and on unset variables / failed pipes)
# so that a broken build never reaches the `twine upload` step.
set -euo pipefail

# Clear artifacts from previous builds so only the fresh wheel gets uploaded.
rm -rf dist/*

# Build a universal (py2/py3) wheel into dist/.
python3 setup.py bdist_wheel --universal

# Upload to PyPI (requires twine credentials to be configured).
twine upload dist/*
--------------------------------------------------------------------------------
/refinery.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-kern-ai/refinery/7972dc9878ba470f5ca626520a6599bdfffc7f85/refinery.gif
--------------------------------------------------------------------------------
/refinery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-kern-ai/refinery/7972dc9878ba470f5ca626520a6599bdfffc7f85/refinery/__init__.py
--------------------------------------------------------------------------------
/refinery/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import platform
4 | import subprocess
5 | from git import Repo
6 | from wasabi import msg
7 |
8 | REFINERY_REPO = "https://github.com/code-kern-ai/refinery"
9 | REFINERY_FOLDER = "refinery"
10 |
11 |
def start(cur_dir: str):
    """Starts the refinery server; if the refinery repository does not exist,
    it will be cloned from git first.

    Args:
        cur_dir (str): Name of the current working directory (last path component).
    """

    def _start_server():
        # The start script ships with the repository in OS-specific flavors.
        if platform.system() == "Windows":
            subprocess.run(["start.bat"])
        else:
            subprocess.run(["./start"])

    if cur_dir == REFINERY_FOLDER:
        # Already inside the repository. This must be checked BEFORE the
        # existence check: otherwise a missing nested "refinery" folder would
        # trigger a second clone inside the repository itself.
        _start_server()
    elif os.path.exists(REFINERY_FOLDER):
        with cd(REFINERY_FOLDER):
            _start_server()
    else:
        msg.info(
            f"Cloning from code-kern-ai/refinery into repository {REFINERY_FOLDER}"
        )
        Repo.clone_from(REFINERY_REPO, REFINERY_FOLDER)
        with cd(REFINERY_FOLDER):
            _start_server()
37 |
38 |
def stop(cur_dir: str):
    """Stops the refinery server by running the repository's stop script.

    Args:
        cur_dir (str): Name of the current working directory (last path component).
    """

    def _stop_server():
        # Pick the OS-specific stop script shipped with the repository.
        script = ["stop.bat"] if platform.system() == "Windows" else ["./stop"]
        subprocess.run(script)

    if cur_dir == REFINERY_FOLDER:
        # Already inside the repository folder.
        _stop_server()
    elif os.path.exists(REFINERY_FOLDER):
        # Repository is a subfolder; switch into it temporarily.
        with cd(REFINERY_FOLDER):
            _stop_server()
    else:
        msg.fail(f"Could not find repository {REFINERY_FOLDER}.")
59 |
60 |
def update(cur_dir: str):
    """Updates the refinery repository by running its update script.

    Args:
        cur_dir (str): Name of the current working directory (last path component).
    """

    def _update():
        # Pick the OS-specific update script shipped with the repository.
        script = ["update.bat"] if platform.system() == "Windows" else ["./update"]
        subprocess.run(script)

    if cur_dir == REFINERY_FOLDER:
        # Already inside the repository folder.
        _update()
    elif os.path.exists(REFINERY_FOLDER):
        # Repository is a subfolder; switch into it temporarily.
        with cd(REFINERY_FOLDER):
            _update()
    else:
        msg.fail(f"Could not find repository {REFINERY_FOLDER}.")
81 |
82 |
def help():
    """Prints a short usage summary for the `refinery` CLI."""
    for line in (
        "Available commands:",
        " - `refinery start` to start the server",
        " - `refinery stop` to end it",
        " - `refinery update` to update the repository",
    ):
        msg.info(line)
88 |
89 |
def main():
    """Entry point of the `refinery` CLI.

    Reads the first command-line argument and dispatches it to the matching
    command handler (`start`, `stop`, `update`, `help`).
    """
    cli_args = sys.argv[1:]
    if len(cli_args) == 0:
        msg.fail("Please provide arguments when running the `refinery` command.")
        msg.fail("`refinery start` to start the server, `refinery stop` to end it.")
        # Without this return, `cli_args[0]` below would raise an IndexError.
        return
    command = cli_args[0]
    # Only the last path component is needed to detect the repository folder.
    cur_dir = os.path.split(os.getcwd())[-1]
    if command == "start":
        start(cur_dir)
    elif command == "stop":
        stop(cur_dir)
    elif command == "update":
        update(cur_dir)
    elif command == "help":
        help()
    else:
        msg.fail(
            f"Could not understand command `{command}`. Type `refinery help` for some instructions."
        )
109 |
110 |
# https://stackoverflow.com/questions/431684/equivalent-of-shell-cd-command-to-change-the-working-directory/24176022#24176022
class cd:
    """Context manager that temporarily switches the working directory.

    On entry the current directory is remembered and the process chdirs into
    the target; on exit the remembered directory is restored.
    """

    def __init__(self, new_path):
        # Expand "~" so callers may pass user-relative paths.
        self.new_path = os.path.expanduser(new_path)

    def __enter__(self):
        self.saved_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, etype, value, traceback):
        # Restore the original directory regardless of what happened inside.
        os.chdir(self.saved_path)
124 |
--------------------------------------------------------------------------------
/refinery/docker-compose.tmpl:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 |
3 | services:
4 | kratos-migrate:
5 | image: oryd/kratos:{KRATOS}
6 | environment:
7 | - DSN=sqlite:///var/lib/sqlite/db.sqlite?_fk=true&mode=rwc
8 | volumes:
9 | - type: volume
10 | source: kratos-sqlite
11 | target: /var/lib/sqlite
12 | read_only: false
13 | - {LOCAL_VOLUME_KRATOS}/kratos.yml:/home/.kratos.yaml
14 | command: migrate sql -e --yes
15 | restart: on-failure
16 | networks:
17 | - default
18 |
19 | kratos:
20 | depends_on:
21 | - kratos-migrate
22 | image: oryd/kratos:{KRATOS}
23 | restart: unless-stopped
24 | environment:
25 | - DSN=sqlite:///var/lib/sqlite/db.sqlite?_fk=true
26 | - LOG_LEVEL=trace
27 | - SERVE_PUBLIC_BASE_URL=http://localhost:4455/.ory/kratos/public/
28 | command: serve -c /etc/config/kratos/kratos.yml --dev --watch-courier
29 | volumes:
30 | - {LOCAL_VOLUME_KRATOS}:/etc/config/kratos:Z
31 | - kratos-sqlite:/var/lib/sqlite
32 | networks:
33 | - default
34 |
35 | oathkeeper:
36 | image: oryd/oathkeeper:{OATHKEEPER}
37 | depends_on:
38 | - kratos
39 | ports:
40 | - 4455:4455
41 | command: serve proxy -c "/etc/config/oathkeeper/oathkeeper.yml"
42 | environment:
43 | - LOG_LEVEL=debug
44 | restart: on-failure
45 | networks:
46 | - default
47 | volumes:
48 | - {LOCAL_VOLUME_OATHKEEPER}:/etc/config/oathkeeper:Z
49 |
50 | refinery-authorizer:
51 | image: kernai/refinery-authorizer:{AUTHORIZER}
52 | restart: always
53 | expose:
54 | - 80
55 | networks:
56 | - default
57 |
58 | mailhog:
59 | image: mailhog/mailhog:{MAILHOG}
60 | ports:
61 | - 1025:1025
62 | - 4436:8025
63 | networks:
64 | - default
65 |
66 | refinery-ui:
67 | image: kernai/refinery-ui:{UI}
68 | restart: always
69 | ports:
70 | - 7050:3000
71 | expose:
72 | - 3000
73 | networks:
74 | - default
75 |
76 | refinery-entry:
77 | image: kernai/refinery-entry:{ENTRY}
78 | restart: always
79 | expose:
80 | - 3000
81 | networks:
82 | - default
83 |
84 | postgres-migrate:
85 | depends_on:
86 | - graphql-postgres
87 | image: kernai/refinery-gateway:{GATEWAY}
88 | environment:
89 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
90 | command: alembic upgrade head
91 | networks:
92 | - default
93 |
94 | refinery-gateway:
95 | depends_on:
96 | - refinery-config
97 | image: kernai/refinery-gateway:{GATEWAY}
98 | restart: always
99 | ports:
100 | - 7051:80
101 | expose:
102 | - 80
103 | volumes:
104 | - /var/run/docker.sock:/var/run/docker.sock:Z
105 | - graphql-sqlite:/sqlite
106 | environment:
107 | - AC_EXEC_ENV_IMAGE=kernai/refinery-ac-exec-env:{AC_EXEC_ENV}
108 | - LF_EXEC_ENV_IMAGE=kernai/refinery-lf-exec-env:{LF_EXEC_ENV}
109 | - ML_EXEC_ENV_IMAGE=kernai/refinery-ml-exec-env:{ML_EXEC_ENV}
110 | - RECORD_IDE_IMAGE=kernai/refinery-record-ide-env:{RECORD_IDE_ENV}
111 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
112 | - LF_NETWORK=refinery_default
113 | - WEAK_SUPERVISION=http://refinery-weak-supervisor:80
114 | - EMBEDDING_SERVICE=http://refinery-embedder:80
115 | - DOC_OCK=http://refinery-doc-ock:80
116 | - TOKENIZER=http://refinery-tokenizer:80
117 | - ZERO_SHOT=http://refinery-zero-shot:80
118 | - NEURAL_SEARCH=http://refinery-neural-search:80
119 | - KRATOS_ADMIN_URL=http://kratos:4434
120 | - WS_NOTIFY_ENDPOINT=http://refinery-websocket:8080
121 | - TASK_QUEUE_SLOTS=1
122 | - PRIORITY_TASK_QUEUE_SLOTS=1
123 | - UPDATER=http://refinery-updater:80
124 | - S3_URI=object-storage:9000 # remove as soon as multipart upload is merged
125 | - S3_ENDPOINT={CRED_ENDPOINT}
126 | - S3_ENDPOINT_LOCAL=object-storage:9000
127 | - S3_ACCESS_KEY=onetask
128 | - S3_SECRET_KEY=JRZtI0SLsEDb3imTy03R
129 | - SQLITE=/sqlite/db.sqlite
130 | # - TYPESENSE_SEARCH=http://typesense-api:80
131 | - SECRET_KEY=default
132 | networks:
133 | - default
134 |
135 | graphql-postgres:
136 | image: docker.io/postgres:{POSTGRES}
137 | restart: always
138 | ports:
139 | - 7052:5432
140 | environment:
141 | - POSTGRES_PASSWORD=onetask
142 | - POSTGRES_USER=postgres
143 | expose:
144 | - 5432
145 | networks:
146 | - default
147 | volumes:
148 | - {LOCAL_VOLUME_POSTGRES}:/var/lib/postgresql/data
149 |
150 | qdrant:
151 |     # vector database used for neural search (similarity search and outlier detection)
152 | image: qdrant/qdrant:{QDRANT}
153 | restart: always
154 | ports:
155 | - 6333:6333
156 | expose:
157 | - 6333
158 | networks:
159 | - default
160 | volumes:
161 | - {LOCAL_VOLUME_QDRANT}:/qdrant/storage
162 |
163 | refinery-gateway-proxy:
164 | depends_on:
165 | - graphql-postgres
166 | image: kernai/refinery-gateway-proxy:{GATEWAY_PROXY}
167 | restart: always
168 | expose:
169 | - 80
170 | environment:
171 | - POSTGRES=postgresql://postgres:onetask@postgres:5432
172 | - GATEWAY=http://refinery-gateway:80
173 | - KRATOS=http://kratos:4433
174 | - CONFIG=http://refinery-config:80
175 | links:
176 | - "graphql-postgres:postgres"
177 | networks:
178 | - default
179 |
180 | object-storage:
181 | image: docker.io/minio/minio:{MINIO}
182 | restart: always
183 | ports:
184 | - 7053:9000
185 | - 9001:9001
186 | expose:
187 | - 9000
188 | environment:
189 | - MINIO_ROOT_USER=onetask
190 | - MINIO_ROOT_PASSWORD=JRZtI0SLsEDb3imTy03R
191 | - MINIO_NOTIFY_WEBHOOK_ENABLE=on
192 | - MINIO_NOTIFY_WEBHOOK_ENDPOINT=http://refinery-gateway:80/notify
193 | command: server /data --address :9000 --console-address ":9001"
194 | networks:
195 | - default
196 | volumes:
197 | - {LOCAL_VOLUME_MINIO}:/data
198 |
199 | refinery-weak-supervisor:
200 | image: kernai/refinery-weak-supervisor:{WEAK_SUPERVISOR}
201 | restart: unless-stopped
202 | ports:
203 | - 7054:80
204 | expose:
205 | - 80
206 | environment:
207 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
208 | - WS_NOTIFY_ENDPOINT=http://refinery-websocket:8080
209 | networks:
210 | - default
211 |
212 | refinery-embedder:
213 | image: kernai/refinery-embedder:{EMBEDDER}
214 | restart: unless-stopped
215 | ports:
216 | - 7058:80
217 | environment:
218 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
219 | - S3_ENDPOINT_LOCAL=object-storage:9000
220 | - S3_ACCESS_KEY=onetask
221 | - S3_SECRET_KEY=JRZtI0SLsEDb3imTy03R
222 | - DOC_OCK=http://refinery-doc-ock:80
223 | - WS_NOTIFY_ENDPOINT=http://refinery-websocket:8080
224 | - NEURAL_SEARCH=http://refinery-neural-search:80
225 | expose:
226 | - 80
227 | networks:
228 | - default
229 |
230 | refinery-config:
231 | image: kernai/refinery-config:{CONFIG}
232 | restart: unless-stopped
233 | ports:
234 | - 7059:80
235 | environment:
236 | - IS_MANAGED=0
237 | - KERN_S3_ENDPOINT={MINIO_ENDPOINT}
238 | expose:
239 | - 80
240 | networks:
241 | - default
242 | volumes:
243 | - {LOCAL_VOLUME_CONFIG}:/config
244 |
245 | refinery-doc-ock:
246 | depends_on:
247 | - refinery-config
248 | image: kernai/refinery-doc-ock:{DOC_OCK}
249 | restart: unless-stopped
250 | ports:
251 | - 7060:80
252 | expose:
253 | - 80
254 | environment:
255 | - TELEMETRY_URI=https://telemetry.kern.ai
256 |
257 | refinery-websocket:
258 | image: kernai/refinery-websocket:{WEBSOCKET}
259 | restart: unless-stopped
260 | environment:
261 | - DB_DSN=postgresql://postgres:onetask@graphql-postgres:5432?sslmode=disable
262 | expose:
263 | - 8080
264 | networks:
265 | - default
266 |
267 | refinery-tokenizer:
268 | depends_on:
269 | - refinery-config
270 | image: kernai/refinery-tokenizer:{TOKENIZER}
271 | restart: unless-stopped
272 | ports:
273 | - 7061:80
274 | environment:
275 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
276 | - S3_ENDPOINT_LOCAL=object-storage:9000
277 | - S3_ACCESS_KEY=onetask
278 | - S3_SECRET_KEY=JRZtI0SLsEDb3imTy03R
279 | - DOC_OCK=http://refinery-doc-ock:80
280 | - WS_NOTIFY_ENDPOINT=http://refinery-websocket:8080
281 | expose:
282 | - 80
283 | networks:
284 | - default
285 |
286 | refinery-updater:
287 | image: kernai/refinery-updater:{UPDATER}
288 | restart: unless-stopped
289 | ports:
290 | - 7062:80
291 | environment:
292 | - NEURAL_SEARCH=http://refinery-neural-search:80
293 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
294 | - S3_ENDPOINT_LOCAL=object-storage:9000
295 | - S3_ACCESS_KEY=onetask
296 | - S3_SECRET_KEY=JRZtI0SLsEDb3imTy03R
297 | expose:
298 | - 80
299 | networks:
300 | - default
301 |
302 | refinery-neural-search:
303 | image: kernai/refinery-neural-search:{NEURAL_SEARCH}
304 | restart: unless-stopped
305 | ports:
306 | - 7063:80
307 | environment:
308 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
309 | - QDRANT_PORT=6333
310 | expose:
311 | - 80
312 | networks:
313 | - default
314 |
315 | refinery-zero-shot:
316 | image: kernai/refinery-zero-shot:{ZERO_SHOT}
317 | restart: unless-stopped
318 | ports:
319 | - 7064:80
320 | environment:
321 | - POSTGRES=postgresql://postgres:onetask@graphql-postgres:5432
322 | - WS_NOTIFY_ENDPOINT=http://refinery-websocket:8080
323 | expose:
324 | - 80
325 | networks:
326 | - default
327 |
328 | networks:
329 | default:
330 |
331 | volumes:
332 | kratos-sqlite:
333 | graphql-sqlite:
334 | config:
--------------------------------------------------------------------------------
/refinery/kratos/identity.schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "$id": "https://schemas.ory.sh/presets/kratos/quickstart/email-password/identity.schema.json",
3 | "$schema": "http://json-schema.org/draft-07/schema#",
4 | "title": "Person",
5 | "type": "object",
6 | "properties": {
7 | "traits": {
8 | "type": "object",
9 | "properties": {
10 | "email": {
11 | "type": "string",
12 | "format": "email",
13 | "title": "E-Mail",
14 | "minLength": 3,
15 | "ory.sh/kratos": {
16 | "credentials": {
17 | "password": {
18 | "identifier": true
19 | }
20 | },
21 | "verification": {
22 | "via": "email"
23 | },
24 | "recovery": {
25 | "via": "email"
26 | }
27 | }
28 | },
29 | "name": {
30 | "type": "object",
31 | "properties": {
32 | "first": {
33 | "title": "First Name",
34 | "type": "string"
35 | },
36 | "last": {
37 | "title": "Last Name",
38 | "type": "string"
39 | }
40 | }
41 | }
42 | },
43 | "required": [
44 | "email"
45 | ],
46 | "additionalProperties": true
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/refinery/kratos/kratos.yml:
--------------------------------------------------------------------------------
1 | version: v1.1.0
2 |
3 | dsn: memory
4 |
5 | serve:
6 | public:
7 | base_url: http://kratos:4433/
8 | cors:
9 | enabled: true
10 | admin:
11 | base_url: http://kratos:4434/
12 |
13 | selfservice:
14 | default_browser_return_url: http://localhost:4455/refinery/projects
15 | allowed_return_urls:
16 | - http://localhost:4455/
17 |
18 | methods:
19 | password:
20 | enabled: true
21 | link:
22 | enabled: true
23 |
24 | flows:
25 | error:
26 | ui_url: http://localhost:4455/auth/error
27 |
28 | settings:
29 | ui_url: http://localhost:4455/auth/settings
30 | privileged_session_max_age: 15m
31 |
32 | recovery:
33 | enabled: true
34 | ui_url: http://localhost:4455/auth/recovery
35 | use: link
36 |
37 | verification:
38 | enabled: true
39 | ui_url: http://localhost:4455/auth/verify
40 | use: link
41 | after:
42 | default_browser_return_url: http://localhost:4455/
43 |
44 | logout:
45 | after:
46 | default_browser_return_url: http://localhost:4455/auth/login
47 |
48 | login:
49 | ui_url: http://localhost:4455/auth/login
50 | lifespan: 10m
51 |
52 | registration:
53 | lifespan: 10m
54 | ui_url: http://localhost:4455/auth/registration
55 | after:
56 | password:
57 | hooks:
58 | -
59 | hook: session
60 |
61 | log:
62 | level: trace
63 | format: text
64 | leak_sensitive_values: true # NOTE: dev-only — with trace logging this writes credentials/session data to logs; disable for production
65 |
66 | secrets:
67 | cookie:
68 | - y2Fo*MSez4!5BPh4M*@Aj@xgA*9!s7kT
69 |
70 | hashers:
71 | argon2:
72 | parallelism: 1
73 | memory: 128MB
74 | iterations: 2
75 | salt_length: 16
76 | key_length: 16
77 |
78 | identity:
79 | default_schema_id: default
80 | schemas:
81 | - id: default
82 | url: file:///etc/config/kratos/identity.schema.json
83 |
84 | courier:
85 | smtp:
86 | connection_uri: smtp://mailhog:1025/?disable_starttls=true
87 |
--------------------------------------------------------------------------------
/refinery/oathkeeper/access-rules.yml:
--------------------------------------------------------------------------------
1 | -
2 | id: "ory:kratos:public"
3 | upstream:
4 | preserve_host: true
5 | url: "http://kratos:4433"
6 | strip_path: /.ory/kratos/public
7 | match:
8 | url: "http://localhost:4455/.ory/kratos/public/<**>"
9 | methods:
10 | - GET
11 | - POST
12 | - PUT
13 | - DELETE
14 | - PATCH
15 | authenticators:
16 | -
17 | handler: noop
18 | authorizer:
19 | handler: allow
20 | mutators:
21 | - handler: noop
22 |
23 | -
24 | id: "ory:kratos:protected"
25 | upstream:
26 | preserve_host: true
27 | url: "http://kratos:4434"
28 | strip_path: /admin/api
29 | match:
30 | url: "http://localhost:4455/admin/api/<**>"
31 | methods:
32 | - GET
33 | - POST
34 | - PUT
35 | - DELETE
36 | - PATCH
37 | authenticators:
38 | -
39 | handler: noop
40 | authorizer:
41 | handler: allow
42 | mutators:
43 | - handler: noop
44 |
45 | -
46 | id: "kernai:refinery-entry:anonymous"
47 | upstream:
48 | preserve_host: true
49 | url: "http://refinery-entry:3000"
50 | strip_path: /auth/
51 | match:
52 | url: "http://localhost:4455<{/auth,/auth/**,/_next/**}>"
53 | methods:
54 | - GET
55 | authenticators:
56 | -
57 | handler: anonymous
58 | authorizer:
59 | handler: allow
60 | mutators:
61 | -
62 | handler: noop
63 |
64 | -
65 | id: "kernai:refinery-ui:protected"
66 | upstream:
67 | preserve_host: true
68 | url: "http://refinery-ui:3000"
69 | match:
70 | url: "http://localhost:4455<{/refinery,/refinery/**,/ws}>"
71 | methods:
72 | - GET
73 | authenticators:
74 | -
75 | handler: cookie_session
76 | authorizer:
77 | handler: allow
78 | mutators:
79 | - handler: id_token
80 | errors:
81 | - handler: redirect
82 | config:
83 | to: http://localhost:4455/auth/login
84 | return_to_query_param: 'return_to'
85 |
86 | - id: "kernai:refinery-gateway:fastapi"
87 | upstream:
88 | preserve_host: true
89 | url: "http://refinery-gateway:80"
90 | strip_path: /refinery-gateway/
91 | match:
92 | url: "http://localhost:4455/<{refinery-gateway/,refinery-gateway/**}>"
93 | methods:
94 | - GET
95 | - POST
96 | - PUT
97 | - DELETE
98 | authenticators:
99 | -
100 | handler: cookie_session
101 | authorizer:
102 | handler: allow
103 | mutators:
104 | - handler: id_token
105 | errors:
106 | - handler: redirect
107 | config:
108 | to: http://localhost:4455/auth/login
109 |
110 | - id: "kernai:refinery-is-managed:public"
111 | upstream:
112 | preserve_host: true
113 | url: "http://refinery-gateway:80"
114 | match:
115 | url: "http://localhost:4455/is_managed"
116 | methods:
117 | - GET
118 | authenticators:
119 | - handler: noop
120 | authorizer:
121 | handler: allow
122 | mutators:
123 | - handler: noop
124 |
125 | - id: "kernai:refinery-is-demo:public"
126 | upstream:
127 | preserve_host: true
128 | url: "http://refinery-gateway:80"
129 | match:
130 | url: "http://localhost:4455/is_demo"
131 | methods:
132 | - GET
133 | authenticators:
134 | - handler: noop
135 | authorizer:
136 | handler: allow
137 | mutators:
138 | - handler: noop
139 |
140 | -
141 | id: "kernai:object-storage:upload"
142 | upstream:
143 | preserve_host: false
144 | url: "http://object-storage:9000"
145 | strip_path: /api/upload/
146 | match:
147 | url: "http://localhost:4455/api/upload/<{**}>"
148 | methods:
149 | - PUT
150 | authenticators:
151 | - handler: cookie_session
152 | authorizer:
153 | handler: allow
154 | mutators:
155 | - handler: header
156 | config:
157 | headers:
158 | Host: object-storage:9000
159 | errors:
160 | - handler: redirect
161 | config:
162 | to: http://localhost:4455/auth/login
163 |
164 | -
165 | id: "kernai:refinery-config"
166 | upstream:
167 | preserve_host: false
168 | url: "http://refinery-config:80"
169 | strip_path: /config/
170 | match:
171 | url: "http://localhost:4455/config/<{,**}>"
172 | methods:
173 | - GET
174 | authenticators:
175 | - handler: cookie_session
176 | authorizer:
177 | handler: allow
178 | mutators:
179 | - handler: id_token
180 | errors:
181 | - handler: redirect
182 | config:
183 | to: http://localhost:4455/auth/login
184 |
185 | -
186 | id: "kernai:refinery-gateway-proxy"
187 | upstream:
188 | preserve_host: true
189 | url: "http://refinery-gateway-proxy:80"
190 | strip_path: /api/
191 | match:
192 | url: "http://localhost:4455/api/<{,**}>"
193 | methods:
194 | - GET
195 | - POST
196 | authenticators:
197 | - handler: cookie_session
198 | - handler: bearer_token
199 | authorizer:
200 | handler: allow
201 | mutators:
202 | - handler: id_token
203 | errors:
204 | - handler: redirect
205 | config:
206 | to: http://localhost:4455/auth/login
207 |
208 | -
209 | id: "kernai:refinery-websocket"
210 | upstream:
211 | preserve_host: true
212 | url: "http://refinery-websocket:8080"
213 | strip_path: /notify/
214 | match:
215 | url: "http://localhost:4455/notify/ws"
216 | methods:
217 | - GET
218 | authenticators:
219 | - handler: cookie_session
220 | authorizer:
221 | handler: allow
222 | mutators:
223 | - handler: id_token
224 | errors:
225 | - handler: redirect
226 | config:
227 | to: http://localhost:4455/auth/login
228 |
--------------------------------------------------------------------------------
/refinery/oathkeeper/oathkeeper.yml:
--------------------------------------------------------------------------------
1 | log:
2 | level: debug
3 | format: json
4 |
5 | serve:
6 | proxy:
7 | cors:
8 | enabled: true
9 | allowed_origins:
10 | - "*"
11 | allowed_methods:
12 | - POST
13 | - GET
14 | - PUT
15 | - PATCH
16 | - DELETE
17 | allowed_headers:
18 | - Authorization
19 | - Content-Type
20 | exposed_headers:
21 | - Content-Type
22 | allow_credentials: true
23 | debug: true
24 |
25 | errors:
26 | fallback:
27 | - redirect
28 |
29 | handlers:
30 | redirect:
31 | enabled: true
32 | config:
33 | to: http://localhost:4455/refinery
34 | when:
35 | -
36 | error:
37 | - unauthorized
38 | - forbidden
39 | - not_found
40 | request:
41 | header:
42 | accept:
43 | - text/html
44 | json:
45 | enabled: true
46 | config:
47 | verbose: true
48 |
49 | access_rules:
50 | matching_strategy: glob
51 | repositories:
52 | - file:///etc/config/oathkeeper/access-rules.yml
53 |
54 | authenticators:
55 | anonymous:
56 | enabled: true
57 | config:
58 | subject: guest
59 |
60 | cookie_session:
61 | enabled: true
62 | config:
63 | check_session_url: http://kratos:4433/sessions/whoami
64 | preserve_path: true
65 | extra_from: "@this"
66 | subject_from: "identity.id"
67 | only:
68 | - ory_kratos_session
69 |
70 | bearer_token:
71 | enabled: true
72 | config:
73 | check_session_url: http://kratos:4433/sessions/whoami
74 | preserve_path: true
75 | extra_from: "@this"
76 | subject_from: "identity.id"
77 |
78 | noop:
79 | enabled: true
80 |
81 | authorizers:
82 | allow:
83 | enabled: true
84 |
85 | remote_json:
86 | enabled: true
87 |
88 | config:
89 | remote: http://authorizer/authorize
90 | forward_response_headers_to_upstream:
91 | - X-organization-id
92 | payload: |
93 | {
94 | "subject": "{{ .Extra | toJson }}",
95 | "resource": ""
96 | }
97 |
98 |
99 | mutators:
100 | noop:
101 | enabled: true
102 |
103 | id_token:
104 | enabled: true
105 | config:
106 | issuer_url: http://localhost:4455/
107 | jwks_url: file:///etc/config/oathkeeper/jwks.json
108 | claims: |
109 | {
110 | "session": {{ .Extra | toJson }}
111 | }
112 |
113 |
--------------------------------------------------------------------------------
/refinery/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "LOCAL_VOLUME_KRATOS": "./kratos",
3 | "LOCAL_VOLUME_MINIO": "./minio-data",
4 | "LOCAL_VOLUME_OATHKEEPER": "./oathkeeper",
5 | "LOCAL_VOLUME_POSTGRES": "./postgres-data",
6 | "LOCAL_VOLUME_QDRANT": "./qdrant-volume",
7 | "LOCAL_VOLUME_CONFIG": "./config"
8 | }
--------------------------------------------------------------------------------
/refinery/versions.json:
--------------------------------------------------------------------------------
1 | {
2 | "REFINERY": {
3 | "AC_EXEC_ENV": "v1.15.0",
4 | "AUTHORIZER": "v1.15.0",
5 | "CONFIG": "v1.15.0",
6 | "DOC_OCK": "v1.15.0",
7 | "EMBEDDER": "v1.15.0",
8 | "ENTRY": "v1.15.0",
9 | "GATEWAY": "v1.15.2",
10 | "GATEWAY_PROXY": "v1.15.0",
11 | "ML_EXEC_ENV": "v1.15.0",
12 | "LF_EXEC_ENV": "v1.15.0",
13 | "NEURAL_SEARCH": "v1.15.0",
14 | "REFINERY": "v1.15.0",
15 | "RECORD_IDE_ENV": "v1.15.0",
16 | "TOKENIZER": "v1.15.0",
17 | "UI": "v1.15.0",
18 | "UPDATER": "v1.15.0",
19 | "WEAK_SUPERVISOR": "v1.15.0",
20 | "WEBSOCKET": "v1.15.0",
21 | "ZERO_SHOT": "v1.15.0"
22 | },
23 | "THIRD_PARTY": {
24 | "KRATOS": "v1.1.0",
25 | "MAILHOG": "v1.0.1",
26 | "MINIO": "RELEASE.2022-10-24T18-35-07Z",
27 | "OATHKEEPER": "v0.38.15-beta.1",
28 | "POSTGRES": "13",
29 | "QDRANT": "v1.5.1"
30 | }
31 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | wasabi
3 | GitPython
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | 
5 | from setuptools import setup, find_packages
6 | 
7 | this_directory = os.path.abspath(os.path.dirname(__file__))  # repository root
8 | with open(os.path.join(this_directory, "README.md"), encoding="utf8") as file:  # README becomes the PyPI long description
9 | long_description = file.read()
10 | 
11 | setup(
12 | name="kern-refinery",
13 | version="1.3.0",  # pip package version; independent of the service versions in refinery/versions.json
14 | author="jhoetter",
15 | author_email="johannes.hoetter@kern.ai",
16 | description="The open-source data-centric IDE for NLP.",
17 | long_description=long_description,
18 | long_description_content_type="text/markdown",
19 | url="https://github.com/code-kern-ai/refinery",
20 | keywords=[
21 | "Kern AI",
22 | "refinery",
23 | "machine-learning",
24 | "supervised-learning",
25 | "data-centric-ai",
26 | "data-annotation",
27 | "python",
28 | ],
29 | classifiers=[
30 | "Development Status :: 4 - Beta",
31 | "Programming Language :: Python :: 3",
32 | "License :: OSI Approved :: Apache Software License",
33 | ],
34 | package_dir={"": "."},  # packages live at the repository root
35 | packages=find_packages("."),
36 | install_requires=[  # keep in sync with requirements.txt
37 | "requests",
38 | "wasabi",
39 | "GitPython",
40 | ],
41 | entry_points={
42 | "console_scripts": [
43 | "refinery=refinery.cli:main",  # installs the "refinery" console command
44 | ],
45 | },
46 | )
47 | 
--------------------------------------------------------------------------------
/start:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Detect the host's primary non-loopback IPv4 address; the containers use it
4 | # to reach the MinIO object storage exposed on host port 7053.
5 | # Note: dots in the regex are escaped so "." only matches a literal dot.
6 | unameOut="$(uname -s)"
7 | case "${unameOut}" in
8 | Linux*) HOST_IP=$(ip a | grep "inet " | grep -v 127.0.0.1 | head -1 | grep -o -E "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+" | head -1);;
9 | Darwin*) HOST_IP=$(ifconfig | grep "inet " | grep -v 127.0.0.1 | head -1 | grep -o -E "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+" | head -1);;
10 | esac
11 | 
12 | MINIO_ENDPOINT="http://$HOST_IP:7053"
13 | 
14 | docker pull kernai/alfred:v1.15.0
15 | 
16 | # alfred (the launcher container) generates refinery/docker-compose.yml from the
17 | # template and starts the stack — presumably via start.py; confirm in that image.
18 | docker run -d --rm --name alfred \
19 | -v /var/run/docker.sock:/var/run/docker.sock \
20 | -v "$PWD/refinery:/refinery" \
21 | kernai/alfred:v1.15.0 \
22 | python start.py "$PWD/refinery" "$MINIO_ENDPOINT"
23 | 
24 | docker logs -f alfred
25 | 
--------------------------------------------------------------------------------
/start.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | REM Convert the working directory to a Docker-style path:
4 | REM backslashes become slashes, the drive colon is dropped, and a leading slash is prepended
5 | set "PWD=%cd%"
6 | set "PWD=%PWD:\=/%"
7 | set "PWD=%PWD::=%"
8 | set "PWD=/%PWD%"
9 | 
10 | REM Derive MINIO_ENDPOINT from the first IPv4 address reported by ipconfig
11 | Call :setMinioEndpoint
12 | 
13 | docker pull kernai/alfred:v1.15.0
14 | 
15 | docker run -d --rm ^
16 | --name alfred ^
17 | -v /var/run/docker.sock:/var/run/docker.sock ^
18 | -v "%PWD%/refinery:/refinery" kernai/alfred:v1.15.0 ^
19 | python start.py "%PWD%/refinery" %MINIO_ENDPOINT% > nul
20 | 
21 | docker logs -f alfred
22 | 
23 | if "%1" neq "update" pause
24 | 
25 | goto :eof
26 | 
27 | :setMinioEndpoint
28 | set ip_address_string="IPv4"
29 | for /f "usebackq tokens=2 delims=:" %%f in (`ipconfig ^| findstr /c:%ip_address_string%`) do (
30 | set ip=%%f
31 | )
32 | set ip=%ip: =%
33 | set MINIO_ENDPOINT=http://%ip%:7053
34 | exit /B 0
35 | 
36 | :findstr
37 | Set "basestr=%~1"
38 | Set "sstr=%~2"
39 | set /a pos=0
40 | Set "sst0=!basestr:*%sstr%=!"
41 | if "%sst0%"=="%basestr%" echo "%sstr%" not found in "%basestr%"&goto :eof
42 | Set "sst1=!basestr:%sstr%%sst0%=!"
43 | if "%sst1%" neq "" for /l %%i in (0,1,8189) do if "!sst1:~%%i,1!" neq "" set /a pos+=1
44 | exit /B 0
45 | 
--------------------------------------------------------------------------------
/stop:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Stop and remove all refinery containers, including orphans left behind
4 | # by older generated compose files.
5 | docker compose -f refinery/docker-compose.yml down --remove-orphans
--------------------------------------------------------------------------------
/stop.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | REM Stop and remove all refinery containers, including orphans left behind
4 | REM by older generated compose files.
5 | docker compose -f refinery\docker-compose.yml down --remove-orphans
6 | 
7 | REM Skip the interactive pause when invoked as "stop.bat update" (see update.bat)
8 | if "%1" neq "update" pause
--------------------------------------------------------------------------------
/update:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Update workflow: stop the stack, pull the latest main branch, restart.
4 | # The helper scripts are sourced (not executed) so they run in this shell.
5 | source stop
6 | 
7 | git checkout main
8 | git pull
9 | 
10 | source start
--------------------------------------------------------------------------------
/update.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | REM Update workflow: stop the stack, pull the latest main branch, restart.
4 | REM The "update" argument makes stop.bat/start.bat skip their interactive pause.
5 | call stop.bat update
6 | 
7 | git checkout main
8 | git pull
9 | 
10 | call start.bat update
11 | 
12 | pause
--------------------------------------------------------------------------------