├── config ├── config.yaml ├── samples │ └── http-dataset.yaml ├── crd │ ├── kustomizeconfig.yaml │ └── kustomization.yaml ├── rbac │ └── role.yaml ├── sample-config.yaml └── config.go ├── .tool-versions ├── internal ├── pkg │ ├── datasources │ │ ├── datasource_pixi_test.go │ │ ├── datasources.go │ │ ├── datasource_pixi.go │ │ ├── rclone.go │ │ ├── types.go │ │ ├── datasource_modelscope_test.go │ │ ├── datasource_s3_test.go │ │ ├── datasource_http_test.go │ │ ├── datasource_huggingface_test.go │ │ ├── credentials.go │ │ ├── fake.go │ │ ├── pip │ │ │ └── pip.go │ │ ├── modelscope │ │ │ ├── hub_test.go │ │ │ ├── hub.go │ │ │ └── fake │ │ │ │ └── hub.go │ │ ├── huggingface │ │ │ ├── hub_test.go │ │ │ ├── fake │ │ │ │ └── hub.go │ │ │ └── hub.go │ │ ├── datasource_modelscope.go │ │ ├── datasource_http.go │ │ ├── conda │ │ │ └── conda.go │ │ ├── datasource_git_test.go │ │ ├── datasource_huggingface.go │ │ └── datasource_s3.go │ └── constants │ │ └── const.go ├── controller │ └── dataset │ │ ├── configmap.go │ │ └── dataset_controller_test.go └── cmd │ └── dataloader │ └── root.go ├── main.go ├── .dockerignore ├── pkg ├── utils │ ├── string.go │ ├── string_test.go │ ├── writer.go │ ├── command_test.go │ ├── random.go │ ├── random_test.go │ ├── command.go │ ├── fs.go │ └── fs_test.go ├── log │ ├── log_test.go │ └── log.go ├── clients │ └── kubeconfig.go └── kubeutils │ └── kube.go ├── cmd ├── data-loader │ └── main.go └── main.go ├── manifests └── dataset │ ├── templates │ ├── serviceaccount.yaml │ ├── clusterrolebinding.yaml │ ├── service.yaml │ ├── _common.tpl │ ├── configmap.yaml │ ├── clusterrole.yaml │ ├── deployment.yaml │ └── _helpers.tpl │ ├── .helmignore │ ├── Chart.yaml │ └── values.yaml ├── .gitignore ├── PROJECT ├── .github ├── renovate.json └── workflows │ ├── ci.yml │ └── build.yml ├── hack └── boilerplate.go.txt ├── api ├── client │ ├── fake │ │ ├── doc.go │ │ ├── register.go │ │ └── clientset_generated.go │ ├── typed │ │ └── dataset │ │ │ └── v1alpha1 │ │ │ ├── generated_expansion.go │ │ │ ├── fake │ │ │ ├── doc.go │ │ │ ├── fake_dataset_client.go │ │ │ └── fake_dataset.go │ │ │ ├── doc.go │ │ │ ├── dataset.go │ │ │ └── dataset_client.go │ ├── scheme │ │ ├── doc.go │ │ └── register.go │ ├── listers │ │ └── dataset │ │ │ └── v1alpha1 │ │ │ ├── expansion_generated.go │ │ │ └── dataset.go │ ├── informers │ │ ├── internalinterfaces │ │ │ └── factory_interfaces.go │ │ ├── dataset │ │ │ ├── v1alpha1 │ │ │ │ ├── interface.go │ │ │ │ └── dataset.go │ │ │ └── interface.go │ │ └── generic.go │ └── clientset.go └── dataset │ └── v1alpha1 │ ├── groupversion_info.go │ └── zz_generated.deepcopy.go ├── Dockerfile ├── data-loader.Dockerfile ├── README.md ├── docs └── cascading-deletion-example.md └── go.mod /config/config.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | golang 1.25.1 2 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_pixi_test.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package apis 2 | 3 | import ( 4 | _ "k8s.io/code-generator" 5 | ) 6 | 7 | // 
only for controller-gen; don't edit
 8 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
 2 | # Ignore build and test binaries.
 3 | bin/
--------------------------------------------------------------------------------
/pkg/utils/string.go:
--------------------------------------------------------------------------------
 1 | package utils
 2 | 
 3 | import (
 4 | 	"strings"
 5 | )
 6 | 
 7 | // ObscureString masks every occurrence of each non-empty secret in str with "******".
 8 | func ObscureString(str string, secrets []string) string {
 9 | 	for _, v := range secrets {
10 | 		if v == "" {
11 | 			continue
12 | 		}
13 | 
14 | 		str = strings.ReplaceAll(str, v, "******")
15 | 	}
16 | 
17 | 	return str
18 | }
19 | 
--------------------------------------------------------------------------------
/cmd/data-loader/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/BaizeAI/dataset/internal/cmd/dataloader"
 5 | 	"github.com/BaizeAI/dataset/pkg/log"
 6 | )
 7 | 
 8 | func main() {
 9 | 	log.SetDebug()
10 | 
11 | 	cmd := dataloader.NewCommand()
12 | 	err := cmd.Execute()
13 | 	if err != nil {
14 | 		panic(err)
15 | 	}
16 | }
17 | 
--------------------------------------------------------------------------------
/config/samples/http-dataset.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: dataset.baizeai.io/v1alpha1
 2 | kind: Dataset
 3 | metadata:
 4 |   name: gpt2-train-data
 5 | spec:
 6 |   dataSyncRound: 1
 7 |   mountOptions:
 8 |     gid: 1000
 9 |     mode: "0774"
10 |     path: /
11 |     uid: 1000
12 |   source:
13 |     type: HTTP
14 |     uri: http://baize-ai.daocloud.io/gpt2-train-data/
15 | 
--------------------------------------------------------------------------------
/pkg/utils/string_test.go:
--------------------------------------------------------------------------------
 1 | package utils
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | )
 8 | 
 9 | func TestObscureString(t *testing.T) {
10 | 	str := ObscureString("test", []string{"test"})
11 | 	assert.Equal(t, "******", str)
12 | 
13 | 	str = ObscureString("test-secret", []string{"secret"})
14 | 	assert.Equal(t, "test-******", str)
15 | }
16 | 
--------------------------------------------------------------------------------
/internal/pkg/datasources/datasources.go:
--------------------------------------------------------------------------------
 1 | package datasources
 2 | 
 3 | import (
 4 | 	"os"
 5 | )
 6 | 
 7 | type Options struct {
 8 | 	// Primary arguments
 9 | 	Type Type
10 | 	URI  string
11 | 
12 | 	// --options flags
13 | 	Path string
14 | 	Mode os.FileMode
15 | 	UID  int
16 | 	GID  int
17 | 	Root string
18 | }
19 | 
20 | // Loader syncs data from a source URI into a local path.
21 | type Loader interface {
22 | 	Sync(fromURI string, toPath string) error
23 | }
24 | 
--------------------------------------------------------------------------------
/manifests/dataset/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: {{ include "dataset.serviceAccountName" . }}
 6 |   labels:
 7 |     {{- include "dataset.labels" . | nindent 4 }}
 8 |   {{- with .Values.serviceAccount.annotations }}
 9 |   annotations:
10 |     {{- toYaml . 
| nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_pixi.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | var _ Loader = &PixiLoader{} 4 | 5 | type PixiLoader struct { 6 | Options Options 7 | } 8 | 9 | func NewPixiLoader(datasourceOption map[string]string, options Options, secrets Secrets) (*PixiLoader, error) { 10 | return &PixiLoader{}, nil 11 | } 12 | 13 | func (l *PixiLoader) Sync(fromURI string, toPath string) error { 14 | return nil 15 | } 16 | -------------------------------------------------------------------------------- /pkg/log/log_test.go: -------------------------------------------------------------------------------- 1 | package log 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestInitEngine(t *testing.T) { 10 | cfg := &Config{ 11 | Output: "", 12 | Debug: true, 13 | } 14 | InitEngine(cfg) 15 | assert.Equal(t, "debug", GetLevel().String()) 16 | } 17 | 18 | func TestSetDebug(t *testing.T) { 19 | SetDebug() 20 | assert.Equal(t, "debug", GetLevel().String()) 21 | } 22 | -------------------------------------------------------------------------------- /manifests/dataset/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "dataset.fullname" . }} 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: {{ include "dataset.fullname" . }} 9 | subjects: 10 | - kind: ServiceAccount 11 | name: {{ include "dataset.serviceAccountName" . }} 12 | namespace: {{ .Release.Namespace }} 13 | -------------------------------------------------------------------------------- /manifests/dataset/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "dataset.fullname" . }} 5 | labels: 6 | {{- include "dataset.labels" . | nindent 4 }} 7 | spec: 8 | type: {{ .Values.service.type }} 9 | ports: 10 | - port: {{ .Values.service.port }} 11 | targetPort: http 12 | protocol: TCP 13 | name: http 14 | selector: 15 | {{- include "dataset.selectorLabels" . | nindent 4 }} 16 | -------------------------------------------------------------------------------- /manifests/dataset/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
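# Example (illustrative only): a line like `!keep.txt` would re-include a file
# that an earlier, broader pattern such as `*.txt` had excluded.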
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /internal/pkg/datasources/rclone.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "os/exec" 5 | 6 | "github.com/BaizeAI/dataset/pkg/log" 7 | "github.com/BaizeAI/dataset/pkg/utils" 8 | ) 9 | 10 | func rcloneCliConfigTouch() error { 11 | cmd := exec.Command("rclone", "config", "touch") 12 | logger := log.WithField("command", cmd.String()) 13 | 14 | logger.Debug("executing command to touch rclone config") 15 | 16 | err := utils.ExecuteCommand(logger, cmd, nil) 17 | 18 | return err 19 | } 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin/* 9 | Dockerfile.cross 10 | 11 | # Test binary, build with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Kubernetes Generated files - skip generated files, except for vendored files 18 | 19 | !vendor/**/zz_generated.* 20 | 21 | # editor and IDE paraphernalia 22 | .idea 23 | .vscode 24 | *.swp 25 | *.swo 26 | *~ 27 | .gitpod.yml 28 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 
4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: baize.io 6 | layout: 7 | - go.kubebuilder.io/v4 8 | multigroup: true 9 | projectName: kube 10 | repo: baize.io/api/kube 11 | resources: 12 | - api: 13 | crdVersion: v1 14 | namespaced: true 15 | domain: baize.io 16 | group: dataset 17 | kind: Dataset 18 | path: baize.io/api/kube/api/dataset/v1alpha1 19 | version: v1alpha1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /internal/pkg/datasources/types.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | type Type string 4 | 5 | const ( 6 | TypeS3 Type = "S3" 7 | TypeGit Type = "GIT" 8 | TypeHTTP Type = "HTTP" 9 | TypeConda Type = "CONDA" 10 | TypeHuggingFace Type = "HUGGING_FACE" 11 | TypeModelScope Type = "MODEL_SCOPE" 12 | ) 13 | 14 | var ( 15 | SupportedTypesString = []string{string(TypeS3), string(TypeGit), string(TypeHTTP), string(TypeConda), string(TypeHuggingFace), string(TypeModelScope)} 16 | SupportedTypes = []Type{TypeS3, TypeGit, TypeHTTP, TypeConda, TypeHuggingFace, TypeModelScope} 17 | ) 18 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended", 5 | ":dependencyDashboard", 6 | 7 | "schedule:weekly", 8 | ":prHourlyLimitNone", 9 | ":prConcurrentLimitNone", 10 | 11 | ":semanticPrefixFixDepsChoreOthers", 12 | ":ignoreModulesAndTests", 13 | 14 | "group:monorepos", 15 | "group:recommended", 16 | "group:allNonMajor", 17 | 18 | "replacements:all", 19 | "workarounds:all" 20 | ], 21 | "labels": ["dependencies"], 22 | "rangeStrategy": "bump", 23 | "automerge": false 24 | } 25 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ -------------------------------------------------------------------------------- /manifests/dataset/templates/_common.tpl: -------------------------------------------------------------------------------- 1 | 2 | {{- define "common.images.image" -}} 3 | {{- $registryName := .imageRoot.registry -}} 4 | {{- $repositoryName := .imageRoot.repository -}} 5 | {{- $tag := .defaultTag -}} 6 | {{- if .global }} 7 | {{- if .global.imageRegistry }} 8 | {{- $registryName = .global.imageRegistry -}} 9 | {{- end -}} 10 | {{- end -}} 11 | {{- if .imageRoot.registry }} 12 | {{- $registryName = .imageRoot.registry -}} 13 | {{- end -}} 14 | {{- if .imageRoot.tag }} 15 | {{- $tag = .imageRoot.tag -}} 16 | {{- end -}} 17 | {{- if $registryName }} 18 | {{- printf "%s/%s:%s" $registryName $repositoryName $tag -}} 19 | {{- else -}} 20 | {{- printf "%s:%s" $repositoryName $tag -}} 21 | {{- end -}} 22 | {{- end -}} 23 | -------------------------------------------------------------------------------- /api/client/fake/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | // This package has the automatically generated fake clientset. 19 | package fake 20 | -------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | type DatasetExpansion interface{} 21 | -------------------------------------------------------------------------------- /api/client/scheme/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | // This package contains the scheme of the automatically generated clientset. 19 | package scheme 20 | -------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/fake/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | // Package fake has the automatically generated clients. 19 | package fake 20 | -------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | // This package has the automatically generated typed clients. 
19 | package v1alpha1 20 | -------------------------------------------------------------------------------- /pkg/utils/writer.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "io" 5 | "strings" 6 | ) 7 | 8 | var _ io.Writer = &ObscuredWriter{} 9 | 10 | type ObscuredWriter struct { 11 | w io.Writer 12 | secrets []string 13 | } 14 | 15 | func NewObscuredWriter(wrappedWriter io.Writer, secrets []string) *ObscuredWriter { 16 | return &ObscuredWriter{ 17 | w: wrappedWriter, 18 | secrets: secrets, 19 | } 20 | } 21 | 22 | func (w *ObscuredWriter) Write(p []byte) (int, error) { 23 | original := p 24 | 25 | for _, secret := range w.secrets { 26 | if secret == "" { 27 | continue 28 | } 29 | 30 | p = []byte(strings.ReplaceAll(string(p), secret, "******")) 31 | } 32 | 33 | n, err := w.w.Write(p) 34 | if err != nil { 35 | return len(original), err 36 | } 37 | if n != len(p) { 38 | return len(original), io.ErrShortWrite 39 | } 40 | 41 | return len(original), nil 42 | } 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM --platform=$BUILDPLATFORM golang:1.25 as builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | WORKDIR /workspace 7 | # Copy the Go Modules manifests 8 | COPY go.mod go.mod 9 | COPY go.sum go.sum 10 | # cache deps before building and copying source so that we don't need to re-download as much 11 | # and so that source changes don't invalidate our downloaded layer 12 | RUN go mod download 13 | 14 | COPY . . 15 | 16 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -ldflags "-s -w" -a -o ./manager ./cmd/ 17 | 18 | # Use distroless as minimal base image to package the manager binary 19 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 20 | FROM gcr.io/distroless/static:nonroot 21 | WORKDIR /app 22 | COPY --from=builder /workspace/manager . 23 | USER 65532:65532 24 | 25 | ENTRYPOINT ["/app/manager"] 26 | -------------------------------------------------------------------------------- /pkg/utils/command_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "os/exec" 6 | "testing" 7 | 8 | "github.com/sirupsen/logrus" 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | 12 | "github.com/BaizeAI/dataset/pkg/log" 13 | ) 14 | 15 | func TestExecuteCommandWithAllOutput(t *testing.T) { 16 | logger := log.WithFields(logrus.Fields{ 17 | "test": "TestExecuteCommandWithAllOutput", 18 | }) 19 | d, _ := os.MkdirTemp("", "fakeCommand-*") 20 | defer func() { 21 | assert.NoError(t, os.RemoveAll(d)) 22 | }() 23 | require.NoError(t, os.WriteFile(d+"/test_output_0", []byte("output"), 0600)) 24 | require.NoError(t, os.WriteFile(d+"/test_output_1", []byte("error"), 0600)) 25 | t.Run("run with secret", func(t *testing.T) { 26 | o, _, err := ExecuteCommandWithAllOutput(logger, exec.Command("ls", d), []string{"test_output_1"}) 27 | assert.NoError(t, err) 28 | assert.Equal(t, "test_output_0\n******\n", o.String()) 29 | }) 30 | } 31 | -------------------------------------------------------------------------------- /api/client/listers/dataset/v1alpha1/expansion_generated.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by lister-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | // DatasetListerExpansion allows custom methods to be added to 21 | // DatasetLister. 22 | type DatasetListerExpansion interface{} 23 | 24 | // DatasetNamespaceListerExpansion allows custom methods to be added to 25 | // DatasetNamespaceLister. 26 | type DatasetNamespaceListerExpansion interface{} 27 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - namespaces 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - persistentvolumeclaims 19 | verbs: 20 | - create 21 | - delete 22 | - get 23 | - list 24 | - patch 25 | - update 26 | - watch 27 | - apiGroups: 28 | - "" 29 | resources: 30 | - persistentvolumes 31 | verbs: 32 | - delete 33 | - get 34 | - list 35 | - watch 36 | - apiGroups: 37 | - dataset.baizeai.io 38 | resources: 39 | - datasets 40 | verbs: 41 | - create 42 | - delete 43 | - get 44 | - list 45 | - patch 46 | - update 47 | - watch 48 | - apiGroups: 49 | - dataset.baizeai.io 50 | resources: 51 | - datasets/finalizers 52 | verbs: 53 | - update 54 | - apiGroups: 55 | - dataset.baizeai.io 56 | resources: 57 | - datasets/status 58 | verbs: 59 | - get 60 | - patch 61 | - update 62 | -------------------------------------------------------------------------------- /config/sample-config.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for the Dataset Controller 2 | # 3 | # To enable cascading deletion of reference datasets when the source dataset is deleted, 4 | # set enable_cascading_deletion to true. When enabled, if a dataset is deleted and other 5 | # datasets reference it (via DatasetTypeReference), those referencing datasets will also 6 | # be automatically deleted along with their associated retained PVs. 
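# For example (hypothetical names): if dataset "base" is deleted while dataset
# "derived" references it via a Reference-type source, "derived" is deleted too,
# along with its retained PV. See docs/cascading-deletion-example.md for a walkthrough.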
7 | # 8 | # Default: false (disabled for safety) 9 | enable_cascading_deletion: false 10 | 11 | # Custom job specification for dataset loading jobs (optional) 12 | # If not specified, a default job specification will be used 13 | # dataset_job_spec_yaml: | 14 | # backoffLimit: 4 15 | # completionMode: NonIndexed 16 | # completions: 1 17 | # parallelism: 1 18 | # template: 19 | # spec: 20 | # restartPolicy: Never 21 | # containers: 22 | # - image: ubuntu:20.04 23 | # command: ["/bin/bash", "-c", "echo 'Container args: '$(echo $@)"] 24 | # resources: 25 | # requests: 26 | # cpu: 100m 27 | # memory: 100Mi 28 | # limits: 29 | # cpu: 500m 30 | # memory: 500Mi -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/serving.baize.io_inferences.yaml 6 | - bases/serving.baize.io_models.yaml 7 | #+kubebuilder:scaffold:crdkustomizeresource 8 | 9 | patches: 10 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 11 | # patches here are for enabling the conversion webhook for each CRD 12 | #- path: patches/webhook_in_serving_inferences.yaml 13 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 14 | 15 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. 16 | # patches here are for enabling the CA injection for each CRD 17 | #- path: patches/cainjection_in_serving_inferences.yaml 18 | #- path: patches/cainjection_in_serving_models.yaml 19 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 20 | 21 | # [WEBHOOK] To enable webhook, uncomment the following section 22 | # the following config is for teaching kustomize how to do kustomization for CRDs. 23 | 24 | #configurations: 25 | #- kustomizeconfig.yaml 26 | -------------------------------------------------------------------------------- /data-loader.Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM --platform=$BUILDPLATFORM golang:1.25 as builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | WORKDIR /workspace 7 | # Copy the Go Modules manifests 8 | COPY go.mod go.mod 9 | COPY go.sum go.sum 10 | # cache deps before building and copying source so that we don't need to re-download as much 11 | # and so that source changes don't invalidate our downloaded layer 12 | RUN go mod download 13 | 14 | COPY . . 
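# Cross-compile a static (CGO-disabled) data-loader binary for the target
# platform; "-s -w" strips symbol tables and debug info to shrink the image.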
15 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -ldflags "-s -w" -a -o data-loader ./cmd/data-loader
16 | 
17 | FROM python:3.13
18 | 
19 | RUN pip install --no-cache-dir "huggingface_hub[cli]"==0.33.1 modelscope==1.27.1 setuptools && \
20 |     rclone_version=v1.70.1 && \
21 |     arch=$(uname -m | sed -E 's/x86_64/amd64/g;s/aarch64/arm64/g') && \
22 |     filename=rclone-${rclone_version}-linux-${arch} && \
23 |     wget https://github.com/rclone/rclone/releases/download/${rclone_version}/${filename}.zip -O ${filename}.zip && \
24 |     unzip ${filename}.zip && mv ${filename}/rclone /usr/local/bin && rm -rf ${filename} ${filename}.zip
25 | 
26 | 
27 | COPY --from=builder /workspace/data-loader /usr/local/bin/
28 | 
29 | ENTRYPOINT ["/usr/local/bin/data-loader"]
30 | 
--------------------------------------------------------------------------------
/manifests/dataset/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: dataset
 3 | description: A Helm chart for Kubernetes
 4 | 
 5 | # A chart can be either an 'application' or a 'library' chart.
 6 | #
 7 | # Application charts are a collection of templates that can be packaged into versioned archives
 8 | # to be deployed.
 9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 | 
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.1.0
19 | 
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: "1.16.0"
25 | 
--------------------------------------------------------------------------------
/internal/pkg/constants/const.go:
--------------------------------------------------------------------------------
 1 | package constants
 2 | 
 3 | const (
 4 | 	// DatasetJobSecretsMountPath is the path to the directory where the
 5 | 	// dataset job secrets are mounted.
 6 | 	DatasetJobSecretsMountPath string = "/run/dataset/secrets" // #nosec G101
 7 | 	DatasetJobCondaConfigDir string = "/run/dataset/conda"
 8 | 	DatasetJobCondaCondaEnvironmentYAMLFilename string = "environment.yaml"
 9 | 	DatasetJobCondaPipRequirementsTxtFilename string = "requirements.txt"
10 | 	DatasetJobCondaCondaEnvironmentYAMLPath string = DatasetJobCondaConfigDir + "/" + DatasetJobCondaCondaEnvironmentYAMLFilename
11 | 	DatasetJobCondaPipRequirementsTxtPath string = DatasetJobCondaConfigDir + "/" + DatasetJobCondaPipRequirementsTxtFilename
12 | 
13 | 	DatasetJobCondaMountDir = "/opt/baize-runtime-env"
14 | 
15 | 	HamiVGPUTypeAnnotationName = "nvidia.com/use-gputype"
16 | )
17 | 
18 | const (
19 | 	// The default baize-base env path for the conda env,
20 | 	// used for tensorboard, etc.
21 | 	//
22 | 	// Currently the analysis component uses this path.
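	// For example (hypothetical usage), a tool would be resolved as
	// CondaEnvBaizeBaseBin + "/tensorboard".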
23 | 	CondaEnvBaizeBase string = "/opt/conda/envs/baize-base"
24 | 	CondaEnvBaizeBaseBin string = CondaEnvBaizeBase + "/bin"
25 | 
26 | 	DatasetNameLabel = "baize.io/dataset-name"
27 | )
28 | 
--------------------------------------------------------------------------------
/pkg/utils/random.go:
--------------------------------------------------------------------------------
 1 | package utils
 2 | 
 3 | import (
 4 | 	"crypto/rand"
 5 | 	"crypto/sha256"
 6 | 	"fmt"
 7 | 	"math/big"
 8 | )
 9 | 
10 | // RandomInt64 returns a uniformly random int64 in [0, upper); the bound
11 | // defaults to 9999999999 when upper is omitted or non-positive.
12 | func RandomInt64(upper ...int64) int64 { //nolint:gosec
13 | 	var innerMax int64
14 | 	if len(upper) == 0 || upper[0] <= 0 {
15 | 		innerMax = 9999999999
16 | 	} else {
17 | 		innerMax = upper[0]
18 | 	}
19 | 
20 | 	nBig, _ := rand.Int(rand.Reader, big.NewInt(innerMax))
21 | 	n := nBig.Int64()
22 | 
23 | 	return n
24 | }
25 | 
26 | // RandBytes generates a random byte slice of the given length, defaulting to 32.
27 | func RandBytes(length ...int) ([]byte, error) {
28 | 	b := make([]byte, 32)
29 | 	if len(length) != 0 {
30 | 		b = make([]byte, length[0])
31 | 	}
32 | 
33 | 	_, err := rand.Read(b)
34 | 	if err != nil {
35 | 		return nil, err
36 | 	}
37 | 
38 | 	return b, nil
39 | }
40 | 
41 | // RandomHashString returns a random SHA-256 hex string truncated to the given
42 | // length, capped at 64 characters (the default).
43 | func RandomHashString(length ...int) string {
44 | 	b, _ := RandBytes(1024)
45 | 	if len(length) != 0 {
46 | 		sliceLength := length[0]
47 | 		if length[0] > 64 {
48 | 			sliceLength = 64
49 | 		}
50 | 		if length[0] <= 0 {
51 | 			sliceLength = 64
52 | 		}
53 | 
54 | 		return fmt.Sprintf("%x", sha256.Sum256(b))[:sliceLength]
55 | 	}
56 | 
57 | 	return fmt.Sprintf("%x", sha256.Sum256(b))
58 | }
59 | 
--------------------------------------------------------------------------------
/api/client/typed/dataset/v1alpha1/fake/fake_dataset_client.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2023.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 | http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | // Code generated by client-gen. DO NOT EDIT.
17 | 
18 | package fake
19 | 
20 | import (
21 | 	v1alpha1 "github.com/BaizeAI/dataset/api/client/typed/dataset/v1alpha1"
22 | 	rest "k8s.io/client-go/rest"
23 | 	testing "k8s.io/client-go/testing"
24 | )
25 | 
26 | type FakeDatasetV1alpha1 struct {
27 | 	*testing.Fake
28 | }
29 | 
30 | func (c *FakeDatasetV1alpha1) Datasets(namespace string) v1alpha1.DatasetInterface {
31 | 	return newFakeDatasets(c, namespace)
32 | }
33 | 
34 | // RESTClient returns a RESTClient that is used to communicate
35 | // with API server by this client implementation.
36 | func (c *FakeDatasetV1alpha1) RESTClient() rest.Interface { 37 | var ret *rest.RESTClient 38 | return ret 39 | } 40 | -------------------------------------------------------------------------------- /pkg/clients/kubeconfig.go: -------------------------------------------------------------------------------- 1 | package clients 2 | 3 | import ( 4 | "os" 5 | 6 | "k8s.io/client-go/rest" 7 | "k8s.io/client-go/tools/clientcmd" 8 | ) 9 | 10 | func GetK8sConfigConfigWithFile(kubeconfig, context string) *rest.Config { 11 | var config *rest.Config 12 | if kubeconfig == "" && context == "" { 13 | config, _ := rest.InClusterConfig() 14 | if config != nil { 15 | return config 16 | } 17 | } 18 | if kubeconfig != "" { 19 | info, err := os.Stat(kubeconfig) 20 | if err != nil || info.Size() == 0 { 21 | // If the specified kubeconfig doesn't exists / empty file / any other error 22 | // from file stat, fall back to default 23 | kubeconfig = "" 24 | } 25 | } 26 | 27 | // Config loading rules: 28 | // 1. kubeconfig if it not empty string 29 | // 2. In cluster config if running in-cluster 30 | // 3. Config(s) in KUBECONFIG environment variable 31 | // 4. Use $HOME/.kube/config 32 | loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() 33 | loadingRules.DefaultClientConfig = &clientcmd.DefaultClientConfig 34 | loadingRules.ExplicitPath = kubeconfig 35 | configOverrides := &clientcmd.ConfigOverrides{ 36 | ClusterDefaults: clientcmd.ClusterDefaults, 37 | CurrentContext: context, 38 | } 39 | 40 | config, _ = clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, configOverrides).ClientConfig() 41 | return config 42 | } 43 | -------------------------------------------------------------------------------- /manifests/dataset/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- define "defaultJobSpec" }} 2 | backoffLimit: 4 3 | completionMode: NonIndexed 4 | completions: 1 5 | parallelism: 1 6 | template: 7 | spec: 8 | restartPolicy: Never 9 | securityContext: 10 | runAsUser: 0 11 | containers: 12 | - image: {{ template "dataset.data-loader.image" . }} 13 | command: 14 | - /usr/local/bin/data-loader 15 | resources: 16 | requests: 17 | cpu: 100m 18 | memory: 100Mi 19 | limits: 20 | cpu: 2000m 21 | memory: 2000Mi 22 | {{end}} 23 | 24 | apiVersion: v1 25 | kind: ConfigMap 26 | metadata: 27 | name: {{ include "dataset.fullname" . }} 28 | namespace: {{ .Release.Namespace }} 29 | labels: 30 | app: {{ include "dataset.fullname" . }} 31 | data: 32 | config.yaml: |- 33 | debug: {{.Values.global.debug }} 34 | enable_cascading_deletion: {{ .Values.config.enable_cascading_deletion }} 35 | dataset_job_spec_yaml: |- 36 | {{- if .Values.config.dataset_job_spec}} 37 | {{- $cus := .Values.config.dataset_job_spec }} 38 | {{- $d := include "defaultJobSpec" . | fromYaml }} 39 | {{- $merged := merge $cus $d }} 40 | {{- toYaml $merged | nindent 6}} 41 | {{- else }} 42 | {{- $d := include "defaultJobSpec" . 
| fromYaml }} 43 | {{- toYaml $d | nindent 6 }} 44 | {{end}} 45 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build-test: 13 | name: Build Test 14 | runs-on: "ubuntu-latest" 15 | 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v6 19 | 20 | - uses: actions/setup-go@v6 21 | with: 22 | go-version: "stable" 23 | cache: true 24 | 25 | - name: Test Build 26 | run: go build ./... 27 | 28 | lint: 29 | name: Lint 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v6 33 | 34 | - uses: actions/setup-go@v6 35 | with: 36 | go-version: "stable" 37 | cache: true 38 | 39 | - name: golangci-lint 40 | uses: golangci/golangci-lint-action@v8 41 | with: 42 | # Optional: golangci-lint command line arguments. 43 | args: "--timeout=10m" 44 | 45 | unittest: 46 | name: Unit Test 47 | runs-on: ubuntu-latest 48 | 49 | steps: 50 | - uses: actions/checkout@v6 51 | 52 | - name: Setup Go 53 | uses: actions/setup-go@v6 54 | with: 55 | go-version: "stable" 56 | cache: true 57 | 58 | - name: Unit tests 59 | run: | 60 | go test ./... -coverprofile=coverage.out -covermode=atomic -p=1 61 | go tool cover -func coverage.out 62 | -------------------------------------------------------------------------------- /pkg/utils/random_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestRandomInt64(t *testing.T) { 10 | t.Run("Default", func(t *testing.T) { 11 | assert := assert.New(t) 12 | 13 | for i := 0; i < 1000; i++ { 14 | assert.NotZero(RandomInt64()) 15 | } 16 | }) 17 | 18 | t.Run("WithMax", func(t *testing.T) { 19 | assert := assert.New(t) 20 | 21 | for i := 0; i < 1000; i++ { 22 | assert.LessOrEqual(RandomInt64(10), int64(10)) 23 | } 24 | }) 25 | } 26 | 27 | func TestRandomBytes(t *testing.T) { 28 | t.Run("no args", func(t *testing.T) { 29 | assert := assert.New(t) 30 | 31 | rand1, _ := RandBytes() 32 | rand2, _ := RandBytes() 33 | 34 | assert.NotEqual(rand1, rand2) 35 | assert.Equal(len(rand1), 32) 36 | assert.Equal(len(rand2), 32) 37 | }) 38 | t.Run("with args", func(t *testing.T) { 39 | assert := assert.New(t) 40 | 41 | arg := 123 42 | rand1, _ := RandBytes(arg) 43 | rand2, _ := RandBytes(arg) 44 | 45 | assert.NotEqual(rand1, rand2) 46 | assert.Equal(len(rand1), arg) 47 | assert.Equal(len(rand2), arg) 48 | }) 49 | } 50 | 51 | func TestRandomHashString(t *testing.T) { 52 | assert := assert.New(t) 53 | 54 | hashString := RandomHashString() 55 | assert.NotEmpty(hashString) 56 | assert.Len(hashString, 64) 57 | 58 | hashString2 := RandomHashString(32) 59 | assert.NotEmpty(hashString2) 60 | assert.Len(hashString2, 32) 61 | } 62 | -------------------------------------------------------------------------------- /manifests/dataset/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "dataset.fullname" . 
}} 6 | rules: 7 | - apiGroups: 8 | - dataset.baizeai.io 9 | resources: 10 | - datasets 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - dataset.baizeai.io 21 | resources: 22 | - datasets/finalizers 23 | verbs: 24 | - update 25 | - apiGroups: 26 | - dataset.baizeai.io 27 | resources: 28 | - datasets/status 29 | verbs: 30 | - get 31 | - patch 32 | - update 33 | - apiGroups: 34 | - coordination.k8s.io 35 | resources: 36 | - leases 37 | verbs: 38 | - "*" 39 | - apiGroups: 40 | - "batch" 41 | resources: 42 | - "jobs" 43 | verbs: 44 | - "*" 45 | - apiGroups: 46 | - "" 47 | resources: 48 | - "pods" 49 | - "persistentvolumeclaims" 50 | - "secrets" 51 | - "persistentvolumes" 52 | - "configmaps" 53 | - "services" 54 | - "events" 55 | verbs: 56 | - "*" 57 | - apiGroups: 58 | - "" 59 | resources: 60 | - "namespaces" 61 | verbs: 62 | - get 63 | - watch 64 | - list 65 | - apiGroups: 66 | - "apps" 67 | resources: 68 | - "deployments" 69 | verbs: 70 | - "*" 71 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_modelscope_test.go: -------------------------------------------------------------------------------- 1 | // nolint: dupl 2 | package datasources 3 | 4 | import ( 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestModelScopeLoader(t *testing.T) { 14 | loader, err := NewModelScopeLoader(map[string]string{}, Options{ 15 | Type: "", 16 | URI: "modelscope://ns/model", 17 | Path: "", 18 | Mode: 0, 19 | UID: 0, 20 | GID: 0, 21 | Root: "", 22 | }, Secrets{ 23 | Token: "test-token", 24 | }) 25 | assert.NoError(t, err) 26 | fakeHTTP := fakeCommand{ 27 | t: t, 28 | cmd: "modelscope", 29 | outputs: []out{ 30 | { 31 | stdout: "login", 32 | stderr: "", 33 | exit: 0, 34 | }, 35 | { 36 | stdout: "download", 37 | stderr: "", 38 | exit: 0, 39 | }, 40 | }, 41 | } 42 | defer func() { 43 | assert.NoError(t, fakeHTTP.Clean()) 44 | }() 45 | modelScopeDir, _ := os.MkdirTemp("", "modelScopeLoader-*") 46 | defer func() { 47 | assert.NoError(t, os.RemoveAll(modelScopeDir)) 48 | }() 49 | assert.NoError(t, err) 50 | fakeHTTP.WithContext(func() { 51 | err = loader.Sync("modelscope://ns/model", modelScopeDir) 52 | assert.NoError(t, err) 53 | }) 54 | bbs := fakeHTTP.GetAllInputs() 55 | require.Len(t, bbs, 2) 56 | assert.Equal(t, string(bbs[0]), "login --token test-token\n") 57 | assert.Equal(t, string(bbs[1]), strings.Join([]string{"download", "ns/model", "--local_dir", modelScopeDir}, " ")+"\n") 58 | } 59 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_s3_test.go: -------------------------------------------------------------------------------- 1 | // nolint: dupl 2 | package datasources 3 | 4 | import ( 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestS3Loader(t *testing.T) { 13 | loader, err := NewS3Loader(map[string]string{ 14 | "region": "us-east-1", 15 | }, Options{ 16 | Type: "", 17 | URI: "s3://test-bucket", 18 | Path: "", 19 | Mode: 0, 20 | UID: 0, 21 | GID: 0, 22 | Root: "", 23 | }, Secrets{ 24 | AKSKAccessKeyID: "accid", 25 | AKSKSecretAccessKey: "acckey", 26 | }) 27 | assert.NoError(t, err) 28 | fakeHTTP := fakeCommand{ 29 | t: t, 30 | cmd: "rclone", 31 | outputs: []out{ 32 | { 33 | stdout: "clone", 34 | stderr: "", 35 | exit: 0, 36 | }, 37 | { 38 
| stdout: "config", 39 | stderr: "", 40 | exit: 0, 41 | }, 42 | { 43 | stdout: "config", 44 | stderr: "", 45 | exit: 0, 46 | }, 47 | }, 48 | } 49 | defer func() { 50 | assert.NoError(t, fakeHTTP.Clean()) 51 | }() 52 | s3Dir, _ := os.MkdirTemp("", "s3Loader-*") 53 | defer func() { 54 | assert.NoError(t, os.RemoveAll(s3Dir)) 55 | }() 56 | assert.NoError(t, err) 57 | fakeHTTP.WithContext(func() { 58 | err = loader.Sync("s3://test-bucket", s3Dir) 59 | assert.NoError(t, err) 60 | }) 61 | bbs := fakeHTTP.GetAllInputs() 62 | assert.Equal(t, []byte("config touch\n"), bbs[0]) 63 | assert.True(t, strings.HasPrefix(string(bbs[1]), "config create")) 64 | assert.True(t, strings.HasPrefix(string(bbs[2]), "sync")) 65 | } 66 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_http_test.go: -------------------------------------------------------------------------------- 1 | // nolint: dupl 2 | package datasources 3 | 4 | import ( 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestHTTPLoader(t *testing.T) { 13 | httpLoader, err := NewHTTPLoader(map[string]string{ 14 | "branch": "master", 15 | }, Options{ 16 | Type: "", 17 | URI: "https://test.com", 18 | Path: "", 19 | Mode: 0, 20 | UID: 0, 21 | GID: 0, 22 | Root: "", 23 | }, Secrets{ 24 | Username: "test-username", 25 | Password: "test-password", 26 | }) 27 | assert.NoError(t, err) 28 | fakeHTTP := fakeCommand{ 29 | t: t, 30 | cmd: "rclone", 31 | outputs: []out{ 32 | { 33 | stdout: "clone", 34 | stderr: "", 35 | exit: 0, 36 | }, 37 | { 38 | stdout: "config", 39 | stderr: "", 40 | exit: 0, 41 | }, 42 | { 43 | stdout: "config", 44 | stderr: "", 45 | exit: 0, 46 | }, 47 | }, 48 | } 49 | defer func() { 50 | assert.NoError(t, fakeHTTP.Clean()) 51 | }() 52 | gitDir, _ := os.MkdirTemp("", "httpLoader-*") 53 | defer func() { 54 | assert.NoError(t, os.RemoveAll(gitDir)) 55 | }() 56 | assert.NoError(t, err) 57 | fakeHTTP.WithContext(func() { 58 | err = httpLoader.Sync("http://test.com", gitDir) 59 | assert.NoError(t, err) 60 | }) 61 | bbs := fakeHTTP.GetAllInputs() 62 | assert.Equal(t, []byte("config touch\n"), bbs[0]) 63 | assert.True(t, strings.HasPrefix(string(bbs[1]), "config create")) 64 | assert.True(t, strings.HasPrefix(string(bbs[2]), "sync")) 65 | } 66 | -------------------------------------------------------------------------------- /api/client/informers/internalinterfaces/factory_interfaces.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by informer-gen. DO NOT EDIT. 
17 | 
18 | package internalinterfaces
19 | 
20 | import (
21 | 	time "time"
22 | 
23 | 	client "github.com/BaizeAI/dataset/api/client"
24 | 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25 | 	runtime "k8s.io/apimachinery/pkg/runtime"
26 | 	cache "k8s.io/client-go/tools/cache"
27 | )
28 | 
29 | // NewInformerFunc takes client.Interface and time.Duration to return a SharedIndexInformer.
30 | type NewInformerFunc func(client.Interface, time.Duration) cache.SharedIndexInformer
31 | 
32 | // SharedInformerFactory a small interface to allow for adding an informer without an import cycle
33 | type SharedInformerFactory interface {
34 | 	Start(stopCh <-chan struct{})
35 | 	InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer
36 | }
37 | 
38 | // TweakListOptionsFunc is a function that transforms a v1.ListOptions.
39 | type TweakListOptionsFunc func(*v1.ListOptions)
40 | 
--------------------------------------------------------------------------------
/pkg/log/log.go:
--------------------------------------------------------------------------------
 1 | package log
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"runtime"
 7 | 	"strings"
 8 | 
 9 | 	log "github.com/sirupsen/logrus"
10 | )
11 | 
12 | var (
13 | 	Debug = log.Debug
14 | 	Debugf = log.Debugf
15 | 	Info = log.Info
16 | 	Infof = log.Infof
17 | 	Warn = log.Warn
18 | 	Warning = log.Warn
19 | 	Warnf = log.Warnf
20 | 	Error = log.Error
21 | 	Errorf = log.Errorf
22 | 	Fatal = log.Fatal
23 | 	Fatalf = log.Fatalf
24 | 	WithField = log.WithField
25 | 	WithFields = log.WithFields
26 | 	AccessLog = log.New()
27 | )
28 | 
29 | func init() {
30 | 	log.SetFormatter(&log.TextFormatter{
31 | 		DisableTimestamp: false,
32 | 		FullTimestamp: true,
33 | 		DisableLevelTruncation: true,
34 | 		DisableColors: true,
35 | 		CallerPrettyfier: func(f *runtime.Frame) (string, string) {
36 | 			fs := strings.Split(f.File, "/")
37 | 			filename := fs[len(fs)-1]
38 | 			ff := strings.Split(f.Function, "/")
39 | 			_f := ff[len(ff)-1]
40 | 			return fmt.Sprintf("%s()", _f), fmt.Sprintf("%s:%d", filename, f.Line)
41 | 		},
42 | 	})
43 | 	log.SetOutput(os.Stdout)
44 | 	log.SetLevel(log.InfoLevel)
45 | 	log.SetReportCaller(true)
46 | 	AccessLog.SetFormatter(&log.TextFormatter{
47 | 		DisableColors: true,
48 | 		FullTimestamp: true,
49 | 	})
50 | }
51 | 
52 | func SetDebug() {
53 | 	log.SetLevel(log.DebugLevel)
54 | }
55 | 
56 | func GetLevel() log.Level {
57 | 	return log.GetLevel()
58 | }
59 | 
60 | type Config struct {
61 | 	Output string `json:"output"` // Path of the log output file; logs go to the terminal when empty
62 | 	Debug bool `json:"debug"`
63 | }
64 | 
65 | func InitEngine(config *Config) {
66 | 	if config != nil && config.Debug {
67 | 		SetDebug()
68 | 	}
69 | }
70 | 
--------------------------------------------------------------------------------
/api/dataset/v1alpha1/groupversion_info.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2023.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 | http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */ 16 | 17 | // Package v1alpha1 contains API Schema definitions for the dataset v1alpha1 API group 18 | // +kubebuilder:object:generate=true 19 | // +groupName=dataset.baizeai.io 20 | package v1alpha1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects 29 | GroupVersion = schema.GroupVersion{Group: "dataset.baizeai.io", Version: "v1alpha1"} 30 | SchemeGroupVersion = GroupVersion // for client-go 31 | 32 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 33 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 34 | 35 | // AddToScheme adds the types in this group-version to the given scheme. 36 | AddToScheme = SchemeBuilder.AddToScheme 37 | ) 38 | 39 | // Resource takes an unqualified resource and returns a Group qualified GroupResource 40 | func Resource(resource string) schema.GroupResource { 41 | return SchemeGroupVersion.WithResource(resource).GroupResource() 42 | } 43 | -------------------------------------------------------------------------------- /api/client/informers/dataset/v1alpha1/interface.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by informer-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | import ( 21 | internalinterfaces "github.com/BaizeAI/dataset/api/client/informers/internalinterfaces" 22 | ) 23 | 24 | // Interface provides access to all the informers in this group version. 25 | type Interface interface { 26 | // Datasets returns a DatasetInformer. 27 | Datasets() DatasetInformer 28 | } 29 | 30 | type version struct { 31 | factory internalinterfaces.SharedInformerFactory 32 | namespace string 33 | tweakListOptions internalinterfaces.TweakListOptionsFunc 34 | } 35 | 36 | // New returns a new Interface. 37 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 38 | return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 39 | } 40 | 41 | // Datasets returns a DatasetInformer. 42 | func (v *version) Datasets() DatasetInformer { 43 | return &datasetInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} 44 | } 45 | -------------------------------------------------------------------------------- /api/client/informers/dataset/interface.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by informer-gen. DO NOT EDIT. 17 | 18 | package dataset 19 | 20 | import ( 21 | v1alpha1 "github.com/BaizeAI/dataset/api/client/informers/dataset/v1alpha1" 22 | internalinterfaces "github.com/BaizeAI/dataset/api/client/informers/internalinterfaces" 23 | ) 24 | 25 | // Interface provides access to each of this group's versions. 26 | type Interface interface { 27 | // V1alpha1 provides access to shared informers for resources in V1alpha1. 28 | V1alpha1() v1alpha1.Interface 29 | } 30 | 31 | type group struct { 32 | factory internalinterfaces.SharedInformerFactory 33 | namespace string 34 | tweakListOptions internalinterfaces.TweakListOptionsFunc 35 | } 36 | 37 | // New returns a new Interface. 38 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 39 | return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 40 | } 41 | 42 | // V1alpha1 returns a new v1alpha1.Interface. 43 | func (g *group) V1alpha1() v1alpha1.Interface { 44 | return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) 45 | } 46 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_huggingface_test.go: -------------------------------------------------------------------------------- 1 | // nolint: dupl 2 | package datasources 3 | 4 | import ( 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestHuggingFaceLoader(t *testing.T) { 14 | loader, err := NewHuggingFaceLoader(map[string]string{ 15 | "endpoint": "https://example-hf.com", 16 | }, Options{ 17 | Type: "", 18 | URI: "huggingface://ns/model", 19 | Path: "", 20 | Mode: 0, 21 | UID: 0, 22 | GID: 0, 23 | Root: "", 24 | }, Secrets{ 25 | Token: "test-token", 26 | }) 27 | assert.NoError(t, err) 28 | fakeHTTP := fakeCommand{ 29 | t: t, 30 | cmd: "huggingface-cli", 31 | outputs: []out{ 32 | { 33 | stdout: "env", 34 | stderr: "", 35 | exit: 0, 36 | }, 37 | { 38 | stdout: "login", 39 | stderr: "", 40 | exit: 0, 41 | }, 42 | { 43 | stdout: "whoami", 44 | stderr: "", 45 | exit: 0, 46 | }, 47 | { 48 | stdout: "download", 49 | stderr: "", 50 | exit: 0, 51 | }, 52 | }, 53 | } 54 | defer func() { 55 | assert.NoError(t, fakeHTTP.Clean()) 56 | }() 57 | huggingFaceDir, _ := os.MkdirTemp("", "huggingFaceLoader-*") 58 | defer func() { 59 | assert.NoError(t, os.RemoveAll(huggingFaceDir)) 60 | }() 61 | assert.NoError(t, err) 62 | fakeHTTP.WithContext(func() { 63 | err = loader.Sync("huggingface://ns/model", huggingFaceDir) 64 | assert.NoError(t, err) 65 | }) 66 | bbs := fakeHTTP.GetAllInputs() 67 | require.Len(t, bbs, 4) 68 | assert.Equal(t, []byte("env\n"), bbs[0]) 69 | assert.Equal(t, string(bbs[1]), "login --token test-token\n") 70 | assert.Equal(t, string(bbs[2]), "whoami\n") 71 | assert.Equal(t, string(bbs[3]), strings.Join([]string{"download", "ns/model", "--local-dir", huggingFaceDir, "--resume-download"}, " ")+"\n") 72 | } 73 | 
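// Note: fakeCommand (defined in fake.go) appears to shim the named CLI binary
// onto PATH and record each invocation's argument line, which is what lets the
// assertions above inspect the exact huggingface-cli commands that Sync runs.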
-------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/fake/fake_dataset.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package fake 19 | 20 | import ( 21 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/client/typed/dataset/v1alpha1" 22 | v1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 23 | gentype "k8s.io/client-go/gentype" 24 | ) 25 | 26 | // fakeDatasets implements DatasetInterface 27 | type fakeDatasets struct { 28 | *gentype.FakeClientWithList[*v1alpha1.Dataset, *v1alpha1.DatasetList] 29 | Fake *FakeDatasetV1alpha1 30 | } 31 | 32 | func newFakeDatasets(fake *FakeDatasetV1alpha1, namespace string) datasetv1alpha1.DatasetInterface { 33 | return &fakeDatasets{ 34 | gentype.NewFakeClientWithList[*v1alpha1.Dataset, *v1alpha1.DatasetList]( 35 | fake.Fake, 36 | namespace, 37 | v1alpha1.SchemeGroupVersion.WithResource("datasets"), 38 | v1alpha1.SchemeGroupVersion.WithKind("Dataset"), 39 | func() *v1alpha1.Dataset { return &v1alpha1.Dataset{} }, 40 | func() *v1alpha1.DatasetList { return &v1alpha1.DatasetList{} }, 41 | func(dst, src *v1alpha1.DatasetList) { dst.ListMeta = src.ListMeta }, 42 | func(list *v1alpha1.DatasetList) []*v1alpha1.Dataset { return gentype.ToPointerSlice(list.Items) }, 43 | func(list *v1alpha1.DatasetList, items []*v1alpha1.Dataset) { 44 | list.Items = gentype.FromPointerSlice(items) 45 | }, 46 | ), 47 | fake, 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /manifests/dataset/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "dataset.fullname" . }} 5 | labels: 6 | {{- include "dataset.labels" . | nindent 4 }} 7 | spec: 8 | replicas: {{ .Values.replicaCount }} 9 | selector: 10 | matchLabels: 11 | {{- include "dataset.selectorLabels" . | nindent 6 }} 12 | template: 13 | metadata: 14 | {{- with .Values.podAnnotations }} 15 | annotations: 16 | {{- toYaml . | nindent 8 }} 17 | {{- end }} 18 | labels: 19 | {{- include "dataset.selectorLabels" . | nindent 8 }} 20 | spec: 21 | {{- with .Values.imagePullSecrets }} 22 | imagePullSecrets: 23 | {{- toYaml . | nindent 8 }} 24 | {{- end }} 25 | serviceAccountName: {{ include "dataset.serviceAccountName" . }} 26 | securityContext: 27 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 28 | volumes: 29 | - name: config-volume 30 | configMap: 31 | name: {{ include "dataset.fullname" . }} 32 | containers: 33 | - name: {{ .Chart.Name }} 34 | securityContext: 35 | {{- toYaml .Values.securityContext | nindent 12 }} 36 | image: {{ template "dataset.controller.image" . 
}} 37 | imagePullPolicy: {{ .Values.global.imagePullPolicy }} 38 | readinessProbe: 39 | httpGet: 40 | path: /readyz 41 | port: 8083 42 | resources: 43 | {{- toYaml .Values.resources | nindent 12 }} 44 | volumeMounts: 45 | - mountPath: /app/config 46 | name: config-volume 47 | {{- with .Values.nodeSelector }} 48 | nodeSelector: 49 | {{- toYaml . | nindent 8 }} 50 | {{- end }} 51 | {{- with .Values.affinity }} 52 | affinity: 53 | {{- toYaml . | nindent 8 }} 54 | {{- end }} 55 | {{- with .Values.tolerations }} 56 | tolerations: 57 | {{- toYaml . | nindent 8 }} 58 | {{- end }} 59 | -------------------------------------------------------------------------------- /api/client/fake/register.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package fake 19 | 20 | import ( 21 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 22 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | runtime "k8s.io/apimachinery/pkg/runtime" 24 | schema "k8s.io/apimachinery/pkg/runtime/schema" 25 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 26 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 27 | ) 28 | 29 | var scheme = runtime.NewScheme() 30 | var codecs = serializer.NewCodecFactory(scheme) 31 | 32 | var localSchemeBuilder = runtime.SchemeBuilder{ 33 | datasetv1alpha1.AddToScheme, 34 | } 35 | 36 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 37 | // of clientsets, like in: 38 | // 39 | // import ( 40 | // "k8s.io/client-go/kubernetes" 41 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 42 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 43 | // ) 44 | // 45 | // kclientset, _ := kubernetes.NewForConfig(c) 46 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 47 | // 48 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 49 | // correctly. 50 | var AddToScheme = localSchemeBuilder.AddToScheme 51 | 52 | func init() { 53 | v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) 54 | utilruntime.Must(AddToScheme(scheme)) 55 | } 56 | -------------------------------------------------------------------------------- /manifests/dataset/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for dataset. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | global: 6 | imageRegistry: ghcr.io 7 | imagePullPolicy: IfNotPresent 8 | debug: false 9 | 10 | config: 11 | dataset_job_spec: {} 12 | # Enable cascading deletion of reference datasets when source dataset is deleted 13 | # Default: false (disabled for safety) 14 | enable_cascading_deletion: false 15 | 16 | replicaCount: 1 17 | 18 | imagePullSecrets: [] 19 | nameOverride: "" 20 | fullnameOverride: "" 21 | 22 | service: 23 | type: ClusterIP 24 | port: 8082 25 | 26 | serviceAccount: 27 | # Specifies whether a service account should be created 28 | create: true 29 | # Annotations to add to the service account 30 | annotations: {} 31 | # The name of the service account to use. 32 | # If not set and create is true, a name is generated using the fullname template 33 | name: "" 34 | 35 | podAnnotations: {} 36 | 37 | podSecurityContext: {} 38 | # fsGroup: 2000 39 | 40 | securityContext: {} 41 | # capabilities: 42 | # drop: 43 | # - ALL 44 | # readOnlyRootFilesystem: true 45 | # runAsNonRoot: true 46 | # runAsUser: 1000 47 | 48 | resources: {} 49 | # We usually recommend not to specify default resources and to leave this as a conscious 50 | # choice for the user. This also increases chances charts run on environments with little 51 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 52 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 53 | # limits: 54 | # cpu: 100m 55 | # memory: 128Mi 56 | # requests: 57 | # cpu: 100m 58 | # memory: 128Mi 59 | 60 | nodeSelector: {} 61 | 62 | tolerations: [] 63 | 64 | affinity: {} 65 | 66 | controller: 67 | image: 68 | registry: '' 69 | repository: baizeai/dataset-controller 70 | tag: latest 71 | 72 | dataloader: 73 | image: 74 | registry: '' 75 | repository: baizeai/dataset-data-loader 76 | tag: latest 77 | -------------------------------------------------------------------------------- /config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | 7 | "github.com/go-viper/mapstructure/v2" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | var ( 12 | config *configuration 13 | ) 14 | 15 | type configuration struct { 16 | DatasetJobSpecYaml string `json:"dataset_job_spec_yaml"` 17 | EnableCascadingDeletion bool `json:"enable_cascading_deletion"` 18 | } 19 | 20 | func GetDatasetJobSpecYaml() string { 21 | if config == nil || config.DatasetJobSpecYaml == "" { 22 | return ` 23 | backoffLimit: 4 24 | completionMode: NonIndexed 25 | completions: 1 26 | parallelism: 1 27 | template: 28 | spec: 29 | restartPolicy: Never 30 | containers: 31 | - image: ubuntu:20.04 32 | command: ["/bin/bash", "-c", "echo 'Container args: '$(echo $@)"] 33 | #command: ["/bin/bash", "-c", "--"] 34 | resources: 35 | requests: 36 | cpu: 100m 37 | memory: 100Mi 38 | limits: 39 | cpu: 500m 40 | memory: 500Mi 41 | ` 42 | } 43 | return config.DatasetJobSpecYaml 44 | } 45 | 46 | func IsCascadingDeletionEnabled() bool { 47 | if config == nil { 48 | return false 49 | } 50 | return config.EnableCascadingDeletion 51 | } 52 | 53 | func ParseConfigFromFileContent(content string) error { 54 | f, err := os.CreateTemp("", "dataset-config-*") 55 | if err != nil { 56 | return err 57 | } 58 | defer func() { 59 | _ = f.Close() 60 | _ = os.Remove(f.Name()) 61 | }() 62 | _, err = f.Write([]byte(content)) 63 | if err != nil { 64 | return err 65 | } 66 | return ParseConfigFromFile(f.Name()) 67 | } 68 | 69 | func ParseConfigFromFile(configPath
string) error { 70 | cfg := &configuration{} 71 | viper.SetConfigType("yaml") 72 | viper.SetConfigFile(configPath) 73 | viper.AutomaticEnv() 74 | viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_")) 75 | if err := viper.ReadInConfig(); err != nil { 76 | return err 77 | } 78 | err := viper.Unmarshal(cfg, func(c *mapstructure.DecoderConfig) { 79 | c.TagName = "json" 80 | }) 81 | if err != nil { 82 | return err 83 | } 84 | config = cfg 85 | return nil 86 | } 87 | -------------------------------------------------------------------------------- /api/client/scheme/register.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package scheme 19 | 20 | import ( 21 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 22 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | runtime "k8s.io/apimachinery/pkg/runtime" 24 | schema "k8s.io/apimachinery/pkg/runtime/schema" 25 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 26 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 27 | ) 28 | 29 | var Scheme = runtime.NewScheme() 30 | var Codecs = serializer.NewCodecFactory(Scheme) 31 | var ParameterCodec = runtime.NewParameterCodec(Scheme) 32 | var localSchemeBuilder = runtime.SchemeBuilder{ 33 | datasetv1alpha1.AddToScheme, 34 | } 35 | 36 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 37 | // of clientsets, like in: 38 | // 39 | // import ( 40 | // "k8s.io/client-go/kubernetes" 41 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 42 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 43 | // ) 44 | // 45 | // kclientset, _ := kubernetes.NewForConfig(c) 46 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 47 | // 48 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 49 | // correctly.
50 | var AddToScheme = localSchemeBuilder.AddToScheme 51 | 52 | func init() { 53 | v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) 54 | utilruntime.Must(AddToScheme(Scheme)) 55 | } 56 | -------------------------------------------------------------------------------- /internal/pkg/datasources/credentials.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "github.com/BaizeAI/dataset/pkg/log" 8 | ) 9 | 10 | type Secrets struct { 11 | Username string `json:"-"` 12 | Password string `json:"-"` 13 | 14 | SSHPrivateKey string `json:"-"` 15 | SSHPrivateKeyPassphrase string `json:"-"` 16 | 17 | Token string `json:"-"` 18 | 19 | AKSKAccessKeyID string `json:"-"` 20 | AKSKSecretAccessKey string `json:"-"` 21 | } 22 | 23 | type SecretKey string 24 | 25 | const ( 26 | SecretKeyUsername SecretKey = "username" 27 | SecretKeyPassword SecretKey = "password" 28 | SecretKeyPrivateKey SecretKey = "ssh-privatekey" 29 | SecretKeyPrivateKeyPassphrase SecretKey = "ssh-privatekey-passphrase" // #nosec G101 30 | SecretKeyToken SecretKey = "token" 31 | SecretKeyAccessKey SecretKey = "access-key" 32 | SecretKeySecretKey SecretKey = "secret-key" 33 | ) 34 | 35 | var ( 36 | keys = []SecretKey{ 37 | SecretKeyUsername, 38 | SecretKeyPassword, 39 | SecretKeyPrivateKey, 40 | SecretKeyPrivateKeyPassphrase, 41 | SecretKeyToken, 42 | SecretKeyAccessKey, 43 | SecretKeySecretKey, 44 | } 45 | ) 46 | 47 | func ReadAndParseSecrets(name string) (Secrets, error) { 48 | mSecrets := make(map[SecretKey]string) 49 | 50 | logger := log.WithField("secretMountDir", name) 51 | 52 | for _, v := range keys { 53 | secretContent, err := os.ReadFile(filepath.Join(name, string(v))) 54 | if err != nil { 55 | logger.WithField("secretDataKey", v).Debug("failed to read secret") 56 | continue 57 | } 58 | 59 | mSecrets[v] = string(secretContent) 60 | } 61 | 62 | return Secrets{ 63 | Username: mSecrets[SecretKeyUsername], 64 | Password: mSecrets[SecretKeyPassword], 65 | SSHPrivateKey: mSecrets[SecretKeyPrivateKey], 66 | SSHPrivateKeyPassphrase: mSecrets[SecretKeyPrivateKeyPassphrase], 67 | Token: mSecrets[SecretKeyToken], 68 | AKSKAccessKeyID: mSecrets[SecretKeyAccessKey], 69 | AKSKSecretAccessKey: mSecrets[SecretKeySecretKey], 70 | }, nil 71 | } 72 | -------------------------------------------------------------------------------- /api/client/informers/generic.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by informer-gen. DO NOT EDIT. 
17 | 18 | package informers 19 | 20 | import ( 21 | fmt "fmt" 22 | 23 | v1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 24 | schema "k8s.io/apimachinery/pkg/runtime/schema" 25 | cache "k8s.io/client-go/tools/cache" 26 | ) 27 | 28 | // GenericInformer is type of SharedIndexInformer which will locate and delegate to other 29 | // sharedInformers based on type 30 | type GenericInformer interface { 31 | Informer() cache.SharedIndexInformer 32 | Lister() cache.GenericLister 33 | } 34 | 35 | type genericInformer struct { 36 | informer cache.SharedIndexInformer 37 | resource schema.GroupResource 38 | } 39 | 40 | // Informer returns the SharedIndexInformer. 41 | func (f *genericInformer) Informer() cache.SharedIndexInformer { 42 | return f.informer 43 | } 44 | 45 | // Lister returns the GenericLister. 46 | func (f *genericInformer) Lister() cache.GenericLister { 47 | return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) 48 | } 49 | 50 | // ForResource gives generic access to a shared informer of the matching type 51 | // TODO extend this to unknown resources with a client pool 52 | func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { 53 | switch resource { 54 | // Group=dataset, Version=v1alpha1 55 | case v1alpha1.SchemeGroupVersion.WithResource("datasets"): 56 | return &genericInformer{resource: resource.GroupResource(), informer: f.Dataset().V1alpha1().Datasets().Informer()}, nil 57 | 58 | } 59 | 60 | return nil, fmt.Errorf("no informer found for %v", resource) 61 | } 62 | -------------------------------------------------------------------------------- /manifests/dataset/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "dataset.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "dataset.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "dataset.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "dataset.labels" -}} 37 | helm.sh/chart: {{ include "dataset.chart" . }} 38 | {{ include "dataset.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "dataset.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "dataset.name" . 
}} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "dataset.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "dataset.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | 64 | {{- define "dataset.controller.image" -}} 65 | {{ include "common.images.image" (dict "imageRoot" .Values.controller.image "global" .Values.global "defaultTag" .Chart.Version) }} 66 | {{- end -}} 67 | 68 | {{- define "dataset.data-loader.image" -}} 69 | {{ include "common.images.image" (dict "imageRoot" .Values.dataloader.image "global" .Values.global "defaultTag" .Chart.Version) }} 70 | {{- end -}} 71 | -------------------------------------------------------------------------------- /pkg/utils/command.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | "github.com/sirupsen/logrus" 12 | ) 13 | 14 | func ExecuteCommandWithAllOutput(logger *logrus.Entry, cmd *exec.Cmd, secrets []string) (*bytes.Buffer, *bytes.Buffer, error) { 15 | logger = logger.WithField("command", cmd.String()) 16 | logger.Debug("executing command") 17 | 18 | outBuffer, errBuffer := RedirectCmdWithObscureOutputWriter(cmd, secrets) 19 | 20 | p, err := exec.LookPath(cmd.Path) 21 | if err != nil { 22 | return outBuffer, errBuffer, err 23 | } 24 | f, err := os.Open(p) 25 | if err != nil { 26 | return outBuffer, errBuffer, err 27 | } 28 | defer func() { 29 | _ = f.Close() 30 | }() 31 | bs := make([]byte, 4) 32 | _, err = f.Read(bs) 33 | if err != nil { 34 | return outBuffer, errBuffer, err 35 | } 36 | // If the resolved binary looks like a script (a '#' among its first bytes, 37 | // e.g. from a shebang), run it through `sh -c` instead of exec'ing it directly. 38 | if strings.Contains(string(bs), "#") { 39 | cmd.Args = []string{"sh", "-c", fmt.Sprintf("%s %s", cmd.Path, strings.Join(cmd.Args[1:], " "))} 40 | shellPath, _ := exec.LookPath("sh") 41 | cmd.Path = shellPath 42 | } 43 | 44 | err = cmd.Run() 45 | logger.Debugf("command output: %s", outBuffer.String()) 46 | if err != nil { 47 | logger.Errorf("command failed to execute, error: %s", errBuffer.String()) 48 | return outBuffer, errBuffer, fmt.Errorf("failed to execute command %s: %w", cmd.String(), err) 49 | } 50 | 51 | return outBuffer, errBuffer, nil 52 | } 53 | 54 | func ExecuteCommandWithOutput(logger *logrus.Entry, cmd *exec.Cmd, secrets []string) (*bytes.Buffer, error) { 55 | outBuffer, _, err := ExecuteCommandWithAllOutput(logger, cmd, secrets) 56 | 57 | return outBuffer, err 58 | } 59 | 60 | func ExecuteCommand(logger *logrus.Entry, cmd *exec.Cmd, secrets []string) error { 61 | _, err := ExecuteCommandWithOutput(logger, cmd, secrets) 62 | return err 63 | } 64 | 65 | func NewWrappedOutputWriter(wrappedWriter io.Writer) (*bytes.Buffer, io.Writer) { 66 | buffer := new(bytes.Buffer) 67 | return buffer, io.MultiWriter(buffer, wrappedWriter) 68 | } 69 | 70 | func NewObscuredOutputWriter(wrappedWriter io.Writer, secrets []string) (*bytes.Buffer, io.Writer) { 71 | buf, writer := NewWrappedOutputWriter(wrappedWriter) 72 | return buf, NewObscuredWriter(writer, secrets) 73 | } 74 | 75 | func RedirectCmdWithObscureOutputWriter(cmd *exec.Cmd, secrets []string) (*bytes.Buffer, *bytes.Buffer) { 76 | var outBuffer, errBuffer *bytes.Buffer 77 | 78 | outBuffer, cmd.Stdout = NewObscuredOutputWriter(os.Stdout, secrets) 79 | errBuffer, cmd.Stderr =
NewObscuredOutputWriter(os.Stderr, secrets) 80 | 81 | return outBuffer, errBuffer 82 | } 83 | -------------------------------------------------------------------------------- /internal/pkg/datasources/fake.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "path" 8 | "strconv" 9 | "testing" 10 | "text/template" 11 | 12 | "github.com/stretchr/testify/require" 13 | ) 14 | 15 | type out struct { 16 | stdout string 17 | stderr string 18 | exit int 19 | } 20 | 21 | type fakeCommand struct { 22 | t *testing.T 23 | 24 | cmd string 25 | path string 26 | outputs []out 27 | } 28 | 29 | func (f *fakeCommand) Inject() error { 30 | if f.path == "" { 31 | f.path, _ = os.MkdirTemp("", "fakeCommand-*") 32 | } 33 | require.NoError(f.t, os.MkdirAll(f.path, 0755)) // nolint: gosec 34 | for i, o := range f.outputs { 35 | require.NoError(f.t, os.WriteFile(path.Join(f.path, fmt.Sprintf(".%s_output_%d", f.cmd, i)), []byte(o.stdout), 0600)) 36 | require.NoError(f.t, os.WriteFile(path.Join(f.path, fmt.Sprintf(".%s_stderr_%d", f.cmd, i)), []byte(o.stderr), 0600)) 37 | require.NoError(f.t, os.WriteFile(path.Join(f.path, fmt.Sprintf(".%s_exit_%d", f.cmd, i)), []byte(strconv.Itoa(o.exit)), 0600)) 38 | } 39 | t, err := template.New("fakeCommand").Parse( 40 | ` 41 | #!/usr/bin/env bash 42 | index=0 43 | if [ -f "{{.path}}/.{{.cmd}}_index" ]; then 44 | index=$(cat "{{.path}}/.{{.cmd}}_index") 45 | fi 46 | echo $((index+1)) > "{{.path}}/.{{.cmd}}_index" 47 | echo "$*" > "{{.path}}/.{{.cmd}}_input_$index" 48 | cat "{{.path}}/.{{.cmd}}_output_$index" 49 | cat "{{.path}}/.{{.cmd}}_stderr_$index" 1>&2 50 | exit $(cat "{{.path}}/.{{.cmd}}_exit_$index") 51 | `, 52 | ) 53 | if err != nil { 54 | return err 55 | } 56 | shell := bytes.NewBuffer(nil) 57 | err = t.Execute(shell, map[string]interface{}{ 58 | "path": f.path, 59 | "cmd": f.cmd, 60 | }) 61 | if err != nil { 62 | return err 63 | } 64 | // Trim the template's leading newline so the shebang is the first line of the script. 65 | return os.WriteFile(path.Join(f.path, f.cmd), bytes.TrimLeft(shell.Bytes(), "\n"), 0755) // nolint: gosec 66 | } 67 | 68 | func (f *fakeCommand) GetInput(index int) ([]byte, error) { 69 | return os.ReadFile(path.Join(f.path, fmt.Sprintf(".%s_input_%d", f.cmd, index))) 70 | } 71 | 72 | func (f *fakeCommand) GetAllInputs() [][]byte { 73 | var inputs [][]byte 74 | for i := 0; ; i++ { 75 | input, err := f.GetInput(i) 76 | if err != nil { 77 | break 78 | } 79 | inputs = append(inputs, input) 80 | } 81 | return inputs 82 | } 83 | 84 | func (f *fakeCommand) WithContext(run func()) { 85 | require.NoError(f.t, f.Inject()) 86 | p := os.Getenv("PATH") 87 | require.NoError(f.t, os.Setenv("PATH", fmt.Sprintf("%s:%s", f.path, p))) 88 | run() 89 | require.NoError(f.t, os.Setenv("PATH", p)) 90 | } 91 | 92 | func (f *fakeCommand) Clean() error { 93 | return os.RemoveAll(f.path) 94 | } 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset: Simplified Data Management and Sharing for Kubernetes 2 | 3 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) 4 | 5 | ## Introduction 6 | 7 | **Dataset** is a Kubernetes-native tool designed to simplify data management and sharing across AI/ML workflows. It leverages Persistent Volume Claims (PVCs) to preload datasets and models from public sources like Hugging Face or S3 into local Kubernetes clusters.
This eliminates the need for custom data loaders in individual workloads and ensures seamless data sharing across namespaces. 8 | 9 | With Dataset, teams can efficiently manage and access data in multi-tenant environments while maintaining compatibility with any Kubernetes CSI driver. Its simplicity and ease of use make it an ideal choice for organizations looking to streamline AI/ML workflows without adding operational complexity. 10 | 11 | ## Key Features 12 | 13 | - **Preloaded Datasets**: Load data from external sources into PVCs for immediate use in training and inference tasks. 14 | - **Cross-Namespace Data Sharing**: Securely share data across namespaces, overcoming the traditional limitations of PVCs. 15 | - **Kubernetes-Native Design**: Fully compatible with any Kubernetes CSI driver, avoiding reliance on external technologies like FUSE. 16 | - **Cascading Deletion**: Optional feature to automatically delete dependent datasets when source datasets are removed, ensuring data consistency. 17 | - **Operational Simplicity**: Designed for easy deployment and maintenance, with minimal overhead. 18 | 19 | ## Benefits 20 | 21 | - **Streamlined Workflows**: Eliminates repetitive data-loading logic, allowing teams to focus on core AI/ML development. 22 | - **Enhanced Collaboration**: Enables secure, efficient data sharing in multi-tenant Kubernetes environments. 23 | - **Data Consistency**: Automatic cleanup of dependent resources prevents orphaned references and maintains data integrity. 24 | - **Scalable and Reliable**: Works seamlessly with Kubernetes-native resources, ensuring compatibility and stability. 25 | 26 | ## Configuration 27 | 28 | The Dataset controller supports configurable options through a YAML configuration file: 29 | 30 | ### Cascading Deletion 31 | 32 | When enabled, cascading deletion automatically removes reference datasets when their source dataset is deleted: 33 | 34 | ```yaml 35 | # Enable cascading deletion (default: false) 36 | enable_cascading_deletion: true 37 | ``` 38 | 39 | **Important**: This feature should be used with caution as it will automatically delete datasets that reference the source dataset. Consider the impact on dependent workloads before enabling this feature. 40 | -------------------------------------------------------------------------------- /api/client/listers/dataset/v1alpha1/dataset.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by lister-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | import ( 21 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 22 | labels "k8s.io/apimachinery/pkg/labels" 23 | listers "k8s.io/client-go/listers" 24 | cache "k8s.io/client-go/tools/cache" 25 | ) 26 | 27 | // DatasetLister helps list Datasets. 28 | // All objects returned here must be treated as read-only. 
29 | type DatasetLister interface { 30 | // List lists all Datasets in the indexer. 31 | // Objects returned here must be treated as read-only. 32 | List(selector labels.Selector) (ret []*datasetv1alpha1.Dataset, err error) 33 | // Datasets returns an object that can list and get Datasets. 34 | Datasets(namespace string) DatasetNamespaceLister 35 | DatasetListerExpansion 36 | } 37 | 38 | // datasetLister implements the DatasetLister interface. 39 | type datasetLister struct { 40 | listers.ResourceIndexer[*datasetv1alpha1.Dataset] 41 | } 42 | 43 | // NewDatasetLister returns a new DatasetLister. 44 | func NewDatasetLister(indexer cache.Indexer) DatasetLister { 45 | return &datasetLister{listers.New[*datasetv1alpha1.Dataset](indexer, datasetv1alpha1.Resource("dataset"))} 46 | } 47 | 48 | // Datasets returns an object that can list and get Datasets. 49 | func (s *datasetLister) Datasets(namespace string) DatasetNamespaceLister { 50 | return datasetNamespaceLister{listers.NewNamespaced[*datasetv1alpha1.Dataset](s.ResourceIndexer, namespace)} 51 | } 52 | 53 | // DatasetNamespaceLister helps list and get Datasets. 54 | // All objects returned here must be treated as read-only. 55 | type DatasetNamespaceLister interface { 56 | // List lists all Datasets in the indexer for a given namespace. 57 | // Objects returned here must be treated as read-only. 58 | List(selector labels.Selector) (ret []*datasetv1alpha1.Dataset, err error) 59 | // Get retrieves the Dataset from the indexer for a given namespace and name. 60 | // Objects returned here must be treated as read-only. 61 | Get(name string) (*datasetv1alpha1.Dataset, error) 62 | DatasetNamespaceListerExpansion 63 | } 64 | 65 | // datasetNamespaceLister implements the DatasetNamespaceLister 66 | // interface. 67 | type datasetNamespaceLister struct { 68 | listers.ResourceIndexer[*datasetv1alpha1.Dataset] 69 | } 70 | -------------------------------------------------------------------------------- /internal/pkg/datasources/pip/pip.go: -------------------------------------------------------------------------------- 1 | package pip 2 | 3 | import ( 4 | "os" 5 | "os/exec" 6 | "path/filepath" 7 | "strings" 8 | 9 | "github.com/samber/lo" 10 | "github.com/sirupsen/logrus" 11 | 12 | "github.com/BaizeAI/dataset/pkg/utils" 13 | ) 14 | 15 | type PipCLI struct { 16 | // Global 17 | // - In a “pip” subdirectory of any of the paths set in the environment variable XDG_CONFIG_DIRS (if it exists), for example /etc/xdg/pip/pip.conf. 18 | // This will be followed by loading /etc/pip.conf. 19 | // 20 | // User 21 | // - $HOME/.config/pip/pip.conf, which respects the XDG_CONFIG_HOME environment variable. 22 | // The legacy “per-user” configuration file is also loaded, if it exists: $HOME/.pip/pip.conf. 23 | // 24 | // Site 25 | // - $VIRTUAL_ENV/pip.conf 26 | // PIP_CONFIG_FILE 27 | // Additionally, the environment variable PIP_CONFIG_FILE can be used to specify a configuration file that’s loaded last, and whose values override 28 | // the values set in the aforementioned files. Setting this to os.devnull disables the loading of all configuration files. Note that if a file exists 29 | // at the location that this is set to, the user config file will not be loaded. 
30 | // 31 | // Configuration - pip documentation v24.0 https://pip.pypa.io/en/stable/topics/configuration/ 32 | ConfigFilePath string 33 | Bin string 34 | EnvPath string 35 | } 36 | 37 | func NewPipCLIWithCondaEnv(envPrefix string) *PipCLI { 38 | return &PipCLI{ 39 | Bin: filepath.Join(envPrefix, "bin", "pip"), 40 | EnvPath: filepath.Join(envPrefix, "bin"), 41 | } 42 | } 43 | 44 | func (p *PipCLI) bin() string { 45 | if p.Bin != "" { 46 | return p.Bin 47 | } 48 | 49 | return "pip" 50 | } 51 | 52 | // Equivalent to `pip --version` 53 | func (p *PipCLI) Version(logger *logrus.Entry) (string, error) { 54 | args := []string{ 55 | "--version", 56 | } 57 | 58 | cmd := exec.Command(p.bin(), args...) // #nosec G204 59 | cmd.Env = os.Environ() 60 | 61 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 62 | if err != nil { 63 | return "", err 64 | } 65 | 66 | outputString := strings.TrimSpace(output.String()) 67 | output.Reset() 68 | 69 | return outputString, nil 70 | } 71 | 72 | // Equivalent to `pip install -r requirements.txt` 73 | func (p *PipCLI) InstallWithRequirementsTxt(logger *logrus.Entry, requirementsTxt string) error { 74 | args := []string{ 75 | "install", 76 | "-r", 77 | requirementsTxt, 78 | } 79 | 80 | cmd := exec.Command(p.bin(), args...) // #nosec G204 81 | cmd.Env = lo.Filter(os.Environ(), func(item string, index int) bool { 82 | return !strings.HasPrefix(item, "PATH=") 83 | }) 84 | cmd.Env = append(cmd.Env, "PATH="+p.EnvPath+":"+os.Getenv("PATH")) 85 | 86 | if p.ConfigFilePath != "" { 87 | cmd.Env = append(cmd.Env, "PIP_CONFIG_FILE="+p.ConfigFilePath) 88 | logger = logger.WithField("PIP_CONFIG_FILE", p.ConfigFilePath) 89 | } 90 | 91 | _, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 92 | if err != nil { 93 | return err 94 | } 95 | 96 | return nil 97 | } 98 | -------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/dataset.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | import ( 21 | context "context" 22 | 23 | scheme "github.com/BaizeAI/dataset/api/client/scheme" 24 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 25 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | types "k8s.io/apimachinery/pkg/types" 27 | watch "k8s.io/apimachinery/pkg/watch" 28 | gentype "k8s.io/client-go/gentype" 29 | ) 30 | 31 | // DatasetsGetter has a method to return a DatasetInterface. 32 | // A group's client should implement this interface. 33 | type DatasetsGetter interface { 34 | Datasets(namespace string) DatasetInterface 35 | } 36 | 37 | // DatasetInterface has methods to work with Dataset resources. 
38 | type DatasetInterface interface { 39 | Create(ctx context.Context, dataset *datasetv1alpha1.Dataset, opts v1.CreateOptions) (*datasetv1alpha1.Dataset, error) 40 | Update(ctx context.Context, dataset *datasetv1alpha1.Dataset, opts v1.UpdateOptions) (*datasetv1alpha1.Dataset, error) 41 | // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 42 | UpdateStatus(ctx context.Context, dataset *datasetv1alpha1.Dataset, opts v1.UpdateOptions) (*datasetv1alpha1.Dataset, error) 43 | Delete(ctx context.Context, name string, opts v1.DeleteOptions) error 44 | DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error 45 | Get(ctx context.Context, name string, opts v1.GetOptions) (*datasetv1alpha1.Dataset, error) 46 | List(ctx context.Context, opts v1.ListOptions) (*datasetv1alpha1.DatasetList, error) 47 | Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) 48 | Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *datasetv1alpha1.Dataset, err error) 49 | DatasetExpansion 50 | } 51 | 52 | // datasets implements DatasetInterface 53 | type datasets struct { 54 | *gentype.ClientWithList[*datasetv1alpha1.Dataset, *datasetv1alpha1.DatasetList] 55 | } 56 | 57 | // newDatasets returns a Datasets 58 | func newDatasets(c *DatasetV1alpha1Client, namespace string) *datasets { 59 | return &datasets{ 60 | gentype.NewClientWithList[*datasetv1alpha1.Dataset, *datasetv1alpha1.DatasetList]( 61 | "datasets", 62 | c.RESTClient(), 63 | scheme.ParameterCodec, 64 | namespace, 65 | func() *datasetv1alpha1.Dataset { return &datasetv1alpha1.Dataset{} }, 66 | func() *datasetv1alpha1.DatasetList { return &datasetv1alpha1.DatasetList{} }, 67 | ), 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /docs/cascading-deletion-example.md: -------------------------------------------------------------------------------- 1 | # Example: Cascading Deletion of Reference Datasets 2 | 3 | This document demonstrates how cascading deletion works with the Dataset controller. 4 | 5 | ## Scenario 6 | 7 | 1. **Source Dataset**: A dataset with type `GIT` that contains shared data 8 | 2. **Reference Datasets**: Multiple datasets with type `REFERENCE` that point to the source dataset 9 | 3. 
**Cascading Deletion**: When the source dataset is deleted, all reference datasets are automatically deleted 10 | 11 | ## Configuration 12 | 13 | To enable cascading deletion, set the following in your controller configuration: 14 | 15 | ```yaml 16 | enable_cascading_deletion: true 17 | ``` 18 | 19 | ## Example Datasets 20 | 21 | ### Source Dataset 22 | ```yaml 23 | apiVersion: dataset.baizeai.io/v1alpha1 24 | kind: Dataset 25 | metadata: 26 | name: shared-model 27 | namespace: ml-models 28 | spec: 29 | share: true # Enable sharing 30 | source: 31 | type: GIT 32 | uri: https://github.com/huggingface/transformers.git 33 | ``` 34 | 35 | ### Reference Dataset 1 36 | ```yaml 37 | apiVersion: dataset.baizeai.io/v1alpha1 38 | kind: Dataset 39 | metadata: 40 | name: training-model 41 | namespace: ml-training 42 | spec: 43 | source: 44 | type: REFERENCE 45 | uri: dataset://ml-models/shared-model # References the source dataset 46 | ``` 47 | 48 | ### Reference Dataset 2 49 | ```yaml 50 | apiVersion: dataset.baizeai.io/v1alpha1 51 | kind: Dataset 52 | metadata: 53 | name: inference-model 54 | namespace: ml-inference 55 | spec: 56 | source: 57 | type: REFERENCE 58 | uri: dataset://ml-models/shared-model # References the source dataset 59 | ``` 60 | 61 | ## Deletion Behavior 62 | 63 | ### Without Cascading Deletion (default) 64 | When `shared-model` is deleted: 65 | - Only `shared-model` is deleted 66 | - `training-model` and `inference-model` remain but become invalid (broken references) 67 | - Manual cleanup required 68 | 69 | ### With Cascading Deletion (enabled) 70 | When `shared-model` is deleted: 71 | - `shared-model` is deleted 72 | - `training-model` is automatically deleted 73 | - `inference-model` is automatically deleted 74 | - Associated PVs with retain policy are also cleaned up 75 | - No manual cleanup required 76 | 77 | ## Safety Considerations 78 | 79 | - **Default Disabled**: Cascading deletion is disabled by default for safety 80 | - **Impact Assessment**: Consider all dependent workloads before enabling 81 | - **Testing**: Test in non-production environments first 82 | - **Monitoring**: Monitor deletion events and ensure expected behavior 83 | 84 | ## Enabling Cascading Deletion 85 | 86 | 1. Update your controller configuration: 87 | ```yaml 88 | enable_cascading_deletion: true 89 | ``` 90 | 91 | 2. Restart the controller to apply the configuration 92 | 93 | 3. 
Verify the setting is applied by checking the controller logs 94 | 95 | ## Troubleshooting 96 | 97 | - Check controller logs for cascading deletion messages 98 | - Verify configuration is loaded correctly 99 | - Ensure RBAC permissions are sufficient for cross-namespace operations 100 | - Test with non-critical datasets first -------------------------------------------------------------------------------- /pkg/kubeutils/kube.go: -------------------------------------------------------------------------------- 1 | package kubeutils 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | "time" 8 | 9 | "github.com/samber/lo" 10 | corev1 "k8s.io/api/core/v1" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | func IsDeleted(obj client.Object) bool { 16 | return obj.GetDeletionTimestamp() != nil 17 | } 18 | 19 | func IsConditionReady(conditions []metav1.Condition, cond string) bool { 20 | return lo.ContainsBy(conditions, func(item metav1.Condition) bool { 21 | return item.Type == cond && item.Status == metav1.ConditionTrue 22 | }) 23 | } 24 | 25 | func SetCondition(conditions []metav1.Condition, typ string, err error) []metav1.Condition { 26 | if typ == "" { 27 | return conditions 28 | } 29 | index := -1 30 | for i, c := range conditions { 31 | if c.Type == typ { 32 | index = i 33 | break 34 | } 35 | } 36 | if index == -1 { 37 | index = len(conditions) 38 | conditions = append(conditions, metav1.Condition{ 39 | Type: typ, 40 | Reason: typ + "Ready", 41 | }) 42 | } 43 | if err == nil { 44 | if conditions[index].Status != metav1.ConditionTrue { 45 | conditions[index].Status = metav1.ConditionTrue 46 | conditions[index].LastTransitionTime = metav1.Time{Time: time.Now()} 47 | conditions[index].Message = "" 48 | } 49 | } else { 50 | if conditions[index].Status != metav1.ConditionFalse { 51 | conditions[index].Status = metav1.ConditionFalse 52 | conditions[index].LastTransitionTime = metav1.Time{Time: time.Now()} 53 | conditions[index].Message = err.Error() 54 | } 55 | } 56 | return conditions 57 | } 58 | 59 | func GetTolerationWithSeconds(TolerationSeconds *int64) []corev1.Toleration { 60 | if lo.FromPtr(TolerationSeconds) == 0 { 61 | return nil 62 | } 63 | return []corev1.Toleration{ 64 | { 65 | Key: corev1.TaintNodeNotReady, 66 | Effect: corev1.TaintEffectNoExecute, 67 | Operator: corev1.TolerationOpExists, 68 | TolerationSeconds: TolerationSeconds, 69 | }, 70 | { 71 | Key: corev1.TaintNodeUnreachable, 72 | Effect: corev1.TaintEffectNoExecute, 73 | Operator: corev1.TolerationOpExists, 74 | TolerationSeconds: TolerationSeconds, 75 | }, 76 | } 77 | } 78 | 79 | func GetTolerationSeconds(tolerations []corev1.Toleration) *int64 { 80 | for index := range tolerations { 81 | if tolerations[index].Key == corev1.TaintNodeNotReady || tolerations[index].Key == corev1.TaintNodeUnreachable { 82 | return tolerations[index].TolerationSeconds 83 | } 84 | } 85 | return nil 86 | } 87 | 88 | func MapToSelector(labels map[string]string) string { 89 | es := lo.Entries(labels) 90 | sort.Slice(es, func(i, j int) bool { 91 | return es[i].Key < es[j].Key 92 | }) 93 | return strings.Join(lo.Map(es, func(item lo.Entry[string, string], index int) string { 94 | if item.Value != "" { 95 | return fmt.Sprintf("%s=%s", item.Key, item.Value) 96 | } else { 97 | return item.Key 98 | } 99 | }), ",") 100 | } 101 | 102 | type PodReplicaSpec struct { 103 | Replicas int64 104 | PodSpec corev1.PodSpec 105 | } 106 | 
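The condition helpers in `pkg/kubeutils/kube.go` above follow the metav1 condition conventions: `SetCondition` upserts a condition by type and only bumps `LastTransitionTime` when the status actually flips, while `MapToSelector` renders a label map as a deterministic selector string. A minimal usage sketch (a hypothetical caller, not a file in the repository):

```go
package main

import (
	"errors"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/BaizeAI/dataset/pkg/kubeutils"
)

func main() {
	var conds []metav1.Condition

	// Mark "Synced" ready: appends the condition with Status=True, Reason="SyncedReady".
	conds = kubeutils.SetCondition(conds, "Synced", nil)
	fmt.Println(kubeutils.IsConditionReady(conds, "Synced")) // true

	// Report a failure: flips the same condition to False and records the message.
	conds = kubeutils.SetCondition(conds, "Synced", errors.New("pull failed"))
	fmt.Println(kubeutils.IsConditionReady(conds, "Synced")) // false

	// Empty label values become bare keys; keys are sorted for stable output.
	fmt.Println(kubeutils.MapToSelector(map[string]string{
		"app":  "dataset",
		"tier": "",
	})) // app=dataset,tier
}
```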
-------------------------------------------------------------------------------- /internal/pkg/datasources/modelscope/hub_test.go: -------------------------------------------------------------------------------- 1 | package modelscope 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "net/http" 7 | "net/http/httptest" 8 | "testing" 9 | 10 | "github.com/google/uuid" 11 | "github.com/samber/lo" 12 | "github.com/stretchr/testify/assert" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestLogin(t *testing.T) { 17 | t.Run("Default", func(t *testing.T) { 18 | c := NewHubAPIClient() 19 | 20 | var serverHandled bool 21 | server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 22 | serverHandled = true 23 | 24 | if req.URL.Path != hubAPIEndpointPathLogin { 25 | t.Error("invalid path") 26 | return 27 | } 28 | 29 | rw.WriteHeader(http.StatusOK) 30 | _, err := rw.Write(lo.Must(json.Marshal(&HubAPIBaseResponse[HubAPILoginResponse]{ 31 | Code: 200, 32 | Data: &HubAPILoginResponse{ 33 | AccessToken: "token", 34 | Email: "email", 35 | Username: "username", 36 | WorkNo: "work", 37 | }, 38 | Message: "success", 39 | RequestID: uuid.New().String(), 40 | Success: true, 41 | }))) 42 | require.NoError(t, err) 43 | })) 44 | defer server.Close() 45 | 46 | c.apiEndpoint = server.URL 47 | c.client = server.Client() 48 | 49 | loginResp, err := c.Login(context.Background(), "token") 50 | require.NoError(t, err) 51 | require.NotNil(t, loginResp) 52 | require.True(t, serverHandled) 53 | 54 | assert.Equal(t, int64(200), loginResp.Code) 55 | assert.Equal(t, "token", loginResp.Data.AccessToken) 56 | assert.Equal(t, "email", loginResp.Data.Email) 57 | assert.Equal(t, "username", loginResp.Data.Username) 58 | assert.Equal(t, "work", loginResp.Data.WorkNo) 59 | }) 60 | 61 | t.Run("Error - invalid token", func(t *testing.T) { 62 | c := NewHubAPIClient() 63 | 64 | var serverHandled bool 65 | server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 66 | serverHandled = true 67 | 68 | if req.URL.Path != hubAPIEndpointPathLogin { 69 | t.Error("invalid path") 70 | return 71 | } 72 | 73 | rw.WriteHeader(http.StatusOK) 74 | _, err := rw.Write(lo.Must(json.Marshal(&HubAPIBaseResponse[HubAPILoginResponse]{ 75 | Code: 10010103009, 76 | Message: "登录失败,AccessToken错误,请从用户中心获取AccessToken或刷新", 77 | RequestID: uuid.New().String(), 78 | Success: false, 79 | }))) 80 | require.NoError(t, err) 81 | })) 82 | defer server.Close() 83 | 84 | c.apiEndpoint = server.URL 85 | c.client = server.Client() 86 | 87 | loginResp, err := c.Login(context.Background(), "token") 88 | require.Error(t, err) 89 | require.Nil(t, loginResp) 90 | require.True(t, serverHandled) 91 | assert.EqualError(t, err, "登录失败,AccessToken错误,请从用户中心获取AccessToken或刷新") 92 | 93 | errResp, ok := err.(*HubAPIError) 94 | require.True(t, ok) 95 | require.NotNil(t, errResp) 96 | 97 | assert.Equal(t, int64(10010103009), errResp.Code) 98 | assert.Empty(t, errResp.Data) 99 | assert.Equal(t, "登录失败,AccessToken错误,请从用户中心获取AccessToken或刷新", errResp.Message) 100 | assert.False(t, errResp.Success) 101 | }) 102 | } 103 | -------------------------------------------------------------------------------- /internal/controller/dataset/configmap.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "context" 5 | 6 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 7 | "github.com/samber/lo" 8 | corev1 "k8s.io/api/core/v1" 9 
| apierrors "k8s.io/apimachinery/pkg/api/errors" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | 13 | "github.com/BaizeAI/dataset/internal/pkg/constants" 14 | ) 15 | 16 | func datasetConfigMapName(ds *datasetv1alpha1.Dataset) string { 17 | return "dataset-" + ds.Name + "-config" 18 | } 19 | 20 | func (r *DatasetReconciler) getConfigMap(ctx context.Context, ds *datasetv1alpha1.Dataset) (*corev1.ConfigMap, error) { 21 | cm := &corev1.ConfigMap{} 22 | err := r.Get(ctx, client.ObjectKey{ 23 | Namespace: ds.Namespace, 24 | Name: datasetConfigMapName(ds), 25 | }, cm) 26 | if err != nil { 27 | if apierrors.IsNotFound(err) { 28 | return nil, nil 29 | } 30 | return nil, err 31 | } 32 | return cm, nil 33 | } 34 | 35 | type condaOptions struct { 36 | environmentYAML *string 37 | requirementsTxt *string 38 | } 39 | 40 | type condaOption func(*condaOptions) 41 | 42 | func withCondaEnvironmentYAML(yaml string) condaOption { 43 | return func(o *condaOptions) { 44 | o.environmentYAML = &yaml 45 | } 46 | } 47 | 48 | func withPipRequirementsTxt(txt string) condaOption { 49 | return func(o *condaOptions) { 50 | o.requirementsTxt = &txt 51 | } 52 | } 53 | 54 | func (r *DatasetReconciler) createConfigMap(ctx context.Context, ds *datasetv1alpha1.Dataset, opts ...condaOption) (*corev1.ConfigMap, error) { 55 | defaultOpts := new(condaOptions) 56 | for _, opt := range opts { 57 | opt(defaultOpts) 58 | } 59 | 60 | cm := &corev1.ConfigMap{ 61 | ObjectMeta: metav1.ObjectMeta{ 62 | Name: datasetConfigMapName(ds), 63 | Namespace: ds.Namespace, 64 | Labels: lo.Assign(ds.Labels, map[string]string{ 65 | constants.DatasetNameLabel: ds.Name, 66 | }), 67 | OwnerReferences: datasetOwnerRef(ds), 68 | }, 69 | Data: make(map[string]string), 70 | } 71 | if defaultOpts.environmentYAML != nil { 72 | cm.Data[constants.DatasetJobCondaCondaEnvironmentYAMLFilename] = *defaultOpts.environmentYAML 73 | } 74 | if defaultOpts.requirementsTxt != nil { 75 | cm.Data[constants.DatasetJobCondaPipRequirementsTxtFilename] = *defaultOpts.requirementsTxt 76 | } 77 | 78 | err := r.Create(ctx, cm) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return cm, nil 83 | } 84 | 85 | func (r *DatasetReconciler) updateConfigMap(ctx context.Context, cm *corev1.ConfigMap, opts ...condaOption) (*corev1.ConfigMap, error) { 86 | defaultOpts := new(condaOptions) 87 | for _, opt := range opts { 88 | opt(defaultOpts) 89 | } 90 | if cm.Data == nil { 91 | // NOTICE: .data is potentially nil when user deletes .data field in the manifest 92 | cm.Data = make(map[string]string) 93 | } 94 | 95 | if defaultOpts.environmentYAML != nil { 96 | cm.Data[constants.DatasetJobCondaCondaEnvironmentYAMLFilename] = *defaultOpts.environmentYAML 97 | } 98 | if defaultOpts.requirementsTxt != nil { 99 | cm.Data[constants.DatasetJobCondaPipRequirementsTxtFilename] = *defaultOpts.requirementsTxt 100 | } 101 | 102 | err := r.Update(ctx, cm) 103 | if err != nil { 104 | return nil, err 105 | } 106 | return cm, nil 107 | } 108 | -------------------------------------------------------------------------------- /internal/pkg/datasources/modelscope/hub.go: -------------------------------------------------------------------------------- 1 | package modelscope 2 | 3 | //go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 -generate 4 | 5 | import ( 6 | "bytes" 7 | "context" 8 | "encoding/json" 9 | "net/http" 10 | ) 11 | 12 | type HubAPIBaseResponse[T any] struct { 13 | Code int64 `json:"Code"` 14 | Data *T `json:"Data,omitempty"` 15 
| Message string `json:"Message"` 16 | RequestID string `json:"RequestId"` 17 | Success bool `json:"Success"` 18 | } 19 | 20 | type HubAPILoginResponse struct { 21 | AccessToken string `json:"AccessToken"` 22 | Email string `json:"Email"` 23 | Username string `json:"Username"` 24 | WorkNo string `json:"WorkNo"` 25 | } 26 | 27 | type HubAPIError struct { 28 | HubAPIBaseResponse[any] 29 | } 30 | 31 | func (e *HubAPIError) Error() string { 32 | return e.Message 33 | } 34 | 35 | func IsHubAPIError(err error) bool { 36 | _, ok := err.(*HubAPIError) 37 | return ok 38 | } 39 | 40 | const ( 41 | HubAPIEndpointScheme = "https://" 42 | HubAPIEndpointDomain = "www.modelscope.cn" 43 | 44 | hubAPIEndpointPathLogin = "/api/v1/login" 45 | ) 46 | 47 | //counterfeiter:generate -o fake/hub.go --fake-name FakeHubAPI . HubAPI 48 | type HubAPI interface { 49 | Login(ctx context.Context, token string) (*HubAPIBaseResponse[HubAPILoginResponse], error) 50 | } 51 | 52 | type HubAPIClient struct { 53 | client *http.Client 54 | apiEndpoint string 55 | } 56 | 57 | // NewHubAPIClient creates a new HubAPIClient. 58 | // 59 | // Source code: https://github.com/modelscope/modelscope/blob/058df0e34c8dad07659f326e71ffa68c133c4ec8/modelscope/hub/api.py#L62-L94 60 | func NewHubAPIClient() *HubAPIClient { 61 | return &HubAPIClient{ 62 | client: &http.Client{}, 63 | } 64 | } 65 | 66 | func (c *HubAPIClient) endpoint() string { 67 | if c.apiEndpoint == "" { 68 | return HubAPIEndpointScheme + HubAPIEndpointDomain 69 | } 70 | 71 | return c.apiEndpoint 72 | } 73 | 74 | // Login signs in the user with the given token. 75 | // 76 | // The token is used to authenticate the user. 77 | // 78 | // Source code: https://github.com/modelscope/modelscope/blob/058df0e34c8dad07659f326e71ffa68c133c4ec8/modelscope/hub/api.py#L96-L133 79 | func (c *HubAPIClient) Login(ctx context.Context, token string) (*HubAPIBaseResponse[HubAPILoginResponse], error) { 80 | body := struct { 81 | AccessToken string `json:"AccessToken"` 82 | }{ 83 | AccessToken: token, 84 | } 85 | 86 | buffer := new(bytes.Buffer) 87 | err := json.NewEncoder(buffer).Encode(body) 88 | if err != nil { 89 | return nil, err 90 | } 91 | 92 | req, err := http.NewRequest(http.MethodPost, c.endpoint()+hubAPIEndpointPathLogin, buffer) 93 | if err != nil { 94 | return nil, err 95 | } 96 | 97 | req = req.WithContext(ctx) 98 | req.Header.Set("Content-Type", "application/json") 99 | 100 | resp, err := c.client.Do(req) 101 | if err != nil { 102 | return nil, err 103 | } 104 | 105 | defer func() { 106 | _ = resp.Body.Close() 107 | }() 108 | 109 | var response HubAPIBaseResponse[HubAPILoginResponse] 110 | err = json.NewDecoder(resp.Body).Decode(&response) 111 | if err != nil { 112 | return nil, err 113 | } 114 | if !response.Success { 115 | return nil, &HubAPIError{HubAPIBaseResponse: HubAPIBaseResponse[any]{ 116 | Code: response.Code, 117 | Message: response.Message, 118 | RequestID: response.RequestID, 119 | Success: response.Success, 120 | }} 121 | } 122 | 123 | return &response, nil 124 | } 125 | -------------------------------------------------------------------------------- /api/client/fake/clientset_generated.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package fake 19 | 20 | import ( 21 | clientset "github.com/BaizeAI/dataset/api/client" 22 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/client/typed/dataset/v1alpha1" 23 | fakedatasetv1alpha1 "github.com/BaizeAI/dataset/api/client/typed/dataset/v1alpha1/fake" 24 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | "k8s.io/apimachinery/pkg/runtime" 26 | "k8s.io/apimachinery/pkg/watch" 27 | "k8s.io/client-go/discovery" 28 | fakediscovery "k8s.io/client-go/discovery/fake" 29 | "k8s.io/client-go/testing" 30 | ) 31 | 32 | // NewSimpleClientset returns a clientset that will respond with the provided objects. 33 | // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, 34 | // without applying any field management, validations and/or defaults. It shouldn't be considered a replacement 35 | // for a real clientset and is mostly useful in simple unit tests. 36 | // 37 | // DEPRECATED: NewClientset replaces this with support for field management, which significantly improves 38 | // server side apply testing. NewClientset is only available when apply configurations are generated (e.g. 39 | // via --with-applyconfig). 40 | func NewSimpleClientset(objects ...runtime.Object) *Clientset { 41 | o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) 42 | for _, obj := range objects { 43 | if err := o.Add(obj); err != nil { 44 | panic(err) 45 | } 46 | } 47 | 48 | cs := &Clientset{tracker: o} 49 | cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} 50 | cs.AddReactor("*", "*", testing.ObjectReaction(o)) 51 | cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { 52 | var opts metav1.ListOptions 53 | if watchActcion, ok := action.(testing.WatchActionImpl); ok { 54 | opts = watchActcion.ListOptions 55 | } 56 | gvr := action.GetResource() 57 | ns := action.GetNamespace() 58 | watch, err := o.Watch(gvr, ns, opts) 59 | if err != nil { 60 | return false, nil, err 61 | } 62 | return true, watch, nil 63 | }) 64 | 65 | return cs 66 | } 67 | 68 | // Clientset implements clientset.Interface. Meant to be embedded into a 69 | // struct to get a default implementation. This makes faking out just the method 70 | // you want to test easier. 
71 | type Clientset struct { 72 | testing.Fake 73 | discovery *fakediscovery.FakeDiscovery 74 | tracker testing.ObjectTracker 75 | } 76 | 77 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 78 | return c.discovery 79 | } 80 | 81 | func (c *Clientset) Tracker() testing.ObjectTracker { 82 | return c.tracker 83 | } 84 | 85 | var ( 86 | _ clientset.Interface = &Clientset{} 87 | _ testing.FakeClient = &Clientset{} 88 | ) 89 | 90 | // DatasetV1alpha1 retrieves the DatasetV1alpha1Client 91 | func (c *Clientset) DatasetV1alpha1() datasetv1alpha1.DatasetV1alpha1Interface { 92 | return &fakedatasetv1alpha1.FakeDatasetV1alpha1{Fake: &c.Fake} 93 | } 94 | -------------------------------------------------------------------------------- /api/client/typed/dataset/v1alpha1/dataset_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by client-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | import ( 21 | http "net/http" 22 | 23 | scheme "github.com/BaizeAI/dataset/api/client/scheme" 24 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 25 | rest "k8s.io/client-go/rest" 26 | ) 27 | 28 | type DatasetV1alpha1Interface interface { 29 | RESTClient() rest.Interface 30 | DatasetsGetter 31 | } 32 | 33 | // DatasetV1alpha1Client is used to interact with features provided by the dataset group. 34 | type DatasetV1alpha1Client struct { 35 | restClient rest.Interface 36 | } 37 | 38 | func (c *DatasetV1alpha1Client) Datasets(namespace string) DatasetInterface { 39 | return newDatasets(c, namespace) 40 | } 41 | 42 | // NewForConfig creates a new DatasetV1alpha1Client for the given config. 43 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 44 | // where httpClient was generated with rest.HTTPClientFor(c). 45 | func NewForConfig(c *rest.Config) (*DatasetV1alpha1Client, error) { 46 | config := *c 47 | setConfigDefaults(&config) 48 | httpClient, err := rest.HTTPClientFor(&config) 49 | if err != nil { 50 | return nil, err 51 | } 52 | return NewForConfigAndClient(&config, httpClient) 53 | } 54 | 55 | // NewForConfigAndClient creates a new DatasetV1alpha1Client for the given config and http client. 56 | // Note the http client provided takes precedence over the configured transport values. 57 | func NewForConfigAndClient(c *rest.Config, h *http.Client) (*DatasetV1alpha1Client, error) { 58 | config := *c 59 | setConfigDefaults(&config) 60 | client, err := rest.RESTClientForConfigAndClient(&config, h) 61 | if err != nil { 62 | return nil, err 63 | } 64 | return &DatasetV1alpha1Client{client}, nil 65 | } 66 | 67 | // NewForConfigOrDie creates a new DatasetV1alpha1Client for the given config and 68 | // panics if there is an error in the config. 
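//
// Editor's note: a hedged construction sketch (assumes a *rest.Config `cfg`
// obtained from clientcmd or controller-runtime; names are illustrative):
//
//	client := NewForConfigOrDie(cfg)
//	datasets := client.Datasets("default")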
69 | func NewForConfigOrDie(c *rest.Config) *DatasetV1alpha1Client { 70 | client, err := NewForConfig(c) 71 | if err != nil { 72 | panic(err) 73 | } 74 | return client 75 | } 76 | 77 | // New creates a new DatasetV1alpha1Client for the given RESTClient. 78 | func New(c rest.Interface) *DatasetV1alpha1Client { 79 | return &DatasetV1alpha1Client{c} 80 | } 81 | 82 | func setConfigDefaults(config *rest.Config) { 83 | gv := datasetv1alpha1.SchemeGroupVersion 84 | config.GroupVersion = &gv 85 | config.APIPath = "/apis" 86 | config.NegotiatedSerializer = rest.CodecFactoryForGeneratedClient(scheme.Scheme, scheme.Codecs).WithoutConversion() 87 | 88 | if config.UserAgent == "" { 89 | config.UserAgent = rest.DefaultKubernetesUserAgent() 90 | } 91 | } 92 | 93 | // RESTClient returns a RESTClient that is used to communicate 94 | // with API server by this client implementation. 95 | func (c *DatasetV1alpha1Client) RESTClient() rest.Interface { 96 | if c == nil { 97 | return nil 98 | } 99 | return c.restClient 100 | } 101 | -------------------------------------------------------------------------------- /internal/pkg/datasources/huggingface/hub_test.go: -------------------------------------------------------------------------------- 1 | package huggingface 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "net/http" 7 | "net/http/httptest" 8 | "testing" 9 | "time" 10 | 11 | "github.com/samber/lo" 12 | "github.com/stretchr/testify/assert" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestWhoAmI(t *testing.T) { 17 | t.Run("Default", func(t *testing.T) { 18 | c := NewHfAPIClient() 19 | 20 | var serverHandled bool 21 | server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 22 | serverHandled = true 23 | 24 | if req.URL.Path != hubAPIEndpointPathWhoAmI { 25 | t.Error("invalid path") 26 | return 27 | } 28 | 29 | rw.WriteHeader(http.StatusOK) 30 | _, err := rw.Write(lo.Must(json.Marshal(&HfAPIWhoAmIResponse{ 31 | Auth: HfAPIWhoAmIResponseAuth{ 32 | AccessToken: HfAPIAccessToken{ 33 | CreatedAt: time.Now(), 34 | DisplayName: "token-name", 35 | Role: "read", 36 | }, 37 | }, 38 | AvatarURL: "https://example.com/avatar-url.png", 39 | CanPay: false, 40 | Email: "user@example.com", 41 | EmailVerified: true, 42 | Fullname: "User Name", 43 | ID: "643ba8e3b409fef15e05aa37", 44 | IsPro: false, 45 | Name: "username", 46 | Type: "user", 47 | }))) 48 | require.NoError(t, err) 49 | })) 50 | defer server.Close() 51 | 52 | c.apiEndpoint = server.URL 53 | c.client = server.Client() 54 | 55 | whoAmIResp, err := c.WhoAmI(context.Background(), "token") 56 | require.NoError(t, err) 57 | require.NotNil(t, whoAmIResp) 58 | require.True(t, serverHandled) 59 | 60 | assert.False(t, whoAmIResp.Auth.AccessToken.CreatedAt.IsZero()) 61 | assert.Equal(t, "token-name", whoAmIResp.Auth.AccessToken.DisplayName) 62 | assert.Equal(t, "read", whoAmIResp.Auth.AccessToken.Role) 63 | assert.Equal(t, "https://example.com/avatar-url.png", whoAmIResp.AvatarURL) 64 | assert.False(t, whoAmIResp.CanPay) 65 | assert.Equal(t, "user@example.com", whoAmIResp.Email) 66 | assert.True(t, whoAmIResp.EmailVerified) 67 | assert.Equal(t, "User Name", whoAmIResp.Fullname) 68 | assert.Equal(t, "643ba8e3b409fef15e05aa37", whoAmIResp.ID) 69 | assert.False(t, whoAmIResp.IsPro) 70 | assert.Equal(t, "username", whoAmIResp.Name) 71 | assert.Equal(t, "user", whoAmIResp.Type) 72 | }) 73 | 74 | t.Run("Error - invalid token", func(t *testing.T) { 75 | c := NewHfAPIClient() 76 | 77 | var serverHandled bool 78 | 
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 79 | serverHandled = true 80 | 81 | if req.URL.Path != hubAPIEndpointPathWhoAmI { 82 | t.Error("invalid path") 83 | return 84 | } 85 | 86 | rw.WriteHeader(http.StatusOK) 87 | _, err := rw.Write(lo.Must(json.Marshal(&HfAPIErrorResponse{ 88 | Error: "Invalid username or password.", 89 | }))) 90 | require.NoError(t, err) 91 | })) 92 | defer server.Close() 93 | 94 | c.apiEndpoint = server.URL 95 | c.client = server.Client() 96 | 97 | whoAmIResp, err := c.WhoAmI(context.Background(), "token") 98 | require.Error(t, err) 99 | require.Nil(t, whoAmIResp) 100 | require.True(t, serverHandled) 101 | assert.EqualError(t, err, "Invalid username or password.") 102 | 103 | errResp, ok := err.(*HfAPIError) 104 | require.True(t, ok) 105 | require.NotNil(t, errResp) 106 | 107 | assert.Equal(t, "Invalid username or password.", errResp.HfAPIErrorResponse.Error) 108 | }) 109 | } 110 | -------------------------------------------------------------------------------- /internal/pkg/datasources/huggingface/fake/hub.go: -------------------------------------------------------------------------------- 1 | // Code generated by counterfeiter. DO NOT EDIT. 2 | package fake 3 | 4 | import ( 5 | "context" 6 | "sync" 7 | 8 | "github.com/BaizeAI/dataset/internal/pkg/datasources/huggingface" 9 | ) 10 | 11 | type FakeHfAPI struct { 12 | WhoAmIStub func(context.Context, string) (*huggingface.HfAPIWhoAmIResponse, error) 13 | whoAmIMutex sync.RWMutex 14 | whoAmIArgsForCall []struct { 15 | arg1 context.Context 16 | arg2 string 17 | } 18 | whoAmIReturns struct { 19 | result1 *huggingface.HfAPIWhoAmIResponse 20 | result2 error 21 | } 22 | whoAmIReturnsOnCall map[int]struct { 23 | result1 *huggingface.HfAPIWhoAmIResponse 24 | result2 error 25 | } 26 | invocations map[string][][]interface{} 27 | invocationsMutex sync.RWMutex 28 | } 29 | 30 | func (fake *FakeHfAPI) WhoAmI(arg1 context.Context, arg2 string) (*huggingface.HfAPIWhoAmIResponse, error) { 31 | fake.whoAmIMutex.Lock() 32 | ret, specificReturn := fake.whoAmIReturnsOnCall[len(fake.whoAmIArgsForCall)] 33 | fake.whoAmIArgsForCall = append(fake.whoAmIArgsForCall, struct { 34 | arg1 context.Context 35 | arg2 string 36 | }{arg1, arg2}) 37 | stub := fake.WhoAmIStub 38 | fakeReturns := fake.whoAmIReturns 39 | fake.recordInvocation("WhoAmI", []interface{}{arg1, arg2}) 40 | fake.whoAmIMutex.Unlock() 41 | if stub != nil { 42 | return stub(arg1, arg2) 43 | } 44 | if specificReturn { 45 | return ret.result1, ret.result2 46 | } 47 | return fakeReturns.result1, fakeReturns.result2 48 | } 49 | 50 | func (fake *FakeHfAPI) WhoAmICallCount() int { 51 | fake.whoAmIMutex.RLock() 52 | defer fake.whoAmIMutex.RUnlock() 53 | return len(fake.whoAmIArgsForCall) 54 | } 55 | 56 | func (fake *FakeHfAPI) WhoAmICalls(stub func(context.Context, string) (*huggingface.HfAPIWhoAmIResponse, error)) { 57 | fake.whoAmIMutex.Lock() 58 | defer fake.whoAmIMutex.Unlock() 59 | fake.WhoAmIStub = stub 60 | } 61 | 62 | func (fake *FakeHfAPI) WhoAmIArgsForCall(i int) (context.Context, string) { 63 | fake.whoAmIMutex.RLock() 64 | defer fake.whoAmIMutex.RUnlock() 65 | argsForCall := fake.whoAmIArgsForCall[i] 66 | return argsForCall.arg1, argsForCall.arg2 67 | } 68 | 69 | func (fake *FakeHfAPI) WhoAmIReturns(result1 *huggingface.HfAPIWhoAmIResponse, result2 error) { 70 | fake.whoAmIMutex.Lock() 71 | defer fake.whoAmIMutex.Unlock() 72 | fake.WhoAmIStub = nil 73 | fake.whoAmIReturns = struct { 74 | result1 
*huggingface.HfAPIWhoAmIResponse 75 | result2 error 76 | }{result1, result2} 77 | } 78 | 79 | func (fake *FakeHfAPI) WhoAmIReturnsOnCall(i int, result1 *huggingface.HfAPIWhoAmIResponse, result2 error) { 80 | fake.whoAmIMutex.Lock() 81 | defer fake.whoAmIMutex.Unlock() 82 | fake.WhoAmIStub = nil 83 | if fake.whoAmIReturnsOnCall == nil { 84 | fake.whoAmIReturnsOnCall = make(map[int]struct { 85 | result1 *huggingface.HfAPIWhoAmIResponse 86 | result2 error 87 | }) 88 | } 89 | fake.whoAmIReturnsOnCall[i] = struct { 90 | result1 *huggingface.HfAPIWhoAmIResponse 91 | result2 error 92 | }{result1, result2} 93 | } 94 | 95 | func (fake *FakeHfAPI) Invocations() map[string][][]interface{} { 96 | fake.invocationsMutex.RLock() 97 | defer fake.invocationsMutex.RUnlock() 98 | fake.whoAmIMutex.RLock() 99 | defer fake.whoAmIMutex.RUnlock() 100 | copiedInvocations := map[string][][]interface{}{} 101 | for key, value := range fake.invocations { 102 | copiedInvocations[key] = value 103 | } 104 | return copiedInvocations 105 | } 106 | 107 | func (fake *FakeHfAPI) recordInvocation(key string, args []interface{}) { 108 | fake.invocationsMutex.Lock() 109 | defer fake.invocationsMutex.Unlock() 110 | if fake.invocations == nil { 111 | fake.invocations = map[string][][]interface{}{} 112 | } 113 | if fake.invocations[key] == nil { 114 | fake.invocations[key] = [][]interface{}{} 115 | } 116 | fake.invocations[key] = append(fake.invocations[key], args) 117 | } 118 | 119 | var _ huggingface.HfAPI = new(FakeHfAPI) 120 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/BaizeAI/dataset 2 | 3 | go 1.25.1 4 | 5 | require ( 6 | github.com/go-viper/mapstructure/v2 v2.4.0 7 | github.com/google/uuid v1.6.0 8 | github.com/samber/lo v1.51.0 9 | github.com/sirupsen/logrus v1.9.3 10 | github.com/spf13/cobra v1.10.1 11 | github.com/spf13/viper v1.21.0 12 | github.com/stretchr/testify v1.11.1 13 | golang.org/x/crypto v0.42.0 14 | gopkg.in/yaml.v3 v3.0.1 15 | k8s.io/api v0.34.1 16 | k8s.io/apimachinery v0.34.1 17 | k8s.io/client-go v0.34.1 18 | k8s.io/code-generator v0.34.1 19 | sigs.k8s.io/controller-runtime v0.22.1 20 | sigs.k8s.io/yaml v1.6.0 21 | ) 22 | 23 | require ( 24 | github.com/beorn7/perks v1.0.1 // indirect 25 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 26 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 27 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 28 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 29 | github.com/fsnotify/fsnotify v1.9.0 // indirect 30 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect 31 | github.com/go-logr/logr v1.4.2 // indirect 32 | github.com/go-logr/zapr v1.3.0 // indirect 33 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 34 | github.com/go-openapi/jsonreference v0.20.2 // indirect 35 | github.com/go-openapi/swag v0.23.0 // indirect 36 | github.com/gogo/protobuf v1.3.2 // indirect 37 | github.com/google/btree v1.1.3 // indirect 38 | github.com/google/gnostic-models v0.7.0 // indirect 39 | github.com/google/go-cmp v0.7.0 // indirect 40 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 41 | github.com/josharian/intern v1.0.0 // indirect 42 | github.com/json-iterator/go v1.1.12 // indirect 43 | github.com/mailru/easyjson v0.7.7 // indirect 44 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 45 | github.com/modern-go/reflect2 
v1.0.3-0.20250322232337-35a7c28c31ee // indirect 46 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 47 | github.com/pelletier/go-toml/v2 v2.2.4 // indirect 48 | github.com/pkg/errors v0.9.1 // indirect 49 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 50 | github.com/prometheus/client_golang v1.22.0 // indirect 51 | github.com/prometheus/client_model v0.6.1 // indirect 52 | github.com/prometheus/common v0.62.0 // indirect 53 | github.com/prometheus/procfs v0.15.1 // indirect 54 | github.com/sagikazarmark/locafero v0.11.0 // indirect 55 | github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect 56 | github.com/spf13/afero v1.15.0 // indirect 57 | github.com/spf13/cast v1.10.0 // indirect 58 | github.com/spf13/pflag v1.0.10 // indirect 59 | github.com/subosito/gotenv v1.6.0 // indirect 60 | github.com/x448/float16 v0.8.4 // indirect 61 | go.uber.org/multierr v1.11.0 // indirect 62 | go.uber.org/zap v1.27.0 // indirect 63 | go.yaml.in/yaml/v2 v2.4.2 // indirect 64 | go.yaml.in/yaml/v3 v3.0.4 // indirect 65 | golang.org/x/mod v0.27.0 // indirect 66 | golang.org/x/net v0.43.0 // indirect 67 | golang.org/x/oauth2 v0.27.0 // indirect 68 | golang.org/x/sync v0.17.0 // indirect 69 | golang.org/x/sys v0.36.0 // indirect 70 | golang.org/x/term v0.35.0 // indirect 71 | golang.org/x/text v0.29.0 // indirect 72 | golang.org/x/time v0.9.0 // indirect 73 | golang.org/x/tools v0.36.0 // indirect 74 | golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated // indirect 75 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 76 | google.golang.org/protobuf v1.36.5 // indirect 77 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 78 | gopkg.in/inf.v0 v0.9.1 // indirect 79 | k8s.io/apiextensions-apiserver v0.34.0 // indirect 80 | k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f // indirect 81 | k8s.io/klog/v2 v2.130.1 // indirect 82 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect 83 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect 84 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 85 | sigs.k8s.io/randfill v1.0.0 // indirect 86 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect 87 | ) 88 | -------------------------------------------------------------------------------- /internal/pkg/datasources/modelscope/fake/hub.go: -------------------------------------------------------------------------------- 1 | // Code generated by counterfeiter. DO NOT EDIT. 
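//
// Editor's note: a hedged sketch of wiring this fake into a test (hypothetical
// test code; only methods defined below and fields visible in the modelscope
// package are used):
//
//	fakeHub := &FakeHubAPI{}
//	fakeHub.LoginReturns(&modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse]{
//		Success: true,
//	}, nil)
//	// Inject fakeHub wherever a modelscope.HubAPI is accepted, then assert
//	// fakeHub.LoginCallCount() == 1 and inspect fakeHub.LoginArgsForCall(0).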
2 | package fake 3 | 4 | import ( 5 | "context" 6 | "sync" 7 | 8 | "github.com/BaizeAI/dataset/internal/pkg/datasources/modelscope" 9 | ) 10 | 11 | type FakeHubAPI struct { 12 | LoginStub func(context.Context, string) (*modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse], error) 13 | loginMutex sync.RWMutex 14 | loginArgsForCall []struct { 15 | arg1 context.Context 16 | arg2 string 17 | } 18 | loginReturns struct { 19 | result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse] 20 | result2 error 21 | } 22 | loginReturnsOnCall map[int]struct { 23 | result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse] 24 | result2 error 25 | } 26 | invocations map[string][][]interface{} 27 | invocationsMutex sync.RWMutex 28 | } 29 | 30 | func (fake *FakeHubAPI) Login(arg1 context.Context, arg2 string) (*modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse], error) { 31 | fake.loginMutex.Lock() 32 | ret, specificReturn := fake.loginReturnsOnCall[len(fake.loginArgsForCall)] 33 | fake.loginArgsForCall = append(fake.loginArgsForCall, struct { 34 | arg1 context.Context 35 | arg2 string 36 | }{arg1, arg2}) 37 | stub := fake.LoginStub 38 | fakeReturns := fake.loginReturns 39 | fake.recordInvocation("Login", []interface{}{arg1, arg2}) 40 | fake.loginMutex.Unlock() 41 | if stub != nil { 42 | return stub(arg1, arg2) 43 | } 44 | if specificReturn { 45 | return ret.result1, ret.result2 46 | } 47 | return fakeReturns.result1, fakeReturns.result2 48 | } 49 | 50 | func (fake *FakeHubAPI) LoginCallCount() int { 51 | fake.loginMutex.RLock() 52 | defer fake.loginMutex.RUnlock() 53 | return len(fake.loginArgsForCall) 54 | } 55 | 56 | func (fake *FakeHubAPI) LoginCalls(stub func(context.Context, string) (*modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse], error)) { 57 | fake.loginMutex.Lock() 58 | defer fake.loginMutex.Unlock() 59 | fake.LoginStub = stub 60 | } 61 | 62 | func (fake *FakeHubAPI) LoginArgsForCall(i int) (context.Context, string) { 63 | fake.loginMutex.RLock() 64 | defer fake.loginMutex.RUnlock() 65 | argsForCall := fake.loginArgsForCall[i] 66 | return argsForCall.arg1, argsForCall.arg2 67 | } 68 | 69 | func (fake *FakeHubAPI) LoginReturns(result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse], result2 error) { 70 | fake.loginMutex.Lock() 71 | defer fake.loginMutex.Unlock() 72 | fake.LoginStub = nil 73 | fake.loginReturns = struct { 74 | result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse] 75 | result2 error 76 | }{result1, result2} 77 | } 78 | 79 | func (fake *FakeHubAPI) LoginReturnsOnCall(i int, result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse], result2 error) { 80 | fake.loginMutex.Lock() 81 | defer fake.loginMutex.Unlock() 82 | fake.LoginStub = nil 83 | if fake.loginReturnsOnCall == nil { 84 | fake.loginReturnsOnCall = make(map[int]struct { 85 | result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse] 86 | result2 error 87 | }) 88 | } 89 | fake.loginReturnsOnCall[i] = struct { 90 | result1 *modelscope.HubAPIBaseResponse[modelscope.HubAPILoginResponse] 91 | result2 error 92 | }{result1, result2} 93 | } 94 | 95 | func (fake *FakeHubAPI) Invocations() map[string][][]interface{} { 96 | fake.invocationsMutex.RLock() 97 | defer fake.invocationsMutex.RUnlock() 98 | fake.loginMutex.RLock() 99 | defer fake.loginMutex.RUnlock() 100 | copiedInvocations := map[string][][]interface{}{} 101 | for key, value := range fake.invocations { 102 | copiedInvocations[key] = value 103 | } 104 | return 
copiedInvocations 105 | } 106 | 107 | func (fake *FakeHubAPI) recordInvocation(key string, args []interface{}) { 108 | fake.invocationsMutex.Lock() 109 | defer fake.invocationsMutex.Unlock() 110 | if fake.invocations == nil { 111 | fake.invocations = map[string][][]interface{}{} 112 | } 113 | if fake.invocations[key] == nil { 114 | fake.invocations[key] = [][]interface{}{} 115 | } 116 | fake.invocations[key] = append(fake.invocations[key], args) 117 | } 118 | 119 | var _ modelscope.HubAPI = new(FakeHubAPI) 120 | -------------------------------------------------------------------------------- /pkg/utils/fs.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "os" 7 | "path/filepath" 8 | 9 | "github.com/sirupsen/logrus" 10 | ) 11 | 12 | func IsSymlink(fi os.FileInfo) bool { 13 | return fi.Mode()&fs.ModeSymlink == fs.ModeSymlink 14 | } 15 | 16 | func IsPermModeMatched(stat fs.FileInfo, desiredPerm fs.FileMode) bool { 17 | return stat.Mode().Perm() == desiredPerm 18 | } 19 | 20 | func readSymbolicLinkUntilRealPath(path string) (string, error) { 21 | finalPath, err := filepath.EvalSymlinks(path) 22 | if err != nil { 23 | return "", err 24 | } 25 | 26 | return finalPath, nil 27 | } 28 | 29 | func ChmodIfUnmatched(logger *logrus.Entry, path string, stat fs.FileInfo, desiredPerm fs.FileMode) error { 30 | if IsPermModeMatched(stat, desiredPerm) { 31 | return nil 32 | } 33 | 34 | err := os.Chmod(path, desiredPerm) 35 | if err != nil { 36 | return fmt.Errorf("failed to chmod %s to %s: %w", path, desiredPerm, err) 37 | } 38 | 39 | return nil 40 | } 41 | 42 | func ChmodAndChownRecursively(logger *logrus.Entry, path string, uid int, gid int, mode os.FileMode) error { 43 | dir := path 44 | logger = logger.WithFields(logrus.Fields{"dir": path}) 45 | 46 | stat, err := os.Stat(dir) 47 | if err != nil { 48 | return err 49 | } 50 | if !stat.IsDir() { 51 | dir = filepath.Dir(path) 52 | logger = logger.WithField("dir", dir) 53 | logger.Warn("path is not a directory, fallback to parent directory instead") 54 | } 55 | 56 | err = ChmodIfUnmatched(logger, dir, stat, mode) 57 | if err != nil { 58 | return err 59 | } 60 | 61 | err = os.Chown(dir, uid, gid) 62 | if err != nil { 63 | return fmt.Errorf("failed to chown %s to %d:%d: %w", dir, uid, gid, err) 64 | } 65 | 66 | return fs.WalkDir(os.DirFS(dir), ".", func(walkPath string, walkDirEntry fs.DirEntry, err error) error { 67 | if err != nil { 68 | return err 69 | } 70 | if walkPath == "." { 71 | return nil 72 | } 73 | if walkPath == ".." 
{ 74 | return nil 75 | } 76 | 77 | walkPath = filepath.Join(dir, walkPath) 78 | 79 | stat, err := walkDirEntry.Info() 80 | if err != nil { 81 | return fmt.Errorf("failed to get info of %s: %w", walkPath, err) 82 | } 83 | 84 | if IsSymlink(stat) { 85 | resolvedWalkPath, err := readSymbolicLinkUntilRealPath(walkPath) 86 | if err != nil { 87 | if os.IsNotExist(err) { 88 | return nil 89 | } 90 | 91 | return fmt.Errorf("failed to resolve symlink %s: %w", walkPath, err) 92 | } 93 | 94 | stat, err = os.Stat(resolvedWalkPath) 95 | if err != nil { 96 | if os.IsNotExist(err) { 97 | return nil 98 | } 99 | 100 | return fmt.Errorf("failed to get info of resolved symlink %s: %w", resolvedWalkPath, err) 101 | } 102 | } 103 | // optionally chmod 104 | if stat.IsDir() || stat.Mode().IsRegular() { 105 | err = ChmodIfUnmatched(logger, walkPath, stat, mode) 106 | if err != nil { 107 | return err 108 | } 109 | } 110 | 111 | err = os.Chown(walkPath, uid, gid) 112 | if err != nil { 113 | return fmt.Errorf("failed to chown %s to %d:%d: %w", walkPath, uid, gid, err) 114 | } 115 | 116 | return nil 117 | }) 118 | } 119 | 120 | func CleanupNotExistingSymlinks(logger *logrus.Entry, path string) error { 121 | return fs.WalkDir(os.DirFS(path), ".", func(walkPath string, walkDirEntry fs.DirEntry, err error) error { 122 | if err != nil { 123 | return err 124 | } 125 | if walkPath == "." { 126 | return nil 127 | } 128 | if walkPath == ".." { 129 | return nil 130 | } 131 | 132 | walkPath = filepath.Join(path, walkPath) 133 | 134 | stat, err := walkDirEntry.Info() 135 | if err != nil { 136 | return fmt.Errorf("failed to get info of %s: %w", walkPath, err) 137 | } 138 | 139 | if IsSymlink(stat) { 140 | _, err := os.Stat(walkPath) 141 | if err != nil { 142 | if os.IsNotExist(err) { 143 | logger.Warnf("removing dangling symlink %s", walkPath) 144 | err = os.Remove(walkPath) 145 | if err != nil { 146 | return fmt.Errorf("failed to remove dangling symlink %s: %w", walkPath, err) 147 | } 148 | } else { 149 | return fmt.Errorf("failed to get info of symlink %s: %w", walkPath, err) 150 | } 151 | } 152 | } 153 | 154 | return nil 155 | }) 156 | } 157 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | tags: 8 | - 'v*.*.*' 9 | 10 | jobs: 11 | build-and-push: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v6 16 | - name: Set up QEMU 17 | uses: docker/setup-qemu-action@v3 18 | - name: Set up Docker Buildx 19 | uses: docker/setup-buildx-action@v3 20 | - name: Login Github Container registry 21 | uses: docker/login-action@v3 22 | with: 23 | registry: ghcr.io 24 | username: ${{ github.actor }} 25 | password: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - name: Docker meta 28 | id: meta 29 | uses: docker/metadata-action@v5 30 | with: 31 | images: | 32 | ghcr.io/${{ github.repository_owner }}/dataset-controller 33 | tags: | 34 | ${{ github.ref == 'refs/heads/main' && 'latest' || github.ref_name }} 35 | 36 | - name: Build Manager 37 | uses: docker/build-push-action@v6 38 | with: 39 | context: . 
40 | platforms: linux/amd64,linux/arm64
41 | push: true
42 | provenance: false
43 | tags: ${{ steps.meta.outputs.tags }}
44 | labels: ${{ steps.meta.outputs.labels }}
45 | 
46 | - name: Docker meta
47 | id: meta_loader
48 | uses: docker/metadata-action@v5
49 | with:
50 | images: |
51 | ghcr.io/${{ github.repository_owner }}/dataset-data-loader
52 | tags: |
53 | ${{ github.ref == 'refs/heads/main' && 'latest' || github.ref_name }}
54 | 
55 | - name: Build Data Loader
56 | uses: docker/build-push-action@v6
57 | with:
58 | context: .
59 | platforms: linux/amd64,linux/arm64
60 | push: true
61 | file: data-loader.Dockerfile
62 | provenance: false
63 | tags: ${{ steps.meta_loader.outputs.tags }}
64 | labels: ${{ steps.meta_loader.outputs.labels }}
65 | publish-chart:
66 | if: startsWith(github.ref, 'refs/tags/v')
67 | needs: [ build-and-push ]
68 | permissions:
69 | contents: write
70 | env:
71 | HELM_CHARTS_DIR: manifests/dataset
72 | HELM_CHART_NAME: dataset
73 | runs-on: ubuntu-latest
74 | steps:
75 | - name: Checkout
76 | uses: actions/checkout@v6
77 | 
78 | - name: Install Helm
79 | uses: azure/setup-helm@v4
80 | 
81 | - name: Get the version
82 | id: get_version
83 | run: |
84 | VERSION=${GITHUB_REF#refs/tags/}
85 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT
86 | 
87 | - name: Tag helm chart image and copy crds
88 | run: |
89 | cp -rf config/crd/bases/* $HELM_CHARTS_DIR/templates/
90 | image_tag=${{ steps.get_version.outputs.VERSION }}
91 | chart_version=${{ steps.get_version.outputs.VERSION }}
92 | sed -i "s/latest/${image_tag}/g" $HELM_CHARTS_DIR/values.yaml
93 | chart_semver=${chart_version#"v"}
94 | sed -i "s/0.1.0/${chart_semver}/g" $HELM_CHARTS_DIR/Chart.yaml
95 | 
96 | - uses: getsentry/action-github-app-token@v3
97 | id: get_app_token
98 | with:
99 | app_id: ${{ secrets.APP_ID }}
100 | private_key: ${{ secrets.APP_PRIVATE_KEY }}
101 | - name: Sync Chart Repo
102 | run: |
103 | git config --global user.email "baize.ai[bot]@users.noreply.github.com"
104 | git config --global user.name "baize.ai[bot]"
105 | git clone https://x-access-token:${{ steps.get_app_token.outputs.token }}@github.com/BaizeAI/charts.git baize-charts
106 | helm package $HELM_CHARTS_DIR --destination ./baize-charts/docs/
107 | helm repo index --url https://baizeai.github.io/charts ./baize-charts/docs/
108 | cd baize-charts/
109 | git add docs/
110 | chart_version=${{ steps.get_version.outputs.VERSION }}
111 | chart_semver=${chart_version#"v"}
112 | git commit -m "update dataset chart ${chart_semver}"
113 | git push https://x-access-token:${{ steps.get_app_token.outputs.token }}@github.com/BaizeAI/charts.git
114 | 
--------------------------------------------------------------------------------
/api/client/clientset.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2023.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | // Code generated by client-gen. DO NOT EDIT.
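//
// Editor's note: a minimal, hypothetical wiring sketch for this clientset
// (assumes a kubeconfig at the default location and a ctx; error handling elided):
//
//	cfg, _ := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
//	cs, _ := client.NewForConfig(cfg)
//	list, _ := cs.DatasetV1alpha1().Datasets("default").List(ctx, metav1.ListOptions{})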
17 | 18 | package client 19 | 20 | import ( 21 | fmt "fmt" 22 | http "net/http" 23 | 24 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/client/typed/dataset/v1alpha1" 25 | discovery "k8s.io/client-go/discovery" 26 | rest "k8s.io/client-go/rest" 27 | flowcontrol "k8s.io/client-go/util/flowcontrol" 28 | ) 29 | 30 | type Interface interface { 31 | Discovery() discovery.DiscoveryInterface 32 | DatasetV1alpha1() datasetv1alpha1.DatasetV1alpha1Interface 33 | } 34 | 35 | // Clientset contains the clients for groups. 36 | type Clientset struct { 37 | *discovery.DiscoveryClient 38 | datasetV1alpha1 *datasetv1alpha1.DatasetV1alpha1Client 39 | } 40 | 41 | // DatasetV1alpha1 retrieves the DatasetV1alpha1Client 42 | func (c *Clientset) DatasetV1alpha1() datasetv1alpha1.DatasetV1alpha1Interface { 43 | return c.datasetV1alpha1 44 | } 45 | 46 | // Discovery retrieves the DiscoveryClient 47 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 48 | if c == nil { 49 | return nil 50 | } 51 | return c.DiscoveryClient 52 | } 53 | 54 | // NewForConfig creates a new Clientset for the given config. 55 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 56 | // NewForConfig will generate a rate-limiter in configShallowCopy. 57 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 58 | // where httpClient was generated with rest.HTTPClientFor(c). 59 | func NewForConfig(c *rest.Config) (*Clientset, error) { 60 | configShallowCopy := *c 61 | 62 | if configShallowCopy.UserAgent == "" { 63 | configShallowCopy.UserAgent = rest.DefaultKubernetesUserAgent() 64 | } 65 | 66 | // share the transport between all clients 67 | httpClient, err := rest.HTTPClientFor(&configShallowCopy) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | return NewForConfigAndClient(&configShallowCopy, httpClient) 73 | } 74 | 75 | // NewForConfigAndClient creates a new Clientset for the given config and http client. 76 | // Note the http client provided takes precedence over the configured transport values. 77 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 78 | // NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 79 | func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { 80 | configShallowCopy := *c 81 | if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { 82 | if configShallowCopy.Burst <= 0 { 83 | return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") 84 | } 85 | configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) 86 | } 87 | 88 | var cs Clientset 89 | var err error 90 | cs.datasetV1alpha1, err = datasetv1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) 91 | if err != nil { 92 | return nil, err 93 | } 94 | 95 | cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) 96 | if err != nil { 97 | return nil, err 98 | } 99 | return &cs, nil 100 | } 101 | 102 | // NewForConfigOrDie creates a new Clientset for the given config and 103 | // panics if there is an error in the config. 104 | func NewForConfigOrDie(c *rest.Config) *Clientset { 105 | cs, err := NewForConfig(c) 106 | if err != nil { 107 | panic(err) 108 | } 109 | return cs 110 | } 111 | 112 | // New creates a new Clientset for the given RESTClient. 
113 | func New(c rest.Interface) *Clientset { 114 | var cs Clientset 115 | cs.datasetV1alpha1 = datasetv1alpha1.New(c) 116 | 117 | cs.DiscoveryClient = discovery.NewDiscoveryClient(c) 118 | return &cs 119 | } 120 | -------------------------------------------------------------------------------- /api/client/informers/dataset/v1alpha1/dataset.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // Code generated by informer-gen. DO NOT EDIT. 17 | 18 | package v1alpha1 19 | 20 | import ( 21 | context "context" 22 | time "time" 23 | 24 | client "github.com/BaizeAI/dataset/api/client" 25 | internalinterfaces "github.com/BaizeAI/dataset/api/client/informers/internalinterfaces" 26 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/client/listers/dataset/v1alpha1" 27 | apidatasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 28 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | runtime "k8s.io/apimachinery/pkg/runtime" 30 | watch "k8s.io/apimachinery/pkg/watch" 31 | cache "k8s.io/client-go/tools/cache" 32 | ) 33 | 34 | // DatasetInformer provides access to a shared informer and lister for 35 | // Datasets. 36 | type DatasetInformer interface { 37 | Informer() cache.SharedIndexInformer 38 | Lister() datasetv1alpha1.DatasetLister 39 | } 40 | 41 | type datasetInformer struct { 42 | factory internalinterfaces.SharedInformerFactory 43 | tweakListOptions internalinterfaces.TweakListOptionsFunc 44 | namespace string 45 | } 46 | 47 | // NewDatasetInformer constructs a new informer for Dataset type. 48 | // Always prefer using an informer factory to get a shared informer instead of getting an independent 49 | // one. This reduces memory footprint and number of connections to the server. 50 | func NewDatasetInformer(client client.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { 51 | return NewFilteredDatasetInformer(client, namespace, resyncPeriod, indexers, nil) 52 | } 53 | 54 | // NewFilteredDatasetInformer constructs a new informer for Dataset type. 55 | // Always prefer using an informer factory to get a shared informer instead of getting an independent 56 | // one. This reduces memory footprint and number of connections to the server. 
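//
// Editor's sketch (illustrative only; a shared informer factory remains
// preferable, as the comment above notes; cs and ctx are assumed):
//
//	inf := NewFilteredDatasetInformer(cs, "default", 30*time.Second,
//		cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, nil)
//	inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
//		AddFunc: func(obj interface{}) { /* a new Dataset was observed */ },
//	})
//	go inf.Run(ctx.Done())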
57 | func NewFilteredDatasetInformer(client client.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { 58 | return cache.NewSharedIndexInformer( 59 | &cache.ListWatch{ 60 | ListFunc: func(options v1.ListOptions) (runtime.Object, error) { 61 | if tweakListOptions != nil { 62 | tweakListOptions(&options) 63 | } 64 | return client.DatasetV1alpha1().Datasets(namespace).List(context.Background(), options) 65 | }, 66 | WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { 67 | if tweakListOptions != nil { 68 | tweakListOptions(&options) 69 | } 70 | return client.DatasetV1alpha1().Datasets(namespace).Watch(context.Background(), options) 71 | }, 72 | ListWithContextFunc: func(ctx context.Context, options v1.ListOptions) (runtime.Object, error) { 73 | if tweakListOptions != nil { 74 | tweakListOptions(&options) 75 | } 76 | return client.DatasetV1alpha1().Datasets(namespace).List(ctx, options) 77 | }, 78 | WatchFuncWithContext: func(ctx context.Context, options v1.ListOptions) (watch.Interface, error) { 79 | if tweakListOptions != nil { 80 | tweakListOptions(&options) 81 | } 82 | return client.DatasetV1alpha1().Datasets(namespace).Watch(ctx, options) 83 | }, 84 | }, 85 | &apidatasetv1alpha1.Dataset{}, 86 | resyncPeriod, 87 | indexers, 88 | ) 89 | } 90 | 91 | func (f *datasetInformer) defaultInformer(client client.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { 92 | return NewFilteredDatasetInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) 93 | } 94 | 95 | func (f *datasetInformer) Informer() cache.SharedIndexInformer { 96 | return f.factory.InformerFor(&apidatasetv1alpha1.Dataset{}, f.defaultInformer) 97 | } 98 | 99 | func (f *datasetInformer) Lister() datasetv1alpha1.DatasetLister { 100 | return datasetv1alpha1.NewDatasetLister(f.Informer().GetIndexer()) 101 | } 102 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_modelscope.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/url" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | "github.com/sirupsen/logrus" 12 | 13 | "github.com/BaizeAI/dataset/pkg/log" 14 | "github.com/BaizeAI/dataset/pkg/utils" 15 | ) 16 | 17 | var _ Loader = &ModelScopeLoader{} 18 | 19 | type ModelScopeLoader struct { 20 | Options Options 21 | 22 | modelScopeOptions ModelScopeLoaderOptions 23 | } 24 | 25 | func NewModelScopeLoader(datasourceOptions map[string]string, options Options, secrets Secrets) (*ModelScopeLoader, error) { 26 | modelScope := new(ModelScopeLoader) 27 | parsedOpts, err := modelScope.parseOptionsFromOptions(datasourceOptions) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | modelScope.Options = options 33 | modelScope.modelScopeOptions = parsedOpts 34 | modelScope.modelScopeOptions.token = secrets.Token 35 | 36 | return modelScope, nil 37 | } 38 | 39 | type ModelScopeLoaderOptions struct { 40 | Revision string `json:"revision"` 41 | RepoType string `json:"repoType"` 42 | Include string `json:"include"` 43 | Exclude string `json:"exclude"` 44 | 45 | token string 46 | } 47 | 48 | func (d *ModelScopeLoader) parseOptionsFromOptions(options map[string]string) (ModelScopeLoaderOptions, error) { 49 | jsonContent, err := json.Marshal(options) 50 | if 
err != nil { 51 | return ModelScopeLoaderOptions{}, err 52 | } 53 | 54 | var msOptions ModelScopeLoaderOptions 55 | err = json.Unmarshal(jsonContent, &msOptions) 56 | if err != nil { 57 | return ModelScopeLoaderOptions{}, err 58 | } 59 | 60 | return msOptions, nil 61 | } 62 | 63 | func (d *ModelScopeLoader) mapRepoTypeEnumStringToModelScopeRepoType(repoType string) string { 64 | switch repoType { 65 | case "MODEL", "model": 66 | return "model" 67 | case "DATASET", "dataset": 68 | return "dataset" 69 | default: 70 | return "" 71 | } 72 | } 73 | 74 | func (d *ModelScopeLoader) login(logger *logrus.Entry, token string) error { 75 | args := []string{ 76 | "login", 77 | "--token", 78 | token, 79 | } 80 | 81 | cmd := exec.Command("modelscope", args...) 82 | cmd.Env = os.Environ() 83 | 84 | _, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 85 | if err != nil { 86 | return err 87 | } 88 | 89 | return nil 90 | } 91 | 92 | func (d *ModelScopeLoader) Sync(fromURI string, toPath string) error { 93 | parsedURL, err := url.Parse(d.Options.URI) 94 | if err != nil { 95 | return err 96 | } 97 | if parsedURL.Scheme != "modelscope" { 98 | return fmt.Errorf("invalid scheme %s, only modelscope is supported", parsedURL.Scheme) 99 | } 100 | 101 | repoName := parsedURL.Host + parsedURL.Path 102 | repoType := d.mapRepoTypeEnumStringToModelScopeRepoType(d.modelScopeOptions.RepoType) 103 | 104 | logger := log.WithFields(logrus.Fields{ 105 | "fromURI": fromURI, 106 | "type": TypeModelScope, 107 | "toPath": toPath, 108 | "workingDirectory": d.Options.Root, 109 | "repoName": repoName, 110 | "revision": d.modelScopeOptions.Revision, 111 | "repoType": repoType, 112 | "include": d.modelScopeOptions.Include, 113 | "exclude": d.modelScopeOptions.Exclude, 114 | }) 115 | 116 | token := strings.TrimSpace(d.modelScopeOptions.token) 117 | 118 | logger.Debugf("performing modelscope download command to pull data from %s to %s", fromURI, toPath) 119 | 120 | if d.modelScopeOptions.token != "" { 121 | err = d.login(logger, token) 122 | if err != nil { 123 | return err 124 | } 125 | } 126 | 127 | args := []string{ 128 | "download", 129 | repoName, 130 | "--local_dir", 131 | toPath, 132 | } 133 | if repoType != "" { 134 | args = append(args, "--repo-type", repoType) 135 | } 136 | if d.modelScopeOptions.Include != "" { 137 | args = append(args, "--include", d.modelScopeOptions.Include) 138 | } 139 | if d.modelScopeOptions.Exclude != "" { 140 | args = append(args, "--exclude", d.modelScopeOptions.Exclude) 141 | } 142 | 143 | cmd := exec.Command("modelscope", args...) 
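	// Editor's note: with the flags appended above, the assembled invocation is
	// roughly (illustrative):
	//   modelscope download <org/name> --local_dir <toPath> [--repo-type model|dataset] [--include <glob>] [--exclude <glob>]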
144 | cmd.Dir = d.Options.Root 145 | 146 | logger = logger.WithField("command", cmd.String()) 147 | logger.Debug("executing command to download data from modelscope") 148 | 149 | cmd.Env = os.Environ() 150 | cmd.Env = append(cmd.Env, "DO_NOT_TRACK=1") // https://consoledonottrack.com/ 151 | 152 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, []string{token}) 153 | 154 | if err != nil { 155 | logger.Errorf("modelscope download command error: %s", errBuffer) 156 | return fmt.Errorf("failed to copy data from %s to %s with modelscope command %s, err: %s", fromURI, toPath, cmd.String(), err) 157 | } 158 | 159 | logger.Debugf("modelscope download command output: %s", outBuffer.String()) 160 | 161 | return nil 162 | } 163 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_http.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "encoding/base64" 5 | "fmt" 6 | "github.com/samber/lo" 7 | "net/url" 8 | "os" 9 | "os/exec" 10 | "strings" 11 | 12 | "github.com/sirupsen/logrus" 13 | 14 | "github.com/BaizeAI/dataset/pkg/log" 15 | "github.com/BaizeAI/dataset/pkg/utils" 16 | ) 17 | 18 | var _ Loader = &HTTPLoader{} 19 | 20 | type HTTPLoader struct { 21 | Options Options 22 | 23 | httpOptions HTTPLoaderOptions 24 | } 25 | 26 | func NewHTTPLoader(datasourceOptions map[string]string, options Options, secrets Secrets) (*HTTPLoader, error) { 27 | h := new(HTTPLoader) 28 | 29 | h.Options = options 30 | 31 | _, err := url.Parse(options.URI) 32 | if err != nil { 33 | return nil, fmt.Errorf("failed to parse uri %s: %w", options.URI, err) 34 | } 35 | 36 | h.httpOptions.basicAuthUsername = secrets.Username 37 | h.httpOptions.basicAuthPassword = secrets.Password 38 | h.httpOptions.SyncMode = lo.CoalesceOrEmpty(datasourceOptions["syncMode"], "sync") 39 | 40 | err = h.validateOptions(h.httpOptions) 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | return h, nil 46 | } 47 | 48 | type HTTPLoaderOptions struct { 49 | SyncMode string `json:"syncMode"` 50 | 51 | basicAuthUsername string 52 | basicAuthPassword string 53 | 54 | fromURI string 55 | } 56 | 57 | func (d *HTTPLoader) validateOptions(options HTTPLoaderOptions) error { 58 | if options.SyncMode != "" && options.SyncMode != "sync" && options.SyncMode != "copy" { 59 | return fmt.Errorf("invalid syncMode '%s', must be 'sync' or 'copy'", options.SyncMode) 60 | } 61 | return nil 62 | } 63 | 64 | func (d *HTTPLoader) configTouch() error { 65 | return rcloneCliConfigTouch() 66 | } 67 | 68 | func (d *HTTPLoader) configCreate(configName string) error { 69 | logger := log.WithFields(logrus.Fields{ 70 | "configName": configName, 71 | "type": TypeHTTP, 72 | "fromURI": d.httpOptions.fromURI, 73 | }) 74 | 75 | args := []string{ 76 | "config", 77 | "create", 78 | configName, 79 | "http", 80 | strings.Join([]string{"url", d.httpOptions.fromURI}, "="), 81 | } 82 | 83 | cmd := exec.Command("rclone", args...) 
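	// Editor's note: the args above expand to roughly (illustrative):
	//   rclone config create <configName> http url=<fromURI>
	// which registers a named rclone remote backed by the plain-HTTP backend.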
84 | 85 | logger = logger.WithField("command", cmd.String()) 86 | logger.Debug("executing command to create a new rclone config") 87 | 88 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, nil) 89 | 90 | if err != nil { 91 | logger.Errorf("rclone config create command error: %s", errBuffer) 92 | return err 93 | } 94 | logger.Debugf("rclone config create command output: %s", outBuffer.String()) 95 | 96 | return nil 97 | } 98 | 99 | // From https://cs.opensource.google/go/go/+/refs/tags/go1.21.5:src/net/http/client.go;l=426 100 | func basicAuth(username, password string) string { 101 | auth := username + ":" + password 102 | return base64.StdEncoding.EncodeToString([]byte(auth)) 103 | } 104 | 105 | func (d *HTTPLoader) Sync(fromURI string, toPath string) error { 106 | _, err := url.Parse(fromURI) 107 | if err != nil { 108 | return fmt.Errorf("failed to parse uri %s: %w", fromURI, err) 109 | } 110 | 111 | logger := log.WithFields(logrus.Fields{ 112 | "fromURI": fromURI, 113 | "type": TypeHTTP, 114 | "toPath": toPath, 115 | "workingDirectory": d.Options.Root, 116 | }) 117 | 118 | basicAuthUsername := strings.TrimSpace(d.httpOptions.basicAuthUsername) 119 | basicAuthPassword := strings.TrimSpace(d.httpOptions.basicAuthPassword) 120 | basicAuthBase64 := basicAuth(basicAuthUsername, basicAuthPassword) 121 | 122 | logger.Debugf("performing rclone copy command to copy data served by HTTP") 123 | 124 | err = d.configTouch() 125 | if err != nil { 126 | return err 127 | } 128 | 129 | configName := fmt.Sprintf("baize-data-loader-copy-config-%s", utils.RandomHashString(8)) 130 | d.httpOptions.fromURI = fromURI 131 | 132 | err = d.configCreate(configName) 133 | if err != nil { 134 | return err 135 | } 136 | 137 | syncMode := d.httpOptions.SyncMode 138 | 139 | args := []string{ 140 | syncMode, 141 | fmt.Sprintf("%s:", configName), 142 | toPath, 143 | } 144 | 145 | args = append(args, "-vvv") 146 | cmd := exec.Command("rclone", args...) 147 | cmd.Dir = d.Options.Root 148 | 149 | logger = logger.WithField("command", cmd.String()) 150 | logger.Debug("executing command to copy data") 151 | 152 | cmd.Env = os.Environ() 153 | 154 | if basicAuthUsername != "" && basicAuthPassword != "" { 155 | cmd.Env = append(cmd.Env, fmt.Sprintf("RCLONE_HTTP_HEADERS=Authorization,Basic %s", basicAuthBase64)) 156 | } 157 | 158 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, []string{basicAuthBase64}) 159 | if err != nil { 160 | logger.Errorf("rclone copy command error: %s", errBuffer) 161 | return fmt.Errorf("failed to copy data from %s to %s with rclone command %s, err: %s", fromURI, toPath, cmd.String(), err) 162 | } 163 | logger.Debugf("rclone copy command output: %s", outBuffer.String()) 164 | 165 | return nil 166 | } 167 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package main
18 | 
19 | import (
20 | "flag"
21 | config2 "github.com/BaizeAI/dataset/config"
22 | "github.com/samber/lo"
23 | "os"
24 | 
25 | // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
26 | // to ensure that exec-entrypoint and run can make use of them.
27 | _ "k8s.io/client-go/plugin/pkg/client/auth"
28 | 
29 | "k8s.io/apimachinery/pkg/runtime"
30 | utilruntime "k8s.io/apimachinery/pkg/util/runtime"
31 | clientgoscheme "k8s.io/client-go/kubernetes/scheme"
32 | ctrl "sigs.k8s.io/controller-runtime"
33 | "sigs.k8s.io/controller-runtime/pkg/healthz"
34 | "sigs.k8s.io/controller-runtime/pkg/log/zap"
35 | metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
36 | 
37 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1"
38 | 
39 | datasetcontroller "github.com/BaizeAI/dataset/internal/controller/dataset"
40 | //+kubebuilder:scaffold:imports
41 | )
42 | 
43 | var (
44 | scheme = runtime.NewScheme()
45 | setupLog = ctrl.Log.WithName("setup")
46 | )
47 | 
48 | func init() {
49 | utilruntime.Must(clientgoscheme.AddToScheme(scheme))
50 | 
51 | utilruntime.Must(datasetv1alpha1.AddToScheme(scheme))
52 | 
53 | //+kubebuilder:scaffold:scheme
54 | }
55 | 
56 | func main() {
57 | var metricsAddr string
58 | var enableLeaderElection bool
59 | var probeAddr string
60 | var config string
61 | flag.StringVar(&metricsAddr, "metrics-bind-address", ":8082", "The address the metric endpoint binds to.")
62 | flag.StringVar(&probeAddr, "health-probe-bind-address", ":8083", "The address the probe endpoint binds to.")
63 | flag.StringVar(&config, "config", "config/config.yaml", "The path of the config file.")
64 | flag.BoolVar(&enableLeaderElection, "leader-elect", true,
65 | "Enable leader election for controller manager. "+
66 | "Enabling this will ensure there is only one active controller manager.")
67 | opts := zap.Options{
68 | Development: true,
69 | }
70 | opts.BindFlags(flag.CommandLine)
71 | flag.Parse()
72 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
73 | if err := config2.ParseConfigFromFile(config); err != nil {
74 | setupLog.Error(err, "unable to load config")
75 | os.Exit(1)
76 | }
77 | 
78 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
79 | Scheme: scheme,
80 | Metrics: metricsserver.Options{BindAddress: metricsAddr},
81 | HealthProbeBindAddress: probeAddr,
82 | LeaderElection: enableLeaderElection,
83 | LeaderElectionID: "d227e0e2.baizeai.io",
84 | LeaderElectionNamespace: lo.CoalesceOrEmpty(os.Getenv("POD_NAMESPACE"), func() string {
85 | bs, _ := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace")
86 | return string(bs)
87 | }(), "default"),
88 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
89 | // when the Manager ends. This requires the binary to end immediately when the
90 | // Manager is stopped; otherwise, this setting is unsafe. Setting this significantly
91 | // speeds up voluntary leader transitions, as the new leader doesn't have to wait
92 | // for the LeaseDuration to elapse first.
93 | //
94 | // In the default scaffold provided, the program ends immediately after
95 | // the manager stops, so it would be fine to enable this option. However,
96 | // if you are doing, or intend to do, any operation such as performing cleanups
97 | // after the manager stops, then its usage might be unsafe.
98 | // LeaderElectionReleaseOnCancel: true, 99 | }) 100 | if err != nil { 101 | setupLog.Error(err, "unable to start manager") 102 | os.Exit(1) 103 | } 104 | if err = (&datasetcontroller.DatasetReconciler{ 105 | Client: mgr.GetClient(), 106 | Scheme: mgr.GetScheme(), 107 | }).SetupWithManager(mgr); err != nil { 108 | setupLog.Error(err, "unable to create controller", "controller", "Dataset") 109 | os.Exit(1) 110 | } 111 | //+kubebuilder:scaffold:builder 112 | 113 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 114 | setupLog.Error(err, "unable to set up health check") 115 | os.Exit(1) 116 | } 117 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 118 | setupLog.Error(err, "unable to set up ready check") 119 | os.Exit(1) 120 | } 121 | 122 | setupLog.Info("starting manager") 123 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 124 | setupLog.Error(err, "problem running manager") 125 | os.Exit(1) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /internal/pkg/datasources/conda/conda.go: -------------------------------------------------------------------------------- 1 | package conda 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | "regexp" 11 | "strings" 12 | 13 | "github.com/sirupsen/logrus" 14 | 15 | "github.com/BaizeAI/dataset/pkg/utils" 16 | ) 17 | 18 | type MambaCLI struct { 19 | envs map[string]string 20 | } 21 | 22 | func NewMambaCLI() *MambaCLI { 23 | return &MambaCLI{ 24 | envs: map[string]string{ 25 | "always_yes": "true", 26 | }, 27 | } 28 | } 29 | 30 | func (c *MambaCLI) newCommand(args ...string) *exec.Cmd { 31 | cmd := exec.Command("mamba", args...) 32 | cmd.Env = os.Environ() 33 | cmd.Env = append(cmd.Env, c.GetEnvs()...) 34 | 35 | return cmd 36 | } 37 | 38 | func (c *MambaCLI) GetEnvs() []string { 39 | envs := make([]string, 0, len(c.envs)) 40 | for k, v := range c.envs { 41 | envs = append(envs, fmt.Sprintf("%s=%s", fmt.Sprintf("MAMBA_%s", strings.ToUpper(k)), v)) 42 | envs = append(envs, fmt.Sprintf("%s=%s", fmt.Sprintf("CONDA_%s", strings.ToUpper(k)), v)) 43 | } 44 | 45 | return envs 46 | } 47 | 48 | // Version returns the version of conda 49 | // Equivalent to `conda --version` 50 | func (c *MambaCLI) Version(logger *logrus.Entry) (string, error) { 51 | args := []string{ 52 | "--version", 53 | } 54 | 55 | cmd := c.newCommand(args...) 56 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 57 | if err != nil { 58 | return "", err 59 | } 60 | 61 | outputString := strings.TrimSpace(output.String()) 62 | output.Reset() 63 | 64 | return strings.TrimSpace(outputString), nil 65 | } 66 | 67 | // Info returns the conda info 68 | // Equivalent to `conda info --json` 69 | func (c *MambaCLI) Info(logger *logrus.Entry) (*CondaInfoOutputRaw, error) { 70 | args := []string{ 71 | "info", 72 | "--json", 73 | } 74 | 75 | cmd := c.newCommand(args...) 
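	// Editor's note: combined with the variables from GetEnvs, this is roughly
	// equivalent to (illustrative):
	//   MAMBA_ALWAYS_YES=true CONDA_ALWAYS_YES=true mamba info --json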
76 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | defer output.Reset() 82 | 83 | var info CondaInfoOutputRaw 84 | err = json.Unmarshal(output.Bytes(), &info) 85 | if err != nil { 86 | return nil, err 87 | } 88 | 89 | return &info, nil 90 | } 91 | 92 | // EnvList returns the list of conda environments 93 | // Equivalent to `conda env list` 94 | func (c *MambaCLI) EnvList(logger *logrus.Entry) ([]CondaEnvListOutputEnv, error) { 95 | args := []string{ 96 | "env", 97 | "list", 98 | "--json", 99 | } 100 | 101 | cmd := c.newCommand(args...) 102 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 103 | if err != nil { 104 | return make([]CondaEnvListOutputEnv, 0), err 105 | } 106 | 107 | defer output.Reset() 108 | 109 | var envList CondaEnvListOutputRaw 110 | err = json.Unmarshal(output.Bytes(), &envList) 111 | if err != nil { 112 | return make([]CondaEnvListOutputEnv, 0), err 113 | } 114 | 115 | envs := make([]CondaEnvListOutputEnv, 0, len(envList.Envs)) 116 | for _, env := range envList.Envs { 117 | envs = append(envs, CondaEnvListOutputEnv{ 118 | Name: filepath.Base(env), 119 | Path: env, 120 | }) 121 | } 122 | 123 | return envs, nil 124 | } 125 | 126 | func (c *MambaCLI) ConfigSetShowChannelURLs(logger *logrus.Entry) { 127 | c.envs["show_channel_urls"] = "true" 128 | } 129 | 130 | // ConfigPrependPkgsDir prepends the pkgs_dir to the conda config 131 | // Equivalent to `conda config --prepend pkgs_dirs ` 132 | func (c *MambaCLI) ConfigPrependPkgsDir(logger *logrus.Entry, pkgsDir string) { 133 | c.envs["pkgs_dirs"] = pkgsDir 134 | } 135 | 136 | // ConfigPrependEnvsDir prepends the envs_dir to the conda config 137 | // Equivalent to `conda config --prepend envs_dirs ` 138 | func (c *MambaCLI) ConfigPrependEnvsDir(logger *logrus.Entry, envsDir string) { 139 | c.envs["envs_dirs"] = envsDir 140 | } 141 | 142 | var prefixAlreadyExistsErrRegexp = regexp.MustCompile(`^\sCondaValueError: prefix already exists: /.*\s\s$`) 143 | 144 | func (c *MambaCLI) IsPrefixAlreadyExistsError(errBuffer *bytes.Buffer) bool { 145 | return prefixAlreadyExistsErrRegexp.Match(errBuffer.Bytes()) 146 | } 147 | 148 | // CreateEnvFromFile creates a new conda environment from a file 149 | // Equivalent to `conda env create --file --verbose -y` 150 | func (c *MambaCLI) CreateEnvFromFile(logger *logrus.Entry, file string) error { 151 | args := []string{ 152 | "env", 153 | "create", 154 | "--file", 155 | file, 156 | "--verbose", 157 | } 158 | 159 | cmd := c.newCommand(args...) 160 | _, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, []string{}) 161 | if err != nil { 162 | if c.IsPrefixAlreadyExistsError(errBuffer) { 163 | return nil 164 | } 165 | 166 | return err 167 | } 168 | 169 | return nil 170 | } 171 | 172 | // CleanAll cleans all conda packages 173 | // Equivalent to `conda clean --all -y` 174 | func (c *MambaCLI) CleanAll(logger *logrus.Entry) error { 175 | args := []string{ 176 | "clean", 177 | "--all", 178 | "-y", 179 | } 180 | 181 | cmd := c.newCommand(args...) 
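	// Editor's note: equivalent shell form (illustrative): mamba clean --all -y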
182 | err := utils.ExecuteCommand(logger, cmd, []string{}) 183 | if err != nil { 184 | return err 185 | } 186 | 187 | return nil 188 | } 189 | -------------------------------------------------------------------------------- /internal/pkg/datasources/huggingface/hub.go: -------------------------------------------------------------------------------- 1 | package huggingface 2 | 3 | //go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 -generate 4 | 5 | import ( 6 | "bytes" 7 | "context" 8 | "encoding/json" 9 | "net/http" 10 | "time" 11 | ) 12 | 13 | const ( 14 | HubAPIEndpointScheme = "https://" 15 | HubAPIEndpointDomain = "huggingface.co" 16 | 17 | hubAPIEndpointPathWhoAmI = "/api/whoami-v2" 18 | ) 19 | 20 | type HfAPIAccessToken struct { 21 | CreatedAt time.Time `json:"createdAt"` 22 | DisplayName string `json:"displayName"` 23 | Role string `json:"role"` 24 | } 25 | 26 | type HfAPIWhoAmIResponseAuth struct { 27 | AccessToken HfAPIAccessToken `json:"accessToken"` 28 | Type string `json:"type"` 29 | } 30 | 31 | type HfAPIWhoAmIResponse struct { 32 | Auth HfAPIWhoAmIResponseAuth `json:"auth"` 33 | AvatarURL string `json:"avatarUrl"` 34 | CanPay bool `json:"canPay"` 35 | Email string `json:"email"` 36 | EmailVerified bool `json:"emailVerified"` 37 | Fullname string `json:"fullname"` 38 | ID string `json:"id"` 39 | IsPro bool `json:"isPro"` 40 | Name string `json:"name"` 41 | Type string `json:"type"` 42 | } 43 | 44 | type HfAPIErrorResponse struct { 45 | Error string `json:"error"` 46 | } 47 | 48 | type HfAPIError struct { 49 | HfAPIErrorResponse 50 | } 51 | 52 | func (e *HfAPIError) Error() string { 53 | return e.HfAPIErrorResponse.Error 54 | } 55 | 56 | func IsHfAPIError(err error) bool { 57 | _, ok := err.(*HfAPIError) 58 | return ok 59 | } 60 | 61 | //counterfeiter:generate -o fake/hub.go --fake-name FakeHfAPI . HfAPI 62 | type HfAPI interface { 63 | WhoAmI(ctx context.Context, token string) (*HfAPIWhoAmIResponse, error) 64 | } 65 | 66 | type HfAPIClient struct { 67 | client *http.Client 68 | apiEndpoint string 69 | } 70 | 71 | // NewHfAPIClient creates a new HfAPIClient. 72 | // 73 | // Source code: https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/hf_api.py#L1493-L1535 74 | func NewHfAPIClient() *HfAPIClient { 75 | return &HfAPIClient{ 76 | client: &http.Client{}, 77 | } 78 | } 79 | 80 | func (c *HfAPIClient) endpoint() string { 81 | if c.apiEndpoint == "" { 82 | return HubAPIEndpointScheme + HubAPIEndpointDomain 83 | } 84 | 85 | return c.apiEndpoint 86 | } 87 | 88 | // WhoAmI returns the current user. 
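// It issues an authenticated GET request to the /api/whoami-v2 endpoint.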
89 | // 90 | // Source code: https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/hf_api.py#L1578-L1607 91 | func (c *HfAPIClient) WhoAmI(ctx context.Context, token string) (*HfAPIWhoAmIResponse, error) { 92 | req, err := http.NewRequest(http.MethodGet, c.endpoint()+hubAPIEndpointPathWhoAmI, nil) 93 | if err != nil { 94 | return nil, err 95 | } 96 | 97 | req = req.WithContext(ctx) 98 | req.Header = c.buildHfHeaders(token) 99 | 100 | resp, err := c.client.Do(req) 101 | if err != nil { 102 | return nil, err 103 | } 104 | 105 | defer func() { 106 | _ = resp.Body.Close() 107 | }() 108 | 109 | bodyBuffer := new(bytes.Buffer) 110 | _, err = bodyBuffer.ReadFrom(resp.Body) 111 | if err != nil { 112 | return nil, err 113 | } 114 | 115 | var errResponse HfAPIErrorResponse 116 | err = json.Unmarshal(bodyBuffer.Bytes(), &errResponse) 117 | if err != nil { 118 | return nil, err 119 | } 120 | if errResponse.Error != "" { 121 | return nil, &HfAPIError{errResponse} 122 | } 123 | 124 | var whoAmIResponse HfAPIWhoAmIResponse 125 | err = json.Unmarshal(bodyBuffer.Bytes(), &whoAmIResponse) 126 | if err != nil { 127 | return nil, err 128 | } 129 | 130 | return &whoAmIResponse, nil 131 | } 132 | 133 | // Documentations: https://huggingface.co/docs/huggingface_hub/quick-start#authentication 134 | // Source code: https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/hf_api.py#L1609-L1629 135 | // References: 136 | // - https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/_login.py#L50-L115 137 | // - https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/_login.py#L299-L330 138 | func (c *HfAPIClient) GetTokenPermission(ctx context.Context, token string) (string, error) { 139 | whoAmI, err := c.WhoAmI(ctx, token) 140 | if err != nil { 141 | return "", err 142 | } 143 | 144 | return whoAmI.Auth.AccessToken.Role, nil 145 | } 146 | 147 | // Source code: https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/utils/_headers.py#L39-L139 148 | // Reference: 149 | // - https://github.com/huggingface/huggingface_hub/blob/8d1ffc6d78827aa18c4fec3f73843ac7bb64a153/src/huggingface_hub/hf_api.py#L9399-L9421 150 | func (c *HfAPIClient) buildHfHeaders(token string) http.Header { 151 | return http.Header{ 152 | "Authorization": []string{"Bearer " + token}, 153 | "User-Agent": []string{"hf_hub/4.0.0"}, 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /pkg/utils/fs_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/sirupsen/logrus" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestIsSymlink(t *testing.T) { 13 | // Create a temporary directory 14 | tempDir, err := os.MkdirTemp("", "test") 15 | assert.NoError(t, err) 16 | defer func() { 17 | assert.NoError(t, os.RemoveAll(tempDir)) 18 | }() 19 | 20 | // Create a regular file 21 | regularFile := filepath.Join(tempDir, "regular") 22 | err = os.WriteFile(regularFile, []byte("test"), 0600) 23 | assert.NoError(t, err) 24 | 25 | // Create a symlink 26 | symlinkFile := filepath.Join(tempDir, "symlink") 27 | err = os.Symlink(regularFile, symlinkFile) 28 | assert.NoError(t, err) 29 | 30 | // Test 
regular file 31 | regularInfo, err := os.Lstat(regularFile) 32 | assert.NoError(t, err) 33 | assert.False(t, IsSymlink(regularInfo)) 34 | 35 | // Test symlink 36 | symlinkInfo, err := os.Lstat(symlinkFile) 37 | assert.NoError(t, err) 38 | assert.True(t, IsSymlink(symlinkInfo)) 39 | } 40 | 41 | func TestIsPermModeMatched(t *testing.T) { 42 | tempFile, err := os.CreateTemp("", "test") 43 | assert.NoError(t, err) 44 | defer func() { 45 | assert.NoError(t, os.Remove(tempFile.Name())) 46 | }() 47 | 48 | err = tempFile.Chmod(0644) 49 | assert.NoError(t, err) 50 | 51 | info, err := os.Stat(tempFile.Name()) 52 | assert.NoError(t, err) 53 | 54 | assert.True(t, IsPermModeMatched(info, 0644)) 55 | assert.False(t, IsPermModeMatched(info, 0755)) 56 | } 57 | 58 | func TestReadSymbolicLinkUntilRealPath(t *testing.T) { 59 | // Create a temporary directory 60 | tempDir, err := os.MkdirTemp("", "test") 61 | assert.NoError(t, err) 62 | defer func() { 63 | assert.NoError(t, os.RemoveAll(tempDir)) 64 | }() 65 | 66 | // Resolve the temporary directory to its real path 67 | realTempDir, err := filepath.EvalSymlinks(tempDir) 68 | assert.NoError(t, err) 69 | 70 | // Create a regular file 71 | regularFile := filepath.Join(realTempDir, "regular") 72 | err = os.WriteFile(regularFile, []byte("test"), 0600) 73 | assert.NoError(t, err) 74 | 75 | // Create a symlink to the regular file 76 | symlink1 := filepath.Join(realTempDir, "symlink1") 77 | err = os.Symlink(regularFile, symlink1) 78 | assert.NoError(t, err) 79 | 80 | // Create a symlink to the first symlink 81 | symlink2 := filepath.Join(realTempDir, "symlink2") 82 | err = os.Symlink(symlink1, symlink2) 83 | assert.NoError(t, err) 84 | 85 | // Test regular file 86 | path, err := readSymbolicLinkUntilRealPath(regularFile) 87 | assert.NoError(t, err) 88 | assert.Equal(t, regularFile, path) 89 | 90 | // Test symlink 91 | path, err = readSymbolicLinkUntilRealPath(symlink2) 92 | assert.NoError(t, err) 93 | assert.Equal(t, regularFile, path) 94 | 95 | // Test non-existent file 96 | _, err = readSymbolicLinkUntilRealPath(filepath.Join(realTempDir, "non-existent")) 97 | assert.Error(t, err) 98 | } 99 | 100 | func TestChmodIfUnmatched(t *testing.T) { 101 | tempFile, err := os.CreateTemp("", "test") 102 | assert.NoError(t, err) 103 | defer func() { 104 | assert.NoError(t, os.Remove(tempFile.Name())) 105 | }() 106 | 107 | logger := logrus.NewEntry(logrus.New()) 108 | 109 | // Set initial permissions 110 | err = tempFile.Chmod(0644) 111 | assert.NoError(t, err) 112 | 113 | info, err := os.Stat(tempFile.Name()) 114 | assert.NoError(t, err) 115 | 116 | // Test when permissions already match 117 | err = ChmodIfUnmatched(logger, tempFile.Name(), info, 0644) 118 | assert.NoError(t, err) 119 | 120 | // Test when permissions don't match 121 | err = ChmodIfUnmatched(logger, tempFile.Name(), info, 0755) 122 | assert.NoError(t, err) 123 | 124 | newInfo, err := os.Stat(tempFile.Name()) 125 | assert.NoError(t, err) 126 | assert.Equal(t, os.FileMode(0755), newInfo.Mode().Perm()) 127 | } 128 | 129 | func TestChmodAndChownRecursively(t *testing.T) { 130 | tempDir, err := os.MkdirTemp("", "test") 131 | assert.NoError(t, err) 132 | defer func() { 133 | assert.NoError(t, os.RemoveAll(tempDir)) 134 | }() 135 | 136 | logger := logrus.NewEntry(logrus.New()) 137 | 138 | err = os.Mkdir(filepath.Join(tempDir, "subdir"), 0755) 139 | assert.NoError(t, err) 140 | err = os.WriteFile(filepath.Join(tempDir, "file1"), []byte("test"), 0600) 141 | assert.NoError(t, err) 142 | err = 
os.WriteFile(filepath.Join(tempDir, "subdir", "file2"), []byte("test"), 0600) 143 | assert.NoError(t, err) 144 | 145 | err = ChmodAndChownRecursively(logger, tempDir, os.Getuid(), os.Getgid(), 0755) 146 | assert.NoError(t, err) 147 | 148 | // Check permissions (ownership can't be reliably tested without root) 149 | checkPerm := func(path string, expected os.FileMode) { 150 | info, err := os.Stat(path) 151 | assert.NoError(t, err) 152 | assert.Equal(t, expected, info.Mode().Perm()) 153 | } 154 | 155 | checkPerm(tempDir, 0755) 156 | checkPerm(filepath.Join(tempDir, "subdir"), 0755) 157 | checkPerm(filepath.Join(tempDir, "file1"), 0755) 158 | checkPerm(filepath.Join(tempDir, "subdir", "file2"), 0755) 159 | } 160 | 161 | func TestCleanupNotExistingSymlinks(t *testing.T) { 162 | tempDir, err := os.MkdirTemp("", "test") 163 | assert.NoError(t, err) 164 | defer func() { 165 | assert.NoError(t, os.RemoveAll(tempDir)) 166 | }() 167 | 168 | logger := logrus.NewEntry(logrus.New()) 169 | 170 | // Create a regular file 171 | regularFile := filepath.Join(tempDir, "regular") 172 | err = os.WriteFile(regularFile, []byte("test"), 0600) 173 | assert.NoError(t, err) 174 | 175 | // Create a valid symlink 176 | validSymlink := filepath.Join(tempDir, "valid_symlink") 177 | err = os.Symlink(regularFile, validSymlink) 178 | assert.NoError(t, err) 179 | 180 | // Create a dangling symlink 181 | danglingSymlink := filepath.Join(tempDir, "dangling_symlink") 182 | err = os.Symlink(filepath.Join(tempDir, "non_existent"), danglingSymlink) 183 | assert.NoError(t, err) 184 | 185 | err = CleanupNotExistingSymlinks(logger, tempDir) 186 | assert.NoError(t, err) 187 | 188 | // Check that the valid symlink still exists 189 | _, err = os.Stat(validSymlink) 190 | assert.NoError(t, err) 191 | 192 | // Check that the dangling symlink has been removed 193 | _, err = os.Stat(danglingSymlink) 194 | assert.True(t, os.IsNotExist(err)) 195 | } 196 | -------------------------------------------------------------------------------- /api/dataset/v1alpha1/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | 3 | /* 4 | Copyright 2023. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // Code generated by controller-gen. DO NOT EDIT. 20 | 21 | package v1alpha1 22 | 23 | import ( 24 | "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | runtime "k8s.io/apimachinery/pkg/runtime" 26 | ) 27 | 28 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 29 | func (in *DataLoadStatus) DeepCopyInto(out *DataLoadStatus) { 30 | *out = *in 31 | in.StartTime.DeepCopyInto(&out.StartTime) 32 | in.EndTime.DeepCopyInto(&out.EndTime) 33 | } 34 | 35 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataLoadStatus. 
36 | func (in *DataLoadStatus) DeepCopy() *DataLoadStatus { 37 | if in == nil { 38 | return nil 39 | } 40 | out := new(DataLoadStatus) 41 | in.DeepCopyInto(out) 42 | return out 43 | } 44 | 45 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 46 | func (in *Dataset) DeepCopyInto(out *Dataset) { 47 | *out = *in 48 | out.TypeMeta = in.TypeMeta 49 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 50 | in.Spec.DeepCopyInto(&out.Spec) 51 | in.Status.DeepCopyInto(&out.Status) 52 | } 53 | 54 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Dataset. 55 | func (in *Dataset) DeepCopy() *Dataset { 56 | if in == nil { 57 | return nil 58 | } 59 | out := new(Dataset) 60 | in.DeepCopyInto(out) 61 | return out 62 | } 63 | 64 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 65 | func (in *Dataset) DeepCopyObject() runtime.Object { 66 | if c := in.DeepCopy(); c != nil { 67 | return c 68 | } 69 | return nil 70 | } 71 | 72 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 73 | func (in *DatasetList) DeepCopyInto(out *DatasetList) { 74 | *out = *in 75 | out.TypeMeta = in.TypeMeta 76 | in.ListMeta.DeepCopyInto(&out.ListMeta) 77 | if in.Items != nil { 78 | in, out := &in.Items, &out.Items 79 | *out = make([]Dataset, len(*in)) 80 | for i := range *in { 81 | (*in)[i].DeepCopyInto(&(*out)[i]) 82 | } 83 | } 84 | } 85 | 86 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetList. 87 | func (in *DatasetList) DeepCopy() *DatasetList { 88 | if in == nil { 89 | return nil 90 | } 91 | out := new(DatasetList) 92 | in.DeepCopyInto(out) 93 | return out 94 | } 95 | 96 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 97 | func (in *DatasetList) DeepCopyObject() runtime.Object { 98 | if c := in.DeepCopy(); c != nil { 99 | return c 100 | } 101 | return nil 102 | } 103 | 104 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 105 | func (in *DatasetSource) DeepCopyInto(out *DatasetSource) { 106 | *out = *in 107 | if in.Options != nil { 108 | in, out := &in.Options, &out.Options 109 | *out = make(map[string]string, len(*in)) 110 | for key, val := range *in { 111 | (*out)[key] = val 112 | } 113 | } 114 | } 115 | 116 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetSource. 117 | func (in *DatasetSource) DeepCopy() *DatasetSource { 118 | if in == nil { 119 | return nil 120 | } 121 | out := new(DatasetSource) 122 | in.DeepCopyInto(out) 123 | return out 124 | } 125 | 126 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 127 | func (in *DatasetSpec) DeepCopyInto(out *DatasetSpec) { 128 | *out = *in 129 | if in.ShareToNamespaceSelector != nil { 130 | in, out := &in.ShareToNamespaceSelector, &out.ShareToNamespaceSelector 131 | *out = new(v1.LabelSelector) 132 | (*in).DeepCopyInto(*out) 133 | } 134 | in.Source.DeepCopyInto(&out.Source) 135 | out.MountOptions = in.MountOptions 136 | in.VolumeClaimTemplate.DeepCopyInto(&out.VolumeClaimTemplate) 137 | } 138 | 139 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetSpec. 
140 | func (in *DatasetSpec) DeepCopy() *DatasetSpec { 141 | if in == nil { 142 | return nil 143 | } 144 | out := new(DatasetSpec) 145 | in.DeepCopyInto(out) 146 | return out 147 | } 148 | 149 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 150 | func (in *DatasetStatus) DeepCopyInto(out *DatasetStatus) { 151 | *out = *in 152 | if in.Conditions != nil { 153 | in, out := &in.Conditions, &out.Conditions 154 | *out = make([]v1.Condition, len(*in)) 155 | for i := range *in { 156 | (*in)[i].DeepCopyInto(&(*out)[i]) 157 | } 158 | } 159 | if in.SyncRoundStatuses != nil { 160 | in, out := &in.SyncRoundStatuses, &out.SyncRoundStatuses 161 | *out = make([]DataLoadStatus, len(*in)) 162 | for i := range *in { 163 | (*in)[i].DeepCopyInto(&(*out)[i]) 164 | } 165 | } 166 | in.LastSyncTime.DeepCopyInto(&out.LastSyncTime) 167 | } 168 | 169 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetStatus. 170 | func (in *DatasetStatus) DeepCopy() *DatasetStatus { 171 | if in == nil { 172 | return nil 173 | } 174 | out := new(DatasetStatus) 175 | in.DeepCopyInto(out) 176 | return out 177 | } 178 | 179 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 180 | func (in *MountOptions) DeepCopyInto(out *MountOptions) { 181 | *out = *in 182 | } 183 | 184 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MountOptions. 185 | func (in *MountOptions) DeepCopy() *MountOptions { 186 | if in == nil { 187 | return nil 188 | } 189 | out := new(MountOptions) 190 | in.DeepCopyInto(out) 191 | return out 192 | } 193 | -------------------------------------------------------------------------------- /internal/cmd/dataloader/root.go: -------------------------------------------------------------------------------- 1 | package dataloader 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/samber/lo" 12 | "github.com/spf13/cobra" 13 | 14 | "github.com/BaizeAI/dataset/internal/pkg/constants" 15 | "github.com/BaizeAI/dataset/internal/pkg/datasources" 16 | "github.com/BaizeAI/dataset/pkg/log" 17 | "github.com/BaizeAI/dataset/pkg/utils" 18 | ) 19 | 20 | func NewCommand() *cobra.Command { 21 | supportedTypesOfDataSourcesCommandHelpStr := strings.Join(datasources.SupportedTypesString, "|") 22 | 23 | rootCmd := &cobra.Command{ 24 | Use: fmt.Sprintf("data-loader [%s] ", supportedTypesOfDataSourcesCommandHelpStr), 25 | Short: "Load datasets from various data sources", 26 | } 27 | 28 | flags := new(CommandFlags) 29 | 30 | rootCmd.Flags().StringVar(&flags.MountPath, "mount-path", "", "Mount path for data source to copy to") 31 | rootCmd.Flags().StringVar(&flags.MountMode, "mount-mode", "0755", "Mount mode for data source to copy to") 32 | rootCmd.Flags().IntVar(&flags.MountUID, "mount-uid", 1000, "Mount UID for data source to copy to") 33 | rootCmd.Flags().IntVar(&flags.MountGID, "mount-gid", 1000, "Mount GID for data source to copy to") 34 | rootCmd.Flags().StringVar(&flags.MountRoot, "mount-root", "", "Mount root for data source to copy to") 35 | rootCmd.Flags().StringVar(&flags.MountSecrets, "mount-secrets", constants.DatasetJobSecretsMountPath, "Mount secrets for data source to copy to") 36 | rootCmd.Flags().StringArrayVarP(&flags.Options, "options", "o", []string{}, "Options for data source to copy from") 37 | 38 | rootCmd.Args = newCommandValidateArgsFunc(flags) 
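// Cobra runs the Args validator before Run, so Run can assume a supported type and a non-empty URI.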
39 | rootCmd.Run = newCommandRunEFunc(flags) 40 | 41 | return rootCmd 42 | } 43 | 44 | var ( 45 | optionsRegexp = regexp.MustCompile(`^(\w+)=(.*)$`) 46 | ) 47 | 48 | type CommandFlags struct { 49 | MountPath string 50 | MountMode string 51 | MountUID int 52 | MountGID int 53 | MountRoot string 54 | MountSecrets string 55 | Options []string 56 | } 57 | 58 | func newCommandValidateArgsFunc(flags *CommandFlags) func(cmd *cobra.Command, args []string) error { 59 | return func(cmd *cobra.Command, args []string) error { 60 | if len(args) < 2 || args[0] == "" || args[1] == "" { 61 | return fmt.Errorf("arguments <type> and <uri> are required") 62 | } 63 | if !lo.Contains(datasources.SupportedTypesString, args[0]) { 64 | return fmt.Errorf("data source type %s is not supported, supported types are %s", args[0], strings.Join(datasources.SupportedTypesString, ", ")) 65 | } 66 | if flags.MountPath == "" { 67 | return fmt.Errorf("flag --mount-path is required") 68 | } 69 | 70 | return nil 71 | } 72 | } 73 | 74 | func execPostCopy(_ map[string]string, datasourceOptions datasources.Options, _ datasources.Secrets) error { 75 | err := utils.ChmodAndChownRecursively( 76 | log.WithField("action", "post copy"), 77 | filepath.Join(datasourceOptions.Root, datasourceOptions.Path), 78 | datasourceOptions.UID, 79 | datasourceOptions.GID, 80 | datasourceOptions.Mode, 81 | ) 82 | if err != nil { 83 | return fmt.Errorf("failed to perform post chmod and chown operations, err: %w", err) 84 | } 85 | 86 | return nil 87 | } 88 | 89 | func execCopy(rawOptions map[string]string, datasourceOptions datasources.Options, secrets datasources.Secrets) error { 90 | var err error 91 | var datasourceLoader datasources.Loader 92 | 93 | switch datasourceOptions.Type { 94 | case datasources.TypeS3: 95 | datasourceLoader, err = datasources.NewS3Loader(rawOptions, datasourceOptions, secrets) 96 | if err != nil { 97 | return err 98 | } 99 | case datasources.TypeHTTP: 100 | datasourceLoader, err = datasources.NewHTTPLoader(rawOptions, datasourceOptions, secrets) 101 | if err != nil { 102 | return err 103 | } 104 | case datasources.TypeGit: 105 | datasourceLoader, err = datasources.NewGitLoader(rawOptions, datasourceOptions, secrets) 106 | if err != nil { 107 | return err 108 | } 109 | case datasources.TypeConda: 110 | datasourceLoader, err = datasources.NewCondaLoader(rawOptions, datasourceOptions, secrets) 111 | if err != nil { 112 | return err 113 | } 114 | case datasources.TypeHuggingFace: 115 | datasourceLoader, err = datasources.NewHuggingFaceLoader(rawOptions, datasourceOptions, secrets) 116 | if err != nil { 117 | return err 118 | } 119 | case datasources.TypeModelScope: 120 | datasourceLoader, err = datasources.NewModelScopeLoader(rawOptions, datasourceOptions, secrets) 121 | if err != nil { 122 | return err 123 | } 124 | default: 125 | return fmt.Errorf("data source type %s is not supported", datasourceOptions.Type) 126 | } 127 | 128 | err = datasourceLoader.Sync(datasourceOptions.URI, datasourceOptions.Path) 129 | if err != nil { 130 | return err 131 | } 132 | 133 | return nil 134 | } 135 | 136 | func newCommandRunEFunc(flags *CommandFlags) func(cmd *cobra.Command, args []string) { 137 | return func(cmd *cobra.Command, args []string) { 138 | flags.MountPath = filepath.Join(".", flags.MountPath) 139 | 140 | if flags.MountRoot == "" { 141 | flags.MountRoot = lo.Must(os.Getwd()) 142 | } 143 | 144 | options := make(map[string]string) 145 | for _, optionStr := range flags.Options { 146 | option := optionsRegexp.FindStringSubmatch(optionStr) 147 | 
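// A "key=value" option is split into key and value; an option without "=" is kept as a bare key with an empty value.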
148 | if len(option) == 3 { 149 | options[option[1]] = option[2] 150 | } else { 151 | options[optionStr] = "" 152 | } 153 | } 154 | 155 | fileMode, err := strconv.ParseUint(flags.MountMode, 8, 32) 156 | if err != nil { 157 | handleError(err) 158 | return 159 | } 160 | 161 | datasourceOptions := datasources.Options{ 162 | Type: datasources.Type(args[0]), 163 | URI: args[1], 164 | Path: flags.MountPath, 165 | Root: flags.MountRoot, 166 | UID: flags.MountUID, 167 | GID: flags.MountGID, 168 | Mode: os.FileMode(fileMode), 169 | } 170 | 171 | secrets, err := datasources.ReadAndParseSecrets(flags.MountSecrets) 172 | if err != nil { 173 | log.Warnf("failed to read and parse secrets from %s, err: %s", flags.MountSecrets, err) 174 | } 175 | 176 | err = execCopy(options, datasourceOptions, secrets) 177 | if err != nil { 178 | handleError(err) 179 | } 180 | 181 | err = execPostCopy(options, datasourceOptions, secrets) 182 | if err != nil { 183 | handleError(err) 184 | } 185 | } 186 | } 187 | 188 | func handleError(err error) { 189 | if err == nil { 190 | return 191 | } 192 | 193 | _, err = fmt.Fprintf(os.Stderr, "failed to load data: %s\n", err) 194 | if err != nil { 195 | panic(err) 196 | } 197 | 198 | os.Exit(1) 199 | } -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_git_test.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestGitLoader(t *testing.T) { 13 | t.Run("clone", func(t *testing.T) { 14 | git, err := NewGitLoader(map[string]string{ 15 | "branch": "master", 16 | }, Options{}, Secrets{ 17 | Username: "test", 18 | Password: "password", 19 | }) 20 | assert.NoError(t, err) 21 | fakeGit := fakeCommand{ 22 | t: t, 23 | cmd: "git", 24 | outputs: []out{ 25 | { 26 | stdout: "clone", 27 | stderr: "", 28 | exit: 0, 29 | }, 30 | { 31 | stdout: "config", 32 | stderr: "", 33 | exit: 0, 34 | }, 35 | { 36 | stdout: "config", 37 | stderr: "", 38 | exit: 0, 39 | }, 40 | { 41 | stdout: "config", 42 | stderr: "", 43 | exit: 0, 44 | }, 45 | }, 46 | } 47 | defer func() { 48 | assert.NoError(t, fakeGit.Clean()) 49 | }() 50 | gitDir, _ := os.MkdirTemp("", "git-*") 51 | defer func() { 52 | assert.NoError(t, os.RemoveAll(gitDir)) 53 | }() 54 | assert.NoError(t, err) 55 | fakeGit.WithContext(func() { 56 | err = git.Sync("git://github.com/ndx-baize/baize.git", gitDir) 57 | assert.NoError(t, err) 58 | }) 59 | bbs := fakeGit.GetAllInputs() 60 | assert.Equal(t, [][]byte{ 61 | []byte(fmt.Sprintf("clone git://github.com/ndx-baize/baize.git %s --branch master -v\n", gitDir)), 62 | []byte("config --global safe.directory *\n"), 63 | []byte("config --local core.fileMode false\n"), 64 | []byte("remote set-url origin git://github.com/ndx-baize/baize.git\n"), 65 | }, bbs) 66 | }) 67 | t.Run("checkout commit", func(t *testing.T) { 68 | git, err := NewGitLoader(map[string]string{ 69 | "branch": "master", 70 | "commit": "12345", 71 | }, Options{}, Secrets{}) 72 | assert.NoError(t, err) 73 | fakeGit := fakeCommand{ 74 | t: t, 75 | cmd: "git", 76 | outputs: []out{ 77 | { 78 | stdout: "ok", 79 | stderr: "", 80 | exit: 0, 81 | }, 82 | { 83 | stdout: "ok", 84 | stderr: "", 85 | exit: 0, 86 | }, 87 | { 88 | stdout: "ok", 89 | stderr: "", 90 | exit: 0, 91 | }, 92 | { 93 | stdout: "ok", 94 | stderr: "", 95 | exit: 0, 96 | }, 97 | },
98 | } 99 | defer func() { 100 | assert.NoError(t, fakeGit.Clean()) 101 | }() 102 | gitDir, _ := os.MkdirTemp("", "git-*") 103 | defer func() { 104 | assert.NoError(t, os.RemoveAll(gitDir)) 105 | }() 106 | assert.NoError(t, err) 107 | fakeGit.WithContext(func() { 108 | err = git.Sync("git://github.com/ndx-baize/baize.git", gitDir) 109 | assert.NoError(t, err) 110 | }) 111 | bbs := fakeGit.GetAllInputs() 112 | assert.Equal(t, [][]byte{ 113 | []byte(fmt.Sprintf("clone git://github.com/ndx-baize/baize.git %s --branch master -v\n", gitDir)), 114 | []byte("config --global safe.directory *\n"), 115 | []byte("config --local core.fileMode false\n"), 116 | []byte("checkout 12345\n"), 117 | }, bbs) 118 | }) 119 | t.Run("pull w/ branch", func(t *testing.T) { 120 | git, err := NewGitLoader(map[string]string{ 121 | "branch": "master", 122 | }, Options{}, Secrets{}) 123 | assert.NoError(t, err) 124 | fakeGit := fakeCommand{ 125 | t: t, 126 | cmd: "git", 127 | outputs: []out{ 128 | { 129 | stdout: "config", 130 | stderr: "", 131 | exit: 0, 132 | }, 133 | { 134 | stdout: "update", 135 | stderr: "", 136 | exit: 0, 137 | }, 138 | { 139 | stdout: "add", 140 | stderr: "", 141 | exit: 0, 142 | }, 143 | { 144 | stdout: "stash", 145 | stderr: "", 146 | exit: 0, 147 | }, 148 | { 149 | stdout: "reset", 150 | stderr: "", 151 | exit: 0, 152 | }, 153 | { 154 | stdout: "ok", 155 | stderr: "", 156 | exit: 0, 157 | }, 158 | { 159 | stdout: "ok", 160 | stderr: "", 161 | exit: 0, 162 | }, 163 | }, 164 | } 165 | defer func() { 166 | assert.NoError(t, fakeGit.Clean()) 167 | }() 168 | gitDir, _ := os.MkdirTemp("", "git-*") 169 | defer func() { 170 | assert.NoError(t, os.RemoveAll(gitDir)) 171 | }() 172 | require.NoError(t, os.Mkdir(gitDir+"/.git", 0755)) 173 | assert.NoError(t, err) 174 | fakeGit.WithContext(func() { 175 | err = git.Sync("git://github.com/ndx-baize/baize.git", gitDir) 176 | assert.NoError(t, err) 177 | }) 178 | bbs := fakeGit.GetAllInputs() 179 | assert.Contains(t, string(bbs[5]), "remote add") 180 | assert.Contains(t, string(bbs[6]), "pull") 181 | bbs[5] = []byte{} 182 | bbs[6] = []byte{} 183 | assert.Equal(t, [][]byte{ 184 | []byte("config --global safe.directory *\n"), 185 | []byte("update-index --refresh\n"), 186 | []byte("add -u\n"), 187 | []byte("stash\n"), 188 | []byte("reset --hard master\n"), 189 | {}, 190 | {}, 191 | }, bbs) 192 | }) 193 | t.Run("pull w/o branch", func(t *testing.T) { 194 | git, err := NewGitLoader(map[string]string{}, Options{}, Secrets{}) 195 | assert.NoError(t, err) 196 | fakeGit := fakeCommand{ 197 | t: t, 198 | cmd: "git", 199 | outputs: []out{ 200 | { 201 | stdout: "config", 202 | stderr: "", 203 | exit: 0, 204 | }, 205 | { 206 | stdout: "update", 207 | stderr: "", 208 | exit: 0, 209 | }, 210 | { 211 | stdout: "add", 212 | stderr: "", 213 | exit: 0, 214 | }, 215 | { 216 | stdout: "stash", 217 | stderr: "", 218 | exit: 0, 219 | }, 220 | { 221 | stdout: "reset", 222 | stderr: "", 223 | exit: 0, 224 | }, 225 | { 226 | stdout: "ok", 227 | stderr: "", 228 | exit: 0, 229 | }, 230 | { 231 | stdout: "branch1", 232 | stderr: "", 233 | exit: 0, 234 | }, 235 | { 236 | stdout: "ok", 237 | stderr: "", 238 | exit: 0, 239 | }, 240 | }, 241 | } 242 | defer func() { 243 | assert.NoError(t, fakeGit.Clean()) 244 | }() 245 | gitDir, _ := os.MkdirTemp("", "git-*") 246 | defer func() { 247 | assert.NoError(t, os.RemoveAll(gitDir)) 248 | }() 249 | require.NoError(t, os.Mkdir(gitDir+"/.git", 0755)) 250 | assert.NoError(t, err) 251 | fakeGit.WithContext(func() { 252 | err = 
git.Sync("git://github.com/ndx-baize/baize.git", gitDir) 253 | assert.NoError(t, err) 254 | }) 255 | bbs := fakeGit.GetAllInputs() 256 | assert.Contains(t, string(bbs[5]), "remote add") 257 | bbs[5] = []byte{} 258 | assert.Contains(t, string(bbs[7]), "branch1") 259 | bbs[7] = []byte{} 260 | assert.Equal(t, [][]byte{ 261 | []byte("config --global safe.directory *\n"), 262 | []byte("update-index --refresh\n"), 263 | []byte("add -u\n"), 264 | []byte("stash\n"), 265 | []byte("reset --hard origin/HEAD\n"), 266 | {}, 267 | []byte("branch --show-current\n"), 268 | {}, 269 | }, bbs) 270 | }) 271 | } 272 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_huggingface.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/url" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | "github.com/sirupsen/logrus" 12 | 13 | "github.com/BaizeAI/dataset/pkg/log" 14 | "github.com/BaizeAI/dataset/pkg/utils" 15 | ) 16 | 17 | var _ Loader = &HuggingFaceLoader{} 18 | 19 | type HuggingFaceLoader struct { 20 | Options Options 21 | 22 | huggingFaceOptions HuggingFaceLoaderOptions 23 | } 24 | 25 | func NewHuggingFaceLoader(datasourceOptions map[string]string, options Options, secrets Secrets) (*HuggingFaceLoader, error) { 26 | huggingFace := new(HuggingFaceLoader) 27 | parsedOpts, err := huggingFace.parseOptionsFromOptions(datasourceOptions) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | huggingFace.Options = options 33 | huggingFace.huggingFaceOptions = parsedOpts 34 | huggingFace.huggingFaceOptions.token = secrets.Token 35 | 36 | err = huggingFace.validateOptions(parsedOpts) 37 | if err != nil { 38 | return nil, err 39 | } 40 | 41 | return huggingFace, nil 42 | } 43 | 44 | type HuggingFaceLoaderOptions struct { 45 | Revision string `json:"revision"` 46 | RepoType string `json:"repoType"` 47 | Endpoint string `json:"endpoint"` 48 | Offline bool `json:"offline"` 49 | Include string `json:"include"` 50 | Exclude string `json:"exclude"` 51 | 52 | token string 53 | } 54 | 55 | func (d *HuggingFaceLoader) parseOptionsFromOptions(options map[string]string) (HuggingFaceLoaderOptions, error) { 56 | jsonContent, err := json.Marshal(options) 57 | if err != nil { 58 | return HuggingFaceLoaderOptions{}, err 59 | } 60 | 61 | var hfOptions HuggingFaceLoaderOptions 62 | err = json.Unmarshal(jsonContent, &hfOptions) 63 | if err != nil { 64 | return HuggingFaceLoaderOptions{}, err 65 | } 66 | 67 | return hfOptions, nil 68 | } 69 | 70 | func (d *HuggingFaceLoader) validateOptions(options HuggingFaceLoaderOptions) error { 71 | if options.Endpoint != "" { 72 | _, err := url.Parse(options.Endpoint) 73 | if err != nil { 74 | return fmt.Errorf("invalid endpoint %s: %w", options.Endpoint, err) 75 | } 76 | } 77 | 78 | return nil 79 | } 80 | 81 | func (d *HuggingFaceLoader) mapRepoTypeEnumStringToHuggingFaceRepoType(repoType string) string { 82 | switch repoType { 83 | case "MODEL", "model": 84 | return "model" 85 | case "DATASET", "dataset": 86 | return "dataset" 87 | default: 88 | return "" 89 | } 90 | } 91 | 92 | func (d *HuggingFaceLoader) env(logger *logrus.Entry) (string, error) { 93 | args := []string{ 94 | "env", 95 | } 96 | 97 | cmd := exec.Command("huggingface-cli", args...) 
98 | cmd.Env = os.Environ() 99 | 100 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 101 | if err != nil { 102 | return "", err 103 | } 104 | 105 | outputString := strings.TrimSpace(output.String()) 106 | output.Reset() 107 | 108 | return outputString, nil 109 | } 110 | 111 | func (d *HuggingFaceLoader) login(logger *logrus.Entry, token string) error { 112 | args := []string{ 113 | "login", 114 | "--token", 115 | token, 116 | } 117 | 118 | cmd := exec.Command("huggingface-cli", args...) 119 | cmd.Env = os.Environ() 120 | 121 | _, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 122 | if err != nil { 123 | return err 124 | } 125 | 126 | return nil 127 | } 128 | 129 | func (d *HuggingFaceLoader) whoAmI(logger *logrus.Entry) (string, error) { 130 | args := []string{ 131 | "whoami", 132 | } 133 | 134 | cmd := exec.Command("huggingface-cli", args...) 135 | cmd.Env = os.Environ() 136 | 137 | output, err := utils.ExecuteCommandWithOutput(logger, cmd, []string{}) 138 | if err != nil { 139 | return "", err 140 | } 141 | 142 | outputString := strings.TrimSpace(output.String()) 143 | output.Reset() 144 | 145 | return outputString, nil 146 | } 147 | 148 | func (d *HuggingFaceLoader) Sync(fromURI string, toPath string) error { 149 | parsedURL, err := url.Parse(d.Options.URI) 150 | if err != nil { 151 | return err 152 | } 153 | if parsedURL.Scheme != "huggingface" { 154 | return fmt.Errorf("invalid scheme %s, only huggingface is supported", parsedURL.Scheme) 155 | } 156 | 157 | repoName := parsedURL.Host + parsedURL.Path 158 | repoType := d.mapRepoTypeEnumStringToHuggingFaceRepoType(d.huggingFaceOptions.RepoType) 159 | 160 | logger := log.WithFields(logrus.Fields{ 161 | "fromURI": fromURI, 162 | "type": TypeHuggingFace, 163 | "toPath": toPath, 164 | "workingDirectory": d.Options.Root, 165 | "repoName": repoName, 166 | "revision": d.huggingFaceOptions.Revision, 167 | "repoType": repoType, 168 | "endpoint": d.huggingFaceOptions.Endpoint, 169 | "offline": d.huggingFaceOptions.Offline, 170 | "include": d.huggingFaceOptions.Include, 171 | "exclude": d.huggingFaceOptions.Exclude, 172 | }) 173 | 174 | token := strings.TrimSpace(d.huggingFaceOptions.token) 175 | 176 | logger.Debugf("performing huggingface-cli download command to pull data from %s to %s", fromURI, toPath) 177 | 178 | _, err = d.env(logger) 179 | if err != nil { 180 | return err 181 | } 182 | 183 | if d.huggingFaceOptions.token != "" { 184 | err = d.login(logger, token) 185 | if err != nil { 186 | return err 187 | } 188 | 189 | whoAmI, err := d.whoAmI(logger) 190 | if err != nil { 191 | return err 192 | } 193 | 194 | logger.Debugf("huggingface-cli executed with authorized login handle as: %s", whoAmI) 195 | } 196 | 197 | args := []string{ 198 | "download", 199 | repoName, 200 | "--local-dir", 201 | toPath, 202 | "--resume-download", 203 | } 204 | if repoType != "" { 205 | args = append(args, "--repo-type", repoType) 206 | } 207 | if d.huggingFaceOptions.Include != "" { 208 | args = append(args, "--include", d.huggingFaceOptions.Include) 209 | } 210 | if d.huggingFaceOptions.Exclude != "" { 211 | args = append(args, "--exclude", d.huggingFaceOptions.Exclude) 212 | } 213 | 214 | cmd := exec.Command("huggingface-cli", args...) 
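// Run the download from the mount root so the relative --local-dir path resolves against it.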
215 | cmd.Dir = d.Options.Root 216 | 217 | logger = logger.WithField("command", cmd.String()) 218 | logger.Debug("executing command to download data from huggingface-cli") 219 | 220 | cmd.Env = os.Environ() 221 | cmd.Env = append(cmd.Env, "HF_HUB_VERBOSITY=debug") 222 | // cmd.Env = append(cmd.Env, "HF_HUB_DISABLE_PROGRESS_BARS=1") 223 | cmd.Env = append(cmd.Env, "HF_HUB_DOWNLOAD_TIMEOUT=60") 224 | cmd.Env = append(cmd.Env, "DO_NOT_TRACK=1") // https://consoledonottrack.com/ 225 | 226 | if d.huggingFaceOptions.Offline { 227 | cmd.Env = append(cmd.Env, "HF_HUB_OFFLINE=1") 228 | } 229 | if d.huggingFaceOptions.Endpoint != "" { 230 | cmd.Env = append(cmd.Env, fmt.Sprintf("HF_ENDPOINT=%s", d.huggingFaceOptions.Endpoint)) 231 | } 232 | 233 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, []string{token}) 234 | 235 | if err != nil { 236 | logger.Errorf("huggingface-cli download command error: %s", errBuffer) 237 | return fmt.Errorf("failed to copy data from %s to %s with huggingface-cli command %s, err: %s", fromURI, toPath, cmd.String(), err) 238 | } 239 | logger.Debugf("huggingface-cli download command output: %s", outBuffer.String()) 240 | 241 | return nil 242 | } 243 | -------------------------------------------------------------------------------- /internal/pkg/datasources/datasource_s3.go: -------------------------------------------------------------------------------- 1 | package datasources 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/samber/lo" 7 | "net/url" 8 | "os" 9 | "os/exec" 10 | "path/filepath" 11 | "strings" 12 | 13 | "github.com/sirupsen/logrus" 14 | 15 | "github.com/BaizeAI/dataset/pkg/log" 16 | "github.com/BaizeAI/dataset/pkg/utils" 17 | ) 18 | 19 | var _ Loader = &S3Loader{} 20 | 21 | type S3Loader struct { 22 | Options Options 23 | 24 | s3Options S3LoaderOptions 25 | } 26 | 27 | func NewS3Loader(datasourceOptions map[string]string, options Options, secrets Secrets) (*S3Loader, error) { 28 | s3 := new(S3Loader) 29 | s3Options, err := s3.parseOptionsFromOptions(datasourceOptions) 30 | if err != nil { 31 | return nil, fmt.Errorf("failed to parse uri %s: %w", options.URI, err) 32 | } 33 | 34 | s3.Options = options 35 | s3.s3Options = s3Options 36 | s3.s3Options.accessKeyID = secrets.AKSKAccessKeyID 37 | s3.s3Options.secretAccessKey = secrets.AKSKSecretAccessKey 38 | s3.s3Options.SyncMode = lo.CoalesceOrEmpty(datasourceOptions["syncMode"], "sync") 39 | 40 | err = s3.validateOptions(s3Options) 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | return s3, nil 46 | } 47 | 48 | type S3LoaderOptions struct { 49 | Provider string `json:"provider"` 50 | Region string `json:"region"` 51 | Endpoint string `json:"endpoint"` 52 | SyncMode string `json:"syncMode"` 53 | 54 | accessKeyID string 55 | secretAccessKey string 56 | } 57 | 58 | func (d *S3Loader) parseOptionsFromOptions(options map[string]string) (S3LoaderOptions, error) { 59 | jsonContent, err := json.Marshal(options) 60 | if err != nil { 61 | return S3LoaderOptions{}, err 62 | } 63 | 64 | var s3Options S3LoaderOptions 65 | err = json.Unmarshal(jsonContent, &s3Options) 66 | if err != nil { 67 | return S3LoaderOptions{}, err 68 | } 69 | 70 | return s3Options, nil 71 | } 72 | 73 | func (d *S3Loader) validateOptions(options S3LoaderOptions) error { 74 | if options.Provider == "AWS" && options.Region == "" { 75 | return fmt.Errorf("--options region is required for AWS provider") 76 | } 77 | 78 | if options.SyncMode != "" && options.SyncMode != "sync" && options.SyncMode != "copy" 
{ 79 | return fmt.Errorf("invalid syncMode '%s', must be 'sync' or 'copy'", options.SyncMode) 80 | } 81 | 82 | return nil 83 | } 84 | 85 | func (d *S3Loader) mapProviderEnumStringToRCloneProvider(provider string) string { 86 | switch provider { 87 | case "AWS": 88 | return "AWS" 89 | case "MINIO": 90 | return "Minio" 91 | default: 92 | return "" 93 | } 94 | } 95 | 96 | func (d *S3Loader) configTouch() error { 97 | cmd := exec.Command("rclone", "config", "touch") 98 | 99 | logger := log.WithField("command", cmd.String()) 100 | logger.Debug("executing command to touch rclone config") 101 | 102 | outBuffer, errWriter, err := utils.ExecuteCommandWithAllOutput(logger, cmd, nil) 103 | if err != nil { 104 | logger.Errorf("rclone config touch command error: %s", errWriter) 105 | return err 106 | } 107 | logger.Debugf("rclone config touch command output: %s", outBuffer.String()) 108 | 109 | return nil 110 | } 111 | 112 | func (d *S3Loader) configCreate(configName string) error { 113 | logger := log.WithFields(logrus.Fields{ 114 | "configName": configName, 115 | "type": TypeS3, 116 | "provider": d.s3Options.Provider, 117 | "region": d.s3Options.Region, 118 | "endpoint": d.s3Options.Endpoint, 119 | }) 120 | 121 | args := []string{ 122 | "config", 123 | "create", 124 | configName, 125 | "s3", 126 | } 127 | if d.s3Options.accessKeyID != "" { 128 | args = append(args, strings.Join([]string{"env_auth", "true"}, "=")) 129 | } 130 | if d.s3Options.Region != "" { 131 | args = append(args, strings.Join([]string{"region", d.s3Options.Region}, "=")) 132 | } 133 | 134 | provider := d.mapProviderEnumStringToRCloneProvider(d.s3Options.Provider) 135 | 136 | if d.s3Options.Provider != "" { 137 | args = append(args, strings.Join([]string{"provider", provider}, "=")) 138 | } else { 139 | args = append(args, strings.Join([]string{"provider", "AWS"}, "=")) 140 | } 141 | 142 | if d.s3Options.Region == "" && (provider == "" || provider == "AWS") { 143 | // fallback as default region 144 | d.s3Options.Region = "us-east-1" 145 | } 146 | 147 | if d.s3Options.Endpoint != "" { 148 | args = append(args, strings.Join([]string{"endpoint", d.s3Options.Endpoint}, "=")) 149 | } 150 | 151 | cmd := exec.Command("rclone", args...) 
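// The generated config only names the remote; credentials are injected at sync time via RCLONE_S3_* environment variables.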
152 | 153 | logger = logger.WithField("command", cmd.String()) 154 | logger.Debug("executing command to create a new rclone config") 155 | 156 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, nil) 157 | 158 | if err != nil { 159 | logger.Errorf("rclone config create command error: %s", errBuffer) 160 | return err 161 | } 162 | logger.Debugf("rclone config create command output: %s", outBuffer.String()) 163 | 164 | return nil 165 | } 166 | 167 | func (d *S3Loader) Sync(fromURI string, toPath string) error { 168 | parsedURL, err := url.Parse(d.Options.URI) 169 | if err != nil { 170 | return err 171 | } 172 | if parsedURL.Scheme != "s3" { 173 | return fmt.Errorf("invalid scheme %s, only s3 is supported", parsedURL.Scheme) 174 | } 175 | 176 | bucket := parsedURL.Host 177 | objectDir := strings.TrimPrefix(parsedURL.Path, "/") 178 | 179 | logger := log.WithFields(logrus.Fields{ 180 | "fromURI": fromURI, 181 | "type": TypeS3, 182 | "toPath": toPath, 183 | "workingDirectory": d.Options.Root, 184 | "provider": d.s3Options.Provider, 185 | "region": d.s3Options.Region, 186 | "endpoint": d.s3Options.Endpoint, 187 | "bucket": bucket, 188 | "objectDir": objectDir, 189 | }) 190 | 191 | accessKeyID := strings.TrimSpace(d.s3Options.accessKeyID) 192 | secretAccessKey := strings.TrimSpace(d.s3Options.secretAccessKey) 193 | 194 | logger.Debugf("performing rclone copy command to copy data served by S3") 195 | 196 | err = d.configTouch() 197 | if err != nil { 198 | return err 199 | } 200 | 201 | configName := fmt.Sprintf("baize-data-loader-copy-config-%s", utils.RandomHashString(8)) 202 | 203 | err = d.configCreate(configName) 204 | if err != nil { 205 | return err 206 | } 207 | 208 | syncMode := d.s3Options.SyncMode 209 | 210 | args := []string{ 211 | syncMode, 212 | filepath.Join(fmt.Sprintf("%s:%s", configName, bucket), objectDir), 213 | toPath, 214 | } 215 | 216 | args = append(args, "-vvv") 217 | cmd := exec.Command("rclone", args...) 218 | cmd.Dir = d.Options.Root 219 | 220 | logger = logger.WithField("command", cmd.String()) 221 | logger.Debug("executing command to copy data") 222 | 223 | cmd.Env = os.Environ() 224 | 225 | if accessKeyID != "" && secretAccessKey != "" { 226 | cmd.Env = append(cmd.Env, fmt.Sprintf("RCLONE_S3_ACCESS_KEY_ID=%s", accessKeyID)) 227 | cmd.Env = append(cmd.Env, fmt.Sprintf("RCLONE_S3_SECRET_ACCESS_KEY=%s", secretAccessKey)) 228 | } 229 | 230 | outBuffer, errBuffer, err := utils.ExecuteCommandWithAllOutput(logger, cmd, []string{accessKeyID, secretAccessKey}) 231 | 232 | if err != nil { 233 | logger.Errorf("rclone copy command error: %s", errBuffer) 234 | return fmt.Errorf("failed to copy data from %s to %s with rclone command %s, err: %s", fromURI, toPath, cmd.String(), err) 235 | } 236 | logger.Debugf("rclone copy command output: %s", outBuffer.String()) 237 | 238 | return nil 239 | } 240 | -------------------------------------------------------------------------------- /internal/controller/dataset/dataset_controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package dataset 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | "time" 23 | 24 | "github.com/stretchr/testify/assert" 25 | "github.com/stretchr/testify/require" 26 | corev1 "k8s.io/api/core/v1" 27 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 | "k8s.io/apimachinery/pkg/runtime" 29 | "k8s.io/apimachinery/pkg/types" 30 | "sigs.k8s.io/controller-runtime/pkg/client" 31 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 32 | 33 | datasetv1alpha1 "github.com/BaizeAI/dataset/api/dataset/v1alpha1" 34 | "github.com/BaizeAI/dataset/config" 35 | "github.com/BaizeAI/dataset/internal/pkg/constants" 36 | ) 37 | 38 | func TestDatasetReconciler_findReferencingDatasets(t *testing.T) { 39 | scheme := runtime.NewScheme() 40 | require.NoError(t, datasetv1alpha1.AddToScheme(scheme)) 41 | require.NoError(t, corev1.AddToScheme(scheme)) 42 | 43 | // Create a source dataset 44 | sourceDs := &datasetv1alpha1.Dataset{ 45 | ObjectMeta: metav1.ObjectMeta{ 46 | Name: "source-dataset", 47 | Namespace: "default", 48 | }, 49 | Spec: datasetv1alpha1.DatasetSpec{ 50 | Share: true, 51 | Source: datasetv1alpha1.DatasetSource{ 52 | Type: datasetv1alpha1.DatasetTypeGit, 53 | URI: "https://github.com/example/repo.git", 54 | }, 55 | }, 56 | } 57 | 58 | // Create a referencing dataset 59 | refDs1 := &datasetv1alpha1.Dataset{ 60 | ObjectMeta: metav1.ObjectMeta{ 61 | Name: "ref-dataset-1", 62 | Namespace: "namespace1", 63 | }, 64 | Spec: datasetv1alpha1.DatasetSpec{ 65 | Source: datasetv1alpha1.DatasetSource{ 66 | Type: datasetv1alpha1.DatasetTypeReference, 67 | URI: "dataset://default/source-dataset", 68 | }, 69 | }, 70 | } 71 | 72 | // Create another referencing dataset 73 | refDs2 := &datasetv1alpha1.Dataset{ 74 | ObjectMeta: metav1.ObjectMeta{ 75 | Name: "ref-dataset-2", 76 | Namespace: "namespace2", 77 | }, 78 | Spec: datasetv1alpha1.DatasetSpec{ 79 | Source: datasetv1alpha1.DatasetSource{ 80 | Type: datasetv1alpha1.DatasetTypeReference, 81 | URI: "dataset://default/source-dataset", 82 | }, 83 | }, 84 | } 85 | 86 | // Create a non-referencing dataset 87 | nonRefDs := &datasetv1alpha1.Dataset{ 88 | ObjectMeta: metav1.ObjectMeta{ 89 | Name: "non-ref-dataset", 90 | Namespace: "namespace3", 91 | }, 92 | Spec: datasetv1alpha1.DatasetSpec{ 93 | Source: datasetv1alpha1.DatasetSource{ 94 | Type: datasetv1alpha1.DatasetTypeGit, 95 | URI: "https://github.com/example/other-repo.git", 96 | }, 97 | }, 98 | } 99 | 100 | fakeClient := fake.NewClientBuilder(). 101 | WithScheme(scheme). 102 | WithObjects(sourceDs, refDs1, refDs2, nonRefDs). 
103 | Build() 104 | 105 | reconciler := &DatasetReconciler{ 106 | Client: fakeClient, 107 | Scheme: scheme, 108 | } 109 | 110 | ctx := context.Background() 111 | referencingDatasets, err := reconciler.findReferencingDatasets(ctx, sourceDs) 112 | 113 | require.NoError(t, err) 114 | assert.Len(t, referencingDatasets, 2) 115 | 116 | // Check that we found the correct referencing datasets 117 | foundNames := make(map[string]bool) 118 | for _, ds := range referencingDatasets { 119 | foundNames[ds.Name] = true 120 | } 121 | 122 | assert.True(t, foundNames["ref-dataset-1"]) 123 | assert.True(t, foundNames["ref-dataset-2"]) 124 | assert.False(t, foundNames["non-ref-dataset"]) 125 | assert.False(t, foundNames["source-dataset"]) 126 | } 127 | 128 | func TestDatasetReconciler_reconcileCascadingDeletion_Disabled(t *testing.T) { 129 | scheme := runtime.NewScheme() 130 | require.NoError(t, datasetv1alpha1.AddToScheme(scheme)) 131 | require.NoError(t, corev1.AddToScheme(scheme)) 132 | 133 | // Test configuration with cascading deletion disabled 134 | err := config.ParseConfigFromFileContent("enable_cascading_deletion: false") 135 | require.NoError(t, err) 136 | 137 | sourceDs := &datasetv1alpha1.Dataset{ 138 | ObjectMeta: metav1.ObjectMeta{ 139 | Name: "source-dataset", 140 | Namespace: "default", 141 | DeletionTimestamp: &metav1.Time{Time: time.Now()}, 142 | Finalizers: []string{"dataset-controller"}, 143 | }, 144 | Spec: datasetv1alpha1.DatasetSpec{ 145 | Share: true, 146 | Source: datasetv1alpha1.DatasetSource{ 147 | Type: datasetv1alpha1.DatasetTypeGit, 148 | URI: "https://github.com/example/repo.git", 149 | }, 150 | }, 151 | } 152 | 153 | fakeClient := fake.NewClientBuilder(). 154 | WithScheme(scheme). 155 | WithObjects(sourceDs). 156 | Build() 157 | 158 | reconciler := &DatasetReconciler{ 159 | Client: fakeClient, 160 | Scheme: scheme, 161 | } 162 | 163 | ctx := context.Background() 164 | err = reconciler.reconcileCascadingDeletion(ctx, sourceDs) 165 | 166 | // Should not error and should do nothing when cascading deletion is disabled 167 | require.NoError(t, err) 168 | } 169 | 170 | func TestDatasetReconciler_reconcileCascadingDeletion_Enabled(t *testing.T) { 171 | scheme := runtime.NewScheme() 172 | require.NoError(t, datasetv1alpha1.AddToScheme(scheme)) 173 | require.NoError(t, corev1.AddToScheme(scheme)) 174 | 175 | // Test configuration with cascading deletion enabled 176 | err := config.ParseConfigFromFileContent("enable_cascading_deletion: true") 177 | require.NoError(t, err) 178 | 179 | sourceDs := &datasetv1alpha1.Dataset{ 180 | ObjectMeta: metav1.ObjectMeta{ 181 | Name: "source-dataset", 182 | Namespace: "default", 183 | DeletionTimestamp: &metav1.Time{Time: time.Now()}, 184 | Finalizers: []string{"dataset-controller"}, 185 | }, 186 | Spec: datasetv1alpha1.DatasetSpec{ 187 | Share: true, 188 | Source: datasetv1alpha1.DatasetSource{ 189 | Type: datasetv1alpha1.DatasetTypeGit, 190 | URI: "https://github.com/example/repo.git", 191 | }, 192 | }, 193 | } 194 | 195 | refDs := &datasetv1alpha1.Dataset{ 196 | ObjectMeta: metav1.ObjectMeta{ 197 | Name: "ref-dataset", 198 | Namespace: "namespace1", 199 | }, 200 | Spec: datasetv1alpha1.DatasetSpec{ 201 | Source: datasetv1alpha1.DatasetSource{ 202 | Type: datasetv1alpha1.DatasetTypeReference, 203 | URI: "dataset://default/source-dataset", 204 | }, 205 | }, 206 | } 207 | 208 | fakeClient := fake.NewClientBuilder(). 209 | WithScheme(scheme). 210 | WithObjects(sourceDs, refDs). 
211 | Build() 212 | 213 | reconciler := &DatasetReconciler{ 214 | Client: fakeClient, 215 | Scheme: scheme, 216 | } 217 | 218 | ctx := context.Background() 219 | err = reconciler.reconcileCascadingDeletion(ctx, sourceDs) 220 | 221 | require.NoError(t, err) 222 | 223 | // Check that the referencing dataset has been deleted 224 | updatedRefDs := &datasetv1alpha1.Dataset{} 225 | err = fakeClient.Get(ctx, types.NamespacedName{Name: "ref-dataset", Namespace: "namespace1"}, updatedRefDs) 226 | // The dataset should either be deleted (not found) or marked for deletion 227 | if err != nil { 228 | // Dataset was deleted completely 229 | require.True(t, client.IgnoreNotFound(err) == nil, "Expected dataset to be deleted or not found") 230 | } else { 231 | // Dataset exists but should be marked for deletion 232 | assert.NotNil(t, updatedRefDs.DeletionTimestamp, "Referencing dataset should be marked for deletion") 233 | } 234 | } 235 | 236 | func TestDatasetReconciler_cleanupRetainedPV(t *testing.T) { 237 | scheme := runtime.NewScheme() 238 | require.NoError(t, datasetv1alpha1.AddToScheme(scheme)) 239 | require.NoError(t, corev1.AddToScheme(scheme)) 240 | 241 | dsUID := types.UID("12345678-1234-1234-1234-123456789abc") 242 | ds := &datasetv1alpha1.Dataset{ 243 | ObjectMeta: metav1.ObjectMeta{ 244 | Name: "test-dataset", 245 | Namespace: "default", 246 | UID: dsUID, 247 | }, 248 | Spec: datasetv1alpha1.DatasetSpec{ 249 | Source: datasetv1alpha1.DatasetSource{ 250 | Type: datasetv1alpha1.DatasetTypeReference, 251 | URI: "dataset://other/source-dataset", 252 | }, 253 | }, 254 | } 255 | 256 | // Create a retained PV that should be cleaned up 257 | pvName := "dataset-default-test-dataset-123456789abc" 258 | pv := &corev1.PersistentVolume{ 259 | ObjectMeta: metav1.ObjectMeta{ 260 | Name: pvName, 261 | Labels: map[string]string{ 262 | constants.DatasetNameLabel: "test-dataset", 263 | }, 264 | }, 265 | Spec: corev1.PersistentVolumeSpec{ 266 | PersistentVolumeReclaimPolicy: corev1.PersistentVolumeReclaimRetain, 267 | }, 268 | } 269 | 270 | fakeClient := fake.NewClientBuilder(). 271 | WithScheme(scheme). 272 | WithObjects(ds, pv). 273 | Build() 274 | 275 | reconciler := &DatasetReconciler{ 276 | Client: fakeClient, 277 | Scheme: scheme, 278 | } 279 | 280 | ctx := context.Background() 281 | err := reconciler.cleanupRetainedPV(ctx, ds) 282 | 283 | require.NoError(t, err) 284 | 285 | // Check that the PV has been deleted 286 | deletedPV := &corev1.PersistentVolume{} 287 | err = fakeClient.Get(ctx, types.NamespacedName{Name: pvName}, deletedPV) 288 | assert.True(t, client.IgnoreNotFound(err) == nil, "PV should be deleted") 289 | } 290 | --------------------------------------------------------------------------------