├── admin ├── yamls │ └── .gitignore ├── assets │ ├── tree_users_ldap.png │ ├── phpLDAPadmin_create_ou.png │ ├── phpLDAPadmin_main_page.png │ ├── phpLDAPadmin_ou_commit.png │ ├── phpLDAPAdmin_group_input.png │ ├── phpLDAPAdmin_group_member.png │ ├── phpLDAPAdmin_user_commit.png │ ├── phpLDAPAdmin_user_template.png │ ├── phpLDAPadmin_create_object.png │ ├── phpLDAPAdmin_posix_group_name.png │ ├── phpLDAPAdmin_template_Default.png │ ├── phpLDAPAdmin_user_information.png │ ├── phpLDAPAdmin_user_input_email.png │ ├── phpLDAPAdmin_user_add_attribute.png │ ├── phpLDAPadmin_create_child_entry.png │ ├── phpLDAPadmin_template_posix_group.png │ ├── phpLDAPAdmin_user_add_attribute_Email.png │ ├── phpLDAPAdmin_user_create_child_entry.png │ ├── phpLDAPAdmin_user_information_detail.png │ ├── phpLDAPAdmin_template_groupOfUniqueNames.png │ ├── phpLDAPAdmin_user_add_user_email_commit.png │ └── phpLDAPadmin_create_object_posix_group.png ├── cluster_setting │ ├── admin_helm_ns.yaml │ └── rancher_local_path_nvme.yaml ├── adminchart │ ├── templates │ │ ├── sc_pv.yaml │ │ ├── quota.yaml │ │ ├── ipoibnetwork.yaml │ │ └── rbac.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── gfshomechart │ ├── templates │ │ └── sc_pv.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── gfssharechart │ ├── templates │ │ └── sc_pv.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── ssdsharechart │ ├── templates │ │ └── sc_pv.yaml │ └── Chart.yaml ├── deluser_dir.sh ├── rmtestpod.sh.sh ├── values-template.yaml ├── adduser_dir.sh ├── pull_images_to_local.yaml └── README.md ├── assets ├── dex_login.png ├── dex_token_1.png ├── dex_token_2.png ├── ssp_success.png ├── ssp_main_page.png ├── dex_kubectl_run.png ├── vscode │ ├── vsc_k8s_plugin.jpg │ ├── vsc_connect_k8s.jpg │ ├── vsc_docker_plugin.jpg │ ├── vsc_k8s_select_ns.jpg │ ├── vsc_k8s_bridge_plugin.jpg │ └── vsc_remote_connector_plugin.jpg └── harbor │ ├── harbor_dashboard.jpg │ └── harbor_create_project.jpg ├── dockerfiles ├── 
comfyui │ ├── Dockerfile │ └── run.sh ├── sd-webui │ ├── Dockerfile │ └── run.sh ├── pytorch │ ├── requirements.txt │ └── Dockerfile ├── lab-cpu │ ├── requirements.txt │ └── Dockerfile └── lab │ ├── requirements.txt │ └── Dockerfile ├── user ├── userchart │ ├── .helmignore │ ├── Chart.yaml │ └── templates │ │ └── deployment.yaml ├── sdwebui-template.yaml ├── comfyui-template.yaml └── values-template.yaml ├── test └── username.txt ├── FAQ.md └── README.md /admin/yamls/.gitignore: -------------------------------------------------------------------------------- 1 | values*.yaml 2 | -------------------------------------------------------------------------------- /assets/dex_login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_login.png -------------------------------------------------------------------------------- /assets/dex_token_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_token_1.png -------------------------------------------------------------------------------- /assets/dex_token_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_token_2.png -------------------------------------------------------------------------------- /assets/ssp_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/ssp_success.png -------------------------------------------------------------------------------- /assets/ssp_main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/ssp_main_page.png -------------------------------------------------------------------------------- 
/assets/dex_kubectl_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_kubectl_run.png -------------------------------------------------------------------------------- /admin/assets/tree_users_ldap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/tree_users_ldap.png -------------------------------------------------------------------------------- /admin/cluster_setting/admin_helm_ns.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: admin-helm -------------------------------------------------------------------------------- /assets/vscode/vsc_k8s_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_plugin.jpg -------------------------------------------------------------------------------- /assets/harbor/harbor_dashboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/harbor/harbor_dashboard.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_connect_k8s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_connect_k8s.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_docker_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_docker_plugin.jpg -------------------------------------------------------------------------------- 
/assets/vscode/vsc_k8s_select_ns.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_select_ns.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_ou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_ou.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_main_page.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_ou_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_ou_commit.png -------------------------------------------------------------------------------- /assets/harbor/harbor_create_project.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/harbor/harbor_create_project.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_k8s_bridge_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_bridge_plugin.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_group_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_group_input.png 
-------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_group_member.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_group_member.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_commit.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_template.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_object.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_object.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_posix_group_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_posix_group_name.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_template_Default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_template_Default.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_information.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_information.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_input_email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_input_email.png -------------------------------------------------------------------------------- /assets/vscode/vsc_remote_connector_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_remote_connector_plugin.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_attribute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_attribute.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_child_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_child_entry.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_template_posix_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_template_posix_group.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_attribute_Email.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_attribute_Email.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_create_child_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_create_child_entry.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_information_detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_information_detail.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_template_groupOfUniqueNames.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_template_groupOfUniqueNames.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_user_email_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_user_email_commit.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_object_posix_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_object_posix_group.png -------------------------------------------------------------------------------- /dockerfiles/comfyui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM harbor.ai.iiis.co:9443/zhaoyue/lab5:v4 2 | 3 
| 4 | WORKDIR /comfyui-repo 5 | COPY . /comfyui-repo/ 6 | 7 | CMD ["bash", "run.sh"] -------------------------------------------------------------------------------- /dockerfiles/sd-webui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM harbor.ai.iiis.co:9443/zhaoyue/lab5:v4 2 | 3 | 4 | WORKDIR /sd-repo 5 | RUN pip install pydantic==1.10.19 6 | COPY . /sd-repo/ 7 | 8 | CMD ["bash", "run.sh"] -------------------------------------------------------------------------------- /admin/adminchart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: pvc-nfshome-{{ .Values.NameSpace }} 6 | namespace: {{ .Values.NameSpace }} 7 | spec: 8 | storageClassName: nfs-users 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 2T # 申请的资源大小 14 | -------------------------------------------------------------------------------- /admin/gfshomechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-sata-pvc-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-sata-users 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 10T # 申请的资源大小 -------------------------------------------------------------------------------- /admin/gfssharechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-sata-share-pvc-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-sata-share 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | 
storage: 10T # 申请的资源大小 -------------------------------------------------------------------------------- /admin/ssdsharechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-nvme-pvc-share-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-nvme-share 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 1T # 与上面PV保持一致避免浪费 14 | -------------------------------------------------------------------------------- /admin/adminchart/templates/quota.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ResourceQuota 4 | metadata: 5 | name: quota-cpu-mem-gpu-pvc 6 | namespace: {{ .Values.NameSpace }} 7 | spec: 8 | hard: 9 | limits.cpu: '56' 10 | limits.memory: 500Gi 11 | requests.cpu: '56' 12 | requests.memory: 500Gi 13 | requests.nvidia.com/gpu: '4' 14 | requests.storage: 40T 15 | count/pods: '10' 16 | 17 | -------------------------------------------------------------------------------- /admin/adminchart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /user/userchart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/gfshomechart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/gfssharechart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/adminchart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/gfshomechart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/gfssharechart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/adminchart/templates/ipoibnetwork.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: mellanox.com/v1alpha1 2 | kind: IPoIBNetwork 3 | metadata: 4 | name: ipoibnetwork-{{ .Values.NameSpace }} 5 | spec: 6 | networkNamespace: {{ .Values.NameSpace }} 7 | master: "ibs121" 8 | ipam: | 9 | { 10 | "type": "whereabouts", 11 | "datastore": "kubernetes", 12 | "kubernetes": { 13 | "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" 14 | }, 15 | "range": "192.168.0.0/16", 16 | "log_file" : "/var/log/whereabouts.log", 17 | "log_level" : "info", 18 | "gateway": "192.168.0.1" 19 | } -------------------------------------------------------------------------------- /admin/adminchart/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: user-ns-binding 6 | namespace: {{ .Values.NameSpace }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: admin 11 | subjects: 12 | - apiGroup: rbac.authorization.k8s.io 13 | kind: User 14 | name: {{ .Values.EMAIL }} 15 | --- 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: ClusterRoleBinding 18 | metadata: 19 | name: clusterrole-user-pv-{{ .Values.NameSpace }} 20 | roleRef: 21 | apiGroup: rbac.authorization.k8s.io 22 | kind: ClusterRole 23 | name: clusterrole-user-pv 24 | subjects: 25 | - apiGroup: rbac.authorization.k8s.io 26 | kind: User 27 | name: {{ .Values.EMAIL }} 
-------------------------------------------------------------------------------- /dockerfiles/comfyui/run.sh: -------------------------------------------------------------------------------- 1 | cd /comfyui-repo/ComfyUI/models 2 | rm -rf checkpoints && ln -s /ssdshare/share/lab5/ComfyUI-models/checkpoints checkpoints 3 | rm -rf clip && ln -s /ssdshare/share/lab5/ComfyUI-models/clip clip 4 | rm -rf controlnet && ln -s /ssdshare/share/lab5/ComfyUI-models/controlnet controlnet 5 | rm -rf gligen && ln -s /ssdshare/share/lab5/ComfyUI-models/gligen gligen 6 | rm -rf loras && ln -s /ssdshare/share/lab5/ComfyUI-models/loras loras 7 | rm -rf upscale_models && ln -s /ssdshare/share/lab5/ComfyUI-models/upscale_models upscale_models 8 | rm -rf vae && ln -s /ssdshare/share/lab5/ComfyUI-models/vae vae 9 | 10 | cd .. 11 | 12 | cd custom_nodes 13 | ln -s /ssdshare/share/lab5/custom_nodes/ComfyUI-Manager ComfyUI-Manager 14 | 15 | cd .. 16 | /usr/bin/python main.py -------------------------------------------------------------------------------- /dockerfiles/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | openai 4 | pandas 5 | requests 6 | scikit-learn 7 | transformers 8 | sentencepiece 9 | protobuf 10 | datasets 11 | accelerate 12 | chardet 13 | python-dotenv==1.0.0 14 | httpx[socks] 15 | httpcore[socks] 16 | ipykernel 17 | ipywidgets 18 | langchain==0.1.9 19 | langchain-openai==0.0.8 20 | langchainhub==0.1.14 21 | google-search-results==2.4.2 22 | lxml==4.9.3 23 | tiktoken 24 | faiss-cpu 25 | beautifulsoup4 26 | chroma-hnswlib 27 | chromadb 28 | matplotlib-inline 29 | pinecone-client 30 | pypdf 31 | scipy 32 | sentence-transformers 33 | tenacity 34 | tqdm 35 | unstructured 36 | unstructured-client 37 | unstructured-inference 38 | unstructured.pytesseract 39 | nltk 40 | rouge 41 | peft 42 | pillow 43 | ftfy 44 | Jinja2 45 | diffusers 46 | tensorboard 47 | 48 | 
-------------------------------------------------------------------------------- /admin/deluser_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | if [ -n "$(echo "$USER" | tr -d '\r')" ]; then 6 | echo "username : $USER" 7 | 8 | if [ ! -d "./yamls/values_$(echo "$USER" | tr -d '\r').yaml" ];then 9 | echo "delete delete pods pvcs namespace in k8s" 10 | helm delete admin-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 11 | helm delete gfshome-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 12 | helm delete gfsshare-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 13 | helm delete ssdshare-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 14 | helm delete testuser-$(echo "$USER" | tr -d '\r') --namespace=$(echo "$USER" | tr -d '\r') 15 | kubectl delete namespace $(echo "$USER" | tr -d '\r') 16 | else 17 | echo "there is no yaml file for $USER" 18 | fi 19 | fi 20 | done < $1 21 | -------------------------------------------------------------------------------- /dockerfiles/sd-webui/run.sh: -------------------------------------------------------------------------------- 1 | cd /sd-repo/stable-diffusion-webui 2 | ln -s /ssdshare/share/lab5/repositories repositories 3 | ln -s /ssdshare/share/lab5/interrogate interrogate 4 | 5 | cd models 6 | rm -rf Lora && ln -s /ssdshare/share/lab5/SD-WebUI-models/Lora Lora 7 | rm -rf BLIP && ln -s /ssdshare/share/lab5/SD-WebUI-models/BLIP BLIP 8 | rm -rf Stable-diffusion && ln -s /ssdshare/share/lab5/SD-WebUI-models/Stable-diffusion Stable-diffusion 9 | rm -rf torch_deepdanbooru && ln -s /ssdshare/share/lab5/SD-WebUI-models/torch_deepdanbooru torch_deepdanbooru 10 | rm -rf CLIP && ln -s /ssdshare/share/lab5/SD-WebUI-models/CLIP CLIP 11 | rm -rf VAE && ln -s /ssdshare/share/lab5/SD-WebUI-models/VAE VAE 12 | rm -rf VAE-approx && ln -s /ssdshare/share/lab5/SD-WebUI-models/VAE-approx VAE-approx 13 | cd .. 
14 | 15 | HF_ENDPOINT="http://hf-mirror.com" /usr/bin/python launch.py --no-download-sd-model --skip-prepare-environment --clip-models-path /ssdshare/share/lab5/clip-vit-l-14 16 | -------------------------------------------------------------------------------- /admin/rmtestpod.sh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | username=$(echo "$USER" | tr -d '\r') 6 | if [ -n "$username" ]; then 7 | echo "username : $username" 8 | 9 | testuser=$(helm list --namespace=$username) 10 | echo $testuser 11 | 12 | if [[ ($testuser =~ "testuser-${username}") && ($testuser =~ "deployed")]] 13 | then 14 | echo "包含" 15 | echo "testuser-${username}" 16 | podtest=$(kubectl get pods -n=$username) 17 | echo $podtest 18 | if [[ ($podtest =~ "testuser-${username}") && ($podtest =~ "Running")]] 19 | then 20 | echo "success create pod, delete helm release testuser-${username}" 21 | helm delete testuser-${username} --namespace=$username 22 | else 23 | echo "create pod error, please check pod" 24 | echo $podtest 25 | fi 26 | else 27 | echo "create helm release error, please check helm" 28 | echo $testuser 29 | fi 30 | fi 31 | done < $1 32 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive 5 | RUN apt-get update && \ 6 | apt-get install -y tzdata && \ 7 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 8 | echo ${TZ} > /etc/timezone && \ 9 | dpkg-reconfigure -f noninteractive tzdata 10 | 11 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ 12 | sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ 13 | echo 
'[global]\nindex-url=https://mirrors.aliyun.com/pypi/simple/\n' >> /etc/pip.conf
14 | 
15 | RUN apt-get update && \
16 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs && \
17 | apt-get clean && rm -rf /tmp/* && \
18 | service ssh start && \
19 | rmdir /workspace && \
20 | ln -s /root/workspace/ /workspace
21 | 
22 | COPY requirements.txt /opt/app/requirements.txt
23 | WORKDIR /opt/app
24 | RUN pip install -r requirements.txt
25 | WORKDIR /root
26 | 
27 | RUN apt-get install -y libgl1 && \
28 | apt-get clean && rm -rf /tmp/*
29 | 
30 | ENTRYPOINT ["/usr/sbin/sshd", "-D"]
31 | CMD ["-p","22"]
32 | 
33 | 
--------------------------------------------------------------------------------
/dockerfiles/lab-cpu/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy
3 | openai
4 | pandas
5 | requests
6 | scikit-learn
7 | transformers
8 | sentencepiece
9 | protobuf
10 | datasets
11 | chardet
12 | python-dotenv
13 | httpx[socks]
14 | httpcore[socks]
15 | ipykernel
16 | ipywidgets
17 | langchain
18 | langchain-openai
19 | langchainhub
20 | langchain_experimental
21 | langchain_core
22 | langchain_community
23 | langchain_unstructured
24 | langgraph
25 | google-search-results
26 | lxml
27 | tiktoken
28 | faiss-cpu
29 | beautifulsoup4
30 | chroma-hnswlib
31 | chromadb
32 | matplotlib-inline
33 | pinecone-client
34 | pypdf
35 | scipy
36 | tenacity
37 | tqdm
38 | unstructured
39 | unstructured-client
40 | unstructured.pytesseract
41 | nltk
42 | rouge
43 | pillow
44 | ftfy
45 | Jinja2
46 | diffusers
47 | tensorboard
48 | tensorstore
49 | zarr
50 | uvicorn
51 | fastapi
52 | websockets
53 | gradio==5.18.0
54 | dspy-ai
55 | langchain_community
56 | pytesseract
57 | pi-heif
58 | opencv-python-headless
59 | langchain_pinecone
60 | langchain_chroma
61 | asyncer
62 | neo4j
63 | yfiles_jupyter_graphs
64 | httpx[socks]
65 | 
66 | # git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
67 | 
# cd LLaMA-Factory 68 | # pip install -e ".[torch,metrics]" -------------------------------------------------------------------------------- /admin/adminchart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /user/userchart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 
9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/gfshomechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/gfssharechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 
24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/ssdsharechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/values-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | EMAIL: $EMAIL 7 | NameSpace: $NAMESPACE # 自己的namespace (同用户名) 8 | BaseName: pytorch # 任务的基本名字,建议写任务描述,例如pytorch 9 | ContainerImage: harbor-local.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh # 镜像名称,默认为 harbor.ai.iiis.co/xuw/pytorch:v1.1,或者见README的说明 10 | GPU: RTX4090D # RTX4090D RTX4090 RTX3090 A100 11 | 12 | ########### 选填的部分 ########### 13 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为'NameSpace-BaseName-ReleaseName', releaseName为随机生成的字符串是在helm命令行里指定的 14 | # Label: pytorch-release # 任务的标签,默认为'BaseName-ReleaseName' 15 | # ContainerName: pytorch-release # 容器名,默认为'BaseName-ReleaseName' 16 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 17 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 18 | # CPU: 8 19 | # memory: 16Gi 20 | # GPU: 0 21 | # UseShm: False 22 | # ShmSize: 8Gi 23 | 24 | # Replicas: 1 # starting more replica of the pod (for distributed training) 25 | 26 | ########### 高级配置 ########### 27 | # ExtraPort: 7860 28 | # IngressHost: lab2.ai.iiis.co 29 | # Command: '["python", "/app/app_class1.py"]' 30 | # Args: '' 31 | UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 32 | # nogfs: true -------------------------------------------------------------------------------- /user/sdwebui-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: sdwebui # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/sdwebui:v1 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | CPU: 8 18 | memory: 16Gi 19 | GPU: 1 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | ExtraPort: 7860 27 | #IngressHost: sdwebui.ai.iiis.co 28 | Command: '["bash", "/sd-repo/run.sh"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /user/comfyui-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: comfyui # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/comfyui:v1 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | CPU: 8 18 | memory: 16Gi 19 | GPU: 1 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | ExtraPort: 8188 27 | #IngressHost: comfyui.ai.iiis.co 28 | Command: '["bash", "/comfyui-repo/run.sh"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /user/values-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: pytorch # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/lab-cpu:latest # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | # CPU: 8 18 | # memory: 16Gi 19 | # GPU: 0 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | # ExtraPort: 7860 27 | # IngressHost: lab2.ai.iiis.co 28 | # Command: '["python", "/app/app_class1.py"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /admin/adduser_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | username=$(echo "$USER" | tr -d '\r') 6 | if [ -n "$username" ]; then 7 | echo "username : $username" 8 | 9 | yamlfile=`cat ./values-template.yaml` 10 | all_variables="NAMESPACE=$username EMAIL=$(echo "$EMAIL" | tr -d '\r')" 11 | 12 | if [ ! 
-d "./yamls/" ];then 13 | mkdir ./yamls 14 | fi 15 | printf "$all_variables\ncat << EOF\n$yamlfile\nEOF" | bash > ./yamls/values_$username.yaml 16 | 17 | kubectl create namespace $username 18 | 19 | helm install admin-$username \ 20 | --namespace=admin-helm \ 21 | --create-namespace \ 22 | --values ./yamls/values_$username.yaml \ 23 | ./adminchart 24 | 25 | helm install gfshome-$username \ 26 | --namespace=admin-helm \ 27 | --create-namespace \ 28 | --values ./yamls/values_$username.yaml \ 29 | ./gfshomechart 30 | 31 | helm install gfsshare-$username \ 32 | --namespace=admin-helm \ 33 | --create-namespace \ 34 | --values ./yamls/values_$username.yaml \ 35 | ./gfssharechart 36 | 37 | helm install ssdshare-$username \ 38 | --namespace=admin-helm \ 39 | --create-namespace \ 40 | --values ./yamls/values_$username.yaml \ 41 | ./ssdsharechart 42 | 43 | testuser=$(helm install testuser-$username \ 44 | --namespace=$username \ 45 | --values ./yamls/values_$username.yaml \ 46 | ../user/userchart) 47 | 48 | if [[ ($testuser =~ $username) && ($testuser =~ "deployed") ]] 49 | then 50 | echo "成功 创建用户 $username" 51 | else 52 | echo "失败 创建用户 $username" 53 | fi 54 | 55 | fi 56 | done < $1 57 | -------------------------------------------------------------------------------- /test/username.txt: -------------------------------------------------------------------------------- 1 | test01,test01@test.lthpc.com,2001,500 2 | test02,test02@test.lthpc.com,2002,500 3 | test03,test03@test.lthpc.com,2003,500 4 | test04,test04@test.lthpc.com,2004,500 5 | test05,test05@test.lthpc.com,2005,500 6 | test06,test06@test.lthpc.com,2006,500 7 | test07,test07@test.lthpc.com,2007,500 8 | test08,test08@test.lthpc.com,2008,500 9 | test09,test09@test.lthpc.com,2009,500 10 | test10,test10@test.lthpc.com,2010,500 11 | test11,test11@test.lthpc.com,2011,500 12 | test12,test12@test.lthpc.com,2012,500 13 | test13,test13@test.lthpc.com,2013,500 14 | test14,test14@test.lthpc.com,2014,500 15 | 
test15,test15@test.lthpc.com,2015,500 16 | test16,test16@test.lthpc.com,2016,500 17 | test17,test17@test.lthpc.com,2017,500 18 | test18,test18@test.lthpc.com,2018,500 19 | test19,test19@test.lthpc.com,2019,500 20 | test20,test20@test.lthpc.com,2020,500 21 | test21,test21@test.lthpc.com,2021,500 22 | test22,test22@test.lthpc.com,2022,500 23 | test23,test23@test.lthpc.com,2023,500 24 | test24,test24@test.lthpc.com,2024,500 25 | test25,test25@test.lthpc.com,2025,500 26 | test26,test26@test.lthpc.com,2026,500 27 | test27,test27@test.lthpc.com,2027,500 28 | test28,test28@test.lthpc.com,2028,500 29 | test29,test29@test.lthpc.com,2029,500 30 | test30,test30@test.lthpc.com,2030,500 31 | test31,test31@test.lthpc.com,2031,500 32 | test32,test32@test.lthpc.com,2032,500 33 | test33,test33@test.lthpc.com,2033,500 34 | test34,test34@test.lthpc.com,2034,500 35 | test35,test35@test.lthpc.com,2035,500 36 | test36,test36@test.lthpc.com,2036,500 37 | test37,test37@test.lthpc.com,2037,500 38 | test38,test38@test.lthpc.com,2038,500 39 | test39,test39@test.lthpc.com,2039,500 40 | test40,test40@test.lthpc.com,2040,500 41 | test41,test41@test.lthpc.com,2041,500 42 | test42,test42@test.lthpc.com,2042,500 43 | test43,test43@test.lthpc.com,2043,500 44 | test44,test44@test.lthpc.com,2044,500 45 | test45,test45@test.lthpc.com,2045,500 46 | test46,test46@test.lthpc.com,2046,500 47 | test47,test47@test.lthpc.com,2047,500 48 | test48,test48@test.lthpc.com,2048,500 49 | test49,test49@test.lthpc.com,2049,500 50 | test50,test50@test.lthpc.com,2050,500 -------------------------------------------------------------------------------- /admin/pull_images_to_local.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: prepuller 5 | spec: 6 | selector: 7 | matchLabels: 8 | name: prepuller 9 | template: 10 | metadata: 11 | labels: 12 | name: prepuller 13 | spec: 14 | # Configure an init container for each 
image you want to pull 15 | initContainers: 16 | - name: prepuller-1 17 | # Set the image you want to pull 18 | #image: harbor-local.ai.iiis.co/llm-course/lab:v2 19 | image: harbor-local.ai.iiis.co/llm-course/lab5:v4 20 | # Use a known command that will exit successfully immediately 21 | # Any no-op command will do but YMMV with scratch based containers 22 | command: ["sh", "-c", "'true'"] 23 | resources: 24 | limits: 25 | cpu: 100m 26 | memory: 1Gi 27 | requests: 28 | cpu: 1m 29 | memory: 8Mi 30 | - name: prepuller-2 31 | # Set the image you want to pull 32 | image: harbor-local.ai.iiis.co/llm-course/lab:v2.4 33 | command: ["sh", "-c", "'true'"] 34 | resources: 35 | limits: 36 | cpu: 100m 37 | memory: 1Gi 38 | requests: 39 | cpu: 1m 40 | memory: 8Mi 41 | - name: prepuller-3 42 | # Set the image you want to pull 43 | image: harbor-local.ai.iiis.co/llm-course/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3 44 | command: ["sh", "-c", "'true'"] 45 | resources: 46 | limits: 47 | cpu: 100m 48 | memory: 1Gi 49 | requests: 50 | cpu: 1m 51 | memory: 8Mi 52 | 53 | # - name: prepuller-2 54 | # image: ... 55 | # command: ["sh", "-c", "'true'"] 56 | 57 | # etc... 
58 | 59 | # Use the pause container to ensure the Pod goes into a `Running` phase 60 | # but doesn't take up resource on the cluster 61 | containers: 62 | - name: pause 63 | image: harbor.ai.iiis.co:9443/xuw/pause:3.2 64 | resources: 65 | limits: 66 | cpu: 1m 67 | memory: 8Mi 68 | requests: 69 | cpu: 1m 70 | memory: 8Mi 71 | -------------------------------------------------------------------------------- /dockerfiles/lab-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y ca-certificates tzdata && \ 8 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 9 | echo ${TZ} > /etc/timezone && \ 10 | dpkg-reconfigure -f noninteractive tzdata && \ 11 | cat <<'EOF' > /etc/apt/sources.list.d/ubuntu.sources 12 | Types: deb 13 | URIs: https://mirrors.cernet.edu.cn/ubuntu 14 | Suites: noble noble-updates noble-backports 15 | Components: main restricted universe multiverse 16 | Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 17 | 18 | # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 19 | # Types: deb-src 20 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 21 | # Suites: noble noble-updates noble-backports 22 | # Components: main restricted universe multiverse 23 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 24 | 25 | # 以下安全更新软件源包含了官方源与镜像站配置,如有需要可自行修改注释切换 26 | # Types: deb 27 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 28 | # Suites: noble-security 29 | # Components: main restricted universe multiverse 30 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 31 | 32 | # # Types: deb-src 33 | # # URIs: https://mirrors.cernet.edu.cn/ubuntu 34 | # # Suites: noble-security 35 | # # Components: main restricted universe multiverse 36 | # # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 37 | 38 | Types: deb 39 | URIs: http://security.ubuntu.com/ubuntu/ 40 | 
Suites: noble-security 41 | Components: main restricted universe multiverse 42 | Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 43 | 44 | # Types: deb-src 45 | # URIs: http://security.ubuntu.com/ubuntu/ 46 | # Suites: noble-security 47 | # Components: main restricted universe multiverse 48 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 49 | 50 | # 预发布软件源,不建议启用 51 | # Types: deb 52 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 53 | # Suites: noble-proposed 54 | # Components: main restricted universe multiverse 55 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 56 | 57 | # # Types: deb-src 58 | # # URIs: https://mirrors.cernet.edu.cn/ubuntu 59 | # # Suites: noble-proposed 60 | # # Components: main restricted universe multiverse 61 | # # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 62 | EOF 63 | 64 | RUN apt-get update && \ 65 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs man-db zsh poppler-utils tesseract-ocr libgl1 python3-full python-is-python3 python3-pip && \ 66 | python3 -m pip config set global.index-url https://mirrors.cernet.edu.cn/pypi/web/simple && \ 67 | python3 -m pip config set global.trusted-host mirrors.cernet.edu.cn && \ 68 | python3 -m pip config set global.break-system-packages true && \ 69 | apt-get clean && rm -rf /tmp/* && \ 70 | service ssh start 71 | 72 | COPY requirements.txt /opt/app/requirements.txt 73 | WORKDIR /opt/app 74 | RUN python3 -m pip install --no-cache-dir -r requirements.txt 75 | WORKDIR /root 76 | 77 | ENTRYPOINT ["/usr/sbin/sshd", "-D"] 78 | CMD ["-p","22"] 79 | 80 | -------------------------------------------------------------------------------- /dockerfiles/lab/requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==69.5.1 # temp fix for compatibility with some old packages 2 | matplotlib 3 | numpy 4 | openai 5 | pandas 6 | requests 7 | scikit-learn 8 | transformers 9 | 
sentencepiece 10 | protobuf 11 | datasets 12 | accelerate 13 | chardet 14 | python-dotenv 15 | httpx[socks] 16 | httpcore[socks] 17 | ipykernel 18 | ipywidgets 19 | langchain 20 | langchain-openai 21 | langchainhub 22 | langgraph 23 | google-search-results 24 | lxml 25 | tiktoken 26 | faiss-cpu 27 | beautifulsoup4 28 | chroma-hnswlib 29 | chromadb 30 | matplotlib-inline 31 | pinecone-client 32 | pypdf 33 | scipy 34 | sentence-transformers 35 | tenacity 36 | tqdm 37 | unstructured 38 | unstructured-client 39 | unstructured-inference 40 | unstructured.pytesseract 41 | nltk 42 | rouge 43 | peft 44 | pillow 45 | ftfy 46 | Jinja2 47 | diffusers 48 | tensorboard 49 | tensorstore 50 | zarr 51 | requests 52 | uvicorn 53 | websockets 54 | gradio 55 | python-dotenv 56 | dspy-ai 57 | jinja2 58 | langchain_community 59 | sentence-transformers 60 | tenacity 61 | tiktoken 62 | tqdm 63 | unstructured 64 | unstructured-client 65 | unstructured-inference 66 | pytesseract 67 | unstructured.pytesseract 68 | pi-heif 69 | opencv-python-headless 70 | langchain_pinecone 71 | langchain_chroma 72 | asyncer 73 | neo4j 74 | yfiles_jupyter_graphs 75 | 76 | # git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git 77 | # cd LLaMA-Factory 78 | # pip install -e ".[torch,metrics]" 79 | comfyui-frontend-package==1.14.6 80 | numpy>=1.25.0 81 | einops 82 | tokenizers>=0.13.3 83 | sentencepiece 84 | safetensors>=0.4.2 85 | aiohttp>=3.11.8 86 | yarl>=1.18.0 87 | pyyaml 88 | Pillow 89 | scipy 90 | tqdm 91 | psutil 92 | 93 | #non essential dependencies: 94 | spandrel 95 | soundfile 96 | av 97 | 98 | Pillow 99 | 100 | blendmodes 101 | clean-fid 102 | diskcache 103 | einops 104 | facexlib 105 | gradio==3.41.2 106 | inflection 107 | jsonmerge 108 | kornia 109 | lark 110 | numpy 111 | omegaconf 112 | open-clip-torch 113 | 114 | piexif 115 | protobuf==3.20.0 116 | psutil 117 | pytorch_lightning 118 | requests 119 | resize-right 120 | 121 | safetensors 122 | scikit-image>=0.19 123 | tomesd 124 | 
torchdiffeq 125 | transformers==4.30.2 126 | pillow-avif-plugin==1.4.3 127 | 128 | GitPython==3.1.32 129 | Pillow==9.5.0 130 | accelerate==0.21.0 131 | blendmodes==2022 132 | clean-fid==0.1.35 133 | diskcache==5.6.3 134 | einops==0.4.1 135 | facexlib==0.3.0 136 | fastapi==0.95.2 137 | gradio==3.41.2 138 | httpcore==0.15 139 | inflection==0.5.1 140 | jsonmerge==1.8.0 141 | kornia==0.6.7 142 | lark==1.1.2 143 | numpy==1.26.2 144 | omegaconf==2.2.3 145 | open-clip-torch==2.20.0 146 | piexif==1.1.3 147 | protobuf==3.20.0 148 | psutil==5.9.5 149 | pytorch_lightning==1.9.4 150 | resize-right==0.0.2 151 | safetensors==0.4.2 152 | scikit-image==0.21.0 153 | spandrel==0.3.4 154 | spandrel-extra-arches==0.1.1 155 | tomesd==0.1.3 156 | torchdiffeq==0.2.3 157 | torchsde==0.2.6 158 | httpx==0.24.1 159 | pillow-avif-plugin==1.4.3 160 | pytest-base-url~=2.0 161 | pytest-cov~=4.0 162 | pytest~=7.3 -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | ### FAQ - 常见问题汇总 2 | 3 | #### 1. **Kubeconfig 配置错误** 4 | - **问题**: 复制粘贴 kubeconfig 配置时出现问题,如何解决? 5 | - **解决**: 6 | 确保在复制时没有破坏格式,尤其是 server 部分。可以重新运行 login.ai 网站上的命令,并检查生成的 `.crt` 文件是否存在,确保没有损坏。 7 | 重新生成配置文件的步骤如下(详细步骤请参考[README](README.md#配置kubeconfig)): 8 | 1. 打开浏览器并访问 https://login.ai.iiis.co:9443。 9 | 2. 使用您的邮箱地址(格式为:用户名@iiis.co)和密码登录。 10 | 3. 登录后,进入 kubeconfig 信息页面,选择您使用的系统类型。 11 | 4. 按照页面指示的顺序在命令行运行命令,这些命令会生成名为 `config` 的配置文件。 12 | 5. 确保在复制粘贴命令时没有破坏格式,尤其是 `server` 部分。 13 | 6. 检查生成的 `.crt` 文件是否存在,并确保没有损坏。 14 | 7. 运行以下命令设置默认的 namespace: 15 | ```bash 16 | kubectl config set-context --current --namespace=`kubectl config current-context | cut -d'-' -f 1` 17 | ``` 18 | 19 | #### 2. **OpenAI API Key 问题** 20 | - **问题**: OpenAI API Key 被禁用或失效,如何解决? 21 | - **解决**: 如果无法使用 OpenAI API,可以尝试购买淘宝上的中转 API 或向助教借用一个,但会有使用量限制。 22 | 23 | #### 3. **GPU 配置问题** 24 | - **问题**: 如何确保在 Kubernetes 中正确配置和使用 GPU? 
25 | - **解决**: 在 YAML 文件中确保 GPU 配置正确,并通过 `nvidia-smi` 检查 GPU 是否可用。 26 | 如果使用 Helm 创建 Pod,请确保在 `values.yaml` 文件中正确配置以下 GPU 选项(详细步骤请参考[README](README.md#使用默认配置启动计算任务)): 27 | 1. `GPU`: 设置所需的 GPU 类型,例如 `RTX4090`、`RTX4090D` 或 `RTX3090`。 28 | 2. `Limits.GPU`: 设置 GPU 的数量,确保不超过集群的配额。 29 | 配置完成后,重新安装 Helm 部署。重新安装的命令如下: 30 | ```bash 31 | helm uninstall release_name 32 | helm install release_name --values ./values.yaml ./userchart 33 | ``` 34 | 35 | #### 4. **Pod 启动问题** 36 | - **问题**: 新创建的 Pod 一直处于 Pending 状态,如何解决? 37 | - **解决**: 检查 Pod 的资源需求是否超出了集群资源,或通过 `kubectl describe` 检查 Pod 启动失败的原因。如果资源不足,可以尝试调整 Pod 的资源配置。 38 | 39 | #### 5. **VS Code 连接问题** 40 | - **问题**: VS Code 无法连接到远程 Pod,如何解决? 41 | - **解决**: 确保 Pod 处于运行状态,如果 Pod 已经停止或被重新启动,可以通过删除旧的 Pod 并重新创建新的 Pod 来解决。确保 VS Code 配置正确,或者尝试通过删除 `.vscode-server` 文件夹后重新连接。 42 | 43 | #### 6. **存储空间不足** 44 | - **问题**: 运行模型时出现 shm 空间不足的问题,如何解决? 45 | - **解决**: 参考最新的 `ailab` 仓库模板,修改 `values.yaml` 文件并启用 shm 支持,重新创建 Pod。 46 | 47 | #### 7. **模型训练时内存溢出(OOM)** 48 | - **问题**: 训练模型时出现 OOM 错误,如何避免? 49 | - **解决**: 尝试调整 batch size 或将数据分批处理。如果使用多个 GPU,确保每个 GPU 的内存使用均衡。 50 | 51 | #### 8. **API Key 和代理问题** 52 | - **问题**: 使用 API 时出现 "找不到函数" 或 API 返回错误,如何解决? 53 | - **解决**: 检查是否使用了正确的 API Key 和代理配置。如果需要,重试连接并检查网络或代理设置。 54 | 55 | #### 9. **如何快速下载大文件** 56 | - **问题**: 如何从远程服务器快速下载大于 100MB 的文件? 57 | - **解决**: 使用 `scp` 或 `kubectl cp` 命令来下载文件。若遇到网络问题,可以考虑使用更好的代理。 58 | 59 | #### 10. **模型生成错误** 60 | - **问题**: 模型生成的结果缺少部分代码或格式不正确,如何修复? 61 | - **解决**: 检查生成的代码是否符合格式要求,确保所有的代码块都正确闭合。尝试调整 `max_length` 或 `tokenizer` 配置,避免生成超长的代码。 62 | 63 | #### 11. **如何避免多 GPU 计算时的卡顿问题** 64 | - **问题**: 在使用多个 GPU 时,如何避免性能瓶颈? 65 | - **解决**: 使用 `CUDA_VISIBLE_DEVICES` 配置来选择特定的 GPU,并调整模型的负载分配,避免某些 GPU 负载过重。 66 | 67 | #### 12. **如何在 Jupyter Notebook 中释放 GPU 内存** 68 | - **问题**: Jupyter Notebook 中如果某个 cell 出现 OOM 错误,如何释放 GPU 内存而不重启 Kernel? 69 | - **解决**: 尝试使用 `empty_cache()`,但在某些情况下可能无法完全释放内存。如果内存没有被回收,重启 Kernel 或删除占用内存的变量可能是最有效的解决方法。 70 | 71 | #### 13. 
**模型训练时的资源分配问题** 72 | - **问题**: 使用两张显卡时,训练速度反而变慢,如何解决? 73 | - **解决**: 检查 CPU 和内存的资源分配,确保资源足够。可以适当增加 CPU 核数或内存,避免瓶颈限制训练速度。 74 | 75 | 这些是常见问题的解答,如果遇到其他问题,请随时询问! -------------------------------------------------------------------------------- /dockerfiles/lab/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.10-py3 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive \ 5 | BNB_CUDA_VERSION=125 6 | 7 | RUN apt-get update && \ 8 | apt-get install -y ca-certificates tzdata && \ 9 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 10 | echo ${TZ} > /etc/timezone && \ 11 | dpkg-reconfigure -f noninteractive tzdata 12 | 13 | 14 | # RUN cat <<'EOF' > /etc/apt/sources.list 15 | # # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 16 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy main restricted universe multiverse 17 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy main restricted universe multiverse 18 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse 19 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse 20 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse 21 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse 22 | 23 | # # 以下安全更新软件源包含了官方源与镜像站配置,如有需要可自行修改注释切换 24 | # # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-security main restricted universe multiverse 25 | # # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-security main restricted universe multiverse 26 | 27 | # deb http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse 28 | # # deb-src http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse 29 | 30 | # # 预发布软件源,不建议启用 31 | # # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-proposed main restricted universe 
multiverse 32 | # # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-proposed main restricted universe multiverse 33 | # EOF 34 | 35 | RUN apt-get update && \ 36 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs man-db zsh poppler-utils tesseract-ocr libgl1 && \ 37 | # pip config set global.index-url https://mirrors.cernet.edu.cn/pypi/web/simple && \ 38 | # pip config set global.trusted-host mirrors.cernet.edu.cn && \ 39 | apt-get clean && rm -rf /tmp/* && \ 40 | service ssh start && \ 41 | rm /workspace/* -rf && \ 42 | mkdir -p /root/workspace && \ 43 | ln -s /root/workspace/ /workspace 44 | 45 | COPY requirements.txt /opt/app/requirements.txt 46 | COPY lab5-version-package/flashinfer-0.2.4.tar.gz /opt/app/flashinfer-0.2.4.tar.gz 47 | WORKDIR /opt/app 48 | RUN pip install -r requirements.txt && \ 49 | pip install ./flashinfer-0.2.4.tar.gz && \ 50 | pip install sglang[all] -i https://pypi.org/simple 51 | WORKDIR /root 52 | 53 | COPY . /root 54 | RUN pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 && \ 55 | pip install qwen-vl-utils && \ 56 | pip install /root/lab5-version-package/transformers && \ 57 | pip install /root/lab5-version-package/CLIP-d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip && \ 58 | pip install /root/lab5-version-package/open_clip-bb6e834e9c70d9c27d0dc3ecedeebeaeb1ffad6b.zip && \ 59 | pip install -U -I --no-deps xformers==0.0.23.post1 && \ 60 | pip install ngrok && \ 61 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 62 | 63 | RUN pip install flash-attn==2.7.0.post2 --no-build-isolation && \ 64 | pip install -U accelerate && \ 65 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 66 | 67 | WORKDIR /app 68 | COPY ./lab5-version-package/sam2 /app/sam2 69 | RUN pip install -e /app/sam2 && \ 70 | pip install 
/root/lab5-version-package/opencv_python-4.8.0.74-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl && \ 71 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 72 | 73 | WORKDIR /root 74 | ENTRYPOINT ["/usr/sbin/sshd", "-D"] 75 | CMD ["-p","22"] 76 | 77 | -------------------------------------------------------------------------------- /admin/cluster_setting/rancher_local_path_nvme.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: local-path-storage 5 | 6 | --- 7 | apiVersion: v1 8 | kind: ServiceAccount 9 | metadata: 10 | name: local-path-provisioner-service-account 11 | namespace: local-path-storage 12 | 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: Role 16 | metadata: 17 | name: local-path-provisioner-role 18 | namespace: local-path-storage 19 | rules: 20 | - apiGroups: [""] 21 | resources: ["pods"] 22 | verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] 23 | 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: ClusterRole 27 | metadata: 28 | name: local-path-provisioner-role 29 | rules: 30 | - apiGroups: [""] 31 | resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"] 32 | verbs: ["get", "list", "watch"] 33 | - apiGroups: [""] 34 | resources: ["persistentvolumes"] 35 | verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] 36 | - apiGroups: [""] 37 | resources: ["events"] 38 | verbs: ["create", "patch"] 39 | - apiGroups: ["storage.k8s.io"] 40 | resources: ["storageclasses"] 41 | verbs: ["get", "list", "watch"] 42 | 43 | --- 44 | apiVersion: rbac.authorization.k8s.io/v1 45 | kind: RoleBinding 46 | metadata: 47 | name: local-path-provisioner-bind 48 | namespace: local-path-storage 49 | roleRef: 50 | apiGroup: rbac.authorization.k8s.io 51 | kind: Role 52 | name: local-path-provisioner-role 53 | subjects: 54 | - kind: ServiceAccount 
55 | name: local-path-provisioner-service-account 56 | namespace: local-path-storage 57 | 58 | --- 59 | apiVersion: rbac.authorization.k8s.io/v1 60 | kind: ClusterRoleBinding 61 | metadata: 62 | name: local-path-provisioner-bind 63 | roleRef: 64 | apiGroup: rbac.authorization.k8s.io 65 | kind: ClusterRole 66 | name: local-path-provisioner-role 67 | subjects: 68 | - kind: ServiceAccount 69 | name: local-path-provisioner-service-account 70 | namespace: local-path-storage 71 | 72 | --- 73 | apiVersion: apps/v1 74 | kind: Deployment 75 | metadata: 76 | name: local-path-provisioner 77 | namespace: local-path-storage 78 | spec: 79 | replicas: 1 80 | selector: 81 | matchLabels: 82 | app: local-path-provisioner 83 | template: 84 | metadata: 85 | labels: 86 | app: local-path-provisioner 87 | spec: 88 | serviceAccountName: local-path-provisioner-service-account 89 | containers: 90 | - name: local-path-provisioner 91 | image: harbor.ai.iiis.co:9443/deploy/docker.io/rancher/local-path-provisioner:v0.0.26 92 | imagePullPolicy: IfNotPresent 93 | command: 94 | - local-path-provisioner 95 | - --debug 96 | - start 97 | - --config 98 | - /etc/config/config.json 99 | volumeMounts: 100 | - name: config-volume 101 | mountPath: /etc/config/ 102 | env: 103 | - name: POD_NAMESPACE 104 | valueFrom: 105 | fieldRef: 106 | fieldPath: metadata.namespace 107 | volumes: 108 | - name: config-volume 109 | configMap: 110 | name: local-path-config 111 | 112 | --- 113 | apiVersion: storage.k8s.io/v1 114 | kind: StorageClass 115 | metadata: 116 | name: rancher-local-path 117 | provisioner: rancher.io/local-path 118 | volumeBindingMode: WaitForFirstConsumer 119 | reclaimPolicy: Delete 120 | 121 | --- 122 | kind: ConfigMap 123 | apiVersion: v1 124 | metadata: 125 | name: local-path-config 126 | namespace: local-path-storage 127 | data: 128 | config.json: |- 129 | { 130 | "nodePathMap":[ 131 | { 132 | "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", 133 | 
"paths":["/nvme1/local-path-provisioner","/nvme2/local-path-provisioner"] 134 | } 135 | ] 136 | } 137 | setup: |- 138 | #!/bin/sh 139 | set -eu 140 | mkdir -m 0777 -p "$VOL_DIR" 141 | teardown: |- 142 | #!/bin/sh 143 | set -eu 144 | rm -rf "$VOL_DIR" 145 | helperPod.yaml: |- 146 | apiVersion: v1 147 | kind: Pod 148 | metadata: 149 | name: helper-pod 150 | spec: 151 | priorityClassName: system-node-critical 152 | tolerations: 153 | - key: node.kubernetes.io/disk-pressure 154 | operator: Exists 155 | effect: NoSchedule 156 | containers: 157 | - name: helper-pod 158 | image: harbor.ai.iiis.co:9443/deploy/docker.io/library/busybox:latest 159 | imagePullPolicy: IfNotPresent -------------------------------------------------------------------------------- /admin/README.md: -------------------------------------------------------------------------------- 1 | # 管理员使用说明 2 | 3 | ## 用户管理 4 | 5 | 集群提供了LDAP的操作界面(phpLDAPAdmin),用于管理员进行用户账号信息的管理操作。 6 | 7 | ### 关于用户组织架构的约定 8 | 9 | 由于LDAP是一个应用场景范围比较广泛的规范。应用于K8S集群单点登录场景时,要对其中的用户组织架构附加一定的限制,才能顺畅地实现多个身份验证相关服务的打通。 10 | 11 | 期望的组织架构如下图所示。 12 | 13 | ![](assets/tree_users_ldap.png) 14 | 15 | 组织架构的约定如下: 16 | 1. 在根节点(dc=iiis,dc=co)下,创建两个ou(Organisational Unit),名为Groups、People; 17 | 2. Group下必须一个或多个类型为posixGroup的组(如上图中的defaultGroup、student组),用于创建用户账户时填写gid字段; 18 | 3. 组都建在ou=Group节点下;除第2条所述的个别posixGroup外,其他分组使用groupOfUniqueNames类型; 19 | 4. 用户都建在ou=People节点下,类型为User Account对象;必须为用户增加Email字段,并确保该字段取值唯一; 20 | 5. (可选)可以在People下面创建多个子ou来对用户进行分类管理(如上图中的faculty/staff/Students三个ou); 21 | 6. 
通过修改组(groupOfUniqueNames类型)的member属性实现用户和组的关联,一个用户可以隶属于多个groupOfUniqueNames组。 22 | 23 | ### 操作说明 24 | 25 | 在浏览器上登录LDAP管理界面(phpLDAPAdmin)的URL地址:`https://ldap.ai.iiis.co/`。 26 | 管理员用户名:`cn=admin,dc=iiis,dc=co` 27 | 28 | 29 | 登录后,界面如下,用于操作的界面元素分为三部分:*左上操作按钮栏*、*左侧组织架构*、*右侧操作页面*。 30 | 31 | ![](assets/phpLDAPadmin_main_page.png) 32 | 33 | #### 1、创建ou(Groups和People) 34 | 35 | 1)在*左侧组织架构*上选中根节点(dc=iiis,dc=co),右侧操作页面上选择【Create a child entry】操作: 36 | 37 | ![](assets/phpLDAPadmin_create_child_entry.png) 38 | 39 | 2)选择【Generic: Organisational Unit】模版: 40 | 41 | ![](assets/phpLDAPadmin_create_object.png) 42 | 43 | 3)填写ou名称(People或Groups),点击【Create Object】按钮: 44 | 45 | ![](assets/phpLDAPadmin_create_ou.png) 46 | 47 | 4)确认页面中,点击【Commit】按钮: 48 | 49 | ![](assets/phpLDAPadmin_ou_commit.png) 50 | 51 | 经过以上操作一个ou就创建完成了。可按照上述步骤再继续创建其他ou。 52 | 53 | **注:在People下创建子ou的过程类似,只要在第(1)步中选择`ou=People`节点作为父节点进行操作即可。** 54 | 55 | 56 | #### 2、创建一个Posix Group用户组 57 | 58 | 由于创建用户账户时,需要选择一个Posix Group的Group ID(gid),因此须创建一个Posix Group对象。**这项操作只做一次即可。** 59 | 操作方式如下。 60 | 61 | 1)左侧组织架构上选中”ou=Groups“,右侧操作页面选择【Create a child entry】操作: 62 | 63 | ![](assets/phpLDAPadmin_create_object_posix_group.png) 64 | 65 | 2)选择【Generic: Posix Group】模版 66 | 67 | ![](assets/phpLDAPadmin_template_posix_group.png) 68 | 69 | 3)填写组名。系统会自动生成GID Number: 70 | 71 | ![](assets/phpLDAPAdmin_posix_group_name.png) 72 | 73 | 4)确认页面中点击【Commit】按钮,完成创建。 74 | 75 | 76 | #### 3、创建用户账户(常用操作) 77 | 78 | **这里创建的用户账户信息将可用于K8S集群kubeconfig的获取和相关系统的单点登录。** 79 | 80 | 1)左侧组织架构上选中ou=People或者其子ou(例如下图的ou=staff),右侧操作页面选择【Create a child entry】操作 81 | 82 | ![](assets/phpLDAPAdmin_user_create_child_entry.png) 83 | 84 | 2)选择【Generic: User Account】模版类型: 85 | 86 | ![](assets/phpLDAPAdmin_user_template.png) 87 | 88 | 3)填写用户信息,点击【Create Object】按钮 89 | 90 | - 其中GID Number字段可通过下拉菜单选择任意一个Posix Group的名称。 91 | 92 | ![](assets/phpLDAPAdmin_user_information.png) 93 | 94 | 95 | 4)在确认页中点击【Commit】按钮。 96 | 97 | ![](assets/phpLDAPAdmin_user_commit.png) 98 | 99 | 100 | 
5)增加Email属性并填写用户email地址。 101 | 左侧组织架构上选中用户cn,右侧页面选择【Add new attribute】操作: 102 | 103 | ![](assets/phpLDAPAdmin_user_add_attribute.png) 104 | 105 | 页面会出现Add Attribute的栏目,在下拉菜单中选择Email这个属性: 106 | 107 | ![](assets/phpLDAPAdmin_user_add_attribute_Email.png) 108 | 109 | 填写用户的Email地址: 110 | 111 | ![](assets/phpLDAPAdmin_user_input_email.png) 112 | 113 | 点击右侧操作页面下方的【Update Object】按钮。 114 | 并在确认页面点击【Update Object】按钮。 115 | 116 | ![](assets/phpLDAPAdmin_user_add_user_email_commit.png) 117 | 118 | 119 | 可以在用户账号的页面上看到已经添加了Email信息: 120 | ![](assets/phpLDAPAdmin_user_information_detail.png) 121 | 122 | 123 | #### 4、创建用户组 124 | 125 | 这里创建的用户组与用户账户是**多对多**的关系,即一个组可以包含多个用户,同时一个用户可以归属于多个组。从扩展性考虑,用户组采用groupOfUniqueNames类型(也可采用groupOfNames类型)。 126 | 127 | 操作方式如下。 128 | 129 | 1)左侧组织架构上选中ou=Groups,右侧操作页面选择【Create a child entry】操作; 130 | 131 | 2)模版类型选择default - groupOfUniqueNames,如下面两个图所示: 132 | 133 | ![](assets/phpLDAPAdmin_template_Default.png) 134 | 135 | ![](assets/phpLDAPAdmin_template_groupOfUniqueNames.png) 136 | 137 | 3)进入了用户组的编辑页面,填写三个必填字段即可: 138 | - RDN选择cn; 139 | - cn填写组名; 140 | - uniqueMember选择一个属于该组的用户账户即可。 141 | ![](assets/phpLDAPAdmin_group_input.png) 142 | 143 | 后续提交、确认即可完成组的创建。 144 | 145 | 146 | #### 5、修改组成员 147 | 148 | 在用户组上可以通过修改uniqueMember字段加入用户。同一个用户可以归属到多个组。 149 | 150 | ![](assets/phpLDAPAdmin_group_member.png) 151 | 152 | ### K8S创建命名空间、PVC及授权 153 | 154 | `adduser_dir.sh` 脚本会为K8S集群安装本地硬盘卷的驱动器。并根据用户信息明细文件为每个用户创建命名空间、授权用户在自己命名空间中具有USER权限,在GFS为用户创建个人数据路径并创建对应的PVC,设置每个命名空间的资源限制。 155 | 156 | 用户明细文件以 `username.txt` 为例,文件内容包括 `uid,mail,uidNumber,gidNumber` 四列内容,没有表头。 157 | 158 | ``` 159 | $ cat username.txt 160 | 161 | test01,test01@test.lthpc.com,2001,500 162 | test02,test02@test.lthpc.com,2002,500 163 | test03,test03@test.lthpc.com,2003,500 164 | 165 | ``` 166 | 167 | 切换到本项目的 `admin` 路径下,执行脚本自动创建用户的相关资源。 168 | ``` 169 | $ bash adduser_dir.sh username.txt 170 | ``` 171 | 脚本运行完成之后,`username.txt` 中的用户就可以通过kubeconfig使用K8S集群了。`username.txt` 文件需要保留,在删除用户时需要用到。 172 | 173 | ### 
K8S 删除用户及相关资源 174 | 175 | `deluser_dir.sh` 脚本可以自动删除用户在GFS的个人PVC、NFS中的个人PVC和个人命名空间中的所有资源以及命名空间本身。 176 | 177 | 切换到本项目的 `admin` 路径下,执行脚本自动删除用户的相关资源。 178 | 179 | ``` 180 | $ bash deluser_dir.sh username.txt 181 | ``` 182 | 183 | ### 添加K8S集群管理员 184 | 185 | 将`admin@admin.com`改为需要被设置为管理员的账号,在master节点或具有最高权限的终端执行下面的命令。 186 | 187 | ``` 188 | kubectl create clusterrolebinding root-cluster-admin-binding --clusterrole=cluster-admin --user=admin@admin.com 189 | ``` -------------------------------------------------------------------------------- /user/userchart/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | {{ $randomid := randAlphaNum 8 | lower }} 2 | {{ $base := .Values.BaseName | default "undescribed-job" }} 3 | {{ $namespace := .Values.NameSpace | default "default" }} 4 | {{ $deploy := .Values.DeployName | default (printf "%s-%s-%s" $namespace $base .Release.Name) }} 5 | {{ $label := .Values.Label | default (printf "%s-%s" $base .Release.Name) }} 6 | {{ $containername := .Values.ContainerName | default (printf "%s-%s" $base .Release.Name) }} 7 | {{ $containerimage := .Values.ContainerImage | default "harbor-local.ai.iiis.co/llm-course/lab-cpu:latest" }} 8 | {{ $uid := .Values.UID | default "0" }} 9 | {{ $gid := .Values.GID | default "0" }} 10 | 11 | {{- $limits := .Values.Limits | default (dict) }} 12 | {{ $limitscpu := $limits.CPU | default "8" }} 13 | {{ $limitsmemory := $limits.memory | default "16Gi" }} 14 | {{ $limitsgpu := $limits.GPU | default "0" }} 15 | 16 | {{ $nvme := .Values.NVMEStorage | default "100Gi" }} 17 | {{ $nogfs := .Values.NoGFS | default false }} 18 | {{ $extraport := .Values.ExtraPort | default 0 }} 19 | {{ $ingresshost := .Values.IngressHost | default "" }} 20 | {{ $use_shm := .Values.UseShm | default false}} 21 | {{ $shm_size := .Values.ShmSize | default "8Gi" }} 22 | 23 | {{ $command := .Values.Command | default "" }} 24 | {{ $args := .Values.Args | default "" }} 25 | 26 | {{ $use_IB 
:= .Values.UseIB | default false}} 27 | 28 | {{ $replicas := .Values.Replicas | default 1 }} 29 | 30 | --- 31 | apiVersion: apps/v1 32 | kind: Deployment 33 | metadata: 34 | name: {{ $deploy }} 35 | namespace: {{ $namespace }} 36 | labels: 37 | app: {{ $label }} 38 | spec: 39 | replicas: {{ $replicas }} 40 | selector: 41 | matchLabels: 42 | app: {{ $label }} 43 | template: 44 | metadata: 45 | labels: 46 | app: {{ $label }} 47 | annotations: 48 | {{ if $use_IB }} 49 | k8s.v1.cni.cncf.io/networks: ipoibnetwork-{{ .Values.NameSpace }} 50 | {{ end }} 51 | spec: 52 | hostIPC: false 53 | hostPID: false 54 | hostNetwork: false 55 | securityContext: 56 | runAsUser: {{ $uid }} 57 | runAsGroup: {{ $gid }} 58 | nodeSelector: 59 | gpu-model: {{ .Values.GPU }} 60 | containers: 61 | - name: {{ $containername }} 62 | imagePullPolicy: IfNotPresent 63 | image: {{ $containerimage }} # 可自行更改镜像和版本 64 | {{ if $use_IB }} 65 | securityContext: 66 | capabilities: 67 | add: [ "IPC_LOCK" ] 68 | {{ end }} 69 | {{- if $command }} 70 | command: {{ $command }} 71 | args: {{ $args }} 72 | {{- else }} 73 | command: ["bash", "-c", "--"] 74 | args: ["while true; do sleep 30; done;"] 75 | {{- end }} 76 | resources: 77 | limits: 78 | cpu: {{ $limitscpu }} # 最大CPU 79 | memory: {{ $limitsmemory }} # 最大内存数目 80 | nvidia.com/gpu: {{ $limitsgpu }} # 请求的GPU数量 81 | {{ if $use_IB }} 82 | rdma/rdma_shared_device_a: 1 83 | {{ end }} 84 | requests: 85 | {{ if $use_IB }} 86 | rdma/rdma_shared_device_a: 1 87 | {{ end }} 88 | volumeMounts: 89 | - name: nfshome # 与下面volumes的名字对应 90 | mountPath: /root # 本地的挂载点 /root 91 | - name: scratch1 # 与下面volumes的名字对应 92 | mountPath: /scratch1 # 本地的挂载点 93 | - name: scratch2 # 与下面volumes的名字对应 94 | mountPath: /scratch2 # 本地的挂载点 95 | {{ if not $nogfs }} 96 | - name: gfshome # 与下面volumes的名字对应 97 | mountPath: /gfshome # 本地的挂载点 98 | - name: gfsshare # 与下面volumes的名字对应 99 | mountPath: /share # 本地的挂载点 100 | - name: ssdshare # 与下面volumes的名字对应 101 | mountPath: /ssdshare # 本地的挂载点 102 | {{ 
end }} 103 | {{ if $use_shm }} 104 | - name: dshm 105 | mountPath: /dev/shm 106 | {{ end }} 107 | volumes: 108 | {{ if $use_shm }} 109 | - name: dshm 110 | emptyDir: 111 | medium: Memory 112 | sizeLimit: {{ $shm_size }} 113 | {{ end }} 114 | - name: nfshome 115 | persistentVolumeClaim: 116 | claimName: pvc-nfshome-{{ $namespace }} 117 | - name: scratch1 118 | persistentVolumeClaim: 119 | claimName: pvc-rancher-localpath-1-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 120 | - name: scratch2 121 | persistentVolumeClaim: 122 | claimName: pvc-rancher-localpath-2-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 123 | 124 | {{ if not $nogfs }} 125 | - name: gfshome 126 | persistentVolumeClaim: 127 | claimName: gfs-sata-pvc-{{ $namespace }} 128 | - name: gfsshare 129 | persistentVolumeClaim: 130 | claimName: gfs-sata-share-pvc-{{ $namespace }} 131 | - name: ssdshare 132 | persistentVolumeClaim: 133 | claimName: gfs-nvme-pvc-share-{{ $namespace }} 134 | {{ end }} 135 | 136 | --- 137 | apiVersion: v1 138 | kind: Service 139 | metadata: 140 | annotations: {} 141 | labels: 142 | app: {{ $label }} 143 | k8s.kuboard.cn/name: {{ $deploy }} 144 | name: {{ $deploy }} 145 | namespace: {{ $namespace }} 146 | spec: 147 | ports: 148 | - name: {{ $deploy }}-port 149 | port: 22 150 | protocol: TCP 151 | targetPort: 22 152 | {{ if $extraport}} 153 | - name: {{ $deploy }}-extraport 154 | port: {{ $extraport }} 155 | protocol: TCP 156 | targetPort: {{ $extraport }} 157 | {{ end }} 158 | selector: 159 | app: {{ $label }} 160 | sessionAffinity: None 161 | type: NodePort 162 | --- 163 | kind: PersistentVolumeClaim 164 | apiVersion: v1 165 | metadata: 166 | name: pvc-rancher-localpath-1-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 167 | namespace: {{ $namespace }} 168 | spec: 169 | accessModes: 170 | - ReadWriteOnce 171 | resources: 172 | requests: 173 | storage: {{ $nvme }} 174 | storageClassName: rancher-local-path 175 | --- 176 | kind: PersistentVolumeClaim 177 | apiVersion: v1 178 | 
metadata: 179 | name: pvc-rancher-localpath-2-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 180 | namespace: {{ $namespace }} 181 | spec: 182 | accessModes: 183 | - ReadWriteOnce 184 | resources: 185 | requests: 186 | storage: {{ $nvme}} 187 | storageClassName: rancher-local-path 188 | 189 | {{ if $ingresshost }} 190 | 191 | --- 192 | apiVersion: networking.k8s.io/v1 193 | kind: Ingress 194 | metadata: 195 | annotations: 196 | cert-manager.io/cluster-issuer: letsencrypt-prod 197 | name: {{ $deploy }} 198 | namespace: {{ $namespace }} 199 | spec: 200 | ingressClassName: nginx 201 | rules: 202 | - host: {{ $ingresshost }} 203 | http: 204 | paths: 205 | - backend: 206 | service: 207 | name: {{ $deploy }} 208 | port: 209 | number: {{ $extraport }} 210 | path: / 211 | pathType: Prefix 212 | tls: 213 | - hosts: 214 | - {{ $ingresshost }} 215 | secretName: passwd-tls 216 | 217 | {{ end }} 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ailab 2 | 3 | - [ailab](#ailab) 4 | - [集群概况](#集群概况) 5 | - [获取访问权限](#获取访问权限) 6 | - [配置kubeconfig](#配置kubeconfig) 7 | - [使用K8S](#使用k8s) 8 | - [其他使用说明](#其他使用说明) 9 | - [运行大模型课程labs代码](#运行大模型课程labs代码) 10 | - [修改账号密码](#修改账号密码) 11 | - [使用 VS Code 连接K8S远程调试](#使用-vs-code-连接k8s远程调试) 12 | - [私有容器镜像仓库](#私有容器镜像仓库) 13 | - [自定义镜像](#自定义镜像) 14 | - [信任集群 Harbor](#信任集群-harbor) 15 | - [制作镜像](#制作镜像) 16 | - [环境准备](#环境准备) 17 | - [编写 Dockerfile 制作镜像](#编写-dockerfile-制作镜像) 18 | - [从自定义镜像创建 Pod](#从自定义镜像创建-pod) 19 | 20 | 21 | ## 集群概况 22 | 23 | 本集群计算环境基于 K8S 搭建而成,硬件包括3台独立的 master 节点、50台 worker 节点和一台提供 NFS 服务的 NAS(网络存储服务器)。使用 Harbor 搭建私有镜像仓库,openLDAP 进行统一身份认证。通过统一的 kubeconfig 配置文件分发平台,用户也可以通过 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 以命令行的方式使用K8S集群。 24 | 25 | 26 | |系统|登陆地址|功能| 27 | |---|---|---| 28 | |密码管理|https://auth.ai.iiis.co:9443|账号修改密码。| 29 | |Harbor|https://harbor.ai.iiis.co:9443|私有容器镜像仓库| 30 | 
|kubeconfig|https://login.ai.iiis.co:9443|kubeconfig配置文件分发平台| 31 | 32 | ## 获取访问权限 33 | 34 | 在管理员已经为用户创建好账号的情况下, 用户需要确认是否已经满足下列三个条件 35 | 36 | - 您使用的终端可以连通SSH跳板机,测试方法为ping js.ai.iiis.co, 如果ping不通,检查你的网络设置(特别是DNS设置),或者联络管理员。 37 | - 您已经获取了访问K8S集群的用户名、用户账号关联邮箱和登录密码。 38 | - 在等待获取访问权限的过程中,可以先准备好安装本地软件 (见下节)。 39 | 为了确保账号安全,强烈建议大家拿到账号后先 [修改密码](#修改账号密码)。 40 | 41 | ## 使用SSH跳板机 42 | 43 | - 在您使用的终端上执行如下命令: 44 | ```bash 45 | ssh -i 私钥文件名 -N -L 6443:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 46 | ``` 47 | - 私钥文件名默认为~/.ssh/id_rsa (可以省略): 48 | ```bash 49 | ssh -N -L 6443:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 50 | ``` 51 | 52 | - 如果终端上6443端口已经被其他程序占用,可以换成其他端口,比如换成6444端口,则命令应写成: 53 | ```bash 54 | ssh -i 私钥文件名 -N -L 6444:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 55 | ``` 56 | - 命令执行后,会出现貌似“卡死”现象(命令并不返回),这是正常的。**不要关闭**该terminal。可以另打开一个terminal进行其他操作。也可以在上述ssh命令的最后加上&,将放入后台。 57 | - 如果您希望自动连接跳板机,可以参考autossh (https://www.harding.motd.ca/autossh/) 58 | 59 | ## 配置集群访问环境 60 | >注:不推荐使用wsl。在wsl上执行可能在后续步骤中出现WebSocket close with status code 1006错误 61 | ### 安装本地软件 62 | 63 | 本地电脑至少需要安装以下两个软件。 64 | 65 | #### Kubectl 66 | 用户可以直接使用 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 管理 k8s。 67 | 安装说明在 https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/ 68 | 69 | #### Helm 70 | helm 是 Kubernetes 的包管理器,helm的安装及使用方法可以参考[官方文档](https://helm.sh/docs/)。比较简单的安装方式(Linux上)是使用这个脚本 71 | https://helm.sh/docs/intro/install/#from-script 72 | 73 | 推荐安装以下软件: 74 | * Docker (如果你需要本地构建镜像)。 PC上推荐安装docker desktop,有界面使用更方便。 https://www.docker.com/products/docker-desktop/ 75 | 76 | * VSCode (本地集成化开发环境,使用K8S更加方便)。 77 | 78 | ### 配置kubeconfig 79 | 80 | 用户基于 kubeconfig 通过命令行方式使用K8S,需要先在自己的终端设备配置好 kubeconfig。利用系统提供的 kubeconfig 信息(包含用户账户和 Token 等信息),可以在自己的终端利用 kubectl 对 K8S 集群中的资源进行访问。本节介绍如何获取和使用 kubeconfig。 81 | 82 | 用浏览器访问URL地址:https://login.ai.iiis.co:9443 83 | 84 | ![](assets/dex_login.png) 85 | 86 | 输入邮箱地址(邮箱地址等同申请的账号,格式为:用户名@iiis.co)和密码(上边修改过的密码)即可登录。 87 | 88 | 
>注:这里的邮箱地址是***用户名@iiis.co***。 89 | 90 | 进入kubeconfig信息页面,选择您使用的系统类型。 91 | 92 | 请按照您所使用的 kubectl 所在的操作系统进行选择。 93 | 94 | ![](assets/dex_token_1.png) 95 | 96 | 接下来,要按照页面指示的顺序在运行 kubectl 的命令行运行命令,会生成名为config的配置文件。 97 | 98 | >注:如果你用的是Windows机器,请在Windows Power Shell 下运行这些命令;如果无法执行网站中的第一条命令,可以执行如下命令: 99 | 100 | ```bash 101 | $homeDir = $env:USERPROFILE 102 | $certDir = Join-Path $homeDir ".kube\certs\k8s.iiis" 103 | New-Item -ItemType Directory -Path $certDir -Force | Out-Null 104 | 105 | $certPath = Join-Path $certDir "k8s-ca.crt" 106 | 107 | @" 108 | -----BEGIN CERTIFICATE----- 109 | 110 | (此处粘贴你自己的certificate) 111 | 112 | -----END CERTIFICATE----- 113 | "@ | Out-File -FilePath $certPath -Encoding ascii 114 | 115 | ``` 116 | 117 | ![](assets/dex_token_2.png) 118 | 119 | 所有指令执行完毕后,再运行如下命令设置默认的namespace (ns)。在 K8S 集群中,管理员已经为每一位用户创建了与 UID 相同的命名空间ns。用户只在自己的 ns 中具有使用权限,因此所有操作都只能在自己的 ns 中完成。通过运行下面的命令,可以避免每个命令都需要指定ns。 120 | 121 | ```bash 122 | kubectl config set-context --current --namespace=`kubectl config current-context | cut -d'-' -f 1` 123 | ``` 124 | 125 | >注:如果你用的是Windows机器,请运行如下命令(powershell不支持cut) 126 | 127 | ```bash 128 | kubectl config set-context --current --namespace=($((kubectl config current-context) -split '-')[0]) 129 | ``` 130 | 131 | >注:请检查你的UID中是否有`-`。如果有,第一个`-`符号后的内容会自动被上述指令去掉,导致后续步骤出现了类似 `Error from server (Forbidden)` 的报错。请浏览并手动编辑kubeconfig文件 132 | 133 | - 提示:如果连接SSH跳板机时,本地终端使用的端口不是6443,而是其他端口,比如6444,则需要把config文件内容中的server: https://127.0.0.1:6443 修改成server: https://127.0.0.1:6444。 134 | 135 | 之后可以使用以下 kubectl 命令测试是否已经可以访问K8S中的资源。 136 | 137 | ```bash 138 | kubectl get pvc 139 | ``` 140 | 应该能看到返回了4个PVC (是用户在集群中可以访问的存储空间,可以理解为是一个盘)。 141 | 142 | ## 使用K8S 143 | 144 | ### 使用默认配置启动计算任务 145 | 146 | 本仓库已经为用户提供了创建计算任务的默认 helm 模板,如果使用默认配置,请 clone 本仓库,并将 user/values.yaml 文件中的内容按照自己账号和计算需求进行修改,即可使用 helm 创建计算任务。 user/values-template.yaml 文件的具体内容为: 147 | 148 | ``` 149 | ########### 必须要写的部分 ########### 150 | NameSpace: namespace # 更改为自己的namespace (同用户名) 151 | BaseName: 
pytorch # 任务的基本名字,建议写任务描述,例如pytorch 152 | ContainerImage: harbor-local.ai.iiis.co/llm-course/lab-cpu:v2 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:v2 153 | GPU: RTX4090D # RTX4090D RTX4090 RTX3090 154 | 155 | ########### 选填的部分 ########### 156 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 157 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 158 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 159 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 160 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 161 | # CPU: 8 162 | # memory: 16Gi 163 | # GPU: 0 164 | # UseShm: False 165 | # ShmSize: 8Gi 166 | 167 | # Replicas: 1 # starting more replica of the pod (for distributed training) 168 | 169 | ``` 170 | 171 | 此文件用于创建一个副本数为 1 的 [Deployment](https://kubernetes.io/zh-cn/docs/concepts/workloads/controllers/deployment/)计算任务工作负载。你可以复制这个文件到比如 `cp values-template.yaml lab1.yaml`,然后编辑lab1.yaml文件,输入你的配置参数。之后在user目录中运行 172 | 173 | ``` 174 | cd user 175 | helm install release_name --values ./lab1.yaml ./userchart 176 | ``` 177 | 178 | `release_name`为helm部署的版本名(release),建议设置为自己的`UID+任务描述`的格式以方便后续维护管理,例如xuw_lab1。`--values ./lab1.yaml`为helm模板的各项变量提供了对应的值(你刚刚设置的),最后`./userchart`是helm模板的路径位置。 179 | 180 | 之后,可以通过运行 181 | ```bash 182 | kubectl get pods 183 | ``` 184 | 来观察启动的pod是否已经启动了。启动之后可以通过 185 | 186 | ```bash 187 | kubectl exec -it name_of_the_pod -- bash 188 | ``` 189 | 来连接这个pod,并且启动bash。建议大家使用后边描述的 VS Code 连接K8S的方法,要方便很多。 190 | 191 | ### 默认挂载的存储描述 192 | 193 | 在默认的模板中,自动为每个pod默认挂载了四个存储卷。这些存储卷是管理员为用户创建好了用于长期保存数据的[持久卷申领(PersistentVolumeClaim,PVC)](https://kubernetes.io/zh-cn/docs/concepts/storage/persistent-volumes/)。 194 | 195 | - 挂载于容器内`/root`路径的NFS服务的PVC,用于存储文档及代码等小文件; 196 | - 挂载于容器内`/gfshome`路径GFS的个人存储空间PVC,用于存储模型文件、数据集等大文件; 197 | - 挂载于容器内`/share`路径GFS的共享空间PVC,用于存放和共享开源大模型、开源数据集等公共数据; 198 | - 
挂载于容器内`/ssdshare`路径GFS的共享空间PVC,用于存储需要快速访问的模型文件等大文件(与share的区别为:该空间用SSD做存储,速度快); 199 | 200 | 临时数据存放在宿主机本地的NVME硬盘中,挂载在容器内的`/scratch1`和`/scratch2`,POD被删除后,这2个目录里面的数据也会被删除,请一定不要将需要持久化保存的重要数据放在这2个路径。 201 | 202 | 上面的helm模板中会自动挂载长期存储数据的四个PVC,并自动创建对应于`/scratch1`和`/scratch2`两个临时数据存储PVC。 203 | 204 | 205 | | 存储系统 | 写入速度 | 206 | | ---------- | -------- | 207 | | 宿主机NVME | 2.3GB/s | 208 | | GFS | 1GB/s | 209 | | GFS-SSD | 2GB/s | 210 | | NFS | 1GB/s | 211 | 212 | ### 删除计算任务 213 | 214 | 通过下面的命令删除计算任务 215 | 216 | ``` 217 | helm delete release_name 218 | ``` 219 | 其中,release_name是你创建任务时候输入的第一个参数(release_name),如果你忘了当时用的什么了,可以用 220 | ``` 221 | helm list 222 | ``` 223 | 来列出所有的release。 224 | 225 | helm delete 命令会自动删除容器和对应于`/scratch1`、`/scratch2`的两个临时数据存储PVC,但不会删除长期存储数据的四个PVC。 226 | 227 | 228 | ### 定制自己的模板 229 | 230 | 如果对helm chart功能及语法比较熟悉,也欢迎用户对模板进行修改或定制,并将成果分享给大家。 231 | 232 | ## 其他使用说明 233 | 234 | ### 运行大模型课程labs代码 235 | 236 | 1. 推荐使用下面的“使用 VS Code 连接K8S远程调试”方法先在vscode中连接集群。 237 | 2. 在VS Code命令行(terminal)中,clone课程仓库: 238 | ``` 239 | git clone git@github.com:xuw/llm_course_public.git 240 | ``` 241 | 3. (更新)查看最新的课程内容信息: 242 | ``` 243 | cd llm_course_public/ && git pull --all 244 | ``` 245 | 4. 
在VS Code中运行labs的Jupyter Notebook 246 | - 确定在Server端Jupyter插件已正确安装,已经启用(enable) 247 | - 在GUI中设置kernel(environment)为conda即可 248 | 249 | ### 修改账号密码 250 | 251 | 集群提供了一套简单的密码修改界面,用户可以修改自己账号的密码。 252 | 253 | 用浏览器访问URL地址 `https://auth.ai.iiis.co:9443` 访问密码修改界面。界面如下图: 254 | ![](assets/ssp_main_page.png) 255 | 256 | 在界面上填写用户名(界面上的Login字段)、原密码(Old password字段)、新密码(New password字段),并重复输入一次新密码(Confirm字段),点击【Send】按钮,即可完成账号密码修改。 257 | ![](assets/ssp_success.png) 258 | 259 | ### 使用 VS Code 连接K8S远程调试 260 | 261 | 使用 [VS Code](https://code.visualstudio.com/) 可以远程 debug 集群中创建的 POD。这里我们给出一个简单的教程,更多的信息请自行查阅 [Kubernetes 文档](https://kubernetes.io/zh/docs/concepts/services-networking/service/)与 [VS Code 文档](https://code.visualstudio.com/docs/azure/kubernetes)。 262 | 263 | 首先我们需要在 VS Code 中安装`Kubernetes`插件、`Docker`插件、`Remote Container`插件(改名为`Dev container`)、`Bridge to Kubernetes`插件(被弃用,但不影响使用): 264 | 265 | ![](assets/vscode/vsc_k8s_plugin.jpg) 266 | 267 | ![](assets/vscode/vsc_docker_plugin.jpg) 268 | 269 | ![](assets/vscode/vsc_remote_connector_plugin.jpg) 270 | 271 | ![](assets/vscode/vsc_k8s_bridge_plugin.jpg) 272 | 273 | 使用`ctrl + shift + P`(Mac 下`command + shift + P`)选择`Kubernetes: Use Namespace` 274 | 275 | ![](assets/vscode/vsc_k8s_select_ns.jpg) 276 | 277 | 输入自己的 namespace 后就能访问自己namespace下的资源了。以连接一个 POD 作为示例: 278 | 279 | ![](assets/vscode/vsc_connect_k8s.jpg) 280 | 281 | 这样将会自动连接一个 VS Code 远程窗口,之后的开发就和本地类似了。 282 | 283 | ### 私有容器镜像仓库 284 | 285 | 私有容器镜像仓库对应2个域名,分别为harbor.ai.iiis.co:9443(应用场景:用户通过外网向镜像仓库中推送自定义镜像) 和 harbor-local.ai.iiis.co(应用场景:用户建立POD时,从镜像仓库中拉取镜像)。二者区别为 harbor.ai.iiis.co:9443用于外网访问镜像仓库,harbor-local.ai.iiis.co走集群内部网络,建立pod时,基于该域名拉取镜像速度快。 286 | 287 | #### 自定义镜像 288 | 289 | 我们可以在集群里从自定义镜像拉起 POD,以支持快速的实验环境配置。自定义镜像的思路是**在`ubuntu-tensorflow`、`ubuntu-pytorch`或`orion-client-2.4.2`的基础上,配置自己的环境**。 290 | 291 | ##### 信任集群 Harbor 292 | 293 | 自定义镜像需要从 Harbor 拉取,因此我们需要在 Docker 中添加对集群 Harbor 的信任。在Mac下用 Docker Desktop 可以直接在客户端`Docker Engine`里加入`insecure-registries`项,若未使用 Docker 
Desktop,则在`/etc/docker/daemon.json`中添加(若该文件不存在则创建): 294 | 295 | ```json 296 | { 297 | 298 | "insecure-registries": [ 299 | "harbor.ai.iiis.co" 300 | ] 301 | } 302 | ``` 303 | 304 | 添加完毕后,重启 Docker。 305 | 306 | ##### 制作镜像 307 | 308 | 制作镜像的方式有基于 Dockerfile 和 `docker commit`命令两种形式。我们这里推荐基于 Dockerfile 方式,`docker commit`方式请参考[官方文档](https://docs.docker.com/engine/reference/commandline/commit/)。 309 | 310 | > **_NOTE:_** 在[这里](https://github.com/iiisthu/gpupool/tree/master/examples/build_example)可以找到我们在这一节所使用的例子。 311 | 312 | 我们假设在`ubuntu-pytorch`的基础上,我们还需要配置一系列环境: 313 | 314 | 1. 安装一系列 Python 依赖库,在`requirements.txt`中指明。 315 | 2. 将某个 Python 包的 Git 仓库放入镜像,并从仓库源码安装该 Python 包。 316 | 3. 创建`workspace`工作目录。 317 | 318 | 其他的操作可以参考这几个任务。我们假设我们在`build`目录下工作,我们使用[`navdeep-G/samplemod`](https://github.com/navdeep-G/samplemod)作为 Python Package 的例子。 319 | 320 | ###### 环境准备 321 | 322 | 假设我们需要 Python 支持一系列的库,例如画图的`matplotlib`和交互式的`jupyter`等,我们将这些写在`build`目录下: 323 | 324 | ```txt 325 | # requirements.txt 326 | numpy >= 1.19 327 | matplotlib 328 | pandas >= 1.0 329 | jupyter 330 | ``` 331 | 332 | 我们也希望pod能安装我们自己的一个私有代码仓库中的某个 Python Package,我们以[`navdeep-G/samplemod`](https://github.com/navdeep-G/samplemod)为例: 333 | 334 | ```bash 335 | # PWD: build/ 336 | git clone https://github.com/navdeep-G/samplemod 337 | ``` 338 | 339 | 整个工作目录为: 340 | 341 | ``` 342 | build 343 | ├── samplemod 344 | │ ├── docs/ 345 | │ ├── sample/ 346 | │ ├── tests/ 347 | │ ├── .gitignore 348 | │ ├── LICENSE 349 | │ ├── MANIFEST.in 350 | │ ├── Makefile 351 | │ ├── README.rst 352 | │ ├── requirements.txt 353 | │ └── setup.py 354 | └── requirements.txt 355 | ``` 356 | 357 | ###### 编写 Dockerfile 制作镜像 358 | 359 | 我们从`harbor.ai.iiis.co:9443/library/`下的镜像出发,安装`requirements.txt`中的依赖,并安装数据。我们这里不赘述[ Dockerfile 的语法](https://docs.docker.com/engine/reference/builder/)。实例的 Dockerfile 如下: 360 | 361 | ```docker 362 | # Dockerfile 363 | FROM harbor.ai.iiis.co:9443/library/ubuntu-pytorch:1.5.0 364 | COPY . 
build 365 | RUN pip install -r build/requirements.txt && cd build/samplemod; pip install . && mkdir -p workspace && rm -rf build 366 | ``` 367 | 368 | > **_NOTE:_** 这里用单行命令是为了让制作后的镜像历史中不会存在build文件夹(类似于git,即使删去的文件也会在历史中存储,以备未来可能的恢复)。 369 | 370 | 之后利用`docker`按照 Dockerfile 制作镜像,并标记为`sample:v0`: 371 | 372 | ```bash 373 | docker build . -t sample:v0 374 | ``` 375 | 376 | 最后确认镜像已经成功创建: 377 | 378 | ``` 379 | $ docker images | grep sample 380 | sample v0 707ab1c88146 30 seconds ago 11.3GB 381 | ``` 382 | 383 | ##### 从自定义镜像创建 Pod 384 | 385 | 从刚才我们制作的镜像创建 Pod 分为两步,首先需要将镜像推送到集群镜像仓库 Harbor,再从 Harbor 对应的镜像拉起 Pod。 386 | 387 | 访问[https://harbor.ai.iiis.co:9443](https://harbor.ai.iiis.co:9443),注意这里必须是https,用户名及密码等同用户访问k8s集群的用户名及密码。 388 | 389 | > **_NOTE:_** 注意这里的用户名格式为“用户名@iiis.co”。 390 | 391 | 连接到 Harbor 后新建项目: 392 | 393 | ![](assets/harbor/harbor_dashboard.jpg) 394 | 395 | ![](assets/harbor/harbor_create_project.jpg) 396 | 397 | > **_NOTE:_** 注意这里需要勾选公开,原因是私有集群物理机的 docker 并没有登录用户个人的 Harbor 账户,因此无法拉取私有仓库中的镜像。 398 | 399 | 假设我们的项目名为 zhangsan,则我们之后的镜像均要 push 到`harbor.ai.iiis.co:9443/zhangsan/`下,首先 tag 我们做好的镜像: 400 | 401 | ```bash 402 | docker tag sample:v0 harbor.ai.iiis.co:9443/zhangsan/sample:v0 403 | ``` 404 | 405 | 之后将镜像 push 到 Harbor 中,我们需要先在 docker 中登录我们在 Harbor上的账号: 406 | 407 | ```txt 408 | $ docker logout harbor.ai.iiis.co:9443 409 | Removing login credentials for harbor.ai.iiis.co 410 | $ docker login harbor.ai.iiis.co:9443 411 | Username: zhangsan@iiis.co 412 | Password: 413 | Login Succeeded 414 | ``` 415 | 416 | 最后将镜像推送到 Harbor 中: 417 | 418 | ```bash 419 | docker push harbor.ai.iiis.co:9443/zhangsan/sample:v0 420 | ``` 421 | 422 | 创建好镜像后,拉起 Pod 流程和标准镜像一样。 423 | 424 | 提示:建立Pod时,values-template.yaml模板中,指定容器镜像字段ContainerImage处,需要修改镜像仓库对应的域名为harbor-local.ai.iiis.co,可以提高镜像拉取速度,请参见本文档“使用默认配置启动计算任务”部分。 425 | --------------------------------------------------------------------------------