├── admin ├── yamls │ └── .gitignore ├── assets │ ├── tree_users_ldap.png │ ├── phpLDAPadmin_create_ou.png │ ├── phpLDAPadmin_main_page.png │ ├── phpLDAPadmin_ou_commit.png │ ├── phpLDAPAdmin_group_input.png │ ├── phpLDAPAdmin_group_member.png │ ├── phpLDAPAdmin_user_commit.png │ ├── phpLDAPAdmin_user_template.png │ ├── phpLDAPadmin_create_object.png │ ├── phpLDAPAdmin_posix_group_name.png │ ├── phpLDAPAdmin_template_Default.png │ ├── phpLDAPAdmin_user_information.png │ ├── phpLDAPAdmin_user_input_email.png │ ├── phpLDAPAdmin_user_add_attribute.png │ ├── phpLDAPadmin_create_child_entry.png │ ├── phpLDAPadmin_template_posix_group.png │ ├── phpLDAPAdmin_user_add_attribute_Email.png │ ├── phpLDAPAdmin_user_create_child_entry.png │ ├── phpLDAPAdmin_user_information_detail.png │ ├── phpLDAPAdmin_template_groupOfUniqueNames.png │ ├── phpLDAPAdmin_user_add_user_email_commit.png │ └── phpLDAPadmin_create_object_posix_group.png ├── cluster_setting │ ├── admin_helm_ns.yaml │ └── rancher_local_path_nvme.yaml ├── adminchart │ ├── templates │ │ ├── sc_pv.yaml │ │ ├── quota.yaml │ │ ├── ipoibnetwork.yaml │ │ └── rbac.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── gfshomechart │ ├── templates │ │ └── sc_pv.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── gfssharechart │ ├── templates │ │ └── sc_pv.yaml │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── ssdsharechart │ ├── templates │ │ └── sc_pv.yaml │ └── Chart.yaml ├── deluser_dir.sh ├── rmtestpod.sh.sh ├── values-template.yaml ├── adduser_dir.sh ├── pull_images_to_local.yaml └── README.md ├── assets ├── dex_login.png ├── dex_token_1.png ├── dex_token_2.png ├── ssp_success.png ├── ssp_main_page.png ├── dex_kubectl_run.png ├── vscode │ ├── vsc_k8s_plugin.jpg │ ├── vsc_connect_k8s.jpg │ ├── vsc_docker_plugin.jpg │ ├── vsc_k8s_select_ns.jpg │ ├── vsc_k8s_bridge_plugin.jpg │ └── vsc_remote_connector_plugin.jpg └── harbor │ ├── harbor_dashboard.jpg │ └── harbor_create_project.jpg ├── dockerfiles ├── 
comfyui │ ├── Dockerfile │ └── run.sh ├── sd-webui │ ├── Dockerfile │ └── run.sh ├── pytorch │ ├── requirements.txt │ └── Dockerfile ├── lab-cpu │ ├── requirements.txt │ └── Dockerfile └── lab │ ├── requirements.txt │ └── Dockerfile ├── user ├── userchart │ ├── .helmignore │ ├── Chart.yaml │ └── templates │ │ └── deployment.yaml ├── sdwebui-template.yaml ├── comfyui-template.yaml └── values-template.yaml ├── test └── username.txt ├── FAQ.md └── README.md /admin/yamls/.gitignore: -------------------------------------------------------------------------------- 1 | values*.yaml 2 | -------------------------------------------------------------------------------- /assets/dex_login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_login.png -------------------------------------------------------------------------------- /assets/dex_token_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_token_1.png -------------------------------------------------------------------------------- /assets/dex_token_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_token_2.png -------------------------------------------------------------------------------- /assets/ssp_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/ssp_success.png -------------------------------------------------------------------------------- /assets/ssp_main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/ssp_main_page.png -------------------------------------------------------------------------------- 
/assets/dex_kubectl_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/dex_kubectl_run.png -------------------------------------------------------------------------------- /admin/assets/tree_users_ldap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/tree_users_ldap.png -------------------------------------------------------------------------------- /admin/cluster_setting/admin_helm_ns.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: admin-helm -------------------------------------------------------------------------------- /assets/vscode/vsc_k8s_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_plugin.jpg -------------------------------------------------------------------------------- /assets/harbor/harbor_dashboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/harbor/harbor_dashboard.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_connect_k8s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_connect_k8s.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_docker_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_docker_plugin.jpg -------------------------------------------------------------------------------- 
/assets/vscode/vsc_k8s_select_ns.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_select_ns.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_ou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_ou.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_main_page.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_ou_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_ou_commit.png -------------------------------------------------------------------------------- /assets/harbor/harbor_create_project.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/harbor/harbor_create_project.jpg -------------------------------------------------------------------------------- /assets/vscode/vsc_k8s_bridge_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_k8s_bridge_plugin.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_group_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_group_input.png 
-------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_group_member.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_group_member.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_commit.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_template.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_object.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_object.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_posix_group_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_posix_group_name.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_template_Default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_template_Default.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_information.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_information.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_input_email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_input_email.png -------------------------------------------------------------------------------- /assets/vscode/vsc_remote_connector_plugin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/assets/vscode/vsc_remote_connector_plugin.jpg -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_attribute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_attribute.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_child_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_child_entry.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_template_posix_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_template_posix_group.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_attribute_Email.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_attribute_Email.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_create_child_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_create_child_entry.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_information_detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_information_detail.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_template_groupOfUniqueNames.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_template_groupOfUniqueNames.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPAdmin_user_add_user_email_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPAdmin_user_add_user_email_commit.png -------------------------------------------------------------------------------- /admin/assets/phpLDAPadmin_create_object_posix_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iiisthu/ailab/HEAD/admin/assets/phpLDAPadmin_create_object_posix_group.png -------------------------------------------------------------------------------- /dockerfiles/comfyui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM harbor.ai.iiis.co:9443/zhaoyue/lab5:v4 2 | 3 
| 4 | WORKDIR /comfyui-repo 5 | COPY . /comfyui-repo/ 6 | 7 | CMD ["bash", "run.sh"] -------------------------------------------------------------------------------- /dockerfiles/sd-webui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM harbor.ai.iiis.co:9443/zhaoyue/lab5:v4 2 | 3 | 4 | WORKDIR /sd-repo 5 | RUN pip install pydantic==1.10.19 6 | COPY . /sd-repo/ 7 | 8 | CMD ["bash", "run.sh"] -------------------------------------------------------------------------------- /admin/adminchart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: pvc-nfshome-{{ .Values.NameSpace }} 6 | namespace: {{ .Values.NameSpace }} 7 | spec: 8 | storageClassName: nfs-users 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 2T # 申请的资源大小 14 | -------------------------------------------------------------------------------- /admin/gfshomechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-sata-pvc-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-sata-users 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 10T # 申请的资源大小 -------------------------------------------------------------------------------- /admin/gfssharechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-sata-share-pvc-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-sata-share 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | 
storage: 10T # 申请的资源大小 -------------------------------------------------------------------------------- /admin/ssdsharechart/templates/sc_pv.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: PersistentVolumeClaim 3 | apiVersion: v1 4 | metadata: 5 | name: gfs-nvme-pvc-share-{{ .Values.NameSpace }} # 修改用户名 6 | namespace: {{ .Values.NameSpace }} # 修改命名空间 7 | spec: 8 | storageClassName: kadalu.gfs-nvme-share 9 | accessModes: 10 | - ReadWriteMany 11 | resources: 12 | requests: 13 | storage: 1T # 与上面PV保持一致避免浪费 14 | -------------------------------------------------------------------------------- /admin/adminchart/templates/quota.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ResourceQuota 4 | metadata: 5 | name: quota-cpu-mem-gpu-pvc 6 | namespace: {{ .Values.NameSpace }} 7 | spec: 8 | hard: 9 | limits.cpu: '56' 10 | limits.memory: 500Gi 11 | requests.cpu: '56' 12 | requests.memory: 500Gi 13 | requests.nvidia.com/gpu: '4' 14 | requests.storage: 40T 15 | count/pods: '10' 16 | 17 | -------------------------------------------------------------------------------- /admin/adminchart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /user/userchart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/gfshomechart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/gfssharechart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /admin/adminchart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/gfshomechart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/gfssharechart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for userchart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | EMAIL: test06@test.lthpc.com 6 | NameSpace: test06 7 | DeployName: test06-pytorch-ssh 8 | Label: pytorch 9 | ContainerName: pytorch 10 | ContainerImage: harbor.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh 11 | Limits: 12 | CPU: 8 13 | memory: 20Gi 14 | GPU: 2 15 | NVMEStorage: 1T 16 | UID: 2005 17 | GID: 500 -------------------------------------------------------------------------------- /admin/adminchart/templates/ipoibnetwork.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: mellanox.com/v1alpha1 2 | kind: IPoIBNetwork 3 | metadata: 4 | name: ipoibnetwork-{{ .Values.NameSpace }} 5 | spec: 6 | networkNamespace: {{ .Values.NameSpace }} 7 | master: "ibs121" 8 | ipam: | 9 | { 10 | "type": "whereabouts", 11 | "datastore": "kubernetes", 12 | "kubernetes": { 13 | "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" 14 | }, 15 | "range": "192.168.0.0/16", 16 | "log_file" : "/var/log/whereabouts.log", 17 | "log_level" : "info", 18 | "gateway": "192.168.0.1" 19 | } -------------------------------------------------------------------------------- /admin/adminchart/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: user-ns-binding 6 | namespace: {{ .Values.NameSpace }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: admin 11 | subjects: 12 | - apiGroup: rbac.authorization.k8s.io 13 | kind: User 14 | name: {{ .Values.EMAIL }} 15 | --- 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: ClusterRoleBinding 18 | metadata: 19 | name: clusterrole-user-pv-{{ .Values.NameSpace }} 20 | roleRef: 21 | apiGroup: rbac.authorization.k8s.io 22 | kind: ClusterRole 23 | name: clusterrole-user-pv 24 | subjects: 25 | - apiGroup: rbac.authorization.k8s.io 26 | kind: User 27 | name: {{ .Values.EMAIL }} 
-------------------------------------------------------------------------------- /dockerfiles/comfyui/run.sh: -------------------------------------------------------------------------------- 1 | cd /comfyui-repo/ComfyUI/models 2 | rm -rf checkpoints && ln -s /ssdshare/share/lab5/ComfyUI-models/checkpoints checkpoints 3 | rm -rf clip && ln -s /ssdshare/share/lab5/ComfyUI-models/clip clip 4 | rm -rf controlnet && ln -s /ssdshare/share/lab5/ComfyUI-models/controlnet controlnet 5 | rm -rf gligen && ln -s /ssdshare/share/lab5/ComfyUI-models/gligen gligen 6 | rm -rf loras && ln -s /ssdshare/share/lab5/ComfyUI-models/loras loras 7 | rm -rf upscale_models && ln -s /ssdshare/share/lab5/ComfyUI-models/upscale_models upscale_models 8 | rm -rf vae && ln -s /ssdshare/share/lab5/ComfyUI-models/vae vae 9 | 10 | cd .. 11 | 12 | cd custom_nodes 13 | ln -s /ssdshare/share/lab5/custom_nodes/ComfyUI-Manager ComfyUI-Manager 14 | 15 | cd .. 16 | /usr/bin/python main.py -------------------------------------------------------------------------------- /dockerfiles/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | openai 4 | pandas 5 | requests 6 | scikit-learn 7 | transformers 8 | sentencepiece 9 | protobuf 10 | datasets 11 | accelerate 12 | chardet 13 | python-dotenv==1.0.0 14 | httpx[socks] 15 | httpcore[socks] 16 | ipykernel 17 | ipywidgets 18 | langchain==0.1.9 19 | langchain-openai==0.0.8 20 | langchainhub==0.1.14 21 | google-search-results==2.4.2 22 | lxml==4.9.3 23 | tiktoken 24 | faiss-cpu 25 | beautifulsoup4 26 | chroma-hnswlib 27 | chromadb 28 | matplotlib-inline 29 | pinecone-client 30 | pypdf 31 | scipy 32 | sentence-transformers 33 | tenacity 34 | tqdm 35 | unstructured 36 | unstructured-client 37 | unstructured-inference 38 | unstructured.pytesseract 39 | nltk 40 | rouge 41 | peft 42 | pillow 43 | ftfy 44 | Jinja2 45 | diffusers 46 | tensorboard 47 | 48 | 
-------------------------------------------------------------------------------- /admin/deluser_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | if [ -n "$(echo "$USER" | tr -d '\r')" ]; then 6 | echo "username : $USER" 7 | 8 | if [ ! -d "./yamls/values_$(echo "$USER" | tr -d '\r').yaml" ];then 9 | echo "delete delete pods pvcs namespace in k8s" 10 | helm delete admin-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 11 | helm delete gfshome-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 12 | helm delete gfsshare-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 13 | helm delete ssdshare-$(echo "$USER" | tr -d '\r') --namespace=admin-helm 14 | helm delete testuser-$(echo "$USER" | tr -d '\r') --namespace=$(echo "$USER" | tr -d '\r') 15 | kubectl delete namespace $(echo "$USER" | tr -d '\r') 16 | else 17 | echo "there is no yaml file for $USER" 18 | fi 19 | fi 20 | done < $1 21 | -------------------------------------------------------------------------------- /dockerfiles/sd-webui/run.sh: -------------------------------------------------------------------------------- 1 | cd /sd-repo/stable-diffusion-webui 2 | ln -s /ssdshare/share/lab5/repositories repositories 3 | ln -s /ssdshare/share/lab5/interrogate interrogate 4 | 5 | cd models 6 | rm -rf Lora && ln -s /ssdshare/share/lab5/SD-WebUI-models/Lora Lora 7 | rm -rf BLIP && ln -s /ssdshare/share/lab5/SD-WebUI-models/BLIP BLIP 8 | rm -rf Stable-diffusion && ln -s /ssdshare/share/lab5/SD-WebUI-models/Stable-diffusion Stable-diffusion 9 | rm -rf torch_deepdanbooru && ln -s /ssdshare/share/lab5/SD-WebUI-models/torch_deepdanbooru torch_deepdanbooru 10 | rm -rf CLIP && ln -s /ssdshare/share/lab5/SD-WebUI-models/CLIP CLIP 11 | rm -rf VAE && ln -s /ssdshare/share/lab5/SD-WebUI-models/VAE VAE 12 | rm -rf VAE-approx && ln -s /ssdshare/share/lab5/SD-WebUI-models/VAE-approx VAE-approx 13 | cd .. 
14 | 15 | HF_ENDPOINT="http://hf-mirror.com" /usr/bin/python launch.py --no-download-sd-model --skip-prepare-environment --clip-models-path /ssdshare/share/lab5/clip-vit-l-14 16 | -------------------------------------------------------------------------------- /admin/rmtestpod.sh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | username=$(echo "$USER" | tr -d '\r') 6 | if [ -n "$username" ]; then 7 | echo "username : $username" 8 | 9 | testuser=$(helm list --namespace=$username) 10 | echo $testuser 11 | 12 | if [[ ($testuser =~ "testuser-${username}") && ($testuser =~ "deployed")]] 13 | then 14 | echo "包含" 15 | echo "testuser-${username}" 16 | podtest=$(kubectl get pods -n=$username) 17 | echo $podtest 18 | if [[ ($podtest =~ "testuser-${username}") && ($podtest =~ "Running")]] 19 | then 20 | echo "success create pod, delete helm release testuser-${username}" 21 | helm delete testuser-${username} --namespace=$username 22 | else 23 | echo "create pod error, please check pod" 24 | echo $podtest 25 | fi 26 | else 27 | echo "create helm release error, please check helm" 28 | echo $testuser 29 | fi 30 | fi 31 | done < $1 32 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive 5 | RUN apt-get update && \ 6 | apt-get install -y tzdata && \ 7 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 8 | echo ${TZ} > /etc/timezone && \ 9 | dpkg-reconfigure -f noninteractive tzdata 10 | 11 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ 12 | sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ 13 | echo 
'[global]\nindex-url=https://mirrors.aliyun.com/pypi/simple/\n' >> /etc/pip.conf
14 | 
15 | RUN apt-get update && \
16 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs && \
17 | apt-get clean && rm -rf /tmp/* && \
18 | service ssh start && \
19 | rmdir /workspace && \
20 | ln -s /root/workspace/ /workspace
21 | 
22 | COPY requirements.txt /opt/app/requirements.txt
23 | WORKDIR /opt/app
24 | RUN pip install -r requirements.txt
25 | WORKDIR /root
26 | 
27 | RUN apt-get install -y libgl1 && \
28 | apt-get clean && rm -rf /tmp/*
29 | 
30 | ENTRYPOINT ["/usr/sbin/sshd", "-D"]
31 | CMD ["-p","22"]
32 | 
33 | 
--------------------------------------------------------------------------------
/dockerfiles/lab-cpu/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy
3 | openai
4 | pandas
5 | requests
6 | scikit-learn
7 | transformers
8 | sentencepiece
9 | protobuf
10 | datasets
11 | chardet
12 | python-dotenv
13 | httpx[socks]
14 | httpcore[socks]
15 | ipykernel
16 | ipywidgets
17 | langchain
18 | langchain-openai
19 | langchainhub
20 | langchain_experimental
21 | langchain_core
22 | langchain_community
23 | langchain_unstructured
24 | langgraph
25 | google-search-results
26 | lxml
27 | tiktoken
28 | faiss-cpu
29 | beautifulsoup4
30 | chroma-hnswlib
31 | chromadb
32 | matplotlib-inline
33 | pinecone-client
34 | pypdf
35 | scipy
36 | tenacity
37 | tqdm
38 | unstructured
39 | unstructured-client
40 | unstructured.pytesseract
41 | nltk
42 | rouge
43 | pillow
44 | ftfy
45 | Jinja2
46 | diffusers
47 | tensorboard
48 | tensorstore
49 | zarr
50 | uvicorn
51 | fastapi
52 | websockets
53 | gradio==5.18.0
54 | dspy-ai
55 | langchain_community
56 | pytesseract
57 | pi-heif
58 | opencv-python-headless
59 | langchain_pinecone
60 | langchain_chroma
61 | asyncer
62 | neo4j
63 | yfiles_jupyter_graphs
64 | httpx[socks]
65 | 
66 | # git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
67 | 
# cd LLaMA-Factory 68 | # pip install -e ".[torch,metrics]" -------------------------------------------------------------------------------- /admin/adminchart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /user/userchart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 
9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/gfshomechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/gfssharechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 
24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/ssdsharechart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: userchart 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /admin/values-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | EMAIL: $EMAIL 7 | NameSpace: $NAMESPACE # 自己的namespace (同用户名) 8 | BaseName: pytorch # 任务的基本名字,建议写任务描述,例如pytorch 9 | ContainerImage: harbor-local.ai.iiis.co/share/pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime-ssh # 镜像名称,默认为 harbor.ai.iiis.co/xuw/pytorch:v1.1,或者见README的说明 10 | GPU: RTX4090D # RTX4090D RTX4090 RTX3090 A100 11 | 12 | ########### 选填的部分 ########### 13 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为'NameSpace-BaseName-ReleaseName', releaseName为随机生成的字符串是在helm命令行里指定的 14 | # Label: pytorch-release # 任务的标签,默认为'BaseName-ReleaseName' 15 | # ContainerName: pytorch-release # 容器名,默认为'BaseName-ReleaseName' 16 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 17 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 18 | # CPU: 8 19 | # memory: 16Gi 20 | # GPU: 0 21 | # UseShm: False 22 | # ShmSize: 8Gi 23 | 24 | # Replicas: 1 # starting more replica of the pod (for distributed training) 25 | 26 | ########### 高级配置 ########### 27 | # ExtraPort: 7860 28 | # IngressHost: lab2.ai.iiis.co 29 | # Command: '["python", "/app/app_class1.py"]' 30 | # Args: '' 31 | UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 32 | # nogfs: true -------------------------------------------------------------------------------- /user/sdwebui-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: sdwebui # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/sdwebui:v1 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | CPU: 8 18 | memory: 16Gi 19 | GPU: 1 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | ExtraPort: 7860 27 | #IngressHost: sdwebui.ai.iiis.co 28 | Command: '["bash", "/sd-repo/run.sh"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /user/comfyui-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: comfyui # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/comfyui:v1 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | CPU: 8 18 | memory: 16Gi 19 | GPU: 1 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | ExtraPort: 8188 27 | #IngressHost: comfyui.ai.iiis.co 28 | Command: '["bash", "/comfyui-repo/run.sh"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /user/values-template.yaml: -------------------------------------------------------------------------------- 1 | # Template values for userchart. DO NOT Edit. Instead, make a copy and edit the copy. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | ########### 必须要写的部分 ########### 6 | NameSpace: namespace # 自己的namespace (同用户名) 7 | BaseName: pytorch # 任务的基本名字,建议写任务描述,例如pytorch 8 | ContainerImage: harbor-local.ai.iiis.co/llm-course/lab-cpu:latest # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:latest,或者见README的说明 9 | GPU: RTX4090 # 可选的包括: RTX4090D RTX4090 RTX3090, 其中RTX4090D的显存为48G,RTX4090的显存为24G,RTX3090的显存为24G 10 | 11 | ########### 选填的部分 ########### 12 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 13 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 14 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 15 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 16 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 17 | # CPU: 8 18 | # memory: 16Gi 19 | # GPU: 0 20 | # UseShm: False # 多卡训练的时候有用 21 | # ShmSize: 8Gi # 多卡训练的时候有用 22 | 23 | # Replicas: 1 # starting more replica of the pod (for distributed training) 24 | 25 | ########### 高级配置 ########### 26 | # ExtraPort: 7860 27 | # IngressHost: lab2.ai.iiis.co 28 | # Command: '["python", "/app/app_class1.py"]' 29 | # Args: '' 30 | # UseIB: true # 是否使用IB,默认不使用,如果使用,需要在启动时指定UseIB=true 31 | # nogfs: true 32 | -------------------------------------------------------------------------------- /admin/adduser_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while IFS=, read USER EMAIL UIDD GIDD 4 | do 5 | username=$(echo "$USER" | tr -d '\r') 6 | if [ -n "$username" ]; then 7 | echo "username : $username" 8 | 9 | yamlfile=`cat ./values-template.yaml` 10 | all_variables="NAMESPACE=$username EMAIL=$(echo "$EMAIL" | tr -d '\r')" 11 | 12 | if [ ! 
-d "./yamls/" ];then 13 | mkdir ./yamls 14 | fi 15 | printf "$all_variables\ncat << EOF\n$yamlfile\nEOF" | bash > ./yamls/values_$username.yaml 16 | 17 | kubectl create namespace $username 18 | 19 | helm install admin-$username \ 20 | --namespace=admin-helm \ 21 | --create-namespace \ 22 | --values ./yamls/values_$username.yaml \ 23 | ./adminchart 24 | 25 | helm install gfshome-$username \ 26 | --namespace=admin-helm \ 27 | --create-namespace \ 28 | --values ./yamls/values_$username.yaml \ 29 | ./gfshomechart 30 | 31 | helm install gfsshare-$username \ 32 | --namespace=admin-helm \ 33 | --create-namespace \ 34 | --values ./yamls/values_$username.yaml \ 35 | ./gfssharechart 36 | 37 | helm install ssdshare-$username \ 38 | --namespace=admin-helm \ 39 | --create-namespace \ 40 | --values ./yamls/values_$username.yaml \ 41 | ./ssdsharechart 42 | 43 | testuser=$(helm install testuser-$username \ 44 | --namespace=$username \ 45 | --values ./yamls/values_$username.yaml \ 46 | ../user/userchart) 47 | 48 | if [[ ($testuser =~ $username) && ($testuser =~ "deployed") ]] 49 | then 50 | echo "成功 创建用户 $username" 51 | else 52 | echo "失败 创建用户 $username" 53 | fi 54 | 55 | fi 56 | done < $1 57 | -------------------------------------------------------------------------------- /test/username.txt: -------------------------------------------------------------------------------- 1 | test01,test01@test.lthpc.com,2001,500 2 | test02,test02@test.lthpc.com,2002,500 3 | test03,test03@test.lthpc.com,2003,500 4 | test04,test04@test.lthpc.com,2004,500 5 | test05,test05@test.lthpc.com,2005,500 6 | test06,test06@test.lthpc.com,2006,500 7 | test07,test07@test.lthpc.com,2007,500 8 | test08,test08@test.lthpc.com,2008,500 9 | test09,test09@test.lthpc.com,2009,500 10 | test10,test10@test.lthpc.com,2010,500 11 | test11,test11@test.lthpc.com,2011,500 12 | test12,test12@test.lthpc.com,2012,500 13 | test13,test13@test.lthpc.com,2013,500 14 | test14,test14@test.lthpc.com,2014,500 15 | 
test15,test15@test.lthpc.com,2015,500 16 | test16,test16@test.lthpc.com,2016,500 17 | test17,test17@test.lthpc.com,2017,500 18 | test18,test18@test.lthpc.com,2018,500 19 | test19,test19@test.lthpc.com,2019,500 20 | test20,test20@test.lthpc.com,2020,500 21 | test21,test21@test.lthpc.com,2021,500 22 | test22,test22@test.lthpc.com,2022,500 23 | test23,test23@test.lthpc.com,2023,500 24 | test24,test24@test.lthpc.com,2024,500 25 | test25,test25@test.lthpc.com,2025,500 26 | test26,test26@test.lthpc.com,2026,500 27 | test27,test27@test.lthpc.com,2027,500 28 | test28,test28@test.lthpc.com,2028,500 29 | test29,test29@test.lthpc.com,2029,500 30 | test30,test30@test.lthpc.com,2030,500 31 | test31,test31@test.lthpc.com,2031,500 32 | test32,test32@test.lthpc.com,2032,500 33 | test33,test33@test.lthpc.com,2033,500 34 | test34,test34@test.lthpc.com,2034,500 35 | test35,test35@test.lthpc.com,2035,500 36 | test36,test36@test.lthpc.com,2036,500 37 | test37,test37@test.lthpc.com,2037,500 38 | test38,test38@test.lthpc.com,2038,500 39 | test39,test39@test.lthpc.com,2039,500 40 | test40,test40@test.lthpc.com,2040,500 41 | test41,test41@test.lthpc.com,2041,500 42 | test42,test42@test.lthpc.com,2042,500 43 | test43,test43@test.lthpc.com,2043,500 44 | test44,test44@test.lthpc.com,2044,500 45 | test45,test45@test.lthpc.com,2045,500 46 | test46,test46@test.lthpc.com,2046,500 47 | test47,test47@test.lthpc.com,2047,500 48 | test48,test48@test.lthpc.com,2048,500 49 | test49,test49@test.lthpc.com,2049,500 50 | test50,test50@test.lthpc.com,2050,500 -------------------------------------------------------------------------------- /admin/pull_images_to_local.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: prepuller 5 | spec: 6 | selector: 7 | matchLabels: 8 | name: prepuller 9 | template: 10 | metadata: 11 | labels: 12 | name: prepuller 13 | spec: 14 | # Configure an init container for each 
image you want to pull 15 | initContainers: 16 | - name: prepuller-1 17 | # Set the image you want to pull 18 | #image: harbor-local.ai.iiis.co/llm-course/lab:v2 19 | image: harbor-local.ai.iiis.co/llm-course/lab5:v4 20 | # Use a known command that will exit successfully immediately 21 | # Any no-op command will do but YMMV with scratch based containers 22 | command: ["sh", "-c", "'true'"] 23 | resources: 24 | limits: 25 | cpu: 100m 26 | memory: 1Gi 27 | requests: 28 | cpu: 1m 29 | memory: 8Mi 30 | - name: prepuller-2 31 | # Set the image you want to pull 32 | image: harbor-local.ai.iiis.co/llm-course/lab:v2.4 33 | command: ["sh", "-c", "'true'"] 34 | resources: 35 | limits: 36 | cpu: 100m 37 | memory: 1Gi 38 | requests: 39 | cpu: 1m 40 | memory: 8Mi 41 | - name: prepuller-3 42 | # Set the image you want to pull 43 | image: harbor-local.ai.iiis.co/llm-course/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3 44 | command: ["sh", "-c", "'true'"] 45 | resources: 46 | limits: 47 | cpu: 100m 48 | memory: 1Gi 49 | requests: 50 | cpu: 1m 51 | memory: 8Mi 52 | 53 | # - name: prepuller-2 54 | # image: ... 55 | # command: ["sh", "-c", "'true'"] 56 | 57 | # etc... 
58 | 59 | # Use the pause container to ensure the Pod goes into a `Running` phase 60 | # but doesn't take up resource on the cluster 61 | containers: 62 | - name: pause 63 | image: harbor.ai.iiis.co:9443/xuw/pause:3.2 64 | resources: 65 | limits: 66 | cpu: 1m 67 | memory: 8Mi 68 | requests: 69 | cpu: 1m 70 | memory: 8Mi 71 | -------------------------------------------------------------------------------- /dockerfiles/lab-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y ca-certificates tzdata && \ 8 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 9 | echo ${TZ} > /etc/timezone && \ 10 | dpkg-reconfigure -f noninteractive tzdata && \ 11 | cat <<'EOF' > /etc/apt/sources.list.d/ubuntu.sources 12 | Types: deb 13 | URIs: https://mirrors.cernet.edu.cn/ubuntu 14 | Suites: noble noble-updates noble-backports 15 | Components: main restricted universe multiverse 16 | Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 17 | 18 | # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 19 | # Types: deb-src 20 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 21 | # Suites: noble noble-updates noble-backports 22 | # Components: main restricted universe multiverse 23 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 24 | 25 | # 以下安全更新软件源包含了官方源与镜像站配置,如有需要可自行修改注释切换 26 | # Types: deb 27 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 28 | # Suites: noble-security 29 | # Components: main restricted universe multiverse 30 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 31 | 32 | # # Types: deb-src 33 | # # URIs: https://mirrors.cernet.edu.cn/ubuntu 34 | # # Suites: noble-security 35 | # # Components: main restricted universe multiverse 36 | # # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 37 | 38 | Types: deb 39 | URIs: http://security.ubuntu.com/ubuntu/ 40 | 
Suites: noble-security 41 | Components: main restricted universe multiverse 42 | Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 43 | 44 | # Types: deb-src 45 | # URIs: http://security.ubuntu.com/ubuntu/ 46 | # Suites: noble-security 47 | # Components: main restricted universe multiverse 48 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 49 | 50 | # 预发布软件源,不建议启用 51 | # Types: deb 52 | # URIs: https://mirrors.cernet.edu.cn/ubuntu 53 | # Suites: noble-proposed 54 | # Components: main restricted universe multiverse 55 | # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 56 | 57 | # # Types: deb-src 58 | # # URIs: https://mirrors.cernet.edu.cn/ubuntu 59 | # # Suites: noble-proposed 60 | # # Components: main restricted universe multiverse 61 | # # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg 62 | EOF 63 | 64 | RUN apt-get update && \ 65 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs man-db zsh poppler-utils tesseract-ocr libgl1 python3-full python-is-python3 python3-pip && \ 66 | python3 -m pip config set global.index-url https://mirrors.cernet.edu.cn/pypi/web/simple && \ 67 | python3 -m pip config set global.trusted-host mirrors.cernet.edu.cn && \ 68 | python3 -m pip config set global.break-system-packages true && \ 69 | apt-get clean && rm -rf /tmp/* && \ 70 | service ssh start 71 | 72 | COPY requirements.txt /opt/app/requirements.txt 73 | WORKDIR /opt/app 74 | RUN python3 -m pip install --no-cache-dir -r requirements.txt 75 | WORKDIR /root 76 | 77 | ENTRYPOINT ["/usr/sbin/sshd", "-D"] 78 | CMD ["-p","22"] 79 | 80 | -------------------------------------------------------------------------------- /dockerfiles/lab/requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==69.5.1 # temp fix for compatibility with some old packages 2 | matplotlib 3 | numpy 4 | openai 5 | pandas 6 | requests 7 | scikit-learn 8 | transformers 9 | 
sentencepiece 10 | protobuf 11 | datasets 12 | accelerate 13 | chardet 14 | python-dotenv 15 | httpx[socks] 16 | httpcore[socks] 17 | ipykernel 18 | ipywidgets 19 | langchain 20 | langchain-openai 21 | langchainhub 22 | langgraph 23 | google-search-results 24 | lxml 25 | tiktoken 26 | faiss-cpu 27 | beautifulsoup4 28 | chroma-hnswlib 29 | chromadb 30 | matplotlib-inline 31 | pinecone-client 32 | pypdf 33 | scipy 34 | sentence-transformers 35 | tenacity 36 | tqdm 37 | unstructured 38 | unstructured-client 39 | unstructured-inference 40 | unstructured.pytesseract 41 | nltk 42 | rouge 43 | peft 44 | pillow 45 | ftfy 46 | Jinja2 47 | diffusers 48 | tensorboard 49 | tensorstore 50 | zarr 51 | requests 52 | uvicorn 53 | websockets 54 | gradio 55 | python-dotenv 56 | dspy-ai 57 | jinja2 58 | langchain_community 59 | sentence-transformers 60 | tenacity 61 | tiktoken 62 | tqdm 63 | unstructured 64 | unstructured-client 65 | unstructured-inference 66 | pytesseract 67 | unstructured.pytesseract 68 | pi-heif 69 | opencv-python-headless 70 | langchain_pinecone 71 | langchain_chroma 72 | asyncer 73 | neo4j 74 | yfiles_jupyter_graphs 75 | 76 | # git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git 77 | # cd LLaMA-Factory 78 | # pip install -e ".[torch,metrics]" 79 | comfyui-frontend-package==1.14.6 80 | numpy>=1.25.0 81 | einops 82 | tokenizers>=0.13.3 83 | sentencepiece 84 | safetensors>=0.4.2 85 | aiohttp>=3.11.8 86 | yarl>=1.18.0 87 | pyyaml 88 | Pillow 89 | scipy 90 | tqdm 91 | psutil 92 | 93 | #non essential dependencies: 94 | spandrel 95 | soundfile 96 | av 97 | 98 | Pillow 99 | 100 | blendmodes 101 | clean-fid 102 | diskcache 103 | einops 104 | facexlib 105 | gradio==3.41.2 106 | inflection 107 | jsonmerge 108 | kornia 109 | lark 110 | numpy 111 | omegaconf 112 | open-clip-torch 113 | 114 | piexif 115 | protobuf==3.20.0 116 | psutil 117 | pytorch_lightning 118 | requests 119 | resize-right 120 | 121 | safetensors 122 | scikit-image>=0.19 123 | tomesd 124 | 
torchdiffeq 125 | transformers==4.30.2 126 | pillow-avif-plugin==1.4.3 127 | 128 | GitPython==3.1.32 129 | Pillow==9.5.0 130 | accelerate==0.21.0 131 | blendmodes==2022 132 | clean-fid==0.1.35 133 | diskcache==5.6.3 134 | einops==0.4.1 135 | facexlib==0.3.0 136 | fastapi==0.95.2 137 | gradio==3.41.2 138 | httpcore==0.15 139 | inflection==0.5.1 140 | jsonmerge==1.8.0 141 | kornia==0.6.7 142 | lark==1.1.2 143 | numpy==1.26.2 144 | omegaconf==2.2.3 145 | open-clip-torch==2.20.0 146 | piexif==1.1.3 147 | protobuf==3.20.0 148 | psutil==5.9.5 149 | pytorch_lightning==1.9.4 150 | resize-right==0.0.2 151 | safetensors==0.4.2 152 | scikit-image==0.21.0 153 | spandrel==0.3.4 154 | spandrel-extra-arches==0.1.1 155 | tomesd==0.1.3 156 | torchdiffeq==0.2.3 157 | torchsde==0.2.6 158 | httpx==0.24.1 159 | pillow-avif-plugin==1.4.3 160 | pytest-base-url~=2.0 161 | pytest-cov~=4.0 162 | pytest~=7.3 -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | ### FAQ - 常见问题汇总 2 | 3 | #### 1. **Kubeconfig 配置错误** 4 | - **问题**: 复制粘贴 kubeconfig 配置时出现问题,如何解决? 5 | - **解决**: 6 | 确保在复制时没有破坏格式,尤其是 server 部分。可以重新运行 login.ai 网站上的命令,并检查生成的 `.crt` 文件是否存在,确保没有损坏。 7 | 重新生成配置文件的步骤如下(详细步骤请参考[README](README.md#配置kubeconfig)): 8 | 1. 打开浏览器并访问 https://login.ai.iiis.co:9443。 9 | 2. 使用您的邮箱地址(格式为:用户名@iiis.co)和密码登录。 10 | 3. 登录后,进入 kubeconfig 信息页面,选择您使用的系统类型。 11 | 4. 按照页面指示的顺序在命令行运行命令,这些命令会生成名为 `config` 的配置文件。 12 | 5. 确保在复制粘贴命令时没有破坏格式,尤其是 `server` 部分。 13 | 6. 检查生成的 `.crt` 文件是否存在,并确保没有损坏。 14 | 7. 运行以下命令设置默认的 namespace: 15 | ```bash 16 | kubectl config set-context --current --namespace=`kubectl config current-context | cut -d'-' -f 1` 17 | ``` 18 | 19 | #### 2. **OpenAI API Key 问题** 20 | - **问题**: OpenAI API Key 被禁用或失效,如何解决? 21 | - **解决**: 如果无法使用 OpenAI API,可以尝试购买淘宝上的中转 API 或向助教借用一个,但会有使用量限制。 22 | 23 | #### 3. **GPU 配置问题** 24 | - **问题**: 如何确保在 Kubernetes 中正确配置和使用 GPU? 
25 | - **解决**: 在 YAML 文件中确保 GPU 配置正确,并通过 `nvidia-smi` 检查 GPU 是否可用。 26 | 如果使用 Helm 创建 Pod,请确保在 `values.yaml` 文件中正确配置以下 GPU 选项(详细步骤请参考[README](README.md#使用默认配置启动计算任务)): 27 | 1. `GPU`: 设置所需的 GPU 类型,例如 `RTX4090`、`RTX4090D` 或 `RTX3090`。 28 | 2. `Limits.GPU`: 设置 GPU 的数量,确保不超过集群的配额。 29 | 配置完成后,重新安装 Helm 部署。重新安装的命令如下: 30 | ```bash 31 | helm uninstall release_name 32 | helm install release_name --values ./values.yaml ./userchart 33 | ``` 34 | 35 | #### 4. **Pod 启动问题** 36 | - **问题**: 新创建的 Pod 一直处于 Pending 状态,如何解决? 37 | - **解决**: 检查 Pod 的资源需求是否超出了集群资源,或通过 `kubectl describe` 检查 Pod 启动失败的原因。如果资源不足,可以尝试调整 Pod 的资源配置。 38 | 39 | #### 5. **VS Code 连接问题** 40 | - **问题**: VS Code 无法连接到远程 Pod,如何解决? 41 | - **解决**: 确保 Pod 处于运行状态,如果 Pod 已经停止或被重新启动,可以通过删除旧的 Pod 并重新创建新的 Pod 来解决。确保 VS Code 配置正确,或者尝试通过删除 `.vscode-server` 文件夹后重新连接。 42 | 43 | #### 6. **存储空间不足** 44 | - **问题**: 运行模型时出现 shm 空间不足的问题,如何解决? 45 | - **解决**: 参考最新的 `ailab` 仓库模板,修改 `values.yaml` 文件并启用 shm 支持,重新创建 Pod。 46 | 47 | #### 7. **模型训练时内存溢出(OOM)** 48 | - **问题**: 训练模型时出现 OOM 错误,如何避免? 49 | - **解决**: 尝试调整 batch size 或将数据分批处理。如果使用多个 GPU,确保每个 GPU 的内存使用均衡。 50 | 51 | #### 8. **API Key 和代理问题** 52 | - **问题**: 使用 API 时出现 "找不到函数" 或 API 返回错误,如何解决? 53 | - **解决**: 检查是否使用了正确的 API Key 和代理配置。如果需要,重试连接并检查网络或代理设置。 54 | 55 | #### 9. **如何快速下载大文件** 56 | - **问题**: 如何从远程服务器快速下载大于 100MB 的文件? 57 | - **解决**: 使用 `scp` 或 `kubectl cp` 命令来下载文件。若遇到网络问题,可以考虑使用更好的代理。 58 | 59 | #### 10. **模型生成错误** 60 | - **问题**: 模型生成的结果缺少部分代码或格式不正确,如何修复? 61 | - **解决**: 检查生成的代码是否符合格式要求,确保所有的代码块都正确闭合。尝试调整 `max_length` 或 `tokenizer` 配置,避免生成超长的代码。 62 | 63 | #### 11. **如何避免多 GPU 计算时的卡顿问题** 64 | - **问题**: 在使用多个 GPU 时,如何避免性能瓶颈? 65 | - **解决**: 使用 `CUDA_VISIBLE_DEVICES` 配置来选择特定的 GPU,并调整模型的负载分配,避免某些 GPU 负载过重。 66 | 67 | #### 12. **如何在 Jupyter Notebook 中释放 GPU 内存** 68 | - **问题**: Jupyter Notebook 中如果某个 cell 出现 OOM 错误,如何释放 GPU 内存而不重启 Kernel? 69 | - **解决**: 尝试使用 `empty_cache()`,但在某些情况下可能无法完全释放内存。如果内存没有被回收,重启 Kernel 或删除占用内存的变量可能是最有效的解决方法。 70 | 71 | #### 13. 
**模型训练时的资源分配问题** 72 | - **问题**: 使用两张显卡时,训练速度反而变慢,如何解决? 73 | - **解决**: 检查 CPU 和内存的资源分配,确保资源足够。可以适当增加 CPU 核数或内存,避免瓶颈限制训练速度。 74 | 75 | 这些是常见问题的解答,如果遇到其他问题,请随时询问! -------------------------------------------------------------------------------- /dockerfiles/lab/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.10-py3 2 | 3 | ENV TZ=Asia/Shanghai \ 4 | DEBIAN_FRONTEND=noninteractive \ 5 | BNB_CUDA_VERSION=125 6 | 7 | RUN apt-get update && \ 8 | apt-get install -y ca-certificates tzdata && \ 9 | ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime && \ 10 | echo ${TZ} > /etc/timezone && \ 11 | dpkg-reconfigure -f noninteractive tzdata 12 | 13 | 14 | # RUN cat <<'EOF' > /etc/apt/sources.list 15 | # # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 16 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy main restricted universe multiverse 17 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy main restricted universe multiverse 18 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse 19 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-updates main restricted universe multiverse 20 | # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse 21 | # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-backports main restricted universe multiverse 22 | 23 | # # 以下安全更新软件源包含了官方源与镜像站配置,如有需要可自行修改注释切换 24 | # # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-security main restricted universe multiverse 25 | # # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-security main restricted universe multiverse 26 | 27 | # deb http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse 28 | # # deb-src http://security.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse 29 | 30 | # # 预发布软件源,不建议启用 31 | # # deb https://mirrors.cernet.edu.cn/ubuntu/ jammy-proposed main restricted universe 
multiverse 32 | # # # deb-src https://mirrors.cernet.edu.cn/ubuntu/ jammy-proposed main restricted universe multiverse 33 | # EOF 34 | 35 | RUN apt-get update && \ 36 | apt-get install -y openssh-server vim curl wget git iputils-ping net-tools git-lfs man-db zsh poppler-utils tesseract-ocr libgl1 && \ 37 | # pip config set global.index-url https://mirrors.cernet.edu.cn/pypi/web/simple && \ 38 | # pip config set global.trusted-host mirrors.cernet.edu.cn && \ 39 | apt-get clean && rm -rf /tmp/* && \ 40 | service ssh start && \ 41 | rm /workspace/* -rf && \ 42 | mkdir -p /root/workspace && \ 43 | ln -s /root/workspace/ /workspace 44 | 45 | COPY requirements.txt /opt/app/requirements.txt 46 | COPY lab5-version-package/flashinfer-0.2.4.tar.gz /opt/app/flashinfer-0.2.4.tar.gz 47 | WORKDIR /opt/app 48 | RUN pip install -r requirements.txt && \ 49 | pip install ./flashinfer-0.2.4.tar.gz && \ 50 | pip install sglang[all] -i https://pypi.org/simple 51 | WORKDIR /root 52 | 53 | COPY . /root 54 | RUN pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 && \ 55 | pip install qwen-vl-utils && \ 56 | pip install /root/lab5-version-package/transformers && \ 57 | pip install /root/lab5-version-package/CLIP-d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip && \ 58 | pip install /root/lab5-version-package/open_clip-bb6e834e9c70d9c27d0dc3ecedeebeaeb1ffad6b.zip && \ 59 | pip install -U -I --no-deps xformers==0.0.23.post1 && \ 60 | pip install ngrok && \ 61 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 62 | 63 | RUN pip install flash-attn==2.7.0.post2 --no-build-isolation && \ 64 | pip install -U accelerate && \ 65 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 66 | 67 | WORKDIR /app 68 | COPY ./lab5-version-package/sam2 /app/sam2 69 | RUN pip install -e /app/sam2 && \ 70 | pip install 
/root/lab5-version-package/opencv_python-4.8.0.74-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl && \ 71 | pip install /root/lab5-version-package/torch-2.5.1+cu124-cp310-cp310-linux_x86_64.whl 72 | 73 | WORKDIR /root 74 | ENTRYPOINT ["/usr/sbin/sshd", "-D"] 75 | CMD ["-p","22"] 76 | 77 | -------------------------------------------------------------------------------- /admin/cluster_setting/rancher_local_path_nvme.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: local-path-storage 5 | 6 | --- 7 | apiVersion: v1 8 | kind: ServiceAccount 9 | metadata: 10 | name: local-path-provisioner-service-account 11 | namespace: local-path-storage 12 | 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: Role 16 | metadata: 17 | name: local-path-provisioner-role 18 | namespace: local-path-storage 19 | rules: 20 | - apiGroups: [""] 21 | resources: ["pods"] 22 | verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] 23 | 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: ClusterRole 27 | metadata: 28 | name: local-path-provisioner-role 29 | rules: 30 | - apiGroups: [""] 31 | resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"] 32 | verbs: ["get", "list", "watch"] 33 | - apiGroups: [""] 34 | resources: ["persistentvolumes"] 35 | verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] 36 | - apiGroups: [""] 37 | resources: ["events"] 38 | verbs: ["create", "patch"] 39 | - apiGroups: ["storage.k8s.io"] 40 | resources: ["storageclasses"] 41 | verbs: ["get", "list", "watch"] 42 | 43 | --- 44 | apiVersion: rbac.authorization.k8s.io/v1 45 | kind: RoleBinding 46 | metadata: 47 | name: local-path-provisioner-bind 48 | namespace: local-path-storage 49 | roleRef: 50 | apiGroup: rbac.authorization.k8s.io 51 | kind: Role 52 | name: local-path-provisioner-role 53 | subjects: 54 | - kind: ServiceAccount 
55 | name: local-path-provisioner-service-account 56 | namespace: local-path-storage 57 | 58 | --- 59 | apiVersion: rbac.authorization.k8s.io/v1 60 | kind: ClusterRoleBinding 61 | metadata: 62 | name: local-path-provisioner-bind 63 | roleRef: 64 | apiGroup: rbac.authorization.k8s.io 65 | kind: ClusterRole 66 | name: local-path-provisioner-role 67 | subjects: 68 | - kind: ServiceAccount 69 | name: local-path-provisioner-service-account 70 | namespace: local-path-storage 71 | 72 | --- 73 | apiVersion: apps/v1 74 | kind: Deployment 75 | metadata: 76 | name: local-path-provisioner 77 | namespace: local-path-storage 78 | spec: 79 | replicas: 1 80 | selector: 81 | matchLabels: 82 | app: local-path-provisioner 83 | template: 84 | metadata: 85 | labels: 86 | app: local-path-provisioner 87 | spec: 88 | serviceAccountName: local-path-provisioner-service-account 89 | containers: 90 | - name: local-path-provisioner 91 | image: harbor.ai.iiis.co:9443/deploy/docker.io/rancher/local-path-provisioner:v0.0.26 92 | imagePullPolicy: IfNotPresent 93 | command: 94 | - local-path-provisioner 95 | - --debug 96 | - start 97 | - --config 98 | - /etc/config/config.json 99 | volumeMounts: 100 | - name: config-volume 101 | mountPath: /etc/config/ 102 | env: 103 | - name: POD_NAMESPACE 104 | valueFrom: 105 | fieldRef: 106 | fieldPath: metadata.namespace 107 | volumes: 108 | - name: config-volume 109 | configMap: 110 | name: local-path-config 111 | 112 | --- 113 | apiVersion: storage.k8s.io/v1 114 | kind: StorageClass 115 | metadata: 116 | name: rancher-local-path 117 | provisioner: rancher.io/local-path 118 | volumeBindingMode: WaitForFirstConsumer 119 | reclaimPolicy: Delete 120 | 121 | --- 122 | kind: ConfigMap 123 | apiVersion: v1 124 | metadata: 125 | name: local-path-config 126 | namespace: local-path-storage 127 | data: 128 | config.json: |- 129 | { 130 | "nodePathMap":[ 131 | { 132 | "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", 133 | 
"paths":["/nvme1/local-path-provisioner","/nvme2/local-path-provisioner"] 134 | } 135 | ] 136 | } 137 | setup: |- 138 | #!/bin/sh 139 | set -eu 140 | mkdir -m 0777 -p "$VOL_DIR" 141 | teardown: |- 142 | #!/bin/sh 143 | set -eu 144 | rm -rf "$VOL_DIR" 145 | helperPod.yaml: |- 146 | apiVersion: v1 147 | kind: Pod 148 | metadata: 149 | name: helper-pod 150 | spec: 151 | priorityClassName: system-node-critical 152 | tolerations: 153 | - key: node.kubernetes.io/disk-pressure 154 | operator: Exists 155 | effect: NoSchedule 156 | containers: 157 | - name: helper-pod 158 | image: harbor.ai.iiis.co:9443/deploy/docker.io/library/busybox:latest 159 | imagePullPolicy: IfNotPresent -------------------------------------------------------------------------------- /admin/README.md: -------------------------------------------------------------------------------- 1 | # 管理员使用说明 2 | 3 | ## 用户管理 4 | 5 | 集群提供了LDAP的操作界面(phpLDAPAdmin),用于管理员进行用户账号信息的管理操作。 6 | 7 | ### 关于用户组织架构的约定 8 | 9 | 由于LDAP是一个应用场景范围比较广泛的规范。应用于K8S集群单点登录场景时,要对其中的用户组织架构附加一定的限制,才能顺畅地实现多个身份验证相关服务的打通。 10 | 11 | 期望的组织架构如下图所示。 12 | 13 | ![](assets/tree_users_ldap.png) 14 | 15 | 组织架构的约定如下: 16 | 1. 在根节点(dc=iiis,dc=co)下,创建两个ou(Organisational Unit),名为Groups、People; 17 | 2. Group下必须一个或多个类型为posixGroup的组(如上图中的defaultGroup、student组),用于创建用户账户时填写gid字段; 18 | 3. 组都建在ou=Group节点下;除第2条所述的个别posixGroup外,其他分组使用groupOfUniqueNames类型; 19 | 4. 用户都建在ou=People节点下,类型为User Account对象;必须为用户增加Email字段,并确保该字段取值唯一; 20 | 5. (可选)可以在People下面创建多个子ou来对用户进行分类管理(如上图中的faculty/staff/Students三个ou); 21 | 6. 
通过修改组(groupOfUniqueNames类型)的member属性实现用户和组的关联,一个用户可以隶属于多个groupOfUniqueNames组。 22 | 23 | ### 操作说明 24 | 25 | 在浏览器上登录LDAP管理界面(phpLDAPAdmin)的URL地址:`https://ldap.ai.iiis.co/`。 26 | 管理员用户名:`cn=admin,dc=iiis,dc=co` 27 | 28 | 29 | 登录后,界面如下,用于操作的界面元素分为三部分:*左上操作按钮栏*、*左侧组织架构*、*右侧操作页面*。 30 | 31 | ![](assets/phpLDAPadmin_main_page.png) 32 | 33 | #### 1、创建ou(Groups和People) 34 | 35 | 1)在*左侧组织架构*上选中根节点(dc=iiis,dc=co),右侧操作页面上选择【Create a child entry】操作: 36 | 37 | ![](assets/phpLDAPadmin_create_child_entry.png) 38 | 39 | 2)选择【Generic: Organisational Unit】模版: 40 | 41 | ![](assets/phpLDAPadmin_create_object.png) 42 | 43 | 3)填写ou名称(People或Groups),点击【Create Object】按钮: 44 | 45 | ![](assets/phpLDAPadmin_create_ou.png) 46 | 47 | 4)确认页面中,点击【Commit】按钮: 48 | 49 | ![](assets/phpLDAPadmin_ou_commit.png) 50 | 51 | 经过以上操作一个ou就创建完成了。可按照上述步骤再继续创建其他ou。 52 | 53 | **注:在People下创建子ou的过程类似,只要在第(1)步中选择`ou=People`节点作为父节点进行操作即可。** 54 | 55 | 56 | #### 2、创建一个Posix Group用户组 57 | 58 | 由于创建用户账户时,需要选择一个Posix Group的Group ID(gid),因此须创建一个Posix Group对象。**这项操作只做一次即可。** 59 | 操作方式如下。 60 | 61 | 1)左侧组织架构上选中”ou=Groups“,右侧操作页面选择【Create a child entry】操作: 62 | 63 | ![](assets/phpLDAPadmin_create_object_posix_group.png) 64 | 65 | 2)选择【Generic: Posix Group】模版 66 | 67 | ![](assets/phpLDAPadmin_template_posix_group.png) 68 | 69 | 3)填写组名。系统会自动生成GID Number: 70 | 71 | ![](assets/phpLDAPAdmin_posix_group_name.png) 72 | 73 | 4)确认页面中点击【Commit】按钮,完成创建。 74 | 75 | 76 | #### 3、创建用户账户(常用操作) 77 | 78 | **这里创建的用户账户信息将可用于K8S集群kubeconfig的获取和相关系统的单点登录。** 79 | 80 | 1)左侧组织架构上选中ou=People或者其子ou(例如下图的ou=staff),右侧操作页面选择【Create a child entry】操作 81 | 82 | ![](assets/phpLDAPAdmin_user_create_child_entry.png) 83 | 84 | 2)选择【Generic: User Account】模版类型: 85 | 86 | ![](assets/phpLDAPAdmin_user_template.png) 87 | 88 | 3)填写用户信息,点击【Create Object】按钮 89 | 90 | - 其中GID Number字段可通过下拉菜单选择任意一个Posix Group的名称。 91 | 92 | ![](assets/phpLDAPAdmin_user_information.png) 93 | 94 | 95 | 4)在确认页中点击【Commit】按钮。 96 | 97 | ![](assets/phpLDAPAdmin_user_commit.png) 98 | 99 | 100 | 
5)增加Email属性并填写用户email地址。 101 | 左侧组织架构上选中用户cn,右侧页面选择【Add new attribute】操作: 102 | 103 | ![](assets/phpLDAPAdmin_user_add_attribute.png) 104 | 105 | 页面会出现Add Attribute的栏目,在下拉菜单中选择Email这个属性: 106 | 107 | ![](assets/phpLDAPAdmin_user_add_attribute_Email.png) 108 | 109 | 填写用户的Email地址: 110 | 111 | ![](assets/phpLDAPAdmin_user_input_email.png) 112 | 113 | 点击右侧操作页面下方的【Update Object】按钮。 114 | 并在确认页面点击【Update Object】按钮。 115 | 116 | ![](assets/phpLDAPAdmin_user_add_user_email_commit.png) 117 | 118 | 119 | 可以在用户账号的页面上看到已经添加了Email信息: 120 | ![](assets/phpLDAPAdmin_user_information_detail.png) 121 | 122 | 123 | #### 4、创建用户组 124 | 125 | 这里创建的用户组与用户账户是**多对多**的关系,即一个组可以包含多个用户,同时一个用户可以归属于多个组。从扩展性考虑,用户组采用groupOfUniqueNames类型(也可采用groupOfNames类型)。 126 | 127 | 操作方式如下。 128 | 129 | 1)左侧组织架构上选中ou=Groups,右侧操作页面选择【Create a child entry】操作; 130 | 131 | 2)模版类型选择default - groupOfUniqueNames,如下面两个图所示: 132 | 133 | ![](assets/phpLDAPAdmin_template_Default.png) 134 | 135 | ![](assets/phpLDAPAdmin_template_groupOfUniqueNames.png) 136 | 137 | 3)进入了用户组的编辑页面,填写三个必填字段即可: 138 | - RDN选择cn; 139 | - cn填写组名; 140 | - uniqueMember选择一个属于该组的用户账户即可。 141 | ![](assets/phpLDAPAdmin_group_input.png) 142 | 143 | 后续提交、确认即可完成组的创建。 144 | 145 | 146 | #### 5、修改组成员 147 | 148 | 在用户组上可以通过修改uniqueMember字段加入用户。同一个用户可以归属到多个组。 149 | 150 | ![](assets/phpLDAPAdmin_group_member.png) 151 | 152 | ### K8S创建命名空间、PVC及授权 153 | 154 | `adduser_dir.sh` 脚本会为K8S集群安装本地硬盘卷的驱动器。并根据用户信息明细文件为每个用户创建命名空间、授权用户在自己命名空间中具有USER权限,在GFS为用户创建个人数据路径并创建对应的PVC,设置每个命名空间的资源限制。 155 | 156 | 用户明细文件以 `username.txt` 为例,文件内容包括 `uid,mail,uidNumber,gidNumber` 四列内容,没有表头。 157 | 158 | ``` 159 | $ cat username.txt 160 | 161 | test01,test01@test.lthpc.com,2001,500 162 | test02,test02@test.lthpc.com,2002,500 163 | test03,test03@test.lthpc.com,2003,500 164 | 165 | ``` 166 | 167 | 切换到本项目的 `admin` 路径下,执行脚本自动创建用户的相关资源。 168 | ``` 169 | $ bash adduser_dir.sh username.txt 170 | ``` 171 | 脚本运行完成之后,`username.txt` 中的用户就可以通过kubeconfig使用K8S集群了。`username.txt` 文件需要保留,在删除用户时需要用到。 172 | 173 | ### 
K8S 删除用户及相关资源 174 | 175 | `deluser_dir.sh` 脚本可以自动删除用户在GFS的个人PVC、NFS中的个人PVC和个人命名空间中的所有资源以及命名空间本身。 176 | 177 | 切换到本项目的 `admin` 路径下,执行脚本自动删除用户的相关资源。 178 | 179 | ``` 180 | $ bash deluser_dir.sh username.txt 181 | ``` 182 | 183 | ### 添加K8S集群管理员 184 | 185 | 将`admin@admin.com`改为需要被设置为管理员的账号,在master节点或具有最高权限的终端执行下面的命令。 186 | 187 | ``` 188 | kubectl create clusterrolebinding root-cluster-admin-binding --clusterrole=cluster-admin --user=admin@admin.com 189 | ``` -------------------------------------------------------------------------------- /user/userchart/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | {{ $randomid := randAlphaNum 8 | lower }} 2 | {{ $base := .Values.BaseName | default "undescribed-job" }} 3 | {{ $namespace := .Values.NameSpace | default "default" }} 4 | {{ $deploy := .Values.DeployName | default (printf "%s-%s-%s" $namespace $base .Release.Name) }} 5 | {{ $label := .Values.Label | default (printf "%s-%s" $base .Release.Name) }} 6 | {{ $containername := .Values.ContainerName | default (printf "%s-%s" $base .Release.Name) }} 7 | {{ $containerimage := .Values.ContainerImage | default "harbor-local.ai.iiis.co/llm-course/lab-cpu:latest" }} 8 | {{ $uid := .Values.UID | default "0" }} 9 | {{ $gid := .Values.GID | default "0" }} 10 | 11 | {{- $limits := .Values.Limits | default (dict) }} 12 | {{ $limitscpu := $limits.CPU | default "8" }} 13 | {{ $limitsmemory := $limits.memory | default "16Gi" }} 14 | {{ $limitsgpu := $limits.GPU | default "0" }} 15 | 16 | {{ $nvme := .Values.NVMEStorage | default "100Gi" }} 17 | {{ $nogfs := .Values.NoGFS | default false }} 18 | {{ $extraport := .Values.ExtraPort | default 0 }} 19 | {{ $ingresshost := .Values.IngressHost | default "" }} 20 | {{ $use_shm := .Values.UseShm | default false}} 21 | {{ $shm_size := .Values.ShmSize | default "8Gi" }} 22 | 23 | {{ $command := .Values.Command | default "" }} 24 | {{ $args := .Values.Args | default "" }} 25 | 26 | {{ $use_IB 
:= .Values.UseIB | default false}} 27 | 28 | {{ $replicas := .Values.Replicas | default 1 }} 29 | 30 | --- 31 | apiVersion: apps/v1 32 | kind: Deployment 33 | metadata: 34 | name: {{ $deploy }} 35 | namespace: {{ $namespace }} 36 | labels: 37 | app: {{ $label }} 38 | spec: 39 | replicas: {{ $replicas }} 40 | selector: 41 | matchLabels: 42 | app: {{ $label }} 43 | template: 44 | metadata: 45 | labels: 46 | app: {{ $label }} 47 | annotations: 48 | {{ if $use_IB }} 49 | k8s.v1.cni.cncf.io/networks: ipoibnetwork-{{ .Values.NameSpace }} 50 | {{ end }} 51 | spec: 52 | hostIPC: false 53 | hostPID: false 54 | hostNetwork: false 55 | securityContext: 56 | runAsUser: {{ $uid }} 57 | runAsGroup: {{ $gid }} 58 | nodeSelector: 59 | gpu-model: {{ .Values.GPU }} 60 | containers: 61 | - name: {{ $containername }} 62 | imagePullPolicy: IfNotPresent 63 | image: {{ $containerimage }} # 可自行更改镜像和版本 64 | {{ if $use_IB }} 65 | securityContext: 66 | capabilities: 67 | add: [ "IPC_LOCK" ] 68 | {{ end }} 69 | {{- if $command }} 70 | command: {{ $command }} 71 | args: {{ $args }} 72 | {{- else }} 73 | command: ["bash", "-c", "--"] 74 | args: ["while true; do sleep 30; done;"] 75 | {{- end }} 76 | resources: 77 | limits: 78 | cpu: {{ $limitscpu }} # 最大CPU 79 | memory: {{ $limitsmemory }} # 最大内存数目 80 | nvidia.com/gpu: {{ $limitsgpu }} # 请求的GPU数量 81 | {{ if $use_IB }} 82 | rdma/rdma_shared_device_a: 1 83 | {{ end }} 84 | requests: 85 | {{ if $use_IB }} 86 | rdma/rdma_shared_device_a: 1 87 | {{ end }} 88 | volumeMounts: 89 | - name: nfshome # 与下面volumes的名字对应 90 | mountPath: /root # 本地的挂载点 /root 91 | - name: scratch1 # 与下面volumes的名字对应 92 | mountPath: /scratch1 # 本地的挂载点 93 | - name: scratch2 # 与下面volumes的名字对应 94 | mountPath: /scratch2 # 本地的挂载点 95 | {{ if not $nogfs }} 96 | - name: gfshome # 与下面volumes的名字对应 97 | mountPath: /gfshome # 本地的挂载点 98 | - name: gfsshare # 与下面volumes的名字对应 99 | mountPath: /share # 本地的挂载点 100 | - name: ssdshare # 与下面volumes的名字对应 101 | mountPath: /ssdshare # 本地的挂载点 102 | {{ 
end }} 103 | {{ if $use_shm }} 104 | - name: dshm 105 | mountPath: /dev/shm 106 | {{ end }} 107 | volumes: 108 | {{ if $use_shm }} 109 | - name: dshm 110 | emptyDir: 111 | medium: Memory 112 | sizeLimit: {{ $shm_size }} 113 | {{ end }} 114 | - name: nfshome 115 | persistentVolumeClaim: 116 | claimName: pvc-nfshome-{{ $namespace }} 117 | - name: scratch1 118 | persistentVolumeClaim: 119 | claimName: pvc-rancher-localpath-1-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 120 | - name: scratch2 121 | persistentVolumeClaim: 122 | claimName: pvc-rancher-localpath-2-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 123 | 124 | {{ if not $nogfs }} 125 | - name: gfshome 126 | persistentVolumeClaim: 127 | claimName: gfs-sata-pvc-{{ $namespace }} 128 | - name: gfsshare 129 | persistentVolumeClaim: 130 | claimName: gfs-sata-share-pvc-{{ $namespace }} 131 | - name: ssdshare 132 | persistentVolumeClaim: 133 | claimName: gfs-nvme-pvc-share-{{ $namespace }} 134 | {{ end }} 135 | 136 | --- 137 | apiVersion: v1 138 | kind: Service 139 | metadata: 140 | annotations: {} 141 | labels: 142 | app: {{ $label }} 143 | k8s.kuboard.cn/name: {{ $deploy }} 144 | name: {{ $deploy }} 145 | namespace: {{ $namespace }} 146 | spec: 147 | ports: 148 | - name: {{ $deploy }}-port 149 | port: 22 150 | protocol: TCP 151 | targetPort: 22 152 | {{ if $extraport}} 153 | - name: {{ $deploy }}-extraport 154 | port: {{ $extraport }} 155 | protocol: TCP 156 | targetPort: {{ $extraport }} 157 | {{ end }} 158 | selector: 159 | app: {{ $label }} 160 | sessionAffinity: None 161 | type: NodePort 162 | --- 163 | kind: PersistentVolumeClaim 164 | apiVersion: v1 165 | metadata: 166 | name: pvc-rancher-localpath-1-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 167 | namespace: {{ $namespace }} 168 | spec: 169 | accessModes: 170 | - ReadWriteOnce 171 | resources: 172 | requests: 173 | storage: {{ $nvme }} 174 | storageClassName: rancher-local-path 175 | --- 176 | kind: PersistentVolumeClaim 177 | apiVersion: v1 178 | 
metadata: 179 | name: pvc-rancher-localpath-2-{{ $namespace }}-{{ $deploy }}-{{ $randomid}} 180 | namespace: {{ $namespace }} 181 | spec: 182 | accessModes: 183 | - ReadWriteOnce 184 | resources: 185 | requests: 186 | storage: {{ $nvme}} 187 | storageClassName: rancher-local-path 188 | 189 | {{ if $ingresshost }} 190 | 191 | --- 192 | apiVersion: networking.k8s.io/v1 193 | kind: Ingress 194 | metadata: 195 | annotations: 196 | cert-manager.io/cluster-issuer: letsencrypt-prod 197 | name: {{ $deploy }} 198 | namespace: {{ $namespace }} 199 | spec: 200 | ingressClassName: nginx 201 | rules: 202 | - host: {{ $ingresshost }} 203 | http: 204 | paths: 205 | - backend: 206 | service: 207 | name: {{ $deploy }} 208 | port: 209 | number: {{ $extraport }} 210 | path: / 211 | pathType: Prefix 212 | tls: 213 | - hosts: 214 | - {{ $ingresshost }} 215 | secretName: passwd-tls 216 | 217 | {{ end }} 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ailab 2 | 3 | - [ailab](#ailab) 4 | - [集群概况](#集群概况) 5 | - [获取访问权限](#获取访问权限) 6 | - [配置kubeconfig](#配置kubeconfig) 7 | - [使用K8S](#使用k8s) 8 | - [其他使用说明](#其他使用说明) 9 | - [运行大模型课程labs代码](#运行大模型课程labs代码) 10 | - [修改账号密码](#修改账号密码) 11 | - [使用 VS Code 连接K8S远程调试](#使用-vs-code-连接k8s远程调试) 12 | - [私有容器镜像仓库](#私有容器镜像仓库) 13 | - [自定义镜像](#自定义镜像) 14 | - [信任集群 Harbor](#信任集群-harbor) 15 | - [制作镜像](#制作镜像) 16 | - [环境准备](#环境准备) 17 | - [编写 Dockerfile 制作镜像](#编写-dockerfile-制作镜像) 18 | - [从自定义镜像创建 Pod](#从自定义镜像创建-pod) 19 | 20 | 21 | ## 集群概况 22 | 23 | 本集群计算环境基于 K8S 搭建而成,硬件包括3台独立的 master 节点、50台 worker 节点和一台提供 NFS 服务的 NAS(网络存储服务器)。使用 Harbor 搭建私有镜像仓库,openLDAP 进行统一身份认证。通过统一的 kubeconfig 配置文件分发平台,用户也可以通过 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 以命令行的方式使用K8S集群。 24 | 25 | 26 | |系统|登陆地址|功能| 27 | |---|---|---| 28 | |密码管理|https://auth.ai.iiis.co:9443|账号修改密码。| 29 | |Harbor|https://harbor.ai.iiis.co:9443|私有容器镜像仓库| 30 | 
|kubeconfig|https://login.ai.iiis.co:9443|kubeconfig配置文件分发平台| 31 | 32 | ## 获取访问权限 33 | 34 | 在管理员已经为用户创建好账号的情况下, 用户需要确认是否已经满足下列三个条件 35 | 36 | - 您使用的终端可以连通SSH跳板机,测试方法为ping js.ai.iiis.co, 如果ping不通,检查你的网络设置(特别是DNS设置),或者联络管理员。 37 | - 您已经获取了访问K8S集群的用户名、用户账号关联邮箱和登录密码。 38 | - 在等待获取访问权限的过程中,可以先准备好安装本地软件 (见下节)。 39 | 为了确保账号安全,强烈建议大家拿到账号后先 [修改密码](#修改账号密码)。 40 | 41 | ## 使用SSH跳板机 42 | 43 | - 在您使用的终端上执行如下命令: 44 | ```bash 45 | ssh -i 私钥文件名 -N -L 6443:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 46 | ``` 47 | - 私钥文件名默认为~/.ssh/id_rsa (可以省略): 48 | ```bash 49 | ssh -N -L 6443:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 50 | ``` 51 | 52 | - 如果终端上6443端口已经被其他程序占用,可以换成其他端口,比如换成6444端口,则命令应写成: 53 | ```bash 54 | ssh -i 私钥文件名 -N -L 6444:api.ai.iiis.co:6443 ailab@js.ai.iiis.co -p 9022 55 | ``` 56 | - 命令执行后,会出现貌似“卡死”现象(命令并不返回),这是正常的。**不要关闭**该terminal。可以另打开一个terminal进行其他操作。也可以在上述ssh命令的最后加上&,将放入后台。 57 | - 如果您希望自动连接跳板机,可以参考autossh (https://www.harding.motd.ca/autossh/) 58 | 59 | ## 配置集群访问环境 60 | >注:不推荐使用wsl。在wsl上执行可能在后续步骤中出现WebSocket close with status code 1006错误 61 | ### 安装本地软件 62 | 63 | 本地电脑至少需要安装以下两个软件。 64 | 65 | #### Kubectl 66 | 用户可以直接使用 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 管理 k8s。 67 | 安装说明在 https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/ 68 | 69 | #### Helm 70 | helm 是 Kubernetes 的包管理器,helm的安装及使用方法可以参考[官方文档](https://helm.sh/docs/)。比较简单的安装方式(Linux上)是使用这个脚本 71 | https://helm.sh/docs/intro/install/#from-script 72 | 73 | 推荐安装以下软件: 74 | * Docker (如果你需要本地构建镜像)。 PC上推荐安装docker desktop,有界面使用更方便。 https://www.docker.com/products/docker-desktop/ 75 | 76 | * VSCode (本地集成化开发环境,使用K8S更加方便)。 77 | 78 | ### 配置kubeconfig 79 | 80 | 用户基于 kubeconfig 通过命令行方式使用K8S,需要先在自己的终端设备配置好 kubeconfig。利用系统提供的 kubeconfig 信息(包含用户账户和 Token 等信息),可以在自己的终端利用 kubectl 对 K8S 集群中的资源进行访问。本节介绍如何获取和使用 kubeconfig。 81 | 82 | 用浏览器访问URL地址:https://login.ai.iiis.co:9443 83 | 84 | ![](assets/dex_login.png) 85 | 86 | 输入邮箱地址(邮箱地址等同申请的账号,格式为:用户名@iiis.co)和密码(上边修改过的密码)即可登录。 87 | 88 | 
>注:这里的邮箱地址是***用户名@iiis.co***。 89 | 90 | 进入kubeconfig信息页面,选择您使用的系统类型。 91 | 92 | 请按照您所使用的 kubectl 所在的操作系统进行选择。 93 | 94 | ![](assets/dex_token_1.png) 95 | 96 | 接下来,要按照页面指示的顺序在运行 kubectl 的命令行运行命令,会生成名为config的配置文件。 97 | 98 | >注:如果你用的是Windows机器,请在Windows Power Shell 下运行这些命令;如果无法执行网站中的第一条命令,可以执行如下命令: 99 | 100 | ```bash 101 | $homeDir = $env:USERPROFILE 102 | $certDir = Join-Path $homeDir ".kube\certs\k8s.iiis" 103 | New-Item -ItemType Directory -Path $certDir -Force | Out-Null 104 | 105 | $certPath = Join-Path $certDir "k8s-ca.crt" 106 | 107 | @" 108 | -----BEGIN CERTIFICATE----- 109 | 110 | (此处粘贴你自己的certificate) 111 | 112 | -----END CERTIFICATE----- 113 | "@ | Out-File -FilePath $certPath -Encoding ascii 114 | 115 | ``` 116 | 117 | ![](assets/dex_token_2.png) 118 | 119 | 所有指令执行完毕后,再运行如下命令设置默认的namespace (ns)。在 K8S 集群中,管理员已经为每一位用户创建了与 UID 相同的命名空间ns。用户只在自己的 ns 中具有使用权限,因此所有操作都只能在自己的 ns 中完成。通过运行下面的命令,可以避免每个命令都需要指定ns。 120 | 121 | ```bash 122 | kubectl config set-context --current --namespace=`kubectl config current-context | cut -d'-' -f 1` 123 | ``` 124 | 125 | >注:如果你用的是Windows机器,请运行如下命令(powershell不支持cut) 126 | 127 | ```bash 128 | kubectl config set-context --current --namespace=($((kubectl config current-context) -split '-')[0]) 129 | ``` 130 | 131 | >注:请检查你的UID中是否有`-`。如果有,第一个`-`符号后的内容会自动被上述指令去掉,导致后续步骤出现了类似 `Error from server (Forbidden)` 的报错。请浏览并手动编辑kubeconfig文件 132 | 133 | - 提示:如果连接SSH跳板机时,本地终端使用的端口不是6443,而是其他端口,比如6444,则需要把config文件内容中的server: https://127.0.0.1:6443 修改成server: https://127.0.0.1:6444。 134 | 135 | 之后可以使用以下 kubectl 命令测试是否已经可以访问K8S中的资源。 136 | 137 | ```bash 138 | kubectl get pvc 139 | ``` 140 | 应该能看到返回了4个PVC (是用户在集群中可以访问的存储空间,可以理解为是一个盘)。 141 | 142 | ## 使用K8S 143 | 144 | ### 使用默认配置启动计算任务 145 | 146 | 本仓库已经为用户提供了创建计算任务的默认 helm 模板,如果使用默认配置,请 clone 本仓库,并将 user/values.yaml 文件中的内容按照自己账号和计算需求进行修改,即可使用 helm 创建计算任务。 user/values-template.yaml 文件的具体内容为: 147 | 148 | ``` 149 | ########### 必须要写的部分 ########### 150 | NameSpace: namespace # 更改为自己的namespace (同用户名) 151 | BaseName: 
pytorch # 任务的基本名字,建议写任务描述,例如pytorch 152 | ContainerImage: harbor-local.ai.iiis.co/llm-course/lab-cpu:v2 # 镜像名称,默认为 harbor-local.ai.iiis.co/llm-course/lab-cpu:v2 153 | GPU: RTX4090D # RTX4090D RTX4090 RTX3090 154 | 155 | ########### 选填的部分 ########### 156 | # DeployName: namespace-pytorch-release # 任务(deployment)的名字,默认为`NameSpace-BaseName-ReleaseName`, releaseName为随机生成的字符串是在helm命令行里指定的 157 | # Label: pytorch-release # 任务的标签,默认为`BaseName-ReleaseName` 158 | # ContainerName: pytorch-release # 容器名,默认为`BaseName-ReleaseName` 159 | # NVMEStorage: 100G # 申请的本地盘/scratch的大小,不填即为默认值 160 | # Limits: # 申请的资源,注意所有启动的资源总和不能超过自己ns的quota,如果增加quota,需要向管理员申请,不填为默认值 161 | # CPU: 8 162 | # memory: 16Gi 163 | # GPU: 0 164 | # UseShm: False 165 | # ShmSize: 8Gi 166 | 167 | # Replicas: 1 # starting more replica of the pod (for distributed training) 168 | 169 | ``` 170 | 171 | 此文件用于创建一个副本数为 1 的 [Deployment](https://kubernetes.io/zh-cn/docs/concepts/workloads/controllers/deployment/)计算任务工作负载。你可以复制这个文件到比如 `cp values-template.yaml lab1.yaml`,然后编辑lab1.yaml文件,输入你的配置参数。之后在user目录中运行 172 | 173 | ``` 174 | cd user 175 | helm install release_name --values ./lab1.yaml ./userchart 176 | ``` 177 | 178 | `release_name`为helm部署的版本名(release),建议设置为自己的`UID+任务描述`的格式以方便后续维护管理,例如xuw_lab1。`--values ./lab1.yaml`为helm模板的各项变量提供了对应的值(你刚刚设置的),最后`./userchart`是helm模板的路径位置。 179 | 180 | 之后,可以通过运行 181 | ```bash 182 | kubectl get pods 183 | ``` 184 | 来观察启动的pod是否已经启动了。启动之后可以通过 185 | 186 | ```bash 187 | kubectl exec -it name_of_the_pod -- bash 188 | ``` 189 | 来连接这个pod,并且启动bash。建议大家使用后边描述的 VS Code 连接K8S的方法,要方便很多。 190 | 191 | ### 默认挂载的存储描述 192 | 193 | 在默认的模板中,自动为每个pod默认挂载了四个存储卷。这些存储卷是管理员为用户创建好了用于长期保存数据的[持久卷申领(PersistentVolumeClaim,PVC)](https://kubernetes.io/zh-cn/docs/concepts/storage/persistent-volumes/)。 194 | 195 | - 挂载于容器内`/root`路径的NFS服务的PVC,用于存储文档及代码等小文件; 196 | - 挂载于容器内`/gfshome`路径GFS的个人存储空间PVC,用于存储模型文件、数据集等大文件; 197 | - 挂载于容器内`/share`路径GFS的共享空间PVC,用于存放和共享开源大模型、开源数据集等公共数据; 198 | - 
挂载于容器内`/ssdshare`路径GFS的共享空间PVC,用于存储需要快速访问的模型文件等大文件(与share的区别为:该空间用SSD做存储,速度快); 199 | 200 | 临时数据存放在宿主机本地的NVME硬盘中,挂载在容器内的`/scratch1`和`/scratch2`,POD被删除后,这2个目录里面的数据也会被删除,请一定不要将需要持久化保存的重要数据放在这2个路径。 201 | 202 | 上面的helm模板中会自动挂载长期存储数据的四个PVC,并自动创建对应于`/scratch1`和`/scratch2`两个临时数据存储PVC。 203 | 204 | 205 | | 存储系统 | 写入速度 | 206 | | ---------- | -------- | 207 | | 宿主机NVME | 2.3GB/s | 208 | | GFS | 1GB/s | 209 | | GFS-SSD | 2GB/s | 210 | | NFS | 1GB/s | 211 | 212 | ### 删除计算任务 213 | 214 | 通过下面的命令删除计算任务 215 | 216 | ``` 217 | helm delete release_name 218 | ``` 219 | 其中,release_name是你创建任务时候输入的第一个参数(release_name),如果你忘了当时用的什么了,可以用 220 | ``` 221 | helm list 222 | ``` 223 | 来列出所有的release。 224 | 225 | helm delete 命令会自动删除容器和对应于`/scratch1`、`/scratch2`的两个临时数据存储PVC,但不会删除长期存储数据的四个PVC。 226 | 227 | 228 | ### 定制自己的模板 229 | 230 | 如果对helm chart功能及语法比较熟悉,也欢迎用户对模板进行修改或定制,并将成果分享给大家。 231 | 232 | ## 其他使用说明 233 | 234 | ### 运行大模型课程labs代码 235 | 236 | 1. 推荐使用下面的“使用 VS Code 连接K8S远程调试”方法先在vscode中连接集群。 237 | 2. 在VS Code命令行(terminal)中,clone课程仓库: 238 | ``` 239 | git clone git@github.com:xuw/llm_course_public.git 240 | ``` 241 | 3. (更新)查看最新的课程内容信息: 242 | ``` 243 | cd llm_course_public/ && git pull --all 244 | ``` 245 | 4. 
在VS Code中运行labs的Jupyter Notebook 246 | - 确定在Server端Jupyter插件已正确安装,已经启用(enable) 247 | - 在GUI中设置kernel(environment)为conda即可 248 | 249 | ### 修改账号密码 250 | 251 | 集群提供了一套简单的密码修改界面,用户可以修改自己账号的密码。 252 | 253 | 用浏览器访问URL地址 `https://auth.ai.iiis.co:9443` 访问密码修改界面。界面如下图: 254 | ![](assets/ssp_main_page.png) 255 | 256 | 在界面上填写用户名(界面上的Login字段)、原密码(Old password字段)、新密码(New password字段),并重复输入一次新密码(Confirm字段),点击【Send】按钮,即可完成账号密码修改。 257 | ![](assets/ssp_success.png) 258 | 259 | ### 使用 VS Code 连接K8S远程调试 260 | 261 | 使用 [VS Code](https://code.visualstudio.com/) 可以远程 debug 集群中创建的 POD。这里我们给出一个简单的教程,更多的信息请自行查阅 [Kubernetes 文档](https://kubernetes.io/zh/docs/concepts/services-networking/service/)与 [VS Code 文档](https://code.visualstudio.com/docs/azure/kubernetes)。 262 | 263 | 首先我们需要在 VS Code 中安装`Kubernetes`插件、`Docker`插件、`Remote Container`插件(改名为`Dev container`)、`Bridge to Kubernetes`插件(被弃用,但不影响使用): 264 | 265 | ![](assets/vscode/vsc_k8s_plugin.jpg) 266 | 267 | ![](assets/vscode/vsc_docker_plugin.jpg) 268 | 269 | ![](assets/vscode/vsc_remote_connector_plugin.jpg) 270 | 271 | ![](assets/vscode/vsc_k8s_bridge_plugin.jpg) 272 | 273 | 使用`ctrl + shift + P`(Mac 下`command + shift + P`)选择`Kubernetes: Use Namespace` 274 | 275 | ![](assets/vscode/vsc_k8s_select_ns.jpg) 276 | 277 | 输入自己的 namespace 后就能访问自己namespace下的资源了。以连接一个 POD 作为示例: 278 | 279 | ![](assets/vscode/vsc_connect_k8s.jpg) 280 | 281 | 这样将会自动连接一个 VS Code 远程窗口,之后的开发就和本地类似了。 282 | 283 | ### 私有容器镜像仓库 284 | 285 | 私有容器镜像仓库对应2个域名,分别为harbor.ai.iiis.co:9443(应用场景:用户通过外网向镜像仓库中推送自定义镜像) 和 harbor-local.ai.iiis.co(应用场景:用户建立POD时,从镜像仓库中拉取镜像)。二者区别为 harbor.ai.iiis.co:9443用于外网访问镜像仓库,harbor-local.ai.iiis.co走集群内部网络,建立pod时,基于该域名拉取镜像速度快。 286 | 287 | #### 自定义镜像 288 | 289 | 我们可以在集群里从自定义镜像拉起 POD,以支持快速的实验环境配置。自定义镜像的思路是**在`ubuntu-tensorflow`、`ubuntu-pytorch`或`orion-client-2.4.2`的基础上,配置自己的环境**。 290 | 291 | ##### 信任集群 Harbor 292 | 293 | 自定义镜像需要从 Harbor 拉取,因此我们需要在 Docker 中添加对集群 Harbor 的信任。在Mac下用 Docker Desktop 可以直接在客户端`Docker Engine`里加入`insecure-registries`项,若未使用 Docker 
Desktop,则在`/etc/docker/daemon.json`中添加(若该文件不存在则创建): 294 | 295 | ```json 296 | { 297 | 298 | "insecure-registries": [ 299 | "harbor.ai.iiis.co" 300 | ] 301 | } 302 | ``` 303 | 304 | 添加完毕后,重启 Docker。 305 | 306 | ##### 制作镜像 307 | 308 | 制作镜像的方式有基于 Dockerfile 和 `docker commit`命令两种形式。我们这里推荐基于 Dockerfile 方式,`docker commit`方式请参考[官方文档](https://docs.docker.com/engine/reference/commandline/commit/)。 309 | 310 | > **_NOTE:_** 在[这里](https://github.com/iiisthu/gpupool/tree/master/examples/build_example)可以找到我们在这一节所使用的例子。 311 | 312 | 我们假设在`ubuntu-pytorch`的基础上,我们还需要配置一系列环境: 313 | 314 | 1. 安装一系列 Python 依赖库,在`requirements.txt`中指明。 315 | 2. 将某个 Python 包的 Git 仓库放入镜像,并从仓库源码安装该 Python 包。 316 | 3. 创建`workspace`工作目录。 317 | 318 | 其他的操作可以参考这几个任务。我们假设我们在`build`目录下工作,我们使用[`navdeep-G/samplemod`](https://github.com/navdeep-G/samplemod)作为 Python Package 的例子。 319 | 320 | ###### 环境准备 321 | 322 | 假设我们需要 Python 支持一系列的库,例如画图的`matplotlib`和交互式的`jupyter`等,我们将这些写在`build`目录下: 323 | 324 | ```txt 325 | # requirements.txt 326 | numpy >= 1.19 327 | matplotlib 328 | pandas >= 1.0 329 | jupyter 330 | ``` 331 | 332 | 我们也希望pod能安装我们自己的一个私有代码仓库中的某个 Python Package,我们以[`navdeep-G/samplemod`](https://github.com/navdeep-G/samplemod)为例: 333 | 334 | ```bash 335 | # PWD: build/ 336 | git clone https://github.com/navdeep-G/samplemod 337 | ``` 338 | 339 | 整个工作目录为: 340 | 341 | ``` 342 | build 343 | ├── samplemod 344 | │ ├── docs/ 345 | │ ├── sample/ 346 | │ ├── tests/ 347 | │ ├── .gitignore 348 | │ ├── LICENSE 349 | │ ├── MANIFEST.in 350 | │ ├── Makefile 351 | │ ├── README.rst 352 | │ ├── requirements.txt 353 | │ └── setup.py 354 | └── requirements.txt 355 | ``` 356 | 357 | ###### 编写 Dockerfile 制作镜像 358 | 359 | 我们从`harbor.ai.iiis.co:9443/library/`下的镜像出发,安装`requirements.txt`中的依赖,并安装数据。我们这里不赘述[ Dockerfile 的语法](https://docs.docker.com/engine/reference/builder/)。实例的 Dockerfile 如下: 360 | 361 | ```docker 362 | # Dockerfile 363 | FROM harbor.ai.iiis.co:9443/library/ubuntu-pytorch:1.5.0 364 | COPY . 
build 365 | RUN pip install -r build/requirements.txt && cd build/samplemod; pip install . && mkdir -p workspace && rm -rf build 366 | ``` 367 | 368 | > **_NOTE:_** 这里用单行命令是为了让制作后的镜像历史中不会存在build文件夹(类似于git,即使删去的文件也会在历史中存储,以备未来可能的恢复)。 369 | 370 | 之后利用`docker`按照 Dockerfile 制作镜像,并标记为`sample:v0`: 371 | 372 | ```bash 373 | docker build . -t sample:v0 374 | ``` 375 | 376 | 最后确认镜像已经成功创建: 377 | 378 | ``` 379 | $ docker images | grep sample 380 | sample v0 707ab1c88146 30 seconds ago 11.3GB 381 | ``` 382 | 383 | ##### 从自定义镜像创建 Pod 384 | 385 | 从刚才我们制作的镜像创建 Pod 分为两步,首先需要将镜像推送到集群镜像仓库 Harbor,再从 Harbor 对应的镜像拉起 Pod。 386 | 387 | 访问[https://harbor.ai.iiis.co:9443](https://harbor.ai.iiis.co:9443),注意这里必须是https,用户名及密码等同用户访问k8s集群的用户名及密码。 388 | 389 | > **_NOTE:_** 注意这里的用户名格式为“用户名@iiis.co”。 390 | 391 | 连接到 Harbor 后新建项目: 392 | 393 | ![](assets/harbor/harbor_dashboard.jpg) 394 | 395 | ![](assets/harbor/harbor_create_project.jpg) 396 | 397 | > **_NOTE:_** 注意这里需要勾选公开,原因是私有集群物理机的 docker 并没有登录用户个人的 Harbor 账户,因此无法拉取私有仓库中的镜像。 398 | 399 | 假设我们的项目名为 zhangsan,则我们之后的镜像均要 push 到`harbor.ai.iiis.co:9443/zhangsan/`下,首先 tag 我们做好的镜像: 400 | 401 | ```bash 402 | docker tag sample:v0 harbor.ai.iiis.co:9443/zhangsan/sample:v0 403 | ``` 404 | 405 | 之后将镜像 push 到 Harbor 中,我们需要先在 docker 中登录我们在 Harbor上的账号: 406 | 407 | ```txt 408 | $ docker logout harbor.ai.iiis.co:9443 409 | Removing login credentials for harbor.ai.iiis.co 410 | $ docker login harbor.ai.iiis.co:9443 411 | Username: zhangsan@iiis.co 412 | Password: 413 | Login Succeeded 414 | ``` 415 | 416 | 最后将镜像推送到 Harbor 中: 417 | 418 | ```bash 419 | docker push harbor.ai.iiis.co:9443/zhangsan/sample:v0 420 | ``` 421 | 422 | 创建好镜像后,拉起 Pod 流程和标准镜像一样。 423 | 424 | 提示:建立Pod时,values-template.yaml模板中,指定容器镜像字段ContainerImage处,需要修改镜像仓库对应的域名为harbor-local.ai.iiis.co,可以提高镜像拉取速度,请参见本文档“使用默认配置启动计算任务”部分。 425 | --------------------------------------------------------------------------------