Sorry, your browser cannot access this site
This page requires browser support (enable) JavaScript
Learn more >

使用 Helm 部署 HAMi

HAMi简介

HAMi 是基于 Kubernetes 的 GPU 虚拟化工具,支持多家厂商、多种型号显卡的虚拟化,并可通过监控组件采集显卡的使用信息。

前提条件

  1. 配置 nvidia-container-toolkit

  2. 使用 Docker 运行 Kubernetes 时,将 nvidia 设置为 Docker 的默认运行时(default-runtime)

  3. 节点打标签

    1
    # Add the label gpu=on to a node; replace {nodeid} with the actual node name.
    kubectl label nodes {nodeid} gpu=on

离线安装

准备 Helm Chart(离线包)

1
2
3
# Register the HAMi chart repository under the alias "hami-charts".
helm repo add hami-charts https://project-hami.github.io/HAMi/
# Download the chart and unpack it into ./hami instead of keeping the .tgz.
helm pull hami-charts/hami --untar
# Re-pack the unpacked chart directory as hami.tar.gz (archive members are prefixed with "hami/").
tar czf hami.tar.gz hami/

在离线环境中解压并从本地目录安装(imageTag 需与集群的 Kubernetes 服务端版本一致)

1
2
3
# Unpack the chart archive; this recreates the ./hami directory.
tar xzf hami.tar.gz
# Install the local chart as release "hami" in kube-system, setting the
# kube-scheduler image tag to v1.20.2.
helm install hami ./hami -n kube-system \
--set scheduler.kubeScheduler.imageTag=v1.20.2

下载所需镜像

1
2
3
4
5
6
7
8
9
10
11
# HAMi core image
docker pull --platform=linux/amd64 projecthami/hami:v2.5.1

# kube-scheduler (matches Kubernetes v1.20.2)
docker pull --platform=linux/amd64 registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:v1.20.2

# webhook certificate image 1
docker pull --platform=linux/amd64 docker.io/jettech/kube-webhook-certgen:v1.5.2

# webhook certificate image 2 (the one HAMi uses by default)
docker pull --platform=linux/amd64 liangjw/kube-webhook-certgen:v1.1.1
1
2
3
4
5
# Export the four images pulled for HAMi into a single archive, hami-images.tar.
docker save -o hami-images.tar \
projecthami/hami:v2.5.1 \
registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:v1.20.2 \
docker.io/jettech/kube-webhook-certgen:v1.5.2 \
liangjw/kube-webhook-certgen:v1.1.1
1
# Import the images from hami-images.tar into the local Docker image store.
# NOTE(review): presumably this must be run on every node that will run these images — confirm for your cluster.
docker load -i hami-images.tar

在离线集群中安装 Helm Chart

1
# Unpack the chart archive into the current directory (recreates ./hami).
# The target directory must be passed with -C: a bare trailing "./" is parsed
# as the name of an archive member to extract, and since the archive was built
# from "hami/" no member matches it, so tar fails with "Not found in archive".
tar zxvf hami.tar.gz -C ./
1
2
# Install the unpacked chart from ./hami as release "hami" in kube-system,
# setting the kube-scheduler image tag to v1.20.2.
helm install hami ./hami -n kube-system \
--set scheduler.kubeScheduler.imageTag=v1.20.2

验证

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# deploy.yaml — verification workload: a single-replica Deployment in the
# "prod" namespace whose container idles (tail -f /dev/null) so that
# nvidia-smi can be run inside it with `kubectl exec`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-deployment
  namespace: prod
  labels:
    app: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        app: vllm
    spec:
      containers:
        - name: vllm-container
          image: vllm/vllm-openai:v0.7.2
          # Keep the container alive without starting any server.
          command: ["/bin/bash"]
          args: ["-c", "tail -f /dev/null"]
          ports:
            - containerPort: 8000
          volumeMounts:
            - name: model-volume
              mountPath: /root/models/maas-dev-bge-reranker-base
            # Host nvidia-smi binary, mounted read-only into the container.
            - name: nvidia-smi-bin
              mountPath: /usr/bin/nvidia-smi
              readOnly: true
          resources:
            limits:
              nvidia.com/gpu: 1
              # NOTE(review): HAMi device-memory limit, presumably in MB — confirm against the HAMi docs.
              nvidia.com/gpumem: 5120
      volumes:
        - name: model-volume
          hostPath:
            path: /root/tmp/models/maas-dev-bge-reranker-base
            type: Directory
        - name: nvidia-smi-bin
          hostPath:
            path: /usr/bin/nvidia-smi
            type: File
      restartPolicy: Always
      #runtimeClassName: nvidia # enable if needed

1
2
3
# Create (or update) the resources defined in deploy.yaml.
kubectl apply -f deploy.yaml
# List the pods carrying the label app=vllm in the prod namespace.
kubectl -n prod get pods -l app=vllm
# Run nvidia-smi inside the workload. Target the Deployment rather than a pod:
# pod names carry a generated suffix that differs on every rollout, so a
# hard-coded pod name only works on the cluster it was copied from.
kubectl -n prod exec -it deploy/vllm-deployment -- nvidia-smi

web ui 离线安装

下载 Helm Chart和相关镜像

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Add the Helm repository
helm repo add hami-webui https://project-hami.github.io/HAMi-WebUI

# Download the chart into a local directory (unpacked as ./hami-webui)
helm pull hami-webui/hami-webui --untar

# Frontend image
docker pull --platform=linux/amd64 projecthami/hami-webui-fe-oss:main

# Backend image
docker pull --platform=linux/amd64 projecthami/hami-webui-be-oss:main

# Save both images to a local tar file
docker save -o hami-web-ui-images.tar \
projecthami/hami-webui-fe-oss:main \
projecthami/hami-webui-be-oss:main

prometheus安装

镜像下载

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/bash
# Pull every image in the list below for linux/amd64.
# Each pull is attempted even if an earlier one fails; failed images are
# collected, reported on stderr, and make the script finish with a non-zero
# status so an incomplete set is not mistaken for success.
images=(
  "quay.io/prometheus/alertmanager:v0.27.0"
  "quay.io/prometheus-operator/admission-webhook:v0.76.0"
  "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6"
  "quay.io/prometheus-operator/prometheus-operator:v0.76.0"
  "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0"
  "quay.io/thanos/thanos:v0.36.1"
  "quay.io/prometheus/prometheus:v2.54.1"
  "docker.io/grafana/grafana:11.2.0"
  "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0"
)

failed=()
for img in "${images[@]}"; do
  echo "Pulling $img..."
  # Test the command directly instead of inspecting $? afterwards.
  if docker pull --platform linux/amd64 "$img"; then
    echo "Successfully pulled $img"
  else
    echo "Failed to pull $img" >&2
    failed+=("$img")
  fi
done

if (( ${#failed[@]} > 0 )); then
  printf '%d image(s) could not be pulled:\n' "${#failed[@]}" >&2
  printf '  %s\n' "${failed[@]}" >&2
fi

# Final status of the script: non-zero when at least one pull failed.
# (A plain test is used instead of `exit` so that pasting this snippet into an
# interactive shell does not close the session.)
(( ${#failed[@]} == 0 ))
1
2
3
4
5
6
7
8
9
10
# Export all images pulled for kube-prometheus-stack into a single archive.
# The last argument must NOT end with a backslash: a dangling continuation
# would glue whatever line follows onto this command as an extra image name.
docker save -o kube-prometheus-stack-images.tar \
  quay.io/prometheus/alertmanager:v0.27.0 \
  quay.io/prometheus-operator/admission-webhook:v0.76.0 \
  registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6 \
  quay.io/prometheus-operator/prometheus-operator:v0.76.0 \
  quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0 \
  quay.io/thanos/thanos:v0.36.1 \
  quay.io/prometheus/prometheus:v2.54.1 \
  docker.io/grafana/grafana:11.2.0 \
  registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0

安装

1
2
3
# The three commands below are independent reference commands, not a sequence
# to run top to bottom (running the uninstall right after the install would
# remove what was just deployed).

# Install: deploy the kube-prometheus-stack sub-chart bundled with the
# hami-webui chart as release "prometheus" in namespace "prometheus".
helm install prometheus ./hami-webui/charts/kube-prometheus-stack -n prometheus --create-namespace --set crds.enabled=true

# Uninstall: only when the release needs to be removed.
helm uninstall prometheus -n prometheus

# Optional: dump the chart's default values for customisation. Uses the same
# chart path as the install command above.
helm show values ./hami-webui/charts/kube-prometheus-stack > values.yaml

dcgm-exporter安装

镜像下载

1
2
# Pull the dcgm-exporter image for linux/amd64.
docker pull --platform linux/amd64 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04
# Export the image to dcgm.tar.
docker save -o dcgm.tar nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04

安装

1
# Install the dcgm-exporter sub-chart bundled with the hami-webui chart as
# release "dcgm" in namespace "dcgm", creating the namespace if it is missing.
helm install dcgm ./hami-webui/charts/dcgm-exporter -n dcgm --create-namespace

hami-webui安装

1
2
3
4
5
6
# Install the WebUI chart from ./hami-webui as release "my-hami-webui" in
# namespace "hami", pointing it at an external Prometheus.
# --create-namespace is passed (as for the other installs in this guide)
# because nothing else here creates the "hami" namespace.
# NOTE(review): externalPrometheus.address must be the DNS name of the real
# Prometheus Service (<service>.<namespace>.svc.cluster.local). The Prometheus
# install in this guide uses release "prometheus" in namespace "prometheus",
# which does not match the address below — confirm with
# `kubectl get svc -A | grep prometheus` and adjust if needed.
helm install my-hami-webui ./hami-webui \
  --set externalPrometheus.enabled=true \
  --set externalPrometheus.address="http://my-hami-webui-kube-prometh-prometheus.hami.svc.cluster.local:9090" \
  -n hami --create-namespace

# Uninstall (only when the release needs to be removed). `helm uninstall`
# takes release names only; a chart path would be treated as a second release
# name and make the command fail.
helm uninstall my-hami-webui -n hami

评论