Inference Extension 安装
1.准备工作
# 确认当前kgateway部署已经开启了相关能力,如未开启,请参照部署文档
helm -n kgateway-system get values kgateway -o yaml | sed -n '1,80p'
或
helm -n kgateway-system get values agentgateway -o yaml | sed -n '1,80p'
inferenceExtension:
enabled: true
2.安装Inference Extension的CRD
# 自动获取 Gateway API Inference Extension 项目“最新稳定版”的发布tag,避免手动写死版本。
# Keep only non-prerelease tags; `sort -V | tail -n1` selects the highest version.
IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gateway-api-inference-extension/releases \
  | jq -r '.[] | select(.prerelease == false) | .tag_name' \
  | sort -V \
  | tail -n1)
# 查看当前版本
echo "$IGW_LATEST_RELEASE"
v1.3.1
# 把Inference Extension所需的Kubernetes扩展API(CRD)安装到集群,使集群“认识”并能创建Inference相关自定义资源。
kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_LATEST_RELEASE}/manifests.yaml"
customresourcedefinition.apiextensions.k8s.io/inferencemodelrewrites.inference.networking.x-k8s.io configured
customresourcedefinition.apiextensions.k8s.io/inferenceobjectives.inference.networking.x-k8s.io configured
customresourcedefinition.apiextensions.k8s.io/inferencepoolimports.inference.networking.x-k8s.io configured
customresourcedefinition.apiextensions.k8s.io/inferencepools.inference.networking.k8s.io configured
customresourcedefinition.apiextensions.k8s.io/inferencepools.inference.networking.x-k8s.io configured
AI aware Inference Routing-vLLM
1. 部署vLLM的模型服务
💡本样例以常见的单机多卡推理实例Deployment类型资源来做演示
kubectl apply -f - <<'EOF'
# vLLM Deployment: 3 replicas, each requesting one GPU and serving Qwen3-8B on port 8080.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    aitype: tuili
  name: qwen3-8b-demo
  namespace: roadshow
spec:
  replicas: 3
  selector:
    matchLabels:
      app: qwen3-8b-demo
  template:
    metadata:
      labels:
        aitype: tuili
        # Must match spec.selector and the Service selector below.
        app: qwen3-8b-demo
    spec:
      containers:
        - args:
            # Script passed to `/bin/bash -c`; seqNum is (GPU count - 1).
            - |
              seqNum=$(expr 1 - 1)
              CUDA_VISIBLE_DEVICES=$(seq -s, 0 $seqNum) /opt/conda/bin/python3 -m vllm.entrypoints.openai.api_server --model /workspace/model/Qwen3-8B --port 8080 --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --enable-auto-tool-choice --tool-call-parser granite --served-model-name Qwen3-8B --trust-remote-code
          command:
            - /bin/bash
            - -c
          env:
            - name: RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES
              value: "1"
          image: aiimage.wtsht.cn/tenant_public/vllm-mars:ai3.3-torch2.6-py312-ubuntu22.04-amd64
          imagePullPolicy: IfNotPresent
          name: qwen3-8b-demo-container-01
          resources:
            limits:
              cpu: "12"
              ephemeral-storage: 50Gi
              mars-tech.com/gpu: "1"
              memory: 96Gi
            requests:
              cpu: "12"
              ephemeral-storage: 50Gi
              mars-tech.com/gpu: "1"
              memory: 96Gi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /workspace/model
              name: localmodelvolume
              readOnly: true
            - mountPath: /dev/shm
              name: dshm
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: volcano
      securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
        # Model weights are read from a host directory.
        - hostPath:
            path: /zion0/modelsrepo/models
            type: Directory
          name: localmodelvolume
        # Memory-backed /dev/shm.
        - emptyDir:
            medium: Memory
            sizeLimit: 15Gi
          name: dshm
---
# NodePort Service exposing the model servers on node port 31301.
apiVersion: v1
kind: Service
metadata:
  name: qwen3-8b-demo-service
  namespace: roadshow
  labels:
    app: qwen3-8b-demo
spec:
  type: NodePort
  selector:
    app: qwen3-8b-demo
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      nodePort: 31301
EOF
# 通过内网检查验证服务可用性
curl -sS http://10.8.17.200:31301/v1/models
{"object":"list","data":[{"id":"Qwen3-8B","object":"model","created":1772164142,"owned_by":"vllm","root":"/workspace/model/Qwen3-8B","parent":null,"max_model_len":131072,"permission":[{"id":"modelperm-a82e27138dc5437484b13e3f91486070","object":"model_permission","created":1772164142,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
curl -sS http://10.8.17.200:31301/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model":"Qwen3-8B",
    "messages":[{"role":"user","content":"你好,用一句话自我介绍"}],
    "temperature":0.2
  }'
{"id":"chatcmpl-9d2d479a58404ecab00b7a4cd5c5172e","object":"chat.completion","created":1772164152,"model":"Qwen3-8B","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\\n好的,用户让我用一句话自我介绍。首先,我需要确定用户的需求是什么。他们可能是在测试我的反应速度,或者想快速了解我的功能。作为AI助手,我应该简洁明了地介绍自己,同时突出核心功能。\\n\\n接下来,我要考虑用户可能的场景。也许他们刚接触这个AI,或者需要快速获取信息。这时候,一句话的自我介绍要包含关键点:我是谁,我能做什么,以及我的目标。比如,提到我是通义千问,基于通义实验室的模型,擅长回答问题、创作文字等,同时强调帮助用户解决问题和创造价值。\\n\\n然后,用户的身份可能是什么?可能是普通用户,也可能是开发者或研究人员。如果是普通用户,他们可能更关注实用功能;如果是开发者,可能对技术细节更感兴趣。不过用户只要求一句话,所以需要保持通用性,不涉及太多技术术语。\\n\\n用户可能没有说出来的深层需求是什么?他们可能希望知道我的可靠性和能力范围,或者想确认我是否能处理他们的具体任务。因此,在自我介绍中,我需要传达出专业性和多方面的技能,让用户感到信任和有用。\\n\\n还要注意语气要友好,符合中文表达习惯。避免过于生硬或冗长。比如使用“您好”开头,然后简要说明身份和功能,最后以帮助用户为目标。同时,确保信息准确,比如提到我的训练数据截止到2024年4月,这样用户知道我的知识是最新且有限的。\\n\\n最后,检查是否符合要求:一句话,简洁,包含关键信息。确保没有使用复杂结构,让句子流畅自然。例如:“您好,我是通义千问,基于通义实验室的超大规模语言模型,擅长回答问题、创作文字、编程和逻辑推理,致力于为您提供准确、有用的信息和帮助。” 这样既涵盖了主要功能,又明确了目标,符合用户的需求。\\n</think>\\n\\n您好,我是通义千问,基于通义实验室的超大规模语言模型,擅长回答问题、创作文字、编程和逻辑推理,致力于为您提供准确、有用的信息和帮助。","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"stop","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":14,"total_tokens":448,"completion_tokens":434,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}root@homelab:~/charts/inferencepool#
2.部署网关实例
# 配置监听8090端口的agentgateway网关实例
kubectl apply -f - <<'EOF'
# Gateway of class agentgateway with one HTTP listener on port 8090.
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: llm
  namespace: kgateway-system
spec:
  gatewayClassName: agentgateway
  listeners:
    # Routes from every namespace may attach to this listener.
    - allowedRoutes:
        namespaces:
          from: All
      name: http
      port: 8090
      protocol: HTTP
EOF
# 检查
kubectl get gateway llm -n kgateway-system
NAME CLASS ADDRESS PROGRAMMED AGE
llm agentgateway 10.8.17.152 True 69d
3.配置路由
kubectl apply -f - <<'EOF'
# Routes llm.wtsht.cn/roadshow/qwen3-8b-demo/* to the qwen3-8b-demo InferencePool.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  labels:
    capsule.clastix.io/managed-by: hypersuite
  name: qwen3-8b-demo-route
  namespace: roadshow
spec:
  hostnames:
    - llm.wtsht.cn
  parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: llm
      namespace: kgateway-system
  rules:
    - backendRefs:
        - group: inference.networking.k8s.io
          kind: InferencePool
          name: qwen3-8b-demo
      filters:
        # Replace the matched route prefix with "/" before forwarding.
        - type: URLRewrite
          urlRewrite:
            path:
              replacePrefixMatch: /
              type: ReplacePrefixMatch
      matches:
        - path:
            type: PathPrefix
            value: /roadshow/qwen3-8b-demo
EOF
4.部署InferencePool与Endpoint Picker Extension
# 通过helm安装inferencepool和epp
helm upgrade --install qwen3-8b-demo . \
  --namespace roadshow --create-namespace \
  --dependency-update \
  --set inferencePool.modelServers.matchLabels.app=qwen3-8b-demo \
  -f values.yaml
NAME: qwen3-8b-demo
LAST DEPLOYED: Fri Feb 27 04:26:17 2026
NAMESPACE: roadshow
STATUS: deployed
REVISION: 1
DESCRIPTION: Install complete
TEST SUITE: None
NOTES:
InferencePool qwen3-8b-demo deployed.
# 查看helm创建了哪些资源
helm -n roadshow status qwen3-8b-demo
NAME: qwen3-8b-demo
LAST DEPLOYED: Fri Feb 27 04:26:17 2026
NAMESPACE: roadshow
STATUS: deployed
REVISION: 1
DESCRIPTION: Install complete
RESOURCES:
==> v1/ConfigMap
NAME DATA AGE
qwen3-8b-demo-epp 1 2m46s
==> v1/Role
NAME CREATED AT
qwen3-8b-demo-epp 2026-02-27T04:26:19Z
==> v1/RoleBinding
NAME ROLE AGE
qwen3-8b-demo-epp Role/qwen3-8b-demo-epp 2m47s
==> v1/Service
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
qwen3-8b-demo-epp ClusterIP 10.101.199.180 <none> 9002/TCP,9090/TCP 2m47s
==> v1/Deployment
NAME READY UP-TO-DATE AVAILABLE AGE
qwen3-8b-demo-epp 1/1 1 1 2m47s
==> v1/Pod(related)
NAME READY STATUS RESTARTS AGE
qwen3-8b-demo-epp-679ff99955-g5s5f 1/1 Running 0 2m47s
==> v1/InferencePool
NAME AGE
qwen3-8b-demo 2m47s
==> v1/ServiceAccount
NAME SECRETS AGE
qwen3-8b-demo-epp 0 2m46s
TEST SUITE: None
NOTES:
InferencePool qwen3-8b-demo deployed.
5.测试与验证
# 缺省pool内使用到的负载均衡策略
kubectl -n roadshow get cm qwen3-8b-demo-epp -o yaml
apiVersion: v1
data:
default-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: queue-scorer
- type: kv-cache-utilization-scorer
- type: prefix-cache-scorer
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
weight: 2
- pluginRef: kv-cache-utilization-scorer
weight: 2
- pluginRef: prefix-cache-scorer
weight: 3
kind: ConfigMap
metadata:
annotations:
meta.helm.sh/release-name: qwen3-8b-demo
meta.helm.sh/release-namespace: roadshow
creationTimestamp: "2026-03-13T03:31:53Z"
labels:
app.kubernetes.io/managed-by: Helm
capsule.clastix.io/managed-by: livedemo
name: qwen3-8b-demo-epp
namespace: roadshow
resourceVersion: "357726537"
uid: 7133950d-a3ba-4bba-8b3c-509c1abd1195
# 查看每个副本实际命中情况
kubectl get pods -n roadshow -o wide
qwen3-8b-demo-866dbc74c4-mjhq6 1/1 Running 0 3h57m 192.168.107.194 gpu-worker-65 <none> <none>
qwen3-8b-demo-866dbc74c4-xmz7j 1/1 Running 0 3h57m 192.168.112.49 gpu-worker-72 <none> <none>
qwen3-8b-demo-epp-679ff99955-wrg94 1/1 Running 0 33h 192.168.117.45 gpu-worker-77 <none> <none>
# 这个脚本会每秒查询一次2个 vLLM 后端 Pod 的 /metrics,实时显示每个实例当前正在处理的请求数、排队请求数、缓存使用率以及累计处理的 token。
# 它的目的是让你直观看到压测流量到底落到了哪些后端实例上,以及是否真的在多个 Pod 之间分摊。
watch -n 1 '
for ip in 192.168.107.194 192.168.112.49; do
  echo "===== $ip ====="
  curl -sS "http://$ip:8080/metrics" | grep -E "vllm:num_requests_running|vllm:num_requests_waiting|vllm:gpu_cache_usage_perc|vllm:request_prompt_tokens_sum|vllm:generation_tokens_total"
  echo
done
'
Every 1.0s: master-01: Sat Mar 14 13:20:08 2026
===== 192.168.107.194 =====
# HELP vllm:num_requests_running Number of requests in model execution batches.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{engine="0",model_name="Qwen3-8B"} 0.0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{engine="0",model_name="Qwen3-8B"} 0.0
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{engine="0",model_name="Qwen3-8B"} 509.0
vllm:request_prompt_tokens_sum{engine="0",model_name="Qwen3-8B"} 30.0
===== 192.168.112.49 =====
# HELP vllm:num_requests_running Number of requests in model execution batches.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{engine="0",model_name="Qwen3-8B"} 0.0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{engine="0",model_name="Qwen3-8B"} 0.0
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{engine="0",model_name="Qwen3-8B"} 0.0
vllm:request_prompt_tokens_sum{engine="0",model_name="Qwen3-8B"} 0.0
# 验证业务流程是否正常
curl -sS -H 'Host: llm.wtsht.cn' http://10.8.17.152:8090/roadshow/qwen3-8b-demo/v1/models
{"data":[{"created":1772166885,"id":"Qwen3-8B","max_model_len":131072,"object":"model","owned_by":"vllm","parent":null,"permission":[{"allow_create_engine":false,"allow_fine_tuning":false,"allow_logprobs":true,"allow_sampling":true,"allow_search_indices":false,"allow_view":true,"created":1772166885,"group":null,"id":"modelperm-2024f940d92448d8bac1378ef1879bef","is_blocking":false,"object":"model_permission","organization":"*"}],"root":"/workspace/model/Qwen3-8B"}],"object":"list"}
curl -sS -H 'Host: llm.wtsht.cn' http://10.8.17.152:8090/roadshow/qwen3-8b-demo/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model":"Qwen3-8B",
    "messages":[{"role":"user","content":"你好,用一句话自我介绍"}],
    "temperature":0.2
  }'
{"choices":[{"finish_reason":"stop","index":0,"logprobs":null,"message":{"annotations":null,"audio":null,"content":"\\u003cthink\\u003e\\n好的,用户让我用一句话自我介绍。首先,我需要确定用户的需求是什么。他们可能是在测试我的反应速度,或者想快速了解我的功能。作为AI助手,我应该简洁明了地介绍自己,同时突出核心功能。\\n\\n接下来,我要考虑用户可能的场景。也许他们刚接触这个AI,或者需要快速获取信息。这时候,一句话的自我介绍要包含关键点:我是谁,我能做什么,以及我的目标。比如,提到我是通义千问,由通义实验室研发,具备多语言支持和广泛的知识库,能够回答问题、创作文字等,旨在提供帮助。\\n\\n然后,用户可能的深层需求是什么?他们可能希望知道我的可靠性、多功能性,或者是否适合他们的特定任务。因此,我需要确保自我介绍中涵盖这些方面,比如强调我的训练数据和能力范围,让用户感到信任。\\n\\n还要注意语气要友好且专业,避免过于技术化的术语,保持自然。比如用“您好”开头,然后说明我的功能和目的。同时,保持句子简洁,不冗长,确保用户能快速理解。\\n\\n最后,检查是否有遗漏的重要信息,比如是否需要提到我的应用场景或用户群体。不过用户只要求一句话,所以需要精简,把最重要的部分放在前面。确认没有错误后,组织成流畅的句子。\\n\\u003c/think\\u003e\\n\\n您好,我是通义千问,由通义实验室研发的超大规模语言模型,能够帮助您回答问题、创作文字、编程、分析数据等,致力于提供高效、准确的智能服务。","function_call":null,"reasoning_content":null,"refusal":null,"role":"assistant","tool_calls":[]},"stop_reason":null,"token_ids":null}],"created":1772166908,"id":"chatcmpl-5decd207-7911-4adc-8732-0d3d0051ac05","kv_transfer_params":null,"model":"Qwen3-8B","object":"chat.completion","prompt_logprobs":null,"prompt_token_ids":null,"service_tier":null,"system_fingerprint":null,"usage":{"completion_tokens":322,"prompt_tokens":14,"prompt_tokens_details":null,"total_tokens":336}}
# 执行压测
# Resolve the gateway address from the Gateway status.
GWIP=$(kubectl -n kgateway-system get gateway llm -o jsonpath='{.status.addresses[0].value}')
# Build one long prompt (the phrase repeated 8000 times).
LONG=$(python3 - <<'PY'
print("请重复输出以下段落并逐句改写,保持语义一致:" + "你好。"*8000)
PY
)
# Fire 200 requests in the background, then wait for all of them.
for i in $(seq 1 200); do
  curl -sS -H 'Host: llm.wtsht.cn' \
    "http://$GWIP:8090/roadshow/qwen3-8b-demo/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d "$(cat <<EOF
{"model":"Qwen3-8B","messages":[{"role":"user","content":"$LONG"}],"max_tokens":1024,"temperature":0.2}
EOF
)" >/dev/null &
done
wait
# 验证EPP从modelserver拿到了用于选路的指标
# 通过epp的服务接口来查看是否真正拿到了modelserver的指标,队列大小会发生变化。
kubectl -n roadshow get svc qwen3-8b-demo-epp -o wide
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
qwen3-8b-demo-epp ClusterIP 10.101.199.180 <none> 9002/TCP,9090/TCP 11m inferencepool=qwen3-8b-demo-epp
curl -sS http://10.101.199.180:9090/metrics | grep -E "inference_pool_ready_pods|inference_pool_per_pod_queue_size|inference_pool_average_" | head -n 80
# HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool.
# TYPE inference_pool_average_kv_cache_utilization gauge
inference_pool_average_kv_cache_utilization{name="qwen3-8b-demo"} 0.04881823810044065
# HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue.
# TYPE inference_pool_average_queue_size gauge
inference_pool_average_queue_size{name="qwen3-8b-demo"} 48
# HELP inference_pool_per_pod_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying pod.
# TYPE inference_pool_per_pod_queue_size gauge
inference_pool_per_pod_queue_size{model_server_pod="qwen3-8b-demo-866dbc74c4-47k6n-rank-0",name="qwen3-8b-demo"} 0
inference_pool_per_pod_queue_size{model_server_pod="qwen3-8b-demo-866dbc74c4-5clwk-rank-0",name="qwen3-8b-demo"} 144
inference_pool_per_pod_queue_size{model_server_pod="qwen3-8b-demo-866dbc74c4-6hv5n-rank-0",name="qwen3-8b-demo"} 0
# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
# TYPE inference_pool_ready_pods gauge
inference_pool_ready_pods{name="qwen3-8b-demo"} 3
# 查看EPP metrics,确认是否由EPP来完成了请求的决策。
inference_extension_scheduler_attempts_total{status="success"} 5
这说明:EPP已经成功执行了5次调度决策(也就是至少有 5 次 ext-proc 调用触发了“选哪个 endpoint”)。
curl -sS http://10.101.199.180:9090/metrics | grep -E -i 'ext.?proc|grpc|http|request|decision|pick|schedule' | head -n 120
# HELP go_cpu_classes_gc_mark_idle_cpu_seconds_total Estimated total CPU time spent performing GC tasks on spare CPU resources that the Go scheduler could not otherwise find a use for. This should be subtracted from the total GC CPU time to obtain a measure of compulsory GC CPU time. This metric is an overestimate, and not directly comparable to system CPU time measurements. Compare only with other /cpu/classes metrics. Sourced from /cpu/classes/gc/mark/idle:cpu-seconds.
# HELP go_godebug_non_default_behavior_http2client_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=http2client=... setting. Sourced from /godebug/non-default-behavior/http2client:events.
# TYPE go_godebug_non_default_behavior_http2client_events_total counter
go_godebug_non_default_behavior_http2client_events_total 0
# HELP go_godebug_non_default_behavior_http2server_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=http2server=... setting. Sourced from /godebug/non-default-behavior/http2server:events.
# TYPE go_godebug_non_default_behavior_http2server_events_total counter
go_godebug_non_default_behavior_http2server_events_total 0
# HELP go_godebug_non_default_behavior_httpcookiemaxnum_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=httpcookiemaxnum=... setting. Sourced from /godebug/non-default-behavior/httpcookiemaxnum:events.
# TYPE go_godebug_non_default_behavior_httpcookiemaxnum_events_total counter
go_godebug_non_default_behavior_httpcookiemaxnum_events_total 0
# HELP go_godebug_non_default_behavior_httplaxcontentlength_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=httplaxcontentlength=... setting. Sourced from /godebug/non-default-behavior/httplaxcontentlength:events.
# TYPE go_godebug_non_default_behavior_httplaxcontentlength_events_total counter
go_godebug_non_default_behavior_httplaxcontentlength_events_total 0
# HELP go_godebug_non_default_behavior_httpmuxgo121_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=httpmuxgo121=... setting. Sourced from /godebug/non-default-behavior/httpmuxgo121:events.
# TYPE go_godebug_non_default_behavior_httpmuxgo121_events_total counter
go_godebug_non_default_behavior_httpmuxgo121_events_total 0
# HELP go_godebug_non_default_behavior_httpservecontentkeepheaders_events_total The number of non-default behaviors executed by the net/http package due to a non-default GODEBUG=httpservecontentkeepheaders=... setting. Sourced from /godebug/non-default-behavior/httpservecontentkeepheaders:events.
# TYPE go_godebug_non_default_behavior_httpservecontentkeepheaders_events_total counter
go_godebug_non_default_behavior_httpservecontentkeepheaders_events_total 0
# HELP go_sched_latencies_seconds Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running. Bucket counts increase monotonically. Sourced from /sched/latencies:seconds.
inference_extension_plugin_duration_seconds_bucket{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker",le="0.01"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker",le="0.02"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker",le="0.05"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker",le="0.1"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker",le="+Inf"} 5
inference_extension_plugin_duration_seconds_sum{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker"} 2.5848999999999996e-05
inference_extension_plugin_duration_seconds_count{extension_point="Picker",plugin_name="max-score-picker",plugin_type="max-score-picker"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer",le="0.0001"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer",le="0.0002"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer",le="0.0005"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer",le="0.001"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer",le="+Inf"} 5
inference_extension_plugin_duration_seconds_sum{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer"} 0.00010747600000000001
inference_extension_plugin_duration_seconds_count{extension_point="PreRequest",plugin_name="prefix-cache-scorer",plugin_type="prefix-cache-scorer"} 5
inference_extension_plugin_duration_seconds_bucket{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler",le="0.0001"} 10
inference_extension_plugin_duration_seconds_bucket{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler",le="0.0002"} 10
inference_extension_plugin_duration_seconds_bucket{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler",le="0.05"} 10
inference_extension_plugin_duration_seconds_bucket{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler",le="0.1"} 10
inference_extension_plugin_duration_seconds_sum{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler"} 4.733e-06
inference_extension_plugin_duration_seconds_count{extension_point="ProfilePicker",plugin_name="single-profile-handler",plugin_type="single-profile-handler"} 10
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
# TYPE inference_extension_scheduler_attempts_total counter
inference_extension_scheduler_attempts_total{status="success"} 5
# HELP inference_extension_scheduler_e2e_duration_seconds [ALPHA] End-to-end scheduling latency distribution in seconds.
# TYPE inference_extension_scheduler_e2e_duration_seconds histogram
inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0001"} 4
inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0002"} 4
inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0005"} 4
inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.001"} 5
inference_extension_scheduler_e2e_duration_seconds_bucket{le="+Inf"} 5
inference_extension_scheduler_e2e_duration_seconds_sum 0.001092388
inference_extension_scheduler_e2e_duration_seconds_count 5
... ...
inference_objective_request_duration_seconds_bucket{model_name="Qwen3-8B",target_model_name="Qwen3-8B",le="1200"} 5
inference_objective_request_duration_seconds_bucket{model_name="Qwen3-8B",target_model_name="Qwen3-8B",le="1800"} 5
inference_objective_request_duration_seconds_bucket{model_name="Qwen3-8B",target_model_name="Qwen3-8B",le="2700"} 5
inference_objective_request_duration_seconds_bucket{model_name="Qwen3-8B",target_model_name="Qwen3-8B",le="3600"} 5
inference_objective_request_duration_seconds_bucket{model_name="Qwen3-8B",target_model_name="Qwen3-8B",le="+Inf"} 5
inference_objective_request_duration_seconds_sum{model_name="Qwen3-8B",target_model_name="Qwen3-8B"} 5.2231309790000005
inference_objective_request_duration_seconds_count{model_name="Qwen3-8B",target_model_name="Qwen3-8B"} 5
# 模拟真实用户外网访问
curl -sS https://llm.wtsht.cn/roadshow/qwen3-8b-demo/v1/models
curl -sS https://llm.wtsht.cn/roadshow/qwen3-8b-demo/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model":"Qwen3-8B",
    "messages":[{"role":"user","content":"你好,来一个1000字的自我介绍"}],
    "temperature":0.2
  }'
其它
💡本样例多为多机多卡场景,使用lws资源
配置vLLM分布式模型服务
# LeaderWorkerSet: one group (replicas: 1) of two pods (size: 2), a leader and a worker,
# each requesting 4 GPUs. The leader also starts the vLLM OpenAI API server on port 8080.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  labels:
    capsule.clastix.io/managed-by: livedemo
  name: qwen25-72b-demo
  namespace: roadshow
spec:
  leaderWorkerTemplate:
    leaderTemplate:
      metadata:
        annotations:
          k8s.v1.cni.cncf.io/networks: '[ { "name": "compute-a-net01", "namespace": "nad" }, { "name": "compute-b-net01", "namespace": "nad" } ]'
        labels:
          role: leader
          app: qwen25-72b-demo-leader
      spec:
        containers:
          - command:
              - sh
              - -c
              # $(LWS_GROUP_SIZE) uses Kubernetes $(VAR) substitution; presumably
              # injected by the LeaderWorkerSet controller - TODO confirm.
              - |
                export MCCL_IB_HCA=$(/usr/sbin/show_gids |grep -E "mlx5_*.*10.*v2"| awk '{print $1}'|paste -sd ',' -);
                export HCCL_IB_HCA=$(/usr/sbin/show_gids |grep -E "mlx5_*.*10.*v2"| awk '{print $1}'|paste -sd ',' -);
                export MCCL_IB_GID_INDEX=$(/usr/sbin/show_gids |grep -E "mlx5_0.*10.*v2"| awk '{print $3}');
                export HCCL_IB_GID_INDEX=$(/usr/sbin/show_gids |grep -E "mlx5_0.*10.*v2"| awk '{print $3}');
                seqNum=$(expr 4 - 1);
                export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $seqNum);
                # Patch ray start to advertise GPU resources (head)
                sed -i 's/ray start --num-gpus=8/ray start --num-gpus=4 /g' /workspace/multi-node-serving.sh;
                bash /workspace/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                seqNum=$(expr 4 - 1);
                CUDA_VISIBLE_DEVICES=$(seq -s, 0 $seqNum) /opt/conda/bin/python3 -m vllm.entrypoints.openai.api_server \
                  --port 8080 --model /workspace/model/Qwen2.5-72B-Instruct --gpu-memory-utilization 0.8 \
                  --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \
                  --enable-auto-tool-choice --tool-call-parser granite --tensor-parallel-size 2 \
                  --pipeline_parallel_size 2 --served-model-name Qwen2.5-72B-Instruct \
                  --trust-remote-code
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: TRITON_ENABLE_MACA_OPT_MOVE_DOT_OPERANDS_OUT_LOOP
                value: "1"
              - name: TRITON_ENABLE_MACA_CHAIN_DOT_OPT
                value: "1"
              - name: TRITON_DISABLE_MACA_OPT_MMA_PREFETCH
                value: "1"
              - name: TRITON_ENABLE_MACA_COMPILER_INT8_OPT
                value: "True"
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_IGNORE_CPU_AFFINITY
                value: "1"
              - name: RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES
                value: "1"
            image: aiimage.wtsht.cn/tenant_public/vllm-mars:ai3.3-torch2.6-py312-ubuntu22.04-amd64
            name: vllm-leader
            ports:
              - containerPort: 8080
                protocol: TCP
            # Ready once the API server port accepts TCP connections.
            readinessProbe:
              initialDelaySeconds: 15
              periodSeconds: 10
              tcpSocket:
                port: 8080
            resources:
              limits:
                cpu: "48"
                ephemeral-storage: 50Gi
                mars-tech.com/gpu: "4"
                memory: 384Gi
              requests:
                cpu: "48"
                ephemeral-storage: 50Gi
                mars-tech.com/gpu: "4"
                memory: 384Gi
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /workspace/model
                name: localmodelvolume
                readOnly: true
        nodeSelector:
          location: compute-zone01
        schedulerName: volcano
        securityContext:
          seccompProfile:
            type: Unconfined
        volumes:
          - emptyDir:
              medium: Memory
              sizeLimit: 15Gi
            name: dshm
          - hostPath:
              path: /zion0/modelsrepo/models
              type: Directory
            name: localmodelvolume
    restartPolicy: RecreateGroupOnPodRestart
    size: 2
    workerTemplate:
      metadata:
        annotations:
          k8s.v1.cni.cncf.io/networks: '[ { "name": "compute-a-net01", "namespace": "nad" }, { "name": "compute-b-net01", "namespace": "nad" } ]'
      spec:
        containers:
          - command:
              - sh
              - -c
              # $(LWS_LEADER_ADDRESS) uses Kubernetes $(VAR) substitution; presumably
              # injected by the LeaderWorkerSet controller - TODO confirm.
              - |
                export MCCL_IB_HCA=$(/usr/sbin/show_gids |grep -E "mlx5_*.*10.*v2"| awk '{print $1}'|paste -sd ',' -);
                export HCCL_IB_HCA=$(/usr/sbin/show_gids |grep -E "mlx5_*.*10.*v2"| awk '{print $1}'|paste -sd ',' -);
                export MCCL_IB_GID_INDEX=$(/usr/sbin/show_gids |grep -E "mlx5_0.*10.*v2"| awk '{print $3}');
                export HCCL_IB_GID_INDEX=$(/usr/sbin/show_gids |grep -E "mlx5_0.*10.*v2"| awk '{print $3}');
                seqNum=$(expr 4 - 1);
                export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $seqNum);
                sed -i 's/ray start --num-gpus=8/ray start --num-gpus=4 /g' /workspace/multi-node-serving.sh;
                bash /workspace/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS);
            env:
              - name: TRITON_ENABLE_MACA_OPT_MOVE_DOT_OPERANDS_OUT_LOOP
                value: "1"
              - name: TRITON_ENABLE_MACA_CHAIN_DOT_OPT
                value: "1"
              - name: TRITON_DISABLE_MACA_OPT_MMA_PREFETCH
                value: "1"
              - name: TRITON_ENABLE_MACA_COMPILER_INT8_OPT
                value: "True"
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_IGNORE_CPU_AFFINITY
                value: "1"
              - name: RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES
                value: "1"
            image: aiimage.wtsht.cn/tenant_public/vllm-mars:ai3.3-torch2.6-py312-ubuntu22.04-amd64
            name: vllm-worker
            resources:
              limits:
                cpu: "48"
                ephemeral-storage: 50Gi
                mars-tech.com/gpu: "4"
                memory: 384Gi
              requests:
                cpu: "48"
                ephemeral-storage: 50Gi
                mars-tech.com/gpu: "4"
                memory: 384Gi
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /workspace/model
                name: localmodelvolume
                readOnly: true
        nodeSelector:
          location: compute-zone01
        securityContext:
          seccompProfile:
            type: Unconfined
        volumes:
          - emptyDir:
              medium: Memory
              sizeLimit: 15Gi
            name: dshm
          - hostPath:
              path: /zion0/modelsrepo/models
              type: Directory
            name: localmodelvolume
  networkConfig:
    subdomainPolicy: Shared
  replicas: 1
  rolloutStrategy:
    rollingUpdateConfiguration:
      maxSurge: 0
      maxUnavailable: 1
      partition: 0
    type: RollingUpdate
  startupPolicy: LeaderCreated
配置路由
💡网关沿用之前的网关配置
kubectl apply -f - <<'EOF'
# Routes llm.wtsht.cn/roadshow/qwen25-72b-demo/* to the qwen25-72b-demo InferencePool.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  labels:
    capsule.clastix.io/managed-by: livedemo
  name: llm-qwen25-72b-demo-route
  namespace: roadshow
spec:
  hostnames:
    - llm.wtsht.cn
  parentRefs:
    # A Gateway parent belongs to the gateway.networking.k8s.io API group;
    # the inference.networking.k8s.io group applies only to the InferencePool backend.
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: llm
      namespace: kgateway-system
  rules:
    - backendRefs:
        - group: inference.networking.k8s.io
          kind: InferencePool
          name: qwen25-72b-demo
      filters:
        # Replace the matched route prefix with "/" before forwarding.
        - type: URLRewrite
          urlRewrite:
            path:
              replacePrefixMatch: /
              type: ReplacePrefixMatch
      matches:
        - path:
            type: PathPrefix
            value: /roadshow/qwen25-72b-demo
EOF
配置InferencePool
# The pool selects only pods labelled app=qwen25-72b-demo-leader.
helm upgrade --install qwen25-72b-demo . \
  --namespace roadshow \
  --dependency-update \
  --set inferencePool.modelServers.matchLabels.app=qwen25-72b-demo-leader \
  -f values.yaml
验证测试
curl -sS https://llm.wtsht.cn/roadshow/qwen25-72b-demo/v1/models \
  -H 'Authorization: Bearer k-123'
# -N disables curl output buffering so streamed chunks print as they arrive.
curl -N https://llm.wtsht.cn/roadshow/qwen25-72b-demo/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer k-123' \
  -d '{
    "model":"Qwen2.5-72B-Instruct",
    "messages":[{"role":"user","content":"你好,用一句话自我介绍"}],
    "max_tokens":256,
    "temperature":0.2,
    "stream": true
  }'
AI Aware Inference Routing-SGLang
1. 部署SGLang的模型服务
💡本样例以常见的单机多卡推理实例Deployment类型资源来做演示
kubectl apply -f - <<'EOF'
# SGLang Deployment: one replica using 8 GPUs (--tp 8), serving Qwen3-32B on port 8080.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-qwen32b
  namespace: demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-qwen32b
  template:
    metadata:
      labels:
        app: sglang-qwen32b
        inference.networking.k8s.io/engine-type: sglang
    spec:
      securityContext:
        seccompProfile:
          type: Unconfined
      containers:
        - name: sglang
          image: 10.8.17.100:60066/sglang/sglang:0.5.4-hpcc.ai3.3.0.13-torch2.6-py310-ubuntu22.04-amd64
          imagePullPolicy: IfNotPresent
          env:
            - name: HPCC_SMALL_PAGESIZE_ENABLE
              value: "1"
            - name: PYTORCH_ENABLE_PG_HIGH_PRIORITY_STREAM
              value: "1"
            - name: HPCC_VISIBLE_DEVICE
              value: "0,1,2,3,4,5,6,7"
            - name: TRITON_ENABLE_HPCC_OPT_MOVE_DOT_OPERANDS_OUT_LOOP
              value: "1"
            - name: TRITON_DISABLE_HPCC_OPT_MMA_PREFETCH
              value: "1"
            - name: TRITON_ENABLE_HPCC_CHAIN_DOT_OPT
              value: "1"
            - name: TRITON_ENABLE_HPCC_COMPILER_INT8_OPT
              value: "True"
            - name: VLLM_PP_LAYER_PARTITION
              value: "16,15,15,15"
          command:
            - sh
            - -c
            - |
              /opt/conda/bin/python3 -m sglang.launch_server \
                --model-path /workspace/model/Qwen3-32B \
                --served-model-name Qwen3-32B \
                --tp 8 \
                --dp 1 \
                --nnodes 1 \
                --node-rank 0 \
                --dist-init-addr 127.0.0.1:5000 \
                --trust-remote-code \
                --attention-backend flashinfer \
                --enable-dp-attention \
                --enable-metrics \
                --host 0.0.0.0 \
                --port 8080
          ports:
            # Matches the server's --port and the Service targetPort (was 30000).
            - name: http
              containerPort: 8080
          resources:
            limits:
              mars-tech.com/gpu: "8"
            requests:
              mars-tech.com/gpu: "8"
          securityContext:
            capabilities:
              add:
                - IPC_LOCK
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: localmodelvolume
              mountPath: /workspace/model
              readOnly: true
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 100Gi
        - hostPath:
            path: /zion0/modelsrepo/models
            type: Directory
          name: localmodelvolume
---
# NodePort Service exposing the SGLang server on node port 30287.
apiVersion: v1
kind: Service
metadata:
  name: sglang-qwen32b
  namespace: demo
spec:
  type: NodePort
  selector:
    app: sglang-qwen32b
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      nodePort: 30287
EOF
# 验证
# 模型服务是否正常
curl -sS 'http://10.8.17.200:30287/v1/models'
# 测试metrics是否暴露
curl -s 'http://10.8.17.200:30287/metrics' | grep 'sglang:num_queue_reqs'
curl -s 'http://10.8.17.200:30287/metrics' | grep 'sglang:num_running_reqs'
curl -s 'http://10.8.17.200:30287/metrics' | grep 'sglang:token_usage'
# HELP sglang:num_queue_reqs The number of requests in the waiting queue.
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{engine_type="unified",model_name="/workspace/model/Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# HELP sglang:num_running_reqs The number of running requests.
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{engine_type="unified",model_name="/workspace/model/Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# HELP sglang:num_running_reqs_offline_batch The number of running low-priority offline batch requests(label is 'batch').
# TYPE sglang:num_running_reqs_offline_batch gauge
sglang:num_running_reqs_offline_batch{engine_type="unified",model_name="/workspace/model/Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# HELP sglang:token_usage The token usage.
# TYPE sglang:token_usage gauge
sglang:token_usage{engine_type="unified",model_name="/workspace/model/Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
2.部署网关实例
# 配置监听8090端口的agentgateway网关实例
kubectl apply -f - <<'EOF'
# Gateway "llm" of class agentgateway: one HTTP listener on port 8090 that
# accepts routes from every namespace.
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: llm
  namespace: kgateway-system
spec:
  gatewayClassName: agentgateway
  listeners:
    - name: http
      port: 8090
      protocol: HTTP
      allowedRoutes:
        namespaces:
          from: All
EOF
# 检查
kubectl get gateway llm -n kgateway-system
NAME CLASS ADDRESS PROGRAMMED AGE
llm agentgateway 10.8.17.152 True 69d
3.配置路由
kubectl apply -f - <<'EOF'
# HTTPRoute that attaches to the "llm" Gateway and forwards requests under
# /demo/sglang-qwen32b to the sglang-qwen32b-pool InferencePool.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  labels:
    capsule.clastix.io/managed-by: hypersuite
  name: sglang-qwen32b-route
  namespace: demo
spec:
  hostnames:
    - llm.wtsht.cn
  parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: llm
      namespace: kgateway-system
  rules:
    - backendRefs:
        - group: inference.networking.k8s.io
          kind: InferencePool
          name: sglang-qwen32b-pool
          namespace: demo
      filters:
        # Strip the matched route prefix before the request reaches the backend.
        - type: URLRewrite
          urlRewrite:
            path:
              replacePrefixMatch: /
              type: ReplacePrefixMatch
      matches:
        - path:
            type: PathPrefix
            value: /demo/sglang-qwen32b
EOF
4.部署InferencePool与Endpoint Picker Extension
# 通过helm安装inferencepool和epp
# Install or upgrade the chart in the current directory as release
# "sglang-qwen32b-pool"; the chart's own HTTPRoute is disabled via
# experimentalHttpRoute.enabled=false.
helm upgrade --install sglang-qwen32b-pool . \
  --namespace demo --create-namespace \
  --dependency-update \
  --set inferencePool.modelServers.matchLabels.app=sglang-qwen32b \
  --set inferencePool.modelServerType=sglang \
  --set experimentalHttpRoute.enabled=false \
  -f values.yaml
5.测试与验证
# 1.验证业务流程是否正常
# Call the gateway address directly and set the Host header so the request
# matches the route's hostname. "model" is the name passed to
# --served-model-name, not the filesystem path of the weights.
curl -sS 'http://10.8.17.152:8090/demo/sglang-qwen32b/v1/chat/completions' \
  -H 'Host: llm.wtsht.cn' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "Qwen3-32B",
    "messages": [
      {"role":"user","content":"你好,用一句话自我介绍"}
    ],
    "temperature": 0.2
  }'
# 2. 查看每个副本实际命中情况
kubectl get pods -n demo -l app=sglang-qwen32b -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
sglang-qwen32b-788dd97f98-hkcx8 1/1 Running 0 17m 192.168.117.49 gpu-worker-77 <none> <none>
sglang-qwen32b-788dd97f98-ngkcr 1/1 Running 0 5m58s 192.168.19.13 gpu-worker-96 <none> <none>
# 这个脚本会每秒查询一次 2 个 SGLang 后端 Pod 的 /metrics,实时显示每个实例当前正在处理的请求数、排队请求数以及 token 使用率(sglang:token_usage)。
# 它的目的是让你直观看到压测流量到底落到了哪些后端实例上,以及是否真的在多个 Pod 之间分摊。
# The quoted script is re-run by watch every second, so the URL must be a
# plain quoted string; a leading "<" would be parsed as input redirection.
watch -n 1 '
for ip in 192.168.117.49 192.168.19.13; do
  echo "===== $ip ====="
  curl -sS "http://$ip:8080/metrics" | grep -E "sglang:num_running_reqs|sglang:num_queue_reqs|sglang:token_usage"
  echo
done
'
# 3. 执行压测
# Fire 100 concurrent chat requests through the gateway, then wait for all
# background curls to finish. The heredoc delimiter is unquoted so that $i is
# expanded into each request body.
GWIP=10.8.17.152
for i in $(seq 1 100); do
  curl -sS "http://$GWIP:8090/demo/sglang-qwen32b/v1/chat/completions" \
    -H 'Host: llm.wtsht.cn' \
    -H 'Content-Type: application/json' \
    -d "$(cat <<EOF
{"model":"Qwen3-32B","messages":[{"role":"user","content":"请求编号 REQ-$i,请写一篇不少于2000字的自我介绍,并且每一段都要展开说明。"}],"max_tokens":1024,"temperature":0.2}
EOF
)" >/dev/null &
done
wait
在刚才执行watch的终端来查看
Every 1.0s: master-01: Mon Mar 16 15:30:10 2026
===== 192.168.117.49 =====
# HELP sglang:num_running_reqs The number of running requests.
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 43.0
# HELP sglang:token_usage The token usage.
# TYPE sglang:token_usage gauge
sglang:token_usage{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0021420527801805037
# HELP sglang:num_queue_reqs The number of requests in the waiting queue.
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# HELP sglang:num_running_reqs_offline_batch The number of running low-priority offline batch requests(label is 'batch').
# TYPE sglang:num_running_reqs_offline_batch gauge
sglang:num_running_reqs_offline_batch{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
===== 192.168.19.13 =====
# HELP sglang:num_running_reqs The number of running requests.
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 57.0
# HELP sglang:token_usage The token usage.
# TYPE sglang:token_usage gauge
sglang:token_usage{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0032260016823625975
# HELP sglang:num_queue_reqs The number of requests in the waiting queue.
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# HELP sglang:num_running_reqs_offline_batch The number of running low-priority offline batch requests(label is 'batch').
# TYPE sglang:num_running_reqs_offline_batch gauge
sglang:num_running_reqs_offline_batch{engine_type="unified",model_name="Qwen3-32B",pp_rank="0",tp_rank="0"} 0.0
# 4.查看epp对应metric记录
kubectl get svc sglang-qwen32b-pool-epp -n demo -o wide
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
sglang-qwen32b-pool-epp ClusterIP 10.96.60.237 <none> 9002/TCP,9090/TCP 50m inferencepool=sglang-qwen32b-pool-epp
curl -sS 'http://10.96.60.237:9090/metrics' | grep -E "inference_pool_ready_pods|inference_pool_per_pod_queue_size|inference_pool_average_"
# HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool.
# TYPE inference_pool_average_kv_cache_utilization gauge
inference_pool_average_kv_cache_utilization{name="sglang-qwen32b-pool"} 0
# HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue.
# TYPE inference_pool_average_queue_size gauge
inference_pool_average_queue_size{name="sglang-qwen32b-pool"} 0
# HELP inference_pool_per_pod_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying pod.
# TYPE inference_pool_per_pod_queue_size gauge
inference_pool_per_pod_queue_size{model_server_pod="sglang-qwen32b-788dd97f98-hkcx8-rank-0",name="sglang-qwen32b-pool"} 0
inference_pool_per_pod_queue_size{model_server_pod="sglang-qwen32b-788dd97f98-ngkcr-rank-0",name="sglang-qwen32b-pool"} 0
# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
# TYPE inference_pool_ready_pods gauge
inference_pool_ready_pods{name="sglang-qwen32b-pool"} 2
# 5. 模拟真实用户外网访问
curl -sS 'https://llm.wtsht.cn/demo/sglang-qwen32b/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
    "model":"Qwen3-32B",
    "messages":[{"role":"user","content":"你好,来一个1000字的自我介绍"}],
    "temperature":0.2
  }'
