华为昇腾910B部署Qwen3-Reranker-8B
容器下载
1
docker pull quay.io/ascend/vllm-ascend:v0.9.2rc1
容器编排文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Docker Compose file: serve Qwen3-Reranker-8B with vllm-ascend on Ascend 910B.
# NOTE: the 'version' key is ignored by Compose v2+, kept for older docker-compose.
version: '3.8'
services:
  Qwen3-Reranker-8B:
    image: quay.io/ascend/vllm-ascend:v0.9.2rc1
    container_name: Qwen3-Reranker-8B
    # Ascend NPU device nodes passed through to the container.
    # Two davinci cards are mapped — must match --tensor-parallel-size below.
    devices:
      - "/dev/davinci4:/dev/davinci4"
      - "/dev/davinci5:/dev/davinci5"
      - "/dev/davinci_manager:/dev/davinci_manager"
      - "/dev/devmm_svm:/dev/devmm_svm"
      - "/dev/hisi_hdc:/dev/hisi_hdc"
    volumes:
      # Ascend driver tooling and runtime libraries from the host.
      - "/usr/local/dcmi:/usr/local/dcmi"
      - "/usr/local/bin/npu-smi:/usr/local/bin/npu-smi"
      - "/usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/"
      - "/usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info"
      - "/etc/ascend_install.info:/etc/ascend_install.info"
      - "/root/.cache:/root/.cache"
      # Model weights, mounted read-only.
      - "/mnt/nvme01/model/Qwen3-Reranker-8B/:/mnt/nvme01/model/Qwen3-Reranker-8B/:ro"
    tty: true
    stdin_open: true
    # Host networking: the server listens directly on host port 21025.
    network_mode: host
    command: >
      python3 -m vllm.entrypoints.openai.api_server
      --model /mnt/nvme01/model/Qwen3-Reranker-8B
      --max-num-seqs 50
      --max-model-len 32768
      --max-num-batched-tokens 32768
      --dtype bfloat16
      --tensor-parallel-size 2
      --block-size 128
      --host 0.0.0.0
      --port 21025
      --gpu-memory-utilization 0.9
      --served-model-name Qwen3-Reranker-8B
      --enable-prefix-caching
其他注意事项
1
--tensor-parallel-size # 用于配置张量并行使用的 NPU 卡数,必须与编排文件 devices 中映射的昇腾卡数量(本例为 2)保持一致
测试服务
1
curl -X GET "http://localhost:21025/v1/models"
1
2
3
4
5
6
7
8
9
curl -X POST "http://localhost:21025/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen3-Reranker-8B",
"messages": [
{"role": "user", "content": "这是一个测试问题,用于验证模型是否正常工作。"}
],
"max_tokens": 100
}'
管理服务
1
2
3
4
5
6
# 启动服务
docker-compose -f /root/Qwen3-Reranker-8B/docker-compose.yml up -d
# 重启服务
docker-compose -f /root/Qwen3-Reranker-8B/docker-compose.yml restart
# 停止服务
docker-compose -f /root/Qwen3-Reranker-8B/docker-compose.yml down
本文由作者按照
CC BY 4.0
进行授权