
Deploying Qwen3-Embedding-8B on Huawei Ascend 910B

Pull the container image

docker pull quay.io/ascend/vllm-ascend:v0.9.2rc1
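Optionally, confirm the image is now available locally (a quick sanity check; the grep pattern below is just illustrative):

# List local images and filter for the vllm-ascend repository
docker images | grep vllm-ascend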

Docker Compose file

version: '3.8'

services:
  Qwen3-Embedding-8B:
    image: quay.io/ascend/vllm-ascend:v0.9.2rc1
    container_name: Qwen3-Embedding-8B
    devices:
      - "/dev/davinci6:/dev/davinci6"
      - "/dev/davinci7:/dev/davinci7"
      - "/dev/davinci_manager:/dev/davinci_manager"
      - "/dev/devmm_svm:/dev/devmm_svm"
      - "/dev/hisi_hdc:/dev/hisi_hdc"
    volumes:
      - "/usr/local/dcmi:/usr/local/dcmi"
      - "/usr/local/bin/npu-smi:/usr/local/bin/npu-smi"
      - "/usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/"
      - "/usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info"
      - "/etc/ascend_install.info:/etc/ascend_install.info"
      - "/root/.cache:/root/.cache"
      - "/mnt/nvme01/model/Qwen3-Embedding-8B/:/mnt/nvme01/model/Qwen3-Embedding-8B/:ro"
    tty: true
    stdin_open: true
    network_mode: host
    command: >
      python3 -m vllm.entrypoints.openai.api_server
      --model /mnt/nvme01/model/Qwen3-Embedding-8B
      --max-num-seqs 50
      --max-model-len 32768
      --max-num-batched-tokens 32768
      --dtype bfloat16
      --tensor-parallel-size 2
      --block-size 128
      --host 0.0.0.0
      --port 11025
      --gpu-memory-utilization 0.9
      --served-model-name Qwen3-Embedding-8B
      --enable-prefix-caching
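Before bringing the stack up, the compose file can be validated (assuming it is saved as /root/Qwen3-Embedding-8B/docker-compose.yml, the path used in the management commands below):

# Parse the compose file and print the resolved configuration; syntax errors are reported here
docker-compose -f /root/Qwen3-Embedding-8B/docker-compose.yml config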

Other notes

--tensor-parallel-size  # number of NPU cards used for tensor parallelism; should match the number of /dev/davinciX devices mapped under devices
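To decide which cards to map and how many to use, check the NPUs on the host first (npu-smi ships with the Ascend driver and is also mounted into the container by the compose file above):

# List NPUs with their IDs, health status, and memory/AI-core usage;
# device N reported here corresponds to /dev/davinciN
npu-smi info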

Test the service

curl -X GET "http://localhost:11025/v1/models"

Request an embedding:

curl -X POST "http://localhost:11025/v1/embeddings" \
-H "Content-Type: application/json" \
-d '{
  "model": "Qwen3-Embedding-8B",
  "input": "这是一个测试句子,用于验证 Embedding 模型是否正常工作。"
}'
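A successful response contains a vector under data[0].embedding; a quick way to check its length (assuming jq is installed on the host) is:

# Print the dimensionality of the returned embedding vector
curl -s -X POST "http://localhost:11025/v1/embeddings" \
-H "Content-Type: application/json" \
-d '{"model": "Qwen3-Embedding-8B", "input": "hello"}' | jq '.data[0].embedding | length'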

Manage the service

# Start the service
docker-compose -f /root/Qwen3-Embedding-8B/docker-compose.yml up -d
# Restart the service
docker-compose -f /root/Qwen3-Embedding-8B/docker-compose.yml restart
# Stop the service
docker-compose -f /root/Qwen3-Embedding-8B/docker-compose.yml down
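To follow the startup and inference logs (useful for confirming the model weights loaded on both NPUs):

# Tail the service logs
docker-compose -f /root/Qwen3-Embedding-8B/docker-compose.yml logs -f --tail 100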