vLLM:快速易用的 LLM 推理和服务库
大模型
在下载前,请先通过如下命令安装 ModelScope
pip install modelscope
ModelScope 下载默认存储到 ~/.cache/modelscope/hub(Linux/macOS)或 C:\Users\<用户名>\.cache\modelscope\hub(Windows)。--local_dir 参数可以指定下载目录。
多模态(Multi-modal)
- Qwen2.5-VL-3B-Instruct
modelscope download --model Qwen/Qwen2.5-VL-3B-Instruct --local_dir Qwen2.5-VL-3B-Instruct
- Qwen2.5-VL-7B-Instruct
modelscope download --model Qwen/Qwen2.5-VL-7B-Instruct --local_dir Qwen2.5-VL-7B-Instruct
转录(Transcriptions)
- Whisper large-v3
modelscope download --model openai-mirror/whisper-large-v3 --local_dir whisper-large-v3
- Whisper large-v3-turbo
modelscope download --model openai-mirror/whisper-large-v3-turbo --local_dir whisper-large-v3-turbo
部署
Docker
多模态(Multi-modal)
# Serve Qwen2.5-VL-7B-Instruct on host GPUs 0 and 1.
# --tensor-parallel-size 2 shards the model across the two visible GPUs;
# --served-model-name is the model id clients send in API requests;
# --max-model-len caps the context length at 32000 tokens.
CUDA_VISIBLE_DEVICES=0,1 \
vllm serve /data/models/vlm/Qwen2.5-VL-7B-Instruct \
--port 8000 \
--served-model-name Qwen2.5-VL \
--tensor-parallel-size 2 \
--max-model-len 32000
转录(Transcriptions)
# Serve Whisper large-v3 on host GPU 2 (transcription service, port 8081).
# NOTE(review): the model path sits under /data/models/vlm/ — confirm the
# Whisper weights really live in the "vlm" directory.
CUDA_VISIBLE_DEVICES=2 \
vllm serve /data/models/vlm/whisper-large-v3 \
--port 8081 \
--served-model-name whisper-large-v3
Docker Compose
# Docker Compose definition for the two vLLM services.
# Reconstructed with standard 2-space block indentation (the pasted original
# had all leading whitespace stripped, which is not valid YAML structure).
# NOTE: the `version` key is ignored by Compose v2+; kept for older clients.
version: '3.8'

services:
  qwen-vl-service:
    image: vllm/vllm-openai:latest
    container_name: qwen-vl-service
    environment:
      # The two GPUs reserved below are renumbered 0 and 1 inside the
      # container, so this value matches what vLLM actually sees.
      - CUDA_VISIBLE_DEVICES=0,1
    ports:
      - "8000:8000"
    volumes:
      - /data/models/vlm/Qwen2.5-VL-7B-Instruct:/model
    command:
      - --model
      - /model
      - --served-model-name
      - qwen2.5-vl
      - --tensor-parallel-size
      - "2"
      - --max-model-len
      - "32000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Pin host GPUs 0 and 1 explicitly (matches the bare-metal
              # example above). `count: 2` would let the runtime pick any
              # two GPUs, which could collide with whisper-service.
              device_ids: ["0", "1"]
              capabilities: [gpu]

  whisper-service:
    image: vllm/vllm-openai:latest
    container_name: whisper-service
    environment:
      # BUG FIX: the original set CUDA_VISIBLE_DEVICES=2 while reserving a
      # single GPU. Inside the container the reserved GPU is device 0, so
      # index 2 does not exist and no GPU would have been visible. Host
      # GPU 2 is pinned via device_ids below and addressed as 0 here.
      - CUDA_VISIBLE_DEVICES=0
    ports:
      # Host 8081 -> container 8000 (vLLM default port).
      - "8081:8000"
    volumes:
      - /data/models/vlm/whisper-large-v3-turbo:/model
    command:
      - --model
      - /model
      - --served-model-name
      - whisper-large-v3-turbo
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]
              capabilities: [gpu]