
Commit a057bfd

Merge branch 'vllm-project:main' into main
2 parents a587684 + c8a7e93

102 files changed: +7033 −5169 lines
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.409
+  - name: "exact_match,flexible-extract"
+    value: 0.406
+limit: 1000
+num_fewshot: 5
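This config records the GSM8K baselines (strict and flexible exact-match) that CI checks a vLLM run of the QQQ-quantized model against. As a rough illustration of how such a config could be consumed, the sketch below loads it, evaluates with lm-eval's vLLM backend, and compares each metric within a tolerance; the tolerance value and the helper name are assumptions for illustration, not part of this commit.

# Hypothetical sketch: compare a vLLM lm-eval run against the recorded baseline.
# RTOL and the hard-coded config path are assumed values, not taken from this commit.
import lm_eval
import numpy
import yaml

RTOL = 0.02
CONFIG = ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml"

def check_baseline(config_path: str = CONFIG) -> None:
    cfg = yaml.safe_load(open(config_path))
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={cfg['model_name']},tensor_parallel_size=1",
        tasks=[t["name"] for t in cfg["tasks"]],
        num_fewshot=cfg["num_fewshot"],
        limit=cfg["limit"],
        batch_size="auto")
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            # Fail the check if accuracy drifts outside the tolerance band.
            assert numpy.isclose(metric["value"], measured, rtol=RTOL)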

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml

.github/workflows/scripts/build.sh

Lines changed: 0 additions & 2 deletions
@@ -13,8 +13,6 @@ $python_executable -m pip install -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
-# Make sure punica is built for the release (for LoRA)
-export VLLM_INSTALL_PUNICA_KERNELS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build

CMakeLists.txt

Lines changed: 1 addition & 62 deletions
@@ -170,6 +170,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
@@ -222,61 +223,7 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)

-#
-# _punica_C extension
-#
-
-set(VLLM_PUNICA_EXT_SRC
-  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-  "csrc/punica/punica_ops.cu"
-  "csrc/punica/torch_bindings.cpp")
-
-#
-# Copy GPU compilation flags+update for punica
-#
-set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
-list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
-  "-D__CUDA_NO_HALF_OPERATORS__"
-  "-D__CUDA_NO_HALF_CONVERSIONS__"
-  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
-  "-D__CUDA_NO_HALF2_OPERATORS__")
-
-#
-# Filter out CUDA architectures < 8.0 for punica.
-#
-if (${VLLM_GPU_LANG} STREQUAL "CUDA")
-  set(VLLM_PUNICA_GPU_ARCHES)
-  foreach(ARCH ${VLLM_GPU_ARCHES})
-    string_to_ver(CODE_VER ${ARCH})
-    if (CODE_VER GREATER_EQUAL 8.0)
-      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
-    endif()
-  endforeach()
-  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
-  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
-  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-endif()

-if (VLLM_PUNICA_GPU_ARCHES)
-  define_gpu_extension_target(
-    _punica_C
-    DESTINATION vllm
-    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${VLLM_PUNICA_EXT_SRC}
-    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
-    USE_SABI 3
-    WITH_SOABI)
-else()
-  message(WARNING "Unable to create _punica_C target because none of the "
-    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
-endif()

 #
 # Add the `default` target which detects which extensions should be
@@ -300,12 +247,4 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)

-  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
-  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
-  # there are supported target arches.
-  if (VLLM_PUNICA_GPU_ARCHES AND
-      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
-    message(STATUS "Enabling punica extension.")
-    add_dependencies(default _punica_C)
-  endif()
 endif()

Dockerfile

Lines changed: 0 additions & 2 deletions
@@ -88,8 +88,6 @@ ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1

 ARG buildkite_commit
 ENV BUILDKITE_COMMIT=${buildkite_commit}

Dockerfile.rocm

Lines changed: 1 addition & 2 deletions
@@ -131,8 +131,7 @@ COPY . .
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade numba scipy huggingface-hub[cli]

-# Make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 # Silences the HF Tokenizers warning

benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

Lines changed: 8 additions & 1 deletion
@@ -112,13 +112,20 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

     timers = []
-    # pytorch impl
+    # pytorch impl - bfloat16
     timers.append(
         bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                  b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
                  torch.bfloat16, label, sub_label, pytorch_mm_impl,
                  "pytorch_bf16_bf16_bf16_matmul-no-scales"))

+    # pytorch impl - float16
+    timers.append(
+        bench_fn(a.to(dtype=torch.float16, device="cuda"),
+                 b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
+                 torch.float16, label, sub_label, pytorch_mm_impl,
+                 "pytorch_fp16_fp16_fp16_matmul-no-scales"))
+
     # cutlass impl
     timers.append(
         bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
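The change labels the existing bf16 torch baseline and adds a matching fp16 one, so the CUTLASS int8 timings can be compared against both dtypes. Below is a minimal standalone sketch of what those baselines measure, assuming pytorch_mm_impl reduces to a plain torch.mm; the shapes and labels are placeholder values, and bench_fn itself is defined elsewhere in this file.

# Standalone sketch of the bf16/fp16 matmul baselines; shapes are arbitrary examples.
import torch
import torch.utils.benchmark as benchmark

def time_matmul(dtype: torch.dtype, m: int = 512, k: int = 4096, n: int = 4096):
    a = torch.randn((m, k), device="cuda", dtype=dtype)
    b = torch.randn((k, n), device="cuda", dtype=dtype)
    timer = benchmark.Timer(
        stmt="torch.mm(a, b)",
        globals={"torch": torch, "a": a, "b": b},
        label="w8a8-baseline",
        sub_label=str(dtype))
    # Run repeatedly for at least one second and report the measurement.
    return timer.blocked_autorange(min_run_time=1)

if __name__ == "__main__":
    print(time_matmul(torch.bfloat16))
    print(time_matmul(torch.float16))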

csrc/ops.h

Lines changed: 7 additions & 0 deletions
@@ -115,6 +115,13 @@ void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                        torch::Tensor const& b_scales,
                        c10::optional<torch::Tensor> const& bias);

+torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
+                              torch::Tensor const& b_q_weight,
+                              torch::Tensor const& s_tok,
+                              torch::Tensor const& s_ch,
+                              torch::Tensor const& s_group,
+                              torch::Tensor& workspace, int64_t size_m,
+                              int64_t size_n, int64_t size_k);
 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
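The header now declares the QQQ Marlin GEMM implemented by the new marlin_qqq_gemm_kernel.cu. The sketch below shows how the op might be reached from Python once the extension is built; the torch.ops._C namespace, the shape inferences, and the reading of s_tok/s_ch/s_group as per-token, per-channel, and per-group scales are assumptions, not something this header states.

# Hypothetical wrapper around the new binding; everything here beyond the
# argument order from ops.h (a, b_q_weight, s_tok, s_ch, s_group, workspace,
# size_m, size_n, size_k) is an assumption for illustration.
import torch

def qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, s_tok: torch.Tensor,
             s_ch: torch.Tensor, s_group: torch.Tensor,
             workspace: torch.Tensor) -> torch.Tensor:
    size_m, size_k = a.shape   # quantized activations
    size_n = s_ch.numel()      # assume one channel scale per output column
    return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group,
                                        workspace, size_m, size_n, size_k)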

csrc/punica/LICENSE

Lines changed: 0 additions & 217 deletions
This file was deleted.

csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu

Lines changed: 0 additions & 5 deletions
This file was deleted.

csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu

Lines changed: 0 additions & 5 deletions
This file was deleted.
