
Commit 8431e82

Merge branch 'vllm-project:main' into main
2 parents: 952cdc3 + af647fb


50 files changed: +2685 −731 lines
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+name: Remove ready Label on notready Comment
+
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  add-ready-label:
+    runs-on: ubuntu-latest
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
+    steps:
+      - name: Remove ready label
+        uses: actions/github-script@v5
+        with:
+          script: |
+            github.rest.issues.removeLabel({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              name: 'ready'
+            })
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
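
Note: the workflow above removes the 'ready' label through the github-script action, which wraps the GitHub REST endpoint DELETE /repos/{owner}/{repo}/issues/{issue_number}/labels/{name}. A minimal Python sketch of the same call, for reference only (the repository, PR number, and token handling are illustrative assumptions, not part of this commit):

import os

import requests

def remove_ready_label(owner: str, repo: str, issue_number: int) -> None:
    # DELETE /repos/{owner}/{repo}/issues/{issue_number}/labels/{name}
    # removes a single label from an issue or pull request.
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/labels/ready"
    resp = requests.delete(
        url,
        headers={
            "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
            "Accept": "application/vnd.github+json",
        },
    )
    resp.raise_for_status()

# Example (hypothetical repository and PR number):
# remove_ready_label("vllm-project", "vllm", 1234)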

Dockerfile.tpu

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ RUN pip install "numpy<2"
 RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
+# Fix FastAPI dependence
+RUN pip install "starlette<0.38.0"
+
 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
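
Note: the new pin keeps starlette below 0.38.0 so that the FastAPI version installed in the TPU image keeps working. A rough sketch of how the constraint could be checked inside a built image, using only the standard library (not part of the commit):

from importlib.metadata import version

starlette_version = version("starlette")
major, minor = (int(part) for part in starlette_version.split(".")[:2])
# The Dockerfile pins starlette to "<0.38.0", so 0.38 or newer would indicate a broken pin.
assert (major, minor) < (0, 38), f"starlette {starlette_version} violates the <0.38.0 pin"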

benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 from vllm import _custom_ops as ops
 from vllm.utils import FlexibleArgumentParser
 
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 DEFAULT_TP_SIZES = [1]
 
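Note: dropping the [1:] slice means the benchmark's default model list no longer skips the first entry of WEIGHT_SHAPES; every model in the table is now benchmarked by default. A toy illustration of the difference (the dict below is a stand-in, not the real WEIGHT_SHAPES imported by the script):

# Stand-in mapping of model name -> weight shapes; the real table lives in the
# benchmark's weight-shapes module and is only sketched here.
WEIGHT_SHAPES = {
    "model-a": [(4096, 4096)],
    "model-b": [(8192, 4096)],
    "model-c": [(4096, 11008)],
}

old_default = list(WEIGHT_SHAPES.keys())[1:]  # ["model-b", "model-c"]: first model skipped
new_default = list(WEIGHT_SHAPES.keys())      # ["model-a", "model-b", "model-c"]: all models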

csrc/quantization/awq/gemm_kernels.cu

Lines changed: 0 additions & 23 deletions
@@ -17,14 +17,6 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 namespace vllm {
 namespace awq {
 
-// Pack two half values.
-static inline __device__ __host__ unsigned __pack_half2(const half x,
-                                                        const half y) {
-  unsigned v0 = *((unsigned short*)&x);
-  unsigned v1 = *((unsigned short*)&y);
-  return (v1 << 16) | v0;
-}
-
 template <int N>
 __global__ void __launch_bounds__(64)
     gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters,
@@ -42,11 +34,7 @@ __global__ void __launch_bounds__(64)
   __shared__ half A_shared[16 * (32 + 8)];
   __shared__ half B_shared[32 * (N + 8)];
 
-  __shared__ half scaling_factors_shared[N];
-  __shared__ half zeros_shared[N];
-
   int j_factors1 = ((OC + N - 1) / N);
-  int blockIdx_x = 0;
   int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1);
   int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1);
 
@@ -60,7 +48,6 @@ __global__ void __launch_bounds__(64)
 
   static constexpr int row_stride_warp = 32 * 8 / 32;
   static constexpr int row_stride = 2 * 32 * 8 / N;
-  bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < N;
   // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
   bool ld_A_flag =
       (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp +
@@ -145,11 +132,7 @@ __global__ void __launch_bounds__(64)
     uint32_t B_loaded =
         *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8));
     uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
-    // uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N /
-    // 8)) * 8);
 
-    // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x
-    // % (cta_N / 8)) * 8);
     // - zero and * scale
     // TODO (Haotian): can save 4 assembly instructions if sormulate as deq =
     // q * scale - zero * scale.
@@ -367,17 +350,11 @@ __global__ void __launch_bounds__(64)
 __global__ void __launch_bounds__(64)
     dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors,
                        int* __restrict__ zeros, half* __restrict__ C, int G) {
-  int j_factors1 = 4;
-  int row_stride2 = 4;
-  int split_k_iters = 1;
   static constexpr uint32_t ZERO = 0x0;
   half B_shared[32 * (128 + 8)];
 
   half* B_shared_ptr2 = B_shared;
 
-  half B_shared_warp[32];
-  int OC = 512;
-
   int N = blockDim.x * gridDim.x; // 2
   int col = (blockIdx.x * blockDim.x + threadIdx.x);
   int row = blockIdx.y * blockDim.y + threadIdx.y;