mirror of
https://github.com/RYDE-WORK/ktransformers.git
synced 2026-01-19 12:43:16 +08:00
fix fp8 multi gpu; update FAQ
This commit is contained in:
parent
89b55052b8
commit
7e5962af3d
@ -93,3 +93,7 @@ Traceback (most recent call last):
|
||||
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
|
||||
```
|
||||
**SOLUTION**: This issue occurs when running ktransformers on Ubuntu 22.04 because the system's g++ version is too old, so its pre-defined macros do not include avx_bf16. We have tested and confirmed that it works with g++ 11.4 on Ubuntu 22.04.
|
||||
|
||||
### Q: Prefill is very slow when using fp8.
|
||||
|
||||
The FP8 kernel is built by JIT compilation, so the first run will be slow. Subsequent runs will be faster.
|
||||
@ -102,7 +102,8 @@ def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> t
|
||||
M, N = x.size()
|
||||
y = torch.empty_like(x, dtype=torch.get_default_dtype())
|
||||
grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
|
||||
weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
|
||||
with torch.cuda.device(x.device):
|
||||
weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
|
||||
return y
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user