mirror of https://github.com/RYDE-WORK/ktransformers.git (synced 2026-01-19 21:03:18 +08:00)
commit 76b081879a
32 .github/workflows/book-ci.yml vendored Normal file
@@ -0,0 +1,32 @@
name: Book-CI

on:
  push:
    branches:
      - main
      # - server_support

  pull_request:
    branches:
      - main
      # - server_support
jobs:
  test:
    name: test
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: |
          rustup set profile minimal
          rustup toolchain install stable
          rustup default stable
      - name: Setup mdBook
        uses: peaceiris/actions-mdbook@v2
        with:
          mdbook-version: "latest"
      # - name: Run tests
      #   run: mdbook test
49 .github/workflows/deploy.yml vendored Normal file
@@ -0,0 +1,49 @@
name: Deploy

on:
  push:
    branches:
      - main
      # - server_support

  pull_request:
    branches:
      - main
      # - server_support

defaults:
  run:
    shell: bash

permissions:
  contents: write

jobs:
  deploy:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: |
          rustup set profile minimal
          rustup toolchain install stable
          rustup default stable
      - name: Setup mdBook
        uses: peaceiris/actions-mdbook@v2
        with:
          mdbook-version: "latest"
      - run: mdbook build
      # - name: Copy Assets
      #   run: |
      #     chmod +x ci/copy-assets.sh
      #     ci/copy-assets.sh ${{ matrix.os }}
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        # or || github.ref == 'refs/heads/server_support'
        if: ${{ github.ref == 'refs/heads/main' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./book
1 .gitignore vendored
@@ -22,3 +22,4 @@ img/
 tmp1.txt
 test_65_300_1536.txt
 test.txt
+book
18 book.toml Normal file
@@ -0,0 +1,18 @@
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"

[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"

[output.html.playground]
editable = true
copy-js = true
# line-numbers = true

[output.html.fold]
enable = true
level = 0
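The config above points mdBook at doc/ as the book source and enables the HTML playground and section folding. As a quick sanity check (not part of the commit), the file can be parsed with Python's standard-library TOML reader; this sketch assumes Python 3.11+ and that it runs from the repository root:

# Minimal sketch: parse book.toml and verify the configured source
# directory contains the SUMMARY.md that mdBook expects.
import tomllib
from pathlib import Path

with open("book.toml", "rb") as f:
    cfg = tomllib.load(f)

src = Path(cfg["book"].get("src", "src"))  # "doc" in this repo
assert (src / "SUMMARY.md").exists(), f"missing {src}/SUMMARY.md"
print(cfg["book"]["title"], cfg["book"]["language"])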
31 doc/README.md Normal file
@@ -0,0 +1,31 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">

<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>

</picture>

</p>

</div>

<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.

<h2 id="Updates">🔥 Updates</h2>

* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single-GPU (24GB VRAM)/multi-GPU setups with 382GB DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21GB to 11GB.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU use.
* **Aug 14, 2024**: Support llamafile as a linear backend.
* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequantization on GPU.
* **Aug 9, 2024**: Support Windows natively.
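For context on the "single line of code" injection mentioned in the introduction above, usage looks roughly like the sketch below, following the project README's optimize_and_load_gguf flow. The model name, rule-file path, and GGUF path are placeholders, and the import path should be checked against the installed version:

# Hedged sketch of module injection; paths below are placeholders, not
# files shipped by this commit.
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from ktransformers.optimize.optimize import optimize_and_load_gguf

config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
with torch.device("meta"):
    # Build the model skeleton without allocating real weights.
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# One call swaps in optimized modules per the YAML rule file and loads GGUF weights.
optimize_and_load_gguf(model, "optimize_rules/DeepSeek-V2-Lite-Chat.yaml", "/path/to/gguf", config)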
19 doc/SUMMARY.md Normal file
@@ -0,0 +1,19 @@
# Ktransformers

[Introduction](./README.md)
# DeepSeek
- [Deepseek-R1/V3 Tutorial](en/DeepseekR1_V3_tutorial.md)
- [Deepseek-V2 Injection](en/deepseek-v2-injection.md)
- [Injection Tutorial](en/injection_tutorial.md)

# Server
- [Server](en/api/server/server.md)
- [Website](en/api/server/website.md)
- [Tabby](en/api/server/tabby.md)
# For Developers
- [Makefile Usage](en/makefile_usage.md)

# FAQ
- [FAQ](en/FAQ.md)
# V3 Reproduction
- [Success List](en/V3-success.md)
1 doc/basic/note1.md Normal file
@@ -0,0 +1 @@
# basic-first20
1 doc/basic/note2.md Normal file
@@ -0,0 +1 @@
# basic-data_structure
11 doc/en/V3-success.md Normal file
@@ -0,0 +1,11 @@
## Hello everyone, here are the successfully reproduced environment configurations for your reference:
### Case 1
- Configuration: L40S 48G + 9654 x2 (192 cores) + 768G DDR5, 12-channel
- Performance: prefill 108 tokens/s, decode 10.8 tokens/s
- Version used: compiled from main-branch source
### Case 2
- Configuration: dual Xeon 6430 32C processors (64 cores / 128 threads total), 480GB DDR5 memory, single 4090 24G graphics card
- Performance: roughly 6-8 tokens per second
## NOTE
If any other configurations have been run successfully, please feel free to let us know; we will keep updating this list for everyone to refer to when reproducing. (It has been found to also work on the 2080, AMD cards, etc.)
[click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2)
@@ -54,15 +54,15 @@ class KLinearBase(ABC):
 
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
-        # if orig_module is not None:
-        #     self.in_features = orig_module.in_features
-        #     self.out_features = orig_module.out_features
-        # else:
-        shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
-        if len(shape) == 1:
-            print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
-        self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
-        self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
+        if orig_module is not None:
+            self.in_features = orig_module.in_features
+            self.out_features = orig_module.out_features
+        else:
+            shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
+            if len(shape) == 1:
+                print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
+            self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
+            self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
 
     @abstractmethod
     def forward(self, x: torch.Tensor) -> torch.Tensor:
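The rewritten block above prefers the wrapped module's declared dimensions and only falls back to the GGUF tensor metadata when no orig_module is supplied. A self-contained sketch of that resolution order follows; the tensor_info dict is a mock of the gguf_loader metadata, and resolve_features is a hypothetical helper, not code from the repository:

import torch.nn as nn

def resolve_features(key: str, tensor_info: dict, orig_module: nn.Module | None):
    # Prefer the original module's declared dimensions when it exists.
    if orig_module is not None:
        return orig_module.in_features, orig_module.out_features
    # Otherwise fall back to the GGUF tensor shape, as the new else-branch does.
    shape = tensor_info[key + ".weight"]["shape"]
    if len(shape) == 1:
        # The diff only prints a warning here; raising keeps this sketch safe to run.
        raise ValueError(f"cannot infer in/out features from 1-D tensor {key!r}")
    return shape[0], shape[1]

info = {"blk.0.attn_q.weight": {"shape": [1536, 4096]}}
assert resolve_features("blk.0.attn_q", info, nn.Linear(1536, 4096)) == (1536, 4096)
assert resolve_features("blk.0.attn_q", info, None) == (1536, 4096)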
@@ -136,12 +136,19 @@ class KLinearTorch(KLinearBase):
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         if w is None: w = self.load_weight(device=device)
         # else: self.out_features = w.shape[0], self.in_features = w.shape[1]
 
         if isinstance(w, nn.Parameter):
-            self.w = w.to(dtype=self.dtype).T
+            try:
+                self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w.to(dtype=self.dtype).T
             self.has_bias = False
         elif isinstance(w, tuple):
-            self.w = w[0].to(dtype=self.dtype).T
+            try:
+                self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w[0].to(dtype=self.dtype).T
             self.bias = w[1].to(dtype=self.dtype)
             self.has_bias = True
         else:
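The try/except added above handles weights that arrive from GGUF as flat buffers: view(out_features, in_features) restores the 2-D layout before the transpose, and the plain .T fallback covers tensors that cannot be viewed that way (e.g. non-contiguous storage or a size mismatch). A runnable illustration with toy shapes, assuming nothing beyond PyTorch itself:

import torch

OUT_FEATURES, IN_FEATURES = 4, 3

def to_linear_weight(w: torch.Tensor) -> torch.Tensor:
    try:
        # Flat (or already (out, in)) buffer -> (out, in) -> transpose so x @ w works.
        return w.view(OUT_FEATURES, IN_FEATURES).T
    except RuntimeError:
        # Mirrors the except-branch in the diff: transpose as-is.
        return w.T

flat = torch.arange(12, dtype=torch.float32)     # 1-D buffer, as GGUF may provide
assert to_linear_weight(flat).shape == (IN_FEATURES, OUT_FEATURES)

shaped = torch.zeros(OUT_FEATURES, IN_FEATURES)  # already (out, in)
assert to_linear_weight(shaped).shape == (IN_FEATURES, OUT_FEATURES)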
@@ -187,7 +194,8 @@ class KLinearMarlin(KLinearBase):
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
-        if w is None: w = self.load_weight(device=device)
+        if w is None:
+            w = self.load_weight(device=device)
 
         if isinstance(w, nn.Parameter):
             # pad weight
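The Marlin change itself only splits the one-line load into two lines, but the surrounding context shows the load-time contract: a CPU target is rejected before any weights are fetched, since Marlin kernels are CUDA-only. A small sketch of that guard; check_marlin_device is a hypothetical helper for illustration, not part of the codebase:

def check_marlin_device(device: str | None, default: str = "cuda:0") -> str:
    # Resolve the default first, then enforce the GPU-only contract.
    device = default if device is None else device
    assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
    return device

print(check_marlin_device(None))      # cuda:0
print(check_marlin_device("cuda:1"))  # cuda:1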