From 8747c099f200bc98c35fcf85c627021199629a36 Mon Sep 17 00:00:00 2001
From: TangJingqi
Date: Thu, 29 Aug 2024 22:39:20 +0800
Subject: [PATCH] update yaml example; update version idx; update docker file

---
 Dockerfile                            |  2 +-
 ktransformers/__init__.py             |  6 +-
 .../DeepSeek-V2-Chat-multi-gpu-4.yaml | 56 +++++++++----------
 .../DeepSeek-V2-Chat-multi-gpu.yaml   |  4 +-
 4 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 78264c8..707c1a8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ rm -rf /var/lib/apt/lists/* &&
 cd ktransformers &&
 git submodule init &&
 git submodule update &&
-pip install ninja pyproject numpy &&
+pip install ninja pyproject numpy cpufeature &&
 pip install flash-attn &&
 CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" pip install . --no-build-isolation --verbose &&
 pip cache purge
diff --git a/ktransformers/__init__.py b/ktransformers/__init__.py
index a833c84..2c7b4dc 100644
--- a/ktransformers/__init__.py
+++ b/ktransformers/__init__.py
@@ -5,7 +5,7 @@ Description :
 Author : kkk1nak0
 Date : 2024-08-15 07:34:46
 Version : 1.0.0
-LastEditors : chenxl
-LastEditTime : 2024-08-28 15:19:03
+LastEditors : Azure-Tang
+LastEditTime : 2024-08-29 22:35:51
 '''
-__version__ = "0.1.3"
+__version__ = "0.1.4"
\ No newline at end of file
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
index 07f173f..a87a30c 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
@@ -7,7 +7,7 @@
       prefill_device: "cpu"

 - match:
-    name: "^model\\.layers\\.([0-9])\\."
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
     class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@@ -15,7 +15,7 @@
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
 - match:
-    name: "^model\\.layers\\.([1][0-9])\\."
+    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\."
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
     class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@@ -23,7 +23,7 @@
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
 - match:
-    name: "^model\\.layers\\.([2][0-9])\\."
+    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\."
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
     class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@@ -31,7 +31,7 @@
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
 - match:
-    name: "^model\\.layers\\.([345][0-9])\\."
+    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\."
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
     class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@@ -40,7 +40,7 @@
       prefill_device: "cuda:3"

 - match:
-    name: "^model\\.layers\\.([0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -50,7 +50,7 @@
       generate_op: "KLinearMarlin"
       prefill_op: "KLinearTorch"
 - match:
-    name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -60,7 +60,7 @@
       generate_op: "KLinearMarlin"
       prefill_op: "KLinearTorch"
 - match:
-    name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -70,7 +70,7 @@
       generate_op: "KLinearMarlin"
       prefill_op: "KLinearTorch"
 - match:
-    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -81,7 +81,7 @@
       prefill_op: "KLinearTorch"

 - match:
-    name: "^model\\.layers\\.([0-9])\\.mlp$"
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp$"
     class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
   replace:
     class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@@ -89,7 +89,7 @@
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
 - match:
-    name: "^model\\.layers\\.([1][0-9])\\.mlp$"
+    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp$"
     class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
   replace:
     class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@@ -97,7 +97,7 @@
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
 - match:
-    name: "^model\\.layers\\.([2][0-9])\\.mlp$"
+    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp$"
     class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
   replace:
     class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@@ -105,7 +105,7 @@
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
 - match:
-    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
+    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp$"
     class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
   replace:
     class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@@ -114,7 +114,7 @@
       prefill_device: "cuda:3"

 - match:
-    name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$"
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
     kwargs:
@@ -125,7 +125,7 @@
       out_device: "cuda:0"
   recursive: False # don't recursively inject submodules of this module
 - match:
-    name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$"
+    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
     kwargs:
@@ -136,7 +136,7 @@
       out_device: "cuda:1"
   recursive: False # don't recursively inject submodules of this module
 - match:
-    name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$"
+    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
     kwargs:
@@ -147,7 +147,7 @@
       out_device: "cuda:2"
   recursive: False # don't recursively inject submodules of this module
 - match:
-    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
+    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
     kwargs:
@@ -159,28 +159,28 @@
   recursive: False # don't recursively inject submodules of this module

 - match:
-    name: "^model\\.layers\\.([0-9])\\.self_attn$"
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.self_attn$"
   replace:
     class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
 - match:
-    name: "^model\\.layers\\.([1][0-9])\\.self_attn$"
+    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.self_attn$"
   replace:
     class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
 - match:
-    name: "^model\\.layers\\.([2][0-9])\\.self_attn$"
+    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.self_attn$"
   replace:
     class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
 - match:
-    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
+    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.self_attn$"
   replace:
     class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
     kwargs:
@@ -194,35 +194,35 @@
     kwargs:
       per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
       transfer_map:
-        10: "cuda:1"
-        20: "cuda:2"
-        30: "cuda:3"
+        15: "cuda:1"
+        30: "cuda:2"
+        45: "cuda:3"

 - match:
-    name: "^model\\.layers\\.([0-9])\\."
+    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
   replace:
     class: "default"
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
 - match:
-    name: "(^model\\.layers\\.([1][0-9])\\.)"
+    name: "(^model\\.layers\\.([2][0-9]|[1][5-9])\\.)"
   replace:
     class: "default"
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
 - match:
-    name: "(^model\\.layers\\.([2][0-9])\\.)"
+    name: "(^model\\.layers\\.([3][0-9]|[4][0-4])\\.)"
   replace:
     class: "default"
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
 - match:
-    name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)"
+    name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)|(^lm_head)"
   replace:
     class: "default"
     kwargs:
       generate_device: "cuda:3"
-      prefill_device: "cuda:3"
+      prefill_device: "cuda:3"
\ No newline at end of file
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
index 3884077..269257e 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
@@ -24,7 +24,7 @@
       prefill_device: "cuda:1"

 - match:
-    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -35,7 +35,7 @@
       prefill_op: "KLinearTorch"

 - match:
-    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
+    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types