Merge pull request #17 from UnicornChan/feature-support-multi-instruct

Feature support multi instruct
This commit is contained in:
UnicornChan 2024-08-01 12:29:07 +08:00 committed by GitHub
commit 5e83bc0c82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 272 additions and 25 deletions

34
Dockerfile Normal file
View File

@@ -0,0 +1,34 @@
FROM node:20.16.0 as web_compile
WORKDIR /home
RUN <<EOF
git clone https://github.com/kvcache-ai/ktransformers.git &&
cd ktransformers/ktransformers/website/ &&
npm install @vue/cli &&
npm run build &&
rm -rf node_modules
EOF
FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server
WORKDIR /workspace
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
cd ktransformers &&
git submodule init &&
git submodule update &&
pip install ninja pyproject numpy &&
pip install flash-attn &&
CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" pip install . --no-build-isolation --verbose &&
pip cache purge
EOF
ENTRYPOINT [ "/opt/conda/bin/ktransformers" ]
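The compile stage above pins `CPU_INSTRUCT=NATIVE`, which tunes the CPU kernels for whatever machine builds the image. As a sketch (not part of the commit), an image meant to run on a different or older x86-64 host could instead use one of the other levels this PR introduces by editing that build line, for example:

```bash
# Illustrative edit to the Dockerfile's build step: target a portable AVX512 level
# instead of the build machine's native instruction set.
CPU_INSTRUCT=AVX512 KTRANSFORMERS_FORCE_BUILD=TRUE \
  TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" \
  pip install . --no-build-isolation --verbose
```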

View File

@@ -80,30 +80,32 @@ Some preparation:
 ```
 <h3>Installation</h3>
-You can install using Pypi:
-```
-pip install ktransformers --no-build-isolation
-```
-Or download source code and compile:
-- init source code
-```sh
-git clone https://github.com/kvcache-ai/ktransformers.git
-cd ktransformers
-git submodule init
-git submodule update
-```
-- [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
-- Compile and install
-```
-bash install.sh
-```
+1. Use a Docker image, see [documentation for Docker](./doc/en/docker.md)
+2. You can install using Pypi:
+   ```
+   pip install ktransformers --no-build-isolation
+   ```
+3. Or you can download source code and compile:
+   - init source code
+   ```sh
+   git clone https://github.com/kvcache-ai/ktransformers.git
+   cd ktransformers
+   git submodule init
+   git submodule update
+   ```
+   - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
+   - Compile and install
+   ```
+   bash install.sh
+   ```
 <h3>Local Chat</h3>
 We provide a simple command-line local chat Python script that you can run for testing.
-> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we alse support other models, you can replace it with any other model that you want to test.
+> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we also support other models, you can replace it with any other model that you want to test.
 <h4>Run Example</h4>
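As a usage sketch (not part of the diff above): the source-compile path can be combined with the `CPU_INSTRUCT` selection this PR adds to `setup.py`, which is useful when the wheel is built on one machine and deployed on another. The values come from `CpuInstructInfo` later in this commit (`NATIVE`, `FANCY`, `AVX512`, `AVX2`):

```bash
# Build from source with an explicitly chosen SIMD level rather than native auto-detection.
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init && git submodule update
CPU_INSTRUCT=AVX2 KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
```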

27
doc/en/Docker.md Normal file
View File

@@ -0,0 +1,27 @@
# Docker
## Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models and intermediate files (e.g., /mnt/models).
## Images
Docker images for this project are available:
**Uploading**
## Building the Docker image locally
- Download the Dockerfile from [here](../../Dockerfile)
- In the directory containing the Dockerfile, run:
```bash
docker build -t approachingai/ktransformers:v0.1.1 .
```
## Usage
This assumes you have installed the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) so that containers can access the GPU.
```bash
docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True
```
More options are described in the [readme](../../README.md).
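A quick way to confirm the nvidia-container-toolkit prerequisite before starting the server is to launch a throwaway container and check that the GPU is visible; overriding the image's `ktransformers` entrypoint here is only for this sanity check, and the toolkit should inject `nvidia-smi` when `--gpus all` is passed:

```bash
# Sanity check: the GPU and driver utilities should be visible inside the container.
docker run --rm --gpus all --entrypoint nvidia-smi approachingai/ktransformers:v0.1.1
```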

View File

@@ -43,7 +43,11 @@ In the current version of KTransformers, we utilize Marlin for GPU kernels and l
     <img alt="CPUInfer Performance" src="../assets/cpuinfer.png" width=80%>
   </picture>
 </p>
+<p align="center">
+  <picture>
+    <img alt="marlin performance" src="https://github.com/IST-DASLab/marlin/blob/master/assets/sustained.png?raw=true" width=80%>
+  </picture>
+</p>
 
 ### Arithmetic Intensity Guided Offloading

View File

@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.16)
 project(cpuinfer_ext VERSION 0.1.0)
 set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
 set(CMAKE_BUILD_TYPE "Release")
 include(CheckCXXCompilerFlag)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -10,6 +10,27 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 
+# instruction set specific
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" OFF)
+option(LLAMA_AVX2 "llama: enable AVX2" OFF)
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" OFF)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C "llama: enable F16C" OFF)
+endif()
+option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
+option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
@@ -102,6 +123,20 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
             add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
         endif()
+        if (LLAMA_AVX512_FANCY_SIMD)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+        endif()
+        if (LLAMA_AVX512_BF16)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+        endif()
     elseif (LLAMA_AVX2)
         list(APPEND ARCH_FLAGS /arch:AVX2)
     elseif (LLAMA_AVX)
@@ -133,6 +168,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (LLAMA_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
         endif()
+        if (LLAMA_AVX512_FANCY_SIMD)
+            list(APPEND ARCH_FLAGS -mavx512vl)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (LLAMA_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
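For reference, a minimal sketch of exercising the new options when configuring `cpuinfer_ext` by hand rather than through `setup.py`; the flag set mirrors the `CMAKE_FANCY` preset added in this PR, and the `build` directory name is arbitrary:

```bash
# Configure the C++ extension for the AVX512 "fancy" kernels instead of -march=native.
cmake -B build \
  -DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON \
  -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON
cmake --build build -j
```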

View File

@@ -0,0 +1,100 @@
include(CheckCSourceRuns)
set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")
set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")
set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")
macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(LLAMA_AVX OFF)
else()
set(LLAMA_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(LLAMA_AVX2 OFF)
else()
set(LLAMA_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(LLAMA_AVX512 OFF)
else()
set(LLAMA_AVX512 ON)
endif()
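Because `check_sse` caches its results (`AVX_FOUND`, `AVX2_FOUND`, `FMA_FOUND`, `AVX512_FOUND`, and the matching `*_FLAGS`), they can be inspected after a configure run; a small sketch, assuming a build directory named `build`:

```bash
# List the cached SIMD detection results after `cmake -B build ...` has run.
cmake -L -N build | grep -E 'AVX|FMA'
```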

View File

@@ -1,7 +1,7 @@
 [build-system]
 requires = [
     "setuptools",
-    "torch == 2.3.1",
+    "torch >= 2.3.0",
     "ninja",
     "packaging"
 ]
@@ -29,7 +29,7 @@ dependencies = [
     "fire"
 ]
 
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 
 authors = [
     {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
@@ -50,6 +50,7 @@ keywords = ["ktransformers", "llm"]
 classifiers = [
     "Development Status :: 4 - Beta",
+    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12"
 ]
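A quick way to check an existing environment against the relaxed constraints above (torch >= 2.3.0, Python >= 3.10):

```bash
python -c "import sys, torch; print('python', sys.version.split()[0], '| torch', torch.__version__)"
```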

View File

@@ -6,7 +6,7 @@ Author : chenxl
 Date : 2024-07-27 16:15:27
 Version : 1.0.0
 LastEditors : chenxl
-LastEditTime : 2024-07-29 09:40:24
+LastEditTime : 2024-07-31 09:44:46
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
@@ -19,6 +19,7 @@ import re
 import ast
 import subprocess
 import platform
+import http.client
 import urllib.request
 import urllib.error
 from pathlib import Path
@@ -28,6 +29,15 @@ from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
 
+class CpuInstructInfo:
+    CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
+    FANCY = "FANCY"
+    AVX512 = "AVX512"
+    AVX2 = "AVX2"
+    CMAKE_NATIVE = "-DLLAMA_NATIVE=ON"
+    CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
+    CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
+    CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
 
 class VersionInfo:
     THIS_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -61,12 +71,24 @@ class VersionInfo:
         raise ValueError("Unsupported platform: {}".format(sys.platform))
 
     def get_cpu_instruct(self,):
+        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+            return "fancy"
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+            return "avx512"
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+            return "avx2"
+        else:
+            print("Using native cpu instruct")
         if sys.platform.startswith("linux"):
             with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                 cpuinfo = cpu_f.read()
             flags_line = [line for line in cpuinfo.split(
                 '\n') if line.startswith('flags')][0]
             flags = flags_line.split(':')[1].strip().split(' ')
+            # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
+            for flag in flags:
+                if 'avx512bw' in flag:
+                    return 'fancy'
             for flag in flags:
                 if 'avx512' in flag:
                     return 'avx512'
@@ -116,6 +138,7 @@ class BuildWheelsCommand(_bdist_wheel):
     def run(self):
         if VersionInfo.FORCE_BUILD:
             super().run()
+            return
         wheel_filename, wheel_url = self.get_wheel_name()
         print("Guessing wheel URL: ", wheel_url)
         try:
@@ -132,7 +155,7 @@ class BuildWheelsCommand(_bdist_wheel):
             wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
             print("Raw wheel path", wheel_path)
             os.rename(wheel_filename, wheel_path)
-        except (urllib.error.HTTPError, urllib.error.URLError):
+        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
             super().run()
@@ -187,6 +210,18 @@ class CMakeBuild(BuildExtension):
             cmake_args += [
                 item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
 
+        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+            cpu_args = CpuInstructInfo.CMAKE_FANCY
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+            cpu_args = CpuInstructInfo.CMAKE_AVX512
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+            cpu_args = CpuInstructInfo.CMAKE_AVX2
+        else:
+            cpu_args = CpuInstructInfo.CMAKE_NATIVE
+
+        cmake_args += [
+            item for item in cpu_args.split(" ") if item
+        ]
         # In this example, we pass in the version to C++. You might not need to.
         cmake_args += [
             f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]