diff --git a/.github/workflows/package_wheel_release.yml b/.github/workflows/package_wheel_release.yml new file mode 100644 index 0000000..93e5f38 --- /dev/null +++ b/.github/workflows/package_wheel_release.yml @@ -0,0 +1,252 @@ +name: Build Wheels +on: + workflow_dispatch: + inputs: + release: + description: 'Release? 1 = yes, 0 = no' + default: '0' + required: true + type: string +jobs: + build_wheels: + name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + # Ubuntu + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + + # Windows + - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + - { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'} + - { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'} + + defaults: + run: + shell: pwsh + + steps: + - uses: actions/checkout@v3 + + - name: Free Disk Space + uses: jlumbroso/free-disk-space@v1.3.1 + if: runner.os == 'Linux' + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: false + swap-storage: true + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: check_space + run: | + if($IsLinux) {df -h} + if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'} + + - uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup Mamba + if: matrix.cuda != '' + uses: conda-incubator/setup-miniconda@v2.3.0 + with: + activate-environment: "ktransformers" + python-version: ${{ matrix.pyver }} + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + add-pip-as-python-dependency: true + auto-activate-base: false + + + + - name: build web + run: | + cd ktransformers/website/ + npm install + npm run build + cd ../../ + + - name: build for cuda + if: matrix.cuda != '' + run: | + git submodule init + git submodule update + if($IsWindows){ + $originalPath = Get-Location + Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64' + $env:DISTUTILS_USE_SDK=1 + Set-Location $originalPath + } + $cudaVersion = '${{ matrix.cuda }}' + $env:MAMBA_NO_LOW_SPEED_LIMIT = 1 + mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) { + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH + if (!(Test-Path $env:CUDA_HOME/lib64)) { + New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib + } + } + if ($IsWindows) { + $env:CUDA_PATH = "$env:CUDA_PATH/Library" + $env:CUDA_HOME = $env:CUDA_PATH + $env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH + cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/ + $env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE + + } + python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }} + python -m pip install cpufeature build wheel ninja packaging setuptools + $env:KTRANSFORMERS_FORCE_BUILD = "TRUE" + $env:CPU_INSTRUCT = '${{ matrix.instruct }}' + $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}' + python -m build --no-isolation --verbose + + + - name: create Rlease dir + run: | + if ($IsWindows) { + $env:date = $(Get-Date -Format "yyyy-MM-dd") + New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh" + $Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa" + Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}" + (Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH + chmod 600 $Env:SSH_PATH + } + if ($IsLinux) { + $env:date = $(date +%Y-%m-%d) + mkdir -p ~/.ssh/ + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + } + + ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date" + scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/ \ No newline at end of file diff --git a/.github/workflows/package_wheel_test.yml b/.github/workflows/package_wheel_test.yml new file mode 100644 index 0000000..9fe82f8 --- /dev/null +++ b/.github/workflows/package_wheel_test.yml @@ -0,0 +1,132 @@ +name: Build Wheels +on: + workflow_dispatch: + inputs: + release: + description: 'Release? 1 = yes, 0 = no' + default: '0' + required: true + type: string +jobs: + build_wheels: + name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + # Ubuntu + - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'} + - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'} + + defaults: + run: + shell: pwsh + + steps: + - uses: actions/checkout@v3 + + - name: Free Disk Space + uses: jlumbroso/free-disk-space@v1.3.1 + if: runner.os == 'Linux' + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: false + swap-storage: true + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: check_space + run: | + if($IsLinux) {df -h} + if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'} + + - uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup Mamba + if: matrix.cuda != '' + uses: conda-incubator/setup-miniconda@v2.3.0 + with: + activate-environment: "ktransformers" + python-version: ${{ matrix.pyver }} + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + add-pip-as-python-dependency: true + auto-activate-base: false + + + + - name: build web + run: | + cd ktransformers/website/ + npm install + npm run build + cd ../../ + + - name: build for cuda + if: matrix.cuda != '' + run: | + git submodule init + git submodule update + if($IsWindows){ + $originalPath = Get-Location + Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64' + $env:DISTUTILS_USE_SDK=1 + Set-Location $originalPath + } + $cudaVersion = '${{ matrix.cuda }}' + $env:MAMBA_NO_LOW_SPEED_LIMIT = 1 + mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) { + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH + if (!(Test-Path $env:CUDA_HOME/lib64)) { + New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib + } + } + if ($IsWindows) { + $env:CUDA_PATH = "$env:CUDA_PATH/Library" + $env:CUDA_HOME = $env:CUDA_PATH + $env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH + cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/ + $env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE + + } + python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }} + python -m pip install cpufeature build wheel ninja packaging setuptools + $env:KTRANSFORMERS_FORCE_BUILD = "TRUE" + $env:CPU_INSTRUCT = '${{ matrix.instruct }}' + $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}' + python -m build --no-isolation --verbose + + + - name: create Rlease dir + run: | + if ($IsWindows) { + $env:date = $(Get-Date -Format "yyyy-MM-dd") + New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh" + $Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa" + Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}" + (Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH + chmod 600 $Env:SSH_PATH + } + if ($IsLinux) { + $env:date = $(date +%Y-%m-%d) + mkdir -p ~/.ssh/ + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + } + + ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date" + scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/ \ No newline at end of file diff --git a/ktransformers/__init__.py b/ktransformers/__init__.py index d1f2e39..48fef32 100644 --- a/ktransformers/__init__.py +++ b/ktransformers/__init__.py @@ -1 +1 @@ -__version__ = "0.1.1" \ No newline at end of file +__version__ = "0.1.2" \ No newline at end of file diff --git a/setup.py b/setup.py index aeff5f6..2a09b48 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ Author : chenxl Date : 2024-07-27 16:15:27 Version : 1.0.0 LastEditors : chenxl -LastEditTime : 2024-08-08 02:45:15 +LastEditTime : 2024-08-14 16:36:19 Adapted from: https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py Copyright (c) 2023, Tri Dao. @@ -299,6 +299,15 @@ setup( 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', 'ktransformers/ktransformers_ext/cuda/binding.cpp', 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu' - ]) + ], + extra_compile_args={ + 'cxx': ['-O3'], + 'nvcc': [ + '-O3', + '--use_fast_math', + '-Xcompiler', '-fPIC', + ] + } + ) ] ) diff --git a/third_party/llamafile/sgemm.cpp b/third_party/llamafile/sgemm.cpp index 6a7cab4..38f6d18 100644 --- a/third_party/llamafile/sgemm.cpp +++ b/third_party/llamafile/sgemm.cpp @@ -94,7 +94,6 @@ static const struct GemmFuncs { #if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) #if defined(__AVX2__) #if defined(__AVX512F__) - printf("__AVX512F__\n"); #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) // AMD Zen4+ (2023-) sgemm = llamafile_sgemm_amd_zen4;