Merge pull request #110 from zRzRzRzRzRzRzR/main

mlx inference
2026-01-25 08:05:33 +08:00 · 2024-04-11 11:19:20 +08:00 · 2024-04-11 11:19:20 +08:00 · 789060f769
commit 789060f769
parent a1013b1ad2 6618dd93be
6 changed files with 822 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,7 @@
 *.pyc
 finetune/output/*
 wip.*
+.idea
+venv
+.venv
+.env
--- a/README.md
+++ b/README.md
@ -488,6 +488,12 @@ python demo/vllm_based_demo.py --model_path <vllmcpm_repo_path>
 python demo/hf_based_demo.py --model_path <hf_repo_path>
 ```

+#### 使用如下命令启动基于 Mac mlx 加速框架推理
+
+你需要先安装 `mlx_lm` 库，并下载转换后的专用模型权重 [MiniCPM-2B-sft-bf16-llama-format-mlx](https://huggingface.co/mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx)，然后运行以下命令：
+```shell
+python -m mlx_lm.generate --model mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx --prompt "hello, tell me a joke." --trust-remote-code
+```

 <p id="6"></p>

--- a/demo/hf_based_demo.py
+++ b/demo/hf_based_demo.py
@ -7,26 +7,29 @@ import gradio as gr
 import torch
 from threading import Thread
 from transformers import (
-    AutoModelForCausalLM, 
+    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer
 )
 import warnings
+
 warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

 parser = argparse.ArgumentParser()
 parser.add_argument("--model_path", type=str, default="")
-parser.add_argument("--torch_dtype", type=str, default="bfloat16", choices=["float32", "bfloat16"])
+parser.add_argument("--torch_dtype", type=str, default="bfloat16", choices=["float32", "bfloat16", "float16"])
 parser.add_argument("--server_name", type=str, default="127.0.0.1")
 parser.add_argument("--server_port", type=int, default=7860)
 args = parser.parse_args()

 # init model torch dtype
 torch_dtype = args.torch_dtype
-if torch_dtype =="" or torch_dtype == "bfloat16":
+if torch_dtype == "" or torch_dtype == "bfloat16":
    torch_dtype = torch.bfloat16
 elif torch_dtype == "float32":
    torch_dtype = torch.float32
+elif torch_dtype == "float16":
+    torch_dtype = torch.float16
 else:
    raise ValueError(f"Invalid torch dtype: {torch_dtype}")

@ -36,8 +39,8 @@ tokenizer = AutoTokenizer.from_pretrained(path)
 model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch_dtype, device_map="auto", trust_remote_code=True)

 # init gradio demo host and port
-server_name=args.server_name
-server_port=args.server_port
+server_name = args.server_name
+server_port = args.server_port

 def hf_gen(dialog: List, top_p: float, temperature: float, repetition_penalty: float, max_dec_len: int):
    """generate model output with huggingface api
@ -50,7 +53,7 @@ def hf_gen(dialog: List, top_p: float, temperature: float, repetition_penalty: f

    Yields:
        str: real-time generation results of hf model
-    """    
+    """
    inputs = tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=False)
    enc = tokenizer(inputs, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer)
@ -73,7 +76,8 @@ def hf_gen(dialog: List, top_p: float, temperature: float, repetition_penalty: f
        yield answer[4 + len(inputs):]


-def generate(chat_history: List, query: str, top_p: float, temperature: float, repetition_penalty: float, max_dec_len: int):
+def generate(chat_history: List, query: str, top_p: float, temperature: float, repetition_penalty: float,
+             max_dec_len: int):
    """generate after hitting "submit" button

    Args:
@ -85,7 +89,7 @@ def generate(chat_history: List, query: str, top_p: float, temperature: float, r

    Yields:
        List: [[q_1, a_1], [q_2, a_2], ..., [q_n, a_n], [q_n+1, a_n+1]]. chat_history + QA of current round.
-    """    
+    """
    assert query != "", "Input must not be empty!!!"
    # apply chat template
    model_input = []
@ -111,7 +115,7 @@ def regenerate(chat_history: List, top_p: float, temperature: float, repetition_

    Yields:
        List: [[q_1, a_1], [q_2, a_2], ..., [q_n, a_n]]. chat_history
-    """    
+    """
    assert len(chat_history) >= 1, "History is empty. Nothing to regenerate!!"
    # apply chat template
    model_input = []
@ -130,7 +134,7 @@ def clear_history():

    Returns:
        List: empty chat history
-    """    
+    """
    return []


@ -142,7 +146,7 @@ def reverse_last_round(chat_history):

    Returns:
        List: [[q_1, a_1], [q_2, a_2], ..., [q_n-1, a_n-1]]. chat_history without last round.
-    """    
+    """
    assert len(chat_history) >= 1, "History is empty. Nothing to reverse!!"
    return chat_history[:-1]

@ -166,8 +170,10 @@ with gr.Blocks(theme="soft") as demo:
                regen = gr.Button("Regenerate")
                reverse = gr.Button("Reverse")

-    submit.click(generate, inputs=[chatbot, user_input, top_p, temperature, repetition_penalty, max_dec_len], outputs=[user_input, chatbot])
-    regen.click(regenerate, inputs=[chatbot, top_p, temperature, repetition_penalty, max_dec_len], outputs=[user_input, chatbot])
+    submit.click(generate, inputs=[chatbot, user_input, top_p, temperature, repetition_penalty, max_dec_len],
+                 outputs=[user_input, chatbot])
+    regen.click(regenerate, inputs=[chatbot, top_p, temperature, repetition_penalty, max_dec_len],
+                outputs=[user_input, chatbot])
    clear.click(clear_history, inputs=[], outputs=[chatbot])
    reverse.click(reverse_last_round, inputs=[chatbot], outputs=[chatbot])

--- a/demo/mlx_based_demo.py
+++ b/demo/mlx_based_demo.py
@ -0,0 +1,42 @@
+"""
+使用 MLX 快速推理 MiniCPM
+
+如果你使用 Mac 设备进行推理，可以直接使用MLX进行推理。
+由于 MiniCPM 暂时不支持 mlx 格式转换，您可以下载由 MLX 社群转换好的模型 [MiniCPM-2B-sft-bf16-llama-format-mlx](https://huggingface.co/mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx)。
+
+并安装对应的依赖包
+
+
+```bash
+pip install mlx-lm
+```
+
+下面是一个简单的推理命令，使用 Mac 设备推理 MiniCPM-2B
+```bash
+python -m mlx_lm.generate --model mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx --prompt "hello, tell me a joke." --trust-remote-code
+```
+
+"""
+
+from mlx_lm import load, generate
+from jinja2 import Template
+
+def chat_with_model():
+    """Run an interactive terminal chat against the MLX build of MiniCPM.
+
+    Loads the ``mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx`` weights and
+    then reads user input in a loop until the user types ``quit`` (compared
+    case-insensitively). On every turn the whole conversation so far is
+    rendered through the chat template, passed to ``generate`` and the reply
+    is appended to the history.
+    """
+    model, tokenizer = load("mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx")
+    print("Model loaded. Start chatting! (Type 'quit' to stop)")
+
+    # User turns are rendered as "<用户>...<AI>"; any other role is emitted as-is.
+    template = Template(
+        "{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}")
+    history = []
+
+    while True:
+        question = input("You: ")
+        if question.lower() == 'quit':
+            return
+        history.append({"role": "user", "content": question})
+        prompt = template.render(messages=history)
+        reply = generate(model, tokenizer, prompt=prompt, verbose=True)
+        print("Model:", reply)
+        history.append({"role": "ai", "content": reply})
+
+
+# Start the interactive chat only when this file is executed as a script, so
+# that importing the module does not load the model or block on input().
+if __name__ == "__main__":
+    chat_with_model()
--- a/finetune/mlx_finetune.py
+++ b/finetune/mlx_finetune.py
@ -0,0 +1,739 @@
+# Copyright © 2023-2024 Apple Inc.
+"""
+This script demonstrates how to fine-tune a LoRA model on the AdvertiseGen dataset with MLX.
+The code is adapted from https://github.com/ml-explore/mlx-examples.
+The model used is https://huggingface.co/mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx
+
+Usage:
+
+train:
+python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx  --data data/AdvertiseGen  --train  --seed 2024 --iters 500
+
+输出结果如下：
+
+Training
+Iter 1: Val loss 4.015, Val took 1067.669s
+Iter 2: Val loss 4.001, Val took 1061.649s
+...
+
+训练结束之后，文件夹下会有 adapters.npz 文件，用于后续的测试。接着，运行测试命令
+
+test:
+python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx  --data data/AdvertiseGen  --test --seed 2024
+
+输出结果如下：
+
+Testing
+Test loss 3.977, Test ppl 53.350.
+
+
+"""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import Generator
+import transformers
+import numpy as np
+from huggingface_hub import snapshot_download
+import glob
+import inspect
+import math
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Union
+
+from mlx.utils import tree_flatten, tree_unflatten
+import mlx.optimizers as optim
+import mlx.core as mx
+import mlx.nn as nn
+
+
+@dataclass
+class ModelArgs:
+    """Model hyper-parameters, normally built from a ``config.json`` dict.
+
+    ``num_key_value_heads`` falls back to ``num_attention_heads`` when it is
+    not given. When ``rope_scaling`` is set it must contain the keys
+    ``factor`` and ``type``, and ``type`` must be ``"linear"``.
+    """
+    hidden_size: int
+    num_hidden_layers: int
+    intermediate_size: int
+    num_attention_heads: int
+    rms_norm_eps: float
+    vocab_size: int
+    num_key_value_heads: int = None
+    rope_theta: float = 10000
+    rope_traditional: bool = False
+    model_type: str = None
+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None
+
+    def __post_init__(self):
+        """Fill in the default KV head count and validate ``rope_scaling``.
+
+        Raises:
+            ValueError: if ``rope_scaling`` lacks a required key or its
+                ``type`` is not ``"linear"``.
+        """
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
+        # Nothing further to validate without a (non-empty) rope_scaling.
+        if not self.rope_scaling:
+            return
+
+        required_keys = {"factor", "type"}
+        for key in required_keys:
+            if key not in self.rope_scaling:
+                raise ValueError(f"rope_scaling must contain keys {required_keys}")
+
+        if self.rope_scaling["type"] != "linear":
+            raise ValueError("rope_scaling 'type' currently only supports 'linear'")
+
+    @classmethod
+    def from_dict(cls, params):
+        """Build an instance from ``params``, ignoring keys that are not fields."""
+        accepted = inspect.signature(cls).parameters
+        kwargs = {name: value for name, value in params.items() if name in accepted}
+        return cls(**kwargs)
+
+
+class LoRALinear(nn.Module):
+    """Linear layer wrapped with a trainable low-rank (LoRA) update.
+
+    The forward pass returns ``linear(x) + scale * ((x @ lora_a) @ lora_b)``.
+    ``lora_b`` is initialised to zeros, so the low-rank term contributes
+    nothing until it has been updated.
+    """
+
+    @staticmethod
+    def from_linear(linear: nn.Linear, rank: int = 8):
+        """Wrap an existing (optionally quantized) linear layer.
+
+        Args:
+            linear: the layer to wrap; it is stored as ``.linear`` on the result.
+            rank: rank of the low-rank matrices.
+
+        Returns:
+            A ``LoRALinear`` whose ``.linear`` is the given layer.
+        """
+        # TODO remove when input_dims and output_dims are attributes
+        # on linear and quantized linear
+        output_dims, input_dims = linear.weight.shape
+        if isinstance(linear, nn.QuantizedLinear):
+            # The stored weight's second axis is scaled back up by 32 // bits.
+            # NOTE(review): presumably values are packed into 32-bit words — confirm.
+            input_dims *= 32 // linear.bits
+        lora_lin = LoRALinear(input_dims, output_dims, rank)
+        # Replace the freshly created inner layer with the one being wrapped.
+        lora_lin.linear = linear
+        return lora_lin
+
+    def to_linear(self):
+        """Fuse the low-rank update into the wrapped weight.
+
+        Returns:
+            A new linear layer whose weight is ``weight + (scale * lora_b.T) @ lora_a.T``;
+            it is re-quantized with the original group size and bits when the
+            wrapped layer was quantized.
+        """
+        linear = self.linear
+        # True when the wrapped layer has a "bias" entry.
+        bias = "bias" in linear
+        weight = linear.weight
+        is_quantized = isinstance(linear, nn.QuantizedLinear)
+
+        # Use the same type as the linear weight if not quantized
+        dtype = weight.dtype
+
+        if is_quantized:
+            # Quantized weights are dequantized and fused in float16.
+            dtype = mx.float16
+            weight = mx.dequantize(
+                weight,
+                linear.scales,
+                linear.biases,
+                linear.group_size,
+                linear.bits,
+            )
+        output_dims, input_dims = weight.shape
+        fused_linear = nn.Linear(input_dims, output_dims, bias=bias)
+
+        # Transpose both factors so their product has the weight's
+        # (output_dims, input_dims) layout.
+        lora_b = (self.scale * self.lora_b.T).astype(dtype)
+        lora_a = self.lora_a.T.astype(dtype)
+        fused_linear.weight = weight + lora_b @ lora_a
+        if bias:
+            fused_linear.bias = linear.bias
+
+        if is_quantized:
+            fused_linear = nn.QuantizedLinear.from_linear(
+                fused_linear,
+                linear.group_size,
+                linear.bits,
+            )
+
+        return fused_linear
+
+    def __init__(
+            self,
+            input_dims: int,
+            output_dims: int,
+            lora_rank: int = 8,
+            bias: bool = False,
+            scale: float = 20.0,
+    ):
+        """Create the inner linear layer and the two low-rank matrices.
+
+        Args:
+            input_dims: input feature size.
+            output_dims: output feature size.
+            lora_rank: rank of ``lora_a`` / ``lora_b``.
+            bias: whether the inner linear layer has a bias.
+            scale: multiplier applied to the low-rank term.
+        """
+        super().__init__()
+
+        # Regular linear layer weights
+        self.linear = nn.Linear(input_dims, output_dims, bias=bias)
+
+        # Scale for low-rank update
+        self.scale = scale
+
+        # Low rank lora weights
+        # The local name `scale` is rebound here to the init range;
+        # self.scale above keeps the constructor argument.
+        scale = 1 / math.sqrt(input_dims)
+        self.lora_a = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(input_dims, lora_rank),
+        )
+        self.lora_b = mx.zeros(shape=(lora_rank, output_dims))
+
+    def __call__(self, x):
+        """Return ``linear(x) + scale * ((x @ lora_a) @ lora_b)``."""
+        dtype = self.linear.weight.dtype
+        if isinstance(self.linear, nn.QuantizedLinear):
+            # For a quantized layer the input is cast to the dtype of its scales.
+            dtype = self.linear.scales.dtype
+        y = self.linear(x.astype(dtype))
+        # The low-rank path uses x as given (no cast).
+        z = (x @ self.lora_a) @ self.lora_b
+        return y + self.scale * z
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with rotary position embeddings and an optional KV cache."""
+
+    def __init__(self, args: ModelArgs):
+        """Create the q/k/v/o projections and the RoPE module from ``args``."""
+        super().__init__()
+
+        dim = args.hidden_size
+        self.n_heads = n_heads = args.num_attention_heads
+        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
+
+        self.repeats = n_heads // n_kv_heads
+
+        head_dim = args.hidden_size // n_heads
+        self.scale = head_dim ** -0.5
+
+        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
+        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
+        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
+        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
+        # Linear rope scaling is passed to RoPE as the reciprocal of "factor";
+        # without it the scale is 1.
+        rope_scale = (
+            1 / args.rope_scaling["factor"]
+            if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
+            else 1
+        )
+        self.rope = nn.RoPE(
+            head_dim,
+            traditional=args.rope_traditional,
+            base=args.rope_theta,
+            scale=rope_scale,
+        )
+
+    def __call__(
+            self,
+            x: mx.array,
+            mask: Optional[mx.array] = None,
+            cache: Optional[Tuple[mx.array, mx.array]] = None,
+    ) -> mx.array:
+        """Apply attention to ``x``.
+
+        Args:
+            x: rank-3 input, unpacked as ``(B, L, D)``.
+            mask: optional mask forwarded to the attention kernel.
+            cache: optional ``(keys, values)`` from earlier calls.
+
+        Returns:
+            A tuple ``(output, (keys, values))`` — note that this is a tuple
+            even though the return annotation says ``mx.array``. The returned
+            keys/values include any cached entries.
+        """
+        B, L, D = x.shape
+
+        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+
+        # Prepare the queries, keys and values for the attention computation
+        # After reshape + transpose the layout is (B, heads, L, head_dim).
+        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
+        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+
+        if cache is not None:
+            key_cache, value_cache = cache
+            # Offset the rotary positions by the number of cached positions
+            # (axis 2 is the sequence axis), then append along that axis.
+            queries = self.rope(queries, offset=key_cache.shape[2])
+            keys = self.rope(keys, offset=key_cache.shape[2])
+            keys = mx.concatenate([key_cache, keys], axis=2)
+            values = mx.concatenate([value_cache, values], axis=2)
+        else:
+            queries = self.rope(queries)
+            keys = self.rope(keys)
+
+        output = mx.fast.scaled_dot_product_attention(
+            queries, keys, values, scale=self.scale, mask=mask
+        )
+        # Move heads back next to the feature axis and merge them.
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(output), (keys, values)
+
+
+class MLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``."""
+
+    def __init__(self, dim, hidden_dim):
+        """Create the three bias-free projections between ``dim`` and ``hidden_dim``."""
+        super().__init__()
+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
+        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+    def __call__(self, x) -> mx.array:
+        """Apply the gated feed-forward transform to ``x``."""
+        gate = nn.silu(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+class TransformerBlock(nn.Module):
+    """One decoder layer: RMS-normed self-attention and MLP, each with a residual add."""
+
+    def __init__(self, args: ModelArgs):
+        """Build the attention, MLP and the two RMSNorm layers from ``args``."""
+        super().__init__()
+        self.num_attention_heads = args.num_attention_heads
+        self.hidden_size = args.hidden_size
+        self.self_attn = Attention(args)
+        self.mlp = MLP(args.hidden_size, args.intermediate_size)
+        self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = nn.RMSNorm(
+            args.hidden_size, eps=args.rms_norm_eps
+        )
+        self.args = args
+
+    def __call__(
+            self,
+            x: mx.array,
+            mask: Optional[mx.array] = None,
+            cache: Optional[Tuple[mx.array, mx.array]] = None,
+    ) -> mx.array:
+        """Run the layer and return ``(output, cache)``.
+
+        The returned ``cache`` is whatever the attention module returned as
+        its second value.
+        """
+        attn_out, cache = self.self_attn(self.input_layernorm(x), mask, cache)
+        hidden = x + attn_out
+        mlp_out = self.mlp(self.post_attention_layernorm(hidden))
+        return hidden + mlp_out, cache
+
+
+class LlamaModel(nn.Module):
+    """Token embedding, a stack of ``TransformerBlock`` layers and a final RMSNorm."""
+
+    def __init__(self, args: ModelArgs):
+        """Build the embedding, ``args.num_hidden_layers`` blocks and the output norm."""
+        super().__init__()
+        self.args = args
+        self.vocab_size = args.vocab_size
+        self.num_hidden_layers = args.num_hidden_layers
+        assert self.vocab_size > 0
+        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.layers = [
+            TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
+        ]
+        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+    def __call__(
+            self,
+            inputs: mx.array,
+            cache=None,
+    ):
+        """Embed ``inputs``, run every layer and return ``(normed_hidden, cache)``.
+
+        ``cache`` is a list with one entry per layer; when a list is passed in
+        it is updated in place with each layer's new cache entry.
+        """
+        hidden = self.embed_tokens(inputs)
+
+        # A causal mask is only built when more than one position is processed.
+        seq_len = hidden.shape[1]
+        if seq_len > 1:
+            mask = nn.MultiHeadAttention.create_additive_causal_mask(seq_len)
+            mask = mask.astype(hidden.dtype)
+        else:
+            mask = None
+
+        if cache is None:
+            cache = [None] * len(self.layers)
+
+        for idx, block in enumerate(self.layers):
+            hidden, cache[idx] = block(hidden, mask, cache[idx])
+
+        return self.norm(hidden), cache
+
+
+class Model(nn.Module):
+    """``LlamaModel`` backbone followed by a bias-free linear language-model head."""
+
+    def __init__(self, args: ModelArgs):
+        """Create the backbone and the ``hidden_size -> vocab_size`` head."""
+        super().__init__()
+        self.model = LlamaModel(args)
+        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
+
+    def __call__(
+            self,
+            inputs: mx.array,
+            cache=None,
+    ):
+        """Return ``(logits, cache)`` for the given token ids."""
+        hidden, cache = self.model(inputs, cache)
+        logits = self.lm_head(hidden)
+        return logits, cache
+
+
+def build_parser():
+    """Create the command-line parser for fine-tuning, testing and generation.
+
+    Returns:
+        argparse.ArgumentParser: parser with the model, generation, training
+        and evaluation options registered in a fixed order.
+    """
+    parser = argparse.ArgumentParser(description="LoRA or QLoRA finetuning.")
+    add = parser.add_argument
+
+    add("--model", default="mlx_model",
+        help="The path to the local model directory or Hugging Face repo.")
+
+    # Generation args
+    add("--max-tokens", "-m", type=int, default=100,
+        help="The maximum number of tokens to generate")
+    add("--temp", type=float, default=0.8, help="The sampling temperature")
+    add("--prompt", "-p", type=str, help="The prompt for generation", default=None)
+
+    # Training args
+    add("--train", action="store_true", help="Do training")
+    add("--data", type=str, default="data/",
+        help="Directory with {train, valid, test}.json files")
+    add("--lora-layers", type=int, default=16, help="Number of layers to fine-tune")
+    add("--batch-size", type=int, default=4, help="Minibatch size.")
+    add("--iters", type=int, default=1000, help="Iterations to train for.")
+    add("--val-batches", type=int, default=25,
+        help="Number of validation batches, -1 uses the entire validation set.")
+    add("--learning-rate", type=float, default=1e-5, help="Adam learning rate.")
+    add("--steps-per-report", type=int, default=10,
+        help="Number of training steps between loss reporting.")
+    add("--steps-per-eval", type=int, default=200,
+        help="Number of training steps between validations.")
+    add("--resume-adapter-file", type=str, default=None,
+        help="Load path to resume training with the given adapter weights.")
+    add("--adapter-file", type=str, default="adapters.npz",
+        help="Save/load path for the trained adapter weights.")
+    add("--save-every", type=int, default=100,
+        help="Save the model every N iterations.")
+
+    # Evaluation args
+    add("--test", action="store_true", help="Evaluate on the test set after training")
+    add("--test-batches", type=int, default=500,
+        help="Number of test set batches, -1 uses the entire test set.")
+    add("--seed", type=int, default=0, help="The PRNG seed")
+    return parser
+
+
+class ConversationDataset:
+    """In-memory dataset read from a JSON-lines file.
+
+    Each non-blank line of the file is parsed as one JSON object. Indexing
+    returns the record's ``content`` and ``summary`` fields as a tuple.
+    """
+
+    def __init__(self, path: Path):
+        """Load every record from ``path``.
+
+        Args:
+            path: JSON-lines file, one JSON object per line.
+
+        Raises:
+            OSError: if the file cannot be opened.
+            json.JSONDecodeError: if a non-blank line is not valid JSON.
+        """
+        # Read explicitly as UTF-8 so non-ASCII text does not depend on the
+        # platform's default encoding, and skip blank lines (e.g. a trailing
+        # newline at the end of the file) instead of failing to parse them.
+        with open(path, "r", encoding="utf-8") as fid:
+            self._data = [json.loads(line) for line in fid if line.strip()]
+
+    def __getitem__(self, idx: int):
+        """Return ``(content, summary)`` for record ``idx``; missing fields become ``""``."""
+        entry = self._data[idx]
+        content = entry.get("content", "")
+        summary = entry.get("summary", "")
+        return content, summary
+
+    def __len__(self):
+        """Return the number of loaded records."""
+        return len(self._data)
+
+
+def load(args):
+    """Load the train, validation and test datasets from ``args.data``.
+
+    The training set is read from ``train.json``; both the validation and the
+    test set are read from ``dev.json``.
+
+    Args:
+        args: parsed CLI arguments; uses ``data``, ``train`` and ``test``.
+
+    Returns:
+        Tuple ``(train, valid, test)`` of ``ConversationDataset`` objects.
+
+    Raises:
+        ValueError: if a dataset required by ``--train`` / ``--test`` is empty.
+        Exception: any error raised while building a dataset is re-raised
+            after a message naming the file has been printed.
+    """
+    data_dir = Path(args.data)
+
+    def load_and_check(name):
+        dataset_path = data_dir / f"{name}.json"
+        try:
+            dataset = ConversationDataset(dataset_path)
+        except Exception as e:
+            print(f"Unable to build dataset {dataset_path} ({e})")
+            raise
+        return dataset
+
+    train = load_and_check("train")
+    valid = load_and_check("dev")
+    test = load_and_check("dev")
+
+    if args.train:
+        if len(train) == 0:
+            raise ValueError(
+                "Training set not found or empty. Must provide training set for fine-tuning."
+            )
+        if len(valid) == 0:
+            raise ValueError(
+                "Validation set not found or empty. Must provide validation set for fine-tuning."
+            )
+    if args.test and len(test) == 0:
+        raise ValueError(
+            "Test set not found or empty. Must provide test set for evaluation."
+        )
+    return train, valid, test
+
+
+def loss(model, inputs, targets, lengths):
+    """Length-masked mean cross-entropy of ``model`` on one batch.
+
+    Args:
+        model: callable returning ``(logits, cache)`` for ``inputs``.
+        inputs: batch of input token ids.
+        targets: batch of target token ids.
+        lengths: per-row lengths; positions at or beyond a row's length are
+            masked out of the loss.
+
+    Returns:
+        Tuple ``(mean_loss, ntoks)`` where ``ntoks`` is the number of unmasked
+        positions.
+    """
+    logits, _ = model(inputs)
+    logits = logits.astype(mx.float32)
+
+    # True where the column index is smaller than the row's length.
+    positions = mx.arange(inputs.shape[1])
+    valid = positions[None, :] < lengths[:, None]
+
+    per_token = nn.losses.cross_entropy(logits, targets) * valid
+    ntoks = valid.sum()
+    return per_token.sum() / ntoks, ntoks
+
+
+def iterate_batches(dset, tokenizer, batch_size, train=False):
+    """Yield tokenized, zero-padded batches from ``dset``.
+
+    Args:
+        dset: indexable dataset; each item is passed to ``tokenizer.encode``.
+        tokenizer: object with an ``encode`` method returning token ids.
+        batch_size: number of examples per batch; a trailing partial batch is
+            dropped.
+        train: when true, shuffle the order with NumPy on every pass and loop
+            over the dataset forever; otherwise make a single ordered pass.
+
+    Yields:
+        Tuple ``(inputs, targets, lengths)``: the padded batch without its last
+        column, the padded batch without its first column, and the unpadded
+        token count of every example.
+
+    Raises:
+        ValueError: if the dataset holds fewer than ``batch_size`` examples.
+    """
+    # Without this guard no batch can ever be formed: in training mode the
+    # outer loop would then spin forever without yielding anything.
+    if len(dset) < batch_size:
+        raise ValueError(
+            f"Dataset must have at least batch_size={batch_size} "
+            f"examples but only has {len(dset)}."
+        )
+
+    while True:
+        # Shuffle indices
+        indices = np.arange(len(dset))
+        if train:
+            indices = np.random.permutation(indices)
+
+        # Collect batches from dataset
+        for i in range(0, len(indices) - batch_size + 1, batch_size):
+            # Encode batch
+            # NOTE(review): when dset is a ConversationDataset each item is a
+            # (content, summary) tuple, so the tuple itself is handed to
+            # tokenizer.encode — confirm the tokenizer treats it as intended.
+            batch = [tokenizer.encode(dset[indices[i + j]]) for j in range(batch_size)]
+            lengths = [len(x) for x in batch]
+            # Check if any sequence is longer than 2048 tokens
+            if max(lengths) > 2048:
+                print(
+                    "[WARNING] Some sequences are longer than 2048 tokens. "
+                    "Consider pre-splitting your data to save memory."
+                )
+
+            # Pad to the max length
+            batch_arr = np.zeros((batch_size, max(lengths)), np.int32)
+
+            for j in range(batch_size):
+                batch_arr[j, : lengths[j]] = batch[j]
+            batch = mx.array(batch_arr)
+            yield batch[:, :-1], batch[:, 1:], mx.array(lengths)
+
+        if not train:
+            break
+
+
+def load_model(path_or_hf_repo: str):
+    """Load the model, tokenizer and raw config from a directory or Hub repo.
+
+    Args:
+        path_or_hf_repo: an existing local path, or otherwise a Hugging Face
+            repo id that is downloaded with ``snapshot_download``.
+
+    Returns:
+        Tuple ``(model, tokenizer, config)`` where ``config`` is the parsed
+        ``config.json`` dict.
+
+    Raises:
+        FileNotFoundError: if the model directory contains no
+            ``*.safetensors`` file.
+    """
+    model_path = Path(path_or_hf_repo)
+    if not model_path.exists():
+        # Not a local path: fetch the config, weight and tokenizer files.
+        downloaded = snapshot_download(
+            repo_id=path_or_hf_repo,
+            allow_patterns=["*.json", "*.safetensors", "tokenizer.model"],
+        )
+        model_path = Path(downloaded)
+
+    with open(model_path / "config.json", "r") as f:
+        config = json.load(f)
+    quantization = config.get("quantization", None)
+
+    weight_files = glob.glob(str(model_path / "*.safetensors"))
+    if not weight_files:
+        raise FileNotFoundError("No safetensors found in {}".format(model_path))
+
+    # Merge all weight shards into one name -> array mapping.
+    weights = {}
+    for weight_file in weight_files:
+        weights.update(mx.load(weight_file).items())
+
+    model = Model(ModelArgs.from_dict(config))
+    if quantization is not None:
+        # The module is quantized before the weights are loaded. Linear
+        # layers whose weight has exactly 8 output rows are left unquantized.
+        # NOTE(review): the reason for the value 8 is not visible here — confirm.
+        def should_quantize(m):
+            return isinstance(m, nn.Linear) and m.weight.shape[0] != 8
+
+        nn.QuantizedLinear.quantize_module(
+            model,
+            **quantization,
+            linear_class_predicate=should_quantize,
+        )
+
+    model.load_weights(list(weights.items()))
+
+    mx.eval(model.parameters())
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+    return model, tokenizer, config
+
+
+def generate(
+        prompt: mx.array, model: nn.Module, temp: float = 0.0
+) -> Generator[mx.array, None, None]:
+    """
+    Endlessly sample next tokens from ``model``, starting from ``prompt``.
+
+    Note: a later module-level function in this file is also named
+    ``generate`` and rebinds the name once the module has finished loading.
+
+    Args:
+        prompt (mx.array): The input prompt token ids.
+        model (nn.Module): The model to use for generation.
+        temp (float): The sampling temperature. If temp is 0, the argmax of
+            the logits is taken instead of sampling.
+
+    Yields:
+        mx.array: The next sampled token.
+    """
+
+    def sample(logits: mx.array) -> mx.array:
+        if temp == 0:
+            return mx.argmax(logits, axis=-1)
+        return mx.random.categorical(logits * (1 / temp))
+
+    token = prompt
+    cache = None
+    while True:
+        # The first call feeds the whole prompt; later calls feed only the
+        # previously sampled token together with the cache.
+        logits, cache = model(token[None], cache=cache)
+        token = sample(logits[:, -1, :])
+        yield token
+
+
+def evaluate(model, dataset, loss, tokenizer, batch_size, num_batches):
+    """Compute the token-weighted mean loss of ``model`` over ``dataset``.
+
+    Args:
+        model: model passed through to ``loss``.
+        dataset: dataset to evaluate on.
+        loss: callable ``loss(model, inputs, targets, lengths)`` returning
+            ``(mean_loss, ntoks)``.
+        tokenizer: tokenizer used to build the batches.
+        batch_size: number of examples per batch.
+        num_batches: maximum number of batches to evaluate; ``-1`` evaluates
+            one full pass over the dataset.
+
+    Returns:
+        The sum of per-batch ``mean_loss * ntoks`` divided by the total token
+        count, or NaN when no tokens were evaluated.
+    """
+    import itertools
+
+    all_losses = []
+    ntokens = 0
+
+    # range(-1) is empty, so the documented "-1 = entire set" needs an
+    # unbounded counter; the single non-training pass of iterate_batches then
+    # ends the loop.
+    index_iterator = itertools.count() if num_batches == -1 else range(num_batches)
+
+    for _, batch in zip(
+            index_iterator,
+            iterate_batches(dataset, tokenizer, batch_size),
+    ):
+        losses, toks = loss(model, *batch)
+        all_losses.append((losses * toks).item())
+        ntokens += toks.item()
+
+    # No batch was evaluated: report NaN explicitly rather than dividing 0 by 0.
+    if ntokens == 0:
+        return float("nan")
+
+    return np.sum(all_losses) / ntokens
+
+
+def train(model, train_set, val_set, optimizer, loss, tokenizer, args):
+    """Run the training loop for ``args.iters`` iterations.
+
+    Prints the mean training loss and throughput every
+    ``args.steps_per_report`` steps, the validation loss after the first step
+    and then every ``args.steps_per_eval`` steps, and saves the trainable
+    parameters to ``args.adapter_file`` every ``args.save_every`` steps.
+
+    Args:
+        model: model to train; updated in place by the optimizer.
+        train_set: dataset used for training batches.
+        val_set: dataset used for validation.
+        optimizer: optimizer exposing ``update(model, grad)`` and ``state``.
+        loss: callable ``loss(model, inputs, targets, lengths)`` returning
+            ``(mean_loss, ntoks)``.
+        tokenizer: tokenizer used to build the batches.
+        args: parsed CLI arguments.
+    """
+    # Create value and grad function for loss
+    loss_value_and_grad = nn.value_and_grad(model, loss)
+
+    # Accumulators for the current reporting window.
+    losses = []
+    n_tokens = 0
+
+    # Main training loop
+    start = time.perf_counter()
+    for it, batch in zip(
+            range(args.iters),
+            iterate_batches(train_set, tokenizer, args.batch_size, train=True),
+    ):
+        # Forward and backward pass
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+
+        # Model update
+        optimizer.update(model, grad)
+        mx.eval(model.parameters(), optimizer.state, lvalue)
+
+        # Record loss
+        losses.append(lvalue.item())
+        n_tokens += toks.item()
+
+        if (it + 1) % args.steps_per_report == 0:
+            train_loss = np.mean(losses)
+
+            stop = time.perf_counter()
+            print(
+                f"Iter {it + 1}: Train loss {train_loss:.3f}, "
+                f"It/sec {args.steps_per_report / (stop - start):.3f}, "
+                f"Tokens/sec {float(n_tokens) / (stop - start):.3f}"
+            )
+            # Start a new reporting window.
+            losses = []
+            n_tokens = 0
+            start = time.perf_counter()
+
+        # Report validation loss if needed
+        if it == 0 or (it + 1) % args.steps_per_eval == 0:
+            stop = time.perf_counter()
+            val_loss = evaluate(
+                model, val_set, loss, tokenizer, args.batch_size, args.val_batches
+            )
+            print(
+                f"Iter {it + 1}: "
+                f"Val loss {val_loss:.3f}, "
+                f"Val took {(time.perf_counter() - stop):.3f}s"
+            )
+
+            # Restart the timer so validation time is not counted as training time.
+            start = time.perf_counter()
+
+        # Save adapter weights if needed
+        if (it + 1) % args.save_every == 0:
+            mx.savez(
+                args.adapter_file, **dict(tree_flatten(model.trainable_parameters()))
+            )
+            print(f"Iter {it + 1}: Saved adapter weights to {args.adapter_file}.")
+
+
+def _generate_tokens(prompt, model, temp=0.0):
+    """Endlessly yield next tokens sampled from ``model``, starting from ``prompt``.
+
+    With ``temp == 0`` the argmax of the last-position logits is taken;
+    otherwise a token is sampled from the logits scaled by ``1 / temp``.
+    """
+    y = prompt
+    cache = None
+    while True:
+        logits, cache = model(y[None], cache=cache)
+        logits = logits[:, -1, :]
+        if temp == 0:
+            y = mx.argmax(logits, axis=-1)
+        else:
+            y = mx.random.categorical(logits * (1 / temp))
+        yield y
+
+
+def generate(model, prompt, tokenizer, args):
+    """Print ``prompt`` followed by the model's continuation, streamed to stdout.
+
+    Generation stops at the tokenizer's EOS token or after
+    ``args.max_tokens`` tokens, whichever comes first.
+
+    Args:
+        model: model used for generation.
+        prompt: prompt text.
+        tokenizer: tokenizer providing ``encode``, ``decode`` and
+            ``eos_token_id``.
+        args: parsed CLI arguments; uses ``temp`` and ``max_tokens``.
+    """
+    print(prompt, end="", flush=True)
+
+    prompt = mx.array(tokenizer.encode(prompt))
+
+    tokens = []
+    skip = 0
+    # This function shares its name with the token-level generator, so
+    # calling generate(...) here would call this function again with the
+    # wrong arguments; use the private helper instead. The range comes first
+    # in zip so no extra token is computed once max_tokens is reached.
+    for _, token in zip(
+            range(args.max_tokens),
+            _generate_tokens(prompt, model, args.temp),
+    ):
+        if token == tokenizer.eos_token_id:
+            break
+
+        tokens.append(token.item())
+        s = tokenizer.decode(tokens)
+        # Hold back the last decoded character until more text follows.
+        if len(s) - skip > 1:
+            print(s[skip:-1], end="", flush=True)
+            skip = len(s) - 1
+    print(tokenizer.decode(tokens)[skip:], flush=True)
+    print("=" * 10)
+    if len(tokens) == 0:
+        print("No tokens generated for this prompt")
+        return
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options, load the model, attach the
+    # LoRA layers, then optionally train, test and generate.
+    parser = build_parser()
+    args = parser.parse_args()
+
+    # Seeds NumPy, which iterate_batches uses to shuffle the training order.
+    # NOTE(review): MLX's own random state is not seeded here — confirm
+    # whether --seed is also meant to cover it.
+    np.random.seed(args.seed)
+
+    print("Loading pretrained model")
+    model, tokenizer, _ = load_model(args.model)
+
+    # Freeze all layers other than LORA linears
+    model.freeze()
+    # Wrap q_proj / v_proj of the last `--lora-layers` blocks. If the option
+    # exceeds the number of layers the start index becomes negative and the
+    # slice selects fewer layers than requested.
+    for l in model.model.layers[len(model.model.layers) - args.lora_layers:]:
+        l.self_attn.q_proj = LoRALinear.from_linear(l.self_attn.q_proj)
+        l.self_attn.v_proj = LoRALinear.from_linear(l.self_attn.v_proj)
+        # NOTE(review): TransformerBlock in this file sets no
+        # `block_sparse_moe` attribute, so this branch looks inactive here.
+        if hasattr(l, "block_sparse_moe"):
+            l.block_sparse_moe.gate = LoRALinear.from_linear(l.block_sparse_moe.gate)
+
+    # Report parameter counts in millions.
+    p = sum(v.size for _, v in tree_flatten(model.parameters())) / 10 ** 6
+    print(f"Total parameters {p:.3f}M")
+    p = sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10 ** 6
+    print(f"Trainable parameters {p:.3f}M")
+
+    print("Loading datasets")
+    train_set, valid_set, test_set = load(args)
+
+    # Resume training the given adapters.
+    if args.resume_adapter_file is not None:
+        print(f"Loading pretrained adapters from {args.resume_adapter_file}")
+        model.load_weights(args.resume_adapter_file, strict=False)
+
+    if args.train:
+        print("Training")
+        opt = optim.Adam(learning_rate=args.learning_rate)
+
+        # Train model
+        train(model, train_set, valid_set, opt, loss, tokenizer, args)
+
+        # Save adapter weights
+        mx.savez(args.adapter_file, **dict(tree_flatten(model.trainable_parameters())))
+
+    # Load the LoRA adapter weights which we assume should exist by this point
+    if not Path(args.adapter_file).is_file():
+        raise ValueError(
+            f"Adapter file {args.adapter_file} missing. "
+            "Use --train to learn and save the adapters.npz."
+        )
+    model.load_weights(args.adapter_file, strict=False)
+
+    if args.test:
+        print("Testing")
+        model.eval()
+        test_loss = evaluate(
+            model,
+            test_set,
+            loss,
+            tokenizer,
+            args.batch_size,
+            num_batches=args.test_batches,
+        )
+        # Perplexity is the exponential of the mean loss.
+        test_ppl = math.exp(test_loss)
+
+        print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")
+
+    if args.prompt is not None:
+        print("Generating")
+        generate(model, args.prompt, tokenizer, args)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,12 @@
+transformers>=4.39.1
+torch>=2.2.0
+triton>=2.2.0
+httpx>=0.27.0
+gradio>=4.26.0
+flash_attn>=2.4.1
+accelerate>=0.29.2
+sentence_transformers>=2.6.1
+sse_starlette>=2.1.0
+tiktoken>=0.6.0
+mlx_lm>=0.8.0
+openai>=0.16.2