🧠 LFM2
Collection
LFM2 is a new generation of hybrid models, designed for on-device deployment. • 28 items • Updated • 153
ONNX export of LFM2-24B-A2B for cross-platform inference.
LFM2-MoE is a Mixture of Experts model with 24B total parameters and ~2B active parameters per token. It uses 64 experts with 4 experts activated per token, combining the efficiency of sparse models with the quality of larger dense models.
| Precision | Size | Use Case |
|---|---|---|
| Q4F16 | ~13GB | Recommended (Q4 MoE + FP16 dense) |
| FP16 | ~44GB | Higher quality |
onnx/
├── model_fp16.onnx   # FP16
└── model_q4f16.onnx  # Q4 MoE experts + FP16 dense (recommended)
pip install onnxruntime transformers numpy huggingface_hub
# or with GPU support:
pip install onnxruntime-gpu transformers numpy huggingface_hub
from transformers import AutoConfig, AutoTokenizer
import onnxruntime
import numpy as np
from huggingface_hub import snapshot_download

# 1. Load config, tokenizer, and model
model_id = "LiquidAI/LFM2-24B-A2B-ONNX"
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
eos_token_id = config.eos_token_id

filename = "model_q4f16.onnx"  # Options: "model_fp16.onnx", "model_q4f16.onnx"
# FIX: interpolate the chosen filename (was a literal "(unknown)" placeholder).
# The trailing "*" also matches external-weight shards (e.g. "model_q4f16.onnx_data").
model_path = snapshot_download(repo_id=model_id, allow_patterns=f"onnx/{filename}*")  # Download the graph + weights
session = onnxruntime.InferenceSession(f"{model_path}/onnx/{filename}")

# 2. Prepare inputs
prompt = "What is C. elegans?"
messages = [{"role": "user", "content": prompt}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np")
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
batch_size = input_ids.shape[0]
# Only the final position's logits are needed for greedy decoding.
num_logits_to_keep = np.array(1, dtype=np.int64)

# Build zero-length caches for every past_* input the graph declares.
# LFM2 is a hybrid model: attention layers carry a KV cache, conv layers a conv state.
past_cache_values = {}
for inp in session.get_inputs():
    name = inp.name
    shape = inp.shape
    dtype = np.float32 if inp.type == "tensor(float)" else np.float16
    if name.startswith("past_key_values"):
        # Attention KV cache: shape [batch_size, num_kv_heads, 0, head_dim]
        # (sequence axis starts empty and grows each step via the returned present_* tensors)
        past_cache_values[name] = np.zeros([batch_size, shape[1], 0, shape[3]], dtype=dtype)
    elif name.startswith("past_conv"):
        # Conv cache: shape [batch_size, hidden_size, conv_L_cache]
        past_cache_values[name] = np.zeros([batch_size, shape[1], shape[2]], dtype=dtype)

# 3. Generation loop (greedy decoding)
max_new_tokens = 1024
generated_tokens = np.array([[]], dtype=np.int64)
for i in range(max_new_tokens):
    logits, *present_cache_values = session.run(None, dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_logits_to_keep=num_logits_to_keep,
        **past_cache_values,
    ))

    ## Update values for next generation loop
    # Greedy pick of the next token; after the first step only this one token is fed back.
    input_ids = logits[:, -1].argmax(-1, keepdims=True)
    attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1)
    # session.get_inputs() order matches the present_* output order, so zip by position.
    for j, key in enumerate(past_cache_values):
        past_cache_values[key] = present_cache_values[j]
    generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1)
    if np.isin(input_ids, eos_token_id).any():
        break

    ## (Optional) Streaming
    print(tokenizer.decode(input_ids[0]), end='', flush=True)
print()

# 4. Output result
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
This model is released under the LFM 1.0 License.
Base model
LiquidAI/LFM2-24B-A2B