Learn how to fine-tune large language models efficiently on consumer hardware using LoRA and QLoRA. Full code examples with Llama 3, Mistral, and more.
pip install torch transformers peft bitsandbytes accelerate datasets trl
pip install unsloth  # optional (2026 favorite)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
# Load model & tokenizer (4-bit quantized)
model_name = "meta-llama/Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
load_in_4bit=True,
device_map="auto"
)
# LoRA config
lora_config = LoraConfig(
r=16, # rank
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
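A quick sanity check: PEFT can report how many parameters are actually trainable after wrapping; for r=16 on just the query and value projections this is well under 1% of the 8B base model.
model.print_trainable_parameters()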
# Dataset (Alpaca-style)
dataset = load_dataset("yahma/alpaca-cleaned", split="train")
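The alpaca-cleaned dataset keeps instruction, input, and output in separate columns, so they need to be collapsed into a single text field before supervised fine-tuning. A minimal sketch, assuming a standard Alpaca prompt template and a "text" column (recent TRL releases treat "text" as the default dataset_text_field; older ones need dataset_text_field="text" passed explicitly):
def to_text(example):
    # Collapse Alpaca-style fields into one prompt/response string (template is illustrative)
    prompt = f"### Instruction:\n{example['instruction']}\n\n"
    if example["input"]:
        prompt += f"### Input:\n{example['input']}\n\n"
    prompt += f"### Response:\n{example['output']}"
    return {"text": prompt}

dataset = dataset.map(to_text)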
# Training args
training_args = TrainingArguments(
output_dir="./lora-llama3",
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
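    # effective batch size = 4 per device x 4 accumulation steps = 16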
learning_rate=2e-4,
max_steps=500,
fp16=True,
logging_steps=10,
save_steps=100,
optim="paged_adamw_8bit"
)
# Trainer
trainer = SFTTrainer(
model=model,
args=training_args,
train_dataset=dataset,
tokenizer=tokenizer,
peft_config=lora_config
)
trainer.train()
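After training, the adapter weights can be saved on their own (they are tiny compared to the base model), and a quick generation call confirms everything is wired up; the save path and prompt below are illustrative:
# Save only the LoRA adapter weights
model.save_pretrained("lora-llama3-adapter")

# Smoke test with the same Alpaca-style prompt format used for training
prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=80)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))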
# Same as above, but with an explicit BitsAndBytesConfig (NF4 + double quantization, i.e. the QLoRA setup)
from transformers import BitsAndBytesConfig
quant_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=quant_config,
device_map="auto"
)
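For a QLoRA run on top of this 4-bit base, PEFT's prepare_model_for_kbit_training helper is commonly applied before attaching the adapters (it freezes the base weights, upcasts norm layers, and enables input gradients); a sketch reusing the lora_config from above:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)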
# Alternative: Unsloth's fast 4-bit LoRA setup
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/llama-3-8b-bnb-4bit",
max_seq_length=2048,
dtype=None,
load_in_4bit=True
)
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing=True
)
# Train with same trainer as above
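For completeness, the "same trainer as above" step with the Unsloth model (peft_config is omitted because get_peft_model has already attached the adapters):
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)
trainer.train()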
# Merge LoRA adapters into base model
# (if the base was loaded in 4-bit, it is common to reload it in fp16/bf16, re-apply the adapter, and then merge)
model = model.merge_and_unload()
# Save merged model
model.save_pretrained("merged-llama3-finetuned")
tokenizer.save_pretrained("merged-llama3-finetuned")
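The merged checkpoint is a plain Hugging Face model, so it loads for inference with no PEFT dependency:
model = AutoModelForCausalLM.from_pretrained("merged-llama3-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("merged-llama3-finetuned")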