Run powerful LLMs completely offline on your own hardware and expose them as web apps and APIs using FastAPI or Flask.
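Both examples below assume Python 3.10+ and a handful of pip packages; the names are the standard PyPI ones, but adjust versions to your own environment:

# Install the web frameworks and the LangChain Ollama integration
pip install fastapi "uvicorn[standard]" flask langchain-ollama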
# Install Ollama (one-time)
# Go to https://ollama.com/download → install for your OS
# Start the Ollama server in a terminal
ollama serve
# In another terminal, pull a model (the 8B variant is a good balance of quality and speed)
ollama pull llama3.1:8b
# Test it
ollama run llama3.1:8b
# An interactive >>> prompt opens; type a message to chat with the model
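Before adding any web framework, it helps to confirm the Ollama server answers over HTTP. It listens on http://localhost:11434 and exposes a /api/generate endpoint; with stream set to false it returns a single JSON object whose response field holds the model's answer:

# Sanity-check the raw Ollama REST API
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.1:8b",
  "prompt": "Why is the sky blue?",
  "stream": false
}'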
# app/main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from langchain_ollama import ChatOllama

app = FastAPI(title="Local Ollama API 2026")
llm = ChatOllama(model="llama3.1:8b", base_url="http://localhost:11434")

class ChatRequest(BaseModel):
    prompt: str
    system: str = "You are a helpful assistant."

@app.post("/chat")
async def chat(request: ChatRequest):
    try:
        # ainvoke is the async counterpart of invoke, so the event loop is not blocked;
        # the system prompt and user prompt are passed as (role, content) chat messages
        response = await llm.ainvoke([
            ("system", request.system),
            ("human", request.prompt),
        ])
        return {"response": response.content}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run from the project root: uvicorn app.main:app --reload
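With the FastAPI server running (uvicorn serves on port 8000 by default), the endpoint can be exercised from another terminal; the prompt and system message here are just placeholders:

# Call the FastAPI /chat endpoint
curl -X POST http://localhost:8000/chat \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Give me one fun fact about llamas.", "system": "You are a concise assistant."}'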
# app.py (Flask version)
from flask import Flask, request, jsonify
from langchain_ollama import ChatOllama

app = Flask(__name__)
llm = ChatOllama(model="llama3.1:8b")

@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt")
    system = data.get("system", "You are a helpful assistant.")
    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400
    try:
        # Same message format as the FastAPI version: (role, content) tuples
        response = llm.invoke([("system", system), ("human", prompt)])
        return jsonify({"response": response.content})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    app.run(debug=True, port=5000)
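The Flask version behaves the same way on port 5000; the second request below simply demonstrates the 400 response when no prompt is supplied:

# Call the Flask /chat endpoint
curl -X POST http://localhost:5000/chat \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Summarize what Ollama does in one sentence."}'

# Missing prompt -> {"error": "No prompt provided"} with HTTP 400
curl -X POST http://localhost:5000/chat \
  -H "Content-Type: application/json" \
  -d '{}'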
A minimal HTML front end ("Local AI Chat" / "Local Ollama Chat") can sit on top of either server by posting the user's message to the /chat endpoint and rendering the returned response in the browser.