Combine vision and text in a single model: describe images, answer visual questions, understand charts and documents, and analyze screenshots — all in Python, using OpenAI's GPT-4o and Anthropic's Claude 3.5 Sonnet.
# Vision example 1: describe a chart image with OpenAI GPT-4o.
from openai import OpenAI
import base64
import os

# The API key is read from the environment — never hard-code secrets.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes base64-encoded as a UTF-8 string."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
# Example: ask GPT-4o to interpret a chart image.
image_path = "chart-example.jpg"
base64_image = encode_image(image_path)

# The image is embedded as a base64 data URL, so it never needs public hosting.
text_part = {"type": "text", "text": "Describe this chart and explain the key insights."}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
}

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": [text_part, image_part]}],
    max_tokens=500,
)
print(response.choices[0].message.content)
# Vision example 2: extract structured fields from a PDF with Claude 3.5 Sonnet.
from anthropic import Anthropic
import base64

client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# PDFs are sent inline, base64-encoded, as a "document" content part.
with open("invoice.pdf", "rb") as f:
    pdf_data = base64.b64encode(f.read()).decode("utf-8")

document_part = {
    "type": "document",
    "source": {
        "type": "base64",
        "media_type": "application/pdf",
        "data": pdf_data,
    },
}
prompt_part = {
    "type": "text",
    "text": "Extract key information: invoice number, date, total amount, vendor name.",
}

message = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1024,
    messages=[{"role": "user", "content": [document_part, prompt_part]}],
)
print(message.content[0].text)
# Vision example 3: a FastAPI endpoint that describes an uploaded image.
from fastapi import FastAPI, UploadFile, File
from openai import OpenAI
import base64

app = FastAPI()
# No explicit api_key: the OpenAI client reads OPENAI_API_KEY from the environment.
client = OpenAI()
@app.post("/describe-image")
async def describe_image(file: UploadFile = File(...)):
    """Accept an uploaded image and return a GPT-4o description of it.

    The image bytes are inlined as a base64 data URL, so the file never
    needs to be publicly hosted.

    Returns:
        dict: ``{"description": <model output text>}``.
    """
    image_data = await file.read()
    base64_image = base64.b64encode(image_data).decode("utf-8")
    # Fix: the data URL previously hard-coded image/jpeg regardless of what
    # was uploaded, mislabeling PNG/WebP/GIF files. Use the client-supplied
    # content type, falling back to JPEG when none is provided.
    media_type = file.content_type or "image/jpeg"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{media_type};base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return {"description": response.choices[0].message.content}
Base64 encoding is used for image input throughout, so no publicly accessible URLs are needed.