Learn how to scrape websites ethically, parse unstructured content with LLMs, and build reliable data pipelines in Python, using modern, compliance-minded tooling.
First, install Playwright's browser binaries from your shell:

playwright install
from playwright.async_api import async_playwright
import asyncio

async def scrape_page(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")
        # Respect robots.txt (manual check recommended)
        content = await page.content()
        await browser.close()
        return content

# Run
html = asyncio.run(scrape_page("https://example.com"))
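The robots.txt check mentioned in the comment can also be automated with the standard library's urllib.robotparser. A minimal sketch, assuming a hypothetical "my-scraper" user-agent string:

from urllib import robotparser
from urllib.parse import urlparse, urljoin

def is_allowed(url, user_agent="my-scraper"):  # user-agent is illustrative
    # Fetch and parse the site's robots.txt before scraping
    parts = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(f"{parts.scheme}://{parts.netloc}", "/robots.txt"))
    rp.read()
    return rp.can_fetch(user_agent, url)

if is_allowed("https://example.com"):
    html = asyncio.run(scrape_page("https://example.com"))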
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from bs4 import BeautifulSoup

llm = ChatOpenAI(model="gpt-4o", temperature=0)

prompt = ChatPromptTemplate.from_template(
    """Extract the following from this HTML page as JSON, using exactly these keys:
- "article_title"
- "publication_date"
- "author_name"
- "main_content_summary" (max 200 words)
- "key_entities" (people, organizations, locations)

HTML:
{html}

Output only valid JSON."""
)

chain = prompt | llm | JsonOutputParser()

soup = BeautifulSoup(html, "lxml")
clean_html = soup.prettify()[:8000]  # truncate for token limit

result = chain.invoke({"html": clean_html})
print(result)
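Note that pinning the key names in the prompt matters: the pipeline below indexes into the result by those exact keys. JsonOutputParser can still raise if the model returns malformed JSON, so for production use it can be worth enforcing the schema rather than requesting it in prose. A sketch using LangChain's with_structured_output with a Pydantic model (the ArticleData class and its fields are our assumption, mirroring the prompt above):

from pydantic import BaseModel, Field

class ArticleData(BaseModel):
    article_title: str
    publication_date: str
    author_name: str
    main_content_summary: str = Field(description="Max 200 words")
    key_entities: list[str] = Field(description="People, organizations, locations")

# The model output is constrained to this schema, so there is no raw JSON to parse
structured_llm = llm.with_structured_output(ArticleData)
article = structured_llm.invoke(f"Extract article metadata from this HTML:\n{clean_html}")
print(article.article_title)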
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

# Embeddings (a dedicated embedding model such as nomic-embed-text also works well)
embeddings = OllamaEmbeddings(model="llama3.1:8b")

# Vector store
vectorstore = Chroma(collection_name="articles", embedding_function=embeddings)

# Pipeline
async def pipeline(url):
    html = await scrape_page(url)
    # Clean and truncate the HTML before sending it to the LLM, as above
    clean_html = BeautifulSoup(html, "lxml").prettify()[:8000]
    parsed = chain.invoke({"html": clean_html})
    # Store in vector DB
    doc = Document(
        page_content=parsed["main_content_summary"],
        metadata={
            "title": parsed["article_title"],
            "url": url,
            "date": parsed["publication_date"],
        },
    )
    vectorstore.add_documents([doc])
    return parsed

# Run
result = asyncio.run(pipeline("https://example.com/article"))
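With the summaries embedded in Chroma, downstream retrieval is a one-liner via similarity search. A quick usage sketch (the query text is illustrative):

# Return the three stored articles most similar to a natural-language query
hits = vectorstore.similarity_search("supply chain disruptions", k=3)
for hit in hits:
    print(hit.metadata["title"], hit.metadata["url"])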