snippets/python-utils

Python Utils

Use Case: Reusable Python utilities for data extraction, JSON/YAML transformation, bookmark parsing, and MCP tool scaffolding. These scripts bridge between web data and MCP tool inputs.

Prerequisites

  • Python 3.10+
  • httpx, beautifulsoup4, pyyaml, lxml (pip install httpx beautifulsoup4 pyyaml lxml)

Setup & Configuration

Set up a virtual environment and install dependencies:

Terminal (bash)
python3 -m venv .venv
source .venv/bin/activate
pip install httpx beautifulsoup4 pyyaml lxml

Core Implementation

Extract page metadata and content for use with MCP bookmark tools:

utils/page_extractor.py (python)
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json
import re

def extract_page_metadata(url: str) -> dict:
    """Fetch a page and extract title, description, and text content.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A dict with the keys ``title``, ``url``, ``description``,
        ``snippet`` (at most 300 characters of whitespace-normalized page
        text) and ``domain``. ``title`` falls back to the URL's network
        location when the page has no usable ``<title>``.

    Raises:
        httpx.HTTPStatusError: If the response has a 4xx/5xx status
            (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MCPBookmarkBot/1.0)"
    }

    with httpx.Client(timeout=10.0, follow_redirects=True) as client:
        resp = client.get(url, headers=headers)
        resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "lxml")
    domain = urlparse(url).netloc

    # ``soup.title.string`` is None when <title> is empty or contains child
    # tags, so read the text via get_text() and fall back to the domain.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = domain

    description = ""
    meta_desc = soup.find("meta", attrs={"name": "description"})
    if meta_desc:
        # A <meta name="description"> without a content attribute yields "".
        description = meta_desc.get("content") or ""

    # Drop non-content elements so the snippet only contains readable text.
    for tag in soup(["script", "style", "nav", "footer"]):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)
    snippet = re.sub(r"\s+", " ", text)[:300]

    return {
        "title": title,
        "url": url,
        "description": description,
        "snippet": snippet,
        "domain": domain,
    }

if __name__ == "__main__":
    import sys

    # Fail with a usage hint instead of an IndexError when no URL is given.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} URL")

    result = extract_page_metadata(sys.argv[1])
    print(json.dumps(result, indent=2))

Parse Chrome bookmarks export (HTML format) into MCP-ready JSON:

utils/bookmark_parser.py (python)
from bs4 import BeautifulSoup
import json
from pathlib import Path

def parse_chrome_bookmarks(html_path: str) -> list[dict]:
    """Parse Chrome exported bookmarks HTML into structured JSON.

    Folder nesting is derived from how many ``<dl>`` elements enclose each
    ``<h3>`` (folder heading) or ``<a>`` (bookmark). The export format does
    not close its ``<dt>`` tags, so ``<dt>`` nesting in the parsed tree is
    not a reliable indicator of folder structure, whereas every ``<dl>`` is
    explicitly closed.

    Args:
        html_path: Path to the exported bookmarks HTML file.

    Returns:
        A list of top-level items. A folder is
        ``{"folder": name, "children": [...]}``; a bookmark is
        ``{"title", "url", "add_date", "tags"}`` where ``tags`` is a list of
        strings (empty when the link has no ``tags`` attribute).
    """
    # The export declares charset=UTF-8; do not rely on the platform default.
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    bookmarks = []
    # Each entry is (depth of the <dl> that holds the children, children list).
    # The root entry is never popped.
    stack = [(0, bookmarks)]

    for tag in soup.find_all(["h3", "a"]):
        # Only headings/links that are list entries count as folders/bookmarks.
        if tag.find_parent("dt") is None:
            continue

        is_folder = tag.name == "h3"
        if not is_folder and not tag.get("href"):
            continue

        depth = len(tag.find_parents("dl"))

        # Leave every folder whose child list is deeper than this item.
        while len(stack) > 1 and stack[-1][0] > depth:
            stack.pop()
        siblings = stack[-1][1]

        if is_folder:
            folder = {"folder": tag.get_text(strip=True), "children": []}
            siblings.append(folder)
            # The folder's items live in the <dl> one level below its heading.
            stack.append((depth + 1, folder["children"]))
        else:
            siblings.append({
                "title": tag.get_text(strip=True),
                "url": tag["href"],
                "add_date": tag.get("add_date", ""),
                "tags": tag.get("tags", "").split(",") if tag.get("tags") else [],
            })

    return bookmarks

def to_mcp_tool_input(bookmarks: list[dict]) -> list[dict]:
    """Flatten a nested bookmark tree into a flat list of entries.

    Every bookmark becomes ``{"title", "url", "folder"}``, where ``folder``
    is the name of the folder that directly contains it (``""`` for
    top-level bookmarks). Entries keep depth-first document order.
    """

    def _entries(nodes, parent_name):
        # Lazily yield one flat entry per bookmark beneath ``nodes``.
        for node in nodes:
            if "folder" not in node:
                yield {
                    "title": node["title"],
                    "url": node["url"],
                    "folder": parent_name,
                }
                continue
            # Descend; children are labelled with this folder's own name.
            yield from _entries(node["children"], node["folder"])

    return list(_entries(bookmarks, ""))

if __name__ == "__main__":
    # Demo: parse ./bookmarks.html, flatten the tree, and print it as JSON.
    tree = parse_chrome_bookmarks("bookmarks.html")
    print(json.dumps(to_mcp_tool_input(tree), indent=2))

Convert between JSON and YAML for MCP tool configuration files:

utils/config_converter.py (python)
import json
import yaml
from pathlib import Path

def load_mcp_config(path: str) -> dict:
    """Load MCP server config from JSON or YAML.

    The format is chosen from the file extension (case-insensitive):
    ``.yaml``/``.yml`` is parsed as YAML, anything else as JSON.

    Args:
        path: Path to the config file; read as UTF-8.

    Returns:
        The parsed config. An empty YAML document yields ``{}``.

    Raises:
        json.JSONDecodeError: If a non-YAML file is not valid JSON.
    """
    p = Path(path)
    raw = p.read_text(encoding="utf-8")
    if p.suffix.lower() in (".yaml", ".yml"):
        loaded = yaml.safe_load(raw)
        # safe_load returns None for an empty document; keep the dict contract.
        return {} if loaded is None else loaded
    return json.loads(raw)

def save_mcp_config(config: dict, path: str) -> None:
    """Save MCP config, inferring format from extension.

    ``.yaml``/``.yml`` (case-insensitive) is written as block-style YAML;
    any other extension is written as JSON indented by two spaces.

    Args:
        config: The config mapping to serialize.
        path: Destination file; overwritten if it exists, written as UTF-8.
    """
    p = Path(path)
    # Compare the lowered suffix so ".YAML"/".YML" are not written as JSON.
    if p.suffix.lower() in (".yaml", ".yml"):
        p.write_text(yaml.dump(config, default_flow_style=False), encoding="utf-8")
    else:
        p.write_text(json.dumps(config, indent=2), encoding="utf-8")

def merge_tool_defs(base: dict, tool_defs: list[dict]) -> dict:
    """Merge new MCP tool definitions into an existing config.

    A definition is appended only if no tool with the same ``name`` is
    already present, either in ``base`` or earlier in ``tool_defs``; the
    first definition seen for a name wins.

    Args:
        base: Config to extend. It is modified in place; a missing or
            ``None`` ``"tools"`` entry is treated as an empty list.
        tool_defs: Tool definitions, each a dict with a ``"name"`` key.

    Returns:
        ``base`` itself, with ``base["tools"]`` holding the merged list.

    Raises:
        KeyError: If any existing or new tool definition has no ``"name"``.
    """
    existing = base.get("tools") or []
    names = {t["name"] for t in existing}
    for t in tool_defs:
        name = t["name"]
        if name in names:
            continue
        existing.append(t)
        # Record the name so a later duplicate in tool_defs is skipped too.
        names.add(name)
    base["tools"] = existing
    return base

if __name__ == "__main__":
    # Demo: read the JSON config, bump its version, and write it out as YAML.
    mcp_settings = load_mcp_config("mcp_config.json")
    mcp_settings["version"] = "2.0.0"
    save_mcp_config(mcp_settings, "mcp_config.yaml")

Deployment Notes

  • Rate limits: HTTPX requests can trigger rate limits on target sites. Add a time.sleep(1) between requests or use asyncio with semaphores.
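  • Compression: Some sites serve gzipped content. HTTPX decompresses it automatically; urllib does not, so urllib users who send an Accept-Encoding: gzip header must decompress the response body themselves (e.g. with the gzip module).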
  • JavaScript-rendered pages: BeautifulSoup only sees the static HTML. For SPAs, pipe through playwright or use the web_search MCP tool instead.
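  • Chrome bookmark format: The exported HTML marks each folder with a <DT><H3> heading followed by a nested <DL> list that holds its children, so folder depth is the nesting depth of the <DL> lists. The parser tracks nesting with a stack; the code sets no fixed depth limit.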