Python Utils
Use Case: Reusable Python utilities for data extraction, JSON/YAML transformation, bookmark parsing, and MCP tool scaffolding. These scripts bridge between web data and MCP tool inputs.
Prerequisites
- Python 3.10+
- httpx, beautifulsoup4, pyyaml, lxml (pip install httpx beautifulsoup4 pyyaml lxml)
Setup & Configuration
Set up a virtual environment and install dependencies:
Terminal (bash)
python3 -m venv .venv
source .venv/bin/activate
pip install httpx beautifulsoup4 pyyaml lxml
Core Implementation
Extract page metadata and content for use with MCP bookmark tools:
utils/page_extractor.py (python)
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json
import re
def extract_page_metadata(url: str) -> dict:
    """Fetch a page and extract its title, description, and a text snippet.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A dict with the keys ``title``, ``url``, ``description``,
        ``snippet`` (at most 300 characters of whitespace-normalised text
        taken from the page after scripts, styles, nav and footer elements
        are removed) and ``domain``.

    Raises:
        httpx.HTTPStatusError: If the response has a 4xx/5xx status
            (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MCPBookmarkBot/1.0)"
    }
    with httpx.Client(timeout=10.0, follow_redirects=True) as client:
        resp = client.get(url, headers=headers)
        resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "lxml")
    domain = urlparse(url).netloc

    # ``soup.title.string`` is None when <title> is empty or contains nested
    # markup, so calling ``.strip()`` on it would raise AttributeError.
    # get_text() handles both cases; fall back to the domain when no usable
    # title text exists.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = domain

    meta_desc = soup.find("meta", attrs={"name": "description"})
    # ``or ""`` guards against a content attribute that parsed to None.
    description = (meta_desc.get("content") or "") if meta_desc else ""

    # Strip scripts, styles and page chrome so the snippet is readable text.
    for tag in soup(["script", "style", "nav", "footer"]):
        tag.decompose()
    text = re.sub(r"\s+", " ", soup.get_text(separator=" ", strip=True))

    return {
        "title": title,
        "url": url,
        "description": description,
        "snippet": text[:300],
        "domain": domain,
    }
if __name__ == "__main__":
import sys
result = extract_page_metadata(sys.argv[1])
print(json.dumps(result, indent=2))
Parse Chrome bookmarks export (HTML format) into MCP-ready JSON:
utils/bookmark_parser.py (python)
from bs4 import BeautifulSoup
import json
from pathlib import Path
def parse_chrome_bookmarks(html_path: str) -> list[dict]:
    """Parse Chrome exported bookmarks HTML into structured JSON.

    Folders become ``{"folder": name, "children": [...]}`` and bookmarks
    become ``{"title", "url", "add_date", "tags"}`` dicts, in document order.

    Nesting is derived from the ``<dl>`` element that encloses each entry
    rather than from ``<dt>`` nesting. The export leaves ``<dt>`` tags
    unclosed, so how ``<dt>`` elements nest depends on the HTML parser,
    whereas ``<dl>`` tags are explicitly closed and nest reliably.

    Args:
        html_path: Path to the exported bookmarks HTML file.

    Returns:
        The list of top-level folders and bookmarks.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (NOTE(review): Chrome exports declare charset=UTF-8 — confirm
    # if other browsers' exports must be supported).
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    bookmarks: list[dict] = []
    # id(<dl> element) -> "children" list of the folder that owns that <dl>.
    # Keyed by id() because the lookup must be by element identity.
    children_by_dl: dict[int, list] = {}

    def container_for(node) -> list:
        """Return the children list of the innermost folder enclosing node."""
        for parent in node.parents:
            if parent.name == "dl" and id(parent) in children_by_dl:
                return children_by_dl[id(parent)]
        return bookmarks

    for node in soup.find_all(["h3", "a"]):
        # Only entries that live inside a <dt> are bookmark/folder records.
        if node.find_parent("dt") is None:
            continue

        if node.name == "h3":
            folder = {"folder": node.get_text(strip=True), "children": []}
            container_for(node).append(folder)
            # The folder's contents are the <dl> that follows its heading.
            listing = node.find_next_sibling("dl")
            if listing is not None:
                children_by_dl[id(listing)] = folder["children"]
            continue

        href = node.get("href")
        if not href:
            continue
        raw_tags = node.get("tags")
        container_for(node).append({
            "title": node.get_text(strip=True),
            "url": href,
            "add_date": node.get("add_date", ""),
            "tags": raw_tags.split(",") if raw_tags else [],
        })

    return bookmarks
def to_mcp_tool_input(bookmarks: list[dict]) -> list[dict]:
"""Flatten nested bookmarks into MCP tool-compatible entries."""
flat = []
def walk(items, folder=""):
for item in items:
if "folder" in item:
walk(item["children"], item["folder"])
else:
flat.append({
"title": item["title"],
"url": item["url"],
"folder": folder,
})
walk(bookmarks)
return flat
if __name__ == "__main__":
result = parse_chrome_bookmarks("bookmarks.html")
mcp_ready = to_mcp_tool_input(result)
print(json.dumps(mcp_ready, indent=2))
Convert between JSON and YAML for MCP tool configuration files:
utils/config_converter.py (python)
import json
import yaml
from pathlib import Path
def load_mcp_config(path: str) -> dict:
    """Load MCP server config from JSON or YAML.

    The format is chosen from the file extension: ``.yaml``/``.yml``
    (case-insensitive) is parsed as YAML, anything else as JSON.

    Args:
        path: Path to the configuration file.

    Returns:
        The parsed configuration. An empty YAML document yields ``{}``
        rather than ``None`` so callers can always index the result.

    Raises:
        json.JSONDecodeError: If a non-YAML file does not contain valid JSON.
    """
    p = Path(path)
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    raw = p.read_text(encoding="utf-8")
    if p.suffix.lower() in (".yaml", ".yml"):
        # safe_load returns None for an empty document.
        return yaml.safe_load(raw) or {}
    return json.loads(raw)
def save_mcp_config(config: dict, path: str) -> None:
    """Save MCP config, inferring format from extension.

    ``.yaml``/``.yml`` (case-insensitive) is written as block-style YAML;
    any other extension is written as JSON indented by two spaces.

    Args:
        config: Configuration mapping to serialise.
        path: Destination file path; an existing file is overwritten.
    """
    p = Path(path)
    if p.suffix.lower() in (".yaml", ".yml"):
        text = yaml.dump(config, default_flow_style=False)
    else:
        text = json.dumps(config, indent=2)
    # Encode explicitly as UTF-8 instead of the platform default encoding.
    p.write_text(text, encoding="utf-8")
def merge_tool_defs(base: dict, tool_defs: list[dict]) -> dict:
    """Merge new MCP tool definitions into an existing config.

    A tool is appended only if no tool with the same ``"name"`` is already
    present, either in ``base["tools"]`` or earlier in ``tool_defs``.
    Existing definitions are never overwritten.

    Args:
        base: Config dict; its ``"tools"`` list is extended in place.
        tool_defs: Candidate tool definitions, each with a ``"name"`` key.

    Returns:
        The same ``base`` object, with ``base["tools"]`` set.

    Raises:
        KeyError: If any tool definition lacks a ``"name"`` key.
    """
    existing = base.get("tools")
    if existing is None:
        # Covers both a missing key and an explicit ``"tools": null``.
        existing = []
    names = {t["name"] for t in existing}
    for t in tool_defs:
        if t["name"] not in names:
            existing.append(t)
            # Record the name so a later duplicate in tool_defs is skipped.
            names.add(t["name"])
    base["tools"] = existing
    return base
if __name__ == "__main__":
config = load_mcp_config("mcp_config.json")
config["version"] = "2.0.0"
save_mcp_config(config, "mcp_config.yaml")
Deployment Notes
- Rate limits: HTTPX requests can trigger rate limits on target sites. Add a time.sleep(1) between requests or use asyncio with semaphores.
- Encoding: Some sites serve gzipped content. HTTPX handles this automatically, but urllib users need to set Accept-Encoding.
- JavaScript-rendered pages: BeautifulSoup only sees the static HTML. For SPAs, pipe through playwright or use the web_search MCP tool instead.
- Chrome bookmark format: The exported HTML uses <DT><H3> for folder depth. The parser handles nesting up to 10 levels.