import re
from pathlib import Path
# Page to clean, and the path presumably used for the cleaned copy (the
# write step is not visible in this part of the file).
INFILE = Path("input.html")
OUTFILE = Path("output.cleaned.html")

# Decode as UTF-8; undecodable bytes are substituted instead of raising, so
# a partly damaged export can still be processed.
with INFILE.open(encoding="utf-8", errors="replace") as _src:
    html = _src.read()
# ----------------------------
# 1) Fix common WP encoding junk
# ----------------------------
# Mojibake left behind when UTF-8 punctuation is decoded as Windows-1252.
# Every key is spelled with \u escapes so the table survives editors and
# copy/paste steps that silently "repair" the garbled characters; a repaired
# key would map a character to itself and the entry would become a no-op.
# Entries are applied in insertion order, so the longer "A-circumflex +
# space" key is tried before the bare "A-circumflex" key.
replacements = {
    "\u00e2\u20ac\u02dc": "\u2018",  # left single quotation mark
    "\u00e2\u20ac\u2122": "\u2019",  # right single quotation mark
    "\u00e2\u20ac\u0153": "\u201c",  # left double quotation mark
    # Byte 0x9D has no Windows-1252 character, so the tail of a right double
    # quotation mark arrives either as U+FFFD or as the raw control U+009D.
    "\u00e2\u20ac\ufffd": "\u201d",
    "\u00e2\u20ac\u009d": "\u201d",
    "\u00e2\u20ac\u201d": "\u2014",  # em dash
    "\u00e2\u20ac\u201c": "\u2013",  # en dash
    # NOTE(review): this key ends in a plain space as found in the file; if
    # it was meant to be a no-break space (U+00A0), restore it as "\u00a0".
    "\u00c2 ": " ",
    "\u00c2": "",  # stray lead byte of a mis-decoded two-byte sequence
}
for k, v in replacements.items():
    html = html.replace(k, v)
# ----------------------------
# 2) Update ALL event date references (best-effort)
# ----------------------------
# Regex pattern -> replacement text for every 2019 event date that should
# move to 2026. Patterns are matched case-insensitively and applied in the
# order listed: date ranges first, then the individual dates.
date_map = {
    # Full training date ranges.
    r"September\s+10\s+through\s+September\s+14,\s*2019": "March 24 through March 28, 2026",
    r"September\s+10th\s+Through\s+September\s+14th,\s*2019": "March 24th Through March 28th, 2026",
    r"September\s+10th,\s*2019\s+Through\s+September\s+14th,\s*2019": "March 24th, 2026 Through March 28th, 2026",
    # Individual dates.
    r"September\s+10th,\s*2019": "March 24th, 2026",
    r"September\s+14th,\s*2019": "March 28th, 2026",
    r"September\s+7th,\s*2019": "March 21st, 2026",
    r"September\s+7,\s*2019": "March 21, 2026",
    r"August\s+15,\s*2019": "March 10, 2026",
}
for pattern, repl in date_map.items():
    html = re.compile(pattern, flags=re.IGNORECASE).sub(repl, html)
# The letter date "August 6, 2019 8:23:AM" is deliberately left untouched:
# no pattern above matches it.
# ----------------------------
# 3) Update location / venue mentions (targeted, minimal)
# ----------------------------
# Swap the opening of the "secret training facility" sentence for the named
# venue. The match is case-insensitive and requires the exact wording.
_venue_sentence = re.compile(
    r"The training will be conducted at my secret training facility here in San Diego,",
    flags=re.IGNORECASE,
)
html = _venue_sentence.sub(
    "The training will be conducted here in San Diego, California at the Sorrento Valley Marriott,",
    html,
)

# Make the "March Training" call-to-action carry the exact dates; the rest
# of the sentence is reproduced as-is.
_cta_sentence = re.compile(
    r"Click The Link Now To Register For the\s+March Training Before It's Too Late\.",
    flags=re.IGNORECASE,
)
html = _cta_sentence.sub(
    "Click The Link Now To Register For the March 24th–28th, 2026 Training Before It's Too Late.",
    html,
)
# ----------------------------
# 4) Strip common WP junk attributes/classes and spacer noise
# ----------------------------
# Drop class attributes that carry WordPress marker classes. Each pattern
# matches a whole double-quoted class="..." attribute (including the
# whitespace before it) that mentions the marker; the entire attribute is
# replaced by one space, so any other classes listed alongside go with it.
for _wp_class_attr in (
    r'\sclass="[^"]*?\bwp-image-\d+\b[^"]*?"',           # wp-image-123
    r'\sclass="[^"]*?\balign(left|right|center)\b[^"]*?"',  # alignleft/right/center
    r'\sclass="[^"]*?\bwp-att-\d+\b[^"]*?"',             # wp-att-123
):
    html = re.sub(_wp_class_attr, ' ', html, flags=re.IGNORECASE)
# Remove redundant empty strong tags and excessive padding blocks.
# NOTE(review): the HTML tags inside this section's string literals were
# missing from the file, which left two unterminated raw strings and a
# str.replace that deleted every space in the document. The patterns below
# are reconstructed from the surviving fragments and the comments; confirm
# them against the markup you actually want removed.

# <strong> elements that contain nothing but whitespace and/or &nbsp;.
html = re.sub(r"<strong>(?:\s|&nbsp;)*</strong>", "", html, flags=re.IGNORECASE)

# Runs of two or more consecutive paragraphs that hold only a &nbsp; spacer.
html = re.sub(r"(<p[^>]*>\s*&nbsp;\s*</p>\s*){2,}", "", html, flags=re.IGNORECASE)

# Remove empty headings like <h2></h2> (same level on both tags, optional
# attributes, only whitespace in between).
# NOTE(review): nothing of the original heading pattern survived; this one is
# inferred from the comment alone.
html = re.sub(r"<h([1-6])\b[^>]*>\s*</h\1>", "", html, flags=re.IGNORECASE)
# Remove spacer tables used only for layout (simple heuristic: tables with no meaningful text)
# We'll keep tables that contain substantial content.
def strip_empty_tables(s: str) -> str:
def table_repl(match):
table = match.group(0)
text = re.sub(r"<[^>]+>", "", table)
text = re.sub(r"\s+", " ", text).strip()
# If table text is tiny or purely decorative, drop it.
if len(text) <= 10:
return ""
return table
return re.sub(r"