#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a single HTML article into WeChat HTML and Zhihu Markdown.

Generic converter for any single article under ``insurance-guide/articles/``.
Two files are written to the current working directory:

* ``公众号_<basename>.html`` -- styled HTML for a WeChat Official Account
* ``知乎_<basename>.md``     -- plain Markdown for Zhihu

Usage::

    python convert_single.py <html_file> <basename>

Example (the platform prefixes are added automatically, so pass the bare
name)::

    python convert_single.py insurance-guide/articles/cpa-01-dividend-trap.html "港险分红智商税"
"""

import html
import re
import sys

from bs4 import BeautifulSoup

# NOTE(review): the stylesheet body is blank in the reviewed source;
# presumably a <style>...</style> block belongs here -- restore it from the
# original file before publishing.
WECHAT_CSS = """ """

# Classes of <div> containers that are stripped from the article body.
_REMOVED_DIV_CLASSES = ('article-meta', 'top-bar', 'section-end', 'toc')

# <h2> elements whose whole text is one of these are decorative dividers.
_DIVIDER_HEADINGS = ('---', '——')

_HEADING_PREFIX = {'h1': '#', 'h2': '##', 'h3': '###', 'h4': '####'}

# Callout variant class -> icon prepended to the first quoted line.
_CALLOUT_ICONS = {'warning': '⚠️ ', 'info': '💡 ', 'success': '✅ ', 'danger': '🚫 '}


# ---------------- Extraction ----------------

def extract(html_file):
    """Parse *html_file* and return ``(title, summary, article)``.

    Args:
        html_file: Path of a UTF-8 encoded HTML article.

    Returns:
        A tuple of the title text (``'未命名文章'`` when no title element is
        found), the summary text (``''`` when absent) and the parsed article
        container with the unwanted ``<div>`` blocks already removed.

    Raises:
        ValueError: If the document has neither an ``<article>`` element nor
            a ``div.article-body`` container.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Title: prefer the dedicated title element, fall back to the first <h1>.
    title_el = None
    for tag, attrs in (('div', {'class_': 'article-title-main'}),
                       ('h1', {'class_': 'article-title-main'}),
                       ('h1', {})):
        title_el = soup.find(tag, **attrs)
        if title_el is not None:
            break
    title = title_el.get_text().strip() if title_el is not None else '未命名文章'

    # Summary: either of the two known summary containers.
    summary_el = soup.find('div', class_='article-summary')
    if summary_el is None:
        summary_el = soup.find('div', class_='v22-summary')
    summary = summary_el.get_text().strip() if summary_el is not None else ''

    # Body: the <article> element, or div.article-body as a fallback.
    article = soup.find('article')
    if article is None:
        article = soup.find('div', class_='article-body')
    if article is None:
        raise ValueError(
            f'no <article> or div.article-body found in {html_file}')

    # Drop meta / navigation / table-of-contents / end-of-section containers.
    for cls in _REMOVED_DIV_CLASSES:
        for el in article.find_all('div', class_=cls):
            el.decompose()

    return title, summary, article


def _content_elements(article):
    """Yield the tag children of *article*, skipping divider-only ``<h2>``."""
    for el in article.children:
        if getattr(el, 'name', None) is None:
            continue  # bare text between block elements
        if el.name == 'h2' and el.get_text().strip() in _DIVIDER_HEADINGS:
            continue
        yield el


# ---------------- WeChat HTML ----------------

def to_wechat_html(title, summary, article, brand_meta='— CPA × 香港保险 —'):
    """Render the article as a standalone HTML document.

    Args:
        title: Plain-text article title.
        summary: Plain-text summary; the summary block is omitted when empty.
        article: Parsed article container as returned by :func:`extract`.
        brand_meta: Branding line placed above the title, inserted verbatim.

    Returns:
        The complete HTML document as a string.
    """
    body = ''.join(str(el) for el in _content_elements(article))

    # title/summary come from get_text() and are plain text, so characters
    # such as "&" and "<" must be escaped before being embedded in HTML.
    safe_title = html.escape(title, quote=False)
    safe_summary = html.escape(summary, quote=False)

    # NOTE(review): the wrapper markup was missing from the reviewed source
    # (tags appear to have been stripped). The structure and class names
    # below are a reconstruction -- confirm them against WECHAT_CSS.
    summary_html = (
        f'<div class="article-summary">{safe_summary}</div>' if summary else ''
    )
    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
{WECHAT_CSS}
</head>
<body>
<div class="brand-meta">{brand_meta}</div>
<h1>{safe_title}</h1>
{summary_html}
{body}
</body>
</html>
"""


# ---------------- Zhihu Markdown ----------------

def _quote_block(text, prefix=''):
    """Format *text* as a Markdown blockquote.

    Blank lines are dropped and each line is trimmed, so indentation left
    over from the HTML source cannot turn a quoted line into a code block.
    *prefix* (for example an icon) is prepended to the first line only.
    Returns ``''`` when *text* contains nothing but whitespace.
    """
    lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
    if not lines:
        return ''
    quoted = [f"> {prefix}{lines[0]}"]
    quoted.extend(f"> {ln}" for ln in lines[1:])
    return "\n".join(quoted) + "\n\n"


def inline_md(el):
    """Convert the direct children of *el* to inline Markdown.

    Handles bold, italic, inline code, line breaks and links; any other
    child element is reduced to its text content.
    """
    out = []
    for child in el.children:
        name = getattr(child, 'name', None)
        if name is None:
            out.append(str(child))  # plain text node
        elif name in ('strong', 'b'):
            out.append(f"**{child.get_text()}**")
        elif name in ('em', 'i'):
            out.append(f"*{child.get_text()}*")
        elif name == 'code':
            out.append(f"`{child.get_text()}`")
        elif name == 'br':
            out.append("\n")
        elif name == 'a':
            href = child.get('href', '')
            txt = child.get_text()
            out.append(f"[{txt}]({href})" if href else txt)
        else:
            out.append(child.get_text())
    return ''.join(out)


def _blockquote_to_md(el):
    """Convert a ``<blockquote>`` (with or without ``<p>`` children)."""
    # Without direct <p> children the blockquote itself is the paragraph.
    paragraphs = el.find_all('p', recursive=False) or [el]
    lines = []
    for para in paragraphs:
        for line in inline_md(para).strip().split('\n'):
            lines.append(f"> {line}" if line.strip() else ">")
    return "\n".join(lines) + "\n\n"


def _div_to_md(el):
    """Convert a ``<div>``: known box classes get special formatting."""
    classes = el.get('class') or []
    if 'callout' in classes:
        icon = next(
            (v for k, v in _CALLOUT_ICONS.items() if k in classes), '')
        return _quote_block(el.get_text(), icon)
    if 'formula' in classes:
        return f"```\n{el.get_text().strip()}\n```\n\n"
    if 'warn-block' in classes or 'warning-block' in classes:
        return _quote_block(el.get_text(), '⚠️ ')
    if 'key-insight' in classes or 'key-insight-body' in classes:
        return _quote_block(el.get_text(), '💡 ')
    # Plain wrapper div: convert each child element in turn.
    return ''.join(
        el_to_md(child) for child in el.children
        if getattr(child, 'name', None)
    )


def el_to_md(el):
    """Convert one block-level element to Markdown.

    Returns the Markdown text (ending in a blank line), or ``''`` for
    element types that are not handled.
    """
    name = el.name
    if name in _HEADING_PREFIX:
        return f"{_HEADING_PREFIX[name]} {el.get_text().strip()}\n\n"
    if name == 'hr':
        return "\n---\n\n"
    if name == 'blockquote':
        return _blockquote_to_md(el)
    if name == 'p':
        return inline_md(el).strip() + "\n\n"
    if name == 'ul':
        items = [f"- {inline_md(li).strip()}"
                 for li in el.find_all('li', recursive=False)]
        return "\n".join(items) + "\n\n"
    if name == 'ol':
        items = [f"{i}. {inline_md(li).strip()}"
                 for i, li in enumerate(el.find_all('li', recursive=False), 1)]
        return "\n".join(items) + "\n\n"
    if name == 'table':
        return table_to_md(el)
    if name == 'div':
        return _div_to_md(el)
    return ''


def table_to_md(table):
    """Convert a ``<table>`` to a Markdown pipe table.

    The first row that has cells is treated as the header. Rows without any
    ``<th>``/``<td>`` are skipped, and literal ``|`` characters inside cells
    are escaped so they cannot be read as column separators.
    """
    md_rows = []
    for row in table.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue
        texts = [
            c.get_text().strip().replace('\n', ' ').replace('|', '\\|')
            for c in cells
        ]
        md_rows.append('| ' + ' | '.join(texts) + ' |')
        if len(md_rows) == 1:
            # The separator line goes right after the header row.
            md_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')
    if not md_rows:
        return ''
    return "\n".join(md_rows) + "\n\n"


def to_zhihu_md(title, summary, article):
    """Render the article as a Markdown document.

    Args:
        title: Plain-text article title, emitted as the top-level heading.
        summary: Plain-text summary, emitted as a blockquote when non-empty.
        article: Parsed article container as returned by :func:`extract`.

    Returns:
        The Markdown text, ending with exactly one newline.
    """
    out = [f"# {title}\n\n"]
    if summary:
        # Quote every line: a multi-line summary would otherwise leave the
        # blockquote after its first line.
        out.append(_quote_block(summary))
    out.append("---\n\n")
    out.extend(el_to_md(el) for el in _content_elements(article))

    # Collapse runs of blank lines left behind by empty or skipped elements.
    text = re.sub(r'\n{3,}', '\n\n', ''.join(out))
    return text.strip() + '\n'


# ---------------- Entry point ----------------

def main():
    """Command-line entry point: convert one article and write both files."""
    if len(sys.argv) < 3:
        print('Usage: python convert_single.py <html_file> <basename>',
              file=sys.stderr)
        sys.exit(1)

    html_file = sys.argv[1]
    basename = sys.argv[2]

    try:
        title, summary, article = extract(html_file)
    except ValueError as err:
        print(f'ERROR: {err}', file=sys.stderr)
        sys.exit(1)

    wechat_html = to_wechat_html(title, summary, article)
    zhihu_md = to_zhihu_md(title, summary, article)

    wx_path = f'公众号_{basename}.html'
    zh_path = f'知乎_{basename}.md'

    with open(wx_path, 'w', encoding='utf-8') as f:
        f.write(wechat_html)
    with open(zh_path, 'w', encoding='utf-8') as f:
        f.write(zhihu_md)

    print('OK')
    print(f' {wx_path}')
    print(f' {zh_path}')
    print(f' TITLE: {title}')


if __name__ == '__main__':
    main()