文档转换

# 文档转换

Markdown→HTML、HTML→PDF、Word→PDF、pandoc 万能文档转换器。

# 一、Markdown → HTML

pip install markdown

#!/usr/bin/env python3
import markdown

# Read the source explicitly as UTF-8: the platform default encoding
# (locale-dependent, e.g. on Windows) can mis-decode non-ASCII Markdown.
with open('README.md', encoding='utf-8') as f:
    md_text = f.read()

# ---- Basic conversion ----
html = markdown.markdown(md_text)

# ---- Conversion with extensions (replaces the basic result above) ----
html = markdown.markdown(md_text, extensions=[
    'tables',            # GitHub-style tables
    'fenced_code',       # fenced code blocks ```
    'codehilite',        # code highlighting
    'toc',               # table-of-contents generation
    'footnotes',         # footnotes
])

# ---- Build a complete HTML page around the converted fragment ----
full_html = f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<style>body{{max-width:800px;margin:auto;padding:20px}}</style>
</head><body>{html}</body></html>"""

# The page declares charset="utf-8", so the bytes on disk must be UTF-8 too.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(full_html)
print("✅ README.md → output.html")

# 二、HTML → PDF

# 安装依赖
# macOS: brew install wkhtmltopdf
# Linux: apt install wkhtmltopdf
pip install pdfkit

#!/usr/bin/env python3
import pdfkit

# ---- HTML file → PDF ----
pdfkit.from_file('output.html', 'output.pdf')

# ---- HTML string → PDF (writes to the same output.pdf as the step above) ----
html = '<h1>Hello PDF</h1><p>This is a test</p>'
pdfkit.from_string(html, 'output.pdf')

# ---- URL → PDF ----
pdfkit.from_url('https://example.com', 'webpage.pdf')

# ---- With options ----
# Top and bottom margins share one value, so name it once.
page_margin = '20mm'
options = {
    'page-size': 'A4',
    'margin-top': page_margin,
    'margin-bottom': page_margin,
    'encoding': 'UTF-8',
    'no-outline': None,  # presumably a value-less flag; confirm in pdfkit docs
}
pdfkit.from_file('report.html', 'report.pdf', options=options)

# 三、Word → PDF

# macOS / Windows（需已安装 Microsoft Word）
pip install docx2pdf

# Linux（需要 LibreOffice）
# apt install libreoffice

#!/usr/bin/env python3
"""Word → PDF: docx2pdf on macOS/Windows, headless LibreOffice on Linux."""
import subprocess
import sys

if sys.platform.startswith('linux'):
    # ---- Linux: LibreOffice command line ----
    # docx2pdf relies on Microsoft Word, which is not available on Linux,
    # so convert with LibreOffice in headless mode instead.
    # check=True raises CalledProcessError if the conversion fails.
    subprocess.run([
        'libreoffice', '--headless', '--convert-to', 'pdf',
        'document.docx', '--outdir', '/output'
    ], check=True)
else:
    # ---- macOS: docx2pdf ----
    # Imported here so the Linux path does not need the package installed.
    from docx2pdf import convert
    convert('document.docx', 'document.pdf')

    # Batch conversion: convert every .docx found in docs/ into pdfs/
    convert('docs/', 'pdfs/')

# 四、pandoc——万能文档转换器

# 安装
# macOS:  brew install pandoc
# Linux:  apt install pandoc
# 或下载：https://pandoc.org/installing.html

# ===== Basic usage =====
# Markdown → HTML
pandoc input.md -o output.html

# Markdown → DOCX
pandoc input.md -o output.docx

# Markdown → PDF (requires LaTeX: brew install basictex / apt install texlive-xetex)
pandoc input.md -o output.pdf --pdf-engine=xelatex

# DOCX → Markdown
pandoc input.docx -o output.md

# HTML → Markdown
pandoc input.html -t markdown -o output.md

# ===== Advanced options =====
# With a table of contents (--toc has no effect unless --standalone is given)
pandoc input.md --standalone --toc -o output.html

# Self-contained HTML (CSS and other resources embedded in the file).
# On pandoc older than 2.19 use the former spelling: --self-contained
pandoc input.md --embed-resources --standalone -o output.html

# Use a custom template
pandoc input.md --template=template.html -o output.html

# ===== List of supported formats =====
pandoc --list-output-formats
# Output: markdown html pdf docx epub rst latex ...

# 五、Python pandoc 集成

pip install pypandoc

#!/usr/bin/env python3
"""Python 调用 pandoc——程序化文档转换"""
import pypandoc

# ---- File conversion ----
output = pypandoc.convert_file('README.md', 'html')
# Write as UTF-8 explicitly; the locale default encoding may be unable to
# encode characters in the converted text.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(output)

# ---- String conversion ----
html = pypandoc.convert_text('# Hello', 'html', format='markdown')

# ---- Batch conversion ----
import glob
import os

for md in glob.glob('posts/*.md'):
    # Swap only the final extension. str.replace('.md', '.html') would also
    # rewrite any earlier ".md" in the path (e.g. "posts/a.md.notes.md").
    out = os.path.splitext(md)[0] + '.html'
    pypandoc.convert_file(md, 'html', outputfile=out)
    print(f"✅ {md} → {out}")

# 六、实战：文档发布流水线

#!/usr/bin/env python3
"""Markdown 发布流水线: .md → .html → .pdf"""
import markdown, os, sys

def publish(md_path: str, output_dir: str = 'output') -> None:
    """Publish a Markdown file as HTML, plus PDF and DOCX when possible.

    The HTML page is always written. The PDF and DOCX steps are best-effort:
    a failure in either one is reported on stdout (including the underlying
    error) and does not stop the remaining steps.

    Args:
        md_path: Path of the Markdown source file, read as UTF-8.
        output_dir: Directory that receives ``<base>.html``, ``<base>.pdf``
            and ``<base>.docx``, where ``<base>`` is the source file name
            without its extension. Created if it does not exist.

    Raises:
        OSError: If the source cannot be read or the HTML cannot be written.
        UnicodeDecodeError: If the source file is not valid UTF-8.
    """
    os.makedirs(output_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(md_path))[0]

    # Decode explicitly as UTF-8 instead of the locale default encoding.
    with open(md_path, encoding='utf-8') as f:
        md = f.read()

    # Step 1: MD → HTML
    html_content = markdown.markdown(md, extensions=['tables', 'fenced_code', 'codehilite'])
    html_full = f"""<!DOCTYPE html><html><head><meta charset="utf-8">
<style>body{{max-width:900px;margin:auto;font-family:sans-serif;padding:20px}}
pre{{background:#f5f5f5;padding:15px;border-radius:5px}}</style>
</head><body>{html_content}</body></html>"""

    html_path = os.path.join(output_dir, f'{base}.html')
    # The page declares charset="utf-8", so it must be written as UTF-8.
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_full)
    print(f"✅ {html_path}")

    # Step 2: HTML → PDF (optional, needs wkhtmltopdf)
    try:
        import pdfkit
        pdf_path = os.path.join(output_dir, f'{base}.pdf')
        pdfkit.from_string(html_full, pdf_path)
        print(f"✅ {pdf_path}")
    except Exception as exc:
        # Best-effort step: keep going, but show the real cause rather than
        # only the guessed one.
        print(f"⚠️  PDF 生成失败（可能未安装 wkhtmltopdf）: {exc}")

    # Step 3: Word (optional, needs pandoc)
    try:
        import pypandoc
        docx_path = os.path.join(output_dir, f'{base}.docx')
        pypandoc.convert_file(md_path, 'docx', outputfile=docx_path)
        print(f"✅ {docx_path}")
    except Exception as exc:
        # Best-effort step: report the real cause alongside the hint.
        print(f"⚠️  DOCX 生成失败（可能未安装 pandoc）: {exc}")

if __name__ == '__main__':
    # The Markdown path is required; the output directory is optional.
    # Exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <markdown-file> [output-dir]")
    publish(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else 'output')

#工具 #格式

上次更新: 2026/06/28, 17:55:19

← 图片转换 | 批量重命名 →