文档转换
# 文档转换
Markdown→HTML、HTML→PDF、Word→PDF、pandoc 万能文档转换器。
# 一、Markdown → HTML
pip install markdown
1
#!/usr/bin/env python3
"""Convert README.md into a standalone HTML page with Python-Markdown."""
import markdown

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('README.md', encoding='utf-8') as f:
    md_text = f.read()

# ---- Basic conversion ----
html = markdown.markdown(md_text)

# ---- Conversion with extensions ----
# This result replaces the basic one above and is what ends up in the page.
html = markdown.markdown(md_text, extensions=[
    'tables',       # GitHub-style tables
    'fenced_code',  # fenced code blocks ```
    'codehilite',   # code highlighting
    'toc',          # table-of-contents generation
    'footnotes',    # footnotes
])

# ---- Build a complete HTML page ----
full_html = f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<style>body{{max-width:800px;margin:auto;padding:20px}}</style>
</head><body>{html}</body></html>"""

# The page declares charset="utf-8", so the file must be written as UTF-8 too.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(full_html)
print("✅ README.md → output.html")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# 二、HTML → PDF
# 安装依赖
# macOS: brew install wkhtmltopdf
# Linux: apt install wkhtmltopdf
pip install pdfkit
1
2
3
4
2
3
4
#!/usr/bin/env python3
"""Render PDFs from an HTML file, an HTML string and a URL with pdfkit."""
import pdfkit

# ---- HTML file → PDF ----
pdfkit.from_file('output.html', 'output.pdf')

# ---- HTML string → PDF ----
# Written to its own file so it does not overwrite output.pdf produced above.
html = '<h1>Hello PDF</h1><p>This is a test</p>'
pdfkit.from_string(html, 'hello.pdf')

# ---- URL → PDF ----
pdfkit.from_url('https://example.com', 'webpage.pdf')

# ---- With options ----
options = {
    'page-size': 'A4',
    'margin-top': '20mm',
    'margin-bottom': '20mm',
    'encoding': 'UTF-8',
    'no-outline': None,  # option that takes no value
}
pdfkit.from_file('report.html', 'report.pdf', options=options)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 三、Word → PDF
# macOS / Windows(docx2pdf 需要本机已安装 Microsoft Word)
pip install docx2pdf
# Linux(需要 LibreOffice)
# apt install libreoffice
1
2
3
4
5
2
3
4
5
#!/usr/bin/env python3
"""Word → PDF: docx2pdf on macOS, LibreOffice on Linux."""
# ---- macOS: docx2pdf ----
from docx2pdf import convert
convert('document.docx', 'document.pdf')
# Batch conversion
convert('docs/', 'pdfs/') # converts every .docx in the directory
# ---- Linux: LibreOffice command line ----
# Alternative for Linux, left commented out; it needs `import subprocess`
# when enabled.
# subprocess.run([
#     'libreoffice', '--headless', '--convert-to', 'pdf',
#     'document.docx', '--outdir', '/output'
# ])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 四、pandoc——万能文档转换器
# 安装
# macOS: brew install pandoc
# Linux: apt install pandoc
# 或下载:https://pandoc.org/installing.html
# ===== Basic usage =====
# Markdown → HTML
pandoc input.md -o output.html
# Markdown → DOCX
pandoc input.md -o output.docx
# Markdown → PDF (requires LaTeX: brew install basictex / apt install texlive-xetex)
pandoc input.md -o output.pdf --pdf-engine=xelatex
# DOCX → Markdown
pandoc input.docx -o output.md
# HTML → Markdown
pandoc input.html -t markdown -o output.md
# ===== Advanced options =====
# With a table of contents (--toc only takes effect together with -s/--standalone)
pandoc input.md -s --toc -o output.html
# Self-contained HTML (CSS embedded); --self-contained is deprecated since
# pandoc 2.19 in favour of --embed-resources --standalone
pandoc input.md --embed-resources --standalone -o output.html
# Use a custom template
pandoc input.md --template=template.html -o output.html
# ===== List supported formats =====
pandoc --list-output-formats
# Output: markdown html pdf docx epub rst latex ...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# 五、Python pandoc 集成
pip install pypandoc
1
#!/usr/bin/env python3
"""Call pandoc from Python for programmatic document conversion."""
import glob
import os

import pypandoc

# ---- File conversion ----
output = pypandoc.convert_file('README.md', 'html')
# Write explicitly as UTF-8 so non-ASCII text survives on any platform.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(output)

# ---- String conversion ----
html = pypandoc.convert_text('# Hello', 'html', format='markdown')

# ---- Batch conversion ----
for md in glob.glob('posts/*.md'):
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite any earlier ".md" occurring in the path.
    out = os.path.splitext(md)[0] + '.html'
    pypandoc.convert_file(md, 'html', outputfile=out)
    print(f"✅ {md} → {out}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 六、实战:文档发布流水线
#!/usr/bin/env python3
"""Markdown 发布流水线: .md → .html → .pdf"""
import markdown, os, sys
def publish(md_path, output_dir='output'):
    """Publish a Markdown file as HTML, plus PDF and DOCX on a best-effort basis.

    The HTML page is always written. The PDF and DOCX steps are optional:
    any failure there is reported as a warning and does not stop the
    pipeline.

    Args:
        md_path: Path of the Markdown source file.
        output_dir: Directory that receives the generated files; created
            if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(md_path))[0]

    # Read explicitly as UTF-8 instead of relying on the locale default.
    with open(md_path, encoding='utf-8') as f:
        md = f.read()

    # Step 1: MD → HTML
    html_content = markdown.markdown(md, extensions=['tables', 'fenced_code', 'codehilite'])
    html_full = f"""<!DOCTYPE html><html><head><meta charset="utf-8">
<style>body{{max-width:900px;margin:auto;font-family:sans-serif;padding:20px}}
pre{{background:#f5f5f5;padding:15px;border-radius:5px}}</style>
</head><body>{html_content}</body></html>"""
    html_path = os.path.join(output_dir, f'{base}.html')
    # The page declares charset="utf-8", so the file must be UTF-8 as well.
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_full)
    print(f"✅ {html_path}")

    # Step 2: HTML → PDF (optional, needs wkhtmltopdf)
    try:
        import pdfkit
        pdf_path = os.path.join(output_dir, f'{base}.pdf')
        pdfkit.from_string(html_full, pdf_path)
        print(f"✅ {pdf_path}")
    except Exception as exc:
        # Stay best-effort, but surface the actual reason instead of hiding it.
        print(f"⚠️ PDF 生成失败(可能未安装 wkhtmltopdf): {exc}")

    # Step 3: Word (optional, needs pandoc)
    try:
        import pypandoc
        docx_path = os.path.join(output_dir, f'{base}.docx')
        pypandoc.convert_file(md_path, 'docx', outputfile=docx_path)
        print(f"✅ {docx_path}")
    except Exception as exc:
        # Stay best-effort, but surface the actual reason instead of hiding it.
        print(f"⚠️ DOCX 生成失败(可能未安装 pandoc): {exc}")
if __name__ == '__main__':
    # Exit with a usage hint instead of an IndexError traceback when the
    # required Markdown path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <markdown-file> [output-dir]")
    publish(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else 'output')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
上次更新: 2026/06/17, 12:47:39