编码转义
# 编码转义
自动检测编码(chardet)、URL 编码/解码、HTML 实体转义、Shell iconv 编码转换。
# 一、编码检测与转换
# 1.1 chardet——自动检测文件编码
pip install chardet
1
#!/usr/bin/env python3
"""自动检测编码并正确读取文件"""
import chardet
def read_file_smart(filepath):
    """Read a file as text after auto-detecting its encoding with chardet.

    The file is read as raw bytes, chardet guesses the encoding, the guess
    and its confidence are printed, and the bytes are then decoded.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded file contents as a str.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are invalid in the detected encoding.
    """
    with open(filepath, 'rb') as f:
        raw = f.read()
    result = chardet.detect(raw)
    encoding = result['encoding']
    confidence = result['confidence']
    print(f"检测编码: {encoding} (置信度: {confidence:.0%})")
    if encoding is None:
        # No usable guess was produced; bytes.decode(None) would raise
        # TypeError, so fall back to UTF-8 and substitute undecodable bytes
        # instead of crashing.
        return raw.decode('utf-8', errors='replace')
    return raw.decode(encoding)
# ---- Example ----
# Read a file of unknown encoding and preview its first 200 characters.
text = read_file_smart('unknown_file.txt')
print(text[:200])
# ===== 批量转换文件编码 =====
def convert_encoding(input_path, output_path, to_encoding='utf-8'):
    """Re-encode a file, detecting its current encoding with chardet.

    Args:
        input_path: Path of the file to convert.
        output_path: Path the converted file is written to. May be the same
            as input_path for an in-place conversion.
        to_encoding: Target encoding name. Defaults to 'utf-8'.

    Raises:
        OSError: If either file cannot be opened, read, or written.
        UnicodeDecodeError: If the input is invalid in the detected encoding.
        UnicodeEncodeError: If the text cannot be represented in to_encoding.
    """
    with open(input_path, 'rb') as f:
        raw = f.read()
    # Fall back to GBK when detection yields no encoding.
    detected = chardet.detect(raw)['encoding'] or 'gbk'
    # Finish decoding and encoding BEFORE opening the output: opening with
    # 'w' truncates immediately, so a conversion error would otherwise leave
    # an empty file (and destroy the source when converting in place).
    converted = raw.decode(detected).encode(to_encoding)
    # Write bytes so the original line endings are preserved exactly rather
    # than being rewritten by text-mode newline translation.
    with open(output_path, 'wb') as f:
        f.write(converted)
    print(f"✅ {detected} → {to_encoding}: {output_path}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# 1.2 Python encode/decode
#!/usr/bin/env python3
# ---- Converting between str and bytes ----
text = "你好"
utf8_bytes = text.encode('utf-8') # b'\xe4\xbd\xa0\xe5\xa5\xbd'
gbk_bytes = text.encode('gbk') # b'\xc4\xe3\xba\xc3'
decoded = utf8_bytes.decode('utf-8') # '你好'
# ---- Handling common decoding errors ----
# errors='ignore' drops undecodable bytes; errors='replace' substitutes � for them
broken = b'\xc0\xaf'.decode('utf-8', errors='replace') # '��'
safe = b'\xc0\xaf'.decode('utf-8', errors='ignore') # ''
# ---- 检测 BOM 头 ----
def has_bom(data):
    """Identify a Unicode byte-order mark (BOM) at the start of *data*.

    Args:
        data: A bytes-like object supporting ``startswith`` (e.g. the first
            bytes of a file read in binary mode).

    Returns:
        'UTF-8-BOM', 'UTF-16-LE', 'UTF-16-BE', 'UTF-32-LE' or 'UTF-32-BE'
        when the matching BOM is present, otherwise None.
    """
    # The UTF-32-LE BOM (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE),
    # so the 4-byte signatures must be tested before the 2-byte ones or
    # UTF-32-LE data would be misreported as UTF-16-LE.
    signatures = (
        (b'\x00\x00\xfe\xff', 'UTF-32-BE'),
        (b'\xff\xfe\x00\x00', 'UTF-32-LE'),
        (b'\xef\xbb\xbf', 'UTF-8-BOM'),
        (b'\xff\xfe', 'UTF-16-LE'),
        (b'\xfe\xff', 'UTF-16-BE'),
    )
    for bom, label in signatures:
        if data.startswith(bom):
            return label
    return None
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 二、URL 编码 / HTML 转义
# 2.1 URL 编码(百分号编码)
#!/usr/bin/env python3
from urllib.parse import quote, unquote, urlencode, parse_qs
# ---- str <-> percent-encoded URL text ----
text = "你好 world"
encoded = quote(text) # '%E4%BD%A0%E5%A5%BD%20world'
decoded = unquote(encoded) # '你好 world'
# Leave '/' unencoded (useful for paths)
quote("a/b c", safe='/') # 'a/b%20c'
# ---- Encoding query parameters ----
params = {'q': '你好', 'page': '1'}
query = urlencode(params) # 'q=%E4%BD%A0%E5%A5%BD&page=1'
# ---- Parsing a query string ----
parsed = parse_qs('q=%E4%BD%A0%E5%A5%BD&page=1')
print(parsed) # {'q': ['你好'], 'page': ['1']}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 2.2 HTML 实体转义
#!/usr/bin/env python3
import html
# ---- Escaping: neutralise markup characters to prevent XSS ----
dangerous = '<script>alert("XSS")</script>'
safe = html.escape(dangerous)
print(safe)  # &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;
# ---- Unescaping: turn HTML entities back into characters ----
# The input must contain entities; already-plain markup would pass through
# unchanged and demonstrate nothing.
original = html.unescape('&lt;div&gt;Hello&lt;/div&gt;')
print(original)  # <div>Hello</div>
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
# 三、Shell 编码转换
#!/bin/bash
# ===== iconv: convert a file from one encoding to another =====
iconv -f GBK -t UTF-8 gbk_file.txt > utf8_file.txt
# ===== Detect a file's encoding =====
# NOTE(review): the MIME flag may be spelled differently (-I vs -i) across file(1) implementations — confirm for the target platform
file -I unknown.txt # shows the charset
# Output: unknown.txt: text/plain; charset=utf-8
# ===== Batch-convert every GBK *.txt file in the current directory to UTF-8 =====
# Each input NAME.txt produces NAME_utf8.txt alongside it; inputs are left untouched.
for f in *.txt; do
    # When nothing matches, the shell leaves the literal pattern '*.txt' in $f; skip it.
    [ -e "$f" ] || continue
    # Skip outputs of a previous run so re-running never converts a file twice.
    case "$f" in
        *_utf8.txt) continue ;;
    esac
    # Redirect stdout instead of using -o: -o is not part of POSIX iconv.
    iconv -f GBK -t UTF-8 "$f" > "${f%.txt}_utf8.txt"
done
# ===== URL encoding / decoding =====
# Encode
python3 -c "import urllib.parse; print(urllib.parse.quote('你好世界'))"
# Decode
python3 -c "import urllib.parse; print(urllib.parse.unquote('%E4%BD%A0%E5%A5%BD'))"
# ===== HTML escaping =====
python3 -c "import html; print(html.escape('<script>'))"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
上次更新: 2026/06/17, 12:47:39