1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import fitz  # PyMuPDF
import os
import re


def sanitize_filename(name):
    r"""Return *name* made safe for use as a file name.

    Every character in the set ``< > : " / \ | ? *`` and every ASCII control
    character (``\x00``-``\x1f``, e.g. a line break or tab embedded in a
    bookmark title) is replaced by an underscore. Leading and trailing dots
    and spaces are then stripped.

    Args:
        name: The raw title text.

    Returns:
        The cleaned string; it may be empty if nothing usable remains.
    """
    # Control characters are included because they are rejected in file names
    # on Windows and a NUL byte makes path handling raise on any platform.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    return name.strip('. ')


def get_toc_from_pdf(input_pdf_path):
    """Read the bookmark outline (table of contents) of a PDF.

    Args:
        input_pdf_path: Path of the PDF file to open.

    Returns:
        The non-empty list produced by the document's ``get_toc()``; each
        entry has the form ``[level, title, page]``.

    Raises:
        ValueError: If the document contains no bookmarks.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()  # [[level, title, page], ...]
    finally:
        # Release the document even when reading the outline fails.
        doc.close()

    if not toc:
        raise ValueError("PDF中没有找到书签目录!")

    print(f"从PDF中读取到 {len(toc)} 个目录条目")
    return toc


def is_title_at_page_start(doc, page_idx, title, max_offset=100):
    """Report whether *title* appears near the beginning of a page's text.

    Whitespace in both the page text and the title is collapsed to single
    spaces before searching. If the full title is not found, a second search
    is made with only its first 30 characters.

    Args:
        doc: Opened document; must expose ``page_count`` and support
            ``doc[index]`` returning a page with ``get_text()``.
        page_idx: 0-based index of the page to inspect.
        title: Title text to look for.
        max_offset: The title counts as "at the start" when it begins before
            this character offset in the normalised text. The default of 100
            leaves room for running headers and similar leading text.

    Returns:
        True if the title is found before ``max_offset``. False if it is
        found later, not found at all (a warning is printed), or if
        ``page_idx`` lies outside the document.
    """
    # Reject negative indices too: indexing with them could otherwise select
    # a page counted from the end of the document instead of failing.
    if page_idx < 0 or page_idx >= doc.page_count:
        return False

    # Normalise whitespace so line breaks inside the title do not matter.
    text_clean = ' '.join(doc[page_idx].get_text().split())
    title_clean = ' '.join(title.split())

    pos = text_clean.find(title_clean)
    if pos == -1:
        # Fuzzy fallback: match on the first 30 characters of the title only.
        pos = text_clean.find(title_clean[:30])
        if pos == -1:
            print(f" ⚠️ 警告: 在页码 {page_idx+1} 中未找到标题 '{title_clean[:50]}...'")
            return False

    return pos < max_offset


def find_precise_end_page(doc, current_page, next_title, next_start_page):
    """Work out the 0-based last page of the chapter preceding *next_title*.

    Args:
        doc: Opened document exposing ``page_count``.
        current_page: Start page of the current chapter. It is accepted for
            the caller's convenience but not used in the calculation.
        next_title: Title of the following chapter.
        next_start_page: 1-based start page of the following chapter.

    Returns:
        The last page index of the document when the following chapter would
        start beyond it. Otherwise the page before the following chapter if
        that chapter's title sits at the top of its page, or the following
        chapter's own page when the two chapters share it.
    """
    following_idx = next_start_page - 1
    last_idx = doc.page_count - 1

    # The next chapter points past the end of the document.
    if following_idx > last_idx:
        return last_idx

    starts_fresh_page = is_title_at_page_start(doc, following_idx, next_title)
    # A fresh page for the next chapter means this one stops a page earlier;
    # otherwise the boundary page belongs to both chapters.
    return following_idx - 1 if starts_fresh_page else following_idx


def split_pdf_by_toc(input_pdf_path, output_dir):
    """Split a PDF into one file per bookmark entry.

    Each bookmark yields a file named ``NNN_<sanitised title>.pdf`` in
    *output_dir*, where ``NNN`` is the entry's 1-based position in the
    outline. A chapter runs from its own start page to the end page reported
    by ``find_precise_end_page`` for the following entry; the last chapter
    runs to the end of the document. Entries whose start page ends up after
    their end page are skipped with a warning. Progress and a summary are
    printed to standard output.

    Args:
        input_pdf_path: Path of the PDF to split.
        output_dir: Directory for the generated files; created if missing.

    Raises:
        ValueError: Propagated from ``get_toc_from_pdf`` when the PDF has no
            bookmarks.
    """
    toc = get_toc_from_pdf(input_pdf_path)

    doc = fitz.open(input_pdf_path)
    try:
        total_pages = doc.page_count

        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(output_dir, exist_ok=True)

        print(f"PDF总页数: {total_pages}")
        print(f"目录条目数: {len(toc)}")
        print("=" * 80)

        success_count = 0
        skip_count = 0

        for idx, (level, title, start_page) in enumerate(toc):
            start_idx = start_page - 1  # convert to 0-based

            if idx + 1 < len(toc):
                _, next_title, next_start_page = toc[idx + 1]
                end_idx = find_precise_end_page(doc, start_page, next_title, next_start_page)

                # The chapter ends on the page where the next one begins.
                is_shared = end_idx == next_start_page - 1
                share_info = " [共享页面]" if is_shared else ""
            else:
                # Last entry: runs to the end of the document.
                end_idx = total_pages - 1
                share_info = ""

            # Clamp both ends into the document's page range.
            # NOTE(review): a start page below 1 is clamped to the first page
            # rather than skipped — confirm this is intended for bookmarks
            # that have no valid target page.
            start_idx = max(0, start_idx)
            end_idx = min(total_pages - 1, end_idx)

            if start_idx > end_idx:
                print(f"⚠️ 跳过 [{level}]{title}: 起始页({start_page}) > 结束页({end_idx+1})")
                skip_count += 1
                continue

            number = f"{idx + 1:03d}"
            safe_title = sanitize_filename(title)
            filename = f"{number}_{safe_title}.pdf"
            filepath = os.path.join(output_dir, filename)

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(filepath)
            finally:
                # Close the chapter document even if copying or saving fails.
                new_doc.close()
            success_count += 1

            level_indent = " " if level == 2 else ""
            page_count = end_idx - start_idx + 1
            print(f"[{number}] {level_indent}[Lv.{level}] {title}")
            print(f" 页码: 第{start_page}页 → 第{end_idx+1}页 | 共{page_count}{share_info}")
            print()
    finally:
        # Always release the source document, including on errors mid-loop.
        doc.close()

    print("=" * 80)
    print(f"✅ 完成!成功生成: {success_count} 个文件,跳过: {skip_count} 个")
    print(f"📁 输出目录: {output_dir}")


if __name__ == "__main__":
    # Script entry point: change ``input_pdf`` to the name of your PDF file;
    # the split chapters are written into ``output_folder``.
    output_folder = "out"
    input_pdf = "in.pdf"

    split_pdf_by_toc(input_pdf_path=input_pdf, output_dir=output_folder)