1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
| import fitz import os import re
def sanitize_filename(name):
    """Return *name* made safe for use as a file name.

    Every character that is reserved in file names (``< > : " / \\ | ? *``)
    and every ASCII control character (0x00-0x1F, e.g. the CR/LF/TAB that
    can appear inside PDF bookmark titles) is replaced by an underscore.
    Leading and trailing dots and spaces are then stripped.

    Args:
        name: Raw title text.

    Returns:
        The sanitized string; it may be empty if *name* consisted only of
        dots and spaces.
    """
    # Control characters are included because they are not valid in Windows
    # file names and would otherwise make the later save() call fail.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    return name.strip('. ')
def get_toc_from_pdf(input_pdf_path):
    """Read the bookmark table of contents from a PDF.

    Args:
        input_pdf_path: Path of the PDF file to open.

    Returns:
        The list returned by ``doc.get_toc()``; the caller unpacks each
        entry as ``(level, title, page)``.

    Raises:
        ValueError: If the document has no bookmarks.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
    finally:
        # Release the file handle even if reading the outline fails.
        doc.close()

    if not toc:
        raise ValueError("PDF中没有找到书签目录!")

    print(f"从PDF中读取到 {len(toc)} 个目录条目")
    return toc
def is_title_at_page_start(doc, page_idx, title, *, max_offset=100):
    """Check whether *title* appears near the beginning of a page's text.

    Whitespace in both the page text and the title is collapsed to single
    spaces before searching. If the full title is not found, its first 30
    characters are tried as a fallback.

    Args:
        doc: Opened document; must expose ``page_count`` and integer
            indexing that yields pages with a ``get_text()`` method.
        page_idx: 0-based page index to inspect.
        title: Title text to look for.
        max_offset: The title counts as "at the start" when its character
            offset in the normalized page text is below this value.

    Returns:
        True if the title was found within the first *max_offset*
        characters; False if it was found later, not found at all, the
        title is blank, or *page_idx* is outside the document.
    """
    # Reject negative indices explicitly: indexing would otherwise wrap
    # around and silently inspect a page counted from the end.
    if page_idx < 0 or page_idx >= doc.page_count:
        return False

    title_clean = ' '.join(title.split())
    if not title_clean:
        # str.find("") is always 0, which would wrongly report a match.
        return False

    text_clean = ' '.join(doc[page_idx].get_text().split())

    pos = text_clean.find(title_clean)
    if pos == -1:
        # Long titles may be wrapped or truncated on the page; retry with
        # a prefix only.
        pos = text_clean.find(title_clean[:30])
    if pos == -1:
        print(f" ⚠️ 警告: 在页码 {page_idx+1} 中未找到标题 '{title_clean[:50]}...'")
        return False

    return pos < max_offset
def find_precise_end_page(doc, current_page, next_title, next_start_page):
    """Determine the 0-based last page of the current chapter.

    The next chapter's 1-based start page is converted to an index. If that
    index lies beyond the document, the chapter runs to the final page. If
    the next title sits at the top of its page, the current chapter ends
    one page earlier; otherwise the two chapters share that page.

    Args:
        doc: Opened document exposing ``page_count``.
        current_page: Start page of the current chapter (not used here).
        next_title: Title of the following chapter.
        next_start_page: 1-based start page of the following chapter.

    Returns:
        The 0-based index of the last page belonging to the current chapter.
    """
    last_page_idx = doc.page_count - 1
    following_idx = next_start_page - 1

    if following_idx > last_page_idx:
        return last_page_idx

    begins_on_fresh_page = is_title_at_page_start(doc, following_idx, next_title)
    return following_idx - 1 if begins_on_fresh_page else following_idx
def _next_located_entry(toc, idx):
    """Return the first TOC entry after *idx* whose page number is >= 1, or None."""
    for pos in range(idx + 1, len(toc)):
        entry = toc[pos]
        if entry[2] >= 1:
            return entry
    return None


def split_pdf_by_toc(input_pdf_path, output_dir):
    """Split a PDF into one file per bookmark (TOC) entry.

    Each entry is written to ``<output_dir>/<NNN>_<sanitized title>.pdf``,
    where ``NNN`` is the entry's 1-based position in the TOC. An entry spans
    from its own start page to the page returned by
    ``find_precise_end_page`` for the following entry, or to the last page
    for the final entry. Progress and a summary are printed to stdout.

    Args:
        input_pdf_path: Path of the source PDF.
        output_dir: Directory for the generated files; created if missing.

    Raises:
        ValueError: Propagated from ``get_toc_from_pdf`` when the PDF has
            no bookmarks.
    """
    toc = get_toc_from_pdf(input_pdf_path)
    doc = fitz.open(input_pdf_path)
    try:
        total_pages = doc.page_count

        os.makedirs(output_dir, exist_ok=True)

        print(f"PDF总页数: {total_pages}")
        print(f"目录条目数: {len(toc)}")
        print("=" * 80)

        success_count = 0
        skip_count = 0

        for idx, item in enumerate(toc):
            level, title, start_page = item

            # Per the PyMuPDF outline docs, a bookmark without a target
            # inside the document is reported with a page number below 1
            # (typically -1). Clamping it to page 1 would export an
            # unrelated page range, so such entries are skipped.
            if start_page < 1:
                print(f"⚠️ 跳过 [{level}]{title}: 书签未指向文档内的有效页面({start_page})")
                skip_count += 1
                continue

            start_idx = start_page - 1

            # Use the next entry that actually points at a page, so that an
            # unresolved bookmark does not produce a negative end page for
            # the chapter preceding it.
            following = _next_located_entry(toc, idx)
            if following is not None:
                _, next_title, next_start_page = following
                end_idx = find_precise_end_page(doc, start_page, next_title, next_start_page)
                is_shared = (end_idx == next_start_page - 1)
                share_info = " [共享页面]" if is_shared else ""
            else:
                end_idx = total_pages - 1
                share_info = ""

            end_idx = min(total_pages - 1, end_idx)
            if start_idx > end_idx:
                # Happens when the next chapter starts at the top of this
                # entry's own page, or the start page is past the document.
                print(f"⚠️ 跳过 [{level}]{title}: 起始页({start_page}) > 结束页({end_idx+1})")
                skip_count += 1
                continue

            number = f"{idx + 1:03d}"
            safe_title = sanitize_filename(title)
            filename = f"{number}_{safe_title}.pdf"
            filepath = os.path.join(output_dir, filename)

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(filepath)
            finally:
                # Close the per-chapter document even if copying/saving fails.
                new_doc.close()

            success_count += 1
            level_indent = " " if level == 2 else ""
            page_count = end_idx - start_idx + 1
            print(f"[{number}] {level_indent}[Lv.{level}] {title}")
            print(f" 页码: 第{start_page}页 → 第{end_idx+1}页 | 共{page_count}页{share_info}")
            print()
    finally:
        # Always release the source document, including on errors.
        doc.close()

    print("=" * 80)
    print(f"✅ 完成!成功生成: {success_count} 个文件,跳过: {skip_count} 个")
    print(f"📁 输出目录: {output_dir}")
if __name__ == "__main__":
    # Script entry point: split "in.pdf" from the working directory into
    # per-bookmark files under the "out" directory.
    input_pdf, output_folder = "in.pdf", "out"
    split_pdf_by_toc(input_pdf_path=input_pdf, output_dir=output_folder)
|