PDF按章节拆分为多个文件

根据PDF书签目录中的章节标题，将PDF切分为多个独立的PDF文件
import fitz  # PyMuPDF
import os
import re


def sanitize_filename(name):
    """Return *name* with characters unsuitable for file names replaced.

    Each of the reserved characters (angle brackets, colon, double quote,
    forward slash, backslash, pipe, question mark, asterisk) and each ASCII
    control character (code points 0-31, e.g. line breaks or tabs) is
    replaced by an underscore. Leading and trailing dots and spaces are
    then stripped.

    Args:
        name: Raw title text to turn into a file-name fragment.

    Returns:
        The cleaned string. It may be empty when nothing usable remains.
    """
    # Control characters are included because titles may contain embedded
    # line breaks, which are not valid in file names on Windows.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    return name.strip('. ')


def get_toc_from_pdf(input_pdf_path):
    """Read the bookmark outline (table of contents) of a PDF.

    Args:
        input_pdf_path: Path of the PDF file to open.

    Returns:
        The non-empty list produced by ``Document.get_toc()``; each entry
        has the form ``[level, title, page]``.

    Raises:
        ValueError: If the PDF contains no bookmarks.
    """
    # The context manager closes the document even when get_toc() raises.
    with fitz.open(input_pdf_path) as doc:
        toc = doc.get_toc()  # [[level, title, page], ...]

    if not toc:
        raise ValueError("PDF中没有找到书签目录！")

    print(f"从PDF中读取到 {len(toc)} 个目录条目")
    return toc


def is_title_at_page_start(doc, page_idx, title):
    """Report whether *title* appears near the beginning of a page's text.

    Whitespace in both the page text and the title is normalised to single
    spaces before searching. If the full title is not found, the first 30
    characters of the title are tried as a looser match. A match counts as
    "at the start" when it begins within the first 100 characters of the
    normalised page text, which leaves room for running headers.

    Args:
        doc: Open document; must provide ``page_count`` and indexing by
            0-based page number.
        page_idx: 0-based index of the page to inspect.
        title: Heading text to look for.

    Returns:
        True if the title (or its 30-character prefix) is found within the
        first 100 characters; False if it is found later, not found at all
        (a warning is printed), or *page_idx* is outside the document.
    """
    # Reject indices on both sides of the valid range. PyMuPDF accepts
    # negative page indices (counting from the end), so without the lower
    # bound an invalid bookmark target would inspect an unrelated page.
    if page_idx < 0 or page_idx >= doc.page_count:
        return False

    page = doc[page_idx]
    text = page.get_text()

    # Collapse every run of whitespace so line breaks inside a heading do
    # not prevent a match.
    text_clean = ' '.join(text.split())
    title_clean = ' '.join(title.split())

    pos = text_clean.find(title_clean)

    if pos == -1:
        # Fall back to a looser match on the first 30 characters only.
        short_title = title_clean[:30]
        pos = text_clean.find(short_title)
        if pos == -1:
            print(f"    ⚠️  警告: 在页码 {page_idx+1} 中未找到标题 '{title_clean[:50]}...'")
            return False

    # Anything within the first 100 characters is treated as the page start.
    return pos < 100


def find_precise_end_page(doc, current_page, next_title, next_start_page):
    """Determine the last page (0-based) of the current chapter.

    The decision is based on where the next chapter's heading sits on its
    start page: if the heading is at the top of that page, the current
    chapter ends on the page before; otherwise the two chapters share the
    page and the current chapter includes it.

    Args:
        doc: Open document; must provide ``page_count``.
        current_page: 1-based start page of the current chapter.
        next_title: Heading text of the next chapter.
        next_start_page: 1-based start page of the next chapter.

    Returns:
        0-based index of the current chapter's last page. The result is
        never smaller than the chapter's own start page index, except that
        it is ``doc.page_count - 1`` whenever the next chapter's start page
        lies beyond the end of the document.
    """
    current_idx = current_page - 1
    next_start_idx = next_start_page - 1

    # Next chapter points past the document: run to the last page.
    if next_start_idx >= doc.page_count:
        return doc.page_count - 1

    if next_start_idx < 0:
        # The next entry has no usable page target, so there is no page to
        # inspect; the clamp below falls back to the chapter's own page.
        end_idx = next_start_idx
    elif is_title_at_page_start(doc, next_start_idx, next_title):
        # Next chapter opens a fresh page; this one ends on the page before.
        end_idx = next_start_idx - 1
    else:
        # Both chapters share the page, so it belongs to this one as well.
        end_idx = next_start_idx

    # A chapter always contains at least its own start page. Without this
    # clamp, a next heading at the top of the *same* page would produce an
    # end page before the start page.
    return max(end_idx, current_idx)


def split_pdf_by_toc(input_pdf_path, output_dir):
    """Split a PDF into one file per bookmark entry.

    For every entry of the bookmark outline a new PDF is written to
    *output_dir*, named ``NNN_<sanitized title>.pdf`` where ``NNN`` is the
    1-based, zero-padded position of the entry in the outline. The page
    range of an entry runs from its own start page to the end page computed
    by ``find_precise_end_page`` (or to the last page of the document for
    the final entry). Entries whose start page ends up after their end page
    are skipped with a warning. Progress and a summary are printed.

    Args:
        input_pdf_path: Path of the PDF to split.
        output_dir: Directory for the generated files; created if missing.

    Raises:
        ValueError: Propagated from ``get_toc_from_pdf`` when the PDF has
            no bookmarks.
    """
    toc = get_toc_from_pdf(input_pdf_path)

    success_count = 0
    skip_count = 0

    # Context managers guarantee that the source document and every output
    # document are closed even if inserting or saving fails part-way.
    with fitz.open(input_pdf_path) as doc:
        total_pages = doc.page_count

        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(output_dir, exist_ok=True)

        print(f"PDF总页数: {total_pages}")
        print(f"目录条目数: {len(toc)}")
        print("=" * 80)

        for idx, (level, title, start_page) in enumerate(toc):
            start_idx = start_page - 1  # bookmark pages are 1-based

            if idx + 1 < len(toc):
                _, next_title, next_start_page = toc[idx + 1]
                end_idx = find_precise_end_page(
                    doc, start_page, next_title, next_start_page
                )
                # The chapter ends on the page where the next one starts.
                is_shared = end_idx == next_start_page - 1
                share_info = " [共享页面]" if is_shared else ""
            else:
                # Last entry: runs to the end of the document.
                end_idx = total_pages - 1
                share_info = ""

            # Clamp both ends into the valid page range.
            start_idx = max(0, start_idx)
            end_idx = min(total_pages - 1, end_idx)

            if start_idx > end_idx:
                print(f"⚠️  跳过 [{level}]{title}: 起始页({start_page}) > 结束页({end_idx+1})")
                skip_count += 1
                continue

            number = f"{idx + 1:03d}"
            safe_title = sanitize_filename(title)
            filename = f"{number}_{safe_title}.pdf"
            filepath = os.path.join(output_dir, filename)

            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(filepath)
            success_count += 1

            # Only level-2 entries are indented in the progress listing.
            level_indent = "  " if level == 2 else ""
            page_count = end_idx - start_idx + 1
            print(f"[{number}] {level_indent}[Lv.{level}] {title}")
            print(f"     页码: 第{start_page}页 → 第{end_idx+1}页 | 共{page_count}页{share_info}")
            print()

    print("=" * 80)
    print(f"✅ 完成！成功生成: {success_count} 个文件，跳过: {skip_count} 个")
    print(f"📁 输出目录: {output_dir}")


if __name__ == "__main__":
    # Script entry point. Change the input name to your own PDF file;
    # the split files are written into the output folder.
    input_pdf, output_folder = "in.pdf", "out"

    split_pdf_by_toc(input_pdf_path=input_pdf, output_dir=output_folder)