# d365scm/analyze_pdf.py

#!/usr/bin/env python3
"""
Analyze the structure of a PDF file and locate where each Topic begins.
"""
import re
from PyPDF2 import PdfReader

def analyze_pdf_structure(pdf_path):
    """
    Analyze a PDF's structure and locate every Topic section.

    The text extracted from each page is scanned line by line. A line that
    starts with ``Topic <n>`` (case-insensitive) makes topic ``n`` the current
    topic; a line that starts with ``Question <n>`` raises the current
    topic's question count to ``n`` when ``n`` exceeds the highest number
    seen so far for that topic. Progress and a summary are printed to stdout.

    A topic heading that shows up more than once does not reset the topic:
    the title and start page of its first occurrence are kept and its
    question count keeps accumulating, so the printed total always equals the
    sum of the per-topic counts.

    Args:
        pdf_path: Value passed unchanged to ``PdfReader``.

    Returns:
        A ``(topics, total_pages)`` tuple. ``topics`` maps each topic number
        to a dict with the keys ``'title'`` (the stripped heading line),
        ``'start_page'`` and ``'end_page'`` (0-based page indices) and
        ``'question_count'``. ``total_pages`` is the number of pages in the
        PDF.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
    question_pattern = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing could be extracted from this page.
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))

                if topic_num != current_topic:
                    # Leaving the previous topic: it ends on the page where
                    # the next topic heading was found.
                    if current_topic is not None:
                        topics[current_topic]['end_page'] = page_num
                    current_topic = topic_num

                if topic_num not in topics:
                    # Only the first sighting creates the entry; a repeated
                    # heading must not wipe the start page or the count.
                    topics[topic_num] = {
                        'title': line,
                        'start_page': page_num,
                        'end_page': None,
                        'question_count': 0
                    }
                    print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")

                # Both patterns are anchored at the line start, so a Topic
                # line can never also be a Question line.
                continue

            if current_topic is None:
                # Questions that appear before any topic heading are ignored.
                continue

            question_match = question_pattern.match(line)
            if question_match:
                q_num = int(question_match.group(1))
                info = topics[current_topic]
                # The count is the highest question number seen in the topic.
                if q_num > info['question_count']:
                    info['question_count'] = q_num

    # The topic still open after the last page runs to the end of the file.
    if current_topic is not None:
        topics[current_topic]['end_page'] = total_pages - 1

    # Derive the total from the final per-topic values so the two always agree.
    total_questions = sum(info['question_count'] for info in topics.values())

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f"  Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages

if __name__ == '__main__':
    import sys

    # The PDF to analyze may be given as the first command-line argument;
    # without one, fall back to the path the script has always used.
    default_pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    topics, total_pages = analyze_pdf_structure(pdf_path)