d365scm/analyze_pdf_v2.py

#!/usr/bin/env python3
"""
分析PDF文件结构，识别Topic位置和题目数量
"""
import re
import json
from PyPDF2 import PdfReader

def _record_topic(topics, topic_num, start_page, end_page, questions):
    """Store one finished topic span in ``topics``, merging with an earlier one.

    ``topics`` maps topic number -> dict with 'topic_num', 'start_page',
    'end_page' (0-based page indices) and 'questions' (a set of ints).
    """
    # When the next header sits on the same page this topic started on,
    # ``page_num - 1`` would fall before the start page; clamp so the range
    # never inverts.
    end_page = max(start_page, end_page)

    entry = topics.get(topic_num)
    if entry is None:
        topics[topic_num] = {
            'topic_num': topic_num,
            'start_page': start_page,
            'end_page': end_page,
            'questions': set(questions),
        }
        return

    # The same topic number showed up again after another topic: widen the
    # existing entry instead of overwriting it, so no questions are lost.
    entry['start_page'] = min(entry['start_page'], start_page)
    entry['end_page'] = max(entry['end_page'], end_page)
    entry['questions'] |= questions


def analyze_pdf_structure(pdf_path, output_path='/Users/duguoyou/D365/topics_info.json'):
    """Locate every "Topic N" section in a PDF and count its questions.

    Each page's extracted text is scanned line by line. A stripped line that
    is exactly ``Topic <number>`` (case-insensitive) opens a new topic; any
    line containing ``Question #<number>`` adds that number to the topic
    currently open. Progress and a per-topic summary are printed, and the
    result is also written to ``output_path`` as UTF-8 JSON.

    Notes on edge cases:
      * Question numbers seen before the first topic header are credited to
        the first topic; if no header is ever found they are discarded.
      * A header repeating the topic that is already open is ignored.
      * If a topic number reappears after a different topic, its spans are
        merged (earliest start page, latest end page, union of questions),
        so its page range may then overlap the topics in between.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PdfReader``.
        output_path: Where the JSON summary is written. Defaults to the
            location the script has always used.

    Returns:
        A list of dicts ordered by topic number, each with the keys
        ``topic_num``, ``start_page`` and ``end_page`` (0-based page
        indices), ``question_count`` and ``questions`` (sorted list of ints).
    """
    import os  # local import: only needed to format the "saved" message

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None
    current_topic_start = None
    pending_questions = set()  # question numbers seen since the open header

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                # Only a *different* topic number closes the open topic; a
                # repeated header must not reset what was collected so far.
                if topic_num != current_topic:
                    if current_topic is not None:
                        _record_topic(topics, current_topic, current_topic_start,
                                      page_num - 1, pending_questions)
                        pending_questions = set()
                    current_topic = topic_num
                    current_topic_start = page_num
                    print(f"发现 Topic {current_topic}: 第 {page_num + 1} 页")

            question_match = question_pattern.search(line)
            if question_match:
                pending_questions.add(int(question_match.group(1)))

    # The last open topic runs to the end of the document.
    if current_topic is not None:
        _record_topic(topics, current_topic, current_topic_start,
                      total_pages - 1, pending_questions)

    print(f"\n共发现 {len(topics)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        q_count = len(info['questions'])
        total_questions += q_count
        # Pages are stored 0-based; show them 1-based to the user.
        print(f"  Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {q_count} 道题")
        result.append({
            'topic_num': topic_num,
            'start_page': info['start_page'],
            'end_page': info['end_page'],
            'question_count': q_count,
            'questions': sorted(info['questions'])
        })

    print(f"\n总题目数: {total_questions}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # For the default path this prints the same "topics_info.json" as before.
    print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result

if __name__ == '__main__':
    # Script entry point: analyze the exam PDF at its fixed local path and
    # keep the per-topic summary in ``topics``.
    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path=pdf_path)