# d365scm/analyze_pdf_v3.py

#!/usr/bin/env python3
"""
分析PDF文件结构，识别Topic位置和题目数量 - 改进版
"""
import re
import json
from PyPDF2 import PdfReader

# Compiled once at import time; both patterns are applied to every stripped line.
_TOPIC_HEADER_RE = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
_QUESTION_RE = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)


def _scan_page_texts(page_texts):
    """Collect topic page ranges and question numbers from per-page text.

    Args:
        page_texts: Iterable yielding the extracted text of each page in
            page order. Falsy entries (``None`` or ``''``) are skipped but
            still advance the page index.

    Returns:
        A ``(topic_pages, questions_by_topic)`` tuple where ``topic_pages``
        maps a topic number to ``{'first_page': int, 'last_page': int}``
        (0-based page indexes of the first and most recent header line for
        that topic) and ``questions_by_topic`` maps a topic number to the
        set of question numbers attributed to it.
    """
    topic_pages = {}
    questions_by_topic = {}
    # Questions are attributed to the highest-numbered topic seen so far.
    # Pages are scanned in ascending order, so every known topic already
    # starts on or before the current page; the highest number is therefore
    # the only candidate that ever needs to be considered.
    highest_topic = None

    for page_num, text in enumerate(page_texts):
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = _TOPIC_HEADER_RE.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num in topic_pages:
                    # Repeated header: extend the topic's page range.
                    topic_pages[topic_num]['last_page'] = page_num
                else:
                    topic_pages[topic_num] = {
                        'first_page': page_num,
                        'last_page': page_num,
                    }
                    questions_by_topic[topic_num] = set()
                    if highest_topic is None or topic_num > highest_topic:
                        highest_topic = topic_num

            question_match = _QUESTION_RE.search(line)
            # Question markers found before any topic header are ignored.
            if question_match and highest_topic is not None:
                questions_by_topic[highest_topic].add(int(question_match.group(1)))

    return topic_pages, questions_by_topic


def analyze_pdf_structure(pdf_path, output_path='/Users/duguoyou/D365/topics_info.json'):
    """Analyze a PDF and report where each Topic is and how many questions it has.

    Every page's extracted text is scanned line by line for lines that
    consist solely of ``Topic <n>`` and for ``Question #<n>`` markers. A
    summary is printed to stdout and the per-topic result is written to
    ``output_path`` as JSON.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.
        output_path: Destination of the JSON summary. Defaults to the
            location the script has always written to.

    Returns:
        A list of dicts sorted by topic number, each with the keys
        ``topic_num``, ``start_page``, ``end_page`` (0-based page indexes),
        ``question_count`` and ``questions`` (sorted, de-duplicated).
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pages, questions_by_topic = _scan_page_texts(
        page.extract_text() for page in reader.pages
    )

    print(f"\n共发现 {len(topic_pages)} 个Topic")

    result = []
    print("\n各Topic统计:")
    for topic_num in sorted(topic_pages):
        questions = sorted(questions_by_topic.get(topic_num, ()))
        q_count = len(questions)
        first_page = topic_pages[topic_num]['first_page']
        last_page = topic_pages[topic_num]['last_page']
        # Pages are stored 0-based but displayed 1-based.
        print(f"  Topic {topic_num}: 第 {first_page+1}-{last_page+1} 页, {q_count} 道题 (题目编号: {questions[:5]}{'...' if len(questions) > 5 else ''})")
        result.append({
            'topic_num': topic_num,
            'start_page': first_page,
            'end_page': last_page,
            'question_count': q_count,
            'questions': questions,
        })

    total_questions = sum(entry['question_count'] for entry in result)
    print(f"\n总题目数: {total_questions}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nTopic信息已保存到 {output_path}")

    return result

if __name__ == '__main__':
    import sys

    # The original hard-coded location stays the default; an optional first
    # command-line argument overrides it so the script can run on other PDFs.
    default_pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    topics = analyze_pdf_structure(pdf_path)