#!/usr/bin/env python3
"""Analyze the structure of a PDF file and locate where each Topic begins."""

import re
import sys

# Both patterns are anchored at the start of an already-stripped line.
_TOPIC_PATTERN = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
_QUESTION_PATTERN = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

# Used when the script is run without a path argument.
DEFAULT_PDF_PATH = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'


def scan_topics(page_texts):
    """Scan per-page text and collect the topics found in it.

    A line starting with ``Topic <n>`` opens topic ``n``; lines starting with
    ``Question <m>`` seen while a topic is open raise that topic's question
    count to the largest ``m`` encountered.

    A header that repeats the currently open topic is ignored, and a header
    for a topic seen earlier re-opens that topic, so the first ``start_page``
    and ``title`` and the accumulated question count are kept instead of
    being overwritten.

    Args:
        page_texts: Iterable of page texts in page order. Falsy entries
            (``None`` or ``""``) count as pages but are not scanned.

    Returns:
        Dict mapping topic number to a dict with keys ``title`` (the header
        line), ``start_page`` and ``end_page`` (0-based page indices) and
        ``question_count``. ``end_page`` is the page on which the next topic
        header appears, or the last page for the topic open at the end.
    """
    topics = {}
    current_topic = None
    last_page = -1

    for page_num, text in enumerate(page_texts):
        last_page = page_num
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = _TOPIC_PATTERN.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num == current_topic:
                    # Same topic announced again: keep accumulating into it.
                    continue
                if current_topic is not None:
                    topics[current_topic]['end_page'] = page_num
                if topic_num in topics:
                    # Re-open an earlier topic without discarding its data.
                    topics[topic_num]['end_page'] = None
                else:
                    topics[topic_num] = {
                        'title': line,
                        'start_page': page_num,
                        'end_page': None,
                        'question_count': 0,
                    }
                    print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")
                current_topic = topic_num
                continue

            if current_topic is None:
                # Questions before the first topic header are not counted.
                continue

            question_match = _QUESTION_PATTERN.match(line)
            if question_match:
                info = topics[current_topic]
                info['question_count'] = max(
                    info['question_count'], int(question_match.group(1))
                )

    if current_topic is not None:
        topics[current_topic]['end_page'] = last_page

    return topics


def analyze_pdf_structure(pdf_path):
    """Read a PDF, report the topics it contains and return them.

    Prints the page count, each topic as it is first found, and a per-topic
    summary (1-based page range and question count).

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        Tuple ``(topics, total_pages)`` where ``topics`` is the dict built by
        :func:`scan_topics` and ``total_pages`` is the number of pages.
    """
    # Imported here so scan_topics stays importable without PyPDF2 installed.
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    print(f"PDF总页数: {total_pages}")

    topics = scan_topics(page.extract_text() for page in reader.pages)

    # Derive the total from the per-topic counts so the two always agree.
    total_questions = sum(info['question_count'] for info in topics.values())

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")

    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages


if __name__ == '__main__':
    # An optional first argument overrides the default PDF location.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    topics, total_pages = analyze_pdf_structure(pdf_path)