#!/usr/bin/env python3
"""Analyze the structure of a PDF: locate every Topic and count its questions."""

import json
import os
import re

from PyPDF2 import PdfReader

# Default report location; identical to the path the script has always used.
DEFAULT_OUTPUT_PATH = '/Users/duguoyou/D365/topics_info.json'

# A Topic header is a line that consists solely of "Topic <number>".
TOPIC_PATTERN = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
# A question marker may appear anywhere inside a line, e.g. "Question #12".
QUESTION_PATTERN = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)


def _scan_pages(page_texts, total_pages):
    """Group question numbers by Topic from per-page extracted text.

    Args:
        page_texts: Iterable of page strings in page order. Entries that are
            ``None`` or empty (pages without extractable text) are skipped.
        total_pages: Number of pages in the document; the last Topic is
            recorded as ending on page index ``total_pages - 1``.

    Returns:
        A dict mapping topic number to a record with the keys ``topic_num``,
        ``start_page``, ``end_page`` (both 0-based page indices) and
        ``questions`` (question numbers in first-seen order, no duplicates).
    """
    topics = {}
    current_topic = None
    current_start = None
    # The list keeps first-seen order; the set gives O(1) duplicate checks.
    # Questions met before the first Topic header stay in this buffer and are
    # therefore attributed to the first Topic.
    questions = []
    seen = set()

    for page_num, text in enumerate(page_texts):
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = TOPIC_PATTERN.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                # A repeated header for the Topic already being collected
                # must not close it: doing so would discard its questions
                # when the entry is overwritten later.
                if topic_num != current_topic:
                    if current_topic is not None:
                        topics[current_topic] = {
                            'topic_num': current_topic,
                            'start_page': current_start,
                            # Clamp so a Topic that starts and ends on the
                            # same page never reports end_page < start_page.
                            'end_page': max(current_start, page_num - 1),
                            'questions': questions,
                        }
                        questions = []
                        seen = set()
                    current_topic = topic_num
                    current_start = page_num
                    print(f"发现 Topic {current_topic}: 第 {page_num + 1} 页")

            question_match = QUESTION_PATTERN.search(line)
            if question_match:
                q_num = int(question_match.group(1))
                if q_num not in seen:
                    seen.add(q_num)
                    questions.append(q_num)

    # Close the Topic that was still open when the document ended.
    if current_topic is not None:
        topics[current_topic] = {
            'topic_num': current_topic,
            'start_page': current_start,
            'end_page': total_pages - 1,
            'questions': questions,
        }

    return topics


def _build_report(topics):
    """Print per-Topic statistics and return the JSON-serializable summary.

    Args:
        topics: Mapping produced by ``_scan_pages``.

    Returns:
        A list of dicts ordered by topic number, each with ``topic_num``,
        ``start_page``, ``end_page``, ``question_count`` and the sorted
        ``questions`` list.
    """
    result = []
    total_questions = 0

    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        q_count = len(info['questions'])
        total_questions += q_count
        # Page indices are stored 0-based and shown 1-based.
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {q_count} 道题")
        result.append({
            'topic_num': topic_num,
            'start_page': info['start_page'],
            'end_page': info['end_page'],
            'question_count': q_count,
            'questions': sorted(info['questions']),
        })

    print(f"\n总题目数: {total_questions}")
    return result


def analyze_pdf_structure(pdf_path, output_path=DEFAULT_OUTPUT_PATH):
    """Find every Topic in a PDF, with its page range and question numbers.

    The summary is printed to stdout, written to ``output_path`` as JSON and
    returned to the caller.

    Args:
        pdf_path: Path of the PDF file to analyze.
        output_path: Where the JSON report is written. Defaults to the
            location the script originally hard-coded.

    Returns:
        A list of dicts ordered by topic number, each with ``topic_num``,
        ``start_page``, ``end_page`` (0-based page indices),
        ``question_count`` and the sorted ``questions`` list.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    print(f"PDF总页数: {total_pages}")

    # Extract lazily so only one page's text is held at a time.
    page_texts = (page.extract_text() for page in reader.pages)
    topics = _scan_pages(page_texts, total_pages)

    print(f"\n共发现 {len(topics)} 个Topic")

    result = _build_report(topics)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # With the default path this prints exactly the original message.
    print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result


if __name__ == '__main__':
    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path)