#!/usr/bin/env python3
"""Analyze the structure of a PDF: locate Topic headers and count questions.

Improved version.
"""

import json
import re
import sys
from pathlib import Path

from PyPDF2 import PdfReader

DEFAULT_PDF_PATH = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
DEFAULT_OUTPUT_PATH = '/Users/duguoyou/D365/topics_info.json'

# A topic header is a line consisting solely of "Topic <number>".
TOPIC_HEADER_PATTERN = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
# A question marker may appear anywhere in a line, e.g. "Question #12".
QUESTION_PATTERN = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)


def _scan_pages(page_texts):
    """Collect topic page ranges and question numbers from per-page text.

    Args:
        page_texts: Iterable of extracted page text in page order. Falsy
            entries (``None`` or ``''``) are skipped but still consume a
            page index.

    Returns:
        A ``(topic_pages, questions_by_topic)`` tuple. ``topic_pages`` maps a
        topic number to ``{'first_page': int, 'last_page': int}`` (0-based
        page indices of the first and last page where that topic's header
        line was seen). ``questions_by_topic`` maps a topic number to the set
        of question numbers attributed to it.
    """
    topic_pages = {}
    questions_by_topic = {}

    for page_num, text in enumerate(page_texts):
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = TOPIC_HEADER_PATTERN.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num in topic_pages:
                    topic_pages[topic_num]['last_page'] = page_num
                else:
                    topic_pages[topic_num] = {
                        'first_page': page_num,
                        'last_page': page_num,
                    }
                    questions_by_topic[topic_num] = set()

            question_match = QUESTION_PATTERN.search(line)
            # Questions seen before any topic header cannot be attributed
            # and are ignored.
            if question_match and topic_pages:
                # Pages are visited in increasing order, so every recorded
                # topic started on or before the current page; the question
                # is therefore attributed to the highest topic number seen
                # so far.
                current_topic = max(topic_pages)
                questions_by_topic[current_topic].add(
                    int(question_match.group(1))
                )

    return topic_pages, questions_by_topic


def analyze_pdf_structure(pdf_path, output_path=DEFAULT_OUTPUT_PATH):
    """Find every Topic in a PDF together with its pages and questions.

    Prints a summary to stdout and, unless ``output_path`` is ``None``,
    writes the result as JSON to ``output_path``.

    Args:
        pdf_path: Path of the PDF file to analyze.
        output_path: Where to write the JSON summary. Defaults to the
            location the script has always used; pass ``None`` to skip
            writing the file.

    Returns:
        A list of dicts, one per topic in ascending topic order, with keys
        ``topic_num``, ``start_page``, ``end_page`` (0-based page indices),
        ``question_count`` and ``questions`` (sorted question numbers).
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pages, questions_by_topic = _scan_pages(
        page.extract_text() for page in reader.pages
    )

    print(f"\n共发现 {len(topic_pages)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topic_pages):
        questions = sorted(questions_by_topic.get(topic_num, ()))
        q_count = len(questions)
        total_questions += q_count
        first_page = topic_pages[topic_num]['first_page']
        last_page = topic_pages[topic_num]['last_page']
        # Show at most five question numbers; mark truncation with an ellipsis.
        suffix = '...' if q_count > 5 else ''
        print(f" Topic {topic_num}: 第 {first_page + 1}-{last_page + 1} 页, {q_count} 道题 (题目编号: {questions[:5]}{suffix})")
        result.append({
            'topic_num': topic_num,
            'start_page': first_page,
            'end_page': last_page,
            'question_count': q_count,
            'questions': questions,
        })

    print(f"\n总题目数: {total_questions}")

    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"\nTopic信息已保存到 {Path(output_path).name}")

    return result


if __name__ == '__main__':
    # Optional CLI usage: script.py [pdf_path [output_path]]
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    output_path = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUTPUT_PATH
    topics = analyze_pdf_structure(pdf_path, output_path)