#!/usr/bin/env python3
"""Split an exam PDF into per-topic PDFs and extract the question content."""

import argparse
import json
import os
import re

from pypdf import PdfReader, PdfWriter

# Defaults keep the script runnable with no arguments, exactly as before.
DEFAULT_PDF_PATH = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
DEFAULT_TOPICS_INFO_PATH = '/Users/duguoyou/D365/topics_info.json'
DEFAULT_OUTPUT_DIR = '/Users/duguoyou/D365/exam_data'

# Line markers recognised while parsing the extracted page text.
_QUESTION_PATTERN = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)
_OPTION_PREFIXES = ('A.', 'B.', 'C.', 'D.')
_ANSWER_PREFIX = 'Correct Answer:'
_COMMENTS_PREFIX = 'Comments'


def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
    """Split the PDF into one file per topic and extract every question.

    Args:
        pdf_path: Path of the source PDF.
        topics_info_path: Path of a UTF-8 JSON file holding a list of objects
            with the keys ``topic_num``, ``start_page`` and ``end_page``.
            The page values are used as 0-based indexes into the PDF and
            ``end_page`` is inclusive.
        output_dir: Directory that receives ``pdfs/topic_NN.pdf`` and
            ``questions.json``. It is created when missing.

    Returns:
        The question dicts of all topics, in the order the topics are listed.
    """
    with open(topics_info_path, 'r', encoding='utf-8') as f:
        topics = json.load(f)

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    # makedirs creates output_dir as a parent of the pdfs/ sub-directory.
    pdf_dir = os.path.join(output_dir, 'pdfs')
    os.makedirs(pdf_dir, exist_ok=True)

    all_questions = []
    for topic in topics:
        topic_num = topic['topic_num']
        start_page = topic['start_page']
        end_page = topic['end_page']

        # Copy the topic's pages, never reading past the end of the document.
        writer = PdfWriter()
        for page_num in range(start_page, min(end_page + 1, total_pages)):
            writer.add_page(reader.pages[page_num])

        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
        with open(pdf_output_path, 'wb') as f:
            writer.write(f)
        print(f"已保存: {pdf_output_path}")

        print(f"正在提取 Topic {topic_num} 的题目内容...")
        topic_questions = extract_questions_from_pages(
            reader, start_page, end_page, topic_num
        )
        all_questions.extend(topic_questions)
        print(f" Topic {topic_num}: 提取了 {len(topic_questions)} 道题")

    questions_json_path = os.path.join(output_dir, 'questions.json')
    with open(questions_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)

    print(f"\n所有题目已保存到: {questions_json_path}")
    print(f"总共提取了 {len(all_questions)} 道题")
    return all_questions


def _new_question(topic_num, question_num, header_line):
    """Return an empty question record seeded with its header line."""
    return {
        'topic': topic_num,
        'question_num': question_num,
        'content': header_line,
        'options': [],
        'answer': None,
        'explanation': None,
    }


def _consume_line(question, line):
    """Fold one non-empty, stripped text line into the question record."""
    explanation = question['explanation']
    if explanation is not None:
        # Once the comments section has started every line is discussion
        # text. Commenters routinely write "A. ..." or "Correct Answer: X",
        # which must not overwrite the parsed options or answer.
        question['explanation'] = f'{explanation} {line}' if explanation else line
        return

    if line.startswith(_OPTION_PREFIXES):
        question['options'].append(line)
    elif line.startswith(_ANSWER_PREFIX):
        # Drop only the leading marker; the answer text itself is kept whole.
        question['answer'] = line[len(_ANSWER_PREFIX):].strip()
    elif line.startswith(_COMMENTS_PREFIX):
        question['explanation'] = ''
    elif question['answer'] is not None:
        # Text between the answer and the comments heading has no field.
        return
    elif question['options']:
        # A wrapped line continues the most recent option.
        question['options'][-1] += ' ' + line
    else:
        # Still before the first option: this is part of the question stem.
        question['content'] += ' ' + line


def extract_questions_from_pages(reader, start_page, end_page, topic_num):
    """Extract the questions found in an inclusive range of pages.

    A line matching ``Question #<n>`` starts a new question. Lines up to the
    first option extend ``content``, lines starting with ``A.``-``D.`` become
    ``options``, ``Correct Answer:`` sets ``answer`` and everything after a
    line starting with ``Comments`` is collected in ``explanation``. A
    question may continue across page boundaries.

    Args:
        reader: Object exposing ``pages``, a sequence whose items provide
            ``extract_text()`` (for example a ``pypdf.PdfReader``).
        start_page: 0-based index of the first page to read.
        end_page: 0-based index of the last page to read (inclusive). Values
            beyond the last page are clamped to the document length.
        topic_num: Value stored under ``topic`` in every question dict.

    Returns:
        A list of dicts with the keys ``topic``, ``question_num``,
        ``content``, ``options``, ``answer`` and ``explanation``.
    """
    questions = []
    current_question = None

    # Clamp like the splitter does so an oversized end_page cannot raise
    # IndexError.
    stop_page = min(end_page + 1, len(reader.pages))
    for page_num in range(start_page, stop_page):
        text = reader.pages[page_num].extract_text()
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            q_match = _QUESTION_PATTERN.search(line)
            if q_match:
                if current_question:
                    questions.append(current_question)
                current_question = _new_question(
                    topic_num, int(q_match.group(1)), line
                )
            elif current_question:
                _consume_line(current_question, line)

    if current_question:
        questions.append(current_question)
    return questions


def main():
    """Run the split/extract pipeline using command-line paths."""
    parser = argparse.ArgumentParser(
        description='Split an exam PDF by topic and extract its questions.'
    )
    parser.add_argument('pdf_path', nargs='?', default=DEFAULT_PDF_PATH,
                        help='source PDF (default: %(default)s)')
    parser.add_argument('topics_info_path', nargs='?',
                        default=DEFAULT_TOPICS_INFO_PATH,
                        help='JSON file describing the topics '
                             '(default: %(default)s)')
    parser.add_argument('output_dir', nargs='?', default=DEFAULT_OUTPUT_DIR,
                        help='output directory (default: %(default)s)')
    args = parser.parse_args()

    split_pdf_and_extract_questions(
        args.pdf_path, args.topics_info_path, args.output_dir
    )


if __name__ == '__main__':
    main()