#!/usr/bin/env python3
"""Precisely extract exam questions from a PDF (final version).

The source PDF is split into one PDF per topic, and every question found in
a topic's page range is parsed into its stem, options and correct answer.
All parsed questions are written to a single JSON file.
"""

import re
import json
import os

try:
    from pypdf import PdfReader, PdfWriter
except ImportError:
    # The text-parsing helpers below do not need pypdf, so keep the module
    # importable without it; split_pdf_and_extract_questions() raises a
    # clear error when it is actually required.
    PdfReader = PdfWriter = None


# One question runs from its "Question #N" header up to the next question
# header, the next "Topic N" header, or the end of the text.
_QUESTION_PATTERN = re.compile(
    r'Question\s+#(\d+)\s*\n(.*?)(?=Question\s+#\d+|Topic\s+\d+|$)',
    re.DOTALL | re.IGNORECASE
)

# An option line looks like "A. some text".
_OPTION_PATTERN = re.compile(r'^([A-Z])\.\s*(.*)', re.IGNORECASE)

# Only the "Correct Answer:" label is matched case-insensitively. The answer
# itself must be uppercase letters (optionally separated by commas or
# whitespace) that are not the beginning of a word, so trailing text such as
# "Most Voted" on the same line is not swallowed into the answer.
_ANSWER_PATTERN = re.compile(
    r'(?i:Correct Answer:)\s*([A-Z](?:[,\s]*[A-Z])*)(?![a-z])'
)

# Everything from the "Comments" line onwards is discussion, not question.
_COMMENTS_PATTERN = re.compile(r'^Comments', re.IGNORECASE)

# Lines starting with any of these are page decorations, not question text.
_NOISE_PREFIXES = (
    'Most Voted',
    'upvoted',
    'Selected Answer:',
    'Community vote',
    'Correct Answer',
)


def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
    """Split the PDF by topic and extract the questions of every topic.

    Args:
        pdf_path: Path of the source PDF.
        topics_info_path: Path of a JSON file holding a list of objects with
            the keys ``topic_num``, ``start_page`` and ``end_page``. The page
            values are used directly as 0-based indexes into the PDF pages,
            with ``end_page`` inclusive.
        output_dir: Directory receiving ``pdfs/topic_NN.pdf`` for each topic
            and ``questions.json``; created if missing.

    Returns:
        The list of question dicts from all topics, in topic order.

    Raises:
        ImportError: If pypdf is not installed.
    """
    if PdfReader is None or PdfWriter is None:
        raise ImportError("pypdf is required to split the PDF (pip install pypdf)")

    with open(topics_info_path, 'r', encoding='utf-8') as f:
        topics = json.load(f)

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    # Creating the nested directory also creates output_dir itself.
    pdf_dir = os.path.join(output_dir, 'pdfs')
    os.makedirs(pdf_dir, exist_ok=True)

    all_questions = []

    for topic in topics:
        topic_num = topic['topic_num']
        start_page = topic['start_page']
        end_page = topic['end_page']

        # Clamp the inclusive end page so a range reaching past the last
        # page does not raise.
        writer = PdfWriter()
        for page_num in range(start_page, min(end_page + 1, total_pages)):
            writer.add_page(reader.pages[page_num])

        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
        with open(pdf_output_path, 'wb') as f:
            writer.write(f)
        print(f"已保存: {pdf_output_path}")

        print(f"正在提取 Topic {topic_num} 的题目内容...")
        topic_questions = extract_questions_precise(reader, start_page, end_page, topic_num)
        all_questions.extend(topic_questions)
        print(f"  Topic {topic_num}: 提取了 {len(topic_questions)} 道题")

    questions_json_path = os.path.join(output_dir, 'questions.json')
    with open(questions_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)

    print(f"\n所有题目已保存到: {questions_json_path}")
    print(f"总共提取了 {len(all_questions)} 道题")

    return all_questions


def extract_questions_precise(reader, start_page, end_page, topic_num):
    """Extract every question found on the pages of one topic.

    Args:
        reader: Object exposing ``pages``, a sequence whose items provide
            ``extract_text()`` (for example a ``pypdf.PdfReader``).
        start_page: Index of the first page of the topic.
        end_page: Index of the last page of the topic (inclusive). Values
            past the last page are clamped, matching how the per-topic PDF
            is written.
        topic_num: Topic number stored in every returned question.

    Returns:
        A list of question dicts as produced by ``parse_question_content``;
        question blocks that yield neither stem nor options are skipped.
    """
    stop_page = min(end_page + 1, len(reader.pages))

    page_texts = []
    for page_num in range(start_page, stop_page):
        text = reader.pages[page_num].extract_text()
        if text:
            page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    questions = []
    for number, body in _QUESTION_PATTERN.findall(full_text):
        question_data = parse_question_content(topic_num, int(number), body.strip())
        if question_data:
            questions.append(question_data)

    return questions


def _append_option(options, label, text):
    """Append the pending option to *options* if it has a label and text."""
    if label is not None and text:
        options.append({'label': label, 'text': text.strip()})


def parse_question_content(topic_num, q_num, content):
    """Parse one question block into stem, options and correct answer.

    Lines are read in order: text before the first option line forms the
    stem, an ``X. text`` line starts a new option, and other lines continue
    the current option. Parsing stops at a line starting with "Comments".
    Decoration lines (vote markers and the like) are dropped.

    Args:
        topic_num: Topic number stored in the result.
        q_num: Question number stored in the result.
        content: Text of the question block without its header line.

    Returns:
        A dict with the keys ``topic``, ``question_num``, ``stem``,
        ``options`` (list of ``{'label', 'text'}`` dicts) and ``answer``
        (empty string when no answer was found), or ``None`` when the block
        has neither a stem nor any option.
    """
    stem_parts = []
    options = []
    correct_answer = ""
    current_option = None
    current_option_text = ""

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if _COMMENTS_PATTERN.match(line):
            break

        answer_match = _ANSWER_PATTERN.search(line)
        if answer_match:
            correct_answer = answer_match.group(1)
            # Whatever else shares the line is still processed below.
            line = _ANSWER_PATTERN.sub('', line).strip()
            if not line:
                continue

        option_match = _OPTION_PATTERN.match(line)
        if option_match:
            _append_option(options, current_option, current_option_text)
            current_option = option_match.group(1).upper()
            current_option_text = option_match.group(2)
        elif line.startswith(_NOISE_PREFIXES):
            continue
        elif current_option is not None:
            current_option_text += " " + line
        else:
            stem_parts.append(line)

    _append_option(options, current_option, current_option_text)

    question_stem = " ".join(stem_parts)

    if not question_stem and not options:
        return None

    return {
        'topic': topic_num,
        'question_num': q_num,
        'stem': question_stem,
        'options': options,
        'answer': correct_answer
    }


if __name__ == '__main__':
    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics_info_path = '/Users/duguoyou/D365/topics_info.json'
    output_dir = '/Users/duguoyou/D365/exam_data'

    questions = split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir)