d365scm/extract_questions_v2.py

#!/usr/bin/env python3
"""
精确提取PDF题目内容 - 改进版
"""
import re
import json
import os
from pypdf import PdfReader, PdfWriter

def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
    """Split a PDF into one file per topic and extract every topic's questions.

    Args:
        pdf_path: Path of the source PDF.
        topics_info_path: Path of a UTF-8 JSON file holding a list of topic
            records. Each record must provide ``topic_num`` (an int, it is
            formatted with ``:02d``), ``start_page`` and ``end_page``
            (indexes into ``reader.pages``; ``end_page`` is inclusive).
        output_dir: Directory that receives ``pdfs/topic_NN.pdf`` and
            ``questions.json``. It is created when missing.

    Returns:
        The list of question dicts collected over all topics, in topic order.
    """
    with open(topics_info_path, 'r', encoding='utf-8') as f:
        topics = json.load(f)

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    # Creating the nested directory also creates output_dir itself.
    pdf_dir = os.path.join(output_dir, 'pdfs')
    os.makedirs(pdf_dir, exist_ok=True)

    all_questions = []

    for topic in topics:
        topic_num = topic['topic_num']
        start_page = topic['start_page']
        # Clamp once so that the PDF slice and the text extraction below
        # always cover the same pages, even when the topic record points
        # past the end of the document.
        last_page = min(topic['end_page'], total_pages - 1)

        writer = PdfWriter()
        for page_num in range(start_page, last_page + 1):
            writer.add_page(reader.pages[page_num])

        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
        with open(pdf_output_path, 'wb') as f:
            writer.write(f)
        print(f"已保存: {pdf_output_path}")

        print(f"正在提取 Topic {topic_num} 的题目内容...")
        topic_questions = extract_questions_precise(
            reader, start_page, last_page, topic_num
        )
        all_questions.extend(topic_questions)
        print(f"  Topic {topic_num}: 提取了 {len(topic_questions)} 道题")

    questions_json_path = os.path.join(output_dir, 'questions.json')
    with open(questions_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)
    print(f"\n所有题目已保存到: {questions_json_path}")
    print(f"总共提取了 {len(all_questions)} 道题")

    return all_questions

def extract_questions_precise(reader, start_page, end_page, topic_num):
    """Extract the questions found on pages ``start_page``..``end_page``.

    Args:
        reader: Object exposing ``pages``, a sequence whose items provide
            ``extract_text()`` (a ``pypdf.PdfReader`` in this script).
        start_page: Index of the first page to read.
        end_page: Index of the last page to read (inclusive). Values past
            the end of the document are clamped to the last page.
        topic_num: Topic identifier stored on every extracted question.

    Returns:
        A list of the dicts produced by ``parse_question_content``, in
        document order. Blocks for which the parser returns ``None`` are
        skipped.
    """
    # Never index past the last page: a topic record whose end_page is
    # beyond the document would otherwise raise IndexError here.
    last_page = min(end_page, len(reader.pages) - 1)

    page_texts = []
    for page_num in range(start_page, last_page + 1):
        text = reader.pages[page_num].extract_text()
        if text:
            page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    # A question block starts at "Question #<n>" followed by a newline and
    # runs until the next question header, a "Topic <n>" marker or the end
    # of the text.
    question_pattern = re.compile(
        r'Question\s+#(\d+)\s*\n(.*?)(?=Question\s+#\d+|Topic\s+\d+|$)',
        re.DOTALL | re.IGNORECASE
    )

    questions = []
    for number, body in question_pattern.findall(full_text):
        question_data = parse_question_content(
            topic_num, int(number), body.strip()
        )
        if question_data:
            questions.append(question_data)

    return questions

# Line prefixes that are page furniture (vote widgets, answer headers) and
# must never be folded into the stem or into an option's text.
_NOISE_PREFIXES = (
    'Most Voted',
    'upvoted',
    'Selected Answer:',
    'Community vote',
    'Correct Answer',
)
_OPTION_RE = re.compile(r'^([A-Z])\.\s*(.*)', re.IGNORECASE)
_ANSWER_RE = re.compile(r'Correct Answer:\s*([A-Z,\s]+)', re.IGNORECASE)
_COMMENTS_RE = re.compile(r'^Comments', re.IGNORECASE)


def _append_option(options, label, parts):
    """Append the option built from ``parts`` unless it has no text."""
    text = " ".join(parts)
    if label is not None and text:
        options.append({'label': label, 'text': text.strip()})


def parse_question_content(topic_num, q_num, content):
    """Parse one question block into its stem, options and answer.

    The block is read line by line. Blank lines are ignored and parsing
    stops at the first line starting with "Comments". Lines before the
    first option ("<letter>. text") form the stem; lines after an option
    header continue that option's text.

    Args:
        topic_num: Topic identifier copied into the result.
        q_num: Question number copied into the result.
        content: Raw text of the question block.

    Returns:
        A dict with the keys ``topic``, ``question_num``, ``stem``,
        ``options`` (list of ``{'label', 'text'}`` dicts) and ``answer``
        (upper-cased, ``""`` when no answer line was found), or ``None``
        when neither a stem nor any option could be extracted.
    """
    stem_parts = []
    options = []
    correct_answer = ""
    current_label = None
    current_parts = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if _COMMENTS_RE.match(line):
            break

        answer_match = _ANSWER_RE.search(line)
        if answer_match:
            correct_answer = answer_match.group(1).strip().upper()
            continue

        option_match = _OPTION_RE.match(line)
        if option_match:
            # A new option header closes the option collected so far.
            _append_option(options, current_label, current_parts)
            current_label = option_match.group(1).upper()
            current_parts = [option_match.group(2)]
            continue

        # The same filter applies to stem and option lines, so a bare
        # "Correct Answer:" line (answer not given as letters) can no
        # longer leak into the text of the last option.
        if line.startswith(_NOISE_PREFIXES):
            continue

        if current_label is not None:
            current_parts.append(line)
        else:
            stem_parts.append(line)

    _append_option(options, current_label, current_parts)

    question_stem = " ".join(stem_parts).strip()

    if not question_stem and not options:
        return None

    return {
        'topic': topic_num,
        'question_num': q_num,
        'stem': question_stem,
        'options': options,
        'answer': correct_answer
    }

if __name__ == '__main__':
    import argparse

    # Every path may be overridden on the command line; the defaults are
    # the locations this script was originally hard-wired to, so running
    # it without arguments behaves as before.
    parser = argparse.ArgumentParser(
        description='Split an exam PDF by topic and extract its questions.'
    )
    parser.add_argument(
        'pdf_path',
        nargs='?',
        default='/Users/duguoyou/D365/MB-330_with_discussion.pdf',
        help='source PDF (default: %(default)s)',
    )
    parser.add_argument(
        'topics_info_path',
        nargs='?',
        default='/Users/duguoyou/D365/topics_info.json',
        help='JSON file describing the topics (default: %(default)s)',
    )
    parser.add_argument(
        'output_dir',
        nargs='?',
        default='/Users/duguoyou/D365/exam_data',
        help='directory receiving the results (default: %(default)s)',
    )
    args = parser.parse_args()

    questions = split_pdf_and_extract_questions(
        args.pdf_path, args.topics_info_path, args.output_dir
    )