first commit

2026-03-21 09:12:47 +08:00
commit a1e76157c9
80 changed files with 506309 additions and 0 deletions
--- a/extract_questions_v3.py
+++ b/extract_questions_v3.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+精确提取PDF题目内容 - 最终版
+"""
+import re
+import json
+import os
+from pypdf import PdfReader, PdfWriter
+
+def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
+    """Split a PDF into one file per topic and extract each topic's questions.
+
+    Args:
+        pdf_path: Path of the source PDF, opened with ``PdfReader``.
+        topics_info_path: Path of a UTF-8 JSON file containing a sequence of
+            objects with the keys ``topic_num``, ``start_page`` and
+            ``end_page``. The page values are used directly as indexes into
+            ``reader.pages`` and ``end_page`` is inclusive.
+        output_dir: Directory that receives ``questions.json`` and the
+            ``pdfs/`` sub-directory; both directories are created if missing.
+
+    Returns:
+        The list of question dicts gathered from all topics, in the order the
+        topics appear in the JSON file. The same list is written to
+        ``<output_dir>/questions.json``.
+    """
+    with open(topics_info_path, 'r', encoding='utf-8') as f:
+        topics = json.load(f)
+
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+
+    # Creating the nested directory also creates output_dir itself.
+    pdf_dir = os.path.join(output_dir, 'pdfs')
+    os.makedirs(pdf_dir, exist_ok=True)
+
+    all_questions = []
+
+    for topic in topics:
+        topic_num = topic['topic_num']
+        start_page = topic['start_page']
+        # Clamp the inclusive end page once so the per-topic PDF and the text
+        # extraction cover exactly the same pages. Previously only the PDF
+        # writer was clamped, so an end_page past the last page wrote the PDF
+        # and then failed with IndexError during extraction.
+        last_page = min(topic['end_page'], total_pages - 1)
+
+        writer = PdfWriter()
+        for page_num in range(start_page, last_page + 1):
+            writer.add_page(reader.pages[page_num])
+
+        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
+        with open(pdf_output_path, 'wb') as f:
+            writer.write(f)
+        print(f"已保存: {pdf_output_path}")
+
+        print(f"正在提取 Topic {topic_num} 的题目内容...")
+        topic_questions = extract_questions_precise(reader, start_page, last_page, topic_num)
+        all_questions.extend(topic_questions)
+        print(f"  Topic {topic_num}: 提取了 {len(topic_questions)} 道题")
+
+    questions_json_path = os.path.join(output_dir, 'questions.json')
+    with open(questions_json_path, 'w', encoding='utf-8') as f:
+        json.dump(all_questions, f, ensure_ascii=False, indent=2)
+    print(f"\n所有题目已保存到: {questions_json_path}")
+    print(f"总共提取了 {len(all_questions)} 道题")
+
+    return all_questions
+
+def extract_questions_precise(reader, start_page, end_page, topic_num):
+    """Extract the questions found on a range of pages.
+
+    The text of every page in the range is concatenated, then split into
+    chunks that start at a ``Question #<n>`` heading and end right before the
+    next ``Question #<n>``, the next ``Topic <n>`` marker, or the end of the
+    text (all matched case-insensitively). Each chunk is handed to
+    ``parse_question_content``.
+
+    Args:
+        reader: Object exposing ``pages``, an indexable sequence whose items
+            provide ``extract_text()``.
+        start_page: Index of the first page to read.
+        end_page: Index of the last page to read (inclusive). Values past the
+            last available page are clamped instead of raising IndexError.
+        topic_num: Topic identifier forwarded to ``parse_question_content``.
+
+    Returns:
+        A list with one parsed question per chunk, skipping chunks for which
+        ``parse_question_content`` returned a falsy value.
+    """
+    # Never index beyond the document, whatever the topic metadata says.
+    stop_page = min(end_page + 1, len(reader.pages))
+
+    page_texts = []
+    for page_num in range(start_page, stop_page):
+        text = reader.pages[page_num].extract_text()
+        if text:
+            page_texts.append(text + "\n")
+    full_text = "".join(page_texts)
+
+    question_pattern = re.compile(
+        r'Question\s+#(\d+)\s*\n(.*?)(?=Question\s+#\d+|Topic\s+\d+|$)',
+        re.DOTALL | re.IGNORECASE
+    )
+
+    questions = []
+    for number, body in question_pattern.findall(full_text):
+        question_data = parse_question_content(topic_num, int(number), body.strip())
+        if question_data:
+            questions.append(question_data)
+
+    return questions
+
+def parse_question_content(topic_num, q_num, content):
+    """Parse one question chunk into its stem, options and correct answer.
+
+    The chunk is processed line by line (blank lines are ignored):
+
+    * a line starting with ``Comments`` ends parsing;
+    * ``Correct Answer: <letters>`` records the answer and is removed from the
+      line, the remainder of the line is then processed normally;
+    * a line starting with ``<letter>.`` opens a new option;
+    * any other line continues the current option, or the stem when no option
+      has been opened yet, unless it starts with a known page-noise prefix.
+
+    Args:
+        topic_num: Value stored under ``'topic'`` in the result.
+        q_num: Value stored under ``'question_num'`` in the result.
+        content: Raw text of a single question.
+
+    Returns:
+        A dict with the keys ``topic``, ``question_num``, ``stem``,
+        ``options`` (list of ``{'label', 'text'}`` dicts; options without any
+        text are dropped) and ``answer`` (``""`` when none was found), or
+        ``None`` when neither a stem nor any option was found.
+    """
+    noise_prefixes = ('Most Voted', 'upvoted', 'Selected Answer:', 'Community vote', 'Correct Answer')
+
+    option_pattern = re.compile(r'^([A-Z])\.\s*(.*)', re.IGNORECASE)
+    # Only the label is case-insensitive. The answer letters must be uppercase
+    # groups ending on a word boundary, otherwise text that follows on the
+    # same line (e.g. "Community vote distribution") is swallowed into the
+    # answer, which is what happened with `[A-Z,\s]+` under re.IGNORECASE.
+    answer_pattern = re.compile(r'(?i:Correct Answer:)\s*([A-Z]+(?:[\s,]+[A-Z]+)*)\b')
+    comments_pattern = re.compile(r'^Comments', re.IGNORECASE)
+
+    stem_parts = []
+    option_chunks = []  # (label, [text fragments]) in order of appearance
+    correct_answer = ""
+
+    for raw_line in content.split('\n'):
+        line = raw_line.strip()
+        if not line:
+            continue
+
+        if comments_pattern.match(line):
+            break
+
+        answer_match = answer_pattern.search(line)
+        if answer_match:
+            correct_answer = answer_match.group(1).strip().upper()
+            line = answer_pattern.sub('', line).strip()
+            if not line:
+                continue
+
+        option_match = option_pattern.match(line)
+        if option_match:
+            option_chunks.append((option_match.group(1).upper(), [option_match.group(2)]))
+        elif not line.startswith(noise_prefixes):
+            # Free text belongs to the most recent option once one has been
+            # opened, and to the stem before that.
+            target = option_chunks[-1][1] if option_chunks else stem_parts
+            target.append(line)
+
+    options = []
+    for label, fragments in option_chunks:
+        text = " ".join(fragments).strip()
+        if text:
+            options.append({'label': label, 'text': text})
+
+    question_stem = " ".join(stem_parts)
+
+    if not question_stem and not options:
+        return None
+
+    return {
+        'topic': topic_num,
+        'question_num': q_num,
+        'stem': question_stem,
+        'options': options,
+        'answer': correct_answer
+    }
+
+if __name__ == '__main__':
+    # Imported here because only the command-line entry point needs it.
+    import argparse
+
+    # All three arguments are optional; when omitted the script uses the same
+    # paths it was originally hard-coded to.
+    parser = argparse.ArgumentParser(
+        description='Split an exam PDF by topic and extract its questions to JSON.'
+    )
+    parser.add_argument(
+        'pdf_path', nargs='?',
+        default='/Users/duguoyou/D365/MB-330_with_discussion.pdf',
+        help='source PDF (default: %(default)s)',
+    )
+    parser.add_argument(
+        'topics_info_path', nargs='?',
+        default='/Users/duguoyou/D365/topics_info.json',
+        help='JSON file describing the page range of each topic (default: %(default)s)',
+    )
+    parser.add_argument(
+        'output_dir', nargs='?',
+        default='/Users/duguoyou/D365/exam_data',
+        help='directory for questions.json and the per-topic PDFs (default: %(default)s)',
+    )
+    args = parser.parse_args()
+
+    questions = split_pdf_and_extract_questions(args.pdf_path, args.topics_info_path, args.output_dir)