first commit

2026-03-21 09:12:47 +08:00
commit a1e76157c9
80 changed files with 506309 additions and 0 deletions
--- a/split_pdf.py
+++ b/split_pdf.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+切割PDF文件并提取题目内容
+"""
+import re
+import json
+import os
+from pypdf import PdfReader, PdfWriter
+
+def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
+    """Split the PDF into one file per topic and extract every question.
+
+    ``topics_info_path`` is a JSON list of objects with ``topic_num``,
+    ``start_page`` and ``end_page`` (page indices as used by ``reader.pages``,
+    end inclusive). Writes ``pdfs/topic_NN.pdf`` per topic and a combined
+    ``questions.json`` under ``output_dir``; returns the question dicts.
+    """
+    with open(topics_info_path, 'r', encoding='utf-8') as f:
+        topics = json.load(f)
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+    pdf_dir = os.path.join(output_dir, 'pdfs')
+    os.makedirs(pdf_dir, exist_ok=True)  # creates output_dir as well
+
+    all_questions = []
+    for topic in topics:
+        topic_num = topic['topic_num']
+        start_page = topic['start_page']
+        # Clamp once so the PDF slice and the text extraction cover the same
+        # pages; an end_page past the last page used to raise IndexError.
+        last_page = min(topic['end_page'], total_pages - 1)
+
+        writer = PdfWriter()
+        for page_num in range(start_page, last_page + 1):
+            writer.add_page(reader.pages[page_num])
+        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
+        with open(pdf_output_path, 'wb') as f:
+            writer.write(f)
+        print(f"已保存: {pdf_output_path}")
+
+        print(f"正在提取 Topic {topic_num} 的题目内容...")
+        topic_questions = extract_questions_from_pages(reader, start_page, last_page, topic_num)
+        all_questions.extend(topic_questions)
+        print(f"  Topic {topic_num}: 提取了 {len(topic_questions)} 道题")
+
+    questions_json_path = os.path.join(output_dir, 'questions.json')
+    with open(questions_json_path, 'w', encoding='utf-8') as f:
+        json.dump(all_questions, f, ensure_ascii=False, indent=2)
+    print(f"\n所有题目已保存到: {questions_json_path}")
+    print(f"总共提取了 {len(all_questions)} 道题")
+    return all_questions
+
+def extract_questions_from_pages(reader, start_page, end_page, topic_num):
+    """Parse question records from the text of pages start_page..end_page.
+
+    The inclusive range is clamped to the pages ``reader`` has, so a too-large
+    ``end_page`` does not raise IndexError. A line containing ``Question #<n>``
+    opens a record whose ``content`` is that line; later lines starting with
+    ``A.``..``D.`` are options, ``Correct Answer:`` sets the answer, and lines
+    after a ``Comments`` line are joined with single spaces as the explanation.
+    Lines before the first header are ignored. Returns a list of dicts with
+    keys topic, question_num, content, options, answer and explanation.
+    """
+    questions = []
+    current = None
+    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)
+    first_page = max(start_page, 0)  # a negative index would wrap around
+    last_page = min(end_page, len(reader.pages) - 1)
+
+    for page_num in range(first_page, last_page + 1):
+        text = reader.pages[page_num].extract_text()
+        if not text:  # pages with no extractable text are skipped
+            continue
+        for line in text.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            q_match = question_pattern.search(line)
+            if q_match:
+                if current:
+                    questions.append(current)
+                current = {
+                    'topic': topic_num,
+                    'question_num': int(q_match.group(1)),
+                    'content': line,
+                    'options': [],
+                    'answer': None,
+                    'explanation': None,
+                }
+            elif current:
+                if line.startswith(('A.', 'B.', 'C.', 'D.')):
+                    current['options'].append(line)
+                elif line.startswith('Correct Answer:'):
+                    current['answer'] = line.replace('Correct Answer:', '').strip()
+                elif line.startswith('Comments'):
+                    current['explanation'] = ''
+                elif current['explanation'] is not None:
+                    current['explanation'] = f"{current['explanation']} {line}".lstrip()
+    if current:
+        questions.append(current)
+    return questions
+
+if __name__ == '__main__':
+    # Script entry point: fixed local input/output locations for this run.
+    output_dir = '/Users/duguoyou/D365/exam_data'
+    topics_info_path = '/Users/duguoyou/D365/topics_info.json'
+    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
+    questions = split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir)