first commit
This commit is contained in:
157
extract_questions_v3.py
Normal file
157
extract_questions_v3.py
Normal file
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
精确提取PDF题目内容 - 最终版
|
||||
"""
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
    """Split the PDF into one file per topic and gather every topic's questions.

    The topic list is loaded as JSON from ``topics_info_path``; each entry is
    read for its ``topic_num``, ``start_page`` and ``end_page`` keys. For each
    topic the page range (clamped to the document length) is written to
    ``<output_dir>/pdfs/topic_NN.pdf`` and the questions returned by
    ``extract_questions_precise`` are accumulated. The combined list is
    dumped to ``<output_dir>/questions.json`` and returned.
    """
    with open(topics_info_path, 'r', encoding='utf-8') as topics_file:
        topics = json.load(topics_file)

    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)

    # Create the output root first, then the per-topic PDF folder inside it.
    os.makedirs(output_dir, exist_ok=True)
    pdf_dir = os.path.join(output_dir, 'pdfs')
    os.makedirs(pdf_dir, exist_ok=True)

    collected = []

    for entry in topics:
        topic_num = entry['topic_num']
        first_page = entry['start_page']
        last_page = entry['end_page']

        # Copy the topic's pages into a fresh writer; the end is inclusive
        # but never allowed to run past the last page of the source PDF.
        stop = min(last_page + 1, page_count)
        writer = PdfWriter()
        for index in range(first_page, stop):
            writer.add_page(reader.pages[index])

        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
        with open(pdf_output_path, 'wb') as pdf_file:
            writer.write(pdf_file)
        print(f"已保存: {pdf_output_path}")

        print(f"正在提取 Topic {topic_num} 的题目内容...")
        found = extract_questions_precise(reader, first_page, last_page, topic_num)
        collected.extend(found)
        print(f" Topic {topic_num}: 提取了 {len(found)} 道题")

    questions_json_path = os.path.join(output_dir, 'questions.json')
    with open(questions_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(collected, json_file, ensure_ascii=False, indent=2)

    print(f"\n所有题目已保存到: {questions_json_path}")
    print(f"总共提取了 {len(collected)} 道题")

    return collected
|
||||
|
||||
def extract_questions_precise(reader, start_page, end_page, topic_num):
    """Extract all questions found in a page range of the PDF.

    The text of pages ``start_page`` through ``end_page`` (inclusive, used
    directly as indices into ``reader.pages``) is concatenated, then split
    into questions on ``Question #<n>`` headers. Each question body is handed
    to ``parse_question_content``; falsy results are dropped.

    Args:
        reader: object exposing ``pages``, a sequence whose items provide
            ``extract_text()``.
        start_page: index of the first page to read.
        end_page: index of the last page to read (inclusive). Values beyond
            the end of the document are clamped instead of raising.
        topic_num: topic identifier passed through to each parsed question.

    Returns:
        A list of the question dicts produced by ``parse_question_content``.
    """
    # Clamp the upper bound to the document length so a topic whose
    # end_page runs past the last page does not raise IndexError.
    stop_page = min(end_page + 1, len(reader.pages))

    page_texts = []
    for page_num in range(start_page, stop_page):
        text = reader.pages[page_num].extract_text()
        if text:
            page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    # A question runs from its "Question #n" header up to the next question
    # header, the next "Topic n" marker, or the end of the text.
    question_pattern = re.compile(
        r'Question\s+#(\d+)\s*\n(.*?)(?=Question\s+#\d+|Topic\s+\d+|$)',
        re.DOTALL | re.IGNORECASE
    )

    questions = []
    for number, body in question_pattern.findall(full_text):
        question_data = parse_question_content(topic_num, int(number), body.strip())
        if question_data:
            questions.append(question_data)

    return questions
|
||||
|
||||
def parse_question_content(topic_num, q_num, content):
    """Parse one question's raw text into stem, options and correct answer.

    Lines are processed in order. Parsing stops at the first line starting
    with "Comments". Text before the first option line (``X. ...``) forms the
    stem; every option collects its own line plus the following lines until
    the next option. Lines beginning with known noise markers are ignored.

    Args:
        topic_num: topic identifier stored in the result.
        q_num: question number stored in the result.
        content: raw multi-line text of a single question.

    Returns:
        A dict with keys ``topic``, ``question_num``, ``stem``, ``options``
        (list of ``{'label', 'text'}`` dicts) and ``answer``; or ``None``
        when neither a stem nor any option was found.
    """
    # Markers of page decoration (vote widgets etc.) that must not leak into
    # the stem or option text.
    noise_prefixes = (
        'Most Voted', 'upvoted', 'Selected Answer:', 'Community vote', 'Correct Answer'
    )

    option_pattern = re.compile(r'^([A-Z])\.\s*(.*)', re.IGNORECASE)
    # An answer token is a run of capitals ("A", "AB") or one lone lowercase
    # letter, never the start of an ordinary word. The previous character
    # class [A-Z,\s]+ under IGNORECASE also swallowed trailing words, turning
    # "Correct Answer: B Most Voted" into the answer "B MOST VOTED".
    answer_token = r'(?:[A-Z]+(?![a-z])|[a-z](?![A-Za-z]))'
    answer_pattern = re.compile(
        r'(?i:Correct Answer:)\s*('
        + answer_token
        + r'(?:[,\s]+' + answer_token + r')*)'
    )
    comments_pattern = re.compile(r'^Comments', re.IGNORECASE)

    def append_option(label, parts):
        # Store a finished option; options that never received text are dropped.
        text = " ".join(parts).strip()
        if label is not None and text:
            options.append({'label': label, 'text': text})

    stem_parts = []
    options = []
    correct_answer = ""
    current_option = None
    option_parts = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Everything from the comments section onward is discussion, not question.
        if comments_pattern.match(line):
            break

        answer_match = answer_pattern.search(line)
        if answer_match:
            correct_answer = answer_match.group(1).strip().upper()
            # Keep whatever else shares the line with the answer marker.
            line = answer_pattern.sub('', line).strip()
            if not line:
                continue

        option_match = option_pattern.match(line)
        if option_match:
            append_option(current_option, option_parts)
            current_option = option_match.group(1).upper()
            option_parts = [option_match.group(2)]
            continue

        if line.startswith(noise_prefixes):
            continue

        # Once the first option has been seen, free text continues that
        # option; before that it belongs to the stem.
        if current_option is not None:
            option_parts.append(line)
        else:
            stem_parts.append(line)

    append_option(current_option, option_parts)

    question_stem = " ".join(stem_parts).strip()

    if not question_stem and not options:
        return None

    return {
        'topic': topic_num,
        'question_num': q_num,
        'stem': question_stem,
        'options': options,
        'answer': correct_answer
    }
|
||||
|
||||
if __name__ == '__main__':
    # Local import: argparse is only needed when run as a script.
    import argparse

    # The three locations used to be fixed to one machine's absolute paths.
    # They are now optional positional arguments whose defaults are those
    # same paths, so running the script with no arguments behaves as before.
    parser = argparse.ArgumentParser(
        description='Split an exam PDF by topic and extract its questions.'
    )
    parser.add_argument(
        'pdf_path',
        nargs='?',
        default='/Users/duguoyou/D365/MB-330_with_discussion.pdf',
        help='source PDF to split and parse',
    )
    parser.add_argument(
        'topics_info_path',
        nargs='?',
        default='/Users/duguoyou/D365/topics_info.json',
        help='JSON file listing topic_num/start_page/end_page per topic',
    )
    parser.add_argument(
        'output_dir',
        nargs='?',
        default='/Users/duguoyou/D365/exam_data',
        help='directory receiving the per-topic PDFs and questions.json',
    )
    args = parser.parse_args()

    pdf_path = args.pdf_path
    topics_info_path = args.topics_info_path
    output_dir = args.output_dir

    questions = split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir)
|
||||
Reference in New Issue
Block a user