#!/usr/bin/env python3
"""
切割PDF文件并提取题目内容 (Split a PDF by topic and extract exam questions.)
"""
import re
import json
import os

from pypdf import PdfReader, PdfWriter

def split_pdf_and_extract_questions(pdf_path, topics_info_path, output_dir):
    """Split a PDF into per-topic sub-PDFs and extract every question.

    Reads topic page ranges from ``topics_info_path`` (a JSON list of
    dicts with ``topic_num``, ``start_page`` and ``end_page``), writes one
    sub-PDF per topic under ``<output_dir>/pdfs/topic_NN.pdf``, extracts
    the question text for each topic, and saves the combined result to
    ``<output_dir>/questions.json``.

    Args:
        pdf_path: path to the source PDF.
        topics_info_path: path to the JSON file describing topic ranges.
        output_dir: directory that receives the sub-PDFs and the JSON dump.

    Returns:
        list[dict]: all extracted question records across every topic.
    """
    with open(topics_info_path, 'r', encoding='utf-8') as f:
        topic_list = json.load(f)

    source = PdfReader(pdf_path)
    page_count = len(source.pages)

    pdf_dir = os.path.join(output_dir, 'pdfs')
    for directory in (output_dir, pdf_dir):
        os.makedirs(directory, exist_ok=True)

    collected = []

    for entry in topic_list:
        topic_num = entry['topic_num']
        first = entry['start_page']
        last = entry['end_page']

        # Copy this topic's pages into a standalone PDF; the upper bound
        # is clamped so a range running past the document end is safe.
        writer = PdfWriter()
        for idx in range(first, min(last + 1, page_count)):
            writer.add_page(source.pages[idx])

        pdf_output_path = os.path.join(pdf_dir, f'topic_{topic_num:02d}.pdf')
        with open(pdf_output_path, 'wb') as out:
            writer.write(out)
        print(f"已保存: {pdf_output_path}")

        print(f"正在提取 Topic {topic_num} 的题目内容...")
        extracted = extract_questions_from_pages(source, first, last, topic_num)
        collected.extend(extracted)
        print(f" Topic {topic_num}: 提取了 {len(extracted)} 道题")

    questions_json_path = os.path.join(output_dir, 'questions.json')
    with open(questions_json_path, 'w', encoding='utf-8') as f:
        json.dump(collected, f, ensure_ascii=False, indent=2)

    print(f"\n所有题目已保存到: {questions_json_path}")
    print(f"总共提取了 {len(collected)} 道题")

    return collected

def extract_questions_from_pages(reader, start_page, end_page, topic_num):
    """Extract question records from a page range of *reader*.

    Scans pages ``start_page``..``end_page`` (0-based, inclusive, clamped
    to the document length) for lines matching ``Question #N`` and
    accumulates each question's option lines, correct answer and
    discussion ("Comments") text.

    Args:
        reader: an open ``PdfReader`` — any object whose ``pages`` items
            expose ``extract_text()`` works.
        start_page: first page index to scan.
        end_page: last page index to scan (inclusive).
        topic_num: topic number stamped onto every extracted question.

    Returns:
        list[dict]: one dict per question with keys ``topic``,
        ``question_num``, ``content``, ``options``, ``answer`` and
        ``explanation``.
    """
    questions = []
    current_question = None
    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)

    # BUGFIX: clamp to the real page count. The splitting caller clamps
    # end_page when feeding the PdfWriter but passes the raw value here,
    # so a range past the last page previously raised IndexError.
    last_page = min(end_page, len(reader.pages) - 1)

    for page_num in range(start_page, last_page + 1):
        text = reader.pages[page_num].extract_text()
        if not text:
            # Some pages (images/scans) yield no extractable text.
            continue

        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue

            q_match = question_pattern.search(line)
            if q_match:
                # New question header: flush the one being accumulated.
                if current_question:
                    questions.append(current_question)

                current_question = {
                    'topic': topic_num,
                    'question_num': int(q_match.group(1)),
                    'content': line,
                    'options': [],
                    'answer': None,
                    'explanation': None,
                }
            elif current_question:
                if line.startswith(('A.', 'B.', 'C.', 'D.')):
                    current_question['options'].append(line)
                elif line.startswith('Correct Answer:'):
                    # Slice off the known prefix (str.replace would also
                    # clobber any later occurrence of the phrase).
                    current_question['answer'] = line[len('Correct Answer:'):].strip()
                elif line.startswith('Comments'):
                    # Marks the start of the discussion section.
                    current_question['explanation'] = ''
                elif current_question.get('explanation') is not None:
                    # Everything after 'Comments' is explanation text.
                    current_question['explanation'] += ' ' + line

    # Flush the final question of the range.
    if current_question:
        questions.append(current_question)

    return questions

if __name__ == '__main__':
    # One-off run: fixed source PDF, topic index, and output directory.
    source_pdf = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics_json = '/Users/duguoyou/D365/topics_info.json'
    out_dir = '/Users/duguoyou/D365/exam_data'

    questions = split_pdf_and_extract_questions(source_pdf, topics_json, out_dir)