#!/usr/bin/env python3
"""
Analyze the structure of a PDF file: locate each Topic and count its questions.
"""
import json
import os
import re
import sys

from PyPDF2 import PdfReader


# A line consisting solely of "Topic <n>" marks the start of a topic section.
_TOPIC_HEADER = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
# "Question #<n>" may appear anywhere within a line.
_QUESTION_REF = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)
# Historical output location, kept as the default for backward compatibility.
_DEFAULT_OUTPUT_PATH = '/Users/duguoyou/D365/topics_info.json'


def _record_topic_section(topics, topic_num, start_page, end_page, questions):
    """Store one finished topic section, merging into an existing entry if any."""
    # A header on the very page where the previous topic began would otherwise
    # yield end_page < start_page.
    end_page = max(start_page, end_page)
    entry = topics.get(topic_num)
    if entry is None:
        topics[topic_num] = {
            'topic_num': topic_num,
            'start_page': start_page,
            'end_page': end_page,
            'questions': set(questions),
        }
        return
    # The same topic number showed up again later in the document: widen the
    # page range and union the questions instead of overwriting the entry.
    entry['start_page'] = min(entry['start_page'], start_page)
    entry['end_page'] = max(entry['end_page'], end_page)
    entry['questions'].update(questions)


def analyze_pdf_structure(pdf_path, output_path=_DEFAULT_OUTPUT_PATH):
    """
    Scan a PDF for "Topic N" headers and "Question #N" markers.

    Each page's extracted text is split into lines. A stripped line that is
    exactly "Topic <n>" (case-insensitive) opens a new topic section; every
    "Question #<n>" found afterwards is attributed to that topic until a
    header with a different number appears. A repeated header for the topic
    already in progress is ignored, so it does not reset the collected
    questions. Question numbers seen before the first header are attributed
    to the first topic. Pages with no extractable text are skipped.

    Progress and per-topic statistics are printed, and the result is also
    written as JSON to ``output_path``.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.
        output_path: Destination of the JSON summary.

    Returns:
        A list of dicts ordered by topic number, each with the keys
        ``topic_num``, ``start_page`` and ``end_page`` (0-based page
        indices), ``question_count``, and ``questions`` (sorted, unique
        question numbers).
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topics = {}
    current_topic = None
    current_topic_start = None
    pending_questions = set()

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = _TOPIC_HEADER.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                # Only a header with a different number starts a new section.
                if topic_num != current_topic:
                    if current_topic is not None:
                        _record_topic_section(
                            topics,
                            current_topic,
                            current_topic_start,
                            page_num - 1,
                            pending_questions,
                        )
                        pending_questions = set()
                    current_topic = topic_num
                    current_topic_start = page_num
                    print(f"发现 Topic {current_topic}: 第 {page_num + 1} 页")

            question_match = _QUESTION_REF.search(line)
            if question_match:
                pending_questions.add(int(question_match.group(1)))

    # The last open topic runs to the end of the document.
    if current_topic is not None:
        _record_topic_section(
            topics,
            current_topic,
            current_topic_start,
            total_pages - 1,
            pending_questions,
        )

    print(f"\n共发现 {len(topics)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        questions = sorted(info['questions'])
        q_count = len(questions)
        total_questions += q_count
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {q_count} 道题")
        result.append({
            'topic_num': topic_num,
            'start_page': info['start_page'],
            'end_page': info['end_page'],
            'question_count': q_count,
            'questions': questions,
        })

    print(f"\n总题目数: {total_questions}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Report the file name only, which matches the original message for the
    # default output path.
    print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result


if __name__ == '__main__':
    # An optional first command-line argument overrides the input PDF; with no
    # argument the original hard-coded path is used.
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path)