Files
d365scm/analyze_pdf_v2.py
2026-03-21 09:12:47 +08:00

92 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Analyze the structure of a PDF file: locate each Topic and count its questions.
"""
import re
import json
from PyPDF2 import PdfReader
_DEFAULT_OUTPUT_PATH = '/Users/duguoyou/D365/topics_info.json'


def _record_topic(topics, topic_num, start_page, end_page, questions):
    """Store one contiguous topic section in *topics*, merging on repeats.

    Args:
        topics: Dict keyed by topic number; updated in place.
        topic_num: Number parsed from the topic header.
        start_page: 0-based page index where the section's header was found.
        end_page: 0-based page index of the section's last page.
        questions: Question numbers seen in the section, in first-seen order.
    """
    # Two headers on the same page would otherwise give end_page < start_page
    # (even -1 on the first page).
    end_page = max(end_page, start_page)

    entry = topics.get(topic_num)
    if entry is None:
        topics[topic_num] = {
            'topic_num': topic_num,
            'start_page': start_page,
            'end_page': end_page,
            'questions': list(questions),
        }
        return

    # The same topic number showed up again: widen the page range and union
    # the questions instead of overwriting what was already collected.
    entry['start_page'] = min(entry['start_page'], start_page)
    entry['end_page'] = max(entry['end_page'], end_page)
    known = set(entry['questions'])
    entry['questions'].extend(q for q in questions if q not in known)


def analyze_pdf_structure(pdf_path, output_path=_DEFAULT_OUTPUT_PATH):
    """Locate every Topic in a PDF and count the questions under each one.

    Each page's extracted text is scanned line by line. A line consisting
    solely of ``Topic <n>`` (case-insensitive) opens a new section, and any
    line containing ``Question #<n>`` adds that number to the section
    currently being collected. Questions seen before the first Topic header
    are counted toward the first topic.

    A section is taken to end on the page before the next header, but never
    before its own start page. Sections that share a topic number are merged.

    Progress and a per-topic summary are printed, and the result is also
    written to ``output_path`` as UTF-8 JSON.

    Args:
        pdf_path: Path of the PDF file to analyze.
        output_path: Destination of the JSON summary. Defaults to the
            location this script has always written to.

    Returns:
        A list of dicts sorted by topic number, each with the keys
        ``topic_num``, ``start_page`` and ``end_page`` (0-based page
        indices), ``question_count`` and ``questions`` (sorted ascending).
    """
    import os  # local import: only used to name the output file in the final message

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None
    current_topic_start = None
    section_questions = []   # first-seen order for the open section
    section_seen = set()     # same numbers, for O(1) duplicate checks

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Pages without extractable text contribute nothing.
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                if current_topic is not None:
                    _record_topic(
                        topics,
                        current_topic,
                        current_topic_start,
                        page_num - 1,
                        section_questions,
                    )
                    section_questions = []
                    section_seen = set()
                current_topic = int(topic_match.group(1))
                current_topic_start = page_num
                print(f"发现 Topic {current_topic}: 第 {page_num + 1}")

            question_match = question_pattern.search(line)
            if question_match:
                q_num = int(question_match.group(1))
                if q_num not in section_seen:
                    section_seen.add(q_num)
                    section_questions.append(q_num)

    # The last open section runs to the end of the document.
    if current_topic is not None:
        _record_topic(
            topics,
            current_topic,
            current_topic_start,
            total_pages - 1,
            section_questions,
        )

    print(f"\n共发现 {len(topics)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        q_count = len(info['questions'])
        total_questions += q_count
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {q_count} 道题")
        result.append({
            'topic_num': topic_num,
            'start_page': info['start_page'],
            'end_page': info['end_page'],
            'question_count': q_count,
            'questions': sorted(info['questions']),
        })
    print(f"\n总题目数: {total_questions}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result
if __name__ == "__main__":
    # Script entry point: analyze the bundled exam PDF and keep the summary
    # in a module-level name for interactive inspection.
    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path=pdf_path)