Files
d365scm/analyze_pdf_v3.py
2026-03-21 09:12:47 +08:00

83 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Analyze the structure of a PDF file: locate each Topic and count its questions (improved version).
"""
import re
import json
from PyPDF2 import PdfReader
def analyze_pdf_structure(pdf_path, output_path='/Users/duguoyou/D365/topics_info.json'):
    """Locate every Topic in a PDF and count the questions under each one.

    The extracted text of each page is scanned line by line. A line that
    consists solely of ``Topic <n>`` (case-insensitive) is treated as a topic
    header; any line containing ``Question #<n>`` is treated as a question.
    Progress and per-topic statistics are printed to stdout, and the summary
    is optionally written out as JSON.

    Args:
        pdf_path: Path of the PDF to analyze; handed directly to ``PdfReader``.
        output_path: Destination of the JSON summary. Defaults to the
            location this script has always written to. Pass ``None`` to
            skip writing a file altogether.

    Returns:
        A list with one dict per topic, ordered by topic number. Each dict
        has the keys ``topic_num``, ``start_page`` and ``end_page`` (0-based
        indexes of the first and last page on which the topic header was
        seen), ``question_count``, and ``questions`` (sorted, de-duplicated
        question numbers).
    """
    import os  # local import: only needed to format the "saved" message

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_header_pattern = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)

    # topic number -> {'first_page': int, 'last_page': int} (0-based indexes)
    topic_pages = {}
    # topic number -> set of question numbers; a set de-duplicates in O(1)
    questions_by_topic = {}

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page (e.g. image-only page).
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_header_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num in topic_pages:
                    # Header seen again: extend the topic's page range.
                    topic_pages[topic_num]['last_page'] = page_num
                else:
                    topic_pages[topic_num] = {'first_page': page_num, 'last_page': page_num}
                    questions_by_topic[topic_num] = set()

            question_match = question_pattern.search(line)
            if question_match and topic_pages:
                # Pages are visited in ascending order, so every recorded
                # first_page is <= page_num; the question therefore always
                # lands on the highest-numbered topic seen so far.
                # NOTE: that equals the "current" topic only when topics
                # appear in ascending order in the PDF.
                questions_by_topic[max(topic_pages)].add(int(question_match.group(1)))

    print(f"\n共发现 {len(topic_pages)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topic_pages):
        questions = sorted(questions_by_topic[topic_num])
        q_count = len(questions)
        total_questions += q_count
        first_page = topic_pages[topic_num]['first_page']
        last_page = topic_pages[topic_num]['last_page']
        # Pages are shown 1-based for humans; the stored indexes stay 0-based.
        print(f" Topic {topic_num}: 第 {first_page+1}-{last_page+1} 页, {q_count} 道题 (题目编号: {questions[:5]}{'...' if len(questions) > 5 else ''})")
        result.append({
            'topic_num': topic_num,
            'start_page': first_page,
            'end_page': last_page,
            'question_count': q_count,
            'questions': questions,
        })
    print(f"\n总题目数: {total_questions}")

    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result
if __name__ == '__main__':
    # Script entry point. The PDF to analyze may be given as the first
    # command-line argument; without one, the original default is used.
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path)