Files
d365scm/analyze_pdf.py
2026-03-21 09:12:47 +08:00

74 lines
2.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
分析PDF文件结构识别Topic位置
"""
import re
from PyPDF2 import PdfReader
def analyze_pdf_structure(pdf_path, reader=None):
    """Locate every "Topic N" heading in a PDF and summarise each topic.

    The text of each page is scanned line by line. A line starting with
    ``Topic <number>`` opens (or resumes) a topic; a line starting with
    ``Question <number>`` raises the question count of the topic that is
    currently open to that number if it is larger. Both matches are
    case-insensitive and are applied to the whitespace-stripped line.

    Args:
        pdf_path: Path of the PDF to open with ``PdfReader``. Ignored when
            ``reader`` is supplied.
        reader: Optional, already opened reader. Any object exposing a
            ``pages`` sequence whose items provide ``extract_text()`` works.
            When omitted, ``PdfReader(pdf_path)`` is created.

    Returns:
        A ``(topics, total_pages)`` tuple. ``topics`` maps the topic number
        to a dict with the keys ``'title'`` (the stripped heading line),
        ``'start_page'`` (0-based page of the first heading),
        ``'end_page'`` (0-based page on which the following heading was
        found, or the last page of the document) and ``'question_count'``
        (highest question number seen while the topic was open).
    """
    if reader is None:
        reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
    question_pattern = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Pages without extractable text contribute nothing.
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                # A new heading closes whatever topic was open so far.
                if current_topic is not None:
                    topics[current_topic]['end_page'] = page_num

                topic_num = int(topic_match.group(1))
                current_topic = topic_num

                # A heading that shows up again (for example as a running
                # page header) resumes the existing record instead of
                # overwriting it, so the first start page and the questions
                # counted so far are kept.
                if topic_num not in topics:
                    topics[topic_num] = {
                        'title': line,
                        'start_page': page_num,
                        'end_page': None,
                        'question_count': 0
                    }
                    print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")

            question_match = question_pattern.match(line)
            if question_match and current_topic is not None:
                q_num = int(question_match.group(1))
                info = topics[current_topic]
                # The count is the highest question number seen, not the
                # number of matching lines.
                if q_num > info['question_count']:
                    info['question_count'] = q_num

    # The topic still open at the end of the document runs to the last page.
    if current_topic is not None:
        topics[current_topic]['end_page'] = total_pages - 1

    # Derive the total from the final records so it always agrees with the
    # per-topic figures printed below.
    total_questions = sum(info['question_count'] for info in topics.values())

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages
if __name__ == '__main__':
    # Script entry point: analyse the PDF given as the first command-line
    # argument, falling back to the original default location when no
    # argument is supplied.
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics, total_pages = analyze_pdf_structure(pdf_path)