#!/usr/bin/env python3
"""
Analyze the structure of a PDF file and locate where each Topic begins.
"""
import re
|
||
from PyPDF2 import PdfReader
|
||
|
||
def analyze_pdf_structure(pdf_path):
    """Scan a PDF and record the page range and question count of each Topic.

    Every page's extracted text is split into lines. A stripped line that
    starts with ``Topic <number>`` (case-insensitive) opens that topic; a
    stripped line that starts with ``Question <number>`` raises the current
    topic's question count to that number if it is higher. Progress and a
    final summary are printed to stdout.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PdfReader``.

    Returns:
        A ``(topics, total_pages)`` tuple. ``topics`` maps each integer topic
        number to a dict with the keys ``'title'`` (the matching header
        line), ``'start_page'`` and ``'end_page'`` (0-based page indexes) and
        ``'question_count'`` (the highest question number seen, not the
        number of matching lines). ``total_pages`` is the page count.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
    question_pattern = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None
    question_count = 0

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page (e.g. image-only page).
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num == current_topic:
                    # Repeated header of the topic we are already inside:
                    # keep its start page and running question count.
                    continue

                if current_topic is not None:
                    # The previous topic ends on the page where the next
                    # topic header shows up.
                    topics[current_topic]['end_page'] = page_num
                    topics[current_topic]['question_count'] = question_count

                if topic_num in topics:
                    # Topic seen earlier: resume its record instead of
                    # overwriting it, so nothing is lost or counted twice.
                    question_count = topics[topic_num]['question_count']
                else:
                    topics[topic_num] = {
                        'title': line,
                        'start_page': page_num,
                        'end_page': None,
                        'question_count': 0
                    }
                    question_count = 0
                    print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")

                current_topic = topic_num
                continue

            question_match = question_pattern.match(line)
            if question_match and current_topic is not None:
                # Track the highest question number seen in this topic.
                question_count = max(question_count, int(question_match.group(1)))

    if current_topic is not None:
        # The topic still open at the end of the document runs to the last page.
        topics[current_topic]['end_page'] = total_pages - 1
        topics[current_topic]['question_count'] = question_count

    # Derive the total from the final records so it always agrees with the
    # per-topic figures printed below.
    total_questions = sum(info['question_count'] for info in topics.values())

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages
|
||
|
||
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the built-in default
    # PDF location; with no arguments the script behaves as before.
    default_pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    topics, total_pages = analyze_pdf_structure(pdf_path)
|