first commit
This commit is contained in:
73
analyze_pdf.py
Normal file
73
analyze_pdf.py
Normal file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Analyze the structure of a PDF file and identify where each Topic is located.
"""
|
||||
import re
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
def analyze_pdf_structure(pdf_path):
    """Scan a PDF page by page and report where each Topic section lies.

    Every page's extracted text is split on newlines and each line is
    stripped. A line starting with ``Topic <n>`` (case-insensitive) opens
    topic *n* and closes the previously open topic. A line starting with
    ``Question <m>`` while a topic is open raises that topic's question
    count to the largest *m* seen so far. Progress and a final summary are
    printed to stdout.

    A heading that repeats the number of the topic that is already open
    (for example a running header on later pages) is ignored, so it neither
    resets the topic's start page nor double-counts its questions in the
    grand total.

    Args:
        pdf_path: Passed unchanged to ``PdfReader``.

    Returns:
        A ``(topics, total_pages)`` tuple. ``topics`` maps each topic number
        to a dict with the keys ``'title'`` (the stripped heading line),
        ``'start_page'`` (0-based index of the page holding the heading),
        ``'end_page'`` (0-based index of the page on which the next topic
        heading was found, or of the last page for the final topic) and
        ``'question_count'``. ``total_pages`` is the number of pages.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
    question_pattern = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None
    question_count = 0  # highest question number seen in the open topic
    total_questions = 0

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page.
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num == current_topic:
                    # Same topic announced again: keep the existing entry
                    # and keep counting instead of starting over.
                    continue

                if current_topic is not None:
                    # Close the previous topic on the page where the new
                    # heading appears.
                    previous = topics[current_topic]
                    previous['end_page'] = page_num
                    previous['question_count'] = question_count
                    total_questions += question_count

                current_topic = topic_num
                topics[topic_num] = {
                    'title': line,
                    'start_page': page_num,
                    'end_page': None,
                    'question_count': 0
                }
                question_count = 0
                print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")
                # A heading line cannot also match the question pattern.
                continue

            if current_topic is None:
                # Questions before the first topic heading are not counted.
                continue

            question_match = question_pattern.match(line)
            if question_match:
                question_count = max(question_count,
                                     int(question_match.group(1)))

    if current_topic is not None:
        # The last topic runs to the final page of the document.
        last = topics[current_topic]
        last['end_page'] = total_pages - 1
        last['question_count'] = question_count
        total_questions += question_count

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages
|
||||
|
||||
if __name__ == '__main__':
    import sys

    # An optional first command-line argument selects the PDF to analyze;
    # without it the script falls back to the original hard-coded location.
    default_pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    topics, total_pages = analyze_pdf_structure(pdf_path)
|
||||
Reference in New Issue
Block a user