first commit

This commit is contained in:
2026-03-21 09:12:47 +08:00
commit a1e76157c9
80 changed files with 506309 additions and 0 deletions

73
analyze_pdf.py Normal file
View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
分析PDF文件结构识别Topic位置
"""
import re
from PyPDF2 import PdfReader
def analyze_pdf_structure(pdf_path):
    """Scan a PDF and report where each "Topic N" section is located.

    Every page's extracted text is read line by line. A line starting with
    ``Topic <number>`` (case-insensitive) opens a new topic; a line starting
    with ``Question <number>`` raises the current topic's question count to
    that number if it is larger than what was seen so far. Progress and a
    final summary are printed to stdout.

    Args:
        pdf_path: Passed straight to ``PdfReader``.

    Returns:
        A ``(topics, total_pages)`` tuple. ``topics`` maps the topic number
        to a dict with these keys:

        * ``'title'``: the stripped header line that opened the topic.
        * ``'start_page'``: 0-based index of the page holding that header.
        * ``'end_page'``: 0-based index of the page on which the next
          topic's header was found, or the last page for the final topic.
        * ``'question_count'``: the highest question number seen inside
          the topic.

        ``total_pages`` is the number of pages in the document.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_pattern = re.compile(r'^Topic\s+(\d+)', re.IGNORECASE)
    question_pattern = re.compile(r'^Question\s+(\d+)', re.IGNORECASE)

    topics = {}
    current_topic = None

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page (e.g. image-only page).
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num == current_topic:
                    # Repeated header of the topic already being scanned
                    # (e.g. printed again on a later page). Re-opening it
                    # would overwrite start_page, reset the question count
                    # and inflate the overall total.
                    continue

                if current_topic is not None:
                    # The previous topic runs up to the page on which the
                    # next header shows up.
                    topics[current_topic]['end_page'] = page_num

                current_topic = topic_num
                topics[topic_num] = {
                    'title': line,
                    'start_page': page_num,
                    'end_page': None,
                    'question_count': 0,
                }
                print(f"发现 Topic {topic_num}: 第 {page_num + 1} 页 - {line}")
                continue

            if current_topic is None:
                # Questions appearing before any topic header are ignored.
                continue

            question_match = question_pattern.match(line)
            if question_match:
                info = topics[current_topic]
                q_num = int(question_match.group(1))
                # The count is the highest question number seen so far.
                info['question_count'] = max(info['question_count'], q_num)

    if current_topic is not None:
        # The last topic extends to the end of the document.
        topics[current_topic]['end_page'] = total_pages - 1

    # Derive the total from the final records so that it always agrees with
    # the per-topic figures listed below.
    total_questions = sum(info['question_count'] for info in topics.values())

    print(f"\n共发现 {len(topics)} 个Topic")
    print(f"总题目数: {total_questions}")
    print("\n各Topic统计:")
    for topic_num in sorted(topics):
        info = topics[topic_num]
        print(f" Topic {topic_num}: 第 {info['start_page']+1}-{info['end_page']+1} 页, {info['question_count']} 道题")

    return topics, total_pages
if __name__ == '__main__':
    import sys

    # The PDF may be given as the first command-line argument; without one
    # the original hard-coded location is used so existing runs behave the
    # same as before.
    default_pdf = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf
    topics, total_pages = analyze_pdf_structure(pdf_path)