83 lines
3.1 KiB
Python
83 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
分析PDF文件结构,识别Topic位置和题目数量 - 改进版
|
||
"""
|
||
import re
|
||
import json
|
||
from PyPDF2 import PdfReader
|
||
|
||
def analyze_pdf_structure(pdf_path, output_path='/Users/duguoyou/D365/topics_info.json'):
    """Locate every "Topic N" header in a PDF and count the questions under it.

    Each page's extracted text is scanned line by line. A stripped line that
    consists solely of ``Topic <number>`` (case-insensitive) registers or
    extends that topic's page range. A line containing ``Question #<number>``
    (case-insensitive) is credited to the highest-numbered topic seen so far;
    question markers that appear before any topic header are ignored.

    A per-topic summary is printed, and the collected data is written as JSON
    to ``output_path``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.
        output_path: Destination of the JSON report. Defaults to the location
            the script has always written to, so existing callers are
            unaffected.

    Returns:
        A list with one dict per topic, ordered by topic number, with keys
        ``topic_num``, ``start_page``, ``end_page``, ``question_count`` and
        ``questions`` (sorted, de-duplicated question numbers). Page values
        in the result are 0-based indices; the printed summary shows them
        1-based.
    """
    # Local import: only needed to name the output file in the final message.
    import os

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"PDF总页数: {total_pages}")

    topic_header_pattern = re.compile(r'^Topic\s+(\d+)$', re.IGNORECASE)
    question_pattern = re.compile(r'Question\s+#(\d+)', re.IGNORECASE)

    # topic number -> {'first_page': int, 'last_page': int} (0-based indices)
    topic_pages = {}
    # topic number -> set of question numbers; a set gives O(1) de-duplication
    all_questions_in_topic = {}

    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page (e.g. image-only page).
            continue

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            topic_match = topic_header_pattern.match(line)
            if topic_match:
                topic_num = int(topic_match.group(1))
                if topic_num not in topic_pages:
                    topic_pages[topic_num] = {'first_page': page_num, 'last_page': page_num}
                    all_questions_in_topic[topic_num] = set()
                else:
                    topic_pages[topic_num]['last_page'] = page_num

            question_match = question_pattern.search(line)
            if question_match and topic_pages:
                # Topics are registered while pages are scanned in order, so
                # every known topic starts on or before the current page; the
                # owner is therefore simply the highest topic number so far.
                owner = max(topic_pages)
                all_questions_in_topic[owner].add(int(question_match.group(1)))

    print(f"\n共发现 {len(topic_pages)} 个Topic")

    result = []
    total_questions = 0
    print("\n各Topic统计:")
    for topic_num in sorted(topic_pages):
        questions = sorted(all_questions_in_topic.get(topic_num, ()))
        q_count = len(questions)
        total_questions += q_count
        first_page = topic_pages[topic_num]['first_page']
        last_page = topic_pages[topic_num]['last_page']
        print(f" Topic {topic_num}: 第 {first_page+1}-{last_page+1} 页, {q_count} 道题 (题目编号: {questions[:5]}{'...' if len(questions) > 5 else ''})")
        result.append({
            'topic_num': topic_num,
            'start_page': first_page,
            'end_page': last_page,
            'question_count': q_count,
            'questions': questions,
        })

    print(f"\n总题目数: {total_questions}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Report the file actually written; for the default path this prints the
    # same "topics_info.json" message as before.
    print(f"\nTopic信息已保存到 {os.path.basename(output_path)}")

    return result
|
||
|
||
# Script entry point: analyze the bundled exam PDF when this file is run
# directly (not when it is imported as a module).
if __name__ == "__main__":
    pdf_path = '/Users/duguoyou/D365/MB-330_with_discussion.pdf'
    topics = analyze_pdf_structure(pdf_path=pdf_path)
|