Files
algorithm/services/speech-to-text/ai_algorithm.py

90 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import logging
from io import BytesIO
from typing import Any, Dict, List, Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class SpeechToTextConverter:
    """Rule-based stand-in for a speech-to-text model.

    No acoustic model is loaded. Each audio is base64-decoded and a canned
    sentence is chosen from the decoded size and the requested language.
    """

    # Decoded sizes (in bytes) below these limits count as short / medium
    # audio; anything at or above the second limit counts as long.
    _SHORT_AUDIO_LIMIT = 10240  # 10 KB
    _MEDIUM_AUDIO_LIMIT = 102400  # 100 KB

    # Confidence reported for every audio that decodes successfully.
    _SUCCESS_CONFIDENCE = 0.85

    # Longest prefix of the input echoed back in the "audio" result field.
    _PREVIEW_LENGTH = 100

    # Language used when none is requested or the requested one is unknown.
    _DEFAULT_LANGUAGE = "zh"

    # Canned sentences as (short, medium, long), keyed by primary language.
    _CANNED_TEXT = {
        "zh": ("这是一段短音频", "这是一段中等长度的音频", "这是一段长音频"),
        "en": (
            "This is a short audio",
            "This is a medium length audio",
            "This is a long audio",
        ),
    }

    def __init__(self):
        """Initialize the converter (only logs; nothing is loaded)."""
        logger.info("初始化语音转文字转换器")
        # A pretrained model could be loaded here.
        # This example uses a simple rule-based conversion instead.

    def convert(
        self,
        audios: List[str],
        params: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """Convert a batch of audios to text.

        Args:
            audios: Audio list; each audio is a base64-encoded string.
            params: Conversion parameters. Only ``"language"`` is read
                (defaults to ``"zh"``).

        Returns:
            One dict per input audio, in input order, with keys ``"audio"``
            (the input, truncated to 100 characters plus ``"..."`` when
            longer), ``"text"`` and ``"confidence"``.
        """
        if params is None:
            params = {}
        language = params.get("language", self._DEFAULT_LANGUAGE)

        results = []
        for audio_base64 in audios:
            transcription = self._simple_convert(audio_base64, language)
            results.append({
                "audio": self._preview(audio_base64),
                "text": transcription["text"],
                "confidence": transcription["confidence"],
            })
        return results

    @classmethod
    def _preview(cls, audio_base64: Any) -> str:
        """Return a short, printable preview of one input audio.

        Strings are returned as-is, or cut to the preview length with
        ``"..."`` appended when longer. Bytes payloads are decoded as ASCII
        first, and any other type yields ``""``, so that a malformed element
        cannot abort the whole batch while the preview is being built.
        """
        if isinstance(audio_base64, (bytes, bytearray)):
            audio_base64 = bytes(audio_base64).decode("ascii", errors="replace")
        elif not isinstance(audio_base64, str):
            return ""
        if len(audio_base64) > cls._PREVIEW_LENGTH:
            return audio_base64[:cls._PREVIEW_LENGTH] + "..."
        return audio_base64

    @classmethod
    def _sentences_for(cls, language: Any) -> tuple:
        """Return the (short, medium, long) sentences for ``language``.

        Only the primary subtag is compared, case-insensitively, so "en",
        "EN" and "en-US" all select English. Unknown or non-string values
        fall back to the default language.
        """
        primary = ""
        if isinstance(language, str):
            normalized = language.strip().lower().replace("_", "-")
            primary = normalized.split("-", 1)[0]
        return cls._CANNED_TEXT.get(primary, cls._CANNED_TEXT[cls._DEFAULT_LANGUAGE])

    def _simple_convert(self, audio_base64: str, language: str) -> Dict[str, Any]:
        """Simple speech-to-text implementation based on decoded audio size.

        Args:
            audio_base64: Base64-encoded audio.
            language: Language of the returned sentence.

        Returns:
            ``{"text": ..., "confidence": 0.85}`` on success, or
            ``{"text": "", "confidence": 0.0}`` when the input cannot be
            base64-decoded (the error is logged, not raised).
        """
        try:
            # NOTE: b64decode runs in its default non-validating mode, so
            # characters outside the base64 alphabet are discarded rather
            # than rejected.
            audio_size = len(base64.b64decode(audio_base64))
        except (ValueError, TypeError) as e:
            # binascii.Error (bad padding) is a ValueError; TypeError covers
            # inputs that are neither str nor bytes-like.
            logger.error("Speech to text conversion error: %s", e)
            return {"text": "", "confidence": 0.0}

        short_text, medium_text, long_text = self._sentences_for(language)
        if audio_size < self._SHORT_AUDIO_LIMIT:
            text = short_text
        elif audio_size < self._MEDIUM_AUDIO_LIMIT:
            text = medium_text
        else:
            text = long_text
        return {"text": text, "confidence": self._SUCCESS_CONFIDENCE}