Edge-tts库怎么生成字幕？

2025-07-23 01:07:13 42 分享链接开发笔记 python

要生成SRT字幕文件，需要获取每个词语或句子的时间戳信息。虽然edge-tts库本身不直接支持生成SRT文件，但可以通过监听WordBoundary事件来获取每个单词的开始时间和持续时间，从而生成字幕文件。

实现方案

以下是一个完整的示例，展示如何使用edge-tts生成语音并同时创建对应的SRT字幕文件：

import asyncio
import edge_tts
from datetime import timedelta

async def text_to_speech_with_srt(
    text: str,
    voice: str = "zh-CN-YunxiNeural",
    output_audio: str = "output.mp3",
    output_srt: str = "output.srt"
) -> None:
    """
    Convert text to speech and write a matching SRT subtitle file.

    Audio chunks from the edge-tts stream are written to ``output_audio`` as
    they arrive; WordBoundary events are collected and, if any were received,
    handed to ``create_srt_file`` to produce ``output_srt``.

    Args:
        text: Text to synthesize.
        voice: Name of the edge-tts voice to use.
        output_audio: Path of the audio file to write.
        output_srt: Path of the SRT file to write.
    """
    # edge-tts reports offset/duration in 100-nanosecond ticks, not in
    # milliseconds; 10,000 ticks make one millisecond.
    ticks_per_ms = 10_000

    communicate = edge_tts.Communicate(text, voice)

    # One entry per word: start offset (ms), duration (ms) and the word text.
    word_boundaries = []

    # Stream audio to disk while collecting the timing metadata.
    with open(output_audio, "wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                # NOTE(review): some edge-tts releases may emit sentence-level
                # boundaries by default instead of WordBoundary -- confirm
                # against the installed version if no SRT file is produced.
                word_boundaries.append({
                    "offset": chunk["offset"] / ticks_per_ms,  # start time (ms)
                    "duration": chunk["duration"] / ticks_per_ms,  # length (ms)
                    "text": chunk["text"],  # word text
                })

    # Only write subtitles when timing information was actually received.
    if word_boundaries:
        create_srt_file(word_boundaries, output_srt)
        print(f"SRT字幕文件已生成: {output_srt}")

def create_srt_file(
    word_boundaries: list,
    output_file: str,
    *,
    word_separator: str = " ",
) -> None:
    """
    Write an SRT subtitle file from word-boundary timing information.

    Words are grouped into sentences: a sentence ends at a word whose text
    ends with one of ``. ! ? 。 ！ ？``. Each sentence becomes one SRT entry
    spanning from the start of its first word to the end of its last word.
    Trailing words without closing punctuation form a final entry.

    Args:
        word_boundaries: List of dicts with ``"offset"`` and ``"duration"``
            (both in milliseconds) and ``"text"`` keys.
        output_file: Path of the SRT file to write.
        word_separator: String placed between words of one entry. Defaults to
            a single space; pass ``""`` for scripts written without spaces.
    """
    # A tuple so it can be passed straight to str.endswith.
    sentence_ends = ('.', '!', '?', '。', '！', '？')
    sentences = []
    current_sentence = []

    for boundary in word_boundaries:
        current_sentence.append(boundary)
        # endswith() is simply False for an empty string, whereas indexing
        # text[-1] would raise IndexError on an empty word.
        if boundary["text"].endswith(sentence_ends):
            sentences.append(current_sentence)
            current_sentence = []

    # Keep trailing words that were not closed by punctuation.
    if current_sentence:
        sentences.append(current_sentence)

    with open(output_file, "w", encoding="utf-8") as srt_file:
        for i, sentence in enumerate(sentences, 1):
            # An entry spans from its first word's start to its last word's end.
            start_time = timedelta(milliseconds=sentence[0]["offset"])
            end_time = timedelta(
                milliseconds=sentence[-1]["offset"] + sentence[-1]["duration"]
            )

            text = word_separator.join(word["text"] for word in sentence)

            # SRT entry: index, time range, text, then a blank separator line.
            srt_file.write(f"{i}\n")
            srt_file.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
            srt_file.write(f"{text}\n\n")

def format_time(time_delta: timedelta) -> str:
    """Render a timedelta as an SRT timestamp of the form ``HH:MM:SS,mmm``."""
    whole_seconds = int(time_delta.total_seconds())
    # Peel off seconds first, then split the remaining minutes into hours.
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)
    # Sub-second part, truncated to millisecond precision.
    ms = time_delta.microseconds // 1000
    return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"

# Usage example: synthesize a short Chinese sample and write its subtitles.
if __name__ == "__main__":
    sample_text = "你好，这是一个生成字幕的示例。这段文本将被转换为语音，同时创建对应的SRT字幕文件。"

    # Arguments in order: text, voice, output audio path, output SRT path.
    job = text_to_speech_with_srt(
        sample_text,
        "zh-CN-YunxiNeural",
        "subtitle_example.mp3",
        "subtitle_example.srt",
    )
    asyncio.run(job)

代码说明

语音生成与时间戳捕获：
- 使用edge-tts的流式处理功能，同时保存音频数据和单词边界信息
- 通过监听WordBoundary事件获取每个单词的开始时间、持续时间和内容；注意edge-tts返回的offset和duration以100纳秒为单位，换算成毫秒需除以10000
SRT文件格式：
- SRT文件由多个字幕条目组成，每个条目包含序号、时间范围和文本
- 时间格式为HH:MM:SS,mmm（小时:分钟:秒,毫秒）
句子分割：
- 代码将连续的单词按句号、感叹号和问号分割成句子
- 每个句子作为一个单独的字幕条目

高级用法：自定义字幕样式和时间间隔

如果你需要更精细的控制，可以添加以下功能：

# Variant with a configurable words-per-entry limit and merge gap (milliseconds).
def create_srt_file(
    word_boundaries: list,
    output_file: str,
    max_words_per_line: int = 8,
    max_gap_ms: float = 1000,
    *,
    word_separator: str = " ",
) -> None:
    """
    Write an SRT file, grouping words by count and by pauses between them.

    A new subtitle entry is started when the current one already holds
    ``max_words_per_line`` words, or when the pause between the end of the
    previous word and the start of the next exceeds ``max_gap_ms``.

    Args:
        word_boundaries: List of dicts with ``"offset"`` and ``"duration"``
            (both in milliseconds) and ``"text"`` keys.
        output_file: Path of the SRT file to write.
        max_words_per_line: Maximum number of words in one entry; must be >= 1.
        max_gap_ms: Longest pause, in milliseconds, that still keeps two
            consecutive words in the same entry.
        word_separator: String placed between words of one entry. Defaults to
            a single space; pass ``""`` for scripts written without spaces.

    Raises:
        ValueError: If ``max_words_per_line`` is smaller than 1.
    """
    # A limit below 1 would open an empty group before the first word and
    # fail later with an IndexError; reject it up front instead.
    if max_words_per_line < 1:
        raise ValueError("max_words_per_line must be at least 1")

    subtitles = []
    current_subtitle = []

    for boundary in word_boundaries:
        group_full = len(current_subtitle) >= max_words_per_line
        if current_subtitle:
            previous = current_subtitle[-1]
            # Silence between the end of the previous word and this word.
            gap = boundary["offset"] - (previous["offset"] + previous["duration"])
        else:
            gap = 0

        if group_full or gap > max_gap_ms:
            subtitles.append(current_subtitle)
            current_subtitle = [boundary]
        else:
            current_subtitle.append(boundary)

    # Flush the final group.
    if current_subtitle:
        subtitles.append(current_subtitle)

    with open(output_file, "w", encoding="utf-8") as srt_file:
        for i, subtitle in enumerate(subtitles, 1):
            # An entry spans from its first word's start to its last word's end.
            start_time = timedelta(milliseconds=subtitle[0]["offset"])
            end_time = timedelta(
                milliseconds=subtitle[-1]["offset"] + subtitle[-1]["duration"]
            )

            # Groups never exceed max_words_per_line words, so each entry
            # always fits on a single text line; no further wrapping needed.
            text = word_separator.join(word["text"] for word in subtitle)

            # SRT entry: index, time range, text, then a blank separator line.
            srt_file.write(
                f"{i}\n"
                f"{format_time(start_time)} --> {format_time(end_time)}\n"
                f"{text}\n\n"
            )