# Source code for ai.add_notes_voice_pptx

"""
概要: PowerPointプレゼンテーションに音声ノートを追加し、自動再生リンクを設定するスクリプト。
詳細説明:
    このスクリプトは、Windows環境においてPowerPoint (win32com.client) とtkttsライブラリを
    利用し、PPTXファイルのノートに記載されたテキストから音声ファイルを生成し、その音声を
    プレゼンテーション内の各スライドに自動再生されるメディアオブジェクトとしてリンクします。
    また、外部の発言テキストファイルからノート内容をPPTXに書き込む機能も提供します。
    サポートされるTTSエンジンには、pyttsx3, VOICEVOX, AquesTalkPlayer, OpenAIなどがあります。
関連リンク: :doc:`add_notes_voice_pptx_usage`
"""
import os
import sys
import re
import shutil
import argparse
import time
from pathlib import Path
import traceback

try:
    import tktts 
    from tktts import tkTTS
    import pythoncom
    import win32com.client
except ImportError as e:
    print(f"Error: tktts/pythoncom/win32com.client のインポートエラー: {e}")
    print("tktts.py が同一ディレクトリに存在し、win32com.client がインストールされていることを確認してください。")
    sys.exit(1)


# Default values used for the command-line options.
DEFAULT_ENGINE = "pyttsx3"
DEFAULT_VOICEVOX_ENDPOINT = "http://127.0.0.1:50021"
DEFAULT_AQUESTALK_PATH = "AquesTalkPlayer.exe"
DEFAULT_TEMP_DIR = "tts_temp_wavs_pptx" # directory that holds temporary files
DEFAULT_SPEAK_RATE = 150 # pyttsx3 speaking rate (WPM)

# Per-engine table mapping a speaker name (as written in the narration text)
# to the voice name used for that engine. The outer keys match values accepted
# by the --tts option; main() passes the whole table to tkTTS.update_voice_map.
VOICE_MAPS = {
    "pyttsx3": {"Zira": "Zira", "David": "David", "四国めたん": "Zira", "ずんだもん": "David", "れいむ": "Zira", "まりさ": "David"}, 
    "aquestalkplayer": {"四国めたん": "れいむ", "ずんだもん": "まりさ", "れいむ": "れいむ", "まりさ": "まりさ"},
    "openai": {"四国めたん": "nova", "ずんだもん": "shimmer", "れいむ": "alloy", "まりさ": "fable"},
}



# [docs]  (link label left over from the HTML source listing; not code)
def terminate():
    """
    Pause until the user presses ENTER, then exit the program.

    Prints a prompt, blocks on ``input()`` and then raises ``SystemExit``
    via ``sys.exit()``. If standard input is closed or exhausted (for
    example when the script is run non-interactively), the pause is skipped
    and the program exits immediately.

    :raises SystemExit: always.
    """
    try:
        input("\nPress ENTER to terminate>>\n")
    except EOFError:
        # No interactive stdin: there is nobody to wait for, just exit.
        pass
    # sys.exit() rather than the bare exit(): the latter is injected by the
    # `site` module for interactive use and is absent under `python -S` and
    # in some frozen executables.
    sys.exit()



# [docs]  (link label left over from the HTML source listing; not code)
def initialize(argv=None):
    """
    Parse the command-line arguments and return the program settings.

    Defines the options for run mode, TTS engine selection, input/output
    paths, audio output directory, speaking rate, and the engine-specific
    options for VOICEVOX, AquesTalkPlayer and OpenAI.

    :param argv: list[str] | None: argument list to parse. ``None`` (the
                 default) makes argparse read ``sys.argv[1:]``, which is the
                 behaviour existing callers rely on.
    :returns: argparse.Namespace: the parsed command-line arguments.
    """
    # Options are ported from speak.py wherever possible.
    parser = argparse.ArgumentParser(
        description="PPTXノートから音声ファイルを生成し、自動再生リンクを設定するプログラム (Windows/PowerPoint, tktts使用)",
        formatter_class=argparse.RawTextHelpFormatter
    )

    # The help text lists every accepted choice, including 'map'.
    parser.add_argument("--mode", choices=['list', 'map', 'conv'], default='conv',
                        help="実行モード:\n list: 利用可能な音声名を表示\n map: 話者とボイスの対応を表示\n conv: 音声ファイルを生成しPPTXにリンク")
    parser.add_argument("--monologue", "-m", type=int, default=0, help="独話形式 (カンマのない行も読み込む)")

    parser.add_argument("--tts", choices=["pyttsx3", "voicevox", "aquestalkplayer", "atp", "openai"], default=DEFAULT_ENGINE, help="TTSエンジンを選択")

    parser.add_argument("--input_path",    "-i", type=str, help="[convモード] ノートを追加する元のPPTXファイルパス")
    parser.add_argument("--narration_txt", "-n", type=str, help="[convモード] ノート内容を含む発言テキストファイルパス (オプション: 既存のノートがない場合)")
    parser.add_argument("--audio_dir",     "-a", type=str, default="audio_output", help="[convモード] 生成された音声ファイルを保存するディレクトリ")
    parser.add_argument("--output_path",   "-o", type=str, help="[convモード] 音声リンクが追加された出力PPTXファイルパス")

    parser.add_argument("--voices", "-v", type=str, default="", help="[tktts] voice_map の上書き (話者名=ボイス;話者名=ボイス)")
    parser.add_argument("--replace", "-r", type=str, default="", help="[tktts] 文字列置換ルール (key=val;key=val)")
    parser.add_argument("--temp_dir", type=str, default=DEFAULT_TEMP_DIR, help="一時ファイルを作成するディレクトリ名")

    # Shared by pyttsx3 / AquesTalkPlayer / OpenAI (follows speak.py)
    parser.add_argument("--speak_rate", type=int, default=DEFAULT_SPEAK_RATE, help="[pyttsx3] 読み上げ速度 (WPM) / [AQT] 速度比")
    parser.add_argument("--tinterval", type=float, default=0.5, help="[AQT/OpenAI/VOICEVOX] 音声ファイル間に挿入する無音区間の長さ（秒）")

    # VOICEVOX-specific options (follows speak.py)
    parser.add_argument("--endpoint", type=str, default=DEFAULT_VOICEVOX_ENDPOINT, help="[VOICEVOX] Engineのendpoint")
    parser.add_argument("--fspeak_rate", type=float, default=1.0, help="[VOICEVOX] 読み上げ速度比 (標準: 1.0)")
    parser.add_argument("--fspeak_pitch", type=float, default=0.0, help="[VOICEVOX] 声の高さ比 (標準: 0.0)")

    # AquesTalkPlayer-specific options (follows speak.py).
    # Uses the module constant so the default is defined in one place only.
    parser.add_argument("--aquestalk_path", type=str, default=DEFAULT_AQUESTALK_PATH, help="[AQT] AquesTalkPlayer.exe の実行パス")

    # OpenAI-specific options (follows speak.py)
    parser.add_argument("--instruction", type=str, default="", help="[OpenAI] TTS APIへの追加指示")

    return parser.parse_args(argv)



# [docs]  (link label left over from the HTML source listing; not code)
def parse_narration_file(narration_path, monologue = True):
    """
    Parse a narration text file into per-slide note text.

    A line matching ``# Slide N`` (case-insensitive, N is a number) starts
    the notes of slide N; the following lines, joined with newlines and
    stripped, become that slide's note text. Blank lines, ``---`` separator
    lines, other lines starting with ``#`` and lines fully enclosed in
    parentheses are skipped. Lines that appear before the first slide header
    are discarded. If the same slide number appears twice, the later section
    replaces the earlier one.

    The file is read as UTF-8 and a leading byte-order mark is tolerated, so
    a header on the very first line is recognized even when the file was
    saved as "UTF-8 with BOM".

    :param narration_path: str: path to the narration text file.
    :param monologue: bool: accepted for interface compatibility. It is
                      currently not consulted: every content line is stored
                      verbatim (including any ``speaker,`` prefix) regardless
                      of its value.
    :returns: dict or None: mapping of slide number (int) to note text, or
              None when the file cannot be read or parsed.
    """
    header_re = re.compile(r"#\s*Slide\s*(\d+)", re.IGNORECASE)
    slide_texts = {}
    current_slide = None
    current_lines = []
    try:
        # utf-8-sig strips a BOM if present and is otherwise plain UTF-8.
        with open(narration_path, "r", encoding="utf-8-sig") as f:
            for raw_line in f:
                line = raw_line.rstrip("\n")
                stripped = line.strip()
                if stripped == "" or stripped == '---':
                    continue

                m = header_re.match(line)
                if m:
                    # A new header closes the section collected so far.
                    if current_slide is not None:
                        slide_texts[current_slide] = "\n".join(current_lines).strip()
                    current_slide = int(m.group(1))
                    current_lines = []
                    continue

                if line.startswith('#'):
                    continue  # comment line
                if line.startswith('(') and line.endswith(')'):
                    continue  # parenthesized remark
                current_lines.append(line)

        # Close the last section.
        if current_slide is not None:
            slide_texts[current_slide] = "\n".join(current_lines).strip()
    except Exception as e:
        # Callers rely on None (not an exception) to signal failure.
        print(f"❌ テキストファイルの読み込み/パースエラー: {e}")
        return None

    print(f"✅ テキストファイルを解析しました: {len(slide_texts)} スライド分のノートを検出。")
    return slide_texts



# [docs]  (link label left over from the HTML source listing; not code)
def add_notes_to_pptx_com(pptx_path, slide_texts, temp_pptx_path):
    """
    Copy a PPTX file and write speaker notes into the copy via PowerPoint COM.

    The source file is copied to ``temp_pptx_path``; the copy is opened with
    PowerPoint (``win32com.client``) and, for every slide whose 1-based
    number is a key of ``slide_texts``, the text of the notes-page body
    placeholder is replaced. The PowerPoint application is always asked to
    quit before this function returns or raises.

    :param pptx_path: str: path of the source PPTX file.
    :param slide_texts: dict: slide number (int) -> note text to write.
    :param temp_pptx_path: str: path of the copy that receives the notes.
    :returns: bool: False when the copy or the open step fails. True
              otherwise; note that a failure while saving is reported but
              deliberately ignored, so True is returned in that case too.
    """
    # Office enumeration values (the code compares against raw integers).
    mso_placeholder = 14       # MsoShapeType.msoPlaceholder
    pp_placeholder_body = 2    # PpPlaceholderType.ppPlaceholderBody (notes text)

    # Copy first, so PowerPoint is not started when the copy cannot be made.
    try:
        shutil.copyfile(pptx_path, temp_pptx_path)
    except Exception as e:
        print(f"エラー: コピー中にエラー: {e}")
        return False

    ppt = win32com.client.Dispatch("PowerPoint.Application")
    try:
        try:
            full_path = os.path.abspath(temp_pptx_path)
            pres = ppt.Presentations.Open(full_path)
        except Exception as e:
            print(f"エラー: PowerPointで一時ファイルを開けませんでした: {e}")
            return False

        print("  ノートを一時ファイルに書き込み中...")
        success_count = 0
        for idx in range(1, pres.Slides.Count + 1):
            if idx not in slide_texts:
                continue
            slide = pres.Slides(idx)
            notes_page = slide.NotesPage
            shp = None
            for shp in notes_page.Shapes:
                if shp.Type == mso_placeholder and shp.PlaceholderFormat.Type == pp_placeholder_body:
                    shp.TextFrame.TextRange.Text = slide_texts[idx]
                    success_count += 1
                    break
            # Drop the COM references before the next slide / before saving,
            # as the original implementation did.
            shp = None
            notes_page = None
            slide = None

        try:
            pres.Save()
            pythoncom.PumpWaitingMessages()
            pres.Close()
            print(f"✅ ノートを一時PPTXファイルに書き込みました: {temp_pptx_path} ({success_count}件)")
        except Exception as e:
            # Best effort by design: the caller continues with whatever
            # state the temporary file is in.
            print(f"エラー: 一時ファイルへの保存中にエラー: {e}")
            print(f"　　無視して続行します")
        return True
    finally:
        # Runs on every path (including exceptions raised in the slide loop)
        # so that no PowerPoint process is left behind.
        try:
            ppt.Quit()
        except Exception as e:
            print(f"Warning: PowerPointオブジェクトのQuit()に失敗しました: {e}")

#        return False


# [docs]  (link label left over from the HTML source listing; not code)
def link_audio_autoplay(source_pptx_path, audio_tasks, output_pptx):
    """
    Link audio files into a copy of a PPTX file and set them to auto-play.

    ``source_pptx_path`` is copied to ``output_pptx``; the copy is opened
    with PowerPoint (``win32com.client``). For each ``(slide number, audio
    path)`` pair a linked (not embedded) media object is added near the
    bottom-right corner of the slide, marked to play on entry, and given an
    animation effect triggered "with previous". The presentation is then
    saved. Close/Quit are attempted on every path, including when the loop
    or the save raises.

    An audio file that does not exist is skipped with a warning; the
    remaining slides are still processed.

    :param source_pptx_path: str: PPTX file to copy from.
    :param audio_tasks: list[tuple[int, str]]: (slide number, audio file path).
    :param output_pptx: str: path of the PPTX file that receives the links.
    :returns: None. Copy/open failures are reported on stdout.
    """
    print("\n--- 🔗 PPTXに音声ファイルをリンクし、自動再生を設定中 ---")
    try:
        shutil.copyfile(source_pptx_path, output_pptx)
    except Exception as e:
        print(f"エラー: 出力ファイル {output_pptx} へのコピー中にエラー: {e}")
        return

    ppt = win32com.client.Dispatch("PowerPoint.Application")
    pres = None
    try:
        try:
            pres = ppt.Presentations.Open(os.path.abspath(output_pptx))
        except Exception as e:
            print(f"エラー: PowerPointで出力ファイルを開けませんでした: {e}")
            return

        # Icon geometry is the same for every slide: compute it once.
        icon_size = 40
        left = pres.PageSetup.SlideWidth - icon_size - 10
        top = pres.PageSetup.SlideHeight - icon_size - 10

        for idx, wav_path in audio_tasks:
            wav_path = os.path.abspath(wav_path)
            if not os.path.exists(wav_path):
                # Skip only this slide; do not abandon the remaining ones.
                print(f"  ⚠️ 音声ファイルが見つかりません。スライド {idx} をスキップします: {wav_path}")
                continue
            print(f"  🔗 リンク追加: スライド {idx} ({wav_path[-40:]})")

            slide = pres.Slides(idx)
            shape = slide.Shapes.AddMediaObject2(
                wav_path,
                LinkToFile=True,
                SaveWithDocument=False,
                Left=left, Top=top,
                Width=icon_size, Height=icon_size
            )

            # Auto-play: start the media when the shape is animated in.
            shape.AnimationSettings.PlaySettings.PlayOnEntry = True

            # Trigger 2 = msoAnimTriggerWithPrevious, i.e. the effect starts
            # together with the slide transition.
            # NOTE(review): the original comment called effect id 9
            # "msoAnimEffectMediaPlay"; in the MsoAnimEffect enumeration 9
            # appears to be msoAnimEffectDissolve (MediaPlay is 83). The
            # value is kept unchanged - confirm before altering it.
            slide.TimeLine.MainSequence.AddEffect(
                shape,
                9,   # effect id (see note above)
                0,   # Level
                2    # Trigger (msoAnimTriggerWithPrevious)
            )

        pres.Save()
    finally:
        if pres is not None:
            try:
                pres.Close()
            except Exception:
                print("Warning: PowerPointオブジェクトのClose()に失敗しました")
        try:
            ppt.Quit()
        except Exception:
            print("Warning: PowerPointオブジェクトのQuit()に失敗しました")

    # Reached only when the presentation was opened and saved without error.
    print(f"\n✅ PowerPoint更新完了: {output_pptx}")




# [docs]  (link label left over from the HTML source listing; not code)
def generate_audio_files_tktts(pptx_path, voice_map, args):
    """
    Generate one audio file per slide from the notes stored in a PPTX file.

    The notes text of every slide is read through PowerPoint
    (``win32com.client``); PowerPoint is closed again before synthesis
    starts. For each slide that has notes, ``tkTTS.speak_dialogue`` is
    called with a single ``(None, notes)`` entry and the output file
    ``<args.audio_dir>/slide<N>.wav``. Afterwards the directory
    ``args.temp_dir`` is removed if it exists.

    Side effect: ``args.outfile`` is overwritten for every slide.

    :param pptx_path: str: PPTX file whose notes are read.
    :param voice_map: dict: speaker name -> voice name mapping, passed
                      through to ``speak_dialogue``.
    :param args: argparse.Namespace: parsed command-line options (``tts``,
                 ``audio_dir``, ``temp_dir`` are read here).
    :returns: list[tuple[int, str]]: (slide number, absolute path of the
              generated audio file). Always a list; it is empty when the
              presentation cannot be opened, no slide has notes, or every
              synthesis attempt fails.
    """
    # Named `tts` so the imported `tktts` module is not shadowed.
    tts = tkTTS(tts_name = args.tts, config = args)
    tts_engine = args.tts.lower()
    output_dir = args.audio_dir
    os.makedirs(output_dir, exist_ok=True)

    # 1. Read the notes from the PPTX file.
    print("--- 📝 PPTXからノートを取得中 ---")
    ppt = win32com.client.Dispatch("PowerPoint.Application")
    full_path = os.path.abspath(pptx_path)
    dialogue_map = {} # {slide_idx: (None, notes_text)}
    pres = None
    try:
        try:
            pres = ppt.Presentations.Open(full_path)
        except Exception as e:
            print(f"エラー: PowerPointでファイルを開けませんでした: {e}")
            # Return a plain list: a tuple such as ([], "") is truthy and
            # would be mistaken by the caller for a non-empty task list.
            return []

        for idx in range(1, pres.Slides.Count + 1):
            slide = pres.Slides(idx)
            notes = ""
            try:
                for shp in slide.NotesPage.Shapes:
                    # 14 = msoPlaceholder, 2 = ppPlaceholderBody (notes text)
                    if shp.Type == 14 and shp.PlaceholderFormat.Type == 2:
                        notes = shp.TextFrame.TextRange.Text.strip()
                        break
            except Exception as e:
                # Best effort: a slide whose notes cannot be read is treated
                # as having no notes, but the problem is reported.
                print(f"  ⚠️ スライド {idx} のノートを取得できませんでした: {e}")

            if notes:
                # Shape expected by tktts.speak_dialogue: (speaker=None, text)
                dialogue_map[idx] = (None, notes)
    finally:
        # Always release PowerPoint, even if reading the slides raised.
        if pres is not None:
            try:
                pres.Close()
            except Exception:
                print("Warning: PowerPointオブジェクトのClose()に失敗しました")
        try:
            ppt.Quit()
        except Exception:
            print("Warning: PowerPointオブジェクトのQuit()に失敗しました")

    if not dialogue_map:
        print("  ⚠️ ノートが記載されたスライドが見つかりませんでした。")
        return []

    print(f"  ✅ ノートを検出: {len(dialogue_map)} スライド")

    # 2. No text replacements are applied at this stage.
    replacements = {}

    # 3. Synthesize one file per slide.
    print(f"\n--- 🗣️ スライドごとに {tts_engine.upper()} で音声ファイルを生成中 ---")

    ext = "wav"
    audio_tasks = []
    for slide_idx in sorted(dialogue_map):
        print()
        print(f"スライド #{slide_idx} の音声ファイルを生成しています...")

        speaker, notes = dialogue_map[slide_idx]
        single_dialogue = [(speaker, notes)]  # dialogue list for this slide only
        slide_outfile = os.path.join(output_dir, f"slide{slide_idx}.{ext}")

        try:
            args.outfile = slide_outfile
            generated_file = tts.speak_dialogue(
                config = args, dialogue = single_dialogue,
                voice_map = voice_map, replacements = replacements,
                output_format = ext,
                )

            if generated_file:
                if os.path.exists(slide_outfile):
                    audio_tasks.append((slide_idx, os.path.abspath(generated_file)))
                    print(f"  ✅ 生成完了: {os.path.basename(slide_outfile)} (スライド {slide_idx})")
                else:
                    print(f"  ❌ ファイルが見つかりません: スライド {slide_idx} での生成失敗")
            else:
                print(f"  ❌ 生成失敗: スライド {slide_idx}")

        except Exception as e:
            # One failing slide must not abort the remaining ones.
            print(f"  ❌ TTSモジュール呼び出しエラー: スライド {slide_idx} - {e}")
            traceback.print_exc()

    # 4. Remove the temporary directory in case tktts did not do so itself.
    if os.path.exists(args.temp_dir):
        shutil.rmtree(args.temp_dir)
        print(f"\n🗑️ 一時ディレクトリ {args.temp_dir} を削除しました。")

    print(f"✅ 音声ファイル生成完了 ({len(audio_tasks)}件)")
    return audio_tasks




# [docs]  (link label left over from the HTML source listing; not code)
def main():
    """
    Run the program according to the selected mode.

    * ``list``: print the voices available for the selected TTS engine and
      terminate.
    * ``map``: show the speaker/voice mapping for the narration file and
      terminate.
    * ``conv``: write the narration text into the notes of a temporary copy
      of the input PPTX, synthesize one audio file per slide from those
      notes, link the audio into the output PPTX with auto-play, and delete
      the temporary copy.

    Missing required arguments, unusable narration data and a failed
    notes-writing step end the program through ``terminate()``.
    """
    args = initialize()

    # endpoint and aquestalk_path are handed over inside args.
    # Named `tts` so the imported `tktts` module is not shadowed.
    tts = tkTTS(tts_name = args.tts, config = args)

    if args.mode == 'list':
        if args.tts.lower() == "voicevox":
            tts.list_available_voices(args.tts, endpoint=args.endpoint)
        else:
            tts.list_available_voices(args.tts)
        terminate()

    source_pptx = args.input_path
    narration_file = args.narration_txt
    output_pptx = args.output_path

    # NOTE(review): this check runs for conv mode as well, so --narration_txt
    # is effectively mandatory even though its help text calls it optional
    # and the code below has a "no narration file" path. Behaviour is left
    # unchanged here - confirm the intended contract.
    if not all([source_pptx, narration_file]):
        print("\nエラー: mapモードでは --input_path, --narration_txtの引数が必要です。")
        print(f"現在の設定: input={source_pptx}, text={narration_file}")
        terminate()

    if args.mode == 'map':
        tts.show_voice_map(narration_file, args.voices, VOICE_MAPS = {}, is_monologue = args.monologue)
        terminate()

    if not all([source_pptx, narration_file, output_pptx]):
        print("\nエラー: map/convモードでは --input_path, --narration_txt, --output_path の全ての引数が必要です。")
        print(f"現在の設定: input={source_pptx}, text={narration_file}, output={output_pptx}")
        terminate()

    print("--- 💻 PowerPointナレーション作成プログラム (CONVモード) 開始 ---")
    print(f"  TTS engine  : {args.tts}")
    print(f"  is monologue: {args.monologue}")
    print(f"  入力PPTX    : {source_pptx}")
    print(f"  入力テキスト: {narration_file}")
    print(f"  出力PPTX    : {output_pptx}")
    print(f"  音声フォルダ: {args.audio_dir}")
    print("-" * 50)

    use_narration = narration_file and os.path.exists(narration_file)
    # Without a usable narration file the notes already in the input are used.
    temp_pptx_file = source_pptx

    # 1 & 2. If a narration file exists, write its text into the notes.
    current_voice_map = None
    if use_narration:
        print()
        print(f"[{narration_file}]を解析します:")
        dialogue = tts.load_text(narration_file, args.monologue, wait_for_clipboard = False)
        if not dialogue:
            print("エラー: 有効なテキストデータが取得できませんでした。")
            if not args.monologue:
                print("  対話形式でない場合は --monologue=1 オプションをつけてください。")
            terminate()

        speakers_in_file = tts.get_speakers_from_dialogue(dialogue)
        print(f"  Speakers in [{narration_file}]")
        for idx, sp in enumerate(speakers_in_file):
            print(f"    {idx:02d}: {sp}")

        current_voice_map = tts.update_voice_map(voice_map = VOICE_MAPS,
                                voices = args.voices, speakers = speakers_in_file)

        # Print the map grouped by key type: str keys, then other keys,
        # then int keys.
        print()
        print("Voice map updated:")
        for key, val in current_voice_map.items():
            if type(key) is str:
                print(f"  (speaker) {key}: (voice) {val}")
        for key, val in current_voice_map.items():
            if type(key) is not str and type(key) is not int:
                print(f"  (speaker) {key}: (voice) {val}")
        for key, val in current_voice_map.items():
            if type(key) is int:
                print(f"  (speaker) {key}: (voice) {val}")

        print("=== 検出された話者とvoice ===")
        # None (monologue) entries can be mixed with named speakers; a plain
        # sorted() would raise TypeError on that mix, so sort by a str key.
        for s in sorted(speakers_in_file, key=lambda sp: "" if sp is None else str(sp)):
            if s is None or s == "":
                voice = current_voice_map.get(s, None)
                if voice is None: voice = current_voice_map.get(0, None)
                print(f"  (独話): {voice}")
            else:
                s = tts.normalize_speaker(s, args.tts)
                print(f"- {s}: {current_voice_map.get(s, '未設定')}")

        # Read the narration file again, this time split per slide for the notes.
        slide_texts = parse_narration_file(narration_file, args.monologue)
        if not slide_texts: terminate()

        temp_pptx_file = "temp_notes_added.pptx"
        ret = add_notes_to_pptx_com(source_pptx, slide_texts, temp_pptx_file)
        if not ret: terminate()

    # 3. Synthesize audio from the notes of the (temporary) PPTX file.
    print()
    print("音声ファイルを生成します:")
    audio_tasks = generate_audio_files_tktts(temp_pptx_file, voice_map = current_voice_map,
                    args = args)

    # 4. Link the audio files and save the output PPTX.
    if audio_tasks:
        print()
        print(f"音声ファイルを[{temp_pptx_file}]にリンクします:")
        link_audio_autoplay(temp_pptx_file, audio_tasks, output_pptx)
    else:
        print("\n完了: 音声ファイルが生成されなかったため、リンク処理をスキップしました。")

    # Clean up the temporary copy (never the user's own input file).
    if use_narration and os.path.exists(temp_pptx_file) and temp_pptx_file != source_pptx:
        print()
        # f-string: the original printed the placeholder text literally.
        print(f"一時ファイル [{temp_pptx_file}] を削除します:")
        os.remove(temp_pptx_file)
        print("  クリーンアップ完了")

    print("--- 🎉 プログラム終了 ---")



# Script entry point: run the workflow, then pause for ENTER before exiting
# (terminate() prompts and ends the process).
if __name__ == "__main__":
    main()
    terminate()