import os
import sys
import tempfile
# Third-party dependencies are imported one at a time so that a failure can be
# reported together with the matching install hint.  Each handler pauses on
# input() before leaving, then exits with a non-zero status so callers can
# detect the failure.  Only ImportError is handled; any other exception raised
# while importing propagates with its full traceback.
try:
    import torch
except ImportError as exc:
    print(f"\nImport error: torch ({exc})")
    print("Install CPU ver: pip install torch")
    input("Install GPU ver: check https://pytorch.org/get-started/locally/\n")
    sys.exit(1)
try:
    import whisper
except ImportError as exc:
    print(f"\nImport error: whisper ({exc})")
    # The OpenAI Whisper distribution on PyPI is named "openai-whisper";
    # the distribution called "whisper" is an unrelated project.
    print("Install: pip install openai-whisper")
    input("or check python ver. Version 3.10 or earlier is recommended")
    sys.exit(1)
try:
    from pydub import AudioSegment
except ImportError as exc:
    print(f"\nImport error: pydub ({exc})")
    input("Install: pip install pydub")
    sys.exit(1)


# ---------------------------------------------------------------------------
# Settings.  The values below are the defaults; positional command-line
# arguments override them in this order:
#   argv[1] input_path   audio file to transcribe (required)
#   argv[2] output_path  text file to write (default: next to the input file)
#   argv[3] mode         "split" selects the two-pass path; any other value
#                        selects the single-pass path
#   argv[4] model        Whisper model name
#   argv[5] language     language code passed to transcribe()
# ---------------------------------------------------------------------------
input_path = ""
output_path = None

model = "small"  # "small", "base", "medium", "large"
mode = "once"
#mode = "split"
language = "ja"
verbose = True

argv = sys.argv
nargv = len(argv)
if nargv > 1:
    input_path = argv[1]
if nargv > 2:
    output_path = argv[2]
if nargv > 3:
    mode = argv[3]
if nargv > 4:
    model = argv[4]
if nargv > 5:
    language = argv[5]

# Stop early when no input was given, instead of loading a model and then
# failing on an empty path.
if not input_path:
    sys.exit(
        f"Usage: python {os.path.basename(argv[0])} "
        "input_path [output_path] [mode] [model] [language]"
    )

if output_path is None:
    # Default output: "<input stem>-transcribed.txt" in the input's directory.
    input_file = os.path.basename(input_path)  # file name only
    input_dir = os.path.dirname(input_path)  # directory part only
    output_path = os.path.join(input_dir, f"{os.path.splitext(input_file)[0]}-transcribed.txt")

# Echo the effective settings so the run can be checked at a glance.
print()
print("Transcribe audio file using local whisper")
print(f"input_path: {input_path}")
print(f"output_path: {output_path}")
print(f"mode: {mode}")
print(f"model: {model}")
print(f"language: {language}")

# Report whether PyTorch sees a CUDA device.  torch.version.cuda is the CUDA
# version, not the torch version, so the two are printed under separate labels.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"torch version : {torch.__version__}")
    print(f"CUDA version  : {torch.version.cuda}")
    print(f"Running on    : {gpu_name}")
else:
    print("Running on: CPU")

# Transcribe according to `mode`.  "split" selects the two-pass path; every
# other value selects the single-pass path.  Both paths leave the text in
# `final_transcript` (one stripped segment per line), which is then written to
# `output_path` as UTF-8.
split_mode = mode == "split"

if not split_mode:
    # Single pass over the whole file.
    # Author's note: local Whisper splits the audio into chunks automatically
    # inside transcribe(), but it needs sufficient memory.
    whisper_model = whisper.load_model(model)
    result = whisper_model.transcribe(input_path, language=language, verbose=verbose)

    transcript_lines = [seg["text"].strip() for seg in result["segments"]]
    final_transcript = "".join(f"{line}\n" for line in transcript_lines)

    print(final_transcript)
else:
    # Step 1: use the lightweight "base" model only to obtain segment timings.
    print("Loading base model for segmentation...")
    model_base = whisper.load_model("base")
    result = model_base.transcribe(input_path, language=language, verbose=verbose)
    segments = result["segments"]
    print(f"{len(segments)} セグメントが検出されました。")

    # Load the whole recording so each segment can be sliced out of it.
    audio = AudioSegment.from_file(input_path)

    # Step 2: cut out every segment and transcribe it again with the
    # requested model.
    print("Loading high-accuracy model for re-transcription...")
    model_high = whisper.load_model(model)
    transcript_lines = []
    for seg_no, seg in enumerate(segments, start=1):
        # Segment times are multiplied by 1000 to get the slice bounds.
        start_ms = int(seg["start"] * 1000)
        end_ms = int(seg["end"] * 1000)
        chunk = audio[start_ms:end_ms]

        print(f"再認識中：セグメント {seg_no}/{len(segments)}（{seg['start']}s 〜 {seg['end']}s）")

        # Reserve a temp file name and close the handle right away (Windows
        # workaround from the original code) before the path is written to.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_file_path = tmp_file.name

        try:
            chunk.export(tmp_file_path, format="wav")
            result_hi = model_high.transcribe(tmp_file_path, language=language, verbose=verbose)
        finally:
            # Always remove the temp file, even when export/transcribe fails.
            os.remove(tmp_file_path)

        transcript_lines.append(result_hi["text"].strip())

    final_transcript = "".join(f"{line}\n" for line in transcript_lines)

# Save the result.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(final_transcript)

print(f"\n==== 完了しました。{output_path} に保存されました。====")

# NOTE(review): the original script pauses only after a split-mode run; that
# behaviour is kept as-is -- confirm whether single-pass runs should pause too.
if split_mode:
    input("\nPress ENTER to terminate>>\n")
    