# pip install python-pptx python-docx openpyxl

import csv  # used to read CSV files
import os
import re
import sys

from docx import Document
from openpyxl import load_workbook
from pptx import Presentation

# Directory that is scanned for documents (defaults to the current directory).
root_dir = "."
# Path of the text file that receives the extracted content.
output_path = "extracted_office_and_csv_text.txt"
# Maximum directory depth to search; -1 means unlimited.
max_level = -1

# Optional positional overrides taken from the command line:
#   python script.py [root_dir] [output_path] [max_level]
# Each argument can only be present when the ones before it are, hence the nesting.
if len(sys.argv) >= 2:
    root_dir = sys.argv[1]
    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        if len(sys.argv) >= 4:
            max_level = sys.argv[3]

# max_level is a string when it came from the command line; abort with exit
# status 1 if it cannot be converted to an integer.
try:
    max_level = int(max_level)
except ValueError:
    print(f"エラー: 無効なmax_level '{max_level}'です。")
    sys.exit(1)

def apply_replacements(text, replacements):
    """
    Replace Jinja2-style placeholders in ``text`` with the given values.

    A placeholder is the key wrapped in double braces. Whitespace between the
    braces and the key is tolerated, so both ``{{key}}`` and ``{{ key }}`` are
    replaced. Keys are matched literally (no regex interpretation) and values
    are converted with ``str()``.

    Replacements are applied one key at a time in the mapping's iteration
    order, so text inserted for an earlier key is still subject to later keys.

    Args:
        text (str): Text that may contain placeholders.
        replacements (dict | None): Mapping of placeholder key to value.
            ``None`` means "no replacements".

    Returns:
        str: The text with every known placeholder substituted.
    """
    if replacements is None:
        return text
    for key, value in replacements.items():
        pattern = r"\{\{\s*" + re.escape(str(key)) + r"\s*\}\}"
        substitute = str(value)
        # A callable replacement inserts the value verbatim; a plain string
        # would have its backslashes interpreted as regex escapes.
        text = re.sub(pattern, lambda _match: substitute, text)
    return text

def pptx_text_generator(pptx_path, mode="paragraph", replacements=None):
    """
    Yield the text of a PowerPoint file, one dictionary per slide.

    Placeholders of the form {{key}} in the slide title and in the slide
    content are substituted through ``apply_replacements``.

    Args:
        pptx_path (str): Path to the PowerPoint file.
        mode (str): 'paragraph' collects text paragraph by paragraph; any
            other value collects it text run by text run.
        replacements (dict, optional): Placeholder keys and their values,
            e.g. {'name': 'Taro'}. Defaults to None.

    Yields:
        dict: Keys 'file_path', 'type' (always 'pptx'), 'slide_number'
              (1-based), 'slide_title' and 'content' (the collected
              fragments joined with newlines).

    Any exception raised while reading the file is reported on stderr and
    ends the generator; it is not propagated to the caller.
    """
    try:
        presentation = Presentation(pptx_path)

        for slide_number, slide in enumerate(presentation.slides, start=1):
            # Title: taken from slide.shapes.title when the slide has one.
            title_shape = slide.shapes.title
            title_text = title_shape.text_frame.text.strip() if title_shape else ""

            # Body: walk every shape that carries a text frame. The title
            # shape is not excluded here, so its text may show up in the
            # content as well.
            fragments = []
            for shape in slide.shapes:
                frame = getattr(shape, "text_frame", None)
                if not frame:
                    continue
                for paragraph in frame.paragraphs:
                    if mode == 'paragraph':
                        candidates = [paragraph.text]
                    else:
                        candidates = [run.text for run in paragraph.runs]
                    # Whitespace-only pieces are dropped.
                    fragments.extend(piece.strip() for piece in candidates if piece.strip())

            body_text = apply_replacements("\n".join(fragments), replacements)
            title_text = apply_replacements(title_text, replacements)

            yield {
                'file_path': pptx_path,
                'type': 'pptx',
                'slide_number': slide_number,
                'slide_title': title_text,
                'content': body_text,
            }

    except Exception as e:
        print(f"エラー: {pptx_path}の処理中にエラーが発生しました: {e}", file=sys.stderr)

def docx_text_generator(docx_path, replacements=None):
    """
    Yield the text of a Word file, one dictionary per non-empty paragraph.

    Placeholders of the form {{key}} are substituted through
    ``apply_replacements``.

    Args:
        docx_path (str): Path to the Word file.
        replacements (dict, optional): Placeholder keys and their values.
            Defaults to None.

    Yields:
        dict: Keys 'file_path', 'type' (always 'docx'), 'paragraph_number'
              (1-based position in ``doc.paragraphs``, counting the empty
              paragraphs that are skipped) and 'content'.

    Any exception raised while reading the file is reported on stderr and
    ends the generator; it is not propagated to the caller.
    """
    try:
        document = Document(docx_path)
        for number, para in enumerate(document.paragraphs, start=1):
            stripped = para.text.strip()
            if not stripped:
                # Nothing to report for blank paragraphs.
                continue
            yield {
                'file_path': docx_path,
                'type': 'docx',
                'paragraph_number': number,
                'content': apply_replacements(stripped, replacements),
            }
    except Exception as e:
        print(f"エラー: {docx_path}の処理中にエラーが発生しました: {e}", file=sys.stderr)

def xlsx_text_generator(xlsx_path, replacements=None):
    """
    Yield the text of an Excel file, one dictionary per non-empty row.

    For every sheet, the non-None cell values of a row are converted with
    ``str()``, stripped and joined with single spaces. Placeholders of the
    form {{key}} are substituted through ``apply_replacements``.

    Args:
        xlsx_path (str): Path to the Excel file.
        replacements (dict, optional): Placeholder keys and their values.
            Defaults to None.

    Yields:
        dict: Keys 'file_path', 'type' (always 'xlsx'), 'sheet_name',
              'row_number' (1-based position of the row in the sheet's
              ``iter_rows()`` iteration) and 'content'.

    Any exception raised while reading the file is reported on stderr and
    ends the generator; it is not propagated to the caller.
    """
    try:
        book = load_workbook(xlsx_path)
        for name in book.sheetnames:
            rows = book[name].iter_rows()
            for row_number, cells in enumerate(rows, start=1):
                joined = " ".join(
                    str(cell.value).strip() for cell in cells if cell.value is not None
                )
                if not joined:
                    # Rows without any value produce no output.
                    continue
                yield {
                    'file_path': xlsx_path,
                    'type': 'xlsx',
                    'sheet_name': name,
                    'row_number': row_number,
                    'content': apply_replacements(joined, replacements),
                }
    except Exception as e:
        print(f"エラー: {xlsx_path}の処理中にエラーが発生しました: {e}", file=sys.stderr)

def csv_text_generator(csv_path, replacements=None):
    """
    Yield the text of a CSV file, one dictionary per non-empty row.

    The cells of each row are joined with ", " and the result is stripped.
    Placeholders of the form {{key}} are substituted through
    ``apply_replacements`` when a non-empty ``replacements`` mapping is given.

    The file is decoded as UTF-8; a leading byte-order mark (as written by
    some spreadsheet exports) is ignored.

    Args:
        csv_path (str): Path to the CSV file.
        replacements (dict, optional): Placeholder keys and their values.
            Defaults to None.

    Yields:
        dict: Keys 'file_path', 'type' (always 'csv'), 'row_number' (1-based
              position among the rows returned by ``csv.reader``, counting
              the blank rows that are skipped) and 'content'.

    Any exception raised while reading the file (missing file, decoding
    error, ...) is reported on stderr and ends the generator; it is not
    propagated to the caller.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 too, but also drops a leading BOM that
        # would otherwise end up inside the first cell as U+FEFF.
        with open(csv_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
            for row_number, row in enumerate(csv.reader(csvfile), start=1):
                row_text = ", ".join(row).strip()
                if not row_text:
                    # Blank rows produce no output.
                    continue
                if replacements:
                    # With no replacements the text would come back unchanged,
                    # so the helper call is skipped.
                    row_text = apply_replacements(row_text, replacements)
                yield {
                    'file_path': csv_path,
                    'type': 'csv',
                    'row_number': row_number,
                    'content': row_text,
                }
    except Exception as e:
        print(f"エラー: {csv_path}の処理中にエラーが発生しました: {e}", file=sys.stderr)


def _write_extracted_item(outfile, item_info, heading):
    """Write one extracted item: file/type header, a heading line, then its content."""
    outfile.write(f"ファイル: {item_info['file_path']}\n")
    outfile.write(f"タイプ: {item_info['type'].upper()}\n")
    outfile.write(f"{heading}\n")
    content = item_info['content'].strip()
    if content:
        outfile.write(content + "\n\n")
    else:
        outfile.write("(テキストが見つかりませんでした)\n\n")


def search_office_files_and_extract(root_dir, output_path, max_level=-1, replacements=None):
    """
    Walk ``root_dir`` and write the text of every PowerPoint, Word, Excel and
    CSV file found to ``output_path``.

    Files are recognised by extension (.pptx, .docx, .xlsx, .csv, matched
    case-insensitively) and read through the matching ``*_text_generator``
    function, which also applies the {{key}} placeholder replacements.

    Args:
        root_dir (str): Directory to search. If it is not a directory, an
            error message is printed and nothing is written.
        output_path (str): Text file to create (overwritten, UTF-8).
        max_level (int): Number of directory levels to process, counting
            ``root_dir`` itself as level 0: directories at level
            ``max_level`` or deeper are skipped. -1 means unlimited.
        replacements (dict, optional): Placeholder keys and their values,
            passed on to the generators. Defaults to None.

    Returns:
        None
    """
    if not os.path.isdir(root_dir):
        print(f"エラー: ルートディレクトリ '{root_dir}' は存在しません。")
        return

    print(f"'{root_dir}' 内の Office および CSV ファイルを検索中 (最大深度: {max_level})...")

    with open(output_path, 'w', encoding='utf-8') as outfile:
        for dirpath, dirnames, filenames in os.walk(root_dir):
            # Depth relative to the root. Derived from the relative path
            # rather than by counting separators in the raw strings, which
            # miscounts when root_dir ends with a separator.
            relative_dir = os.path.relpath(dirpath, root_dir)
            if relative_dir == os.curdir:
                current_level = 0
            else:
                current_level = relative_dir.count(os.sep) + 1

            if max_level != -1 and current_level >= max_level:
                # Too deep: skip this directory and stop os.walk descending.
                del dirnames[:]
                continue

            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                print(f"  処理中: {full_path}")
                lowered = filename.lower()

                if lowered.endswith(".pptx"):
                    for item_info in pptx_text_generator(full_path, replacements=replacements):
                        heading = f"スライド {item_info['slide_number']}:"
                        if item_info['slide_title']:
                            # Flatten multi-line titles. Done outside the
                            # f-string because a backslash inside an f-string
                            # expression is a syntax error before Python 3.12.
                            one_line_title = item_info['slide_title'].replace("\n", " ")
                            heading = f"{heading} {one_line_title}"
                        _write_extracted_item(outfile, item_info, heading)

                elif lowered.endswith(".docx"):
                    for item_info in docx_text_generator(full_path, replacements=replacements):
                        heading = f"段落 {item_info['paragraph_number']}:"
                        _write_extracted_item(outfile, item_info, heading)

                elif lowered.endswith(".xlsx"):
                    for item_info in xlsx_text_generator(full_path, replacements=replacements):
                        heading = f"シート: {item_info['sheet_name']}, 行: {item_info['row_number']}:"
                        _write_extracted_item(outfile, item_info, heading)

                elif lowered.endswith(".csv"):
                    for item_info in csv_text_generator(full_path, replacements=replacements):
                        heading = f"行 {item_info['row_number']}:"
                        _write_extracted_item(outfile, item_info, heading)


if __name__ == "__main__":
    # Sample placeholder values used for testing. For real use, list the
    # key/value pairs that should be substituted here.
    sample_replacements = dict(
        company_name='XYZソリューションズ',
        project_title='データ分析プラットフォーム',
        date='2025年7月18日',
        author='AIアシスタント',
        product_name='新製品A',  # added for the CSV test
    )

    search_office_files_and_extract(
        root_dir,
        output_path,
        max_level,
        replacements=sample_replacements,
    )
    completion_message = f"\nOfficeおよびCSVファイルからのテキスト抽出と置換が完了しました。出力は '{output_path}' に保存されました。"
    print(completion_message)
    # Keep the console window open until the user confirms.
    input("\n終了するにはENTERキーを押してください>>")