# Source code for converter.replace

"""
テキストファイル内の特定のパターンを正規表現に基づいて置換するスクリプト。

詳細説明:
本スクリプトは、入力ファイル、出力ファイル、および置換ルールを定義するINIファイルを指定して、
テキストの自動修正を行います。`replace.ini`（デフォルト）とユーザー指定のINIファイル
の両方からルールを読み込み、ユーザー定義ルールがデフォルトルールよりも優先されます。
ファイルエンコーディングは自動的に検出されます。

関連リンク: :doc:`replace_usage`
"""

import os
import re
import argparse
import sys

# Probe each required third-party package before the real import further
# down, collecting the names of those that cannot be imported.
missing = []
for lib in ["chardet"]:
    try:
        __import__(lib)
    except ImportError:
        missing.append(lib)
if missing:
    # Report what is missing together with an install hint, wait for ENTER so
    # the message can be read, then exit with a failure status.
    print(f"Error: Missing libraries:\n{', '.join(missing)}")
    print(f"  pip install chardet")
    input("\nPress ENTER to terminate>>\n")
    sys.exit(1)

import chardet



# [docs]
def terminate():
    """
    Wait for the user to press ENTER, then exit the program.

    Details:
    Used after an error or after normal completion so that the user can read
    the printed messages before the process ends. If standard input is closed
    or already exhausted, the wait is skipped and the program exits at once.

    :raises SystemExit: always, with the default (success) exit status.
    """
    try:
        input("\nPress ENTER to terminate>>\n")
    except EOFError:
        # No interactive stdin (closed or redirected from an empty source):
        # there is nobody to wait for, so fall through to the exit.
        pass
    # sys.exit() rather than the exit() builtin: exit() is injected by the
    # `site` module for interactive sessions and is not available under
    # `python -S` or in frozen executables.
    sys.exit()

    

# [docs]
def detect_encoding(file_path):
    """
    Guess the character encoding of a file.

    Details:
    The whole file is read as raw bytes and handed to ``chardet.detect``;
    the ``'encoding'`` entry of the returned result is passed back unchanged.

    :param file_path: str: Path of the file to inspect.
    :returns: The value chardet reports under ``'encoding'`` (for example
        ``'utf-8'`` or ``'Shift_JIS'``).
        NOTE(review): chardet may report ``None`` when it cannot decide
        (e.g. an empty file) -- confirm against the chardet documentation.
    """
    with open(file_path, 'rb') as stream:
        payload = stream.read()
    return chardet.detect(payload)['encoding']



# [docs]
def load_replace_dict(ini_path):
    """
    Load replacement rules from an INI-style file into a dictionary.

    Details:
    The file is read line by line as UTF-8 (a leading byte-order mark is
    dropped). Lines starting with ``#`` and lines without ``=`` are ignored.
    Each remaining line is split into key and value at ``=``. A key written
    with a quote character (``'`` or ``"``) as both its first and last
    character has those two characters removed, which allows keys that
    contain ``=`` or leading/trailing spaces; any other key is stripped of
    surrounding whitespace. Whitespace around the value is preserved. A key
    that appears more than once keeps the value of its last occurrence.

    A missing path is reported with a ``[skip]`` message; an empty or ``None``
    path is skipped silently. If reading fails part-way, the failure is
    reported and the rules collected so far are returned.

    :param ini_path: str: Path of the INI file holding the replacement rules.
    :returns: dict[str, str]: Mapping of key (regex pattern) to value
        (replacement string); empty when the file is absent.
    """
    replace_dict = {}
    if not ini_path:
        return replace_dict
    if not os.path.isfile(ini_path):
        print(f"  [skip] File not found: {ini_path}")
        return replace_dict

    # Compiled once: either a quoted key (which may itself contain '=') or the
    # shortest run of non-'=' characters, followed by '=' and the value.
    rule_re = re.compile(r"""^(['"].+?['"]|[^=]+?)=(.*)$""")

    try:
        # utf-8-sig drops a leading BOM; with plain utf-8 the BOM would stay
        # glued to the first line, so a first-line comment would be parsed as
        # a rule and a first-line key would start with U+FEFF.
        with open(ini_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.rstrip('\n')
                if line.startswith('#') or '=' not in line:
                    continue

                match = rule_re.match(line)
                if not match:
                    continue

                raw_key, val = match.groups()
                # Require at least two characters so that a key consisting of
                # a single quote character is not reduced to the empty
                # pattern (which would match at every position of the text).
                is_quoted = (
                    len(raw_key) >= 2
                    and raw_key[0] in ("'", '"')
                    and raw_key[-1] == raw_key[0]
                )
                key = raw_key[1:-1] if is_quoted else raw_key.strip()
                replace_dict[key] = val  # whitespace around val is kept
    except Exception as e:
        # Best effort: report the problem and return what was read so far.
        print(f"  [skip] Failed to read {ini_path}: {e}")
    return replace_dict



# [docs]
def apply_replacements(text, replace_list):
    """
    Run a sequence of regex substitutions over a text.

    Details:
    The (pattern, replacement) pairs are applied one after another, each on
    the result of the previous one. Every pattern is used with
    ``re.IGNORECASE`` (case-insensitive matching) and ``re.MULTILINE``
    (``^`` and ``$`` also match at each line boundary). Each rule is printed
    before it is applied. If printing or substituting raises, the error is
    printed and ``terminate()`` is called.

    :param text: str: Text to transform.
    :param replace_list: list[tuple[str, str]]: (pattern, replacement) pairs,
        applied in list order.
    :returns: str: The text after all substitutions.
    """
    regex_flags = re.IGNORECASE | re.MULTILINE
    current = text
    for pattern, replacement in replace_list:
        try:
            print(f"pattern: {pattern} => {replacement}")
            compiled = re.compile(pattern, regex_flags)
            current = compiled.sub(replacement, current)
        except Exception as e:
            print(f"re.sub error for [{pattern}]: {e}")
            terminate()

    return current



# [docs]
def generate_output_path(input_path):
    """
    Derive the default output path from the input path.

    Details:
    The path is split with ``os.path.splitext`` and ``-converted`` is
    inserted between the two parts, i.e. just before the extension
    (``input.md`` -> ``input-converted.md``).

    :param input_path: str: Path of the input file.
    :returns: str: The generated output path.
    """
    stem, suffix = os.path.splitext(input_path)
    return "{}-converted{}".format(stem, suffix)



# [docs]
def main():
    """
    Program entry point: parse the command line and run the replacement.

    Details:
    ``argparse`` supplies the input file, the output file and two rule files
    (default rules and user rules). The input file's encoding is detected and
    the file is read (falling back to UTF-8 if reading with the detected
    encoding fails). Rules from both files are loaded; a key present in the
    user file removes the same key from the default rules. User rules are
    applied first, then the remaining default rules, and the result is
    written to the output file as UTF-8.

    If the input file does not exist, an error is printed, the program waits
    for ENTER and exits with status 1.

    :raises SystemExit: with status 1 when the input file does not exist.
    """
    parser = argparse.ArgumentParser(description='読み方補正プログラム（正規表現対応）')
    parser.add_argument('--input_file', "-i", default='input.md', help='入力テキストファイルのパス（デフォルト: input.md）')
    parser.add_argument('--output_file', "-o", default=None, help='出力ファイルのパス（指定がない場合は自動生成）')
    parser.add_argument('--replace_file', "-r", default='replace.ini', help='default replace.iniのパス（デフォルト: replace.ini）')
    parser.add_argument('--replace_file2', "-s", default='', help='user replace.iniのパス（デフォルト: ""）')
    args = parser.parse_args()

    input_path = args.input_file
    output_path = args.output_file or generate_output_path(input_path)

    print(f"\nRead input file from [{input_path}]")
    if not os.path.isfile(input_path):
        # Without this check the open() inside detect_encoding raises an
        # uncaught exception, so the program dies with a traceback and never
        # reaches the ENTER pause. Mirror the missing-library path instead:
        # message, pause, non-zero exit status.
        print(f"Error: Input file not found: {input_path}")
        input("\nPress ENTER to terminate>>\n")
        sys.exit(1)

    encoding = detect_encoding(input_path)
    print("  encoding:", encoding)

    try:
        with open(input_path, 'r', encoding=encoding) as f:
            text = f.read()
    except Exception as e:
        # The detected encoding is only a guess; fall back to UTF-8 once.
        print(f"  Failed with encoding={encoding}: {e}")
        print("  Retrying with utf-8...")
        with open(input_path, 'r', encoding="utf-8") as f:
            text = f.read()

    # Load the replacement dictionaries.
    print("\nLoading replacement dictionaries...")
    dict2 = load_replace_dict(args.replace_file2)  # user-defined rules
    dict1 = load_replace_dict(args.replace_file)   # default rules

    # Keys present in dict2 are dropped from dict1 (user rules take priority).
    dict1_filtered = {k: v for k, v in dict1.items() if k not in dict2}

    # Show the effective rule list.
    print("Replacement list (replace_file2 first, then replace_file):")
    for pattern, replacement in dict2.items():
        print(f"  [user] {pattern}: [{replacement}]")
    for pattern, replacement in dict1_filtered.items():
        print(f"  [default] {pattern}: [{replacement}]")

    # Apply dict2 first, then the filtered dict1.
    print("\nReplacing...")
    replaced_text = apply_replacements(text, list(dict2.items()))
    replaced_text = apply_replacements(replaced_text, list(dict1_filtered.items()))

    # Write the result.
    print(f"\nWrite to [{output_path}]")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(replaced_text)


# When executed as a script: run the conversion, then wait for ENTER and exit
# via terminate().
if __name__ == '__main__':
    main()
    terminate()