web.replace_recursive のソースコード

"""
指定されたディレクトリツリー内のファイルを再帰的に処理し、特定の文字列置換とファイルコピーを行うスクリプト。

このスクリプトは、`source_dir` で指定されたディレクトリから `target_dir` へファイルをコピーします。
`convert_file_extensions` に含まれる拡張子を持つファイルに対しては、
`replace_list` に基づく正規表現置換を実行します。
`exclude_list` に含まれるファイルは置換処理をスキップし、そのままコピーされます。

関連リンク: :doc:`replace_recursive_usage`
"""
import os
import shutil
import chardet
import re


# Root of the directory tree that is read (relative path).
source_dir = 'msl'
# Root of the directory tree that receives the copied / rewritten files.
target_dir = 'msl_titech'
# Files whose names end with one of these suffixes get the regex substitutions.
convert_file_extensions = ['.html', '.css', '.js']
# File names that are copied verbatim even when their suffix matches above.
exclude_list = ['500.html']

# Substitution rules: each entry is [regex pattern, replacement string].
# The rules are applied in list order.
# NOTE(review): the two rules anchored with `^` are applied through re.sub
# without re.MULTILINE, so they can only match at the very beginning of a
# file's text -- confirm that this is the intended behaviour.
replace_list = [
    [r'https:\/\/www\.msl\.titech\.ac\.jp\/', '/msl_titech/'],
    [r'^\.\/assets\/', '/msl_titech/assets/'],
    [  r'^\/assets\/', '/msl_titech/assets/'],
    [r'file:\/\/\/E:\/\/www\.msl\.titech\.ac\.jp\/', '/msl_titech/'],
]


[ドキュメント]
def regex_replace(text, replace_list):
    """
    Apply a sequence of regular-expression substitutions to a string.

    Every rule is applied with ``re.sub`` in list order, so each rule works on
    the output of the rules before it.

    :param text: String to transform.
    :type text: str
    :param replace_list: Sequence of (regex pattern, replacement string) pairs.
    :type replace_list: list[tuple[str, str]]
    :returns: The string after all substitutions have been applied.
    :rtype: str
    """
    result = text
    for rule in replace_list:
        regex, replacement = rule
        result = re.sub(regex, replacement, result)
    return result




[ドキュメント]
def extract_filename(file_path):
    """
    Return the final component (the file name) of a path.

    :param file_path: Path to a file.
    :type file_path: str
    :returns: Everything after the last path separator; an empty string when
        the path ends with a separator.
    :rtype: str
    """
    _directory, name = os.path.split(file_path)
    return name



[ドキュメント]
def copy_file(source_file, target_file):
    """
    Copy a file to a target path, creating the target directory when needed.

    The copy is done with ``shutil.copy2``, which also tries to preserve the
    file's metadata.

    :param source_file: Path of the file to copy.
    :type source_file: str
    :param target_file: Path the copy is written to.
    :type target_file: str
    :returns: None
    :rtype: None
    :raises OSError: If the directory cannot be created or the copy fails.
    """
    parent_dir = os.path.dirname(target_file)
    # dirname() is '' for a bare file name; os.makedirs('') would raise, and
    # the current directory already exists, so only create a real parent.
    # exist_ok=True avoids the race between an existence check and creation.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    shutil.copy2(source_file, target_file)



[ドキュメント]
def detect_encoding(file_path):
    """
    Guess the text encoding of a file.

    The whole file is read as bytes and handed to ``chardet.detect``; the
    ``'encoding'`` entry of its result is returned.

    :param file_path: Path of the file to inspect.
    :type file_path: str
    :returns: Name of the guessed encoding (for example 'utf-8' or 'EUC-JP').
        NOTE(review): chardet presumably reports ``None`` when it cannot make
        a guess (e.g. for an empty file) -- confirm against the chardet docs.
    :rtype: str
    """
    with open(file_path, 'rb') as stream:
        payload = stream.read()
    guess = chardet.detect(payload)
    return guess['encoding']



[ドキュメント]
def read_file_with_detected_encoding(file_path):
    """
    Read a text file using an automatically detected encoding.

    The encoding is obtained from ``detect_encoding`` and the file is then
    opened in text mode with it.

    :param file_path: Path of the file to read.
    :type file_path: str
    :returns: The file's text, or ``False`` when the file does not exist or
        cannot be read or decoded. An empty file yields ``''``, which is falsy
        but is *not* a failure -- callers should test ``is False``.
    :rtype: str or bool
    """
    if not os.path.exists(file_path):
        return False
    try:
        # Detection reads the file as well, so it belongs inside the try:
        # an I/O error there is also a "failed to read" condition.
        encoding = detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding) as f:
            return f.read()
    except (OSError, LookupError, ValueError):
        # OSError: I/O failure; LookupError: unknown encoding name;
        # ValueError: covers UnicodeDecodeError. A bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return False



[ドキュメント]
def replace_strings_in_file(source_file, target_file, replace_list):
    """
    Read a file, apply regex substitutions and write the result to a target file.

    The source is read through ``read_file_with_detected_encoding``; the
    substitutions are applied in list order by ``regex_replace``. The target
    directory is created when it does not exist, and the output is always
    written as UTF-8.

    :param source_file: Path of the file to read.
    :type source_file: str
    :param target_file: Path the substituted text is written to.
    :type target_file: str
    :param replace_list: Sequence of (regex pattern, replacement string) pairs.
    :type replace_list: list[tuple[str, str]]
    :returns: ``True`` when the target file was written, ``False`` when the
        source file could not be read.
    :rtype: bool
    """
    file_data = read_file_with_detected_encoding(source_file)
    # Compare with False explicitly: an empty file is read as '' and is a
    # valid (empty) document, not a read failure. A truthiness test would
    # return '' here and the caller would never produce the target file.
    if file_data is False:
        return False

    new_data = regex_replace(file_data, replace_list)

    parent_dir = os.path.dirname(target_file)
    # dirname() is '' for a bare file name and os.makedirs('') raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(target_file, 'w', encoding='utf-8') as file:
        file.write(new_data)

    return True



[ドキュメント]
def process_directory(source_dir, target_dir, replace_list, file_extensions):
    """
    Mirror a directory tree, applying regex substitutions to selected files.

    The tree under ``source_dir`` is walked with ``os.walk``. For every file:

    * if its name ends with one of ``file_extensions`` and is not listed in the
      module-level ``exclude_list``, it is rewritten by
      ``replace_strings_in_file``; when that does not succeed the file is
      copied unchanged instead;
    * otherwise it is copied unchanged.

    Only files are processed, so directories that contain no files are not
    recreated under ``target_dir``.

    :param source_dir: Root of the tree to read.
    :type source_dir: str
    :param target_dir: Root of the tree to write.
    :type target_dir: str
    :param replace_list: Sequence of (regex pattern, replacement string) pairs.
    :type replace_list: list[tuple[str, str]]
    :param file_extensions: Suffixes of the files to rewrite
        (for example ['.html', '.css']).
    :type file_extensions: list[str]
    :returns: None
    :rtype: None
    """
    # str.endswith accepts a tuple of suffixes; build it once, outside the loops.
    suffixes = tuple(file_extensions)

    for root, _dirs, files in os.walk(source_dir):
        for file in files:
            source_file = os.path.join(root, file)
            relative_path = os.path.relpath(source_file, source_dir)
            target_file = os.path.join(target_dir, relative_path)

            if not file.endswith(suffixes):
                print(f"Copy {source_file} ({relative_path}) to {target_file}")
                copy_file(source_file, target_file)
                continue

            # Names yielded by os.walk are already bare file names, so they
            # can be compared with the exclude list directly.
            if file in exclude_list:
                print(f"file [{file}] is in the exclude list {exclude_list}. Skip and Copy")
                copy_file(source_file, target_file)
                continue

            print(f"Convert {source_file} ({relative_path}) to {target_file}")
            ret = replace_strings_in_file(source_file, target_file, replace_list)
            # Fall back to a plain copy for ANY unsuccessful result. Identity
            # checks against None/False would let other falsy results slip
            # through and leave the file missing from the target tree.
            if not ret:
                print("Error to open. Skip and Copy.")
                copy_file(source_file, target_file)



[ドキュメント]
def main():
    """
    Run the conversion with the module-level settings.

    Passes ``source_dir``, ``target_dir``, ``replace_list`` and
    ``convert_file_extensions`` to ``process_directory``.

    :returns: None
    :rtype: None
    """
    process_directory(
        source_dir=source_dir,
        target_dir=target_dir,
        replace_list=replace_list,
        file_extensions=convert_file_extensions,
    )


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()