search_cif_db_tkcif.py ダウンロード/コピー

search_cif_db_tkcif.py をダウンロード
search_cif_db_tkcif.py
search_cif_db_tkcif.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
概要:
    CIFファイルを再帰的に検索し、その組成情報をSQLite3データベースに格納するツールです。
    格納されたデータベースを利用して、組成式からCIFデータを検索することができます。
詳細説明:
    このスクリプトは、指定されたルートディレクトリ以下のCIFファイルを読み込み、
    pymatgenとtkcifライブラリを使用して構造と組成情報を解析します。
    解析された情報はSQLite3データベースに保存され、後で高速に検索できるようになります。
    特に、組成式、還元組成式、元素のリスト、空間群、格子定数などのメタデータが格納されます。

    主な機能:
    - indexモード: 指定されたディレクトリからCIFファイルをスキャンし、データベースを構築または更新します。
      コマンド例:
        python search_cif_db_tkcif.py --mode index  --root ./COD --db cif_index.sqlite
    - searchモード: データベースに対して、組成式に基づいてCIFデータを検索します。
      検索モードには、exact (還元組成式が完全に一致)、elements (構成元素が完全に一致)、
      contains (指定元素が部分集合として含まれる) の3種類のマッチング方式があります。
      コマンド例:
        python search_cif_db_tkcif.py --mode search --db cif_index.sqlite --formula BaTiO3
        python search_cif_db_tkcif.py --mode search --db cif_index.sqlite --formula TiBaO3 --match exact
        python search_cif_db_tkcif.py --mode search --db cif_index.sqlite --formula BaTiO3 --match elements
        python search_cif_db_tkcif.py --mode search --db cif_index.sqlite --formula BaTiO3 --match contains
    - infoモード: データベースの統計情報を表示します。
      コマンド例:
        python search_cif_db_tkcif.py --mode info   --db cif_index.sqlite

    データベースのカラムは、ソースデータベース名、マテリアルID、CIFファイルのパス、組成式、
    還元組成式、匿名組成式、JSON形式の組成情報、元素リスト、元素数、サイト数、
    空間群記号、空間群番号、体積、原子あたりの体積、密度、格子定数 (a, b, c, alpha, beta, gamma)、
    使用されたバックエンド、正規化の有無、処理ステータス、エラーメッセージを含みます。

関連リンク:
    search_cif_db_tkcif_usage
"""

from __future__ import annotations

import argparse
import json
import re
import sqlite3
import sys
import traceback
import warnings
from pathlib import Path
from typing import Any

# pymatgen supplies Composition, which is used to parse and reduce formulas.
# If the import fails, print the traceback and an install hint, wait for the
# user to press ENTER, and exit with status 1.
try:
    from pymatgen.core import Composition
except Exception:
    print("ERROR: failed to import pymatgen Composition")
    traceback.print_exc()
    print("\nInstall example:")
    print("  pip install pymatgen")
    input("\nPress ENTER to terminate>>\n")
    sys.exit(1)

# tkcif supplies read_structure, the CIF reader used when building the index.
# The same "report, wait for ENTER, exit 1" handling applies on failure.
try:
    from tkcif.tkcif_reader import read_structure
except Exception:
    print("ERROR: failed to import tkcif.tkcif_reader")
    traceback.print_exc()
    input("\nPress ENTER to terminate>>\n")
    sys.exit(1)

# Bulk CIF processing produces too many pymatgen/ASE warnings, so they are
# suppressed by default. Note that this silences every warning process-wide.
warnings.simplefilter("ignore")


# Column name -> SQLite column declaration for the cif_index table.
# create_schema() uses these entries to build the CREATE TABLE statement and
# to ALTER in any column that an existing table lacks; upsert_record() uses
# the keys as both the column list and the named placeholders of its INSERT.
# cif_path is declared UNIQUE so that INSERT OR REPLACE replaces the row of a
# file that has already been indexed.
SCHEMA_COLUMNS: dict[str, str] = {
    "source_db": "TEXT",
    "material_id": "TEXT",
    "cif_path": "TEXT UNIQUE",
    "formula": "TEXT",
    "reduced_formula": "TEXT",
    "anonymous_formula": "TEXT",
    "composition_json": "TEXT",
    "elements": "TEXT",
    "nelements": "INTEGER",
    "nsites": "INTEGER",
    "sg_symbol": "TEXT",
    "sg_number": "INTEGER",
    "volume": "REAL",
    "volume_per_atom": "REAL",
    "density": "REAL",
    "a": "REAL",
    "b": "REAL",
    "c": "REAL",
    "alpha": "REAL",
    "beta": "REAL",
    "gamma": "REAL",
    "backend": "TEXT",
    "normalized": "INTEGER",
    "status": "TEXT",
    "error": "TEXT",
}


# Placeholder values merged into the record that build_index() stores when a
# CIF file could not be processed. Text fields are empty, numeric fields use
# -1 / -1.0 as a "not available" sentinel, and composition_json is an empty
# JSON object. The remaining schema columns (source_db, material_id, cif_path,
# status, error) are not listed here; build_index() supplies them itself.
DEFAULT_ERROR_RECORD_VALUES: dict[str, Any] = {
    "formula": "",
    "reduced_formula": "",
    "anonymous_formula": "",
    "composition_json": "{}",
    "elements": "",
    "nelements": -1,
    "nsites": -1,
    "sg_symbol": "",
    "sg_number": -1,
    "volume": -1.0,
    "volume_per_atom": -1.0,
    "density": -1.0,
    "a": -1.0,
    "b": -1.0,
    "c": -1.0,
    "alpha": -1.0,
    "beta": -1.0,
    "gamma": -1.0,
    "backend": "",
    "normalized": 0,
}


def guess_db_name(path: Path) -> str:
    """
    Summary:
        Guess the source database name from the components of a file path.
    Details:
        Every path component is lower-cased and compared, as a whole
        component, against "pcod", "tcod" and "cod" in that priority order.
        The first name found is returned; "unknown" is returned when none of
        them appears as a component.
    Args:
        :param path: Path of the CIF file.
        :type path: pathlib.Path
    Returns:
        :returns: One of "pcod", "tcod", "cod" or "unknown".
        :rtype: str
    """
    components = {component.lower() for component in path.parts}
    # The tuple order is the priority order when several names are present.
    candidates = ("pcod", "tcod", "cod")
    return next((db for db in candidates if db in components), "unknown")


def guess_material_id(path: Path) -> str:
    """
    Summary:
        Guess a material ID from the file name of a CIF file.
    Details:
        The first run of digits found in the file stem (the name without its
        final suffix) is used as the ID, e.g. "1234567.cif" gives "1234567".
        When the stem contains no digit, the whole stem is returned.
    Args:
        :param path: Path of the CIF file.
        :type path: pathlib.Path
    Returns:
        :returns: The guessed material ID.
        :rtype: str
    """
    stem = path.stem
    digits = re.search(r"(\d+)", stem)
    if digits is None:
        return stem
    return digits.group(1)


def composition_to_sorted_json(comp: Composition) -> str:
    """
    Summary:
        Serialize a composition to a JSON object whose keys are sorted.
    Details:
        The mapping returned by ``comp.as_dict()`` is copied with every key
        coerced to ``str`` and every amount coerced to ``float``. The copy is
        dumped as JSON with keys in sorted order and non-ASCII characters
        left unescaped.
    Args:
        :param comp: Composition to serialize; only its ``as_dict()`` method
            is used.
        :type comp: pymatgen.core.Composition
    Returns:
        :returns: JSON text mapping element name to amount, keys sorted.
        :rtype: str
    """
    amounts: dict[str, float] = {}
    for symbol, amount in comp.as_dict().items():
        amounts[str(symbol)] = float(amount)
    # sort_keys=True already gives the deterministic key order.
    return json.dumps(amounts, sort_keys=True, ensure_ascii=False)


def normalize_formula(formula: str) -> str:
    """
    Summary:
        Return the reduced formula of a composition string.
    Details:
        The string is parsed with pymatgen's ``Composition`` and the
        ``reduced_formula`` attribute of the result is returned.
    Args:
        :param formula: Composition formula to normalize, e.g. "BaTiO3".
        :type formula: str
    Returns:
        :returns: The reduced formula reported by pymatgen.
        :rtype: str
    """
    return Composition(formula).reduced_formula


def composition_info_from_structure(path: Path) -> dict[str, Any]:
    """
    Summary:
        Read a CIF file and build the composition/structure metadata record
        used for searching.
    Details:
        The file is loaded with ``read_structure(path, return_info=True)``.
        From the returned structure the function collects the formula, the
        reduced and anonymized formulas, a JSON dump of the reduced
        composition, the sorted comma-joined element symbols, the site count,
        the space group, the cell volume, the volume per site, the density
        and the six lattice parameters. ``backend`` and ``normalized`` are
        taken from the reader's info object when it has those attributes.

        Failures while obtaining the space group fall back to ``""`` / ``-1``
        and failures while obtaining the density fall back to ``-1.0``. Any
        other exception propagates to the caller.
    Args:
        :param path: Path of the CIF file.
        :type path: pathlib.Path
    Returns:
        :returns: Metadata dictionary keyed by the schema column names.
        :rtype: dict
    """
    structure, read_info = read_structure(path, return_info=True)

    full_comp = structure.composition
    reduced = full_comp.reduced_composition
    symbols = sorted(el.symbol for el in reduced.elements)

    # Space-group detection is best effort: any failure yields placeholders.
    try:
        sg_symbol, raw_number = structure.get_space_group_info()
        sg_number = int(raw_number)
    except Exception:
        sg_symbol, sg_number = "", -1

    site_count = len(structure)
    cell_volume = float(structure.volume)
    if site_count > 0:
        per_atom = cell_volume / site_count
    else:
        per_atom = -1.0

    # Density is best effort as well.
    try:
        density = float(structure.density)
    except Exception:
        density = -1.0

    info: dict[str, Any] = {
        "formula": full_comp.formula,
        "reduced_formula": reduced.reduced_formula,
        "anonymous_formula": reduced.anonymized_formula,
        "composition_json": composition_to_sorted_json(reduced),
        "elements": ",".join(symbols),
        "nelements": len(symbols),
        "nsites": site_count,
        "sg_symbol": sg_symbol,
        "sg_number": sg_number,
        "volume": cell_volume,
        "volume_per_atom": per_atom,
        "density": density,
    }

    lattice = structure.lattice
    for key in ("a", "b", "c", "alpha", "beta", "gamma"):
        info[key] = float(getattr(lattice, key))

    info["backend"] = getattr(read_info, "backend", "")
    info["normalized"] = int(bool(getattr(read_info, "normalized", False)))
    return info


def get_existing_columns(conn: sqlite3.Connection, table_name: str) -> set[str]:
    """
    Summary:
        Return the names of the columns that a SQLite table currently has.
    Details:
        Runs ``PRAGMA table_info`` for the table and collects the second
        field of every returned row, which holds the column name. The table
        name is interpolated into the statement text, so it must be a trusted
        identifier. A table that does not exist yields an empty set.
    Args:
        :param conn: Open SQLite connection.
        :type conn: sqlite3.Connection
        :param table_name: Name of the table to inspect.
        :type table_name: str
    Returns:
        :returns: Set of existing column names.
        :rtype: set
    """
    cursor = conn.execute(f"PRAGMA table_info({table_name})")
    names: set[str] = set()
    for row in cursor.fetchall():
        names.add(str(row[1]))
    return names


def create_schema(conn: sqlite3.Connection) -> None:
    """
    Summary:
        Create the cif_index table, or upgrade an existing one, and make sure
        the search indexes exist.
    Details:
        The table is created from SCHEMA_COLUMNS when it does not exist. For
        an existing table, every column in SCHEMA_COLUMNS that is missing is
        added with ALTER TABLE. SQLite refuses to add a column that carries a
        UNIQUE constraint, so such a column is added without the constraint
        and a unique index on it is created instead. Finally the lookup
        indexes are created if missing and the transaction is committed.
    Args:
        :param conn: Open SQLite connection.
        :type conn: sqlite3.Connection
    Returns:
        :returns: None
        :rtype: None
    """
    column_defs = ",\n        ".join(
        f"{name} {dtype}" for name, dtype in SCHEMA_COLUMNS.items()
    )
    conn.execute(f"""
    CREATE TABLE IF NOT EXISTS cif_index (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        {column_defs}
    )
    """)

    existing = get_existing_columns(conn, "cif_index")
    for name, dtype in SCHEMA_COLUMNS.items():
        if name in existing:
            continue

        # "ALTER TABLE ... ADD COLUMN" cannot carry a UNIQUE constraint, so
        # strip it from the declaration and enforce it with a unique index.
        tokens = dtype.split()
        is_unique = any(token.upper() == "UNIQUE" for token in tokens)
        plain_type = " ".join(t for t in tokens if t.upper() != "UNIQUE")
        conn.execute(f"ALTER TABLE cif_index ADD COLUMN {name} {plain_type}")
        if is_unique:
            conn.execute(
                f"CREATE UNIQUE INDEX IF NOT EXISTS idx_{name}_unique "
                f"ON cif_index({name})"
            )

    # (index name, indexed column) pairs used by the search and info queries.
    lookup_indexes = (
        ("idx_formula", "reduced_formula"),
        ("idx_elements", "elements"),
        ("idx_source_db", "source_db"),
        ("idx_backend", "backend"),
        ("idx_sg_number", "sg_number"),
        ("idx_volume_per_atom", "volume_per_atom"),
    )
    for index_name, column in lookup_indexes:
        conn.execute(
            f"CREATE INDEX IF NOT EXISTS {index_name} ON cif_index({column})"
        )
    conn.commit()


def upsert_record(conn: sqlite3.Connection, rec: dict[str, Any]) -> None:
    """
    Summary:
        Insert a record into cif_index, replacing any row that conflicts.
    Details:
        An ``INSERT OR REPLACE`` statement is built with one named
        placeholder per key of SCHEMA_COLUMNS, and ``rec`` supplies the
        values, so ``rec`` needs an entry for every schema column. Because
        cif_path is UNIQUE, a row with the same path is replaced by the new
        one. The statement is executed but not committed here.
    Args:
        :param conn: Open SQLite connection.
        :type conn: sqlite3.Connection
        :param rec: Mapping from schema column name to the value to store.
        :type rec: dict
    Returns:
        :returns: None
        :rtype: None
    """
    column_names = tuple(SCHEMA_COLUMNS)
    column_list = ", ".join(column_names)
    placeholder_list = ", ".join(":" + name for name in column_names)
    statement = f"""
        INSERT OR REPLACE INTO cif_index ({column_list})
        VALUES ({placeholder_list})
        """
    conn.execute(statement, rec)


def build_index(
    root: Path,
    db_path: Path,
    pattern: str = "*.cif",
    store_errors: int = 1,
    commit_interval: int = 200,
) -> None:
    """
    Summary:
        Scan a directory tree for CIF files and store their metadata in the
        SQLite index.
    Details:
        Files matching ``pattern`` are searched recursively below ``root``.
        Each file is parsed with composition_info_from_structure() and stored
        with upsert_record() under status "ok". When processing a file
        raises, the error is printed and, if ``store_errors`` is truthy, a
        placeholder record with status "error" and the exception text is
        stored instead. The transaction is committed every
        ``commit_interval`` files, with a progress line, and once more at the
        end. A summary and a per-backend count are printed last.

        The connection is closed even when an exception escapes, and a
        ``commit_interval`` of 0 disables the intermediate commits instead of
        raising ZeroDivisionError.
    Args:
        :param root: Root directory to search for CIF files.
        :type root: pathlib.Path
        :param db_path: Path of the SQLite database file.
        :type db_path: pathlib.Path
        :param pattern: Glob pattern of the files to index (e.g. "*.cif").
        :type pattern: str
        :param store_errors: Whether failed files are recorded (0: no, 1: yes).
        :type store_errors: int
        :param commit_interval: Number of processed files between commits;
            0 commits only once at the end.
        :type commit_interval: int
    Returns:
        :returns: None
        :rtype: None
    """
    n_ok = 0
    n_err = 0
    backend_counts: dict[str, int] = {}

    conn = sqlite3.connect(str(db_path))
    try:
        create_schema(conn)

        files = list(root.rglob(pattern))
        total = len(files)
        print(f"Found CIF files: {total}")

        for i, found in enumerate(files, start=1):
            path = found.resolve()

            rec_base: dict[str, Any] = {
                "source_db": guess_db_name(path),
                "material_id": guess_material_id(path),
                "cif_path": str(path),
            }

            try:
                info = composition_info_from_structure(path)
                rec = {
                    **rec_base,
                    **info,
                    "status": "ok",
                    "error": "",
                }
                upsert_record(conn, rec)
                n_ok += 1
                backend = str(rec.get("backend", "")) or "unknown"
                backend_counts[backend] = backend_counts.get(backend, 0) + 1

            except Exception as exc:
                # One unreadable file must not abort the whole indexing run.
                n_err += 1
                print(f"[ERROR] {path}: {exc}")

                if store_errors:
                    rec = {
                        **rec_base,
                        **DEFAULT_ERROR_RECORD_VALUES,
                        "status": "error",
                        "error": str(exc),
                    }
                    upsert_record(conn, rec)

            # Guard against commit_interval == 0, which would otherwise raise
            # ZeroDivisionError on the modulo.
            if commit_interval and i % commit_interval == 0:
                conn.commit()
                print(f"Indexed {i}/{total}  ok={n_ok}  error={n_err}")

        conn.commit()
    finally:
        # Release the database handle even if scanning or indexing failed.
        conn.close()

    print(f"Done. ok={n_ok}, error={n_err}")
    if backend_counts:
        print("\nBackend summary:")
        for backend, count in sorted(backend_counts.items(), key=lambda x: (-x[1], x[0])):
            print(f"  {backend:24s} {count}")
    print(f"\nDB: {db_path}")


def make_subset_condition_for_contains(target_elements: list[str]) -> tuple[str, list[Any]]:
    """
    Summary:
        Build a WHERE condition that is true when a row's ``elements`` value
        is a subset of the given element list.
    Details:
        ``elements`` is stored as a comma-joined list of element symbols. The
        condition wraps that value in commas and removes each target symbol
        with a nested SQL ``replace()`` of ",<symbol>," by ",". If only ","
        is left, every element of the row was one of the targets. For the
        targets Ba, O, Ti this matches rows such as "Ba,O", "O,Ti",
        "Ba,Ti" and "Ba,O,Ti", and rejects "Ba,O,Sr".

        A single LIKE against the joined target string is not sufficient: it
        only recognizes contiguous runs of the sorted target list, so it
        would miss "Ba,Ti" for the targets Ba, O, Ti.
    Args:
        :param target_elements: Element symbols allowed in a matching row.
        :type target_elements: list
    Returns:
        :returns: Tuple of the condition text and the parameters to bind to
            its placeholders, in order.
        :rtype: tuple
    """
    expression = "',' || elements || ','"
    params: list[Any] = []
    for symbol in target_elements:
        # Each layer strips one allowed symbol, keeping a single separator.
        expression = f"replace({expression}, ?, ',')"
        params.append(f",{symbol},")
    return f"{expression} = ','", params


def search_by_formula(
    db_path: Path,
    formula: str,
    source_db: str = "",
    match: str = "exact",
    limit: int = 100,
    output_json: int = 0,
) -> None:
    """
    Summary:
        Search the index for records matching a composition formula and print
        the hits.
    Details:
        The query formula is reduced with pymatgen's ``Composition``. Only
        rows with status "ok" are searched, optionally restricted to one
        ``source_db``. The ``match`` mode selects the comparison:

        - "exact": the reduced formula must be equal.
        - "elements": the sorted element list must be equal.
        - "contains": the condition built by
          make_subset_condition_for_contains() is applied.

        Results are ordered by element count (descending), source database,
        reduced formula and material ID, and capped at ``limit`` rows. They
        are printed as JSON when ``output_json`` is truthy, otherwise as a
        readable listing. Numeric columns that are NULL in the database are
        printed as -1 instead of raising, and the connection is closed even
        when the query fails.
    Args:
        :param db_path: Path of the SQLite database file.
        :type db_path: pathlib.Path
        :param formula: Composition formula to search for.
        :type formula: str
        :param source_db: Source database to restrict the search to (e.g.
            "cod"); an empty string searches all of them.
        :type source_db: str
        :param match: One of "exact", "elements" or "contains".
        :type match: str
        :param limit: Maximum number of rows to return.
        :type limit: int
        :param output_json: Whether to print JSON (0: listing, 1: JSON).
        :type output_json: int
    Returns:
        :returns: None
        :rtype: None
    Raises:
        :raises ValueError: If ``match`` is not a known mode.
    """
    target_comp = Composition(formula).reduced_composition
    target_formula = target_comp.reduced_formula
    target_elements = sorted(el.symbol for el in target_comp.elements)
    target_element_set = ",".join(target_elements)

    params: list[Any] = []
    where = ["status = 'ok'"]

    if source_db:
        where.append("source_db = ?")
        params.append(source_db)

    if match == "exact":
        # BaTiO3 and TiBaO3 both reduce to the same formula.
        where.append("reduced_formula = ?")
        params.append(target_formula)

    elif match == "elements":
        # The element set must be identical, e.g. only the Ba-Ti-O system.
        where.append("elements = ?")
        params.append(target_element_set)

    elif match == "contains":
        # Also accept rows whose element set is a subset of the query's.
        cond, cond_params = make_subset_condition_for_contains(target_elements)
        where.append(cond)
        params.extend(cond_params)

    else:
        raise ValueError(f"Unknown match mode: {match}")

    sql = f"""
    SELECT
        source_db, material_id, reduced_formula, formula,
        composition_json, elements, sg_symbol, sg_number, nsites,
        a, b, c, alpha, beta, gamma, volume, volume_per_atom, density,
        backend, normalized, cif_path
    FROM cif_index
    WHERE {' AND '.join(where)}
    ORDER BY nelements DESC, source_db, reduced_formula, material_id
    LIMIT ?
    """
    params.append(limit)

    # The query is fully validated and built before the database is opened.
    conn = sqlite3.connect(str(db_path))
    try:
        conn.row_factory = sqlite3.Row
        rows = [dict(r) for r in conn.execute(sql, params).fetchall()]
    finally:
        conn.close()

    if output_json:
        print(json.dumps(rows, ensure_ascii=False, indent=2))
        return

    def _num(value: Any) -> str:
        # A column can be NULL (None); dict.get() defaults do not cover that
        # case because the key is present, so map None to -1 explicitly.
        return f"{-1 if value is None else value:.6g}"

    print(f"query formula   : {formula}")
    print(f"reduced formula : {target_formula}")
    print(f"elements        : {target_element_set}")
    print(f"match           : {match}")
    print(f"hits            : {len(rows)}")
    print()

    for r in rows:
        print(
            f"[{r['source_db']}] {r['material_id']}  "
            f"{r['reduced_formula']}  "
            f"elements={r['elements']}  "
            f"SG={r['sg_symbol']}({r['sg_number']})  "
            f"nsites={r['nsites']}  "
            f"backend={r.get('backend') or ''}"
        )
        print(f"  a,b,c = {_num(r['a'])}, {_num(r['b'])}, {_num(r['c'])}")
        print(
            f"  V/atom = {_num(r.get('volume_per_atom'))} A^3, "
            f"density = {_num(r.get('density'))} g/cm^3"
        )
        print(f"  path   = {r['cif_path']}")
        print()


def show_info(db_path: Path) -> None:
    """
    Summary:
        Print record counts for the index database.
    Details:
        Prints the number of rows per status, then per source database and
        status. When the table has a ``backend`` column the counts per
        backend and status are printed, and when it has a ``normalized``
        column the counts per normalized flag and status are printed. The
        connection is closed even if one of the queries fails, for example
        when the table does not exist.
    Args:
        :param db_path: Path of the SQLite database file.
        :type db_path: pathlib.Path
    Returns:
        :returns: None
        :rtype: None
    """
    total_sql = """
        SELECT status, COUNT(*) FROM cif_index GROUP BY status
    """
    source_sql = """
        SELECT source_db, status, COUNT(*)
        FROM cif_index
        GROUP BY source_db, status
        ORDER BY source_db, status
    """
    backend_sql = """
            SELECT backend, status, COUNT(*)
            FROM cif_index
            GROUP BY backend, status
            ORDER BY status, backend
        """
    normalized_sql = """
            SELECT normalized, status, COUNT(*)
            FROM cif_index
            GROUP BY normalized, status
            ORDER BY normalized, status
        """

    conn = sqlite3.connect(str(db_path))
    try:
        cur = conn.cursor()

        print("Total:")
        for row in cur.execute(total_sql):
            print(f"  {row[0]}: {row[1]}")

        print("\nBy source_db:")
        for row in cur.execute(source_sql):
            print(f"  {row[0]:8s} {row[1]:8s} {row[2]}")

        # The table may lack these columns, so check before querying them.
        existing_cols = get_existing_columns(conn, "cif_index")
        if "backend" in existing_cols:
            print("\nBy backend:")
            for row in cur.execute(backend_sql):
                backend = row[0] if row[0] else "(none)"
                print(f"  {backend:24s} {row[1]:8s} {row[2]}")

        if "normalized" in existing_cols:
            print("\nNormalized:")
            for row in cur.execute(normalized_sql):
                print(f"  normalized={row[0]} {row[1]:8s} {row[2]}")
    finally:
        conn.close()


def main() -> None:
    """
    Summary:
        Command-line entry point.
    Details:
        Parses the command-line options and runs the selected mode:
        "index" builds the index with build_index(), "search" queries it with
        search_by_formula(), and "info" prints statistics with show_info().
        In search mode a missing ``--formula`` prints an error message and
        exits with status 1.
    Args:
        None
    Returns:
        :returns: None
        :rtype: None
    """
    parser = argparse.ArgumentParser()

    # Mode selection.
    parser.add_argument(
        "--mode",
        type=str,
        default="search",
        choices=["index", "search", "info"],
    )

    # Locations.
    parser.add_argument("--root", type=str, default=".")
    parser.add_argument("--db", type=str, default="cif_index.sqlite")
    parser.add_argument("--pattern", type=str, default="*.cif")

    # Search options.
    match_help = (
        "exact: 還元組成一致, "
        "elements: 元素集合一致, "
        "contains: 指定元素集合の部分集合も許す"
    )
    parser.add_argument("--formula", type=str, default="")
    parser.add_argument(
        "--match",
        type=str,
        default="exact",
        choices=["exact", "elements", "contains"],
        help=match_help,
    )
    parser.add_argument(
        "--source-db",
        type=str,
        default="",
        help="cod, tcod, pcod など。空なら全DB検索。",
    )

    # Flags and limits.
    parser.add_argument("--store-errors", type=int, default=1, choices=[0, 1])
    parser.add_argument("--json", type=int, default=0, choices=[0, 1])
    parser.add_argument("--limit", type=int, default=100)

    args = parser.parse_args()
    mode = args.mode
    db_path = Path(args.db)

    if mode == "info":
        show_info(db_path)
        return

    if mode == "index":
        build_index(
            root=Path(args.root),
            db_path=db_path,
            pattern=args.pattern,
            store_errors=args.store_errors,
        )
        return

    if mode == "search":
        if not args.formula:
            print("ERROR: --formula is required for --mode search")
            sys.exit(1)

        search_by_formula(
            db_path=db_path,
            formula=args.formula,
            source_db=args.source_db,
            match=args.match,
            limit=args.limit,
            output_json=args.json,
        )

# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()