tklib.tkxml のソースコード

"""
概要: XMLデータの解析、変換、操作を支援するユーティリティ機能を提供します。

詳細説明:
このモジュールは、Pythonの標準ライブラリであるxml.etree.ElementTreeやxml.dom.minidomに加え、
外部ライブラリのxmltodict、dicttoxml、numpyを利用して、XMLファイルとPythonの辞書構造間の
相互変換、XML要素の再帰的な解析、特定のセクション情報の抽出、整形されたXMLの書き出しなど、
XML操作に関する幅広い機能を提供します。
ATLAS最適化XMLやVASP計算結果XML (vasprun.xml) などの特定フォーマットのXML解析例も含まれています。

関連リンク:
tkxml_usage
"""
import os
import sys
import xml.etree.ElementTree as ET
import xml.dom.minidom as md
import numpy as np

import xmltodict
import dicttoxml


# Input file used by main(): the first command-line argument when one is
# given, otherwise the bundled default name.
nargs = len(sys.argv)
infile = sys.argv[1] if nargs > 1 else 'opt_2onl.xml'



[ドキュメント]
def get_elements_recursive(element):
    """
    Collect information about the descendants of an XML element.

    A leaf element (one without child elements) yields its own text.
    Otherwise every direct child is described by a dict with the keys
    ``"tag"``, ``"text"`` and ``"attrib"``. A ``"child"`` key holding the
    recursive result for that child is added unless the result is a plain
    string, i.e. unless the child is a leaf that carries text. A leaf child
    without text therefore still gets ``"child": None``.

    :param element: Element whose children are inspected.
    :type element: xml.etree.ElementTree.Element
    :returns: ``element.text`` (possibly ``None``) for a leaf element,
        otherwise a list with one dict per direct child.
    :rtype: str or None or list[dict]
    """
    # Only the absence of child elements makes this a leaf. The text of a
    # parent is None whenever its first child directly follows the start tag
    # (e.g. ``<a><b/></a>``), so it must not decide whether we descend.
    if len(element) == 0:
        return element.text

    inf = []
    for child_element in element:
        entry = {
            "tag": child_element.tag,
            "text": child_element.text,
            "attrib": child_element.attrib,
        }
        child_inf = get_elements_recursive(child_element)
        if not isinstance(child_inf, str):
            entry["child"] = child_inf
        inf.append(entry)

    return inf



[ドキュメント]
def get_unique_key(d, section, val = None):
    """
    Build a key that is not yet present in ``d``.

    The first candidate is ``section`` (or ``"section:val"`` when ``val`` is
    given). If that key already exists, ``section[i]`` (or
    ``"section[i]:val"``) is tried for ``i = 0, 1, 2, ...`` until an unused
    key is found.

    :param d: Mapping whose existing keys must be avoided.
    :type d: dict
    :param section: Base name of the key.
    :type section: str
    :param val: Optional suffix appended after a colon.
    :type val: str or None
    :returns: A key that is not contained in ``d``.
    :rtype: str
    """
    key = section if val is None else f"{section}:{val}"
    # Membership test rather than ``d.get(key) is None``: a key that exists
    # with the value None is still taken.
    if key not in d:
        return key

    i = 0
    while True:
        key = f"{section}[{i}]" if val is None else f"{section}[{i}]:{val}"
        if key not in d:
            return key
        i += 1



[ドキュメント]
def get_section_inf(parent, sections = None, inf = None, params = None, ret_type = 'list', add_parent_params = True):
    """
    Collect information about the elements at the end of a section path.

    Starting at ``parent``, every entry of ``sections`` except the last one is
    resolved with ``find()``; the attributes of those intermediate elements
    are gathered into a parameter dict. All elements matching the last entry
    (``findall()``) are then recorded:

    * an element with attributes is recorded as a dict of its attributes and,
      for ``ret_type='dict'``, stored under the value of its first attribute;
    * an element without attributes is recorded as the result of
      ``get_elements_recursive`` and, for ``ret_type='dict'``, stored under
      the key ``"list"``.

    With ``ret_type='dict'`` entries that map to the same key overwrite each
    other.

    :param parent: Element at which the search starts.
    :type parent: xml.etree.ElementTree.Element
    :param sections: Path expressions, one per level. ``None`` or empty
        returns the (copied) ``inf`` unchanged.
    :type sections: list[str] or None
    :param inf: Existing container to extend; a shallow copy is used.
    :type inf: list or dict or None
    :param params: Parameters inherited from outer levels; copied, not mutated.
    :type params: dict or None
    :param ret_type: ``'list'`` for a list result, anything else for a dict.
    :type ret_type: str
    :param add_parent_params: When true, the collected parent parameters are
        appended to the list, or stored under the key ``"params"``.
    :type add_parent_params: bool
    :returns: The collected information, or ``None`` when an intermediate
        section cannot be found.
    :rtype: list or dict or None
    """
    if inf is not None:
        inf_child = inf.copy()
    elif ret_type == 'list':
        inf_child = []
    else:
        inf_child = {}

    if not sections:
        return inf_child

    params_child = {} if params is None else params.copy()

    # Walk down every level but the last, remembering the attributes seen.
    section_list = parent
    for name in sections[:-1]:
        section_list = section_list.find(name)
        if section_list is None:
            return None

        for attr_name, attr_val in section_list.items():
            key = get_unique_key(params_child, name, attr_name)
            params_child[key] = attr_val

    for parameter in section_list.findall(sections[-1]):
        attrib = parameter.attrib
        if attrib:
            # Read the first attribute through the attrib dict: Element.keys()
            # is only subscriptable with the C accelerator, the pure-Python
            # implementation returns a dict view.
            pkey = attrib[next(iter(attrib))]
            _inf = dict(attrib)
        else:
            pkey = "list"
            _inf = get_elements_recursive(parameter)

        if ret_type == 'list':
            inf_child.append(_inf)
        else:
            inf_child[pkey] = _inf

    if add_parent_params:
        if ret_type == 'list':
            inf_child.append(params_child)
        else:
            inf_child["params"] = params_child

    return inf_child



[ドキュメント]
def get_section_inf_all(parent, sections = None, section_parent = 'root', inf = None,
                level = 0, params = None, pkey = None, ret_type = 'list', last_node_only = True):
    """
    Recursively collect element information along a section path.

    At each level all children of ``parent`` matching ``sections[0]`` are
    visited (``findall()``) and the function recurses into each of them with
    the remaining path. The attributes of every element passed on the way
    down are accumulated into a parameter dict that is attached to each
    recorded node.

    A recorded node is a dict with the keys ``"level"``, ``"tag"``,
    ``"text"``, ``"attrib"`` and ``"params"``. Nodes are recorded at the last
    level only, or at every level when ``last_node_only`` is false.

    With ``ret_type='dict'`` a node is stored under a key derived from its
    attributes: the value of attribute ``pkey`` (or of the first attribute
    when ``pkey`` is ``None``), made unique with ``get_unique_key``. If the
    node lacks the ``pkey`` attribute, the *name* of its first attribute is
    used instead. Nodes without any attribute are stored under ``"list"``;
    that key is not made unique, so later ones replace earlier ones.

    :param parent: Element at which the search starts.
    :type parent: xml.etree.ElementTree.Element
    :param sections: Path expressions, one per level.
    :type sections: list[str] or None
    :param section_parent: Name used as key prefix for the attributes of
        ``parent``.
    :type section_parent: str
    :param inf: Container that receives the nodes; it is extended in place
        and also returned. Created when ``None``.
    :type inf: list or dict or None
    :param level: Current depth, stored in each recorded node.
    :type level: int
    :param params: Parameters inherited from outer levels; copied, not mutated.
    :type params: dict or None
    :param pkey: Attribute name used to derive dict keys (``ret_type='dict'``).
    :type pkey: str or None
    :param ret_type: ``'list'`` for a list result, anything else for a dict.
    :type ret_type: str
    :param last_node_only: Record only nodes matching the last path entry.
    :type last_node_only: bool
    :returns: The container holding all recorded nodes.
    :rtype: list or dict
    """
    if inf is None:
        inf = [] if ret_type == 'list' else {}

    if not sections:
        return inf

    params_child = {} if params is None else params.copy()
    for attr_name, attr_val in parent.items():
        key = get_unique_key(params_child, section_parent, attr_name)
        params_child[key] = attr_val

    is_last = len(sections) == 1
    for section in parent.findall(sections[0]):
        if not last_node_only or is_last:
            attrib = section.attrib
            _inf = {"level": level, "tag": section.tag, "text": section.text,
                    "attrib": attrib, "params": params_child}

            if ret_type == 'list':
                inf.append(_inf)
            else:
                if not attrib:
                    _pkey = 'list'
                else:
                    # First attribute name taken from the attrib dict:
                    # Element.keys() is not subscriptable in the pure-Python
                    # ElementTree implementation.
                    first_attr = next(iter(attrib))
                    key_attr = first_attr if pkey is None else pkey
                    _pkey = get_unique_key(inf, section.get(key_attr, first_attr), None)
                inf[_pkey] = _inf

        # Descending with an empty remaining path would return immediately,
        # so only recurse while there is something left to match.
        if not is_last:
            get_section_inf_all(section, sections = sections[1:], section_parent = sections[0],
                                inf = inf, level = level + 1, params = params_child,
                                pkey = pkey, ret_type = ret_type, last_node_only = last_node_only)

    return inf



[ドキュメント]
def xml2dict(xml):
    """
    Convert XML text into a dictionary structure.

    The input is handed unchanged to ``xmltodict.parse`` and its result is
    returned as is.

    NOTE(review): the original summary also mentioned ElementTree elements
    as input; nothing in this function converts an Element to text, so that
    should be confirmed against xmltodict before relying on it.

    :param xml: XML document to convert.
    :type xml: str
    :returns: The structure produced by ``xmltodict.parse``.
    :rtype: dict
    """
    parsed = xmltodict.parse(xml)
    return parsed


def _element_to_dict(element):
    """
    概要: xml.etree.ElementTreeのElementオブジェクトを再帰的に辞書に変換します。

    詳細説明:
    要素の属性は '@attributes' キーの下に、テキスト内容は '#text' キーの下に格納されます。
    同じタグ名の子要素が複数ある場合はリストとして扱われます。
    これはxmltodictの代替または補助的な役割を果たす内部関数です。

    引数:
        :param element: 変換するxml.etree.ElementTree.Elementオブジェクト。
        :type element: xml.etree.ElementTree.Element
    戻り値:
        :returns: 変換された辞書。
        :rtype: dict
    """
    result = {}

    # 要素の属性を処理
    if element.attrib:
        result['@attributes'] = element.attrib

    # 要素のテキスト内容を処理
    if element.text and element.text.strip():
        result['#text'] = element.text.strip()

    # 子要素を処理
    for child in element:
        child_dict = _element_to_dict(child)
        if child.tag in result:
            # 同じタグ名の子要素が複数ある場合、リストに追加
            if not isinstance(result[child.tag], list):
                result[child.tag] = [result[child.tag]]
            result[child.tag].append(child_dict)
        else:
            # 新しいタグ名の子要素の場合
            result[child.tag] = child_dict
    return result


[ドキュメント]
def file2dict(infile):
    """
    Read an XML file and convert its content into a dictionary.

    The file is opened in text mode with the platform's default encoding,
    read completely, and the text is passed to ``xml2dict``.

    :param infile: Path of the XML file to read.
    :type infile: str
    :returns: The dictionary produced by ``xml2dict``.
    :rtype: dict
    """
    with open(infile) as stream:
        content = stream.read()

    converted = xml2dict(content)
    return converted



[ドキュメント]
def dict2xml(d, attr_type = True, root = True):
    """
    Serialize a dictionary to XML with the ``dicttoxml`` library.

    Both options are forwarded unchanged to ``dicttoxml.dicttoxml``.

    :param d: Dictionary to serialize.
    :type d: dict
    :param attr_type: Forwarded as ``attr_type``; per the dicttoxml
        documentation this controls whether each element carries a ``type``
        attribute describing the Python type of its value.
    :type attr_type: bool
    :param root: Forwarded as ``root``; per the dicttoxml documentation this
        controls whether the output is wrapped in a root element.
    :type root: bool
    :returns: The serialized XML as returned by ``dicttoxml.dicttoxml``.
    :rtype: bytes
    """
    options = {"attr_type": attr_type, "root": root}
    xml_bytes = dicttoxml.dicttoxml(d, **options)
    return xml_bytes



[ドキュメント]
def to_xml(outfile, element, encoding = 'utf-8', newl = '', indent = '', addindent = '    ',
            xml_declaration = True, use_minidom = False):
    """
    Write an XML element tree to a file.

    Without ``use_minidom`` the tree is written directly with
    ``xml.etree.ElementTree``. With ``use_minidom`` the element is serialized,
    re-parsed with ``xml.dom.minidom`` and written through ``writexml`` so
    that ``newl``, ``indent`` and ``addindent`` can shape the layout.

    :param outfile: Path of the file to write.
    :type outfile: str
    :param element: Root element to write.
    :type element: xml.etree.ElementTree.Element
    :param encoding: Encoding of the output. In the minidom branch it is also
        used to open the file, so it must be a codec name ``open()`` accepts.
    :type encoding: str
    :param newl: Line terminator used by minidom.
    :type newl: str
    :param indent: Indentation of the top-level node (minidom only).
    :type indent: str
    :param addindent: Additional indentation per nesting level (minidom only).
    :type addindent: str
    :param xml_declaration: Whether to emit the ``<?xml ...?>`` declaration.
        Honoured by both branches.
    :type xml_declaration: bool
    :param use_minidom: Use ``xml.dom.minidom`` for formatted output.
    :type use_minidom: bool
    :returns: None
    :rtype: None
    """
    if not use_minidom:
        tree = ET.ElementTree(element)
        tree.write(outfile, encoding = encoding, xml_declaration = xml_declaration)
        return

    doc = md.parseString(ET.tostring(element, encoding = encoding))
    # Open the file with the encoding announced in the declaration; relying on
    # the locale default could write bytes that contradict it. The context
    # manager closes the file even if writing fails.
    with open(outfile, 'w', encoding = encoding) as fp:
        if xml_declaration:
            doc.writexml(fp, encoding = encoding, newl = newl, indent = indent, addindent = addindent)
        else:
            # Document.writexml always emits the declaration, so write the
            # document's child nodes directly when it is not wanted.
            for node in doc.childNodes:
                node.writexml(fp, indent, addindent, newl)



[ドキュメント]
def get_attrib(element):
    """
    Return the tag, text and attribute dict of an XML element.

    :param element: Element to inspect.
    :type element: xml.etree.ElementTree.Element
    :returns: ``(tag, text, attrib)`` exactly as stored on the element;
        ``text`` may be ``None``.
    :rtype: tuple
    """
    tag = element.tag
    text = element.text
    attrib = element.attrib
    return tag, text, attrib

    

[ドキュメント]
def get_root(xml):
    """
    Return the root element of an XML source.

    When ``xml`` names an existing file, that file is parsed; otherwise the
    argument itself is parsed as XML text.

    :param xml: Path of an XML file, or XML text.
    :type xml: str
    :returns: Root element of the parsed document.
    :rtype: xml.etree.ElementTree.Element
    """
    if not os.path.isfile(xml):
        # Not an existing file: treat the argument as the document itself.
        return ET.fromstring(xml)

    return ET.parse(xml).getroot()




[ドキュメント]
def main():
    """
    Command-line entry point: inspect an XML file and print what was found.

    The file named by the module-level ``infile`` is parsed. If it contains a
    ``parameter-list`` element it is treated as an ATLAS optimization XML and
    the parameter, setting and target sections are printed. If it contains an
    ``incar`` element it is treated as a VASP ``vasprun.xml`` and the INCAR
    entries are printed. Any other document prints an error message and
    terminates the process with exit status 1.

    :returns: None
    :rtype: None
    """
    print()
    print(f"infile: {infile}")

    xml_root = get_root(infile)

    # find() returns an Element or None. Compare with None explicitly: the
    # truth value of an Element reflects whether it has children, so a
    # matching but childless element would otherwise be treated as absent.
    if xml_root.find('.//parameter-list') is not None:
        print()
        print("ATLAS optimization XML:")
        atlas_sections = (
            ("parameter_inf", ['parameter-list', 'parameter']),
            ("setting_inf", ['settings', 'setting']),
            ("target_inf", ['target-list', 'target']),
        )
        for label, path in atlas_sections:
            section_inf = get_section_inf_all(xml_root, path, ret_type = 'dict')
            print(f"{label}:")
            for key, val in section_inf.items():
                print(f"  {key}: {val}")
    elif xml_root.find('.//incar') is not None:
        print()
        print("vasprun.xml:")
        # The band information is collected but only its heading is printed.
        band_inf = get_section_inf_all(xml_root, [".//projected_kpoints_opt", ".//eigenvalues", './/set[@comment]', './/set[@comment]', './/r'])
        print("band_inf:")
        incar_inf = get_section_inf_all(xml_root, [".//incar", "*"], pkey = 'name', ret_type = 'dict', last_node_only = False)
        for key, val in incar_inf.items():
            print(f"  {key}: {val}")
    else:
        print()
        print("Error: Invalid XML type")
        # sys.exit instead of the interactive-only exit() helper; a non-zero
        # status tells the caller that the input was rejected.
        sys.exit(1)



if __name__ == '__main__':
    main()