# Source code for tklib.tkcrystal.tkcif

"""
CIFファイルを管理するためのパッケージ。

CIF (Crystallographic Information File) 形式のファイルを読み込み、解析し、
結晶構造データとして操作するためのクラスと関数を提供します。
ファイルからのデータの抽出、キーワードと値のペアの解析、ループデータの処理などをサポートします。

関連リンク:
    :doc:`tkcif_usage`
"""
import sys
import shlex
import re
from pprint import pprint


from tklib.tkobject import tkObject
from tklib.tkcrystal.tkcifobject import tkCIFObject, tkCIFData
from tklib.tkcrystal.tkcrystal import tkCrystal
from tklib.tkutils import SplitFilePath, split_quoted_args
from tklib.tkfile import tkFile
import tklib.tkre as tkre


#=====================================
# CIF manager class
#=====================================
# Status codes returned by the tkCIF parsing methods
# (GetNextValuesOne, GetNextValues, GetNextKey) to tell the caller what kind
# of item was just read.
end_of_cifdata = 0    # no more lines, or an '#End of' marker was reached
header    = 1         # a 'data_...' header line
comment   = 2         # a line starting with '#'
singlekey = 3         # one keyword with one value (possibly a multi-line text field)
multikeys = 4         # several values read for one row
loopkeys  = 5         # the rows of a 'loop_' block
other     = 999       # a line that matched none of the cases above


# [docs]
class tkCIF(tkCIFObject):
    """
    Manager class for CIF (Crystallographic Information File) files.

    Provides reading, parsing and data-extraction of CIF files.
    Inherits from tkCIFObject and adds file I/O and handling of the CIF
    data structure (headers, comments, keyword/value pairs, loop_ blocks).
    """

    def __init__(self, path = None, mode = 'r', **args):
        """
        Initialize the tkCIF object.

        Calls the base-class initializer, resets the instance attributes to
        their defaults, stores *path* / *mode* and forwards the remaining
        keyword arguments to ``self.update``. If a path is set afterwards,
        the file is opened with ``self.Open``.

        :param path: str, optional Path of the CIF file. Defaults to None.
        :param mode: str, optional Open mode (e.g. 'r', 'w'). Defaults to 'r'.
        :param args: dict Additional keyword arguments passed to the base
                     initializer and to ``self.update``.
        """
        super().__init__(path, mode, **args)

        # Defaults, assigned after the base initializer has run.
        self.debug = 0
        self.cifdata_list = []
        self.fp = None
        self.path = None
        self.mode = 'r'

        # Explicit arguments win over the defaults set just above.
        # (IfYes is inherited; presumably it returns its 2nd argument when the
        # condition is true and the 3rd otherwise - confirm in the base class.)
        self.path = self.IfYes(path is not None, path, self.path)
        self.mode = self.IfYes(mode is not None, mode, self.mode)
        self.update(**args)

        if self.path is None:
            return
        self.Open(self.path, self.mode)

    def __del__(self):
        """
        Finalizer of the tkCIF object.

        Calls ``self.Close()`` first and then delegates to the base-class
        ``__del__``.
        """
        self.Close()
        super().__del__()

    def __str__(self):
        """
        Return the string representation of this object.

        :returns: The value returned by ``self.ClassPath()``.
        """
        class_path = self.ClassPath()
        return class_path



# [docs]
    def splitline(self, lines, i):
        """
        Split ``lines[i]`` into a keyword and a value.

        The keyword is the first whitespace-delimited token; the value is the
        rest of the line with surrounding whitespace removed. A line that
        holds a single token (or nothing) yields that stripped text as the
        keyword and an empty value.

        :param lines: list[str] Lines to read from.
        :param i: int Index of the line to split.
        :returns: tuple ``(line, keyword, value)`` where *line* is the
                  unmodified ``lines[i]``; ``(None, '', '')`` when *i* is at
                  or past the end of *lines*.
        """
        if i >= len(lines):
            return None, '', ''

        text = lines[i]
        found = re.match(r"\s*(\S.*?)\s+(\S.*?)\s*$", text)
        if found is None:
            # Zero or one token on the line: there is no value part.
            return text, text.strip(), ''

        keyword, value = found.group(1), found.group(2)
        return text, keyword, value




# [docs]
    def GetNextValuesOne(self, lines, nrequired, i):
        """
        Extract the CIF value(s) that start at ``lines[i]``.

        Three cases are handled:

        * *i* is past the end of *lines*: nothing is read.
        * The line does not start a ``;`` text field: with ``nrequired == 1``
          the whole stripped line is the single value, otherwise the line is
          tokenized with ``split_quoted_args``.
        * The line starts a ``;`` text field: following lines are accumulated
          until a line matching ``;`` closes the field. If the data ends
          before the closing ``;`` the text collected so far is returned
          (the original code raised AttributeError in that situation).

        :param lines: list[str] Lines to parse.
        :param nrequired: int Number of values the caller expects.
        :param i: int Index of the line to start from.
        :returns: tuple ``(next_index, status, text, values)``. *status* is
                  ``end_of_cifdata``, ``singlekey`` or ``multikeys``;
                  *values* is a list of strings and may be empty.
        """
        l, key, val = self.splitline(lines, i)

        # Past the last line: nothing left to read.
        if l is None:
            return i, end_of_cifdata, '', []

        l0 = l.strip()
        # NOTE(review): tkre.Match is a project helper; the code below relies
        # on it returning something falsy on no match and an indexable result
        # whose element [1] is the first capture group - confirm in tklib.tkre.
        opened = tkre.Match(r';\s*(.*)\s*', l0)

        # Ordinary (single-line) value.
        if not opened:
            if nrequired == 1:
                self.dprint(1, "GNVOne(single): cA={}  nreq={}  l={}, key={}".format(i, nrequired, l0, key))
                return i+1, singlekey, l0, [l0]
            self.dprint(1, "GNVOne(mult nreq): cB={}  nreq={}  l={}, key={}".format(i, nrequired, l0, key))
            kws = split_quoted_args(l0)
            return i+1, multikeys, l0, kws

        # Multi-line text field: text on the opening ';' line is kept as the
        # beginning of the value.
        s = opened[1] if len(opened) >= 2 else ''
        while 1:
            i += 1
            l, key, val = self.splitline(lines, i)

            # Data ended before the closing ';'. This must be tested before
            # touching the line, because l is None here.
            if l is None:
                s0 = s.strip()
                return i, singlekey, s, ([] if s0 == '' else [s0])

            l0 = l.strip()
            self.dprint(1, "GNVOne(mult): cC={}  l={}, key={}".format(i, l0, key))

            # Closing ';' of the text field.
            if tkre.Match(';', l0):
                s0 = s.strip()
                return i+1, singlekey, s, ([] if s0 == '' else [s0])

            # Keep the raw line (including its line ending) in the text.
            s += l




# [docs]
    def GetNextValues(self, lines, nrequired, i):
        """
        Collect CIF values over as many lines as needed.

        Calls ``GetNextValuesOne`` repeatedly, starting at ``lines[i]``, and
        accumulates the values it returns until at least *nrequired* values
        have been gathered or the data ends.

        :param lines: list[str] Lines to parse.
        :param nrequired: int Number of values wanted.
        :param i: int Index of the first line to read.
        :returns: tuple ``(next_index, status, text, values)``. *text* is the
                  concatenation of the text pieces returned by
                  ``GetNextValuesOne``. *status* is ``multikeys``, or
                  ``end_of_cifdata`` when the data ended and the last read
                  produced no value.
        """
        gathered = []
        joined = ''
        total = 0
        pos = i
        while True:
            pos, status, piece, found = self.GetNextValuesOne(lines, nrequired, pos)
            nfound = len(found)
            total += nfound
            gathered.extend(found)
            joined += piece
            self.dprint(1, "GNV: c={}  count={}  type2={}  values={}".format(pos, total, status, found))

            # Enough values collected.
            if total >= nrequired:
                return pos, multikeys, joined, gathered

            if status != end_of_cifdata:
                continue

            # Data ended before the requested number of values was reached.
            final = end_of_cifdata if nfound == 0 else multikeys
            return pos, final, joined, gathered




# [docs]
    def GetNextKey(self, lines, i):
        """
        Read the next CIF item (header, comment, keyword or loop) from *lines*.

        Blank lines are skipped. The first non-blank line at or after index
        *i* decides what is returned:

        - ``#End of ...``: ``end_of_cifdata`` with an empty list.
        - other ``#`` line: ``comment`` with ``['# comment text', '']``.
        - ``data_...``: ``header`` with ``['header', 'data_name']``.
        - ``_keyword [value]``: ``singlekey`` with ``['keyword', 'value']``.
          When the value is not on the keyword line it is read from the
          following line(s) via ``GetNextValues``; if no value can be read
          the value is ``''``.
        - ``loop_``: ``loopkeys`` with one entry per data row,
          ``[[['kw1', 'val1a'], ['kw2', 'val2a']], [['kw1', 'val1b'], ...], ...]``.
        - anything else: an error message is printed and ``other`` is
          returned with ``[line, '']``.

        :param lines: list[str] Lines to parse.
        :param i: int Index of the line to start from.
        :returns: tuple ``(next_index, status, keyinf)``.
        """
        nlines = len(lines)
        if i >= nlines:
            return i, end_of_cifdata, []

        c = i
        while 1:
            l, key, val = self.splitline(lines, c)
            if l is None:
                break

            self.dprint(1, "GNK: c={}, nlines={}: key=[{}]  val=[{}]".format(c, len(lines), key, val))

            l0 = l.strip()
            # Blank line: skip.
            if l0 == '':
                c += 1
                continue
            # Explicit end-of-data marker.
            if re.match(r'#End of', l0):
                return c+1, end_of_cifdata, []
            # Comment line.
            if re.match('#', l0):
                return c+1, comment, [l0, '']
            # Data block header.
            if re.match('data_', key):
                return c+1, header, ['header', l0]

            # Single keyword starting with '_'.
            if re.match('_', key):
                if val != '':
                    return c+1, singlekey, [key, val]
                # Value is on the following line(s), e.g. a ';' text field.
                c += 1
                c, type2, l2, values = self.GetNextValues(lines, 1, c)
                # The data may end right after the keyword; use an empty
                # value instead of failing on values[0].
                value = values[0] if values else ''
                self.dprint(1, "GNK: c3={}  type={}  key={}  val={}".format(c, type2, key, value))
                return c, singlekey, [key, value]

            # Keyword 'loop_'.
            if key == 'loop_':
                self.dprint(1, "\n")
                self.dprint(1, "======================= loop")
                self.dprint(1, "  ** Read keywords")

                # Collect the column keywords (lines starting with '_').
                keywords = []
                c += 1
                while c < nlines:
                    l, key, val = self.splitline(lines, c)
                    self.dprint(1, "GNK: c5={}  key={}  val={}".format(c, key, val))
                    l0 = l.strip()
                    if l0 == '':
                        c += 1
                        continue
                    if re.match(r"\s*#End of", l):
                        # Step over the marker exactly once (the original
                        # advanced the index twice here).
                        c += 1
                        break
                    if re.match(r"\s*#", l):
                        c += 1
                        continue
                    if re.match(r"\s*_", key):
                        keywords.append(key)
                        c += 1
                        continue
                    # First non-keyword line: the data rows start here.
                    break

                # Collect rows until a new '_keyword' or 'loop_' is found.
                nrequired = len(keywords)
                keyvalues = []
                nkeyvalues = 0

                self.dprint(1, "\n")
                self.dprint(1, "  ** Read values (c={}  nlines={}  nrequired={})"
                    .format(c, nlines, nrequired))
                while c < nlines:
                    self.dprint(1, "**check c={} {}".format(c, lines[c]))
                    l0 = lines[c].strip()
                    if l0 == '':
                        c += 1
                        continue
                    if re.match(r"\s*#End of", lines[c]):
                        c += 1
                        break
                    if re.match(r"\s*#", lines[c]):
                        c += 1
                        continue

                    # Test the stripped line so that an indented keyword or
                    # 'loop_' also terminates the rows of this loop.
                    if re.match("_", l0) or re.match("loop_", l0):
                        break

                    self.dprint(1, "GNK: for GetNextValues: nreq=", nrequired)
                    c, type2, l, values = self.GetNextValues(lines, nrequired, c)
                    self.dprint(1, "     c4={}  type2={}  key={}  val={}  l={}"
                            .format(c, type2, key, values, l.strip()))

                    # Data ended before a complete row could be read.
                    if type2 == end_of_cifdata:
                        break

                    nval = len(values)
                    self.dprint(1, "\n")
                    self.dprint(1, "GNK: nrequired={}  nvalues={}  nkeyvalues={}"
                            .format(nrequired, nval, nkeyvalues))
                    self.dprint(1, "  values=", values)
                    # Pair each column keyword with the value in the same
                    # position; surplus values on the row are ignored.
                    keyval = []
                    for keyword, value in zip(keywords, values):
                        self.dprint(1, "GNK: build keyval: [{}]::[{}]"
                            .format(keyword, value))
                        keyval.append([keyword, value])
                    keyvalues.append(keyval)
                    nkeyvalues += 1

                self.dprint(1, "\n*****************GNK: loop_values break")
                self.dprint(1, "keyvalues:")
                self.dpprint(1, keyvalues)
                self.dprint(1, "")
                return c, loopkeys, keyvalues

            # Anything else is not a valid start of a CIF item.
            print(f"Error in tkCIF.GetNextKey(): Invalid key [{l}]")
            return c+1, other, [l, '']

        return c, end_of_cifdata, []



# [docs]
    def StringsToCIF(self, lines):
        """
        Convert the lines of one CIF data block into a dictionary.

        The lines are parsed item by item with ``GetNextKey``:

        - header:  stored as ``cifinf['header']``.
        - comment: stored as ``cifinf['comment[N]']`` with N counted from 1.
        - keyword: stored as ``cifinf['_keyword']``.
        - loop:    every row is stored as ``cifinf['_keyword[R]']`` with the
                   row index R counted from 0. A loop without rows adds no
                   entry and parsing continues with the next item.
        - other:   the unrecognized line is stored as a key with value ``''``.

        :param lines: list[str] or None. Lines of the CIF data block.
        :returns: dict The parsed CIF information; empty when *lines* is
                  None or empty.
        """
        cifinf = {}
        # ReadNextCIF returns None when there is no further block.
        if not lines:
            return cifinf

        i = 0
        icomment = 0
        while 1:
            self.dprint(1, "S2CIF: i next=", i, "  len(lines)=", len(lines))
            i_next, kind, keyinf = self.GetNextKey(lines, i)

            self.dprint(1, "S2CIF: i={} => {}  type=[{}] key=[{}]\n".format(i, i_next, kind, keyinf))

            if kind == end_of_cifdata:
                return cifinf

            self.dprint(1, "keyinf:")
            self.dpprint(1, keyinf)
            self.dprint(1, "")

            if kind == comment:
                self.dprint(1, "comment {} [{}]".format(icomment, keyinf[0]))
                cifinf["comment[{}]".format(icomment+1)] = keyinf[0]
                icomment += 1
            elif kind == loopkeys:
                # An empty loop simply contributes nothing; it must not stop
                # the parsing of the items that follow it.
                for idata, row in enumerate(keyinf):
                    for keyword, value in row:
                        cifinf[keyword + "[{}]".format(idata)] = value
            elif len(keyinf) >= 2:
                cifinf[keyinf[0]] = keyinf[1]
            i = i_next




# [docs]
    def ReadNextCIF(self, path = None, print_level = 0):
        """
        Read the next data block of the CIF file and return its lines.

        Lines are read with ``self.ReadLine()`` until one of the following:

        * a line starting with ``#End`` (included in the result),
        * a second line starting with ``data_`` (not consumed: the file
          position is restored with ``self.Seek`` so the next call starts
          there),
        * the end of the file.

        Empty lines before the first content line are skipped. When *path*
        is given, that file is opened first and closed again afterwards;
        otherwise reading continues from the current file position.

        :param path: str, optional File to open before reading.
        :param print_level: int, optional Accepted but not used by this method.
        :returns: list[str] Lines of the block, or None when nothing could
                  be read.
        """
        if path:
            self.path = path
            # NOTE(review): other methods of this class compare the result of
            # Open() with None rather than 0 - confirm which value signals
            # failure.
            if self.Open(self.path) == 0:
                return None

        block = []
        result = None
        skipping_blank = True
        nheaders = 0
        while True:
            # Remember where this line starts so it can be "un-read".
            start = self.Tell()
            text = self.ReadLine()

            # End of file.
            if not text:
                if len(block) != 0:
                    result = block
                break

            # Leading empty lines are ignored.
            if skipping_blank and re.match(r'[\r\n]$', text):
                continue
            skipping_blank = False

            # Explicit end marker: keep it and stop.
            if re.match("#End", text):
                block.append(text)
                result = block
                break

            # A second 'data_' header belongs to the next block.
            if re.match("data_", text):
                nheaders += 1
                if nheaders > 1:
                    self.Seek(start)
                    result = block
                    break

            block.append(text)

        if path:
            self.Close()
        return result


    def __ReadCIF1(self, path = None, print_level = 1):
        """
        Read the first data block of a CIF file into a ``tkCIFData`` object.

        Helper used by ``read_cif``. On success ``self.cifdata_list`` is
        replaced by a one-element list holding the result.

        :param path: str or file-like object, optional. A string is treated
                     as a file path and opened; any other non-empty value
                     (e.g. an io.StringIO) is used directly as the file
                     pointer. When omitted, ``self.fp`` is rewound and used
                     if set, otherwise ``self.path`` is opened.
        :param print_level: int, optional Accepted but not used by this method.
        :returns: tkCIFData for the block that was read, or an empty list
                  ``[]`` when the file could not be opened or contains no
                  data block.
        :raises SystemExit: when ``CheckCIF()`` of the new object fails.
        """
        # A non-string path is assumed to be an already opened file-like object.
        if path and not isinstance(path, str):
            self.fp = path
            path = None

        if path:
            if self.Open(path) is None:
                return []
        elif self.fp:
            self.Rewind()
        elif self.Open(self.path) is None:
            return []

        lines = self.ReadNextCIF()
        if path:
            self.Close()

        # No data block found: report failure instead of parsing None.
        if lines is None:
            return []

        dic = self.StringsToCIF(lines)

        cif = tkCIFData()
        cif.path   = self.path
        cif.data   = dic
        cif.lines = lines
        self.cifdata_list = [cif]
        if not cif.CheckCIF():
            sys.exit()

        return self.cifdata_list[0]


# [docs]
    def read_cif(self, path = None, find_valid_structure = 1, print_level = 1):
        """
        Read a CIF file and return one ``tkCIFData`` object.

        With *find_valid_structure* set, all data blocks are read through
        ``ReadCIFs`` and the first block whose ``geterrorf('_cell_length_a')``
        is not None and greater than 0 is returned. Otherwise only the first
        block is read.

        :param path: str, optional CIF file to read; passed on unchanged.
        :param find_valid_structure: int, optional
                                    Truthy: search the blocks for a valid
                                    structure. Falsy: read the first block
                                    only. Defaults to 1.
        :param print_level: int, optional Passed on unchanged. Defaults to 1.
        :returns: tkCIFData The selected data block, or None when
                  *find_valid_structure* is set and no block qualifies.
        """
        if not find_valid_structure:
            return self.__ReadCIF1(path, print_level = print_level)

        for block in self.ReadCIFs(path, print_level = print_level):
            cell_a = block.geterrorf('_cell_length_a')
            if cell_a is None:
                continue
            if cell_a > 0:
                return block

        # No block carries a positive lattice parameter a.
        return None



# [docs]
    def ReadCIF(self, path = None, find_valid_structure = 1, print_level = 1):
        """
        Alias of ``read_cif``.

        All arguments are forwarded unchanged to ``read_cif`` and its result
        is returned as is.

        :param path: str, optional CIF file to read.
        :param find_valid_structure: int, optional See ``read_cif``. Defaults to 1.
        :param print_level: int, optional See ``read_cif``. Defaults to 1.
        :returns: Whatever ``read_cif`` returns for these arguments.
        """
        result = self.read_cif(
            path = path,
            find_valid_structure = find_valid_structure,
            print_level = print_level,
        )
        return result



# [docs]
    def ReadCIFs(self, path = None, print_level = 1):
        """
        Read every data block of a CIF file.

        Each block returned by ``ReadNextCIF`` is parsed with
        ``StringsToCIF`` and wrapped in a ``tkCIFData`` object. The resulting
        list is also stored in ``self.cifdata_list``.

        :param path: str or file-like object, optional. A string is treated
                     as a file path, opened here and closed again when
                     reading is finished; any other non-empty value (e.g. an
                     io.StringIO) is used directly as the file pointer. When
                     omitted, ``self.fp`` is rewound and used if set,
                     otherwise ``self.path`` is opened.
        :param print_level: int, optional Forwarded to ``ReadNextCIF``.
        :returns: list[tkCIFData] One entry per data block; an empty list
                  when the file could not be opened.
        """
        # A non-string path is assumed to be an already opened file-like object.
        if path and not isinstance(path, str):
            self.fp = path
            path = None

        if path:
            if self.Open(path) is None:
                return []
        elif self.fp:
            self.Rewind()
        # No explicit path and no open file: fall back to the stored path
        # (the original passed the empty `path` argument to Open() here).
        elif self.Open(self.path) is None:
            return []

        ciflist = []
        while 1:
            lines = self.ReadNextCIF(print_level = print_level)
            if lines is None:
                break
            dic = self.StringsToCIF(lines)
            cif = tkCIFData()
            cif.path   = self.path
            cif.data   = dic
            cif.lines = lines
            ciflist.append(cif)

        self.cifdata_list = ciflist

        if path:
            self.Close()

        return self.cifdata_list




# Default input file read by main().
infile = 'test.cif'
# Debug level assigned to tkCIF.debug by main(). It has to exist at module
# level because main() reads it as a global.
debug = 0


def main():
    """
    Small manual test driver for tkCIF.

    Opens ``infile`` and, with the local ``single`` switch on, reads one
    data block, prints it together with the crystal built from it and writes
    the files 'a.cif' and 'b.cif'. With the switch off, every data block of
    the file is read and printed. The file is closed at the end in both
    cases.
    """
    cif = tkCIF(infile)
    cif.debug = debug
    # Developer switch: 1 = process a single structure, 0 = dump all blocks.
    single = 1

    try:
        if single:
            cifdata = cif.ReadCIF()
            # ReadCIF() returns None when no block holds a valid structure.
            if cifdata is None:
                print("Error in main(): no valid structure found in [{}]".format(infile))
                return

            cifdata.Print()
            cry = cifdata.GetCrystal()
            print("")
            print("==============================================")
            cry.PrintInf()
            print("")
            print("==============================================")

            outfile = 'a.cif'
            cifdata.WriteSimpleCIFFile(outfile)

            outfile = 'b.cif'
            cifdata.CreateCIFFileFromCCrystal(cry, outfile)
        else:
            for cifdata in cif.ReadCIFs():
                cifdata.Print()
    finally:
        cif.Close()


if __name__ == "__main__":
    main()