import sys

from bs4 import BeautifulSoup, Comment, ProcessingInstruction

# Input and output file paths. Each has a built-in default that is
# overridden by the matching positional command-line argument when present
# (first argument = input file, second argument = output file).
infile = sys.argv[1] if len(sys.argv) > 1 else 'word_exported.html'
outfile = sys.argv[2] if len(sys.argv) > 2 else 'cleaned_output.html'

# Convert VML <v:imagedata> elements into plain <img> tags, so the image
# survives even when the surrounding VML markup is removed.
def convert_vml_to_img(soup):
    """Replace VML image markup with standard ``<img>`` elements.

    For every ``v:imagedata`` tag, an ``<img>`` is built from its ``src``
    attribute and its ``o:title`` attribute (used as ``alt``; empty string
    when absent).  The ``<img>`` is inserted right after the enclosing
    ``v:shape`` -- or after the ``v:imagedata`` itself when it has no such
    ancestor -- and that VML element is then destroyed.

    A ``v:imagedata`` without a usable ``src`` is removed without emitting
    an ``<img>``, since an image element with no source is meaningless.

    Args:
        soup: Parsed document (BeautifulSoup object).  Modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    for vml_tag in soup.find_all('v:imagedata'):
        # find_all() returned a snapshot list.  If an earlier iteration
        # decomposed a v:shape that held several v:imagedata tags, the
        # remaining ones are still in the list but have been destroyed and
        # can no longer be queried, so they must be skipped.
        if getattr(vml_tag, 'decomposed', False):
            continue

        src = vml_tag.get('src')
        alt = vml_tag.get('o:title', '')

        # Prefer replacing the whole shape; fall back to the bare imagedata.
        target = vml_tag.find_parent('v:shape')
        if target is None:
            target = vml_tag

        if src:
            img_tag = soup.new_tag('img', src=src, alt=alt)
            target.insert_after(img_tag)
        target.decompose()

    return soup

# Remove Word-specific namespaced tags, comments and attributes.
def clean_word_namespaces(soup):
    """Strip Word-specific namespaced tags, comments and attributes.

    Three passes are made over the document:

    1. Every tag whose name starts with ``v:`` or ``o:`` is destroyed
       together with its contents.
    2. Every comment containing ``if gte msEquation`` or ``endif`` is
       removed.
    3. Every attribute whose *name* starts with ``mso-`` is dropped.  Note
       that this looks at attribute names only; the contents of ``style``
       attributes are left untouched.

    Args:
        soup: Parsed document (BeautifulSoup object).  Modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    word_prefixes = ('v:', 'o:')
    for element in soup.find_all():
        tag_name = element.name
        if not tag_name:
            continue
        if tag_name.startswith(word_prefixes):
            element.decompose()

    def is_comment(node):
        return isinstance(node, Comment)

    comment_markers = ('if gte msEquation', 'endif')
    for node in soup.find_all(string=is_comment):
        if any(marker in node for marker in comment_markers):
            node.extract()

    for element in soup.find_all(True):
        kept_attrs = {}
        for key, value in element.attrs.items():
            if key.startswith('mso-'):
                continue
            kept_attrs[key] = value
        element.attrs = kept_attrs

    return soup

# Remove empty spans and flatten attribute-less spans that merely wrap
# another span.
def clean_empty_spans(soup):
    """Delete empty ``<span>`` tags and flatten redundant span wrappers.

    First pass: a span with no non-whitespace text and no child tags is
    destroyed.

    Second pass: a span that has no attributes and exactly one direct child
    span is unwrapped, i.e. the wrapper tag is removed while *all* of its
    children stay in the document in their original order.

    Args:
        soup: Parsed document (BeautifulSoup object).  Modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    for span in soup.find_all('span'):
        if not span.text.strip() and not span.find(True):
            span.decompose()

    for span in soup.find_all('span'):
        children = span.find_all('span', recursive=False)
        if len(children) == 1 and not span.attrs:
            # unwrap() keeps every child of the wrapper.  Replacing the
            # wrapper with only the inner span would silently drop any
            # sibling text or other tags that sit next to it.
            span.unwrap()

    return soup

# Remove empty Word math runs / spans, conditional comments and
# processing instructions.
def deep_clean_word_tags(soup):
    """Remove leftover Word math runs, conditional comments and PIs.

    Three passes are made over the document:

    1. ``m:r`` and ``span`` tags with no non-whitespace text and no child
       tags are destroyed.
    2. Conditional comments (``[if ...]`` / ``[endif]``) and processing
       instructions are removed.  Other comments are kept.
    3. A span with exactly one direct ``m:r`` child and no non-whitespace
       text is destroyed.

    Args:
        soup: Parsed document (BeautifulSoup object).  Modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    for tag in soup.find_all(['m:r', 'span']):
        if not tag.text.strip() and not tag.find(True):
            tag.decompose()

    # Match the bracketed conditional-comment syntax only.  A bare 'if'
    # substring test would also hit ordinary comments containing words such
    # as "modified" or "specify".
    conditional_markers = ('[if ', '[if!', '[endif]')

    for element in soup.find_all(string=True):
        if isinstance(element, Comment):
            if any(marker in element for marker in conditional_markers):
                element.extract()
        elif isinstance(element, ProcessingInstruction):
            # The parser stores a processing instruction without its
            # leading '<?', so a textual startswith('<?') check on the node
            # never matches; detect it by node type instead.
            element.extract()
        else:
            # Text that literally looks like a processing instruction.
            text = element.strip()
            if text.startswith('<?') and text.endswith('?>'):
                element.extract()

    for span in soup.find_all('span'):
        children = span.find_all('m:r', recursive=False)
        if len(children) == 1 and not span.text.strip():
            span.decompose()

    return soup

# Replace attribute-less, text-only spans with their text.
def simplify_spans(soup):
    """Drop empty spans and replace plain text-only spans with their text.

    Three passes are made over the document:

    1. Spans with no non-whitespace text and no child tags are destroyed.
    2. Conditional comments (``[if ...]`` / ``[endif]``) and processing
       instructions are removed.  Other comments are kept.
    3. A span that has no attributes, no child tags and a single string as
       its content is replaced by that string.

    Args:
        soup: Parsed document (BeautifulSoup object).  Modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    for span in soup.find_all('span'):
        if not span.text.strip() and not span.find(True):
            span.decompose()

    # Match the bracketed conditional-comment syntax only.  A bare 'if'
    # substring test would also hit ordinary comments containing words such
    # as "modified" or "specify".
    conditional_markers = ('[if ', '[if!', '[endif]')

    for element in soup.find_all(string=True):
        if isinstance(element, Comment):
            if any(marker in element for marker in conditional_markers):
                element.extract()
        elif isinstance(element, ProcessingInstruction):
            # The parser stores a processing instruction without its
            # leading '<?', so a textual startswith('<?') check on the node
            # never matches; detect it by node type instead.
            element.extract()
        else:
            # Text that literally looks like a processing instruction.
            text = element.strip()
            if text.startswith('<?') and text.endswith('?>'):
                element.extract()

    for span in soup.find_all('span'):
        if span.string and not span.attrs and not span.find(True):
            span.replace_with(span.string)

    return soup

# Main processing: read the Word-exported HTML (decoded as cp932), run the
# cleanup passes, declare UTF-8 as the document encoding and write the
# result as UTF-8.
with open(infile, 'r', encoding='cp932') as f:
    original_html = f.read()

soup = BeautifulSoup(original_html, 'html.parser')

# Cleanup pipeline.  VML images are converted first, because the next pass
# deletes every v:/o: tag.
soup = convert_vml_to_img(soup)
soup = clean_word_namespaces(soup)
soup = clean_empty_spans(soup)
soup = deep_clean_word_tags(soup)
soup = simplify_spans(soup)

# Switch the declared charset to UTF-8 (add a <meta charset> if missing).
meta = soup.find('meta', attrs={'charset': True})
if meta:
    meta['charset'] = 'utf-8'
else:
    head = soup.find('head')
    if head:
        new_meta = soup.new_tag('meta', charset='utf-8')
        head.insert(0, new_meta)

# A legacy <meta http-equiv="Content-Type" content="...; charset=..."> may
# still announce the source encoding; rewrite it so that the document does
# not carry a declaration that contradicts the UTF-8 output.
for http_meta in soup.find_all('meta'):
    if (http_meta.get('http-equiv') or '').strip().lower() == 'content-type':
        http_meta['content'] = 'text/html; charset=utf-8'

# Write the cleaned document.
with open(outfile, 'w', encoding='utf-8') as f:
    f.write(str(soup))

# Report the path that was actually written; it may have been overridden
# on the command line.
print("変換完了！{} を確認してね。".format(outfile))
