import os
import sys
import re
from pathlib import Path
import chardet
import re
from dotenv import load_dotenv

from urllib.parse import urlparse
import html2text
from bs4 import BeautifulSoup

from pdf2docx import Converter
from docx import Document
from pptx import Presentation
from markitdown import MarkItDown

from jinja2 import Environment, FileSystemLoader

import requests
import openai
from openai import OpenAI


# ------------------------------------------------------
# 初期設定
# ------------------------------------------------------
class tkParams:
    """Empty attribute container; settings are attached to instances as plain attributes."""


# Module-wide settings object; stays None until initialize() assigns a tkParams instance.
cfg = None

# System role and user prompt that execute() passes to revise_with_openai() to restore
# text extracted from a PDF; the "{{ text }}" tag is filled in by process_template().
reformat_role = "あなたは専門的な英語を正確かつ筋の通った文章に校正するアシスタントです。"
reformat_prompt = '''以下の*＃テキスト*以下の文章はPDFファイルからテキストを抜き出したものですが、以下の作業をお願いします
・論文題目、著者名、所属名、雑誌名、DOIをとりだし、[TITLE], [AUTHORS], [AFFILIATIONS], [JOURNAL], [DOI]に続けて出力してください
・本文の文章が崩れているので、もとの文章を復元してください 
＃テキスト
{{ text }}
'''


def initialize():
    """Create the global settings object and fill it from defaults and .env files.

    Built-in defaults are assigned first.  Then the file named by
    ``config_path`` and the account file it points to are loaded with
    ``load_dotenv`` and the environment-backed settings are read through
    ``os.getenv`` (falling back to the same defaults).

    Returns:
        The populated ``tkParams`` instance, which is also stored in the
        module-level ``cfg``.
    """
    global cfg

    settings = tkParams()
    cfg = settings

    settings.config_path = "translate.env"

    # Built-in defaults; several of them are re-read from the environment below.
    settings.html_template_path = 'template_translate.html'

    settings.account_inf_path = "accounts.env"
    settings.api = 'openai'  # 'deepl'
    settings.openai_model = "gpt-4o"
    settings.mode = None

    settings.infile = 'translate_test.docx'
    settings.output_html_path = None

    settings.process_unit = 'paragraph'  # or 'run'
    settings.use_md = False

    settings.limit_to_multibyte_str = False
    settings.min_translate_length = 5
    settings.allowed_translation_length_ratio = 5.0

    # OpenAI API parameters
    settings.temperature = 0.3
    settings.max_tokens = 2000

    # Load the configuration file first, then the account file it names.
    load_dotenv(dotenv_path=settings.config_path)
    settings.account_inf_path = os.getenv("account_inf_path", "accounts.env")
    load_dotenv(dotenv_path=settings.account_inf_path)

    env = os.getenv
    settings.html_template_path = env("html_template_path", "template_translate.html")
    settings.role_content = env("role_content")
    settings.prompt = env("prompt_template")

    settings.min_translate_length = int(env("min_translate_length", "5"))
    settings.allowed_translation_length_ratio = float(env("allowed_translation_length_ratio", "5.0"))

    settings.temperature = float(env("temperature", "0.3"))
    settings.max_tokens = int(env("max_tokens", "2000"))

    settings.endpoint = env("endpoint")

    return settings


# ------------------------------------------------------
# 基本関数
# ------------------------------------------------------
def usage():
    """Print the command-line synopsis to stdout.

    The synopsis lists every positional argument in the order
    ``update_variables`` reads them, including the trailing ``max_tokens``.
    """
    lines = (
        "",
        f"Usage: python {sys.argv[0]} api mode infile process_unit use_markdown "
        "limit_to_multibyte_str min_translate_length allowed_translation_length_ratio "
        "temperature max_tokens",
        "   api: [openai|deepl]",
        "   mode: [je|ee]",
        "   process_unit: 'paragraph' (recommended) or 'run' (to keep character formats)",
        "   use_markdown: Translate intermediate markdown file generated from infile",
        "   limit_to_multibyte_str: 1: Translate only strings including mb characters",
        "   min_translate_length: int (default: 5): Minimum length of input text to be translated",
        "   allowed_translation_length_ratio: float (default: 5.0): Upperlimit of len(translated) / len(input)",
        "   temperature: float (default: 0.3): GPT temperature (0 - 1.0)",
        "   max_tokens: int (default: 2000): max_tokens value passed to the OpenAI API",
        "",
    )
    # One print call; joining with "\n" reproduces the blank line before and after.
    print("\n".join(lines))

def getarg(i, defval = None):
    """Return ``sys.argv[i]`` when that position exists, otherwise *defval*."""
    return sys.argv[i] if i < len(sys.argv) else defval

def process_template(template: str, context: dict) -> str:
    r"""Expand escape sequences and ``{{ key }}`` tags in *template*.

    The literal two-character sequences ``\t``, ``\n`` and ``\r`` are turned
    into the corresponding control characters, then every ``{{ key }}`` tag
    whose key exists in *context* is replaced by ``str(context[key])``.

    Args:
        template (str): The input string containing template tags and escape sequences.
        context (dict): Key-value pairs used for tag replacement.

    Returns:
        str: The processed string.  Tags whose key is not in *context* are
        left exactly as they were written.
    """
    # Escape sequences are expanded before substitution, so values inserted
    # from *context* are never altered by this step.
    for escaped, actual in ((r'\t', '\t'), (r'\n', '\n'), (r'\r', '\r')):
        template = template.replace(escaped, actual)

    def replace_placeholder(match):
        key = match.group(1).strip()
        if key in context:
            return str(context[key])
        # Unknown key: keep the tag verbatim, including its original spacing.
        return match.group(0)

    return re.sub(r'\{\{\s*(.*?)\s*\}\}', replace_placeholder, template)

# ------------------------------------------------------
# 起動時引数設定
# ------------------------------------------------------
def update_variables(cfg):
    """Override settings in *cfg* with the positional command-line arguments.

    Argument positions (read through ``getarg``): 1 api, 2 mode, 3 infile,
    4 process_unit, 5 use_md, 6 limit_to_multibyte_str,
    7 min_translate_length, 8 allowed_translation_length_ratio,
    9 temperature, 10 max_tokens.  Missing arguments keep the value already
    present in *cfg*, except ``mode`` (defaults to ``"je"``) and
    ``limit_to_multibyte_str`` (defaults according to the source language).

    Args:
        cfg: settings object; updated in place.

    Returns:
        The same *cfg* object.

    Raises:
        ValueError: if a numeric argument cannot be converted.
        IndexError: if ``mode`` has fewer than two characters.
    """
    cfg.api = getarg(1, cfg.api)

    # Mode is two letters: source then target; 'j' means Japanese, anything else English.
    cfg.mode = getarg(2, "je")
    source_is_japanese = cfg.mode[0] == 'j'
    cfg.source_lang = "JA" if source_is_japanese else "EN"
    cfg.target_lang = "JA" if cfg.mode[1] == 'j' else "EN"

    # Mode-dependent default: for a Japanese source only strings containing
    # multibyte characters are translated.  Stored on cfg so that it actually
    # takes effect; argument 6 below can still override it.
    cfg.limit_to_multibyte_str = source_is_japanese

    cfg.infile          = getarg(3, cfg.infile)
    cfg.process_unit    = getarg(4, cfg.process_unit)
    cfg.use_md          = int(getarg(5, cfg.use_md))
    cfg.limit_to_multibyte_str = int(getarg(6, cfg.limit_to_multibyte_str))
    cfg.min_translate_length   = int(getarg(7, cfg.min_translate_length))
    # The ratio is a float (e.g. "2.5"); int() would reject or truncate it.
    cfg.allowed_translation_length_ratio = float(getarg(8, cfg.allowed_translation_length_ratio))
    cfg.temperature = float(getarg( 9, cfg.temperature))
    cfg.max_tokens  = int(getarg(10, cfg.max_tokens))

    return cfg


# ------------------------------------------------------
# Key, アカウント情報等読み込み
# ------------------------------------------------------
def read_api_inf(cfg):
    """Load and validate the credentials and settings needed by ``cfg.api``.

    For ``'openai'`` the key is read from ``OPENAI_API_KEY`` into
    ``openai.api_key`` and ``cfg.role_content`` / ``cfg.prompt`` must be set.
    For ``'deepl'`` the key is read from ``DEEPL_API_KEY`` into
    ``cfg.deepl_api_key`` and ``cfg.endpoint`` must be set.  Any other API
    name prints an error followed by the usage text.

    Args:
        cfg: settings object providing ``api`` and the fields named above.

    Returns:
        bool: True when everything required is available, False otherwise
        (an error message has been printed in that case).
    """
    if cfg.api == 'openai':
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if not openai.api_key:
            print("ERROR: OpenAI APIキーが見つかりません。環境変数 'OPENAI_API_KEY' を設定してください。")
            return False

        if not cfg.role_content:
            print("ERROR: role_contentを定義してください")
            return False

        if not cfg.prompt:
            print("ERROR: prompt_templateを定義してください")
            return False
    elif cfg.api == 'deepl':
        cfg.deepl_api_key = os.getenv("DEEPL_API_KEY")
        if not cfg.deepl_api_key:
            print("ERROR: DeepL APIキーが見つかりません。環境変数 'DEEPL_API_KEY' を設定してください。")
            return False

        if not cfg.endpoint:
            print("ERROR: endpointが見つかりません。環境変数 'endpoint' を設定してください。")
            return False
    else:
        print()
        # Report the configured value; a bare `api` name does not exist in this scope.
        print(f"Error: Invalid API [{cfg.api}]")
        print()
        usage()
        return False

    return True

# ------------------------------------------------------
# 関数
# ------------------------------------------------------

def save(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    Path(path).write_text(text, encoding="utf-8")

def read_file(path):
    """Return the UTF-8 text content of *path*, ending the program on failure.

    Args:
        path: path of the file to read.

    Returns:
        str: the complete file content.

    Raises:
        SystemExit: with status 1, after printing an error message, when the
        file does not exist or cannot be read/decoded.
    """
    try:
        # The context manager closes the handle even if read() fails.
        with open(path, "r", encoding="utf-8") as fp:
            return fp.read()
    except FileNotFoundError:
        print(f"\nError in read_file(): File [{path}] does not exist\n")
    except (OSError, UnicodeDecodeError):
        print(f"\nError in read_file(): Failed to read [{path}]\n")

    sys.exit(1)

def replace_path(path, ext):
    """Return *path* with its extension (if any) replaced by *ext*."""
    stem, _old_ext = os.path.splitext(path)
    return stem + ext

def check_multibyte_str(text, limit_to_multibyte_str):
    """Tell whether *text* passes the multibyte filter.

    Returns True when the filter is disabled; otherwise True only if *text*
    contains at least one character in the range U+0800..U+FFFF.
    """
    if limit_to_multibyte_str:
        return re.search(r'[\u0800-\uFFFF]', text) is not None
    return True

def to_translate(text, min_translate_length, limit_to_multibyte_str):
    """Decide whether *text* should be sent for translation.

    The stripped text must be at least *min_translate_length* characters long
    and must pass ``check_multibyte_str``.
    """
    long_enough = len(text.strip()) >= min_translate_length
    return long_enough and check_multibyte_str(text, limit_to_multibyte_str)

def revise_with_openai(text, openai_model, role_content, prompt, temperature, max_tokens):
    """Send *text* to the OpenAI chat completion API and return the stripped reply.

    *prompt* is a template; its ``{{ text }}`` tag is filled with *text* by
    ``process_template`` before the request.  *role_content* is sent as the
    system message.
    """
    messages = [
        {"role": "system", "content": role_content},
        {"role": "user", "content": process_template(prompt, { "text": text })},
    ]

    completion = openai.chat.completions.create(
        model=openai_model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    reply = completion.choices[0].message.content
    return reply.strip()


def translate_with_deepl(text):
    """Translate *text* with the DeepL API.

    The API key, endpoint and language pair are taken from the module-level
    ``cfg`` (``deepl_api_key``, ``endpoint``, ``source_lang``, ``target_lang``).

    Args:
        text: the text to translate.

    Returns:
        str: the translated text, or *text* unchanged when the request fails
        (network error, timeout, or a non-200 response); the error is printed.
    """
    headers = {"Authorization": f"DeepL-Auth-Key {cfg.deepl_api_key}"}
    params = {
        "text": text,
        "source_lang": cfg.source_lang,
        "target_lang": cfg.target_lang
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.post(cfg.endpoint, headers=headers, data=params, timeout=60)
    except requests.RequestException as exc:
        # Same fallback as for HTTP errors: keep the original text.
        print(f"Error: DeepL request failed: {exc}")
        return text

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return text  # on error, return the original text

    result = response.json()
    return result["translations"][0]["text"]

def translate(text, api, openai_model, role_content, prompt, temperature, max_tokens):
    """Dispatch *text* to the backend selected by *api*.

    ``'openai'`` goes to ``revise_with_openai`` and ``'deepl'`` to
    ``translate_with_deepl``.  Any other value prints an error and exits.
    """
    if api == 'openai':
        return revise_with_openai(text, openai_model, role_content, prompt, temperature, max_tokens)
    if api == 'deepl':
        return translate_with_deepl(text)

    # Unknown backend: report it and stop the program.
    print()
    print(f"Error in translate(): Invalid API [{api}]")
    print()
    exit()

def html_to_markdown(html_file_path):
    """Read a UTF-8 HTML file and return it converted to markdown by ``html2text``."""
    html_content = Path(html_file_path).read_text(encoding='utf-8')
    return html2text.html2text(html_content)

def convert_to_md(infile):
    """Convert *infile* to markdown text.

    ``.jpg`` files are described through MarkItDown backed by an OpenAI
    client, ``.html`` files go through ``html_to_markdown``, and everything
    else is handed to a plain ``MarkItDown`` converter.  The file type is
    decided by the (case-insensitive) file name suffix.

    Args:
        infile: path of the file to convert.

    Returns:
        str: the markdown text.
    """
    name = infile.lower()

    if name.endswith(".html"):
        return html_to_markdown(infile)

    if name.endswith(".jpg"):
        print()
        print(f"Extract markdown from image file {infile}")
        client = OpenAI()
        # NOTE(review): the mlm_* keyword names depend on the installed
        # markitdown version -- confirm against the pinned release.
        md = MarkItDown(mlm_client=client, mlm_model="gpt-4o-mini")
        result = md.convert(infile, mlm_prompt="画像について説明してください。")
    else:
        print()
        print(f"Convert {infile} to markdown")
        md = MarkItDown()
        result = md.convert(infile)

    return result.text_content

def n_leading_chars(s, c = '#'):
    """Count the characters that ``s.lstrip(c)`` removes from the start of *s*."""
    remainder = s.lstrip(c)
    return len(s) - len(remainder)

# The OpenAI API sometimes translates English into Japanese or adds
# unrequested explanations, so translations are rejected in these cases:
#  - the translation starts with ``` (''' is rejected too, to be safe)
#  - the translation adds extra leading # or * characters
#  - the translation is much longer than the original text
def check_translation(text, translated, allowed_translation_length_ratio):
    """Return True when *translated* is an acceptable translation of *text*."""
    if translated.startswith(("'''", "```")):
        return False

    # More leading '#' than the original.
    if n_leading_chars(translated, '#') > n_leading_chars(text, '#'):
        return False

    # Exactly one more leading '*' than the original.
    if n_leading_chars(translated, '*') == n_leading_chars(text, '*') + 1:
        return False

    max_length = len(text) * allowed_translation_length_ratio
    return not (len(translated) > max_length)

def translate_html(text, api, api_model, role_content, prompt, temperature, max_tokens, 
            min_translate_length, limit_to_multibyte_str, allowed_translation_length_ratio):
    """Translate the text nodes of an HTML document.

    Every text node that passes ``to_translate`` is sent to ``translate``.
    Results accepted by ``check_translation`` replace the node in the tree.
    Comments, doctype/declaration/processing-instruction nodes and the
    contents of ``<script>`` / ``<style>`` are skipped, because replacing them
    with a plain string would turn them into ordinary page text.

    Args:
        text: the HTML source.
        api, api_model, role_content, prompt, temperature, max_tokens:
            forwarded to ``translate``.
        min_translate_length, limit_to_multibyte_str: forwarded to ``to_translate``.
        allowed_translation_length_ratio: forwarded to ``check_translation``.

    Returns:
        tuple: ``(html, data)`` where *html* is the serialized document and
        *data* is a list of ``{"original", "translated"}`` dicts, one per
        accepted translation.
    """
    # Imported locally so the block does not depend on extra file-level imports.
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    non_content_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)
    non_content_parents = ("script", "style")

    data = []
    print()
    print(">>> Analyzing html...")
    soup = BeautifulSoup(text, 'html.parser')

    print("Translating content...")
    for element in soup.find_all(string=True):  # every text node
        if isinstance(element, non_content_nodes):
            continue
        parent = element.parent
        if parent is not None and parent.name in non_content_parents:
            continue
        if not to_translate(element, min_translate_length, limit_to_multibyte_str):
            continue

        original_text = element
        revised_text = translate(element, api, api_model, role_content, prompt, temperature, max_tokens)
        print(f"[Original] {original_text}")
        print(f"  -> [Revised] {revised_text}")
        if check_translation(original_text, revised_text, allowed_translation_length_ratio):
            element.replace_with(revised_text)
            data.append({ "original": original_text, "translated": revised_text})
        else:
            print(f"  *** This translation is rejected")

    return str(soup), data

def translate_pptx(ppt, api, api_model, role_content, prompt, temperature, max_tokens, 
            min_translate_length, limit_to_multibyte_str, allowed_translation_length_ratio):
    """Translate the paragraphs of every text frame in a presentation, in place.

    Returns:
        tuple: ``(ppt, data)`` where *data* lists one
        ``{"original", "translated"}`` dict per accepted translation.
    """
    data = []
    print()
    print(">>> Translating...")
    for slide in ppt.slides:
        for shape in slide.shapes:
            # Only shapes that carry a text frame have paragraphs to process.
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                original_text = paragraph.text
                if not to_translate(original_text, min_translate_length, limit_to_multibyte_str):
                    continue
                revised_text = translate(original_text, api, api_model, role_content, prompt, temperature, max_tokens)
                print(f"[Original] {original_text}")
                print(f"  -> [Revised] {revised_text}")
                if not check_translation(original_text, revised_text, allowed_translation_length_ratio):
                    print(f"  *** This translation is rejected")
                    continue
                paragraph.text = revised_text
                data.append({ "original": original_text, "translated": revised_text})
    return ppt, data

def translate_docx(doc, api, api_model, role_content, prompt, temperature, max_tokens, 
            min_translate_length, limit_to_multibyte_str, allowed_translation_length_ratio, process_unit):
    """Translate the body paragraphs and table cells of a docx document, in place.

    With ``process_unit == 'paragraph'`` each paragraph is translated as a
    whole; with any other value each run is translated separately.  A
    translation is applied only when ``check_translation`` accepts it.

    Args:
        doc: document object exposing ``paragraphs`` and ``tables``.
        api, api_model, role_content, prompt, temperature, max_tokens:
            forwarded to ``translate``.
        min_translate_length, limit_to_multibyte_str: forwarded to ``to_translate``.
        allowed_translation_length_ratio: forwarded to ``check_translation``.
        process_unit: ``'paragraph'`` or ``'run'``.

    Returns:
        tuple: ``(doc, data)`` where *data* lists one
        ``{"original", "translated"}`` dict per accepted translation.
    """
    data = []

    def revise(item):
        # Translate one paragraph or run (anything with a settable .text).
        original_text = item.text
        if not to_translate(original_text, min_translate_length, limit_to_multibyte_str):
            return
        revised_text = translate(original_text, api, api_model, role_content, prompt, temperature, max_tokens)
        print(f"[Original] {original_text}")
        print(f"  -> [Revised] {revised_text}")
        if check_translation(original_text, revised_text, allowed_translation_length_ratio):
            item.text = revised_text
            data.append({ "original": original_text, "translated": revised_text})
        else:
            print(f"  *** This translation is rejected")

    def revise_paragraphs(paragraphs):
        # Apply revise() at the granularity selected by process_unit.
        for paragraph in paragraphs:
            if process_unit == 'paragraph':
                revise(paragraph)
            else:
                for run in paragraph.runs:
                    revise(run)

    print()
    print(">>> Processing paragraphs/runs...")
    revise_paragraphs(doc.paragraphs)

    print(">>> Processing tables...")
    # python-docx repeats a merged cell once per grid position it covers.
    # Remember the underlying element of each visited cell so merged cells are
    # translated only once.  The elements are kept referenced so that their
    # id() stays unique for the duration of the loop.
    visited = {}
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                tc = cell._tc
                if id(tc) in visited:
                    continue
                visited[id(tc)] = tc
                revise_paragraphs(cell.paragraphs)

    return doc, data

def translate_text(text, api, api_model, role_content, prompt, temperature, max_tokens,
                min_translate_length, limit_to_multibyte_str, allowed_translation_length_ratio):
    """Translate a whole text in a single request.

    Args:
        text: the text to translate.
        api, api_model, role_content, prompt, temperature, max_tokens:
            forwarded to ``translate``.
        min_translate_length, limit_to_multibyte_str: forwarded to ``to_translate``.
        allowed_translation_length_ratio: accepted for signature parity with
            the other ``translate_*`` functions; not applied here.

    Returns:
        tuple: ``(revised_text, data)``.  When *text* does not qualify for
        translation, *revised_text* is *text* unchanged and *data* is empty.
    """
    data = []
    # Default to the input so the return value is defined even when nothing
    # is translated.
    revised_text = text
    print(">>> Translating...")
    if to_translate(text, min_translate_length, limit_to_multibyte_str):
        original_text = text
        revised_text = translate(text, api, api_model, role_content, prompt, temperature, max_tokens)
        print(f"[Original] {original_text}")
        print(f"  -> [Revised] {revised_text}")
        data.append({ "original": original_text, "translated": revised_text})

    return revised_text, data

def pdf_to_docx(pdf_file, docx_file):
    """Convert *pdf_file* to *docx_file* with ``pdf2docx.Converter``.

    All pages are converted (``start=0, end=None``).  The converter is closed
    even when the conversion raises; the exception then propagates.
    """
    cv = Converter(pdf_file)
    try:
        cv.convert(docx_file, start=0, end=None)  # a start/end page range can be given
    finally:
        cv.close()
    print(f"Converted '{pdf_file}' to '{docx_file}' successfully.")

def get_filetype(path):
    """Map the extension of *path* (case-insensitive) to a file-type name.

    Returns one of ``"pdf"``, ``"docx"``, ``"pptx"``, ``"html"``, ``"txt"``,
    ``"md"``, or None for any other extension.
    """
    known_types = {
        ".pdf": "pdf",
        ".docx": "docx",
        ".pptx": "pptx",
        ".html": "html",
        ".htm": "html",
        ".txt": "txt",
        ".text": "txt",
        ".md": "md",
    }
    _stem, suffix = os.path.splitext(path)
    return known_types.get(suffix.lower())

def execute(cfg):
    """Translate ``cfg.infile`` and write the result plus a comparison report.

    Processing depends on the file type and on ``cfg.use_md``:

    * ``.md`` / ``.txt``: the file is translated as one text, saved, and the
      program exits without producing the comparison report.
    * ``cfg.use_md``: the input is converted to markdown (PDF text is first
      reformatted through OpenAI), translated as one text and saved.
    * otherwise: pdf (converted to docx first), docx, pptx and html files are
      translated piece by piece with the matching ``translate_*`` function.

    Afterwards the Jinja2 template ``cfg.html_template_path`` (looked up in
    the current directory) is rendered with all ``cfg`` attributes plus
    ``data`` and written next to the input as ``*_compare.html``.

    Args:
        cfg: settings object as prepared by ``initialize`` / ``update_variables``.

    Raises:
        SystemExit: status 1 for an unsupported extension; status 0 after a
        ``.md`` / ``.txt`` file has been processed.
    """
    filetype = get_filetype(cfg.infile)

    # Decide the output path (and the intermediate markdown path, if any).
    mdfile = None
    if filetype == 'md':
        # The input already is the markdown file.
        mdfile = cfg.infile
        outfile = replace_path(cfg.infile, "_revised.md")
    elif cfg.use_md:
        mdfile = replace_path(cfg.infile, ".md")
        outfile = replace_path(mdfile, "_revised.md")
    elif filetype == 'txt':
        outfile = replace_path(cfg.infile, "_revised.txt")
    elif filetype == 'pdf' or filetype == 'docx':
        outfile = replace_path(cfg.infile, "_revised.docx")
    elif filetype == 'html':
        outfile = replace_path(cfg.infile, "_revised.html")
    elif filetype == 'pptx':
        outfile = replace_path(cfg.infile, "_revised.pptx")
    else:
        print(f"\nError in execute(): Invalid extension in [{cfg.infile}]\n")
        sys.exit(1)

    output_comparison_html_path = replace_path(cfg.infile, '_compare.html')
    output_reformat_md = replace_path(cfg.infile, "_reformat.md")
    output_docx = replace_path(cfg.infile, ".docx")

    print("=== Translate and revise .docx/.pptx/.pdf/.html/.md file ===")
    print(f"  Input file            : {cfg.infile}")
    print(f"    file type           : {filetype}")
    print(f"  API                   : {cfg.api}")
    print(f"  openai_model          : {cfg.openai_model}")
    print(f"  Translation mode      : {cfg.mode}")
    print(f"  limit_to_multibyte_str: {cfg.limit_to_multibyte_str}")
    print(f"  min_translate_length  : {cfg.min_translate_length}")
    print(f"  allowed_translation_length_ratio: {cfg.allowed_translation_length_ratio}")
    print(f"  Output file           : {outfile}")
    print(f"  Template HTML file    : {cfg.html_template_path}")
    print(f"  Output compare file   : {output_comparison_html_path}")
    print(f"  Use markdown : {cfg.use_md}")
    if cfg.use_md:
        print(f"  Markdown file: {mdfile}")
        if filetype == 'pdf':
            print(f"  Reformat markdown file: {output_reformat_md}")
    else:
        if filetype == 'docx' or filetype == 'pdf':
            print(f"  Process unit : {cfg.process_unit}")
            if filetype == 'pdf':
                print(f"  Converted docx file: {output_docx}")
    if cfg.api == "openai" or cfg.infile.endswith('.pdf'):
        print(f"  role_content : {cfg.role_content}")
        print(f"  prompt       : {cfg.prompt}")

    print()
    if filetype == 'md' or filetype == 'txt':
        print(f"Read [{cfg.infile}]")
        text = read_file(cfg.infile)
        text, data = translate_text(text, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio)
        print(f"=== Saving revised text to {outfile} ===")
        save(outfile, text)
        # Plain text/markdown input stops here; no comparison report is written.
        sys.exit()
    elif cfg.use_md:
        print(f"Read [{cfg.infile}] and convert to markdown")
        text = convert_to_md(cfg.infile)
        print(f"=== Saving markdown to  {mdfile} ===")
        save(mdfile, text)

        # For PDF input, let OpenAI restore the extracted text first.
        if filetype == 'pdf':
            print(f"Reformatting [{mdfile}] by OpenAI...")
            text = revise_with_openai(text, cfg.openai_model, reformat_role, reformat_prompt, cfg.temperature, cfg.max_tokens)
            print(f"Reformatted MD file is saved to [{output_reformat_md}]")
            save(output_reformat_md, text)

        text, data = translate_text(text, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio)
        print(f"=== Saving revised text to {outfile} ===")
        save(outfile, text)

    if not cfg.use_md:
        if filetype == 'pdf':
            print(f"Converting [{cfg.infile}] to [{output_docx}]")
            pdf_to_docx(cfg.infile, output_docx)
            print(f"Read [{output_docx}]")
            doc = Document(output_docx)
            doc, data = translate_docx(doc, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, 
                            cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio, cfg.process_unit)
            print(f"=== Saving revised text to {outfile} ===")
            doc.save(outfile)
        elif filetype == 'pptx':
            ppt = Presentation(cfg.infile)
            ppt, data = translate_pptx(ppt, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio)
            print(f"=== Saving revised text to {outfile} ===")
            ppt.save(outfile)
        elif filetype == 'docx':
            print(f"Read [{cfg.infile}]")
            doc = Document(cfg.infile)
            doc, data = translate_docx(doc, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, 
                            cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio, cfg.process_unit)
            print(f"=== Saving revised text to {outfile} ===")
            doc.save(outfile)
        elif filetype == 'html':
            print(f"Read [{cfg.infile}]")
            html = read_file(cfg.infile)
            html, data = translate_html(html, cfg.api, cfg.openai_model, cfg.role_content, cfg.prompt, cfg.temperature, cfg.max_tokens, cfg.min_translate_length, cfg.limit_to_multibyte_str, cfg.allowed_translation_length_ratio)
            print(f"=== Saving revised text to {outfile} ===")
            save(outfile, html)

    # ------------------------------------------------------
    # Template processing: render the original/translated comparison report
    # ------------------------------------------------------
    context = cfg.__dict__.copy()
    context["data"] = data

    template_dir = os.getcwd()
    env = Environment(loader=FileSystemLoader(template_dir))
    template = env.get_template(cfg.html_template_path)
    rendered_html = template.render(context)

    print()
    print(f"Save translation data to [{output_comparison_html_path}]")
    with open(output_comparison_html_path, 'w', encoding='utf-8') as file:
        file.write(rendered_html)

    print()


def main():
    """Build the settings, validate the API configuration and run the translation.

    Exits without translating when ``read_api_inf`` reports a problem.
    """
    settings = update_variables(initialize())
    if read_api_inf(settings):
        execute(settings)
    else:
        exit()

# Script entry point: run the translation when executed directly.
if __name__ == "__main__":
    main()
    # NOTE(review): the usage text is printed whenever main() returns normally,
    # i.e. also after a successful run -- confirm this is intended.
    usage()
