#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
pptx2md.py
----------
Convert a single PowerPoint (.pptx) file into a Markdown (.md) chapter.

Features:
- Extract slide titles, bullet text, and speaker notes
- Extract embedded images from slides (to <out-dir>/images by default)
- Optionally include pre-rendered slide PNGs (e.g., slide_001.png, slide_002.png...)
- Writes one Markdown file: <pptx_stem>.md

Dependencies:
- python-pptx  (pip install python-pptx)
- Pillow       (pip install Pillow)  # image decoding backend used by python-pptx

Usage:
  python pptx2md.py --pptx path/to/slides.pptx --out-dir ./out \
                    [--slides-png-dir path/to/slides_png] \
                    [--images-dirname images] \
                    [--no-embedded-images] [--no-notes] \
                    [--json-out]

Author: ChatGPT (MIT License)
"""
import argparse
import json
import logging
import re
import shutil
from pathlib import Path
from typing import List, Dict

from pptx import Presentation


# 13 = MSO_SHAPE_TYPE.PICTURE
_PICTURE_SHAPE_TYPE = 13


def _is_picture(shape) -> bool:
    """Return True when *shape* reports the picture shape type."""
    try:
        return getattr(shape, "shape_type", None) == _PICTURE_SHAPE_TYPE
    except Exception:
        # Best-effort: a shape whose type cannot be read is simply not
        # treated as a picture (probing errors were always tolerated here).
        return False


def _is_title_placeholder(shape) -> bool:
    """Return True when *shape* is a placeholder whose type name contains "title".

    NOTE(review): because this is a substring test, any placeholder type whose
    name merely contains "title" (e.g. a subtitle) also matches — confirm that
    this is intended.
    """
    try:
        if not getattr(shape, "is_placeholder", False):
            return False
        return "title" in str(shape.placeholder_format.type).lower()
    except Exception:
        # Best-effort: an unreadable placeholder is handled as ordinary text.
        return False


def _save_picture(shape, images_dir: Path, images_dirname: str, counter: int, slide_idx: int):
    """
    Write the image carried by *shape* to ``images_dir/img_<counter>.<ext>``.

    Returns the image path relative to the output directory in POSIX form
    (forward slashes, so it is a valid Markdown link target on every OS),
    or None when the shape has no image or the image could not be written.
    """
    try:
        image = getattr(shape, "image", None)
        if image is None:
            return None
        fname = f"img_{counter:04d}.{image.ext}"  # ext is e.g. 'png', 'jpeg'
        (images_dir / fname).write_bytes(image.blob)
    except Exception as exc:
        # Keep extraction best-effort, but tell the user an image is missing
        # instead of dropping it silently.
        logging.getLogger(__name__).warning(
            "Skipping image on slide %d: %s", slide_idx, exc
        )
        return None
    return (Path(images_dirname) / fname).as_posix()


def extract_pptx(pptx_path: Path, out_dir: Path, extract_images: bool, images_dirname: str) -> Dict:
    """
    Extract slide-wise title, bullets, speaker notes, and (optionally) embedded images.

    Args:
        pptx_path: Path of the .pptx file to read.
        out_dir: Output directory; images are written to ``out_dir/images_dirname``.
        extract_images: When True, picture shapes are saved as image files.
        images_dirname: Name of the image subdirectory under ``out_dir``.

    Returns:
        ``{"slides": [{"title":..., "bullets":[...], "notes":..., "images":[...]}],
        "source": <pptx file name>}``. A slide without a title gets
        ``"Slide <n>"``. Image paths are relative POSIX paths
        (e.g., "images/img_0001.png").

    Images that cannot be read or written are skipped with a logged warning.
    """
    prs = Presentation(str(pptx_path))
    images_dir = out_dir / images_dirname
    if extract_images:
        images_dir.mkdir(parents=True, exist_ok=True)

    slides_data: List[Dict] = []
    image_counter = 1  # numbering runs across the whole deck, not per slide

    for idx, slide in enumerate(prs.slides, start=1):
        title_text = ""
        bullets: List[str] = []
        saved_images: List[str] = []

        for shape in slide.shapes:
            # Extract embedded images if requested
            if extract_images and _is_picture(shape):
                rel_path = _save_picture(shape, images_dir, images_dirname, image_counter, idx)
                if rel_path is not None:
                    saved_images.append(rel_path)
                    image_counter += 1

            # Text frames (title / bullets)
            if not getattr(shape, "has_text_frame", False) or shape.text_frame is None:
                continue

            # Only the first title placeholder becomes the title; any later
            # one falls through and is collected as bullet text.
            if not title_text and _is_title_placeholder(shape):
                title_text = (shape.text or "").strip()
                continue

            stripped = (line.strip() for line in (shape.text or "").splitlines())
            bullets.extend(line for line in stripped if line)

        # Speaker notes
        notes_text = ""
        notes_slide = slide.notes_slide if slide.has_notes_slide else None
        if notes_slide and notes_slide.notes_text_frame:
            notes_text = (notes_slide.notes_text_frame.text or "").strip()

        slides_data.append({
            "title": title_text or f"Slide {idx}",
            "bullets": bullets,
            "notes": notes_text,
            "images": saved_images,
        })

    return {"slides": slides_data, "source": pptx_path.name}


def include_slide_pngs(slides_png_dir: Path, out_dir: Path, target_subdir: str = "slides_png") -> list:
    """
    Copy pre-rendered slide images (e.g., slide_001.png...) under out_dir/target_subdir.

    Args:
        slides_png_dir: Directory holding the rendered slide images.
        out_dir: Output directory of the conversion.
        target_subdir: Subdirectory of ``out_dir`` that receives the copies.

    Returns:
        Relative POSIX paths (``target_subdir/<name>``) of the copied
        .png/.jpg/.jpeg files, in sorted source order. The list is empty when
        ``slides_png_dir`` is missing or not a directory.
    """
    if not slides_png_dir or not slides_png_dir.is_dir():
        return []

    image_exts = {".png", ".jpg", ".jpeg"}
    dst = out_dir / target_subdir
    dst.mkdir(parents=True, exist_ok=True)

    rels = []
    # sorted() materialises the listing first, so copying into a directory
    # that is also the source cannot disturb the iteration.
    for src in sorted(slides_png_dir.iterdir()):
        # Skip sub-directories that merely look like images ("foo.png/");
        # shutil.copy2 would raise on them.
        if not src.is_file() or src.suffix.lower() not in image_exts:
            continue
        try:
            shutil.copy2(src, dst / src.name)
        except shutil.SameFileError:
            # Source directory already is the target: nothing to copy, but the
            # file is still available for linking.
            pass
        # Forward slashes keep the path a valid Markdown link on every OS.
        rels.append((Path(target_subdir) / src.name).as_posix())
    return rels


def _natural_sort_key(rel_path) -> list:
    """Sort key that compares digit runs numerically, so slide_2 precedes slide_10."""
    # With a capturing group, re.split alternates text / digits / text ...,
    # so odd positions are always digit runs and types line up per position.
    parts = re.split(r"([0-9]+)", str(rel_path))
    return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]


def write_markdown(md_path: Path, slides_json: Dict, slide_pngs: list, include_notes: bool):
    """
    Render the extracted slide data as one Markdown file.

    Args:
        md_path: Destination .md file (written as UTF-8, overwritten if present).
        slides_json: Dict with a "slides" list (items with "title", "bullets",
            "notes", "images") and an optional "source" used as the H1 heading.
        slide_pngs: Relative paths of pre-rendered slide images. After natural
            sorting, the n-th image is attached to the n-th slide.
        include_notes: When True, speaker notes are emitted as a block quote.

    Each slide becomes a "## Slide <n> — <title>" section containing, in order:
    bullets, notes, the pre-rendered slide image, embedded images, and a
    horizontal rule.
    """
    image_exts = {".png", ".jpg", ".jpeg"}
    # Natural order keeps un-padded names (slide_2, slide_10) aligned with
    # their slide numbers; plain string sorting would put slide_10 first.
    ordered_pngs = sorted(
        (p for p in slide_pngs if Path(p).suffix.lower() in image_exts),
        key=_natural_sort_key,
    )
    png_by_index = dict(enumerate(ordered_pngs, start=1))

    lines = [f"# {slides_json.get('source', 'slides.pptx')}", ""]

    slides: List[Dict] = slides_json["slides"]
    for i, s in enumerate(slides, start=1):
        # A heading must stay on one line: collapse line breaks and runs of
        # whitespace inside the title.
        title = " ".join((s.get("title") or "").split()) or f"Slide {i}"
        lines += [f"## Slide {i} — {title}", ""]

        bullets = s.get("bullets") or []
        if bullets:
            lines.extend(f"- {b}" for b in bullets)
            lines.append("")

        if include_notes:
            notes = (s.get("notes") or "").strip()
            if notes:
                lines += ["**Notes**", ""]
                lines.extend(f"> {line}" for line in notes.splitlines())
                lines.append("")

        if i in png_by_index:
            lines += [f"![Slide {i}]({png_by_index[i]})", ""]

        # "or []" also tolerates an explicit null for "images".
        for img_rel in s.get("images") or []:
            lines += [f"![Embedded]({img_rel})", ""]

        lines += ["---", ""]

    md_path.write_text("\n".join(lines), encoding="utf-8")


def main(argv=None):
    """
    Command-line entry point: convert one PPTX file into a Markdown chapter.

    Args:
        argv: Optional list of command-line arguments. Defaults to None, in
            which case argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: When the PPTX file does not exist (or argparse rejects
            the arguments).
    """
    ap = argparse.ArgumentParser(description="Convert a PPTX file into a Markdown chapter.")
    ap.add_argument("--pptx", required=True, help="Path to slides.pptx")
    ap.add_argument("--out-dir", required=True, help="Output directory for the Markdown and assets")
    ap.add_argument("--slides-png-dir", default=None, help="Directory containing pre-rendered slide PNGs to include")
    ap.add_argument("--no-embedded-images", action="store_true", help="Do not extract embedded images from PPTX")
    ap.add_argument("--no-notes", action="store_true", help="Do not include speaker notes")
    ap.add_argument("--images-dirname", default="images", help="Subdirectory name for extracted images")
    ap.add_argument("--json-out", action="store_true", help="Also write slides_text.json for debugging")

    args = ap.parse_args(argv)

    # Validate the input before touching the file system, so a typo in --pptx
    # does not leave an empty output directory behind. is_file() also rejects
    # a directory passed by mistake.
    pptx_path = Path(args.pptx).expanduser().resolve()
    if not pptx_path.is_file():
        raise SystemExit(f"[ERR] PPTX not found: {pptx_path}")

    out_dir = Path(args.out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    deck = extract_pptx(pptx_path, out_dir, extract_images=not args.no_embedded_images,
                        images_dirname=args.images_dirname)

    slide_pngs = []
    if args.slides_png_dir:
        png_dir = Path(args.slides_png_dir).expanduser().resolve()
        if not png_dir.is_dir():
            # The conversion still proceeds; just make the skipped option visible.
            print(f"[WARN] Slide PNG directory not found, skipping: {png_dir}")
        slide_pngs = include_slide_pngs(png_dir, out_dir)

    md_path = out_dir / (pptx_path.stem + ".md")
    write_markdown(md_path, deck, slide_pngs, include_notes=not args.no_notes)

    if args.json_out:
        (out_dir / "slides_text.json").write_text(json.dumps(deck, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"[OK] Wrote: {md_path}")


# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
