﻿#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Re-encode text files (default: .py, .pyw, .txt, .md, .cfg, .ini) in a directory tree to UTF-8.
Optional backup (--backup) creates a copy <file>.bak before overwriting.
Ignores hidden directories and directories like .git, __pycache__, venv, node_modules.
"""

import argparse
import os
import re
import shutil
import stat
import tempfile
import tokenize
from typing import Optional

# Directory names that are pruned from the tree walk.
SKIP_DIRS = {".git", "__pycache__", "venv", ".venv", "env", "node_modules", "build", "dist"}
# File extensions handled when the user does not pass --exts.
DEFAULT_EXTS = {".py", ".pyw", ".txt", ".md", ".cfg", ".ini"}

# Bytes pattern for an encoding declaration such as "coding: latin-1" or
# "coding=latin-1"; group 1 captures the encoding name.
# NOTE(review): not referenced anywhere in the visible code.
CODING_RE = re.compile(br"coding[:=]\s*([-\w.]+)")

# Encodings tried, in this order, after any declared encoding has been tried.
# "latin-1" maps every byte value, so decoding with it cannot fail;
# "iso-8859-1" is an alias of the same codec and is therefore never reached.
FALLBACK_ENCODINGS = ["utf-8", "cp1252", "latin-1", "iso-8859-1"]


def detect_cookie_encoding(path: str) -> Optional[str]:
    """Return the source encoding that ``tokenize.detect_encoding`` reports.

    The file is opened in binary mode and its first lines are handed to
    ``tokenize.detect_encoding``.  Note that this reports a default
    (``"utf-8"``, or ``"utf-8-sig"`` when a BOM is present) even when the
    file carries no coding cookie at all.

    Args:
        path: File to inspect.

    Returns:
        The detected encoding name, or ``None`` when the file cannot be
        opened or detection raises for any reason.
    """
    try:
        with open(path, "rb") as stream:
            encoding = tokenize.detect_encoding(stream.readline)[0]
    except Exception:
        # Deliberately best-effort: any failure means "unknown".
        return None
    return encoding


def decode_bytes(data: bytes, declared_enc: Optional[str]) -> Optional[str]:
    """Decode *data* with the first encoding that accepts it.

    The declared encoding (when truthy) is attempted first, followed by every
    entry of ``FALLBACK_ENCODINGS`` that differs from it, in list order.

    Args:
        data: Raw file contents.
        declared_enc: Encoding name to try first, or ``None``.

    Returns:
        The decoded text, or ``None`` if every candidate failed.
    """
    preferred = [declared_enc] if declared_enc else []
    candidates = preferred + [name for name in FALLBACK_ENCODINGS if name not in preferred]

    for name in candidates:
        try:
            decoded = data.decode(name)
        except Exception:
            # Covers both undecodable bytes and unknown codec names.
            continue
        return decoded
    return None


# PEP 263 coding cookie: must sit in a comment; group 1 is the encoding name.
_COOKIE_LINE_RE = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# A line holding only whitespace and/or a comment (a cookie on line 2 is only
# honoured when line 1 looks like this).
_BLANK_OR_COMMENT_RE = re.compile(r"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def replace_or_remove_cookie(text: str) -> str:
    """Point an existing PEP 263 coding cookie at ``utf-8``.

    Only the first two lines are inspected, and a cookie is recognised only
    inside a comment.  The second line is considered only when the first one
    is blank or a pure comment (e.g. a shebang).  Just the encoding name is
    substituted, so the rest of the line (shebang, editor modeline syntax)
    and its line ending are preserved.  Despite the name, nothing is ever
    removed.

    Args:
        text: Decoded file contents.

    Returns:
        *text* with the declared encoding replaced by ``utf-8``, or *text*
        unchanged when no cookie is present.
    """
    lines = text.splitlines(True)
    for index, line in enumerate(lines[:2]):
        match = _COOKIE_LINE_RE.match(line)
        if match:
            start, end = match.span(1)
            lines[index] = line[:start] + "utf-8" + line[end:]
            return "".join(lines)
        if not _BLANK_OR_COMMENT_RE.match(line):
            # Real content on line 1: a cookie on line 2 would not count.
            break
    return text


def process_file(path: str, make_backup: bool) -> Optional[str]:
    """Re-encode one file to BOM-less UTF-8 in place and report the outcome.

    The bytes are decoded with the declared/detected encoding (falling back
    to the guesses in ``decode_bytes``), any coding cookie is pointed at
    UTF-8, and the result is written through a temporary file in the same
    directory so the original is replaced in one step.  The original
    permission bits and access/modification times are restored afterwards.

    A file that claims to be UTF-8 is actually decoded before it is reported
    as fine: encoding detection only looks at the first two lines, so the
    claim alone proves nothing about the rest of the file.

    Args:
        path: File to convert.
        make_backup: When true, copy the file to ``<path>.bak`` before
            replacing it.

    Returns:
        One of ``"skipped (binary)"``, ``"ok (already utf-8)"``,
        ``"ok (no change)"``, ``"failed (decode)"``, ``"failed (encode)"``
        or ``"rewritten (utf-8)"``.

    Raises:
        OSError: If the file cannot be read, backed up or replaced.
    """
    utf8_bom = b"\xef\xbb\xbf"

    with open(path, "rb") as f:
        data = f.read()

    # NUL bytes are taken as a sign of binary content.
    if b"\x00" in data:
        return "skipped (binary)"

    has_bom = data.startswith(utf8_bom)
    # Always decode without the BOM so it can never leak into the output.
    payload = data[len(utf8_bom):] if has_bom else data

    declared = detect_cookie_encoding(path)
    declared_is_utf8 = bool(declared) and declared.lower().replace("-", "_") in (
        "utf_8",
        "utf_8_sig",
        "utf8",
    )

    if declared_is_utf8:
        try:
            text = payload.decode("utf-8")
        except UnicodeDecodeError:
            # Claims UTF-8 but is not: fall back to the usual guesses.
            text = decode_bytes(payload, None)
        else:
            if not has_bom:
                return "ok (already utf-8)"
    else:
        text = decode_bytes(payload, declared)

    if text is None:
        return "failed (decode)"

    text = replace_or_remove_cookie(text)

    try:
        new_bytes = text.encode("utf-8")
    except UnicodeEncodeError:
        return "failed (encode)"

    if new_bytes == data and not has_bom:
        return "ok (no change)"

    if make_backup:
        bak = path + ".bak"
        shutil.copy2(path, bak)

    # Capture metadata before the file is replaced.
    st = os.stat(path)
    # Same directory as the target so os.replace stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path))
    os.close(fd)
    try:
        with open(tmp_path, "wb") as outf:
            outf.write(new_bytes)
        os.replace(tmp_path, path)
        os.chmod(path, stat.S_IMODE(st.st_mode))
        os.utime(path, (st.st_atime, st.st_mtime))
    finally:
        # Only present if something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return "rewritten (utf-8)"


def should_process_file(filename: str, exts: set) -> bool:
    """Tell whether *filename* has one of the wanted extensions.

    The comparison ignores case on both sides, so ``{".PY"}`` matches
    ``script.py`` and ``{".py"}`` matches ``SCRIPT.PY``.

    Args:
        filename: File name or path.
        exts: Extensions including the leading dot, e.g. ``{".py", ".txt"}``.

    Returns:
        ``True`` when the file's extension is in *exts*.
    """
    suffix = os.path.splitext(filename)[1].lower()
    return any(suffix == ext.lower() for ext in exts)


def walk_and_reencode(root: str, exts: set, backup: bool, dry_run: bool):
    """Walk *root* and re-encode every matching file, printing one line per file.

    Directories named in ``SKIP_DIRS`` and directories whose name starts with
    a dot are not descended into.  This script itself is never processed.  A
    file that raises an I/O or decoding error is reported and counted as
    failed rather than aborting the whole run.

    Args:
        root: Directory to walk.
        exts: Extensions (with leading dot) of the files to handle.
        backup: Passed to ``process_file``; create ``.bak`` copies.
        dry_run: When true, only list the files that would be processed.

    Returns:
        A dict with the counters ``processed``, ``skipped``, ``failed`` and
        ``changes``.
    """
    summary = {"processed": 0, "skipped": 0, "failed": 0, "changes": 0}
    this_script = os.path.abspath(__file__)
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories.
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS and not d.startswith(".")]
        for fn in filenames:
            if not should_process_file(fn, exts):
                continue
            path = os.path.join(dirpath, fn)
            if os.path.abspath(path) == this_script:
                continue
            summary["processed"] += 1
            if dry_run:
                print(f"[dry] would process: {path}")
                continue
            try:
                result = process_file(path, backup)
            except (OSError, UnicodeError) as exc:
                # One unreadable or locked file must not stop the run.
                result = f"failed ({type(exc).__name__}: {exc})"
            print(f"{path}: {result}")
            if result is None or result.startswith("failed"):
                summary["failed"] += 1
            elif result.startswith("skipped") or result.startswith("ok"):
                summary["skipped"] += 1
            else:
                summary["changes"] += 1
    return summary


def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``path``, ``backup``,
        ``exts`` (``None`` when not given) and ``dry_run``.
    """
    parser = argparse.ArgumentParser(
        prog="reencode_all_to_utf8.py",
        description="Re-encode text files to UTF-8",
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Root directory (default: current).",
    )
    parser.add_argument(
        "--backup",
        action="store_true",
        help="Create .bak copies before overwriting.",
    )
    parser.add_argument(
        "--exts",
        nargs="*",
        default=None,
        help="File extensions e.g. .py .txt (default set used).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without changing files.",
    )
    return parser.parse_args()


def main():
    """Run the command-line tool: parse arguments, convert files, print totals."""
    args = parse_args()
    # An empty or missing --exts list means "use the defaults".
    requested = args.exts or DEFAULT_EXTS
    # Accept extensions given without the leading dot.
    exts = {e if e.startswith(".") else "." + e for e in requested}
    summary = walk_and_reencode(args.path, exts, args.backup, args.dry_run)
    report = [
        "\nSummary:",
        f"  processed: {summary['processed']}",
        f"  changed:   {summary['changes']}",
        f"  skipped:   {summary['skipped']}",
        f"  failed:    {summary['failed']}",
    ]
    print("\n".join(report))


if __name__ == "__main__":
    main()