#!/usr/bin/env python3
"""Redact secrets in backup tree copies.

Design goals:
- Only touch the *backup working tree* (never the live OpenClaw config).
- Be conservative and predictable: key-based redaction for JSON, pattern-based for text.

Usage:
  redact_backup_tree.py --root /path/to/backup_repo
"""

from __future__ import annotations

import argparse
import json
import os
import re
from pathlib import Path
from typing import Any, Iterable

# Placeholder written in place of every secret value that gets redacted.
REDACTED = "[REDACTED]"

# ---- JSON redaction (key-path based) ----
# Each entry is the chain of object keys leading from the document root to one
# secret value.
JSON_KEY_PATHS: list[tuple[str, ...]] = [
    ("gateway", "auth", "token"),
    ("gateway", "remote", "token"),
]

# Exact env-style key names (anchored at both ends) that are also redacted
# when they show up as keys inside JSON objects.
ENV_KEY_RE = re.compile(
    r"^(?:"
    r"OPENCLAW_GATEWAY_TOKEN"
    r"|OPENAI_API_KEY"
    r"|ANTHROPIC_API_KEY"
    r"|GEMINI_API_KEY"
    r"|GOOGLE_API_KEY"
    r"|COHERE_API_KEY"
    r"|HUGGINGFACEHUB_API_TOKEN"
    r")$"
)


def _set_path(obj: Any, path: tuple[str, ...], value: Any) -> bool:
    cur = obj
    for k in path[:-1]:
        if not isinstance(cur, dict) or k not in cur:
            return False
        cur = cur[k]
    last = path[-1]
    if isinstance(cur, dict) and last in cur and cur[last] not in (None, ""):
        cur[last] = value
        return True
    return False


def redact_openclaw_backup_json(p: Path) -> dict[str, Any]:
    """Redact known secrets in a JSON file and rewrite it in place.

    Two passes run over the parsed document: first the fixed key paths in
    ``JSON_KEY_PATHS``, then every dict (at any depth) is scanned for keys
    matching ``ENV_KEY_RE`` whose value is a non-empty string that is not
    already the placeholder.

    The file is always rewritten as UTF-8 JSON with two-space indentation,
    sorted keys and a trailing newline.

    Args:
        p: Path of the JSON file to process.

    Returns:
        A report dict with ``file``, ``json_paths_redacted`` and
        ``env_keys_redacted``.
    """
    document = json.loads(p.read_text(encoding="utf-8"))

    # Pass 1: fixed key paths.
    path_hits = 0
    for key_path in JSON_KEY_PATHS:
        if _set_path(document, key_path, REDACTED):
            path_hits += 1

    # Pass 2: visit every container with an explicit work-list and redact
    # env-style keys found in dicts.
    env_hits = 0
    pending: list[Any] = [document]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for key, val in list(node.items()):
            is_secret = (
                isinstance(key, str)
                and ENV_KEY_RE.match(key)
                and isinstance(val, str)
                and val
                and val != REDACTED
            )
            if is_secret:
                node[key] = REDACTED
                env_hits += 1
            else:
                # Anything not redacted may itself contain nested secrets.
                pending.append(val)

    serialized = json.dumps(document, ensure_ascii=False, indent=2, sort_keys=True)
    p.write_text(serialized + "\n", encoding="utf-8")

    return {
        "file": str(p),
        "json_paths_redacted": path_hits,
        "env_keys_redacted": env_hits,
    }


# ---- Text redaction (pattern based) ----

# KEY=VALUE patterns for env/service/unit/log text.
# - key: an upper-case name ending in TOKEN/API_KEY/SECRET/PASSWORD; the prefix
#   is optional so bare names such as ``PASSWORD=...`` are covered too.
# - val: either a non-empty quoted string on one line (so ``KEY="a b"`` is
#   matched in full, quotes included) or an unquoted run that stops at the
#   next whitespace or quote character.
LINE_KV_RE = re.compile(
    r"(?P<key>(?:[A-Z0-9_]*(?:TOKEN|API_KEY|SECRET|PASSWORD)|OPENCLAW_GATEWAY_TOKEN))"
    r"\s*=\s*"
    r"(?P<val>\"[^\"\r\n]+\"|'[^'\r\n]+'|[^\s\"']+)"
)

# Token-like substrings that often appear in logs/memos
TOKEN_SUBSTRING_RES: list[re.Pattern[str]] = [
    re.compile(r"\bghp_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
    re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"),  # Slack
    re.compile(r"\bsk-[A-Za-z0-9]{20,}\b"),  # OpenAI (legacy-ish)
    re.compile(r"\b(?:AIza|ya29\.)[A-Za-z0-9_\-]{10,}\b"),  # Google-ish
]


def redact_text_file(p: Path) -> dict[str, Any]:
    """Redact secrets in one UTF-8 text file, rewriting it only if needed.

    Every ``LINE_KV_RE`` match is replaced by ``<key>=[REDACTED]`` and every
    match of a ``TOKEN_SUBSTRING_RES`` pattern by the placeholder alone.

    The file is read and written with ``newline=""`` so existing line endings
    (LF or CRLF) are kept exactly as they were; the default universal-newline
    handling would rewrite them whenever a redaction triggers a write.

    Args:
        p: Path of the file to process.

    Returns:
        ``{"file": ..., "skipped": "unicode"}`` when the file is not valid
        UTF-8, otherwise ``{"file": ..., "changed": bool}``.
    """
    try:
        with p.open(encoding="utf-8", newline="") as fh:
            original = fh.read()
    except UnicodeDecodeError:
        # Skip binary-ish files.
        return {"file": str(p), "skipped": "unicode"}

    # Redact KEY=VALUE occurrences, keeping the key so the line stays readable.
    def repl(m: re.Match[str]) -> str:
        return f"{m.group('key')}={REDACTED}"

    text = LINE_KV_RE.sub(repl, original)

    # Redact token-like substrings
    for pattern in TOKEN_SUBSTRING_RES:
        text = pattern.sub(REDACTED, text)

    if text == original:
        return {"file": str(p), "changed": False}

    with p.open("w", encoding="utf-8", newline="") as fh:
        fh.write(text)
    return {"file": str(p), "changed": True}


def iter_candidate_text_files(root: Path) -> Iterable[Path]:
    """Yield regular files under ``root`` that qualify for text redaction.

    A file qualifies when its suffix, compared case-insensitively, is in the
    allow-list, or when its whole name is one of the allowed suffixes (for
    example a file called ``.env``).

    Args:
        root: Directory to search recursively.

    Yields:
        Paths of matching files, in the order ``Path.rglob`` produces them.
    """
    # Keep scope intentionally limited; expand as needed.
    allow_suffix = {".txt", ".log", ".md", ".service", ".env"}
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        # pathlib reports an empty suffix for a dotfile such as ".env" (the
        # whole name is the stem), so the full name is checked as well.
        if p.suffix.lower() in allow_suffix or p.name.lower() in allow_suffix:
            yield p


def main() -> None:
    """Command-line entry point: redact a backup tree and print a JSON summary.

    Parses ``--root``, redacts ``openclaw_backup.json`` directly under the
    root when present, then runs text redaction over every candidate file
    outside ``.git`` directories. The summary printed to stdout contains the
    resolved root, at most 50 per-file reports and the number of text files
    changed.

    Raises:
        SystemExit: If the root does not exist or is not a directory.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True)
    args = ap.parse_args()

    root = Path(args.root).expanduser().resolve()
    if not root.exists():
        raise SystemExit(f"Root not found: {root}")
    if not root.is_dir():
        # A file passed as root would otherwise "succeed" without redacting
        # anything.
        raise SystemExit(f"Root is not a directory: {root}")

    reports: list[dict[str, Any]] = []

    ocb = root / "openclaw_backup.json"
    if ocb.exists():
        reports.append(redact_openclaw_backup_json(ocb))

    # Text pass (logs/memos/etc.)
    changed_count = 0
    for p in iter_candidate_text_files(root):
        # Avoid touching git internals if something weird got copied. Only
        # components below the root count, so a root that itself lives under
        # a ".git" directory is still processed.
        if ".git" in p.relative_to(root).parts:
            continue
        rep = redact_text_file(p)
        if rep.get("changed"):
            changed_count += 1
        reports.append(rep)

    summary = {
        "root": str(root),
        "reports": reports[:50],  # cap
        "text_files_changed": changed_count,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
