Files
tg_resume_db/extract/llm.py
2026-03-11 15:27:10 +03:00

586 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
import httpx # type: ignore
except Exception: # pragma: no cover
httpx = None # type: ignore
def resolve_llm_runtime() -> Dict[str, str]:
    """
    Resolve the OpenAI-compatible runtime configuration from the environment.

    Two families of variables are honoured:
      - generic: LLM_PROVIDER / LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
      - Mistral aliases: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL

    Returns a dict with keys: "provider", "base_url", "model", "api_key".
    """

    def _env(name: str, default: str = "") -> str:
        # Treat unset and empty-string vars identically, then strip whitespace.
        return (os.environ.get(name) or default).strip()

    provider = _env("LLM_PROVIDER").lower()
    base_url = _env("LLM_BASE_URL")
    model = _env("LLM_MODEL")
    api_key = _env("LLM_API_KEY")

    mistral_key = _env("MISTRAL_API_KEY")
    mistral_model = _env("MISTRAL_MODEL")
    mistral_base = _env("MISTRAL_BASE_URL", "https://api.mistral.ai/v1")

    # Fall back to the Mistral aliases wherever the generic vars are unset.
    api_key = api_key or mistral_key
    model = model or mistral_model
    mistral_hinted = (
        mistral_key
        or mistral_model
        or provider == "mistral"
        or os.environ.get("MISTRAL_BASE_URL")
    )
    if not base_url and mistral_hinted:
        base_url = mistral_base

    if base_url:
        base_url = base_url.rstrip("/")

    # Infer the provider label when it was not given explicitly.
    if not provider:
        looks_mistral = "mistral.ai" in base_url or (model and model.lower().startswith("mistral"))
        provider = "mistral" if looks_mistral else "generic"

    return {
        "provider": provider,
        "base_url": base_url,
        "model": model,
        "api_key": api_key,
    }
# ------------- Public API -------------
def llm_parse_enabled() -> bool:
    """
    Report whether LLM-based parsing can run.

    True only when httpx is importable and both base_url and model resolve
    from the environment.  Opt out explicitly with LLM_PARSE_ENABLED=0
    (also accepts "false"/"no"); the flag defaults to enabled.
    """
    if httpx is None:
        return False
    flag = os.environ.get("LLM_PARSE_ENABLED", "1").lower()
    if flag in ("0", "false", "no"):
        return False
    rt = resolve_llm_runtime()
    return bool(rt["base_url"]) and bool(rt["model"])
# Version tags baked into cache keys: bump whenever the corresponding prompt
# template changes so old cached results are not reused for the new prompt.
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
@dataclass
class LLMExtraction:
    """Normalized structured fields extracted from a resume by the LLM."""

    roles: List[str]
    skills: List[str]
    primary_languages: List[str]
    seniority: Optional[str]
    backend_focus: Optional[bool]
    experience_years_total: Optional[float]
    experience_years_engineering: Optional[float]
    english_level: Optional[str]
    location: Optional[str]
    remote_ok: Optional[bool]
    salary_min_usd: Optional[int]
    salary_max_usd: Optional[int]
    salary_min_rub: Optional[int]
    salary_max_rub: Optional[int]
    highlights: List[str]
    keywords: List[str]

    @staticmethod
    def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
        """Build an LLMExtraction from a raw (possibly messy) JSON object,
        coercing every field to its canonical type and dropping blanks."""

        def _strings(value: Any) -> List[str]:
            # Scalars become a one-element list; blank entries are dropped.
            if value is None:
                return []
            if not isinstance(value, list):
                value = [value]
            return [text for text in (str(item).strip() for item in value) if text]

        def _floaty(value: Any) -> Optional[float]:
            try:
                return float(value)
            except Exception:
                return None

        def _inty(value: Any) -> Optional[int]:
            # Via float first so numeric strings like "1200.7" truncate cleanly.
            try:
                return int(float(value))
            except Exception:
                return None

        def _booly(value: Any) -> Optional[bool]:
            if isinstance(value, bool):
                return value
            if value is None:
                return None
            token = str(value).strip().lower()
            if token in ("true", "1", "yes", "y"):
                return True
            if token in ("false", "0", "no", "n"):
                return False
            return None

        raw_seniority = obj.get("seniority")
        raw_english = obj.get("english_level")
        raw_location = obj.get("location")

        return LLMExtraction(
            roles=_strings(obj.get("roles")),
            skills=_strings(obj.get("skills")),
            primary_languages=_strings(obj.get("primary_languages")),
            seniority=(str(raw_seniority).strip().lower() or None) if raw_seniority else None,
            backend_focus=_booly(obj.get("backend_focus")),
            experience_years_total=_floaty(obj.get("experience_years_total")),
            experience_years_engineering=_floaty(obj.get("experience_years_engineering")),
            english_level=(str(raw_english).strip().upper() or None) if raw_english else None,
            location=(str(raw_location).strip() or None) if raw_location else None,
            remote_ok=_booly(obj.get("remote_ok")),
            salary_min_usd=_inty(obj.get("salary_min_usd")),
            salary_max_usd=_inty(obj.get("salary_max_usd")),
            salary_min_rub=_inty(obj.get("salary_min_rub")),
            salary_max_rub=_inty(obj.get("salary_max_rub")),
            highlights=_strings(obj.get("highlights")),
            keywords=_strings(obj.get("keywords")),
        )
def llm_extract_profile(
    clean_text: str,
    *,
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Extract a structured profile from resume text via the LLM.

    Returns (LLMExtraction | None, debug_info).  Results are cached
    (sqlite and/or disk) keyed by text hash + model + prompt version to
    keep throughput high; any failure degrades silently to None with the
    reason recorded in debug_info["error"].
    """
    runtime = resolve_llm_runtime()
    enabled = llm_parse_enabled()
    dbg: Dict[str, Any] = {
        "enabled": enabled,
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _PROMPT_VERSION,
    }
    if not enabled:
        return None, dbg

    digest = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"extract:{digest}:{runtime['model']}:{_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
        max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
        system_prompt="You output ONLY JSON for structured resume extraction.",
        prompt_template=_PROMPT_TEMPLATE,
    )
    raw = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if raw is None:
        return None, dbg
    return LLMExtraction.from_obj(raw), dbg
def llm_review_profile(
    clean_text: str,
    *,
    draft: Dict[str, Any],
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Second-pass validator over an already-parsed draft.

    Sends the resume text plus the sanitized draft JSON back to the LLM,
    asking it to re-check every field against the text.  Returns a
    corrected extraction (for safe merging in the pipeline) plus debug
    info carrying quality_score / changed_fields / issues_found.
    """
    runtime = resolve_llm_runtime()
    enabled = llm_parse_enabled()
    dbg: Dict[str, Any] = {
        "enabled": enabled,
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _REVIEW_PROMPT_VERSION,
        "quality_score": None,
        "changed_fields": [],
        "issues_found": [],
    }
    if not enabled:
        return None, dbg

    clean_draft = _sanitize_review_draft(draft)
    # sort_keys keeps the blob (and therefore the cache key) deterministic.
    draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
    text_digest = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    draft_digest = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"review:{text_digest}:{draft_digest}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_REVIEW_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
        max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
        system_prompt="You output ONLY JSON for resume parsing quality review.",
        prompt_template=_REVIEW_PROMPT_TEMPLATE,
        extra_vars={"draft_json": draft_blob},
    )
    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg

    # The model may return {"corrected": {...}, ...} or the bare object.
    corrected = data["corrected"] if isinstance(data.get("corrected"), dict) else data
    dbg["quality_score"] = _as_float(data.get("quality_score"))
    dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
    dbg["issues_found"] = _as_str_list(data.get("issues_found"))
    return LLMExtraction.from_obj(corrected), dbg
# ------------- Internal helpers -------------
# Extraction-pass prompt (Russian — the target resume corpus is Russian).
# Rendered via str.format with {doc_type}, {sections_block} and {resume_text};
# literal braces in the JSON schema are doubled ({{ }}) to survive .format().
_PROMPT_TEMPLATE = """
Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON.
Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список.
Схема:
{{
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
"skills": ["python","go","k8s","postgres","react", "..."],
"primary_languages": ["python","go","java","c++", "..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["кратко достижения (1-2 предложения)"],
"keywords": ["уникальные ключевые слова, продукты или домены"]
}}
Не включай контактные данные в skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
# Review-pass prompt: additionally receives the draft JSON via {draft_json}
# and must return {"corrected": ..., "changed_fields": ..., "issues_found": ...,
# "quality_score": ...} — the shape llm_review_profile unpacks.
_REVIEW_PROMPT_TEMPLATE = """
Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON.
У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме.
Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать.
Верни JSON строго такой формы:
{{
"corrected": {{
"roles": ["..."],
"skills": ["..."],
"primary_languages": ["..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["..."],
"keywords": ["..."]
}},
"changed_fields": ["field_name", "..."],
"issues_found": ["кратко что было неверно/сомнительно", "..."],
"quality_score": 0.0
}}
Черновик JSON:
```DRAFT
{draft_json}
```
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
def _trim_text(text: str, max_len: int = 9000) -> str:
"""
Keep head and tail to preserve summary + recent projects.
"""
if len(text) <= max_len:
return text
head = text[: max_len // 2]
tail = text[-max_len // 2 :]
return head + "\n...\n" + tail
def _build_payload(
    clean_text: str,
    *,
    doc_type: Optional[str],
    sections: Optional[Dict[str, str]],
    prompt_version: str,
    temperature: float,
    max_tokens: int,
    system_prompt: str,
    prompt_template: str,
    extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Assemble the request task consumed by _llm_call_json.

    Renders *prompt_template* with the trimmed resume text, doc type and
    section snippets (plus any *extra_vars*, e.g. the review draft), and
    bundles it with the resolved endpoint, auth headers and an
    OpenAI-style chat-completion payload.
    """
    runtime = resolve_llm_runtime()

    fill: Dict[str, Any] = {
        "resume_text": _trim_text(clean_text),
        "doc_type": doc_type or "unknown",
        "sections_block": _build_sections_block(sections) or "(no sections detected)",
    }
    fill.update(extra_vars or {})

    chat_body = {
        "model": runtime["model"],
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_template.format(**fill)},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    return {
        "base_url": runtime["base_url"],
        "model": runtime["model"],
        "prompt_version": prompt_version,
        "payload": chat_body,
        "headers": _build_headers(runtime),
        "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
    }
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
headers = {"Content-Type": "application/json"}
api_key = runtime.get("api_key", "")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
return headers
def _cached_llm_json_call(
    *,
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    payload: Dict[str, Any],
    dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """
    Call the LLM with two read-through cache layers: sqlite, then disk.

    On a cache hit, sets dbg["from_cache"]/dbg["cache_backend"] and returns
    the cached object.  On an LLM failure, sets dbg["error"] and returns
    None.  Cache *writes* are strictly best-effort: a failure to persist
    never discards a successful result.

    BUGFIX: previously the sqlite/disk cache writes sat inside the same
    try block as the network call, so e.g. a read-only disk made
    cache_path.write_text raise and a *successful* LLM response was thrown
    away (returned None).  Writes are now isolated from the call.
    """
    # Layer 1: sqlite cache (note: empty dicts are treated as a miss).
    data = _cache_get_sqlite(con, cache_key)
    if data:
        dbg["from_cache"] = True
        dbg["cache_backend"] = "sqlite"
        return data

    # Layer 2: disk cache; degrade silently if the directory is unusable.
    cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
    cache_path: Optional[Path] = None
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_path = cache_dir / (cache_key.replace(":", "_") + ".json")
    except Exception:
        cache_path = None
    if cache_path and cache_path.exists():
        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            dbg["from_cache"] = True
            dbg["cache_backend"] = "disk"
            return data
        except Exception:
            pass  # corrupt cache file: fall through to a fresh call

    # Actual LLM round trip — the only failure that should yield None.
    try:
        data = _llm_call_json(payload)
    except Exception as e:  # pragma: no cover - network/LLM failures
        dbg["error"] = repr(e)
        return None

    # Best-effort persistence; never let a cache failure lose the result.
    try:
        if con:
            _cache_put_sqlite(con, cache_key, model, data)
        if cache_path:
            cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    except Exception:
        pass
    return data
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
    """
    POST an OpenAI-style chat completion and parse JSON out of the reply.

    Raises RuntimeError when httpx is unavailable, an httpx HTTP error on
    non-2xx responses, and ValueError when the reply holds no JSON object.
    """
    if httpx is None:
        raise RuntimeError("httpx is not installed")

    base_url: str = task["base_url"]
    body: Dict[str, Any] = task["payload"]
    timeout = float(task.get("timeout", 18.0))

    with httpx.Client(timeout=timeout) as client:
        resp = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=body)
        resp.raise_for_status()
        reply = resp.json()

    content = reply["choices"][0]["message"]["content"]
    # Some providers return content as a list of typed blocks; flatten it.
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, dict):
                pieces.append(str(part.get("text") or ""))
            else:
                pieces.append(str(part))
        content = "\n".join(pieces)
    content = str(content)

    # Grab the outermost {...} span so stray prose around the JSON is ignored.
    match = re.search(r"\{.*\}", content, flags=re.S)
    if not match:
        raise ValueError("LLM did not return JSON")
    return json.loads(match.group(0))
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
    """
    Render detected resume sections as labelled snippets for the prompt.

    Sections are emitted in a fixed order (about, skills, experience,
    education, contacts), each trimmed to keep the prompt size bounded.
    Returns "" when nothing is available.
    """
    if not sections:
        return ""
    ordered_labels = (
        ("about", "ABOUT"),
        ("skills", "SKILLS"),
        ("experience", "EXPERIENCE"),
        ("education", "EDUCATION"),
        ("contacts", "CONTACTS"),
    )
    chunks = [
        f"[{label}]\n{_trim_text(sections[key], max_len=1800)}"
        for key, label in ordered_labels
        if sections.get(key)
    ]
    return "\n\n".join(chunks)
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a draft dict before embedding it in the review prompt.

    Drops unknown keys, then round-trips the remainder through
    LLMExtraction.from_obj so every field is coerced to its canonical type
    and the output always has the full, predictable schema.
    """
    source = draft if isinstance(draft, dict) else {}
    allowed = {
        "roles",
        "skills",
        "primary_languages",
        "seniority",
        "backend_focus",
        "experience_years_total",
        "experience_years_engineering",
        "english_level",
        "location",
        "remote_ok",
        "salary_min_usd",
        "salary_max_usd",
        "salary_min_rub",
        "salary_max_rub",
        "highlights",
        "keywords",
    }
    filtered = {key: value for key, value in source.items() if key in allowed}
    return asdict(LLMExtraction.from_obj(filtered))
def _as_float(v: Any) -> Optional[float]:
try:
x = float(v)
except Exception:
return None
if x < 0:
return None
if x > 1.0:
return 1.0
return x
def _as_str_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
s = str(v).strip()
return [s] if s else []
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
if con is None:
return None
try:
row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
if row and row["result_json"]:
return json.loads(row["result_json"])
except Exception:
return None
return None
def _cache_put_sqlite(
con: Optional[sqlite3.Connection],
cache_key: str,
model: str,
data: Dict[str, Any],
) -> None:
if con is None:
return
try:
con.execute(
"INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
(cache_key, model, json.dumps(data, ensure_ascii=False)),
)
except Exception:
return