# LLM-backed resume extraction and review helpers (OpenAI-compatible chat API).
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import json
|
||
import os
|
||
import re
|
||
import sqlite3
|
||
from dataclasses import asdict, dataclass
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
try:
|
||
import httpx # type: ignore
|
||
except Exception: # pragma: no cover
|
||
httpx = None # type: ignore
|
||
|
||
|
||
def resolve_llm_runtime() -> Dict[str, str]:
    """Resolve the OpenAI-compatible runtime configuration from env vars.

    Generic variables (LLM_BASE_URL / LLM_MODEL / LLM_API_KEY) win; Mistral
    aliases (MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL) fill any
    gaps.  Returns a dict with "provider", "base_url", "model", "api_key".
    """

    def _env(name: str, default: str = "") -> str:
        # Treat unset and empty the same way; always strip whitespace.
        return (os.environ.get(name) or default).strip()

    provider = _env("LLM_PROVIDER").lower()
    base_url = _env("LLM_BASE_URL")
    model = _env("LLM_MODEL")
    api_key = _env("LLM_API_KEY")

    mistral_key = _env("MISTRAL_API_KEY")
    mistral_model = _env("MISTRAL_MODEL")
    mistral_base = _env("MISTRAL_BASE_URL", "https://api.mistral.ai/v1")

    # Fall back to the Mistral aliases for anything the generic vars left empty.
    api_key = api_key or mistral_key
    model = model or mistral_model
    mistral_hinted = bool(
        mistral_key
        or mistral_model
        or provider == "mistral"
        or os.environ.get("MISTRAL_BASE_URL")
    )
    if not base_url and mistral_hinted:
        base_url = mistral_base

    if base_url:
        base_url = base_url.rstrip("/")

    # Infer the provider when it was not set explicitly.
    if not provider:
        looks_mistral = "mistral.ai" in base_url or (model and model.lower().startswith("mistral"))
        provider = "mistral" if looks_mistral else "generic"

    return {
        "provider": provider,
        "base_url": base_url,
        "model": model,
        "api_key": api_key,
    }
|
||
|
||
|
||
# ------------- Public API -------------
|
||
|
||
def llm_parse_enabled() -> bool:
    """Report whether LLM-based parsing can run.

    Requires httpx plus a resolved base_url and model; LLM_PARSE_ENABLED=0
    (or "false" / "no") opts out explicitly.
    """
    if httpx is None:
        return False
    flag = os.environ.get("LLM_PARSE_ENABLED", "1").lower()
    if flag in ("0", "false", "no"):
        return False
    cfg = resolve_llm_runtime()
    return bool(cfg["base_url"] and cfg["model"])
|
||
|
||
|
||
# Prompt versions participate in the cache keys (see llm_extract_profile /
# llm_review_profile), so bumping them invalidates previously cached responses.
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
|
||
|
||
|
||
@dataclass
class LLMExtraction:
    """Normalized result of an LLM extraction/review pass over a resume."""

    roles: List[str]
    skills: List[str]
    primary_languages: List[str]
    seniority: Optional[str]
    backend_focus: Optional[bool]
    experience_years_total: Optional[float]
    experience_years_engineering: Optional[float]
    english_level: Optional[str]
    location: Optional[str]
    remote_ok: Optional[bool]
    salary_min_usd: Optional[int]
    salary_max_usd: Optional[int]
    salary_min_rub: Optional[int]
    salary_max_rub: Optional[int]
    highlights: List[str]
    keywords: List[str]

    @staticmethod
    def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
        """Build an LLMExtraction from a raw JSON object, coercing loose types.

        Every converter degrades to None / [] instead of raising, because the
        input comes straight from model output.
        """

        def to_list(value: Any) -> List[str]:
            # Lists are cleaned element-wise; a bare scalar becomes a one-item list.
            if value is None:
                return []
            if isinstance(value, list):
                return [str(item).strip() for item in value if str(item).strip()]
            text = str(value).strip()
            return [text] if text else []

        def to_float(value: Any) -> Optional[float]:
            try:
                return float(value)
            except Exception:
                return None

        def to_int(value: Any) -> Optional[int]:
            # Route through float so "85000.0"-style strings still parse.
            try:
                return int(float(value))
            except Exception:
                return None

        def to_bool(value: Any) -> Optional[bool]:
            if isinstance(value, bool):
                return value
            if value is None:
                return None
            token = str(value).strip().lower()
            if token in ("true", "1", "yes", "y"):
                return True
            if token in ("false", "0", "no", "n"):
                return False
            return None

        def norm_text(value: Any, transform=None) -> Optional[str]:
            # Falsy input -> None; otherwise strip, optionally case-fold,
            # and map an empty result back to None.
            if not value:
                return None
            text = str(value).strip()
            if transform is not None:
                text = transform(text)
            return text or None

        return LLMExtraction(
            roles=to_list(obj.get("roles")),
            skills=to_list(obj.get("skills")),
            primary_languages=to_list(obj.get("primary_languages")),
            seniority=norm_text(obj.get("seniority"), str.lower),
            backend_focus=to_bool(obj.get("backend_focus")),
            experience_years_total=to_float(obj.get("experience_years_total")),
            experience_years_engineering=to_float(obj.get("experience_years_engineering")),
            english_level=norm_text(obj.get("english_level"), str.upper),
            location=norm_text(obj.get("location")),
            remote_ok=to_bool(obj.get("remote_ok")),
            salary_min_usd=to_int(obj.get("salary_min_usd")),
            salary_max_usd=to_int(obj.get("salary_max_usd")),
            salary_min_rub=to_int(obj.get("salary_min_rub")),
            salary_max_rub=to_int(obj.get("salary_max_rub")),
            highlights=to_list(obj.get("highlights")),
            keywords=to_list(obj.get("keywords")),
        )
|
||
|
||
|
||
def llm_extract_profile(
    clean_text: str,
    *,
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """First-pass structured extraction over cleaned resume text.

    Returns (LLMExtraction | None, debug_info).  Uses the sqlite/disk cache
    via _cached_llm_json_call and silently degrades to None on any failure.
    """
    runtime = resolve_llm_runtime()
    enabled = llm_parse_enabled()
    dbg: Dict[str, Any] = {
        "enabled": enabled,
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _PROMPT_VERSION,
    }
    if not enabled:
        return None, dbg

    # The cache key covers text, model and prompt version so a change to any
    # of them forces a fresh call.
    digest = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"extract:{digest}:{runtime['model']}:{_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
        max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
        system_prompt="You output ONLY JSON for structured resume extraction.",
        prompt_template=_PROMPT_TEMPLATE,
    )

    result = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if result is None:
        return None, dbg
    return LLMExtraction.from_obj(result), dbg
|
||
|
||
|
||
def llm_review_profile(
    clean_text: str,
    *,
    draft: Dict[str, Any],
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """Second-pass validator over an already-parsed draft.

    Re-checks every field of the draft against the resume text and returns a
    corrected extraction for safe merge in the pipeline, plus debug info
    (quality_score, changed_fields, issues_found).
    """
    runtime = resolve_llm_runtime()
    enabled = llm_parse_enabled()
    dbg: Dict[str, Any] = {
        "enabled": enabled,
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _REVIEW_PROMPT_VERSION,
        "quality_score": None,
        "changed_fields": [],
        "issues_found": [],
    }
    if not enabled:
        return None, dbg

    # Key the cache on both the text and the (sanitized, canonically sorted)
    # draft, so either one changing triggers a fresh review.
    clean_draft = _sanitize_review_draft(draft)
    draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_REVIEW_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
        max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
        system_prompt="You output ONLY JSON for resume parsing quality review.",
        prompt_template=_REVIEW_PROMPT_TEMPLATE,
        extra_vars={"draft_json": draft_blob},
    )

    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg

    # The model should wrap fixes in "corrected", but tolerate a flat object.
    corrected = data["corrected"] if isinstance(data.get("corrected"), dict) else data

    dbg["quality_score"] = _as_float(data.get("quality_score"))
    dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
    dbg["issues_found"] = _as_str_list(data.get("issues_found"))

    return LLMExtraction.from_obj(corrected), dbg
|
||
|
||
|
||
# ------------- Internal helpers -------------
|
||
|
||
_PROMPT_TEMPLATE = """
|
||
Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON.
|
||
Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список.
|
||
Схема:
|
||
{{
|
||
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
|
||
"skills": ["python","go","k8s","postgres","react", "..."],
|
||
"primary_languages": ["python","go","java","c++", "..."],
|
||
"seniority": "intern|junior|middle|senior|lead|principal|null",
|
||
"backend_focus": true|false|null,
|
||
"experience_years_total": number|null,
|
||
"experience_years_engineering": number|null,
|
||
"english_level": "A1|A2|B1|B2|C1|C2|null",
|
||
"location": "city, country|null",
|
||
"remote_ok": true|false|null,
|
||
"salary_min_usd": int|null,
|
||
"salary_max_usd": int|null,
|
||
"salary_min_rub": int|null,
|
||
"salary_max_rub": int|null,
|
||
"highlights": ["кратко достижения (1-2 предложения)"],
|
||
"keywords": ["уникальные ключевые слова, продукты или домены"]
|
||
}}
|
||
Не включай контактные данные в skills/keywords.
|
||
Detected doc_type: {doc_type}
|
||
Sections (if present):
|
||
{sections_block}
|
||
|
||
Full text snippet (use only if needed):
|
||
```TEXT
|
||
{resume_text}
|
||
```
|
||
"""
|
||
|
||
_REVIEW_PROMPT_TEMPLATE = """
|
||
Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON.
|
||
У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме.
|
||
Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать.
|
||
|
||
Верни JSON строго такой формы:
|
||
{{
|
||
"corrected": {{
|
||
"roles": ["..."],
|
||
"skills": ["..."],
|
||
"primary_languages": ["..."],
|
||
"seniority": "intern|junior|middle|senior|lead|principal|null",
|
||
"backend_focus": true|false|null,
|
||
"experience_years_total": number|null,
|
||
"experience_years_engineering": number|null,
|
||
"english_level": "A1|A2|B1|B2|C1|C2|null",
|
||
"location": "city, country|null",
|
||
"remote_ok": true|false|null,
|
||
"salary_min_usd": int|null,
|
||
"salary_max_usd": int|null,
|
||
"salary_min_rub": int|null,
|
||
"salary_max_rub": int|null,
|
||
"highlights": ["..."],
|
||
"keywords": ["..."]
|
||
}},
|
||
"changed_fields": ["field_name", "..."],
|
||
"issues_found": ["кратко что было неверно/сомнительно", "..."],
|
||
"quality_score": 0.0
|
||
}}
|
||
|
||
Черновик JSON:
|
||
```DRAFT
|
||
{draft_json}
|
||
```
|
||
|
||
Detected doc_type: {doc_type}
|
||
Sections (if present):
|
||
{sections_block}
|
||
|
||
Full text snippet (use only if needed):
|
||
```TEXT
|
||
{resume_text}
|
||
```
|
||
"""
|
||
|
||
|
||
def _trim_text(text: str, max_len: int = 9000) -> str:
|
||
"""
|
||
Keep head and tail to preserve summary + recent projects.
|
||
"""
|
||
if len(text) <= max_len:
|
||
return text
|
||
head = text[: max_len // 2]
|
||
tail = text[-max_len // 2 :]
|
||
return head + "\n...\n" + tail
|
||
|
||
|
||
def _build_payload(
    clean_text: str,
    *,
    doc_type: Optional[str],
    sections: Optional[Dict[str, str]],
    prompt_version: str,
    temperature: float,
    max_tokens: int,
    system_prompt: str,
    prompt_template: str,
    extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble a chat-completions request plus its routing metadata.

    The returned dict carries base_url / headers / timeout for the HTTP call
    and the OpenAI-style request body under "payload".  extra_vars feed extra
    template placeholders (e.g. {draft_json} for the review pass).
    """
    runtime = resolve_llm_runtime()

    variables: Dict[str, Any] = {
        "resume_text": _trim_text(clean_text),
        "doc_type": doc_type or "unknown",
        "sections_block": _build_sections_block(sections) or "(no sections detected)",
    }
    variables.update(extra_vars or {})

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_template.format(**variables)},
    ]

    return {
        "base_url": runtime["base_url"],
        "model": runtime["model"],
        "prompt_version": prompt_version,
        "payload": {
            "model": runtime["model"],
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        },
        "headers": _build_headers(runtime),
        "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
    }
|
||
|
||
|
||
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
|
||
headers = {"Content-Type": "application/json"}
|
||
api_key = runtime.get("api_key", "")
|
||
if api_key:
|
||
headers["Authorization"] = f"Bearer {api_key}"
|
||
return headers
|
||
|
||
|
||
def _cached_llm_json_call(
    *,
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    payload: Dict[str, Any],
    dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Run an LLM JSON call behind a two-tier cache (sqlite, then disk).

    Lookup order: sqlite row -> on-disk JSON file -> live HTTP call.  A live
    result is written back to both tiers (best effort).  Returns the parsed
    JSON dict, or None on any call failure; dbg is mutated in place with
    from_cache / cache_backend / error.
    """
    # Tier 1: sqlite-backed cache (shared by anything using the same DB file).
    data = _cache_get_sqlite(con, cache_key)
    if data:
        dbg["from_cache"] = True
        dbg["cache_backend"] = "sqlite"
        return data

    # Tier 2: per-key JSON files under LLM_PARSE_CACHE (default .cache/llm_parse).
    cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
    cache_ok = True
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
    except Exception:
        # Unwritable cache dir: skip the disk tier rather than failing the call.
        cache_ok = False

    # ":" is not filesystem-safe on all platforms; flatten it for the filename.
    safe_name = cache_key.replace(":", "_")
    cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None

    if cache_path and cache_path.exists():
        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            dbg["from_cache"] = True
            dbg["cache_backend"] = "disk"
            return data
        except Exception:
            # Corrupt/unreadable cache file: fall through to a live call.
            pass

    try:
        data = _llm_call_json(payload)
        # Write-through to both cache tiers; failures here surface as an error.
        if con:
            _cache_put_sqlite(con, cache_key, model, data)
        if cache_path:
            cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
        return data
    except Exception as e:  # pragma: no cover - network/LLM failures
        dbg["error"] = repr(e)
        return None
|
||
|
||
|
||
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
    """POST a chat-completions request and parse the first JSON object found.

    Raises RuntimeError when httpx is unavailable, ValueError when the model
    response contains no JSON object; httpx/JSON errors propagate to the
    caller (handled in _cached_llm_json_call).
    """
    if httpx is None:
        raise RuntimeError("httpx is not installed")

    base_url: str = task["base_url"]
    body: Dict[str, Any] = task["payload"]
    timeout = float(task.get("timeout", 18.0))

    with httpx.Client(timeout=timeout) as client:
        resp = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=body)
        resp.raise_for_status()
        data = resp.json()

    content = data["choices"][0]["message"]["content"]
    if isinstance(content, list):
        # Some providers return a list of content blocks; flatten their text.
        chunks = [
            str(block.get("text") or "") if isinstance(block, dict) else str(block)
            for block in content
        ]
        content = "\n".join(chunks)
    content = str(content)

    # Grab the outermost {...} span so prose around the JSON is tolerated.
    match = re.search(r"\{.*\}", content, flags=re.S)
    if not match:
        raise ValueError("LLM did not return JSON")
    return json.loads(match.group(0))
|
||
|
||
|
||
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
    """Render known resume sections as labeled, trimmed snippets.

    Sections are emitted in a fixed order; missing/empty ones are skipped.
    """
    if not sections:
        return ""
    labeled = (
        ("about", "ABOUT"),
        ("skills", "SKILLS"),
        ("experience", "EXPERIENCE"),
        ("education", "EDUCATION"),
        ("contacts", "CONTACTS"),
    )
    blocks: List[str] = []
    for key, label in labeled:
        body = sections.get(key)
        if body:
            blocks.append(f"[{label}]\n{_trim_text(body, max_len=1800)}")
    return "\n\n".join(blocks)
|
||
|
||
|
||
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
    """Drop unknown keys from a draft and normalize it through LLMExtraction.

    Round-tripping through the dataclass coerces every field to its expected
    type and fills defaults, yielding a stable shape for hashing/prompting.
    """
    allowed = {
        "roles",
        "skills",
        "primary_languages",
        "seniority",
        "backend_focus",
        "experience_years_total",
        "experience_years_engineering",
        "english_level",
        "location",
        "remote_ok",
        "salary_min_usd",
        "salary_max_usd",
        "salary_min_rub",
        "salary_max_rub",
        "highlights",
        "keywords",
    }
    source = draft if isinstance(draft, dict) else {}
    filtered = {key: value for key, value in source.items() if key in allowed}
    return asdict(LLMExtraction.from_obj(filtered))
|
||
|
||
|
||
def _as_float(v: Any) -> Optional[float]:
|
||
try:
|
||
x = float(v)
|
||
except Exception:
|
||
return None
|
||
if x < 0:
|
||
return None
|
||
if x > 1.0:
|
||
return 1.0
|
||
return x
|
||
|
||
|
||
def _as_str_list(v: Any) -> List[str]:
|
||
if v is None:
|
||
return []
|
||
if isinstance(v, list):
|
||
return [str(x).strip() for x in v if str(x).strip()]
|
||
s = str(v).strip()
|
||
return [s] if s else []
|
||
|
||
|
||
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
|
||
if con is None:
|
||
return None
|
||
try:
|
||
row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
|
||
if row and row["result_json"]:
|
||
return json.loads(row["result_json"])
|
||
except Exception:
|
||
return None
|
||
return None
|
||
|
||
|
||
def _cache_put_sqlite(
|
||
con: Optional[sqlite3.Connection],
|
||
cache_key: str,
|
||
model: str,
|
||
data: Dict[str, Any],
|
||
) -> None:
|
||
if con is None:
|
||
return
|
||
try:
|
||
con.execute(
|
||
"INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
|
||
(cache_key, model, json.dumps(data, ensure_ascii=False)),
|
||
)
|
||
except Exception:
|
||
return
|