Files
tg_resume_db/agent.py
2026-03-11 15:27:10 +03:00

1185 lines
39 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations

import json
import re
import sqlite3
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple

try:
    import httpx  # type: ignore
except Exception:  # pragma: no cover
    httpx = None  # type: ignore

from tg_resume_db.search import search_with_filters
from tg_resume_db.extract.parse import (
    extract_remote,
    extract_english,
    extract_location_best_effort,
    extract_roles_skills,
    extract_salary,
)
from tg_resume_db.extract.clean import normalize_text
from tg_resume_db.extract.llm import resolve_llm_runtime
from tg_resume_db.normalize import normalize_skill, find_skills_in_text
# --------- Search plan (LLM outputs THIS, not SQL) ----------
@dataclass
class SearchPlan:
    """Structured resume-search filters; the LLM emits this JSON, never SQL."""

    query_text: str = ""  # full-text query (FTS)
    # default_factory keeps the declared List[str] type honest and avoids the
    # None-typed default; __post_init__ still maps an explicit None (passed by
    # older call sites) to an empty list, so behavior is backward-compatible.
    skills_any: List[str] = field(default_factory=list)  # at least one must match
    skills_all: List[str] = field(default_factory=list)  # all must match
    roles_any: List[str] = field(default_factory=list)
    location: Optional[str] = None
    remote: Optional[bool] = None
    english_min: Optional[str] = None  # e.g. A1..C2
    exp_years_min: Optional[float] = None
    salary_min: Optional[int] = None
    salary_max: Optional[int] = None
    limit: int = 20
    sort: str = "rank"  # rank | exp_desc | salary_desc

    def __post_init__(self):
        # Tolerate explicit None for the list fields (legacy callers).
        self.skills_any = self.skills_any or []
        self.skills_all = self.skills_all or []
        self.roles_any = self.roles_any or []
# Whitelist of keys accepted from LLM output when building a SearchPlan;
# anything else (e.g. a stray "user_prompt") is dropped by _sanitize_plan_dict.
_ALLOWED_PLAN_KEYS = {
    "query_text",
    "skills_any",
    "skills_all",
    "roles_any",
    "location",
    "remote",
    "english_min",
    "exp_years_min",
    "salary_min",
    "salary_max",
    "limit",
    "sort",
}
# --------- Text helpers ----------
# Ordinal ranking of CEFR English levels, used for ">= required level" checks.
_EN_ORDER = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
def _norm_token(s: str) -> str:
s = (s or "").strip().lower()
s = re.sub(r"\s+", " ", s)
return s
def _uniq_keep_order(xs: List[str]) -> List[str]:
    """Normalize tokens and drop duplicates/empties, preserving first-seen order."""
    result: List[str] = []
    seen: set = set()
    for raw in xs or []:
        token = _norm_token(str(raw))
        if token and token not in seen:
            seen.add(token)
            result.append(token)
    return result
def _filter_skills_vs_location(skills: List[str], location: Optional[str]) -> List[str]:
    """Drop "skills" that are actually city names or work-mode words (LLM noise)."""
    if not skills:
        return []
    blocked = {
        "москва", "санкт-петербург", "спб", "питер", "екатеринбург", "минск", "алматы",
        "remote", "удаленно", "удалённо", "удаленка", "удалёнка", "гибрид", "hybrid",
        "офис", "office", "onsite", "on-site",
    }
    if location:
        blocked.add(_norm_token(location))
    return [skill for skill in skills if _norm_token(skill) not in blocked]
# ---- Name-list detection (avoid over-filtering queries that are just a list of full names) ----
# Two or more capitalized words in a row (Cyrillic or Latin), e.g. "Ivanov Ivan".
_NAME_RE = re.compile(r"\b[А-ЯЁA-Z][а-яёa-z]+(?:[-\s]+[А-ЯЁA-Z][а-яёa-z]+)+\b")
def _looks_like_name_list(user_prompt: str) -> bool:
    """
    Heuristic: if the query contains several full names (or several lines each
    holding one), treat it as a direct lookup by name and skip hard filtering
    by stack/experience.
    """
    if not user_prompt:
        return False
    full_names = _NAME_RE.findall(user_prompt)
    if len(full_names) >= 3:
        return True
    # Count non-empty lines that contain at least one full name.
    lines_with_name = 0
    for line in user_prompt.splitlines():
        stripped = line.strip()
        if stripped and _NAME_RE.search(stripped):
            lines_with_name += 1
    return lines_with_name >= 2 and len(full_names) >= 2
# ---- Work mode: hybrid must NOT force remote=true ----
_HYBRID_RE = re.compile(r"\b(гибрид|hybrid)\b", re.I)
_REMOTE_RE = re.compile(r"\b(remote|удал(ен|ён|енно|ённо)?|удаленк|удалёнк|дистанц)\b", re.I)
_OFFICE_RE = re.compile(r"\b(офис|office|on[-\s]?site|onsite|в офисе|на месте)\b", re.I)
def _apply_work_mode_overrides(user_prompt: str, plan: SearchPlan) -> None:
    """
    Force plan.remote from the query text (in priority order):
      - "hybrid"        => remote = None  (do not filter)
      - "office/onsite" => remote = False
      - "remote"        => remote = True
    """
    text = (user_prompt or "").lower()
    if _HYBRID_RE.search(text):
        plan.remote = None
    elif _OFFICE_RE.search(text):
        plan.remote = False
    elif _REMOTE_RE.search(text):
        plan.remote = True
def _simplify_query_text(user_prompt: str, skills_any: List[str]) -> str:
"""
FTS-поиск может ухудшаться, если query_text перегружен.
Если в запросе явно стек (3+ технологий) — оставим краткий search intent.
"""
up = (user_prompt or "").strip()
if len(skills_any) >= 3:
# максимально безопасно и универсально
if re.search(r"\bbackend\b", up, re.I) or "бэкенд" in up.lower():
return "backend developer"
return "developer"
return up
# --------- sanitize helpers ----------
def _as_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x) for x in v if str(x).strip()]
s = str(v).strip()
if not s:
return []
return [x.strip() for x in s.split(",") if x.strip()]
def _to_bool(v: Any) -> Optional[bool]:
if v is None:
return None
if isinstance(v, bool):
return v
s = str(v).strip().lower()
if s in ("true", "1", "yes", "y", "да", "д"):
return True
if s in ("false", "0", "no", "n", "нет", "н"):
return False
return None
def _to_int(v: Any) -> Optional[int]:
if v is None:
return None
try:
return int(float(v))
except Exception:
return None
def _to_float(v: Any) -> Optional[float]:
if v is None:
return None
try:
return float(v)
except Exception:
return None
def _sanitize_plan_dict(obj: Any) -> Dict[str, Any]:
    """
    Drop unknown keys (e.g. user_prompt) and coerce value types.
    Fixes: SearchPlan.__init__() got an unexpected keyword argument ...
    """
    if not isinstance(obj, dict):
        return {}
    clean: Dict[str, Any] = {k: v for k, v in obj.items() if k in _ALLOWED_PLAN_KEYS}
    for key in ("skills_any", "skills_all", "roles_any"):
        if key in clean:
            clean[key] = _as_list(clean[key])
    if "remote" in clean:
        clean["remote"] = _to_bool(clean["remote"])
    for key in ("salary_min", "salary_max"):
        if key in clean:
            clean[key] = _to_int(clean[key])
    if "exp_years_min" in clean:
        clean["exp_years_min"] = _to_float(clean["exp_years_min"])
    if "limit" in clean:
        parsed = _to_int(clean["limit"])
        clean["limit"] = 20 if parsed is None else parsed
    if "sort" in clean:
        clean["sort"] = str(clean["sort"] or "").strip()
    if clean.get("location") is not None:
        loc = str(clean["location"]).strip()
        clean["location"] = loc or None
    if clean.get("english_min") is not None:
        level = str(clean["english_min"]).strip().upper()
        clean["english_min"] = level or None
    if clean.get("query_text") is not None:
        clean["query_text"] = str(clean["query_text"]).strip()
    return clean
# --------- heuristic plan ----------
def _heuristic_plan(user_prompt: str) -> SearchPlan:
    """Build a SearchPlan without the LLM, using regex/dictionary extractors."""
    # If the query looks like a list of full names, search by text with no extra filters.
    if _looks_like_name_list(user_prompt):
        return SearchPlan(
            query_text=user_prompt.strip(),
            skills_any=[],
            skills_all=[],
            roles_any=[],
            location=None,
            remote=None,
            english_min=None,
            exp_years_min=None,
            salary_min=None,
            salary_max=None,
            limit=20,
            sort="rank",
        )
    text = normalize_text(user_prompt)
    roles, skills = extract_roles_skills(text)
    location = extract_location_best_effort(text)
    remote = extract_remote(text)
    english = extract_english(text)
    sal_min, sal_max, sal_conf, _ = extract_salary(text)
    # City names / work-mode words sometimes leak into the skill list — strip them.
    skills = _filter_skills_vs_location(skills, location)
    roles = _uniq_keep_order(roles)
    skills = _uniq_keep_order(skills)
    plan = SearchPlan(
        query_text=_simplify_query_text(user_prompt, skills),
        skills_any=skills[:12],
        roles_any=(["backend"] if ("backend" in roles or "backend" in user_prompt.lower()) else roles[:6]),
        location=location,
        remote=remote,
        english_min=english,
        # Salary bounds are kept only when the extractor is reasonably confident.
        salary_min=sal_min if sal_conf and sal_conf >= 0.4 else None,
        salary_max=sal_max if sal_conf and sal_conf >= 0.4 else None,
        limit=20,
        sort="rank",
    )
    _apply_work_mode_overrides(user_prompt, plan)
    return plan
# --------- Optional LLM (OpenAI-compatible base_url) ----------
def _llm_enabled() -> bool:
    """True when httpx is importable and the LLM runtime has base_url + model."""
    if httpx is None:
        return False
    runtime = resolve_llm_runtime()
    return bool(runtime.get("base_url") and runtime.get("model"))
def _llm_call_json(messages: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    POST a chat-completions request to the configured OpenAI-compatible
    endpoint and parse the first JSON object found in the reply content.

    Raises RuntimeError when httpx/runtime are unavailable, ValueError when
    the reply carries no JSON object; httpx transport errors and
    json.JSONDecodeError propagate to the caller.
    """
    if httpx is None:
        raise RuntimeError("httpx is not installed")
    runtime = resolve_llm_runtime()
    base_url = runtime.get("base_url", "").rstrip("/")
    model = runtime.get("model", "")
    api_key = runtime.get("api_key", "")
    if not base_url or not model:
        raise RuntimeError("LLM runtime is not configured")
    # Low temperature: we want deterministic, schema-shaped JSON.
    payload = {"model": model, "messages": messages, "temperature": 0.2}
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    with httpx.Client(timeout=30.0) as client:
        r = client.post(f"{base_url}/chat/completions", headers=headers, json=payload)
        r.raise_for_status()
        data = r.json()
    content = data["choices"][0]["message"]["content"]
    # Greedy grab from the first "{" to the last "}" — tolerates prose around the JSON.
    m = re.search(r"\{.*\}", content, flags=re.S)
    if not m:
        raise ValueError("LLM did not return JSON")
    return json.loads(m.group(0))
def _llm_build_plan(user_prompt: str, draft: SearchPlan) -> SearchPlan:
    """
    Ask the LLM to convert the recruiter query into a SearchPlan.

    The heuristic draft is passed as a baseline; the LLM answer is sanitized
    (unknown keys dropped, types coerced) and merged over the draft. Fields
    the LLM emptied out fall back to the draft values. Transport/parse errors
    propagate (callers guard with _llm_enabled()).
    """
    schema_hint = {
        "query_text": "string",
        "skills_any": ["string"],
        "skills_all": ["string"],
        "roles_any": ["string"],
        "location": "string|null",
        "remote": "bool|null",
        "english_min": "A1|A2|B1|B2|C1|C2|null",
        "exp_years_min": "number|null",
        "salary_min": "int|null",
        "salary_max": "int|null",
        "limit": "int",
        "sort": "rank|exp_desc|salary_desc",
    }
    msgs = [
        {
            "role": "system",
            "content": (
                "Ты превращаешь запрос рекрутера в JSON-фильтры поиска по базе резюме.\n"
                "НЕЛЬЗЯ писать SQL. Верни ТОЛЬКО JSON объекта SearchPlan.\n"
                f"Schema: {json.dumps(schema_hint, ensure_ascii=False)}\n"
                "ВАЖНО:\n"
                "- Никаких лишних ключей - только поля Schema.\n"
                "- Не добавляй в skills города/локации.\n"
                "- 'гибрид' НЕ означает remote=true (если видишь 'гибрид' - remote=null).\n"
                "- Старайся делать поиск широким: skills_all используй ТОЛЬКО если явно попросили обязательные навыки.\n"
                "- Если в запросе есть указание уровня английского (например B2+), заполни english_min.\n"
                "- Если явно указан опыт 'N+' лет - поставь exp_years_min=N.\n"
            ),
        },
        {
            "role": "user",
            "content": (
                f"Запрос: {user_prompt}\n\n"
                f"Черновик (эвристика): {json.dumps(asdict(draft), ensure_ascii=False)}"
            ),
        },
    ]
    obj_raw = _llm_call_json(msgs)
    obj = _sanitize_plan_dict(obj_raw)
    plan = SearchPlan(**{**asdict(draft), **obj})
    plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location))
    plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location))
    plan.roles_any = _uniq_keep_order(plan.roles_any)
    # Gently improve query_text.
    plan.query_text = _simplify_query_text(user_prompt, plan.skills_any)
    plan.limit = max(5, min(int(plan.limit or 20), 50))
    if plan.sort not in ("rank", "exp_desc", "salary_desc"):
        plan.sort = "rank"
    # Fallback: if the LLM zeroed out important fields, restore the heuristic values.
    if not plan.skills_any:
        plan.skills_any = draft.skills_any
    if not plan.skills_all:
        plan.skills_all = draft.skills_all
    if plan.english_min is None and draft.english_min is not None:
        plan.english_min = draft.english_min
    if plan.exp_years_min is None:
        try:
            req_exp = _extract_required_exp_years(user_prompt)
            if req_exp is not None:
                plan.exp_years_min = req_exp
        except Exception:
            pass
    _apply_work_mode_overrides(user_prompt, plan)
    return plan
# --------- post processing: dedupe + "real fit" filter ----------
# Languages considered "core" backend stack; C++ only contributes a bonus.
_CORE = {"java", "kotlin", "python", "go", "golang"}
_BONUS = {"c++", "cpp"}
# Canonical language name -> accepted spelling variants.
_LANG_VARIANTS = {
    "java": {"java"},
    "kotlin": {"kotlin"},
    "python": {"python"},
    "go": {"go", "golang"},
    "c++": {"c++", "cpp", "c plus plus"},
    "c#": {"c#", "csharp"},
}
# Canonical skill -> alias spellings used for evidence search in resume text.
_SKILL_EVIDENCE_ALIASES = {
    "go": {"go", "golang"},
    "golang": {"go", "golang"},
    "kubernetes": {"kubernetes", "k8s"},
    "postgresql": {"postgresql", "postgres", "postgre sql", "postgre-sql", "psql"},
    "javascript": {"javascript", "java script", "js"},
    "typescript": {"typescript", "type script", "ts"},
    "nodejs": {"nodejs", "node js", "node.js", "node"},
    "grpc": {"grpc", "g rpc"},
    "graphql": {"graphql", "graph ql"},
    "ci/cd": {"ci/cd", "ci cd", "cicd"},
    "c++": {"c++", "cpp", "c plus plus"},
    "c#": {"c#", "csharp", "c sharp"},
    "dotnet": {"dotnet", ".net"},
    "aws": {"aws", "amazon web services"},
    "gcp": {"gcp", "google cloud", "google cloud platform"},
    "redis": {"redis"},
    "kafka": {"kafka"},
    "docker": {"docker"},
}
# Role/seniority words too generic to be treated as required skills.
_GENERIC_SKIP_SKILLS = {
    "backend",
    "frontend",
    "fullstack",
    "developer",
    "engineer",
    "senior",
    "middle",
    "junior",
    "lead",
}
# Domain -> marker substrings, matched with plain `in` against lower-cased text.
# NOTE(review): short stems ("бан", "крипт", "мед", "игр") act as prefix
# matches and can hit unrelated words — confirm this noise is acceptable.
_DOMAIN_VARIANTS = {
    "fintech": {
        "fintech",
        "финтех",
        "bank",
        "banking",
        "бан",
        "payment",
        "payments",
        "card",
        "cards",
        "sber",
        "тбанк",
        "tinkoff",
        "visa",
        "mastercard",
        "trading",
        "exchange",
        "crypto",
        "крипт",
        "биржа",
    },
    "ecommerce": {
        "ecommerce",
        "e-commerce",
        "marketplace",
        "retail",
        "checkout",
        "cart",
        "онлайн магазин",
    },
    "gamedev": {"gamedev", "game dev", "gaming", "unity", "unreal", "игр"},
    "healthcare": {"healthcare", "medtech", "hospital", "clinic", "мед", "health tech"},
}
def _token_in_text(text: str, token: str) -> bool:
if not text or not token:
return False
pat = r"(?<![a-z0-9+#])" + re.escape(token) + r"(?![a-z0-9+#])"
return re.search(pat, text, re.I) is not None
def _lang_in_text(text: str, canon_lang: str) -> bool:
    """True if any spelling variant of canon_lang occurs in text as a whole token."""
    spellings = _LANG_VARIANTS.get(canon_lang, {canon_lang})
    return any(_token_in_text(text, alias) for alias in spellings)
def _skill_aliases(skill: str) -> List[str]:
    """Collect normalized alias spellings for a skill (canonical + evidence + language variants)."""
    canon = normalize_skill(skill) or _norm_token(skill)
    if not canon:
        return []
    candidates = {canon, _norm_token(skill)}
    candidates.update(_SKILL_EVIDENCE_ALIASES.get(canon, set()))
    if canon in _LANG_VARIANTS:
        candidates.update(_LANG_VARIANTS.get(canon, set()))
    normalized = [_norm_token(c) for c in candidates]
    return _uniq_keep_order([t for t in normalized if t])
def _extract_required_skills(user_prompt: str, plan: Optional[SearchPlan], req_langs: Set[str]) -> List[str]:
    """
    Union the plan's skills, skills detected in the prompt and the required
    languages; canonicalize, drop generic role words, dedupe, cap at 10.
    """
    raw: List[str] = []
    if plan:
        raw.extend(plan.skills_all or [])
        raw.extend(plan.skills_any or [])
    raw.extend(find_skills_in_text(user_prompt or ""))
    raw.extend(list(req_langs or set()))
    seen: Set[str] = set()
    out: List[str] = []
    for item in raw:
        canon = normalize_skill(item) or _norm_token(item)
        if not canon:
            continue
        canon = _norm_token(canon)
        if canon in _GENERIC_SKIP_SKILLS or canon in seen:
            continue
        seen.add(canon)
        out.append(canon)
    return out[:10]
def _query_stack_is_strict(user_prompt: str) -> bool:
t = (user_prompt or "").lower()
if any(w in t for w in ("обязательно", "строго", "must", "required", "mandatory", "без этого")):
return True
if "," in t and " или " not in t and " or " not in t:
return True
return False
def _extract_required_domains(user_prompt: str) -> List[str]:
    """Canonical domains whose marker substrings occur in the query (dict order)."""
    lowered = (user_prompt or "").lower()
    return [
        canon
        for canon, variants in _DOMAIN_VARIANTS.items()
        if any(marker in lowered for marker in variants)
    ]
def _domain_hit(text: str, domain: str) -> bool:
    """Substring check: does the text mention any marker of the given domain?"""
    lowered = (text or "").lower()
    return any(marker in lowered for marker in _DOMAIN_VARIANTS.get(domain, set()))
def _load_resume_contexts(
    con: sqlite3.Connection,
    items: List[Dict[str, Any]],
) -> Dict[str, Dict[str, str]]:
    """
    Fetch per-resume text contexts for evidence matching.

    Returns resume_id -> {"skills_text", "body_text", "clean_text"} (all
    lower-cased). body_text is assembled from narrative sections and positions
    so that "skill only listed, never used" can be detected. Best-effort:
    returns {} on any DB error, and tolerates malformed JSON columns.
    NOTE(review): assumes con.row_factory yields mapping-style rows
    (r["resume_id"]) — confirm at the call site.
    """
    resume_ids = []
    seen = set()
    for it in items or []:
        rid = str(it.get("resume_id") or "").strip()
        if not rid or rid in seen:
            continue
        seen.add(rid)
        resume_ids.append(rid)
    if not resume_ids:
        return {}
    # Parameterized IN (...) — one placeholder per id.
    ph = ",".join("?" for _ in resume_ids)
    sql = (
        f"SELECT resume_id, clean_text, sections_json, extraction_json "
        f"FROM resumes WHERE resume_id IN ({ph})"
    )
    try:
        rows = con.execute(sql, resume_ids).fetchall()
    except Exception:
        return {}
    out: Dict[str, Dict[str, str]] = {}
    for r in rows:
        rid = str(r["resume_id"])
        clean = str(r["clean_text"] or "")
        sections: Dict[str, Any] = {}
        try:
            raw = json.loads(r["sections_json"] or "{}")
            if isinstance(raw, dict):
                sections = raw
        except Exception:
            sections = {}
        extraction: Dict[str, Any] = {}
        try:
            raw = json.loads(r["extraction_json"] or "{}")
            if isinstance(raw, dict):
                extraction = raw
        except Exception:
            extraction = {}
        skills_text = str(sections.get("skills") or "")
        body_parts: List[str] = []
        for key in ("about", "summary", "experience", "projects", "work"):
            val = sections.get(key)
            if val:
                body_parts.append(str(val))
        for p in extraction.get("positions") or []:
            if not isinstance(p, dict):
                continue
            body_parts.append(str(p.get("title") or ""))
            body_parts.append(str(p.get("company") or ""))
            body_parts.append(str(p.get("description") or ""))
        body_text = "\n".join(body_parts).strip()
        # fallback for badly split templates: use the full clean text, minus
        # the skills section so skills-only mentions stay distinguishable
        if len(body_text) < 80:
            body_text = clean
            if skills_text:
                body_text = body_text.replace(skills_text, " ")
        out[rid] = {
            "skills_text": skills_text.lower(),
            "body_text": body_text.lower(),
            "clean_text": clean.lower(),
        }
    return out
def _normalize_lang_token(token: str) -> Optional[str]:
    """Map a spelling variant to its canonical language name, or None."""
    normalized = _norm_token(token)
    if not normalized:
        return None
    for canonical, spellings in _LANG_VARIANTS.items():
        if normalized == canonical or normalized in spellings:
            return canonical
    return None
def _extract_required_languages(user_prompt: str) -> List[str]:
    """Canonical languages mentioned in the query as whole tokens, in dict order."""
    lowered = (user_prompt or "").lower()
    found: List[str] = []
    for canonical, spellings in _LANG_VARIANTS.items():
        if canonical not in found and any(_token_in_text(lowered, alias) for alias in spellings):
            found.append(canonical)
    return found
def _dedupe_by_candidate_best_rank(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
best: Dict[str, Dict[str, Any]] = {}
for it in items or []:
cid = it.get("candidate_id") or ""
if not cid:
continue
if cid not in best:
best[cid] = it
continue
# rank: у тебя чем меньше (более отрицательный), тем "выше"
r_new = it.get("rank")
r_old = best[cid].get("rank")
try:
if r_old is None or (r_new is not None and float(r_new) < float(r_old)):
best[cid] = it
except Exception:
pass
return list(best.values())
def _needs_postfilter(user_prompt: str) -> bool:
    """
    Enable the strict "vacancy-like" postfilter when the query resembles a job
    posting: "опыт от N лет" / "5+", or an explicit stack of technologies.
    Name-list queries are exempt.
    """
    if _looks_like_name_list(user_prompt):
        return False
    lowered = (user_prompt or "").lower()
    if re.search(r"(опыт|experience).{0,20}(\d+)\s*\+|\b(\d+)\s*\+\s*лет", lowered):
        return True
    skill_hits = len(find_skills_in_text(lowered))
    if skill_hits >= 2:
        return True
    if _extract_required_domains(user_prompt) and skill_hits >= 1:
        return True
    # stack words fallback
    stack_words = ("java", "kotlin", "python", "go", "golang", "c++", "cpp")
    return sum(1 for word in stack_words if word in lowered) >= 2
# Location substrings used when the query asks to exclude certain regions.
# NOTE(review): matched with plain `in`, so very short markers ("rf") can hit
# inside unrelated words — confirm this noise is acceptable.
_EXCLUDE_LOC_MARKERS = {
    "россия",
    "russia",
    "rf",
    "russian federation",
    "moscow",
    "москва",
    "москв",
    "spb",
    "petersburg",
    "петербург",
    "санкт",
    "мск",
    "belarus",
    "беларусь",
    "белоруссия",
    "iran",
    "ирак",
    "iraq",
    "пакистан",
    "pakistan",
    "india",
    "индия",
    "африк",
}
def _location_exclusion_requested(user_prompt: str) -> bool:
    """True when the query mentions an excludable region together with negation wording."""
    lowered = (user_prompt or "").lower()
    has_marker = any(marker in lowered for marker in _EXCLUDE_LOC_MARKERS)
    has_negation = "кроме" in lowered or "except" in lowered or "не " in lowered
    return has_marker and has_negation
def _extract_required_exp_years(user_prompt: str) -> Optional[float]:
t = (user_prompt or "").lower()
m = re.search(r"(опыт|experience).{0,20}(\d+(?:[.,]\d+)?)\s*(?:лет|years?)", t)
if m:
try:
return float(m.group(2).replace(",", "."))
except Exception:
return None
m = re.search(r"\b(\d+(?:[.,]\d+)?)\s*\+\s*(?:лет|years?)\b", t)
if m:
try:
return float(m.group(1).replace(",", "."))
except Exception:
return None
return None
def _extract_required_english(user_prompt: str) -> Optional[str]:
t = (user_prompt or "").upper()
m = re.search(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", t)
if m:
return m.group(1).replace("+", "")
# textual
if "FLUENT" in t or "ADVANCED" in t or "PROFICIENT" in t:
return "C1"
if "UPPER" in t and "INTERMEDIATE" in t:
return "B2"
if "INTERMEDIATE" in t:
return "B1"
return None
def _jobfit_filter_items(
    con: sqlite3.Connection,
    user_prompt: str,
    items: List[Dict[str, Any]],
    plan: Optional[SearchPlan] = None,
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Strict "vacancy-grade" post-filtering of search hits:
      - experience >= required (when the query states it)
      - stack matches (at least one required/core language)
      - must-have skills need evidence in the resume body, not only the skills list
      - English >= required level
      - backend focus is not mandatory but boosts the sort order

    Returns (filtered_items, debug_info); debug_info counts drop reasons.
    Fixes vs. previous revision: two bare `except:` clauses narrowed to
    (TypeError, ValueError) so SystemExit/KeyboardInterrupt are no longer
    swallowed; duplicate "highload" literal removed from the snippet keywords.
    """
    req_exp = _extract_required_exp_years(user_prompt)  # e.g. 5.0
    req_langs = set(_extract_required_languages(user_prompt))
    req_english = _extract_required_english(user_prompt)
    req_skills = _extract_required_skills(user_prompt, plan, req_langs)
    req_domains = _extract_required_domains(user_prompt)
    strict_stack = _query_stack_is_strict(user_prompt) or (req_exp is not None)
    must_have_skills = _uniq_keep_order([normalize_skill(s) or s for s in ((plan.skills_all or []) if plan else [])])
    if not must_have_skills and strict_stack and req_skills:
        # Short vacancy stacks: treat every element as mandatory (cap at 4).
        if len(req_skills) <= 4:
            must_have_skills = req_skills
        else:
            must_have_skills = req_skills[:4]
    filtered: List[Dict[str, Any]] = []
    dropped = 0
    reasons: Dict[str, int] = {}
    exclude_ru = _location_exclusion_requested(user_prompt)
    # If Go is explicitly required and no year count is given, default to 4 years.
    if req_exp is None and ("go" in req_langs or "golang" in req_langs):
        req_exp = 4.0
    resume_ctx = _load_resume_contexts(con, items)
    for it in items or []:
        roles = set((it.get("roles") or []))
        skills = set(_norm_token(s) for s in (it.get("skills") or []))
        for pl in it.get("primary_languages") or []:
            skills.add(_norm_token(pl))
        # Prefer engineering-only experience years when present, else total years.
        exp_eng = it.get("experience_years_eng")
        exp_total = it.get("experience_years")
        exp_val = None
        if exp_eng is not None:
            try:
                exp_val = float(exp_eng)
            except (TypeError, ValueError):
                pass
        if exp_val is None and exp_total is not None:
            try:
                exp_val = float(exp_total)
            except (TypeError, ValueError):
                pass
        if req_exp is not None and (exp_val is None or exp_val < req_exp):
            dropped += 1
            reasons["exp_lt_required"] = reasons.get("exp_lt_required", 0) + 1
            continue
        backend_focus_flag = it.get("backend_focus")
        loc = (it.get("location") or "").lower()
        if exclude_ru and any(bad in loc for bad in _EXCLUDE_LOC_MARKERS):
            dropped += 1
            reasons["location_excluded"] = reasons.get("location_excluded", 0) + 1
            continue
        # Canonical languages: primary_languages first, skills as fallback.
        lang_tokens: Set[str] = set()
        for lang in (it.get("primary_languages") or []):
            norm = _normalize_lang_token(lang)
            if norm:
                lang_tokens.add(norm)
        if not lang_tokens:
            for sk in skills:
                norm = _normalize_lang_token(sk)
                if norm:
                    lang_tokens.add(norm)
        # Basic language-stack check: every major required language must be present.
        missing_primary_lang = False
        for req_lang in req_langs:
            if req_lang not in lang_tokens and req_lang in ("go", "python", "java", "kotlin", "c++", "c#"):
                missing_primary_lang = True
                break
        if missing_primary_lang:
            dropped += 1
            reasons["no_primary_required_lang"] = reasons.get("no_primary_required_lang", 0) + 1
            continue
        rid = str(it.get("resume_id") or "")
        ctx = resume_ctx.get(rid) or {}
        ctx_body = str(ctx.get("body_text") or "")
        ctx_skills = str(ctx.get("skills_text") or "")
        ctx_clean = str(ctx.get("clean_text") or "")
        ctx_domain = "\n".join([ctx_body, ctx_clean, str(it.get("snippet") or "").lower()])
        # Evidence-based skill validation (not Go-specific): must-have skills
        # should not appear only in the "skills" section.
        skill_hits_total = 0
        skill_hits_body = 0
        missing_must = 0
        skills_only_must = 0
        skills_only_critical = 0
        for req_skill in req_skills:
            aliases = _skill_aliases(req_skill)
            if not aliases:
                continue
            hit_body = any(_token_in_text(ctx_body, a) for a in aliases)
            hit_skills = any(_token_in_text(ctx_skills, a) for a in aliases)
            hit_any = hit_body or hit_skills or any(_norm_token(req_skill) == _norm_token(s) for s in skills)
            if hit_any:
                skill_hits_total += 1
                if hit_body:
                    skill_hits_body += 1
            if req_skill in must_have_skills:
                if not hit_any:
                    missing_must += 1
                elif not hit_body and hit_skills:
                    skills_only_must += 1
                    if _normalize_lang_token(req_skill) is not None:
                        skills_only_critical += 1
        if missing_must > 0:
            dropped += 1
            reasons["required_skill_missing"] = reasons.get("required_skill_missing", 0) + 1
            continue
        # Hard cut when a key language requirement is evidenced only in the
        # skill list, or the whole must-have stack lacks body evidence.
        if strict_stack and (skills_only_critical > 0 or (must_have_skills and skills_only_must >= len(must_have_skills))):
            dropped += 1
            reasons["required_skill_only_in_skills"] = reasons.get("required_skill_only_in_skills", 0) + 1
            continue
        if req_skills and strict_stack:
            min_hits = len(must_have_skills) if must_have_skills else (2 if len(req_skills) >= 2 else 1)
            if skill_hits_total < min_hits:
                dropped += 1
                reasons["required_skills_weak"] = reasons.get("required_skills_weak", 0) + 1
                continue
        domain_hits = 0
        for d in req_domains:
            if _domain_hit(ctx_domain, d):
                domain_hits += 1
        if req_domains and strict_stack and domain_hits < len(req_domains):
            dropped += 1
            reasons["domain_mismatch"] = reasons.get("domain_mismatch", 0) + 1
            continue
        if req_langs:
            lang_hits_req = len(lang_tokens & req_langs)
            if lang_hits_req < 1:
                dropped += 1
                reasons["lang_stack_weak"] = reasons.get("lang_stack_weak", 0) + 1
                continue
        else:
            lang_hits_req = None
        core_hits = len(lang_tokens & _CORE)
        bonus_hits = len(lang_tokens & _BONUS)
        # Require at least one CORE/bonus language.
        if core_hits + bonus_hits < 1:
            dropped += 1
            reasons["stack_too_weak"] = reasons.get("stack_too_weak", 0) + 1
            continue
        it2 = dict(it)
        it2["_fit"] = {
            "core_hits": core_hits,
            "bonus_cpp": bool(bonus_hits),
            "req_lang_hits": lang_hits_req,
            "req_skill_hits": skill_hits_total,
            "req_skill_hits_body": skill_hits_body,
            "req_domain_hits": domain_hits,
            "backend_role": "backend" in roles,
            "backend_focus": backend_focus_flag,
        }
        if req_english:
            lvl = str(it.get("english_level") or "").upper()
            if not lvl or _EN_ORDER.get(lvl, 0) < _EN_ORDER.get(req_english, 0):
                dropped += 1
                reasons["english_below_required"] = reasons.get("english_below_required", 0) + 1
                continue
        filtered.append(it2)

    # Sort: more core_hits first, then FTS rank (lower / more negative = better).
    def key(x: Dict[str, Any]):
        fit = x.get("_fit") or {}
        core_hits = int(fit.get("core_hits", 0))
        bonus = 1 if fit.get("bonus_cpp") else 0
        backend_bonus = 1 if fit.get("backend_role") or fit.get("backend_focus") else 0
        req_skill_hits = int(fit.get("req_skill_hits", 0))
        req_skill_hits_body = int(fit.get("req_skill_hits_body", 0))
        req_domain_hits = int(fit.get("req_domain_hits", 0))
        r = x.get("rank")
        try:
            r = float(r)
        except Exception:
            r = 0.0
        # Manual scoring on domain signals.
        score = 0.0
        if "go" in (x.get("primary_languages") or []):
            score += 5.0  # Go as the primary language
        try:
            if x.get("experience_years_eng") and float(x.get("experience_years_eng")) >= max(4.0, req_exp or 0):
                score += 3.0
        except Exception:
            pass
        skills = set(_norm_token(s) for s in (x.get("skills") or []))
        text_boost = 0.0
        for kw in ("kubernetes", "k8s"):
            if kw in skills:
                text_boost += 1.5
                break
        for kw in ("ddd", "domain-driven design", "eda", "event-driven"):
            if kw in skills:
                text_boost += 2.0
                break
        for kw in ("fintech", "trading", "crypto", "exchange", "биржа", "финтех"):
            if kw in skills:
                text_boost += 2.5
                break
        snippet = (x.get("snippet") or "").lower()
        for kw in ("highload", "high-load", "high throughput", "high-throughput", "low latency", "low-latency"):
            if kw in snippet:
                text_boost += 1.5
                break
        score += text_boost
        return (-req_domain_hits, -req_skill_hits_body, -req_skill_hits, -core_hits, -backend_bonus, -bonus, -(score), r)

    filtered.sort(key=key)
    dbg = {
        "postfilter_applied": True,
        "required_exp": req_exp,
        "required_languages": sorted(list(req_langs)),
        "required_skills": req_skills,
        "must_have_skills": must_have_skills,
        "required_domains": req_domains,
        "strict_stack": strict_stack,
        "dropped": dropped,
        "reasons": reasons,
    }
    return filtered, dbg
# --------- Refinement loop ----------
def _refine_plan_no_llm(plan: SearchPlan, result_count: int, user_prompt: str) -> SearchPlan:
    """Relax filters without the LLM when the previous search returned nothing."""
    relaxed = SearchPlan(**asdict(plan))
    if result_count != 0:
        return relaxed
    # Zero hits: drop the hard filters but keep the skill list.
    relaxed.location = None
    relaxed.salary_min = None
    relaxed.salary_max = None
    relaxed.english_min = None
    # remote may have been strict — relax it; the work-mode override below may restore it.
    relaxed.remote = None
    # Soften experience gradually instead of clearing it.
    if relaxed.exp_years_min is not None:
        relaxed.exp_years_min = max(0.0, float(relaxed.exp_years_min) - 1.0)
    if not (relaxed.query_text or "").strip():
        relaxed.query_text = " ".join(relaxed.skills_any[:8])
    _apply_work_mode_overrides(user_prompt, relaxed)
    return relaxed
def agent_search(
    con: sqlite3.Connection,
    user_prompt: str,
    max_iters: int = 2,
    limit: int = 20,
) -> Dict[str, Any]:
    """
    End-to-end search agent.

    Builds a SearchPlan (heuristic draft, optionally refined by the LLM),
    executes it, and on zero hits relaxes the plan for up to max_iters extra
    rounds. Results are then deduped per candidate; vacancy-like queries also
    go through the strict job-fit postfilter.

    Returns a dict with: plan, items, count, history (per-iteration plans and
    snippet previews), llm_used, postfilter (debug info).
    """
    draft = _heuristic_plan(user_prompt)
    draft.limit = limit
    names_only_query = _looks_like_name_list(user_prompt)
    # Name-list queries skip the LLM: the raw text search is already exact.
    plan = _llm_build_plan(user_prompt, draft) if (_llm_enabled() and not names_only_query) else draft
    plan.limit = limit
    history: List[Dict[str, Any]] = []
    final_items: List[Dict[str, Any]] = []
    final_count = 0
    for i in range(max_iters + 1):
        _apply_work_mode_overrides(user_prompt, plan)
        res = search_with_filters(con, plan)
        items = res.get("items", [])
        count = int(res.get("count", len(items)))
        history.append(
            {
                "plan": asdict(plan),
                "count": count,
                "top_snippets": [it.get("snippet", "")[:180] for it in items[:5]],
            }
        )
        if count > 0 or i == max_iters:
            final_items = items
            final_count = count
            break
        # refine
        if _llm_enabled():
            msgs = [
                {
                    "role": "system",
                    "content": (
                        "Ты корректируешь JSON SearchPlan. Верни ТОЛЬКО JSON с полями SearchPlan.\n"
                        "Если 0 результатов — ослабь фильтры: remote=null, exp_years_min уменьшить/обнулить, "
                        "location/salary/english убрать. skills_any сохранить.\n"
                        "Никаких лишних ключей. Помни: 'гибрид' НЕ означает remote=true.\n"
                    ),
                },
                {
                    "role": "user",
                    "content": json.dumps(
                        {
                            "query": user_prompt,
                            "previous_plan": asdict(plan),
                            "result_count": count,
                        },
                        ensure_ascii=False,
                    ),
                },
            ]
            obj_raw = _llm_call_json(msgs)
            obj = _sanitize_plan_dict(obj_raw)
            plan = SearchPlan(**{**asdict(plan), **obj})
            plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location))
            plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location))
            plan.roles_any = _uniq_keep_order(plan.roles_any)
            plan.query_text = _simplify_query_text(user_prompt, plan.skills_any)
            plan.limit = limit
            if plan.sort not in ("rank", "exp_desc", "salary_desc"):
                plan.sort = "rank"
            _apply_work_mode_overrides(user_prompt, plan)
        else:
            plan = _refine_plan_no_llm(plan, count, user_prompt)
            plan.limit = limit
    # ---- 1) dedupe ----
    deduped = _dedupe_by_candidate_best_rank(final_items)
    # ---- 2) postfilter for vacancy-like queries ----
    post_dbg: Dict[str, Any] = {"postfilter_applied": False}
    if _needs_postfilter(user_prompt):
        filtered, post_dbg = _jobfit_filter_items(con, user_prompt, deduped, plan=plan)
    else:
        filtered = deduped
    # NOTE: "count" reflects the post-processed list; final_count holds the raw
    # FTS hit count of the last executed plan and is currently unused downstream.
    return {
        "plan": asdict(plan),
        "items": filtered,
        "count": len(filtered),
        "history": history,
        "llm_used": _llm_enabled(),
        "postfilter": post_dbg,
    }