from __future__ import annotations import json import re import sqlite3 from typing import Any, Dict, List, Tuple from tg_resume_db.normalize import normalize_skill, find_skills_in_text # ----------------------------- # Normalization helpers # ----------------------------- def _norm_token(v: str) -> str: return " ".join(str(v).strip().lower().split()) def _as_list(v: Any) -> List[str]: """ Accepts: - None - list - "a,b,c" (csv string) """ if v is None: return [] if isinstance(v, list): return [str(x) for x in v if str(x).strip()] s = str(v).strip() if not s: return [] return [x.strip() for x in s.split(",") if x.strip()] def _uniq_keep_order(xs: List[str]) -> List[str]: seen = set() out: List[str] = [] for x in xs: t = _norm_token(x) if not t or t in seen: continue seen.add(t) out.append(t) return out # ----------------------------- # Pipe-normalized columns filters # skills_norm / roles_norm like: "|python|fastapi|" # ----------------------------- def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]: vals = [_norm_token(x) for x in (values or []) if str(x).strip()] if not vals: return ("1=1", []) parts: List[str] = [] args: List[Any] = [] for v in vals: parts.append(f"instr({field}, ?) > 0") args.append(f"|{v}|") return "(" + " OR ".join(parts) + ")", args def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]: vals = [_norm_token(x) for x in (values or []) if str(x).strip()] if not vals: return ("1=1", []) parts: List[str] = [] args: List[Any] = [] for v in vals: parts.append(f"instr({field}, ?) > 0") args.append(f"|{v}|") return "(" + " AND ".join(parts) + ")", args # ----------------------------- # FTS5 sanitizer (fixes comma/garbage breaking MATCH) # ----------------------------- # allow longer queries (списки имён, длинные промпты) без агрессивного усечения _FTS_MAX_TERMS = 48 def _fts_safe_query(q: str) -> str: """ Turn a free-form recruiter text into a safe FTS5 MATCH expression. We intentionally DO NOT allow raw FTS syntax from user input, because it easily breaks on commas/quotes/etc. Example: "Backend developer, опыт 5+ лет, Java C++ Python" -> "\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\"" """ if not q: return "resume" s = q.strip().lower() # normalize common tokens s = s.replace("c++", "cpp") s = s.replace("c#", "csharp") s = s.replace(".net", "dotnet") # remove punctuation that breaks MATCH s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s) s = re.sub(r"\s+", " ", s).strip() # tokens (latin/cyrillic + digits + a few chars) terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I) terms = terms[:_FTS_MAX_TERMS] if not terms: return "resume" # quote every term => safe; join with OR => broad query return " OR ".join([f"\"{t}\"" for t in terms]) def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]: """ Extract +must and -exclude skills from query; return (must, exclude, cleaned_query). """ if not q: return [], [], "" must_raw = re.findall(r"\+([A-Za-z0-9#.+-]{2,})", q) excl_raw = re.findall(r"\-([A-Za-z0-9#.+-]{2,})", q) must = [] exclude = [] for t in must_raw: canon = normalize_skill(t) if canon: must.append(canon) for t in excl_raw: canon = normalize_skill(t) if canon: exclude.append(canon) if " and " in q.lower() or " & " in q: must += find_skills_in_text(q) cleaned = re.sub(r"[+-][A-Za-z0-9#.+-]{2,}", " ", q) cleaned = re.sub(r"\s+", " ", cleaned).strip() return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned # ----------------------------- # Contacts # ----------------------------- def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]: rows = con.execute( "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?", (candidate_id,), ).fetchall() m: Dict[str, List[str]] = {} for r in rows: m.setdefault(r["contact_type"], []).append(r["contact_value"]) # чуть чище: уберём дубль-контакты for k, vals in list(m.items()): m[k] = _uniq_keep_order(vals) return m # ----------------------------- # Main search (FTS + filters) # ----------------------------- def search( con: sqlite3.Connection, query: str, filters: Dict[str, Any], limit: int = 20, offset: int = 0, ) -> List[Dict[str, Any]]: """ Search candidates using: - FTS5 for ranking/snippet - stack filters for skills/roles via pipe-normalized columns - basic filters: remote/location/experience/salary/english """ where: List[str] = ["r.is_active = 1"] params: List[Any] = [] must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "") # -------- basic filters -------- if filters.get("remote") is not None: where.append("c.remote = ?") params.append(1 if bool(filters["remote"]) else 0) if filters.get("location"): where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?") params.append("%" + str(filters["location"]).lower() + "%") # Используем experience_years для SQL-фильтрации (широкий поиск), # а строгая проверка experience_years_eng будет на этапе пост-фильтрации в agent.py if filters.get("experience_min") is not None: where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?") params.append(float(filters["experience_min"])) # Salary: "unknown salary doesn't exclude" if filters.get("salary_min") is not None: where.append("(c.salary_max IS NULL OR c.salary_max >= ?)") params.append(int(filters["salary_min"])) if filters.get("salary_max") is not None: where.append("(c.salary_min IS NULL OR c.salary_min <= ?)") params.append(int(filters["salary_max"])) if filters.get("doc_type"): where.append("r.doc_type = ?") params.append(str(filters["doc_type"])) # English: не фильтруем на уровне SQL (иначе B2 не поймает C1/C2); постфильтр в agent.py # -------- roles/skills stack filters -------- # backward compatibility skills_any: List[str] = [] skills_all: List[str] = [] roles_any: List[str] = [] if filters.get("skill"): skills_any.append(str(filters["skill"])) if filters.get("role"): roles_any.append(str(filters["role"])) skills_any += _as_list(filters.get("skills_any")) skills_all += _as_list(filters.get("skills_all")) roles_any += _as_list(filters.get("roles_any")) skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any]) skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all]) roles_any = _uniq_keep_order(roles_any) if must_skills: skills_all = _uniq_keep_order(skills_all + must_skills) # Denis rule: if any skills were provided -> enforce ANY match if skills_any: clause, args = _pipe_any_clause("c.skills_norm", skills_any) where.append(clause) params.extend(args) if skills_all: clause, args = _pipe_all_clause("c.skills_norm", skills_all) where.append(clause) params.extend(args) if roles_any: clause, args = _pipe_any_clause("c.roles_norm", roles_any) where.append(clause) params.extend(args) if exclude_skills: for sk in exclude_skills: where.append("instr(c.skills_norm, ?) = 0") params.append(f"|{sk}|") # -------- FTS query (SAFE) -------- fts_q = _fts_safe_query(cleaned_query or "") limit = max(1, min(int(limit or 20), 100)) offset = max(0, int(offset or 0)) # UPDATED SQL: Added experience_years_eng and language/backend metadata sql = f""" SELECT c.candidate_id, c.name, c.location, c.remote, c.experience_years, c.experience_years_eng, c.experience_confidence, c.salary_min, c.salary_max, c.salary_confidence, c.english_level, c.roles_json, c.skills_json, c.primary_languages_json, c.backend_focus, r.doc_type, r.doc_type_confidence, r.parse_method, r.resume_id, snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet, bm25(resumes_fts) AS rank FROM resumes_fts JOIN resumes r ON r.resume_id = resumes_fts.resume_id JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id WHERE resumes_fts MATCH ? AND {" AND ".join(where)} ORDER BY rank LIMIT ? OFFSET ? """ rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall() out: List[Dict[str, Any]] = [] for row in rows: cand_id = row["candidate_id"] contacts_map = _fetch_contacts_map(con, cand_id) out.append( { "candidate_id": cand_id, "name": row["name"], "location": row["location"], "remote": bool(row["remote"]) if row["remote"] is not None else None, "experience_years": row["experience_years"], "experience_years_eng": row["experience_years_eng"], # Passed to agent "experience_confidence": row["experience_confidence"], "salary_min": row["salary_min"], "salary_max": row["salary_max"], "salary_confidence": row["salary_confidence"], "english_level": row["english_level"], "roles": json.loads(row["roles_json"] or "[]"), "skills": json.loads(row["skills_json"] or "[]"), "primary_languages": json.loads(row["primary_languages_json"] or "[]"), "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None), "doc_type": row["doc_type"], "doc_type_confidence": row["doc_type_confidence"], "parse_method": row["parse_method"], "contacts": contacts_map, "resume_id": row["resume_id"], "snippet": row["snippet"], "rank": row["rank"], } ) return out # ----------------------------- # Agent helper (SearchPlan -> search()) # ----------------------------- def _join_csv(xs: List[str]) -> str: xs = [str(x).strip() for x in (xs or []) if str(x).strip()] return ",".join(xs) def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]: """ Wrapper for agent.py. Expects `plan` with fields: query_text, skills_any, skills_all, roles_any, location, remote, english_min, exp_years_min, salary_min, salary_max, limit, sort Returns: { "items": [...], "count": N } """ filters = { "remote": getattr(plan, "remote", None), "location": getattr(plan, "location", None), "experience_min": getattr(plan, "exp_years_min", None), "salary_min": getattr(plan, "salary_min", None), "salary_max": getattr(plan, "salary_max", None), "english": getattr(plan, "english_min", None), "roles_any": _join_csv(getattr(plan, "roles_any", []) or []), "skills_any": _join_csv(getattr(plan, "skills_any", []) or []), "skills_all": _join_csv(getattr(plan, "skills_all", []) or []), } items = search( con, query=(getattr(plan, "query_text", "") or "").strip(), filters=filters, limit=int(getattr(plan, "limit", 20) or 20), offset=0, ) sort_mode = (getattr(plan, "sort", "rank") or "rank").strip() if sort_mode == "exp_desc": def k(it: Dict[str, Any]): v = it.get("experience_years") return (v is None, -(v or 0.0)) items = sorted(items, key=k) elif sort_mode == "salary_desc": def k(it: Dict[str, Any]): v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min") return (v is None, -(v or 0)) items = sorted(items, key=k) return {"items": items, "count": len(items)}