Initial commit

2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions
--- a/search.py
+++ b/search.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+import json
+import re
+import sqlite3
+from typing import Any, Dict, List, Tuple
+
+from tg_resume_db.normalize import normalize_skill, find_skills_in_text
+
+
+# -----------------------------
+# Normalization helpers
+# -----------------------------
+
+def _norm_token(v: str) -> str:
+    return " ".join(str(v).strip().lower().split())
+
+
+def _as_list(v: Any) -> List[str]:
+    """
+    Accepts:
+      - None
+      - list
+      - "a,b,c" (csv string)
+    """
+    if v is None:
+        return []
+    if isinstance(v, list):
+        return [str(x) for x in v if str(x).strip()]
+    s = str(v).strip()
+    if not s:
+        return []
+    return [x.strip() for x in s.split(",") if x.strip()]
+
+
+def _uniq_keep_order(xs: List[str]) -> List[str]:
+    seen = set()
+    out: List[str] = []
+    for x in xs:
+        t = _norm_token(x)
+        if not t or t in seen:
+            continue
+        seen.add(t)
+        out.append(t)
+    return out
+
+
+# -----------------------------
+# Pipe-normalized columns filters
+# skills_norm / roles_norm like: "|python|fastapi|"
+# -----------------------------
+
+def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " OR ".join(parts) + ")", args
+
+
+def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " AND ".join(parts) + ")", args
+
+
+# -----------------------------
+# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
+# -----------------------------
+
+# Maximum number of terms kept when building the FTS query. Kept generous so
+# that longer queries (lists of names, long prompts) are not truncated aggressively.
+_FTS_MAX_TERMS = 48
+
+def _fts_safe_query(q: str) -> str:
+    """
+    Turn a free-form recruiter text into a safe FTS5 MATCH expression.
+    We intentionally DO NOT allow raw FTS syntax from user input,
+    because it easily breaks on commas/quotes/etc.
+
+    Example:
+      "Backend developer, опыт 5+ лет, Java C++ Python" ->
+      "\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
+    """
+    if not q:
+        return "resume"
+
+    s = q.strip().lower()
+
+    # normalize common tokens
+    s = s.replace("c++", "cpp")
+    s = s.replace("c#", "csharp")
+    s = s.replace(".net", "dotnet")
+
+    # remove punctuation that breaks MATCH
+    s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # tokens (latin/cyrillic + digits + a few chars)
+    terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
+    terms = terms[:_FTS_MAX_TERMS]
+
+    if not terms:
+        return "resume"
+
+    # quote every term => safe; join with OR => broad query
+    return " OR ".join([f"\"{t}\"" for t in terms])
+
+
+def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
+    """
+    Extract +must and -exclude skills from query; return (must, exclude, cleaned_query).
+    """
+    if not q:
+        return [], [], ""
+    must_raw = re.findall(r"\+([A-Za-z0-9#.+-]{2,})", q)
+    excl_raw = re.findall(r"\-([A-Za-z0-9#.+-]{2,})", q)
+    must = []
+    exclude = []
+    for t in must_raw:
+        canon = normalize_skill(t)
+        if canon:
+            must.append(canon)
+    for t in excl_raw:
+        canon = normalize_skill(t)
+        if canon:
+            exclude.append(canon)
+    if " and " in q.lower() or " & " in q:
+        must += find_skills_in_text(q)
+
+    cleaned = re.sub(r"[+-][A-Za-z0-9#.+-]{2,}", " ", q)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
+
+
+# -----------------------------
+# Contacts
+# -----------------------------
+
+def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
+    rows = con.execute(
+        "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
+        (candidate_id,),
+    ).fetchall()
+
+    m: Dict[str, List[str]] = {}
+    for r in rows:
+        m.setdefault(r["contact_type"], []).append(r["contact_value"])
+
+    # чуть чище: уберём дубль-контакты
+    for k, vals in list(m.items()):
+        m[k] = _uniq_keep_order(vals)
+
+    return m
+
+
+# -----------------------------
+# Main search (FTS + filters)
+# -----------------------------
+
+def search(
+    con: sqlite3.Connection,
+    query: str,
+    filters: Dict[str, Any],
+    limit: int = 20,
+    offset: int = 0,
+) -> List[Dict[str, Any]]:
+    """
+    Search candidates using:
+      - FTS5 for ranking/snippet
+      - stack filters for skills/roles via pipe-normalized columns
+      - basic filters: remote/location/experience/salary/english
+
+    Args:
+      con:     SQLite connection. Rows are read by column name, so the
+               connection must use a row factory that supports name lookup
+               (e.g. sqlite3.Row).
+      query:   free-form text. "+skill" / "-skill" modifiers are extracted by
+               _parse_query_modifiers; the remaining text is turned into a
+               quoted OR query by _fts_safe_query.
+      filters: dict of optional keys read here:
+                 remote, location, experience_min, salary_min, salary_max,
+                 doc_type, skill, role, skills_any, skills_all, roles_any.
+               An "english" key is not read by this function (see the note
+               in the body).
+      limit:   page size, clamped to the range 1..100.
+      offset:  number of rows to skip, clamped to >= 0.
+
+    Returns:
+      A list of result dicts (candidate fields, decoded roles/skills/
+      primary_languages, contacts, resume_id, snippet, rank), ordered by the
+      bm25 rank computed in SQL.
+
+    Note: contacts are loaded with one extra query per returned row.
+    """
+
+    # `where` and `params` are appended in lockstep: every "?" added to a
+    # clause has its value appended to `params` in the same order.
+    where: List[str] = ["r.is_active = 1"]
+    params: List[Any] = []
+
+    must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")
+
+    # -------- basic filters --------
+    if filters.get("remote") is not None:
+        where.append("c.remote = ?")
+        params.append(1 if bool(filters["remote"]) else 0)
+
+    # Case-insensitive substring match on the location text.
+    if filters.get("location"):
+        where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
+        params.append("%" + str(filters["location"]).lower() + "%")
+
+    # Use experience_years for SQL filtering (broad search); the strict
+    # experience_years_eng check happens later, during post-filtering in agent.py
+    if filters.get("experience_min") is not None:
+        where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
+        params.append(float(filters["experience_min"]))
+
+    # Salary: "unknown salary doesn't exclude"
+    if filters.get("salary_min") is not None:
+        where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
+        params.append(int(filters["salary_min"]))
+
+    if filters.get("salary_max") is not None:
+        where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
+        params.append(int(filters["salary_max"]))
+
+    if filters.get("doc_type"):
+        where.append("r.doc_type = ?")
+        params.append(str(filters["doc_type"]))
+
+    # English: not filtered at SQL level (otherwise B2 would not catch C1/C2); post-filtered in agent.py
+
+    # -------- roles/skills stack filters --------
+    # backward compatibility
+    skills_any: List[str] = []
+    skills_all: List[str] = []
+    roles_any: List[str] = []
+
+    # Legacy single-value keys are folded into the "any" lists.
+    if filters.get("skill"):
+        skills_any.append(str(filters["skill"]))
+    if filters.get("role"):
+        roles_any.append(str(filters["role"]))
+
+    skills_any += _as_list(filters.get("skills_any"))
+    skills_all += _as_list(filters.get("skills_all"))
+    roles_any += _as_list(filters.get("roles_any"))
+
+    # Skills are canonicalized with normalize_skill, falling back to the raw
+    # value when it returns a falsy result; roles are only de-duplicated.
+    skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
+    skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
+    roles_any = _uniq_keep_order(roles_any)
+
+    # "+skill" modifiers from the query text become additional required skills.
+    if must_skills:
+        skills_all = _uniq_keep_order(skills_all + must_skills)
+
+    # Denis rule: if any skills were provided -> enforce ANY match
+    if skills_any:
+        clause, args = _pipe_any_clause("c.skills_norm", skills_any)
+        where.append(clause)
+        params.extend(args)
+
+    if skills_all:
+        clause, args = _pipe_all_clause("c.skills_norm", skills_all)
+        where.append(clause)
+        params.extend(args)
+
+    if roles_any:
+        clause, args = _pipe_any_clause("c.roles_norm", roles_any)
+        where.append(clause)
+        params.extend(args)
+
+    # "-skill" modifiers: the pipe-wrapped skill must not occur in skills_norm.
+    if exclude_skills:
+        for sk in exclude_skills:
+            where.append("instr(c.skills_norm, ?) = 0")
+            params.append(f"|{sk}|")
+
+    # -------- FTS query (SAFE) --------
+    fts_q = _fts_safe_query(cleaned_query or "")
+
+    # Clamp paging: limit to 1..100 (falsy -> 20), offset to >= 0.
+    limit = max(1, min(int(limit or 20), 100))
+    offset = max(0, int(offset or 0))
+
+    # UPDATED SQL: Added experience_years_eng and language/backend metadata
+    # Only trusted, code-built fragments from `where` are interpolated below;
+    # all user-supplied values travel as bound parameters.
+    # snippet() reads column index 2 of resumes_fts (which column that is
+    # depends on the table definition, which is not shown here).
+    # NOTE: per the SQLite FTS5 documentation, bm25() gives better matches
+    # numerically smaller values, so ascending ORDER BY rank lists them first.
+    sql = f"""
+    SELECT
+      c.candidate_id,
+      c.name,
+      c.location,
+      c.remote,
+      c.experience_years,
+      c.experience_years_eng,
+      c.experience_confidence,
+      c.salary_min,
+      c.salary_max,
+      c.salary_confidence,
+      c.english_level,
+      c.roles_json,
+      c.skills_json,
+      c.primary_languages_json,
+      c.backend_focus,
+      r.doc_type,
+      r.doc_type_confidence,
+      r.parse_method,
+      r.resume_id,
+      snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet,
+      bm25(resumes_fts) AS rank
+    FROM resumes_fts
+    JOIN resumes r ON r.resume_id = resumes_fts.resume_id
+    JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
+    WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
+    ORDER BY rank
+    LIMIT ? OFFSET ?
+    """
+
+    # Parameter order mirrors placeholder order: MATCH, filters, LIMIT, OFFSET.
+    rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()
+
+    out: List[Dict[str, Any]] = []
+    for row in rows:
+        cand_id = row["candidate_id"]
+        # One additional query per result row.
+        contacts_map = _fetch_contacts_map(con, cand_id)
+
+        out.append(
+            {
+                "candidate_id": cand_id,
+                "name": row["name"],
+                "location": row["location"],
+                # NULL stays None instead of being coerced to False.
+                "remote": bool(row["remote"]) if row["remote"] is not None else None,
+                "experience_years": row["experience_years"],
+                "experience_years_eng": row["experience_years_eng"], # Passed to agent
+                "experience_confidence": row["experience_confidence"],
+                "salary_min": row["salary_min"],
+                "salary_max": row["salary_max"],
+                "salary_confidence": row["salary_confidence"],
+                "english_level": row["english_level"],
+                # NULL/empty JSON columns decode to an empty list.
+                "roles": json.loads(row["roles_json"] or "[]"),
+                "skills": json.loads(row["skills_json"] or "[]"),
+                "primary_languages": json.loads(row["primary_languages_json"] or "[]"),
+                "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
+                "doc_type": row["doc_type"],
+                "doc_type_confidence": row["doc_type_confidence"],
+                "parse_method": row["parse_method"],
+                "contacts": contacts_map,
+                "resume_id": row["resume_id"],
+                "snippet": row["snippet"],
+                "rank": row["rank"],
+            }
+        )
+
+    return out
+
+
+# -----------------------------
+# Agent helper (SearchPlan -> search())
+# -----------------------------
+
+def _join_csv(xs: List[str]) -> str:
+    xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
+    return ",".join(xs)
+
+
+def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
+    """
+    Wrapper for agent.py.
+    Expects `plan` with fields:
+      query_text, skills_any, skills_all, roles_any, location, remote,
+      english_min, exp_years_min, salary_min, salary_max, limit, sort
+    Returns:
+      { "items": [...], "count": N }
+    """
+    filters = {
+        "remote": getattr(plan, "remote", None),
+        "location": getattr(plan, "location", None),
+        "experience_min": getattr(plan, "exp_years_min", None),
+        "salary_min": getattr(plan, "salary_min", None),
+        "salary_max": getattr(plan, "salary_max", None),
+        "english": getattr(plan, "english_min", None),
+        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
+        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
+        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
+    }
+
+    items = search(
+        con,
+        query=(getattr(plan, "query_text", "") or "").strip(),
+        filters=filters,
+        limit=int(getattr(plan, "limit", 20) or 20),
+        offset=0,
+    )
+
+    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
+
+    if sort_mode == "exp_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("experience_years")
+            return (v is None, -(v or 0.0))
+        items = sorted(items, key=k)
+
+    elif sort_mode == "salary_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
+            return (v is None, -(v or 0))
+        items = sorted(items, key=k)
+
+    return {"items": items, "count": len(items)}