Files
tg_resume_db/search.py
2026-03-11 15:27:10 +03:00

394 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import json
import re
import sqlite3
from typing import Any, Dict, List, Tuple
from tg_resume_db.normalize import normalize_skill, find_skills_in_text
# -----------------------------
# Normalization helpers
# -----------------------------
def _norm_token(v: str) -> str:
return " ".join(str(v).strip().lower().split())
def _as_list(v: Any) -> List[str]:
"""
Accepts:
- None
- list
- "a,b,c" (csv string)
"""
if v is None:
return []
if isinstance(v, list):
return [str(x) for x in v if str(x).strip()]
s = str(v).strip()
if not s:
return []
return [x.strip() for x in s.split(",") if x.strip()]
def _uniq_keep_order(xs: List[str]) -> List[str]:
seen = set()
out: List[str] = []
for x in xs:
t = _norm_token(x)
if not t or t in seen:
continue
seen.add(t)
out.append(t)
return out
# -----------------------------
# Pipe-normalized columns filters
# skills_norm / roles_norm like: "|python|fastapi|"
# -----------------------------
def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " OR ".join(parts) + ")", args
def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " AND ".join(parts) + ")", args
# -----------------------------
# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
# -----------------------------
# allow longer queries (списки имён, длинные промпты) без агрессивного усечения
_FTS_MAX_TERMS = 48
def _fts_safe_query(q: str) -> str:
"""
Turn a free-form recruiter text into a safe FTS5 MATCH expression.
We intentionally DO NOT allow raw FTS syntax from user input,
because it easily breaks on commas/quotes/etc.
Example:
"Backend developer, опыт 5+ лет, Java C++ Python" ->
"\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
"""
if not q:
return "resume"
s = q.strip().lower()
# normalize common tokens
s = s.replace("c++", "cpp")
s = s.replace("c#", "csharp")
s = s.replace(".net", "dotnet")
# remove punctuation that breaks MATCH
s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
s = re.sub(r"\s+", " ", s).strip()
# tokens (latin/cyrillic + digits + a few chars)
terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
terms = terms[:_FTS_MAX_TERMS]
if not terms:
return "resume"
# quote every term => safe; join with OR => broad query
return " OR ".join([f"\"{t}\"" for t in terms])
def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
    """
    Extract +must and -exclude skill modifiers from a free-form query.

    Returns (must, exclude, cleaned_query) where cleaned_query is *q* with the
    modifier tokens removed and whitespace collapsed. Skills are canonicalized
    via normalize_skill(); unknown tokens are dropped. Queries joined with
    " and " / " & " additionally treat every detected skill as required.
    """
    if not q:
        return [], [], ""
    # FIX: require the modifier sign to sit at a token boundary. Without the
    # lookbehind, hyphenated words ("full-stack", "e-commerce") were misread
    # as exclude-modifiers and stripped from the cleaned query.
    must_raw = re.findall(r"(?<![\w#.+-])\+([A-Za-z0-9#.+-]{2,})", q)
    excl_raw = re.findall(r"(?<![\w#.+-])-([A-Za-z0-9#.+-]{2,})", q)
    must: List[str] = []
    exclude: List[str] = []
    for t in must_raw:
        canon = normalize_skill(t)
        if canon:
            must.append(canon)
    for t in excl_raw:
        canon = normalize_skill(t)
        if canon:
            exclude.append(canon)
    # Conjunctive phrasing implies every mentioned skill is mandatory.
    if " and " in q.lower() or " & " in q:
        must += find_skills_in_text(q)
    # Remove the modifier tokens from the query text (same boundary rule).
    cleaned = re.sub(r"(?<![\w#.+-])[+-][A-Za-z0-9#.+-]{2,}", " ", q)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
# -----------------------------
# Contacts
# -----------------------------
def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
rows = con.execute(
"SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
(candidate_id,),
).fetchall()
m: Dict[str, List[str]] = {}
for r in rows:
m.setdefault(r["contact_type"], []).append(r["contact_value"])
# чуть чище: уберём дубль-контакты
for k, vals in list(m.items()):
m[k] = _uniq_keep_order(vals)
return m
# -----------------------------
# Main search (FTS + filters)
# -----------------------------
def search(
    con: sqlite3.Connection,
    query: str,
    filters: Dict[str, Any],
    limit: int = 20,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """
    Search candidates using:
      - FTS5 for ranking/snippet
      - stack filters for skills/roles via pipe-normalized columns
      - basic filters: remote/location/experience/salary/english

    Parameters
    ----------
    con : sqlite3 connection. Rows are read by column name, so row_factory
        must produce mapping-style rows (e.g. sqlite3.Row) — assumed set by
        the caller; TODO confirm.
    query : free-form recruiter text; may carry +must / -exclude modifiers.
    filters : optional keys — remote, location, experience_min, salary_min,
        salary_max, doc_type, skill/role (legacy single values), and
        skills_any/skills_all/roles_any (list or CSV string).
    limit : page size, clamped to 1..100.
    offset : page start, clamped to >= 0.

    Returns a list of candidate dicts ordered by bm25 rank (best first).
    """
    where: List[str] = ["r.is_active = 1"]
    params: List[Any] = []
    # Split the raw query into must/exclude skill modifiers + cleaned text.
    must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")
    # -------- basic filters --------
    if filters.get("remote") is not None:
        where.append("c.remote = ?")
        params.append(1 if bool(filters["remote"]) else 0)
    if filters.get("location"):
        # Case-insensitive substring match on location.
        where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
        params.append("%" + str(filters["location"]).lower() + "%")
    # Use experience_years for SQL-level filtering (broad search); the strict
    # experience_years_eng check happens during post-filtering in agent.py.
    if filters.get("experience_min") is not None:
        where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
        params.append(float(filters["experience_min"]))
    # Salary: "unknown salary doesn't exclude"
    if filters.get("salary_min") is not None:
        where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
        params.append(int(filters["salary_min"]))
    if filters.get("salary_max") is not None:
        where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
        params.append(int(filters["salary_max"]))
    if filters.get("doc_type"):
        where.append("r.doc_type = ?")
        params.append(str(filters["doc_type"]))
    # English: not filtered at SQL level (a B2 requirement would miss C1/C2);
    # the post-filter lives in agent.py.
    # -------- roles/skills stack filters --------
    # backward compatibility: legacy single-value "skill"/"role" filters
    skills_any: List[str] = []
    skills_all: List[str] = []
    roles_any: List[str] = []
    if filters.get("skill"):
        skills_any.append(str(filters["skill"]))
    if filters.get("role"):
        roles_any.append(str(filters["role"]))
    skills_any += _as_list(filters.get("skills_any"))
    skills_all += _as_list(filters.get("skills_all"))
    roles_any += _as_list(filters.get("roles_any"))
    # Canonicalize skills; fall back to the raw token when normalize_skill
    # returns a falsy value (unknown skill).
    skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
    skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
    roles_any = _uniq_keep_order(roles_any)
    if must_skills:
        skills_all = _uniq_keep_order(skills_all + must_skills)
    # Denis rule: if any skills were provided -> enforce ANY match
    if skills_any:
        clause, args = _pipe_any_clause("c.skills_norm", skills_any)
        where.append(clause)
        params.extend(args)
    if skills_all:
        clause, args = _pipe_all_clause("c.skills_norm", skills_all)
        where.append(clause)
        params.extend(args)
    if roles_any:
        clause, args = _pipe_any_clause("c.roles_norm", roles_any)
        where.append(clause)
        params.extend(args)
    if exclude_skills:
        # Candidate must NOT contain any excluded skill in the pipe-list column.
        for sk in exclude_skills:
            where.append("instr(c.skills_norm, ?) = 0")
            params.append(f"|{sk}|")
    # -------- FTS query (SAFE) --------
    fts_q = _fts_safe_query(cleaned_query or "")
    limit = max(1, min(int(limit or 20), 100))
    offset = max(0, int(offset or 0))
    # NOTE: bind-parameter order below ([fts_q] + params + [limit, offset])
    # must mirror the order clauses were appended to `where`.
    # SQL selects experience_years_eng and language/backend metadata too.
    sql = f"""
        SELECT
            c.candidate_id,
            c.name,
            c.location,
            c.remote,
            c.experience_years,
            c.experience_years_eng,
            c.experience_confidence,
            c.salary_min,
            c.salary_max,
            c.salary_confidence,
            c.english_level,
            c.roles_json,
            c.skills_json,
            c.primary_languages_json,
            c.backend_focus,
            r.doc_type,
            r.doc_type_confidence,
            r.parse_method,
            r.resume_id,
            snippet(resumes_fts, 2, '[', ']', '', 14) AS snippet,
            bm25(resumes_fts) AS rank
        FROM resumes_fts
        JOIN resumes r ON r.resume_id = resumes_fts.resume_id
        JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
        WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
        ORDER BY rank
        LIMIT ? OFFSET ?
    """
    rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()
    out: List[Dict[str, Any]] = []
    for row in rows:
        cand_id = row["candidate_id"]
        # One extra query per result row to collect contacts (N+1 pattern;
        # acceptable at limit <= 100).
        contacts_map = _fetch_contacts_map(con, cand_id)
        out.append(
            {
                "candidate_id": cand_id,
                "name": row["name"],
                "location": row["location"],
                # Tri-state: True/False when stored, None when unknown.
                "remote": bool(row["remote"]) if row["remote"] is not None else None,
                "experience_years": row["experience_years"],
                "experience_years_eng": row["experience_years_eng"],  # Passed to agent
                "experience_confidence": row["experience_confidence"],
                "salary_min": row["salary_min"],
                "salary_max": row["salary_max"],
                "salary_confidence": row["salary_confidence"],
                "english_level": row["english_level"],
                # JSON columns decoded with empty-list fallback.
                "roles": json.loads(row["roles_json"] or "[]"),
                "skills": json.loads(row["skills_json"] or "[]"),
                "primary_languages": json.loads(row["primary_languages_json"] or "[]"),
                "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
                "doc_type": row["doc_type"],
                "doc_type_confidence": row["doc_type_confidence"],
                "parse_method": row["parse_method"],
                "contacts": contacts_map,
                "resume_id": row["resume_id"],
                "snippet": row["snippet"],
                "rank": row["rank"],
            }
        )
    return out
# -----------------------------
# Agent helper (SearchPlan -> search())
# -----------------------------
def _join_csv(xs: List[str]) -> str:
xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
return ",".join(xs)
def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
    """
    Adapter for agent.py: translate a SearchPlan-like object into a search()
    call and optionally re-sort the results client-side.

    `plan` is duck-typed; the attributes read (all optional) are:
        query_text, skills_any, skills_all, roles_any, location, remote,
        english_min, exp_years_min, salary_min, salary_max, limit, sort

    Returns:
        { "items": [...], "count": N }
    """
    filters = {
        "remote": getattr(plan, "remote", None),
        "location": getattr(plan, "location", None),
        "experience_min": getattr(plan, "exp_years_min", None),
        "salary_min": getattr(plan, "salary_min", None),
        "salary_max": getattr(plan, "salary_max", None),
        "english": getattr(plan, "english_min", None),
        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
    }
    query_text = (getattr(plan, "query_text", "") or "").strip()
    page_size = int(getattr(plan, "limit", 20) or 20)
    items = search(con, query=query_text, filters=filters, limit=page_size, offset=0)

    # Optional re-sort; unknown (None) values always sort last.
    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
    if sort_mode == "exp_desc":
        items = sorted(
            items,
            key=lambda it: (it.get("experience_years") is None,
                            -(it.get("experience_years") or 0.0)),
        )
    elif sort_mode == "salary_desc":
        def _salary_key(it: Dict[str, Any]):
            best = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
            return (best is None, -(best or 0))
        items = sorted(items, key=_salary_key)
    return {"items": items, "count": len(items)}