Initial commit
This commit is contained in:
393
search.py
Normal file
393
search.py
Normal file
@@ -0,0 +1,393 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from tg_resume_db.normalize import normalize_skill, find_skills_in_text
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Normalization helpers
|
||||
# -----------------------------
|
||||
|
||||
def _norm_token(v: str) -> str:
|
||||
return " ".join(str(v).strip().lower().split())
|
||||
|
||||
|
||||
def _as_list(v: Any) -> List[str]:
|
||||
"""
|
||||
Accepts:
|
||||
- None
|
||||
- list
|
||||
- "a,b,c" (csv string)
|
||||
"""
|
||||
if v is None:
|
||||
return []
|
||||
if isinstance(v, list):
|
||||
return [str(x) for x in v if str(x).strip()]
|
||||
s = str(v).strip()
|
||||
if not s:
|
||||
return []
|
||||
return [x.strip() for x in s.split(",") if x.strip()]
|
||||
|
||||
|
||||
def _uniq_keep_order(xs: List[str]) -> List[str]:
|
||||
seen = set()
|
||||
out: List[str] = []
|
||||
for x in xs:
|
||||
t = _norm_token(x)
|
||||
if not t or t in seen:
|
||||
continue
|
||||
seen.add(t)
|
||||
out.append(t)
|
||||
return out
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Pipe-normalized columns filters
|
||||
# skills_norm / roles_norm like: "|python|fastapi|"
|
||||
# -----------------------------
|
||||
|
||||
def _pipe_clause(field: str, values: List[str], op: str) -> Tuple[str, List[Any]]:
    """
    Build an instr()-based WHERE clause over a pipe-normalized column.

    Columns like skills_norm/roles_norm store values as "|python|fastapi|",
    so a substring test against "|value|" is an exact-element match.

    Args:
        field: fully-qualified column name (e.g. "c.skills_norm").
        values: raw values; blanks are dropped, the rest normalized.
        op: SQL boolean operator joining the per-value tests ("OR"/"AND").

    Returns:
        (clause, args). Falls back to ("1=1", []) when no usable values
        are given, so callers can unconditionally AND the clause in.
    """
    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
    if not vals:
        return ("1=1", [])

    parts = [f"instr({field}, ?) > 0" for _ in vals]
    args: List[Any] = [f"|{v}|" for v in vals]
    return "(" + f" {op} ".join(parts) + ")", args


def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
    """ANY-match: row qualifies if the column contains at least one value."""
    return _pipe_clause(field, values, "OR")


def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
    """ALL-match: row qualifies only if the column contains every value."""
    return _pipe_clause(field, values, "AND")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
|
||||
# -----------------------------
|
||||
|
||||
# allow longer queries (списки имён, длинные промпты) без агрессивного усечения
|
||||
_FTS_MAX_TERMS = 48
|
||||
|
||||
def _fts_safe_query(q: str) -> str:
|
||||
"""
|
||||
Turn a free-form recruiter text into a safe FTS5 MATCH expression.
|
||||
We intentionally DO NOT allow raw FTS syntax from user input,
|
||||
because it easily breaks on commas/quotes/etc.
|
||||
|
||||
Example:
|
||||
"Backend developer, опыт 5+ лет, Java C++ Python" ->
|
||||
"\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
|
||||
"""
|
||||
if not q:
|
||||
return "resume"
|
||||
|
||||
s = q.strip().lower()
|
||||
|
||||
# normalize common tokens
|
||||
s = s.replace("c++", "cpp")
|
||||
s = s.replace("c#", "csharp")
|
||||
s = s.replace(".net", "dotnet")
|
||||
|
||||
# remove punctuation that breaks MATCH
|
||||
s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
# tokens (latin/cyrillic + digits + a few chars)
|
||||
terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
|
||||
terms = terms[:_FTS_MAX_TERMS]
|
||||
|
||||
if not terms:
|
||||
return "resume"
|
||||
|
||||
# quote every term => safe; join with OR => broad query
|
||||
return " OR ".join([f"\"{t}\"" for t in terms])
|
||||
|
||||
|
||||
def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
    """
    Extract +must and -exclude skill modifiers from a query string.

    Returns (must, exclude, cleaned_query) where cleaned_query is the
    original text with the modifier tokens removed and whitespace collapsed.

    A modifier is recognized only at the start of a whitespace-delimited
    token (start of string or after whitespace). This fixes the previous
    behavior where hyphenated words such as "full-stack" were misread as
    an exclusion of "stack" and partially deleted from the cleaned query.
    """
    if not q:
        return [], [], ""

    # Anchor +/- to token starts so in-word hyphens/pluses are left alone.
    must_raw = re.findall(r"(?:(?<=\s)|^)\+([A-Za-z0-9#.+-]{2,})", q)
    excl_raw = re.findall(r"(?:(?<=\s)|^)-([A-Za-z0-9#.+-]{2,})", q)

    must: List[str] = []
    exclude: List[str] = []
    for t in must_raw:
        canon = normalize_skill(t)
        if canon:
            must.append(canon)
    for t in excl_raw:
        canon = normalize_skill(t)
        if canon:
            exclude.append(canon)

    # Heuristic: an explicit conjunction implies every mentioned skill is required.
    if " and " in q.lower() or " & " in q:
        must += find_skills_in_text(q)

    # Strip modifier tokens (same anchoring as the extraction above).
    cleaned = re.sub(r"(?:(?<=\s)|^)[+-][A-Za-z0-9#.+-]{2,}", " ", q)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Contacts
|
||||
# -----------------------------
|
||||
|
||||
def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
    """
    Return {contact_type: [values...]} for one candidate, with duplicate
    values removed per type (first-seen order kept).

    NOTE(review): rows are accessed by column name, so the connection's
    row_factory is assumed to support mapping access (e.g. sqlite3.Row).
    """
    cursor = con.execute(
        "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
        (candidate_id,),
    )

    grouped: Dict[str, List[str]] = {}
    for record in cursor.fetchall():
        grouped.setdefault(record["contact_type"], []).append(record["contact_value"])

    # A bit tidier: drop duplicate contact values within each type.
    return {ctype: _uniq_keep_order(vals) for ctype, vals in grouped.items()}
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Main search (FTS + filters)
|
||||
# -----------------------------
|
||||
|
||||
def search(
    con: sqlite3.Connection,
    query: str,
    filters: Dict[str, Any],
    limit: int = 20,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """
    Search candidates using:
      - FTS5 for ranking/snippet (results ordered by bm25 rank)
      - stack filters for skills/roles via pipe-normalized columns
      - basic filters: remote/location/experience/salary/english

    Args:
        con: open connection; result rows are read by column name, so the
            connection's row_factory is assumed to support mapping access
            (e.g. sqlite3.Row) -- confirm at call site.
        query: free-form recruiter text; may carry +must / -exclude skill
            modifiers, which are parsed out before the FTS MATCH is built.
        filters: recognized keys: remote, location, experience_min,
            salary_min, salary_max, doc_type, skill, role, skills_any,
            skills_all, roles_any. Unknown keys are ignored.
        limit: page size; clamped to 1..100 below.
        offset: page start; clamped to >= 0 below.

    Returns:
        One dict per matching active resume, including parsed JSON lists,
        a per-candidate contacts map, an FTS snippet, and the bm25 rank.
    """

    # Predicate list + positional bind values. IMPORTANT: params must be
    # appended in exactly the order the ?-placeholders enter `where`.
    where: List[str] = ["r.is_active = 1"]
    params: List[Any] = []

    # Pull +must / -exclude skill modifiers out of the raw query text.
    must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")

    # -------- basic filters --------
    if filters.get("remote") is not None:
        where.append("c.remote = ?")
        params.append(1 if bool(filters["remote"]) else 0)

    if filters.get("location"):
        # Case-insensitive substring match on location.
        where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
        params.append("%" + str(filters["location"]).lower() + "%")

    # Use experience_years for SQL-level filtering (broad search); the
    # strict experience_years_eng check happens later, in the
    # post-filtering step in agent.py.
    if filters.get("experience_min") is not None:
        where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
        params.append(float(filters["experience_min"]))

    # Salary: "unknown salary doesn't exclude" -- NULL bounds always pass.
    if filters.get("salary_min") is not None:
        where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
        params.append(int(filters["salary_min"]))

    if filters.get("salary_max") is not None:
        where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
        params.append(int(filters["salary_max"]))

    if filters.get("doc_type"):
        where.append("r.doc_type = ?")
        params.append(str(filters["doc_type"]))

    # English: not filtered at the SQL level (otherwise asking for B2 would
    # not catch C1/C2); handled by the post-filter in agent.py.

    # -------- roles/skills stack filters --------
    # backward compatibility: legacy single-value "skill"/"role" keys are
    # folded into the list-based filters below.
    skills_any: List[str] = []
    skills_all: List[str] = []
    roles_any: List[str] = []

    if filters.get("skill"):
        skills_any.append(str(filters["skill"]))
    if filters.get("role"):
        roles_any.append(str(filters["role"]))

    skills_any += _as_list(filters.get("skills_any"))
    skills_all += _as_list(filters.get("skills_all"))
    roles_any += _as_list(filters.get("roles_any"))

    # Canonicalize each skill when possible; keep the raw value as fallback.
    skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
    skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
    roles_any = _uniq_keep_order(roles_any)

    # Query-level +skills are hard requirements: merge into skills_all.
    if must_skills:
        skills_all = _uniq_keep_order(skills_all + must_skills)

    # Denis rule: if any skills were provided -> enforce ANY match
    if skills_any:
        clause, args = _pipe_any_clause("c.skills_norm", skills_any)
        where.append(clause)
        params.extend(args)

    if skills_all:
        clause, args = _pipe_all_clause("c.skills_norm", skills_all)
        where.append(clause)
        params.extend(args)

    if roles_any:
        clause, args = _pipe_any_clause("c.roles_norm", roles_any)
        where.append(clause)
        params.extend(args)

    # -exclude skills: the pipe-wrapped token must be absent from the column.
    if exclude_skills:
        for sk in exclude_skills:
            where.append("instr(c.skills_norm, ?) = 0")
            params.append(f"|{sk}|")

    # -------- FTS query (SAFE) --------
    # User text is sanitized into quoted OR-joined terms; raw MATCH syntax
    # never reaches SQLite.
    fts_q = _fts_safe_query(cleaned_query or "")

    # Clamp paging to sane bounds.
    limit = max(1, min(int(limit or 20), 100))
    offset = max(0, int(offset or 0))

    # UPDATED SQL: Added experience_years_eng and language/backend metadata
    sql = f"""
        SELECT
            c.candidate_id,
            c.name,
            c.location,
            c.remote,
            c.experience_years,
            c.experience_years_eng,
            c.experience_confidence,
            c.salary_min,
            c.salary_max,
            c.salary_confidence,
            c.english_level,
            c.roles_json,
            c.skills_json,
            c.primary_languages_json,
            c.backend_focus,
            r.doc_type,
            r.doc_type_confidence,
            r.parse_method,
            r.resume_id,
            snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet,
            bm25(resumes_fts) AS rank
        FROM resumes_fts
        JOIN resumes r ON r.resume_id = resumes_fts.resume_id
        JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
        WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
        ORDER BY rank
        LIMIT ? OFFSET ?
    """

    # Bind order: MATCH expression first, then WHERE params, then paging.
    rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()

    out: List[Dict[str, Any]] = []
    for row in rows:
        cand_id = row["candidate_id"]
        # One extra query per result row; acceptable for page sizes <= 100.
        contacts_map = _fetch_contacts_map(con, cand_id)

        out.append(
            {
                "candidate_id": cand_id,
                "name": row["name"],
                "location": row["location"],
                # Tri-state: None means "unknown", not False.
                "remote": bool(row["remote"]) if row["remote"] is not None else None,
                "experience_years": row["experience_years"],
                "experience_years_eng": row["experience_years_eng"],  # Passed to agent
                "experience_confidence": row["experience_confidence"],
                "salary_min": row["salary_min"],
                "salary_max": row["salary_max"],
                "salary_confidence": row["salary_confidence"],
                "english_level": row["english_level"],
                # JSON columns may be NULL; fall back to an empty list.
                "roles": json.loads(row["roles_json"] or "[]"),
                "skills": json.loads(row["skills_json"] or "[]"),
                "primary_languages": json.loads(row["primary_languages_json"] or "[]"),
                "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
                "doc_type": row["doc_type"],
                "doc_type_confidence": row["doc_type_confidence"],
                "parse_method": row["parse_method"],
                "contacts": contacts_map,
                "resume_id": row["resume_id"],
                "snippet": row["snippet"],
                "rank": row["rank"],
            }
        )

    return out
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Agent helper (SearchPlan -> search())
|
||||
# -----------------------------
|
||||
|
||||
def _join_csv(xs: List[str]) -> str:
|
||||
xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
|
||||
return ",".join(xs)
|
||||
|
||||
|
||||
def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
    """
    Wrapper for agent.py: translate a SearchPlan-like object into a search() call.

    Reads these fields from `plan` (all optional, via getattr):
        query_text, skills_any, skills_all, roles_any, location, remote,
        english_min, exp_years_min, salary_min, salary_max, limit, sort

    Returns:
        {"items": [...], "count": N}
    """
    filters = {
        "remote": getattr(plan, "remote", None),
        "location": getattr(plan, "location", None),
        "experience_min": getattr(plan, "exp_years_min", None),
        "salary_min": getattr(plan, "salary_min", None),
        "salary_max": getattr(plan, "salary_max", None),
        "english": getattr(plan, "english_min", None),
        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
    }

    query_text = (getattr(plan, "query_text", "") or "").strip()
    page_size = int(getattr(plan, "limit", 20) or 20)

    items = search(con, query=query_text, filters=filters, limit=page_size, offset=0)

    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()

    if sort_mode == "exp_desc":
        # Unknown experience sorts last; known values descend.
        items = sorted(
            items,
            key=lambda it: (
                it.get("experience_years") is None,
                -(it.get("experience_years") or 0.0),
            ),
        )
    elif sort_mode == "salary_desc":
        # Prefer salary_max, fall back to salary_min; unknown sorts last.
        def _salary_key(it: Dict[str, Any]):
            v = it.get("salary_max")
            if v is None:
                v = it.get("salary_min")
            return (v is None, -(v or 0))

        items = sorted(items, key=_salary_key)

    return {"items": items, "count": len(items)}
|
||||
Reference in New Issue
Block a user