From 8b4b8d54d1f0dfbe8a7d1034b6d72dc21c08a46a Mon Sep 17 00:00:00 2001 From: bzego Date: Wed, 11 Mar 2026 15:27:10 +0300 Subject: [PATCH] Initial commit --- .gitignore | 3 + __init__.py | 2 + agent.py | 1184 ++++++++++++++++++ api.py | 77 ++ bundle_export.py | 267 ++++ cli.py | 282 +++++ db.py | 296 +++++ dedup/simhash.py | 41 + extract/clean.py | 39 + extract/doc_type.py | 134 ++ extract/experience.py | 159 +++ extract/experience_timeline.py | 144 +++ extract/llm.py | 585 +++++++++ extract/parse.py | 659 ++++++++++ extract/pdf_extract.py | 211 ++++ extract/sections.py | 70 ++ extract/templates/__init__.py | 1 + extract/templates/generic.py | 46 + extract/templates/hh.py | 58 + extract/templates/hh_ru.py | 85 ++ extract/templates/linkedin.py | 57 + extract/templates/one_page.py | 46 + extract/templates/one_page_en.py | 11 + extract/templates/one_page_ru.py | 11 + extract/templates/pptx_export.py | 45 + extract/text_extract.py | 99 ++ importers/file_scan.py | 21 + importers/telegram_html.py | 66 + importers/telegram_json.py | 73 ++ normalize.py | 174 +++ pdf_merge.py | 45 + pipeline.py | 1990 ++++++++++++++++++++++++++++++ search.py | 393 ++++++ util.py | 33 + 34 files changed, 7407 insertions(+) create mode 100644 .gitignore create mode 100644 __init__.py create mode 100644 agent.py create mode 100644 api.py create mode 100644 bundle_export.py create mode 100644 cli.py create mode 100644 db.py create mode 100644 dedup/simhash.py create mode 100644 extract/clean.py create mode 100644 extract/doc_type.py create mode 100644 extract/experience.py create mode 100644 extract/experience_timeline.py create mode 100644 extract/llm.py create mode 100644 extract/parse.py create mode 100644 extract/pdf_extract.py create mode 100644 extract/sections.py create mode 100644 extract/templates/__init__.py create mode 100644 extract/templates/generic.py create mode 100644 extract/templates/hh.py create mode 100644 extract/templates/hh_ru.py create mode 100644 extract/templates/linkedin.py create mode 100644 extract/templates/one_page.py create mode 100644 extract/templates/one_page_en.py create mode 100644 extract/templates/one_page_ru.py create mode 100644 extract/templates/pptx_export.py create mode 100644 extract/text_extract.py create mode 100644 importers/file_scan.py create mode 100644 importers/telegram_html.py create mode 100644 importers/telegram_json.py create mode 100644 normalize.py create mode 100644 pdf_merge.py create mode 100644 pipeline.py create mode 100644 search.py create mode 100644 util.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77ac754 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.venv/ +__pycache__/ +*.pyc diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..be35134 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +__all__ = [] +__version__ = "1.0.0" diff --git a/agent.py b/agent.py new file mode 100644 index 0000000..4bf9420 --- /dev/null +++ b/agent.py @@ -0,0 +1,1184 @@ +from __future__ import annotations + +import json +import re +import sqlite3 +from dataclasses import dataclass, asdict +from typing import Any, Dict, List, Optional, Set, Tuple + +try: + import httpx # type: ignore +except Exception: # pragma: no cover + httpx = None # type: ignore + +from tg_resume_db.search import search_with_filters +from tg_resume_db.extract.parse import ( + extract_remote, + extract_english, + extract_location_best_effort, + extract_roles_skills, + extract_salary, +) +from tg_resume_db.extract.clean import normalize_text +from 
tg_resume_db.extract.llm import resolve_llm_runtime
+from tg_resume_db.normalize import normalize_skill, find_skills_in_text
+
+
+# --------- Search plan (LLM outputs THIS, not SQL) ----------
+
+@dataclass
+class SearchPlan:
+    query_text: str = ""  # full-text query (FTS)
+    skills_any: Optional[List[str]] = None  # at least one must match
+    skills_all: Optional[List[str]] = None  # all must match
+    roles_any: Optional[List[str]] = None
+    location: Optional[str] = None
+    remote: Optional[bool] = None
+    english_min: Optional[str] = None  # e.g. A1..C2
+    exp_years_min: Optional[float] = None
+    salary_min: Optional[int] = None
+    salary_max: Optional[int] = None
+    limit: int = 20
+    sort: str = "rank"  # rank | exp_desc | salary_desc
+
+    def __post_init__(self):
+        self.skills_any = self.skills_any or []
+        self.skills_all = self.skills_all or []
+        self.roles_any = self.roles_any or []
+
+
+_ALLOWED_PLAN_KEYS = {
+    "query_text",
+    "skills_any",
+    "skills_all",
+    "roles_any",
+    "location",
+    "remote",
+    "english_min",
+    "exp_years_min",
+    "salary_min",
+    "salary_max",
+    "limit",
+    "sort",
+}
+
+# --------- Text helpers ----------
+
+_EN_ORDER = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
+
+
+def _norm_token(s: str) -> str:
+    s = (s or "").strip().lower()
+    s = re.sub(r"\s+", " ", s)
+    return s
+
+
+def _uniq_keep_order(xs: List[str]) -> List[str]:
+    seen = set()
+    out: List[str] = []
+    for x in (xs or []):
+        x = _norm_token(str(x))
+        if not x or x in seen:
+            continue
+        seen.add(x)
+        out.append(x)
+    return out
+
+
+def _filter_skills_vs_location(skills: List[str], location: Optional[str]) -> List[str]:
+    if not skills:
+        return []
+    bad = set()
+    if location:
+        bad.add(_norm_token(location))
+    for w in [
+        "москва", "санкт-петербург", "спб", "питер", "екатеринбург", "минск", "алматы",
+        "remote", "удаленно", "удалённо", "удаленка", "удалёнка", "гибрид", "hybrid",
+        "офис", "office", "onsite", "on-site",
+    ]:
+        bad.add(w)
+    return [s for s in skills if _norm_token(s) not in bad]
+
+
+# ---- Name-list detection (so a "list of full names" query is not over-constrained by filters) ----
+_NAME_RE = re.compile(r"\b[А-ЯЁA-Z][а-яёa-z]+(?:[-\s]+[А-ЯЁA-Z][а-яёa-z]+)+\b")
+
+
+def _looks_like_name_list(user_prompt: str) -> bool:
+    """
+    Heuristic: if the query contains several lines with full names, treat it as a
+    direct lookup by name and do not filter hard on stack/experience.
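+
+    Example (illustrative inputs, hypothetical names):
+        "Иван Петров\nАнна Сидорова\nОлег Иванов"  -> True   (3+ full-name matches)
+        "senior golang developer, remote"           -> False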
+ """ + if not user_prompt: + return False + matches = _NAME_RE.findall(user_prompt) + if len(matches) >= 3: + return True + + # lines with at least one full name + lines = [ln.strip() for ln in user_prompt.splitlines() if ln.strip()] + name_lines = sum(1 for ln in lines if _NAME_RE.search(ln)) + return name_lines >= 2 and len(matches) >= 2 + + +# ---- Work mode: hybrid must NOT force remote=true ---- + +_HYBRID_RE = re.compile(r"\b(гибрид|hybrid)\b", re.I) +_REMOTE_RE = re.compile(r"\b(remote|удал(ен|ён|енно|ённо)?|удаленк|удалёнк|дистанц)\b", re.I) +_OFFICE_RE = re.compile(r"\b(офис|office|on[-\s]?site|onsite|в офисе|на месте)\b", re.I) + + +def _apply_work_mode_overrides(user_prompt: str, plan: SearchPlan) -> None: + """ + Принудительно правим plan.remote по тексту запроса: + - "гибрид" => remote = None (не фильтруем) + - "офис/onsite" => remote = False + - "remote/удаленно" => remote = True + """ + t = (user_prompt or "").lower() + + if _HYBRID_RE.search(t): + plan.remote = None + return + if _OFFICE_RE.search(t): + plan.remote = False + return + if _REMOTE_RE.search(t): + plan.remote = True + return + + +def _simplify_query_text(user_prompt: str, skills_any: List[str]) -> str: + """ + FTS-поиск может ухудшаться, если query_text перегружен. + Если в запросе явно стек (3+ технологий) — оставим краткий search intent. + """ + up = (user_prompt or "").strip() + if len(skills_any) >= 3: + # максимально безопасно и универсально + if re.search(r"\bbackend\b", up, re.I) or "бэкенд" in up.lower(): + return "backend developer" + return "developer" + return up + + +# --------- sanitize helpers ---------- + +def _as_list(v: Any) -> List[str]: + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + if not s: + return [] + return [x.strip() for x in s.split(",") if x.strip()] + + +def _to_bool(v: Any) -> Optional[bool]: + if v is None: + return None + if isinstance(v, bool): + return v + s = str(v).strip().lower() + if s in ("true", "1", "yes", "y", "да", "д"): + return True + if s in ("false", "0", "no", "n", "нет", "н"): + return False + return None + + +def _to_int(v: Any) -> Optional[int]: + if v is None: + return None + try: + return int(float(v)) + except Exception: + return None + + +def _to_float(v: Any) -> Optional[float]: + if v is None: + return None + try: + return float(v) + except Exception: + return None + + +def _sanitize_plan_dict(obj: Any) -> Dict[str, Any]: + """ + Убираем лишние ключи (например, user_prompt) и приводим типы. + Лечит: SearchPlan.__init__() got an unexpected keyword argument ... 
+ """ + if not isinstance(obj, dict): + return {} + + clean: Dict[str, Any] = {} + for k, v in obj.items(): + if k not in _ALLOWED_PLAN_KEYS: + continue + clean[k] = v + + if "skills_any" in clean: + clean["skills_any"] = _as_list(clean["skills_any"]) + if "skills_all" in clean: + clean["skills_all"] = _as_list(clean["skills_all"]) + if "roles_any" in clean: + clean["roles_any"] = _as_list(clean["roles_any"]) + + if "remote" in clean: + clean["remote"] = _to_bool(clean["remote"]) + + if "salary_min" in clean: + clean["salary_min"] = _to_int(clean["salary_min"]) + if "salary_max" in clean: + clean["salary_max"] = _to_int(clean["salary_max"]) + + if "exp_years_min" in clean: + clean["exp_years_min"] = _to_float(clean["exp_years_min"]) + + if "limit" in clean: + lim = _to_int(clean["limit"]) + clean["limit"] = lim if lim is not None else 20 + + if "sort" in clean: + clean["sort"] = str(clean["sort"] or "").strip() + + if "location" in clean and clean["location"] is not None: + loc = str(clean["location"]).strip() + clean["location"] = loc if loc else None + + if "english_min" in clean and clean["english_min"] is not None: + eng = str(clean["english_min"]).strip().upper() + clean["english_min"] = eng if eng else None + + if "query_text" in clean and clean["query_text"] is not None: + clean["query_text"] = str(clean["query_text"]).strip() + + return clean + + +# --------- heuristic plan ---------- + +def _heuristic_plan(user_prompt: str) -> SearchPlan: + # Если запрос похож на список имён — ищем по тексту без лишних фильтров + if _looks_like_name_list(user_prompt): + return SearchPlan( + query_text=user_prompt.strip(), + skills_any=[], + skills_all=[], + roles_any=[], + location=None, + remote=None, + english_min=None, + exp_years_min=None, + salary_min=None, + salary_max=None, + limit=20, + sort="rank", + ) + + text = normalize_text(user_prompt) + + roles, skills = extract_roles_skills(text) + location = extract_location_best_effort(text) + remote = extract_remote(text) + english = extract_english(text) + sal_min, sal_max, sal_conf, _ = extract_salary(text) + + skills = _filter_skills_vs_location(skills, location) + roles = _uniq_keep_order(roles) + skills = _uniq_keep_order(skills) + + plan = SearchPlan( + query_text=_simplify_query_text(user_prompt, skills), + skills_any=skills[:12], + roles_any=(["backend"] if ("backend" in roles or "backend" in user_prompt.lower()) else roles[:6]), + location=location, + remote=remote, + english_min=english, + salary_min=sal_min if sal_conf and sal_conf >= 0.4 else None, + salary_max=sal_max if sal_conf and sal_conf >= 0.4 else None, + limit=20, + sort="rank", + ) + + _apply_work_mode_overrides(user_prompt, plan) + return plan + + +# --------- Optional LLM (OpenAI-compatible base_url) ---------- + +def _llm_enabled() -> bool: + if httpx is None: + return False + runtime = resolve_llm_runtime() + return bool(runtime.get("base_url")) and bool(runtime.get("model")) + + +def _llm_call_json(messages: List[Dict[str, str]]) -> Dict[str, Any]: + if httpx is None: + raise RuntimeError("httpx is not installed") + + runtime = resolve_llm_runtime() + base_url = runtime.get("base_url", "").rstrip("/") + model = runtime.get("model", "") + api_key = runtime.get("api_key", "") + if not base_url or not model: + raise RuntimeError("LLM runtime is not configured") + + payload = {"model": model, "messages": messages, "temperature": 0.2} + + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + with 
httpx.Client(timeout=30.0) as client: + r = client.post(f"{base_url}/chat/completions", headers=headers, json=payload) + r.raise_for_status() + data = r.json() + + content = data["choices"][0]["message"]["content"] + m = re.search(r"\{.*\}", content, flags=re.S) + if not m: + raise ValueError("LLM did not return JSON") + return json.loads(m.group(0)) + + +def _llm_build_plan(user_prompt: str, draft: SearchPlan) -> SearchPlan: + schema_hint = { + "query_text": "string", + "skills_any": ["string"], + "skills_all": ["string"], + "roles_any": ["string"], + "location": "string|null", + "remote": "bool|null", + "english_min": "A1|A2|B1|B2|C1|C2|null", + "exp_years_min": "number|null", + "salary_min": "int|null", + "salary_max": "int|null", + "limit": "int", + "sort": "rank|exp_desc|salary_desc", + } + + msgs = [ + { + "role": "system", + "content": ( + "Ты превращаешь запрос рекрутера в JSON-фильтры поиска по базе резюме.\n" + "НЕЛЬЗЯ писать SQL. Верни ТОЛЬКО JSON объекта SearchPlan.\n" + f"Schema: {json.dumps(schema_hint, ensure_ascii=False)}\n" + "ВАЖНО:\n" + "- Никаких лишних ключей - только поля Schema.\n" + "- Не добавляй в skills города/локации.\n" + "- 'гибрид' НЕ означает remote=true (если видишь 'гибрид' - remote=null).\n" + "- Старайся делать поиск широким: skills_all используй ТОЛЬКО если явно попросили обязательные навыки.\n" + "- Если в запросе есть указание уровня английского (например B2+), заполни english_min.\n" + "- Если явно указан опыт 'N+' лет - поставь exp_years_min=N.\n" + ), + }, + { + "role": "user", + "content": ( + f"Запрос: {user_prompt}\n\n" + f"Черновик (эвристика): {json.dumps(asdict(draft), ensure_ascii=False)}" + ), + }, + ] + + obj_raw = _llm_call_json(msgs) + obj = _sanitize_plan_dict(obj_raw) + + plan = SearchPlan(**{**asdict(draft), **obj}) + + plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location)) + plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location)) + plan.roles_any = _uniq_keep_order(plan.roles_any) + + # мягко улучшим query_text + plan.query_text = _simplify_query_text(user_prompt, plan.skills_any) + + plan.limit = max(5, min(int(plan.limit or 20), 50)) + if plan.sort not in ("rank", "exp_desc", "salary_desc"): + plan.sort = "rank" + + # fallback: если LLM обнулил важные поля - вернём эвристику + if not plan.skills_any: + plan.skills_any = draft.skills_any + if not plan.skills_all: + plan.skills_all = draft.skills_all + if plan.english_min is None and draft.english_min is not None: + plan.english_min = draft.english_min + if plan.exp_years_min is None: + try: + req_exp = _extract_required_exp_years(user_prompt) + if req_exp is not None: + plan.exp_years_min = req_exp + except Exception: + pass + + _apply_work_mode_overrides(user_prompt, plan) + + return plan + + +# --------- post processing: dedupe + "real fit" filter ---------- + +_CORE = {"java", "kotlin", "python", "go", "golang"} +_BONUS = {"c++", "cpp"} + +_LANG_VARIANTS = { + "java": {"java"}, + "kotlin": {"kotlin"}, + "python": {"python"}, + "go": {"go", "golang"}, + "c++": {"c++", "cpp", "c plus plus"}, + "c#": {"c#", "csharp"}, +} + +_SKILL_EVIDENCE_ALIASES = { + "go": {"go", "golang"}, + "golang": {"go", "golang"}, + "kubernetes": {"kubernetes", "k8s"}, + "postgresql": {"postgresql", "postgres", "postgre sql", "postgre-sql", "psql"}, + "javascript": {"javascript", "java script", "js"}, + "typescript": {"typescript", "type script", "ts"}, + "nodejs": {"nodejs", "node js", "node.js", "node"}, + "grpc": {"grpc", "g 
rpc"}, + "graphql": {"graphql", "graph ql"}, + "ci/cd": {"ci/cd", "ci cd", "cicd"}, + "c++": {"c++", "cpp", "c plus plus"}, + "c#": {"c#", "csharp", "c sharp"}, + "dotnet": {"dotnet", ".net"}, + "aws": {"aws", "amazon web services"}, + "gcp": {"gcp", "google cloud", "google cloud platform"}, + "redis": {"redis"}, + "kafka": {"kafka"}, + "docker": {"docker"}, +} + +_GENERIC_SKIP_SKILLS = { + "backend", + "frontend", + "fullstack", + "developer", + "engineer", + "senior", + "middle", + "junior", + "lead", +} + +_DOMAIN_VARIANTS = { + "fintech": { + "fintech", + "финтех", + "bank", + "banking", + "бан", + "payment", + "payments", + "card", + "cards", + "sber", + "тбанк", + "tinkoff", + "visa", + "mastercard", + "trading", + "exchange", + "crypto", + "крипт", + "биржа", + }, + "ecommerce": { + "ecommerce", + "e-commerce", + "marketplace", + "retail", + "checkout", + "cart", + "онлайн магазин", + }, + "gamedev": {"gamedev", "game dev", "gaming", "unity", "unreal", "игр"}, + "healthcare": {"healthcare", "medtech", "hospital", "clinic", "мед", "health tech"}, +} + + +def _token_in_text(text: str, token: str) -> bool: + if not text or not token: + return False + pat = r"(? bool: + aliases = _LANG_VARIANTS.get(canon_lang, {canon_lang}) + for tok in aliases: + if _token_in_text(text, tok): + return True + return False + + +def _skill_aliases(skill: str) -> List[str]: + canon = normalize_skill(skill) or _norm_token(skill) + if not canon: + return [] + + aliases = set() + aliases.add(canon) + aliases.add(_norm_token(skill)) + aliases.update(_SKILL_EVIDENCE_ALIASES.get(canon, set())) + if canon in _LANG_VARIANTS: + aliases.update(_LANG_VARIANTS.get(canon, set())) + + out: List[str] = [] + for a in aliases: + t = _norm_token(a) + if not t: + continue + out.append(t) + return _uniq_keep_order(out) + + +def _extract_required_skills(user_prompt: str, plan: Optional[SearchPlan], req_langs: Set[str]) -> List[str]: + raw: List[str] = [] + if plan: + raw.extend(plan.skills_all or []) + raw.extend(plan.skills_any or []) + raw.extend(find_skills_in_text(user_prompt or "")) + raw.extend(list(req_langs or set())) + + out: List[str] = [] + seen = set() + for s in raw: + canon = normalize_skill(s) or _norm_token(s) + if not canon: + continue + canon = _norm_token(canon) + if canon in _GENERIC_SKIP_SKILLS: + continue + if canon in seen: + continue + seen.add(canon) + out.append(canon) + return out[:10] + + +def _query_stack_is_strict(user_prompt: str) -> bool: + t = (user_prompt or "").lower() + if any(w in t for w in ("обязательно", "строго", "must", "required", "mandatory", "без этого")): + return True + if "," in t and " или " not in t and " or " not in t: + return True + return False + + +def _extract_required_domains(user_prompt: str) -> List[str]: + t = (user_prompt or "").lower() + out: List[str] = [] + for canon, variants in _DOMAIN_VARIANTS.items(): + if any(v in t for v in variants): + out.append(canon) + return out + + +def _domain_hit(text: str, domain: str) -> bool: + variants = _DOMAIN_VARIANTS.get(domain, set()) + txt = (text or "").lower() + return any(v in txt for v in variants) + + +def _load_resume_contexts( + con: sqlite3.Connection, + items: List[Dict[str, Any]], +) -> Dict[str, Dict[str, str]]: + resume_ids = [] + seen = set() + for it in items or []: + rid = str(it.get("resume_id") or "").strip() + if not rid or rid in seen: + continue + seen.add(rid) + resume_ids.append(rid) + + if not resume_ids: + return {} + + ph = ",".join("?" 
for _ in resume_ids) + sql = ( + f"SELECT resume_id, clean_text, sections_json, extraction_json " + f"FROM resumes WHERE resume_id IN ({ph})" + ) + try: + rows = con.execute(sql, resume_ids).fetchall() + except Exception: + return {} + + out: Dict[str, Dict[str, str]] = {} + for r in rows: + rid = str(r["resume_id"]) + clean = str(r["clean_text"] or "") + + sections: Dict[str, Any] = {} + try: + raw = json.loads(r["sections_json"] or "{}") + if isinstance(raw, dict): + sections = raw + except Exception: + sections = {} + + extraction: Dict[str, Any] = {} + try: + raw = json.loads(r["extraction_json"] or "{}") + if isinstance(raw, dict): + extraction = raw + except Exception: + extraction = {} + + skills_text = str(sections.get("skills") or "") + body_parts: List[str] = [] + for key in ("about", "summary", "experience", "projects", "work"): + val = sections.get(key) + if val: + body_parts.append(str(val)) + + for p in extraction.get("positions") or []: + if not isinstance(p, dict): + continue + body_parts.append(str(p.get("title") or "")) + body_parts.append(str(p.get("company") or "")) + body_parts.append(str(p.get("description") or "")) + + body_text = "\n".join(body_parts).strip() + + # fallback for badly split templates + if len(body_text) < 80: + body_text = clean + if skills_text: + body_text = body_text.replace(skills_text, " ") + + out[rid] = { + "skills_text": skills_text.lower(), + "body_text": body_text.lower(), + "clean_text": clean.lower(), + } + + return out + + +def _normalize_lang_token(token: str) -> Optional[str]: + t = _norm_token(token) + if not t: + return None + for canon, aliases in _LANG_VARIANTS.items(): + if t == canon or t in aliases: + return canon + return None + + +def _extract_required_languages(user_prompt: str) -> List[str]: + t = (user_prompt or "").lower() + hits: List[str] = [] + for canon, aliases in _LANG_VARIANTS.items(): + if any(_token_in_text(t, alias) for alias in aliases): + if canon not in hits: + hits.append(canon) + return hits + + +def _dedupe_by_candidate_best_rank(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + best: Dict[str, Dict[str, Any]] = {} + for it in items or []: + cid = it.get("candidate_id") or "" + if not cid: + continue + if cid not in best: + best[cid] = it + continue + # rank: у тебя чем меньше (более отрицательный), тем "выше" + r_new = it.get("rank") + r_old = best[cid].get("rank") + try: + if r_old is None or (r_new is not None and float(r_new) < float(r_old)): + best[cid] = it + except Exception: + pass + return list(best.values()) + + +def _needs_postfilter(user_prompt: str) -> bool: + """ + Включаем строгий "вакансионный" фильтр, если запрос похож на вакансию: + - "опыт от N лет" или "5+" + - явный стек из языков + """ + if _looks_like_name_list(user_prompt): + return False + + t = (user_prompt or "").lower() + if re.search(r"(опыт|experience).{0,20}(\d+)\s*\+|\b(\d+)\s*\+\s*лет", t): + return True + skill_hits = len(find_skills_in_text(t)) + if skill_hits >= 2: + return True + if _extract_required_domains(user_prompt) and skill_hits >= 1: + return True + # stack words fallback + hits = 0 + for w in ("java", "kotlin", "python", "go", "golang", "c++", "cpp"): + if w in t: + hits += 1 + return hits >= 2 + + +_EXCLUDE_LOC_MARKERS = { + "россия", + "russia", + "rf", + "russian federation", + "moscow", + "москва", + "москв", + "spb", + "petersburg", + "петербург", + "санкт", + "мск", + "belarus", + "беларусь", + "белоруссия", + "iran", + "ирак", + "iraq", + "пакистан", + "pakistan", + "india", + "индия", + "африк", +} 
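+
+# Illustrative behaviour of _location_exclusion_requested() below
+# (hypothetical queries, not real data):
+#   "backend, except russia"  -> True   (marker "russia" + negation word "except")
+#   "moscow office"           -> False  (marker present, but no negation word)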
+
+
+def _location_exclusion_requested(user_prompt: str) -> bool:
+    t = (user_prompt or "").lower()
+    return any(k in t for k in _EXCLUDE_LOC_MARKERS) and ("кроме" in t or "except" in t or "не " in t)
+
+
+def _extract_required_exp_years(user_prompt: str) -> Optional[float]:
+    t = (user_prompt or "").lower()
+    m = re.search(r"(опыт|experience).{0,20}(\d+(?:[.,]\d+)?)\s*(?:лет|years?)", t)
+    if m:
+        try:
+            return float(m.group(2).replace(",", "."))
+        except Exception:
+            return None
+    m = re.search(r"\b(\d+(?:[.,]\d+)?)\s*\+\s*(?:лет|years?)\b", t)
+    if m:
+        try:
+            return float(m.group(1).replace(",", "."))
+        except Exception:
+            return None
+    return None
+
+
+def _extract_required_english(user_prompt: str) -> Optional[str]:
+    t = (user_prompt or "").upper()
+    m = re.search(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", t)
+    if m:
+        return m.group(1).replace("+", "")
+    # textual
+    if "FLUENT" in t or "ADVANCED" in t or "PROFICIENT" in t:
+        return "C1"
+    if "UPPER" in t and "INTERMEDIATE" in t:
+        return "B2"
+    if "INTERMEDIATE" in t:
+        return "B1"
+    return None
+
+
+def _jobfit_filter_items(
+    con: sqlite3.Connection,
+    user_prompt: str,
+    items: List[Dict[str, Any]],
+    plan: Optional[SearchPlan] = None,
+) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
+    """
+    Vacancy-grade filtering:
+    - exp >= required (when specified)
+    - the stack matches (at least 1 language from the query/Core)
+    - a mandatory primary Go (for queries that ask for Go)
+    - English >= the required level
+    - backend is not mandatory, but it is taken into account when sorting
+    """
+    req_exp = _extract_required_exp_years(user_prompt)  # e.g. 5.0
+    req_langs = set(_extract_required_languages(user_prompt))
+    req_english = _extract_required_english(user_prompt)
+    req_skills = _extract_required_skills(user_prompt, plan, req_langs)
+    req_domains = _extract_required_domains(user_prompt)
+    strict_stack = _query_stack_is_strict(user_prompt) or (req_exp is not None)
+
+    must_have_skills = _uniq_keep_order([normalize_skill(s) or s for s in ((plan.skills_all or []) if plan else [])])
+    if not must_have_skills and strict_stack and req_skills:
+        # For short vacancy stacks, treat every listed skill as mandatory.
+        if len(req_skills) <= 4:
+            must_have_skills = req_skills
+        else:
+            must_have_skills = req_skills[:4]
+
+    filtered: List[Dict[str, Any]] = []
+    dropped = 0
+    reasons: Dict[str, int] = {}
+
+    exclude_ru = _location_exclusion_requested(user_prompt)
+    # If the query explicitly asks for Go and gives no year count, default to a 4-year minimum.
+    if req_exp is None and ("go" in req_langs or "golang" in req_langs):
+        req_exp = 4.0
+
+    resume_ctx = _load_resume_contexts(con, items)
+
+    for it in items or []:
+        roles = set((it.get("roles") or []))
+        skills = set(_norm_token(s) for s in (it.get("skills") or []))
+        for pl in it.get("primary_languages") or []:
+            skills.add(_norm_token(pl))
+
+        # Check engineering experience first:
+        # if 'experience_years_eng' is available (not None), use it;
+        # otherwise fall back to 'experience_years'.
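+        # Worked example (hypothetical values): exp_eng=5.5, exp_total=8.0,
+        # req_exp=6.0 -> exp_val=5.5 -> dropped with reason "exp_lt_required".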
+        exp_eng = it.get("experience_years_eng")
+        exp_total = it.get("experience_years")
+
+        # Prefer engineering years for filtering if available
+        exp_val = None
+        if exp_eng is not None:
+            try:
+                exp_val = float(exp_eng)
+            except (TypeError, ValueError):
+                pass
+
+        if exp_val is None and exp_total is not None:
+            try:
+                exp_val = float(exp_total)
+            except (TypeError, ValueError):
+                pass
+
+        if req_exp is not None and (exp_val is None or exp_val < req_exp):
+            dropped += 1
+            reasons["exp_lt_required"] = reasons.get("exp_lt_required", 0) + 1
+            continue
+
+        backend_focus_flag = it.get("backend_focus")
+
+        loc = (it.get("location") or "").lower()
+        if exclude_ru and any(bad in loc for bad in _EXCLUDE_LOC_MARKERS):
+            dropped += 1
+            reasons["location_excluded"] = reasons.get("location_excluded", 0) + 1
+            continue
+
+        lang_tokens: Set[str] = set()
+        for lang in (it.get("primary_languages") or []):
+            norm = _normalize_lang_token(lang)
+            if norm:
+                lang_tokens.add(norm)
+        if not lang_tokens:
+            for sk in skills:
+                norm = _normalize_lang_token(sk)
+                if norm:
+                    lang_tokens.add(norm)
+
+        # For the language stack, keep only the basic check.
+        missing_primary_lang = False
+        for req_lang in req_langs:
+            if req_lang not in lang_tokens and req_lang in ("go", "python", "java", "kotlin", "c++", "c#"):
+                missing_primary_lang = True
+                break
+        if missing_primary_lang:
+            dropped += 1
+            reasons["no_primary_required_lang"] = reasons.get("no_primary_required_lang", 0) + 1
+            continue
+
+        rid = str(it.get("resume_id") or "")
+        ctx = resume_ctx.get(rid) or {}
+        ctx_body = str(ctx.get("body_text") or "")
+        ctx_skills = str(ctx.get("skills_text") or "")
+        ctx_clean = str(ctx.get("clean_text") or "")
+        ctx_domain = "\n".join([ctx_body, ctx_clean, str(it.get("snippet") or "").lower()])
+
+        # Evidence-based skill validation (not just Go):
+        # must-have skills must not appear only in the "skills" section.
+        skill_hits_total = 0
+        skill_hits_body = 0
+        missing_must = 0
+        skills_only_must = 0
+        skills_only_critical = 0
+        for req_skill in req_skills:
+            aliases = _skill_aliases(req_skill)
+            if not aliases:
+                continue
+            hit_body = any(_token_in_text(ctx_body, a) for a in aliases)
+            hit_skills = any(_token_in_text(ctx_skills, a) for a in aliases)
+            hit_any = hit_body or hit_skills or any(_norm_token(req_skill) == _norm_token(s) for s in skills)
+            if hit_any:
+                skill_hits_total += 1
+                if hit_body:
+                    skill_hits_body += 1
+
+            if req_skill in must_have_skills:
+                if not hit_any:
+                    missing_must += 1
+                elif not hit_body and hit_skills:
+                    skills_only_must += 1
+                    if _normalize_lang_token(req_skill) is not None:
+                        skills_only_critical += 1
+
+        if missing_must > 0:
+            dropped += 1
+            reasons["required_skill_missing"] = reasons.get("required_skill_missing", 0) + 1
+            continue
+
+        # Cut strictly when a key language requirement is found only in the skill list,
+        # or when the whole must-have stack is not confirmed by experience.
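+        # e.g. (hypothetical, under strict_stack): required "go" found only in the
+        # skills section, never in experience text -> skills_only_critical=1 -> dropped.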
+        if strict_stack and (skills_only_critical > 0 or (must_have_skills and skills_only_must >= len(must_have_skills))):
+            dropped += 1
+            reasons["required_skill_only_in_skills"] = reasons.get("required_skill_only_in_skills", 0) + 1
+            continue
+
+        if req_skills and strict_stack:
+            min_hits = len(must_have_skills) if must_have_skills else (2 if len(req_skills) >= 2 else 1)
+            if skill_hits_total < min_hits:
+                dropped += 1
+                reasons["required_skills_weak"] = reasons.get("required_skills_weak", 0) + 1
+                continue
+
+        domain_hits = 0
+        for d in req_domains:
+            if _domain_hit(ctx_domain, d):
+                domain_hits += 1
+        if req_domains and strict_stack and domain_hits < len(req_domains):
+            dropped += 1
+            reasons["domain_mismatch"] = reasons.get("domain_mismatch", 0) + 1
+            continue
+
+        if req_langs:
+            lang_hits_req = len(lang_tokens & req_langs)
+            if lang_hits_req < 1:
+                dropped += 1
+                reasons["lang_stack_weak"] = reasons.get("lang_stack_weak", 0) + 1
+                continue
+        else:
+            lang_hits_req = None
+
+        core_hits = len(lang_tokens & _CORE)
+        bonus_hits = len(lang_tokens & _BONUS)
+
+        # Require at least one language from CORE/bonus
+        if core_hits + bonus_hits < 1:
+            dropped += 1
+            reasons["stack_too_weak"] = reasons.get("stack_too_weak", 0) + 1
+            continue
+
+        it2 = dict(it)
+        it2["_fit"] = {
+            "core_hits": core_hits,
+            "bonus_cpp": bool(bonus_hits),
+            "req_lang_hits": lang_hits_req,
+            "req_skill_hits": skill_hits_total,
+            "req_skill_hits_body": skill_hits_body,
+            "req_domain_hits": domain_hits,
+            "backend_role": "backend" in roles,
+            "backend_focus": backend_focus_flag,
+        }
+        if req_english:
+            lvl = str(it.get("english_level") or "").upper()
+            if not lvl or _EN_ORDER.get(lvl, 0) < _EN_ORDER.get(req_english, 0):
+                dropped += 1
+                reasons["english_below_required"] = reasons.get("english_below_required", 0) + 1
+                continue
+
+        filtered.append(it2)
+
+    # sort: more core_hits first, then rank
+    def key(x: Dict[str, Any]):
+        fit = x.get("_fit") or {}
+        core_hits = int(fit.get("core_hits", 0))
+        bonus = 1 if fit.get("bonus_cpp") else 0
+        backend_bonus = 1 if fit.get("backend_role") or fit.get("backend_focus") else 0
+        req_skill_hits = int(fit.get("req_skill_hits", 0))
+        req_skill_hits_body = int(fit.get("req_skill_hits_body", 0))
+        req_domain_hits = int(fit.get("req_domain_hits", 0))
+        r = x.get("rank")
+        try:
+            r = float(r)
+        except Exception:
+            r = 0.0
+        # manual scoring on domain signals
+        score = 0.0
+        if "go" in (x.get("primary_languages") or []):
+            score += 5.0  # Go as the primary language
+        try:
+            if x.get("experience_years_eng") and float(x.get("experience_years_eng")) >= max(4.0, req_exp or 0):
+                score += 3.0
+        except Exception:
+            pass
+        skills = set(_norm_token(s) for s in (x.get("skills") or []))
+        text_boost = 0.0
+        for kw in ("kubernetes", "k8s"):
+            if kw in skills:
+                text_boost += 1.5
+                break
+        for kw in ("ddd", "domain-driven design", "eda", "event-driven"):
+            if kw in skills:
+                text_boost += 2.0
+                break
+        for kw in ("fintech", "trading", "crypto", "exchange", "биржа", "финтех"):
+            if kw in skills:
+                text_boost += 2.5
+                break
+        snippet = (x.get("snippet") or "").lower()
+        for kw in ("highload", "high-load", "high throughput", "high-throughput", "low latency", "low-latency"):
+            if kw in snippet:
+                text_boost += 1.5
+                break
+        score += text_boost
+        return (-req_domain_hits, -req_skill_hits_body, -req_skill_hits, -core_hits, -backend_bonus, -bonus, -score, r)
+
+    filtered.sort(key=key)
+
+    dbg = {
+        "postfilter_applied": True,
+        "required_exp": req_exp,
+        "required_languages": sorted(list(req_langs)),
+        "required_skills":
req_skills, + "must_have_skills": must_have_skills, + "required_domains": req_domains, + "strict_stack": strict_stack, + "dropped": dropped, + "reasons": reasons, + } + return filtered, dbg + + +# --------- Refinement loop ---------- + +def _refine_plan_no_llm(plan: SearchPlan, result_count: int, user_prompt: str) -> SearchPlan: + p = SearchPlan(**asdict(plan)) + + if result_count == 0: + p.location = None + p.salary_min = None + p.salary_max = None + p.english_min = None + + # если было строго по remote — ослабим; потом override применим обратно + p.remote = None + + # опыт уменьшаем плавно + if p.exp_years_min is not None: + p.exp_years_min = max(0.0, float(p.exp_years_min) - 1.0) + + if not (p.query_text or "").strip(): + p.query_text = " ".join(p.skills_any[:8]) + + _apply_work_mode_overrides(user_prompt, p) + return p + + return p + + +def agent_search( + con: sqlite3.Connection, + user_prompt: str, + max_iters: int = 2, + limit: int = 20, +) -> Dict[str, Any]: + draft = _heuristic_plan(user_prompt) + draft.limit = limit + + names_only_query = _looks_like_name_list(user_prompt) + plan = _llm_build_plan(user_prompt, draft) if (_llm_enabled() and not names_only_query) else draft + plan.limit = limit + + history: List[Dict[str, Any]] = [] + final_items: List[Dict[str, Any]] = [] + final_count = 0 + + for i in range(max_iters + 1): + _apply_work_mode_overrides(user_prompt, plan) + + res = search_with_filters(con, plan) + items = res.get("items", []) + count = int(res.get("count", len(items))) + + history.append( + { + "plan": asdict(plan), + "count": count, + "top_snippets": [it.get("snippet", "")[:180] for it in items[:5]], + } + ) + + if count > 0 or i == max_iters: + final_items = items + final_count = count + break + + # refine + if _llm_enabled(): + msgs = [ + { + "role": "system", + "content": ( + "Ты корректируешь JSON SearchPlan. Верни ТОЛЬКО JSON с полями SearchPlan.\n" + "Если 0 результатов — ослабь фильтры: remote=null, exp_years_min уменьшить/обнулить, " + "location/salary/english убрать. skills_any сохранить.\n" + "Никаких лишних ключей. 
Помни: 'гибрид' НЕ означает remote=true.\n" + ), + }, + { + "role": "user", + "content": json.dumps( + { + "query": user_prompt, + "previous_plan": asdict(plan), + "result_count": count, + }, + ensure_ascii=False, + ), + }, + ] + + obj_raw = _llm_call_json(msgs) + obj = _sanitize_plan_dict(obj_raw) + + plan = SearchPlan(**{**asdict(plan), **obj}) + + plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location)) + plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location)) + plan.roles_any = _uniq_keep_order(plan.roles_any) + + plan.query_text = _simplify_query_text(user_prompt, plan.skills_any) + plan.limit = limit + if plan.sort not in ("rank", "exp_desc", "salary_desc"): + plan.sort = "rank" + + _apply_work_mode_overrides(user_prompt, plan) + else: + plan = _refine_plan_no_llm(plan, count, user_prompt) + plan.limit = limit + + # ---- 1) dedupe ---- + deduped = _dedupe_by_candidate_best_rank(final_items) + + # ---- 2) postfilter for vacancy-like queries ---- + post_dbg: Dict[str, Any] = {"postfilter_applied": False} + if _needs_postfilter(user_prompt): + filtered, post_dbg = _jobfit_filter_items(con, user_prompt, deduped, plan=plan) + else: + filtered = deduped + + return { + "plan": asdict(plan), + "items": filtered, + "count": len(filtered), + "history": history, + "llm_used": _llm_enabled(), + "postfilter": post_dbg, + } diff --git a/api.py b/api.py new file mode 100644 index 0000000..7e74a77 --- /dev/null +++ b/api.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import os +from typing import Any, Dict, Optional + +from fastapi import FastAPI +from pydantic import BaseModel, Field + +from tg_resume_db.db import connect, init_db +from tg_resume_db.agent import agent_search +from tg_resume_db.search import search as db_search + +DB_PATH = os.environ.get("CANDIDATES_DB", "./candidates.db") + +app = FastAPI(title="Resume Search API", version="1.0") + +class SearchRequest(BaseModel): + query: str = Field(default="") + limit: int = Field(default=20, ge=1, le=100) + offset: int = Field(default=0, ge=0) + remote: Optional[bool] = None + location: Optional[str] = None + experience_min: Optional[float] = None + salary_min: Optional[int] = None + salary_max: Optional[int] = None + english: Optional[str] = None + role: Optional[str] = None + skill: Optional[str] = None + + +class AISearchRequest(BaseModel): + prompt: str = Field(default="") + limit: int = Field(default=20, ge=1, le=100) + ai_iters: int = Field(default=2, ge=0, le=5) + + +@app.on_event("startup") +def _startup(): + con = connect(DB_PATH) + init_db(con) + con.close() + +@app.get("/health") +def health(): + return {"ok": True} + +@app.post("/search") +def search(req: SearchRequest) -> Dict[str, Any]: + con = connect(DB_PATH) + try: + items = db_search(con, query=req.query, filters=req.model_dump(), limit=req.limit, offset=req.offset) + return {"items": items, "count": len(items)} + finally: + con.close() + + +@app.post("/search/ai") +def search_ai(req: AISearchRequest) -> Dict[str, Any]: + con = connect(DB_PATH) + try: + res = agent_search( + con, + user_prompt=req.prompt, + max_iters=req.ai_iters, + limit=req.limit, + ) + return { + "ai": True, + "llm_used": res.get("llm_used", False), + "plan": res.get("plan"), + "history": res.get("history"), + "postfilter": res.get("postfilter"), + "items": res.get("items", []), + "count": int(res.get("count", 0)), + } + finally: + con.close() diff --git a/bundle_export.py b/bundle_export.py new file mode 100644 index 
0000000..b06b41d --- /dev/null +++ b/bundle_export.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +import json +import os +import re +import shutil +import sqlite3 +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +# NEW: PDF merge helper (pypdf) +# pip install pypdf +try: + from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir +except Exception: # чтобы не ломать bundle, если pypdf/модуль не поставили + merge_all_pdfs_in_dir = None + + +def _slug(s: str, max_len: int = 60) -> str: + s = (s or "").strip() + if not s: + return "candidate" + s = re.sub(r"\s+", " ", s) + s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s) + s = s.replace(" ", "_") + s = re.sub(r"_+", "_", s).strip("_") + if not s: + return "candidate" + return s[:max_len] + + +def _safe_mkdir(p: Path) -> None: + p.mkdir(parents=True, exist_ok=True) + + +def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]: + """ + Возвращает список самых приоритетных путей к файлу резюме. + 1) resumes.file_path + 2) sources.original_file_path + 3) некоторые варианты путей из sources.extra_json + """ + paths: List[str] = [] + + row = con.execute( + "SELECT file_path FROM resumes WHERE resume_id=?", + (resume_id,), + ).fetchone() + if row and row["file_path"]: + paths.append(str(row["file_path"])) + + cur = con.execute( + """SELECT original_file_path, original_file_name, extra_json + FROM sources + WHERE resume_id=?""", + (resume_id,), + ) + for r in cur.fetchall(): + ofp = r["original_file_path"] + if ofp: + paths.append(str(ofp)) + + try: + extra = json.loads(r["extra_json"] or "{}") + if isinstance(extra, dict): + for k in ("file_path", "path", "local_path", "source_path"): + if extra.get(k): + paths.append(str(extra[k])) + except Exception: + pass + + # дедуп + seen = set() + out: List[str] = [] + for p in paths: + p2 = os.path.normpath(p) + if p2 in seen: + continue + seen.add(p2) + out.append(p2) + return out + + +def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path: + ext = src.suffix.lower() if src.suffix else "" + candidate = f"{base_name}{ext}" + dst = dst_dir / candidate + + if dst.exists(): + i = 2 + while True: + dst = dst_dir / f"{base_name}({i}){ext}" + if not dst.exists(): + break + i += 1 + + shutil.copy2(src, dst) + return dst + + +def bundle_search_results( + con: sqlite3.Connection, + results: Iterable[Dict[str, Any]], + out_dir: str, + *, + copy_files: bool = True, + merge_text: bool = True, + merge_pdf: bool = True, # NEW +) -> Dict[str, Any]: + """ + results: iterable dictов где есть минимум: + - resume_id + - candidate_id + - name (желательно) + + Создаёт: + - files/: скопированные исходные файлы резюме + - merged_resumes.txt: склейка текста clean_text из БД (если merge_text) + - pdf/merged.pdf: склейка всех PDF из files/ (если merge_pdf и pypdf установлен) + - manifest.json + - README.txt + """ + out_root = Path(out_dir).resolve() + files_dir = out_root / "files" + _safe_mkdir(files_dir) + + manifest: List[Dict[str, Any]] = [] + copied = 0 + missing = 0 + + merged_parts: List[str] = [] + merged_txt_path = out_root / "merged_resumes.txt" + + for item in results: + resume_id = item.get("resume_id") + cand_id = item.get("candidate_id") + name = item.get("name") or "" + if not resume_id or not cand_id: + continue + + # merged TXT из БД + if merge_text: + row = con.execute( + "SELECT clean_text FROM resumes WHERE resume_id=?", + (resume_id,), + ).fetchone() + clean_text = (row["clean_text"] if row else "") or "" + header = f"===== {name or 
cand_id} | {cand_id} | {resume_id} =====" + merged_parts.append(header) + merged_parts.append(clean_text.strip()) + merged_parts.append("") + + if not copy_files: + continue + + src_paths = _pick_source_paths(con, resume_id) + + src_found: Optional[Path] = None + for sp in src_paths: + p = Path(sp) + if p.exists() and p.is_file(): + src_found = p + break + + if not src_found: + missing += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": False, + "reason": "source_file_not_found", + "tried_paths": src_paths, + } + ) + continue + + base = f"{_slug(name) or _slug(cand_id)}__{resume_id}" + try: + dst = _copy_unique(src_found, files_dir, base) + copied += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": True, + "source_path": str(src_found), + "dest_path": str(dst), + } + ) + except Exception as e: + missing += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": False, + "reason": f"copy_failed: {repr(e)}", + "source_path": str(src_found), + } + ) + + # merged TXT + if merge_text: + merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore") + + # NEW: merged PDF from files/*.pdf + merged_pdf_path: Optional[Path] = None + pdf_info: Optional[Dict[str, Any]] = None + if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None: + try: + merged_pdf_path = out_root / "pdf" / "merged.pdf" + _safe_mkdir(merged_pdf_path.parent) + pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path) + except Exception as e: + pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"} + + # manifest.json + (out_root / "manifest.json").write_text( + json.dumps( + { + "out_dir": str(out_root), + "copied_files": copied, + "missing_files": missing, + "merged_text": str(merged_txt_path) if merge_text else None, + "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None, + "pdf_info": pdf_info, + "items": manifest, + }, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + errors="ignore", + ) + + # README + readme_lines = [ + "Папка создана командой search.", + "- files/: скопированные исходные файлы резюме", + "- merged_resumes.txt: склейка текста clean_text из БД", + "- manifest.json: что откуда скопировалось / что не найдено", + ] + if merge_pdf: + if merge_all_pdfs_in_dir is None: + readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)") + else: + readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)") + + (out_root / "README.txt").write_text( + "\n".join(readme_lines) + "\n", + encoding="utf-8", + errors="ignore", + ) + + return { + "out_dir": str(out_root), + "copied_files": copied, + "missing_files": missing, + "merged_text": str(merged_txt_path) if merge_text else None, + "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None, + "manifest": str(out_root / "manifest.json"), + "pdf_info": pdf_info, + } diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..b798673 --- /dev/null +++ b/cli.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +import argparse +import json +import sys +from datetime import datetime +from typing import Any, Dict +from pathlib import Path + +import os + +from tg_resume_db.bundle_export import bundle_search_results +from tg_resume_db.db import connect, init_db +from tg_resume_db.pipeline import import_exports as run_import +from tg_resume_db.search import search as run_search +from 
tg_resume_db.util import Logger +from tg_resume_db.extract.text_extract import extract_text as extract_text_generic +from tg_resume_db.extract.pdf_extract import extract_pdf_best +from tg_resume_db.extract.clean import normalize_text +from tg_resume_db.extract.doc_type import detect_doc_type +from tg_resume_db.extract.sections import split_sections, sections_present +from tg_resume_db.extract.parse import extract_name_guess + + +def _print_json(obj: Dict[str, Any]) -> None: + s = json.dumps(obj, ensure_ascii=False, indent=2) + try: + print(s) + except UnicodeEncodeError: + # Fallback for cp1251/legacy consoles. + print(s.encode("ascii", "backslashreplace").decode("ascii")) + + +def _is_interactive() -> bool: + return sys.stdin.isatty() and sys.stdout.isatty() + + +def main() -> None: + ap = argparse.ArgumentParser(prog="tg_resume_db") + sub = ap.add_subparsers(dest="cmd", required=True) + + # ---------------- import_exports ---------------- + imp = sub.add_parser("import_exports", help="Import Telegram exports recursively (incremental)") + imp.add_argument("--input", required=True, help="Path to exports directory") + imp.add_argument("--db", required=True, help="SQLite db path") + imp.add_argument("--log", default="./import.log", help="Log file path") + imp.add_argument("--near-dist", type=int, default=6, help="Simhash max Hamming distance for near-duplicates") + imp.add_argument("--min-text-len", type=int, default=250, help="Skip very short texts") + imp.add_argument( + "--llm", + choices=["auto", "off", "force"], + default="auto", + help="LLM enrichment mode: auto (default), off to disable, force to always run when configured", + ) + imp.add_argument( + "--llm-review", + choices=["always", "auto", "off"], + default="always", + help="LLM review mode for parsed JSON: always (default), auto, off", + ) + imp.add_argument( + "--llm-review-rounds", + type=int, + default=1, + help="How many LLM review merge rounds to run per resume (1..3)", + ) + + # ---------------- search ---------------- + s = sub.add_parser("search", help="Search candidates") + s.add_argument("--db", required=True) + s.add_argument("--query", required=True) + s.add_argument("--limit", type=int, default=20) + s.add_argument("--offset", type=int, default=0) + s.add_argument("--remote", choices=["true", "false"], default=None) + s.add_argument("--location", default=None) + s.add_argument("--experience-min", type=float, default=None) + s.add_argument("--salary-min", type=int, default=None) + s.add_argument("--salary-max", type=int, default=None) + s.add_argument("--english", default=None) + s.add_argument("--doc-type", default=None) + + # AI mode + s.add_argument("--ai", action="store_true", help="Use LLM to build filters from text query and run search") + s.add_argument("--ai-iters", type=int, default=2, help="How many refine iterations for AI search") + + # Backward compatible single-value filters + s.add_argument("--role", default=None, help="Single role (backward compatible); prefer --roles-any") + s.add_argument("--skill", default=None, help="Single skill (backward compatible); prefer --skills-any/--skills-all") + + # Stack filters (comma-separated) + s.add_argument("--roles-any", default=None, help="Comma-separated roles; at least one must match") + s.add_argument("--skills-any", default=None, help="Comma-separated skills; at least one must match") + s.add_argument("--skills-all", default=None, help="Comma-separated skills; all must match") + + # Bundle export behavior + s.add_argument("--bundle", choices=["ask", "yes", 
"no"], default="ask", help="Bundle found resumes into a folder") + + # ---------------- inspect ---------------- + ins = sub.add_parser("inspect", help="Inspect a single resume file (doc_type/sections)") + ins.add_argument("--file", required=True, help="Path to resume file") + + args = ap.parse_args() + + # ========================= import_exports ========================= + if args.cmd == "import_exports": + con = connect(args.db) + try: + init_db(con) + log = Logger(args.log) + + prev_enabled = os.environ.get("LLM_PARSE_ENABLED") + prev_force = os.environ.get("LLM_PARSE_FORCE") + prev_review_mode = os.environ.get("LLM_PARSE_REVIEW_MODE") + prev_review_rounds = os.environ.get("LLM_PARSE_REVIEW_ROUNDS") + try: + if args.llm == "off": + os.environ["LLM_PARSE_ENABLED"] = "0" + os.environ["LLM_PARSE_REVIEW_MODE"] = "off" + elif args.llm == "force": + os.environ["LLM_PARSE_ENABLED"] = "1" + os.environ["LLM_PARSE_FORCE"] = "1" + os.environ["LLM_PARSE_REVIEW_MODE"] = "always" + else: + os.environ["LLM_PARSE_REVIEW_MODE"] = args.llm_review + + rounds = max(1, min(int(args.llm_review_rounds), 3)) + os.environ["LLM_PARSE_REVIEW_ROUNDS"] = str(rounds) + stats = run_import( + con=con, + input_dir=args.input, + log=log, + max_near_dist=args.near_dist, + min_text_len=args.min_text_len, + ) + finally: + if args.llm == "off": + if prev_enabled is None: + os.environ.pop("LLM_PARSE_ENABLED", None) + else: + os.environ["LLM_PARSE_ENABLED"] = prev_enabled + elif args.llm == "force": + if prev_enabled is None: + os.environ.pop("LLM_PARSE_ENABLED", None) + else: + os.environ["LLM_PARSE_ENABLED"] = prev_enabled + if prev_force is None: + os.environ.pop("LLM_PARSE_FORCE", None) + else: + os.environ["LLM_PARSE_FORCE"] = prev_force + if prev_review_mode is None: + os.environ.pop("LLM_PARSE_REVIEW_MODE", None) + else: + os.environ["LLM_PARSE_REVIEW_MODE"] = prev_review_mode + if prev_review_rounds is None: + os.environ.pop("LLM_PARSE_REVIEW_ROUNDS", None) + else: + os.environ["LLM_PARSE_REVIEW_ROUNDS"] = prev_review_rounds + finally: + con.close() + + _print_json(stats) + return + + # ============================= search ============================= + if args.cmd == "search": + con = connect(args.db) + init_db(con) # важно: гарантирует, что resumes_fts и триггеры существуют + + try: + items: list[Dict[str, Any]] = [] + out: Dict[str, Any] = {} + + if args.ai: + from tg_resume_db.agent import agent_search + + res = agent_search( + con, + user_prompt=args.query, + max_iters=args.ai_iters, + ) + + items = res.get("items", []) + out = { + "ai": True, + "llm_used": res.get("llm_used", False), + "plan": res.get("plan"), + "history": res.get("history"), + "postfilter": res.get("postfilter"), + "items": items, + "count": res.get("count", len(items)), + } + else: + filters = { + "remote": (args.remote == "true") if args.remote is not None else None, + "location": args.location, + "experience_min": args.experience_min, + "salary_min": args.salary_min, + "salary_max": args.salary_max, + "english": args.english, + "doc_type": args.doc_type, + # backward compat + "role": args.role, + "skill": args.skill, + # new + "roles_any": args.roles_any, + "skills_any": args.skills_any, + "skills_all": args.skills_all, + } + + items = run_search( + con, + query=args.query, + filters=filters, + limit=args.limit, + offset=args.offset, + ) + out = {"ai": False, "items": items, "count": len(items)} + + # 1) печатаем результаты + _print_json(out) + + # 2) bundle prompt/flag + if args.bundle == "yes": + do_bundle = True + elif args.bundle == 
"no": + do_bundle = False + else: # ask + do_bundle = False + if _is_interactive(): + ans = input("\nСобрать найденные резюме в папку? (Y/N): ").strip().lower() + do_bundle = ans in ("y", "yes", "да", "д") + + if do_bundle: + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = f"./bundle_{ts}" + info = bundle_search_results(con, items, out_dir, copy_files=True, merge_text=True) + print(f"\n[done] Готово: {info['out_dir']}") + print(f" files copied: {info['copied_files']}, missing: {info['missing_files']}") + print(f" merged: {info['merged_text']}") + print(f" manifest: {info['manifest']}") + + return + + finally: + con.close() + + # ============================= inspect ============================= + if args.cmd == "inspect": + fp = args.file + path = Path(fp) + extract_meta = {} + if path.suffix.lower() == ".pdf": + pdf_res = extract_pdf_best(path, timeout_sec=25) + raw_text = pdf_res.text + extract_meta = { + "method": pdf_res.method, + "quality_score": pdf_res.score, + "quality_flags": pdf_res.flags, + "pages": len(pdf_res.pages), + } + else: + raw_text = extract_text_generic(path) + extract_meta = {"method": "generic"} + + clean = normalize_text(raw_text or "") + dt = detect_doc_type(clean, file_ext=Path(fp).suffix.lower()) + secs = split_sections(clean, dt.doc_type) + out = { + "file": fp, + "doc_type": dt.doc_type, + "confidence": dt.confidence, + "signals": dt.signals, + "extract": extract_meta, + "sections_present": sections_present(secs), + "name_guess": extract_name_guess(clean), + } + _print_json(out) + return + + +if __name__ == "__main__": + main() diff --git a/db.py b/db.py new file mode 100644 index 0000000..1c7d3d9 --- /dev/null +++ b/db.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import sqlite3 +from pathlib import Path + + +SCHEMA = r""" +PRAGMA journal_mode=WAL; +PRAGMA synchronous=NORMAL; +PRAGMA temp_store=MEMORY; + +CREATE TABLE IF NOT EXISTS candidates ( + candidate_id TEXT PRIMARY KEY, + name TEXT, + location TEXT, + remote INTEGER, + experience_years REAL, + experience_years_eng REAL, -- инженерный опыт (после фильтра HR) + experience_confidence REAL, + salary_min INTEGER, + salary_max INTEGER, + salary_confidence REAL, + english_level TEXT, + roles_json TEXT, + skills_json TEXT, + primary_languages_json TEXT, + backend_focus INTEGER, + roles_norm TEXT, -- "|backend|devops|" + skills_norm TEXT, -- "|python|k8s|" + created_at TEXT DEFAULT (datetime('now')), + updated_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS candidate_contacts ( + contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url + contact_value TEXT NOT NULL, -- normalized + candidate_id TEXT NOT NULL, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(contact_type, contact_value), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id); + +CREATE TABLE IF NOT EXISTS resumes ( + resume_id TEXT PRIMARY KEY, + candidate_id TEXT NOT NULL, + sha256 TEXT, + simhash TEXT, + clean_text TEXT NOT NULL, + raw_text TEXT, + extraction_json TEXT, + llm_summary TEXT, + llm_tags_json TEXT, + extract_method TEXT, + extract_quality_score REAL, + extract_quality_flags TEXT, + extract_pages_json TEXT, + doc_type TEXT, + doc_type_confidence REAL, + parse_method TEXT, + parse_version TEXT, + sections_json TEXT, + is_active INTEGER DEFAULT 1, + duplicate_of_resume_id TEXT, + file_path TEXT, + file_mtime INTEGER, + file_size INTEGER, + created_at TEXT DEFAULT 
(datetime('now')), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id); +CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active); + +CREATE TABLE IF NOT EXISTS sources ( + source_id INTEGER PRIMARY KEY AUTOINCREMENT, + resume_id TEXT NOT NULL, + export_path TEXT, + chat_title TEXT, + message_id TEXT, + message_date TEXT, + origin_type TEXT, + original_file_path TEXT, + original_file_name TEXT, + extra_json TEXT, + created_at TEXT DEFAULT (datetime('now')), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) +); + +CREATE TABLE IF NOT EXISTS files_seen ( + sha256 TEXT PRIMARY KEY, + size INTEGER, + mtime INTEGER, + canonical_resume_id TEXT, + first_seen_at TEXT DEFAULT (datetime('now')), + last_seen_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS simhash_buckets ( + bucket INTEGER NOT NULL, + band INTEGER NOT NULL, + resume_id TEXT NOT NULL, + PRIMARY KEY(bucket, band, resume_id), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) +); + +CREATE TABLE IF NOT EXISTS candidate_skills ( + candidate_id TEXT NOT NULL, + skill_id TEXT NOT NULL, + skill_label TEXT, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, skill_id), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS candidate_roles ( + candidate_id TEXT NOT NULL, + role TEXT NOT NULL, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, role), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS candidate_languages ( + candidate_id TEXT NOT NULL, + language TEXT NOT NULL, + level TEXT, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, language), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS positions ( + position_id TEXT PRIMARY KEY, + resume_id TEXT NOT NULL, + candidate_id TEXT NOT NULL, + title TEXT, + company TEXT, + date_from TEXT, + date_to TEXT, + is_current INTEGER, + description TEXT, + stack_json TEXT, + created_at TEXT DEFAULT (datetime('now')), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS llm_cache ( + cache_key TEXT PRIMARY KEY, + model TEXT, + result_json TEXT, + created_at TEXT DEFAULT (datetime('now')) +); + +-- Full-text index (FTS5): contentless +CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5( + resume_id UNINDEXED, + candidate_id UNINDEXED, + clean_text, + tokenize='unicode61 remove_diacritics 2' +); + +-- --- Triggers to keep FTS synced with resumes --- +-- Insert +CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts +AFTER INSERT ON resumes +BEGIN + DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id; + INSERT INTO resumes_fts(resume_id, candidate_id, clean_text) + SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text + WHERE NEW.is_active = 1; +END; + +-- Delete +CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts +AFTER DELETE ON resumes +BEGIN + DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id; +END; + +-- Update (text/active/candidate) +CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts +AFTER UPDATE OF clean_text, 
is_active, candidate_id ON resumes
+BEGIN
+    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
+    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
+    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
+    WHERE NEW.is_active = 1;
+END;
+"""
+
+
+def connect(db_path: str) -> sqlite3.Connection:
+    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
+    con = sqlite3.connect(db_path)
+    con.row_factory = sqlite3.Row
+    return con
+
+
+def _table_exists(con: sqlite3.Connection, name: str) -> bool:
+    row = con.execute(
+        "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
+        (name,),
+    ).fetchone()
+    return row is not None
+
+
+def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
+    cur = con.execute(f"PRAGMA table_info({table})")
+    for r in cur.fetchall():
+        if r["name"] == column:
+            return True
+    return False
+
+
+def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
+    if not _table_exists(con, table):
+        return
+    if _column_exists(con, table, column):
+        return
+    con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
+
+
+def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
+    """
+    If resumes_fts is empty or out of sync, rebuild it from resumes.
+    This cures the situation where init_db created the FTS table but nothing
+    was ever loaded into it, so search always returned 0 results.
+    """
+    if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
+        return
+
+    try:
+        resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
+        fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
+    except Exception:
+        return
+
+    if resumes_cnt <= 0:
+        return
+
+    # Any mismatch -> rebuild (cures both an empty index and duplicates)
+    if fts_cnt != resumes_cnt:
+        con.execute("DELETE FROM resumes_fts")
+        con.execute(
+            """
+            INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
+            SELECT resume_id, candidate_id, clean_text
+            FROM resumes
+            WHERE is_active=1
+            """
+        )
+        con.commit()
+
+
+def init_db(con: sqlite3.Connection) -> None:
+    con.executescript(SCHEMA)
+    # Lightweight migrations for existing DBs (safe to re-run)
+    _add_column_if_missing(con, "candidates", "experience_years_eng", "REAL")
+    _add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT")
+    _add_column_if_missing(con, "candidates", "backend_focus", "INTEGER")
+    _add_column_if_missing(con, "resumes", "llm_summary", "TEXT")
+    _add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_method", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_quality_score", "REAL")
+    _add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT")
+    _add_column_if_missing(con, "resumes", "doc_type", "TEXT")
+    _add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL")
+    _add_column_if_missing(con, "resumes", "parse_method", "TEXT")
+    _add_column_if_missing(con, "resumes", "parse_version", "TEXT")
+    _add_column_if_missing(con, "resumes", "sections_json", "TEXT")
+    if not _table_exists(con, "llm_cache"):
+        con.execute(
+            """
+            CREATE TABLE IF NOT EXISTS llm_cache (
+                cache_key TEXT PRIMARY KEY,
+                model TEXT,
+                result_json TEXT,
+                created_at TEXT DEFAULT (datetime('now'))
+            )
+            """
+        )
+    con.commit()
+    _ensure_fts_backfilled(con)
diff --git a/dedup/simhash.py b/dedup/simhash.py
new file mode 100644
index 0000000..be8a643
--- /dev/null +++ b/dedup/simhash.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import hashlib +import re +from typing import List, Tuple + +def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str: + h = hashlib.sha256() + for chunk in iter(lambda: f.read(chunk_size), b""): + h.update(chunk) + return h.hexdigest() + +def sha256_file(path) -> str: + with open(path, "rb") as f: + return sha256_file_bytes_iter(f) + +def sha1_str(s: str) -> str: + return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest() + +def simhash64(text: str) -> int: + tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower()) + if not tokens: + return 0 + v = [0] * 64 + for tok in tokens: + h = hashlib.md5(tok.encode("utf-8")).digest() + x = int.from_bytes(h[:8], "big", signed=False) + for i in range(64): + v[i] += 1 if ((x >> i) & 1) else -1 + out = 0 + for i in range(64): + if v[i] > 0: + out |= (1 << i) + return out + +def hamming64(a: int, b: int) -> int: + return (a ^ b).bit_count() + +def simhash_bands(x: int) -> List[Tuple[int, int]]: + # 4 bands x 16 bits + return [(((x >> (band * 16)) & 0xFFFF), band) for band in range(4)] diff --git a/extract/clean.py b/extract/clean.py new file mode 100644 index 0000000..110bb04 --- /dev/null +++ b/extract/clean.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import re +from collections import Counter +import unicodedata + +RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I) +RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U) +RE_MULTI_SPACE = re.compile(r"[ \t]+") +RE_MULTI_NL = re.compile(r"\n{3,}") + +_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"] +_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]") + +def normalize_text(raw: str) -> str: + text = raw.replace("\r\n", "\n").replace("\r", "\n") + for ch in _INVISIBLE_CHARS: + text = text.replace(ch, "") + text = _BIDI_CTRL_RE.sub("", text) + # remove most control/format chars but keep line breaks and tabs + text = "".join( + ch for ch in text + if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C") + ) + text = "".join(ch for ch in text if ch == "\n" or (ch.isprintable() and ch != "\x0b")) + lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")] + lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)] + counts = Counter(lines) + filtered = [] + for ln in lines: + if counts[ln] >= 4 and len(ln) <= 90: + continue + filtered.append(ln) + text = "\n".join(filtered) + text = RE_MULTI_NL.sub("\n\n", text).strip() + return text + +def to_fts_text(clean: str) -> str: + return re.sub(r"\s+", " ", clean).strip() diff --git a/extract/doc_type.py b/extract/doc_type.py new file mode 100644 index 0000000..7cf701d --- /dev/null +++ b/extract/doc_type.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class DocTypeResult: + doc_type: str + confidence: float + signals: List[str] + + +_HH_PATTERNS = [ + (re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"), + (re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"), + (re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"), + (re.compile(r"\bжелаемая\s+должность\b", re.I), 1.2, "hh_desired_role"), + (re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"), + (re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"), +] + +_LI_PATTERNS = [ + (re.compile(r"\blinkedin\b", 
re.I), 2.5, "li_brand"), + (re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"), + (re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"), + (re.compile(r"\beducation\b", re.I), 0.9, "li_education"), + (re.compile(r"\bskills\b", re.I), 0.9, "li_skills"), + (re.compile(r"\babout\b", re.I), 0.6, "li_about"), +] + +_PPTX_PATTERNS = [ + (re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"), + (re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"), + (re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"), + (re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"), +] + + +def _score_patterns(text: str, patterns) -> tuple[float, List[str]]: + score = 0.0 + signals: List[str] = [] + for rx, weight, name in patterns: + if rx.search(text): + score += weight + signals.append(name) + return score, signals + + +def _confidence_from_score(score: float) -> float: + if score >= 4.0: + return 0.92 + if score >= 3.0: + return 0.85 + if score >= 2.0: + return 0.75 + if score >= 1.2: + return 0.62 + if score > 0.0: + return 0.50 + return 0.30 + + +def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult: + lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()] + head_lines = lines[:80] + head_text = "\n".join(head_lines) + head_lc = head_text.lower() + + signals: List[str] = [] + + hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS) + li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS) + pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS) + if file_ext and file_ext.lower() in (".pptx", ".ppt"): + pptx_score += 2.0 + signals.append("pptx_ext") + + signals.extend(hh_signals + li_signals + pptx_signals) + + # One-page heuristic: short resumes with dense content + total_len = len(clean_text or "") + one_page_score = 0.0 + if len(lines) <= 70 and total_len <= 4500: + one_page_score = 2.2 + signals.append("one_page_short") + elif len(lines) <= 90 and total_len <= 6500: + one_page_score = 1.6 + signals.append("one_page_medium") + + # Scan heuristic: very low textual content + letters = sum(ch.isalpha() for ch in clean_text or "") + total = max(1, len(clean_text or "")) + letter_ratio = letters / total + scan_score = 0.0 + if total_len < 200 or letter_ratio < 0.12: + scan_score = 3.2 + signals.append("scan_low_text") + if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"): + scan_score += 0.6 + signals.append("scan_file_ext") + + candidates = [ + ("hh_ru", hh_score), + ("linkedin_pdf", li_score), + ("pptx_export", pptx_score), + ("one_page", one_page_score), + ("scan_pdf", scan_score), + ] + doc_type, best_score = max(candidates, key=lambda x: x[1]) + + if best_score <= 0.0: + base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic" + return DocTypeResult(doc_type=base, confidence=0.35, signals=signals) + + confidence = _confidence_from_score(best_score) + # If scan is detected strongly, prefer it + if doc_type == "scan_pdf" and confidence >= 0.8: + return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals) + + # Split one-page into ru/en + if doc_type == "one_page": + if _looks_cyrillic(head_text): + return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals) + return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals) + + return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals) + + +def _looks_cyrillic(text: str) -> bool: + cyr = len(re.findall(r"[А-Яа-яЁё]", 
text))
+    lat = len(re.findall(r"[A-Za-z]", text))
+    return cyr > lat and cyr >= 10
diff --git a/extract/experience.py b/extract/experience.py
new file mode 100644
index 0000000..964db09
--- /dev/null
+++ b/extract/experience.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from datetime import date
+from typing import Dict, List, Optional, Tuple
+
+# Month maps (EN + RU)
+MONTHS = {
+    "jan": 1, "january": 1, "янв": 1, "январ": 1,
+    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
+    "mar": 3, "march": 3, "мар": 3, "март": 3,
+    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
+    "may": 5, "май": 5,
+    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
+    "jul": 7, "july": 7, "июл": 7, "июль": 7,
+    "aug": 8, "august": 8, "авг": 8, "август": 8,
+    "sep": 9, "september": 9, "сен": 9, "сент": 9,
+    "oct": 10, "october": 10, "окт": 10, "октя": 10,
+    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
+    "dec": 12, "december": 12, "дек": 12, "дека": 12,
+}
+
+PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)
+
+# Direct "X years" patterns
+DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
+
+# Dates like 03.2019, 2019, Jan 2020, янв 2020
+MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
+YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
+MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
+
+# Range separators
+RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
+
+@dataclass
+class ExpResult:
+    years: Optional[float]
+    confidence: float
+    debug: Dict
+
+def _clamp_years(y: float) -> Optional[float]:
+    if 0.0 <= y <= 45.0:
+        return y
+    return None
+
+def _parse_mon(mon: str) -> Optional[int]:
+    m = mon.strip().lower()
+    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
+    # allow prefixes: "январ", "феврал"
+    for k, v in MONTHS.items():
+        if m.startswith(k):
+            return v
+    return None
+
+def _as_ymd(y: int, m: int) -> date:
+    return date(y, m, 1)
+
+def _parse_one_date(s: str) -> Optional[date]:
+    s = s.strip()
+    if PRESENT_RE.search(s):
+        today = date.today()
+        return date(today.year, today.month, 1)
+
+    m1 = MMYYYY_RE.search(s)
+    if m1:
+        mm = int(m1.group(1))
+        yy = int(m1.group(2))
+        return _as_ymd(yy, mm)
+
+    m2 = MON_YYYY_RE.search(s)
+    if m2:
+        mon = _parse_mon(m2.group(1))
+        yy = int(m2.group(2))
+        if mon:
+            return _as_ymd(yy, mon)
+
+    m3 = YYYY_RE.search(s)
+    if m3:
+        yy = int(m3.group(1))
+        return _as_ymd(yy, 1)
+
+    return None
+
+def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
+    if not intervals:
+        return []
+    intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
+    merged = [intervals[0]]
+    for s, e in intervals[1:]:
+        ls, le = merged[-1]
+        if s <= le:
+            merged[-1] = (ls, max(le, e))
+        else:
+            merged.append((s, e))
+    return merged
+
+def _months_between(a: date, b: date) -> int:
+    # month-level difference (inclusive-ish): b >= a
+    return (b.year - a.year) * 12 + (b.month - a.month)
+
+def extract_experience(text: str) -> ExpResult:
+    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
+
+    # 1) Direct years
+    directs = []
+    for m in DIRECT_YEARS_RE.finditer(text):
+        try:
+            v = float(m.group(1).replace(",", "."))
+            if 0 <= v <= 45:
+                directs.append(v)
+                debug["direct_matches"].append({"match": m.group(0), "value": v})
+        except Exception:
+            pass
+    if directs:
+        years = _clamp_years(max(directs))
+        return ExpResult(years=years, confidence=0.90, debug=debug)
+
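+    # Worked example for step 1 (sample phrase invented for illustration):
+    # a summary such as "7 years of backend experience" / "опыт 7 лет"
+    # matches DIRECT_YEARS_RE above, so extract_experience() returns
+    # ExpResult(years=7.0, confidence=0.90, ...) without ever consulting
+    # the date-range logic below.
+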
+    # 2) Ranges in lines: try to detect "start - end"
+    intervals: List[Tuple[date, date]] = []
+    for line in text.splitlines():
+        ln = line.strip()
+        if len(ln) < 7:
+            continue
+        # require range separator
+        if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
+            continue
+        rr = RANGE_RE.match(ln)
+        if not rr:
+            continue
+        a = rr.group("a")
+        b = rr.group("b")
+        da = _parse_one_date(a)
+        db = _parse_one_date(b)
+        if da and db:
+            if db < da:
+                da, db = db, da
+            # cap extremely old
+            if da.year < 1990:
+                continue
+            intervals.append((da, db))
+            debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
+
+    intervals = _merge_intervals(intervals)
+    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
+
+    if not intervals:
+        return ExpResult(years=None, confidence=0.0, debug=debug)
+
+    total_months = 0
+    for s, e in intervals:
+        total_months += max(0, _months_between(s, e))
+    years = round(total_months / 12.0, 2)
+    years = _clamp_years(years) if years is not None else None
+
+    # confidence depends on amount of evidence
+    conf = 0.70 if total_months >= 12 else 0.55
+    return ExpResult(years=years, confidence=conf, debug=debug)
diff --git a/extract/experience_timeline.py b/extract/experience_timeline.py
new file mode 100644
index 0000000..ca73ae5
--- /dev/null
+++ b/extract/experience_timeline.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, asdict
+from datetime import date
+from typing import List, Optional
+
+MONTHS = {
+    "jan": 1, "january": 1, "янв": 1, "январ": 1,
+    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
+    "mar": 3, "march": 3, "мар": 3, "март": 3,
+    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
+    "may": 5, "май": 5,
+    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
+    "jul": 7, "july": 7, "июл": 7, "июль": 7,
+    "aug": 8, "august": 8, "авг": 8, "август": 8,
+    "sep": 9, "september": 9, "сен": 9, "сент": 9,
+    "oct": 10, "october": 10, "окт": 10, "октя": 10,
+    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
+    "dec": 12, "december": 12, "дек": 12, "дека": 12,
+}
+
+PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)
+MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
+YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
+MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
+RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
+YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
+EDU_CONTEXT_RE = re.compile(
+    r"\b("
+    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
+    r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
+    r")\b",
+    re.I,
+)
+
+
+@dataclass
+class Position:
+    title: Optional[str]
+    company: Optional[str]
+    date_from: Optional[str]
+    date_to: Optional[str]
+    is_current: Optional[bool]
+    description: Optional[str]
+
+
+def _parse_mon(mon: str) -> Optional[int]:
+    m = mon.strip().lower()
+    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
+    for k, v in MONTHS.items():
+        if m.startswith(k):
+            return v
+    return None
+
+
+def _as_ymd(y: int, m: int) -> date:
+    return date(y, m, 1)
+
+
+def _parse_one_date(s: str) -> Optional[date]:
+    s = s.strip()
+    if PRESENT_RE.search(s):
+        today = date.today()
+        return date(today.year, today.month, 1)
+    m1 = MMYYYY_RE.search(s)
+    if m1:
+        mm = int(m1.group(1))
+        yy = int(m1.group(2))
+        return _as_ymd(yy, mm)
+    m2 = MON_YYYY_RE.search(s)
+    if m2:
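+        # Month-name forms such as "Jan 2020" or "январь 2020" land here;
+        # _parse_mon matches by month-name prefix, so truncated Russian stems
+        # from the MONTHS map ("сент", "окт") resolve as well.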
+ mon = _parse_mon(m2.group(1)) + yy = int(m2.group(2)) + if mon: + return _as_ymd(yy, mon) + m3 = YYYY_RE.search(s) + if m3: + yy = int(m3.group(1)) + return _as_ymd(yy, 1) + return None + + +def extract_positions(text: str, max_items: int = 40) -> List[Position]: + lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()] + positions: List[Position] = [] + i = 0 + while i < len(lines) and len(positions) < max_items: + ln = lines[i] + if not any(x in ln for x in ("—", "–", "-", " to ", " по ")): + i += 1 + continue + rr = RANGE_RE.match(ln) + if not rr: + i += 1 + continue + ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)]) + if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx): + i += 1 + continue + da = _parse_one_date(rr.group("a")) + db = _parse_one_date(rr.group("b")) + if not da or not db: + i += 1 + continue + if da.year < 1990: + i += 1 + continue + is_current = PRESENT_RE.search(rr.group("b")) is not None + title = None + company = None + desc_lines: List[str] = [] + if i + 1 < len(lines): + if EDU_CONTEXT_RE.search(lines[i + 1]): + i += 1 + continue + header = lines[i + 1] + parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()] + if parts: + title = parts[0] + if len(parts) > 1: + company = parts[1] + j = i + 2 + while j < len(lines): + if any(x in lines[j] for x in ("—", "–", "-", " to ", " по ")) and RANGE_RE.match(lines[j]): + break + desc_lines.append(lines[j]) + j += 1 + positions.append( + Position( + title=title, + company=company, + date_from=da.isoformat(), + date_to=db.isoformat(), + is_current=is_current, + description="\n".join(desc_lines).strip() if desc_lines else None, + ) + ) + i = j + return positions + + +def positions_to_dicts(items: List[Position]) -> List[dict]: + return [asdict(p) for p in items] diff --git a/extract/llm.py b/extract/llm.py new file mode 100644 index 0000000..8a5a1bf --- /dev/null +++ b/extract/llm.py @@ -0,0 +1,585 @@ +from __future__ import annotations + +import hashlib +import json +import os +import re +import sqlite3 +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +try: + import httpx # type: ignore +except Exception: # pragma: no cover + httpx = None # type: ignore + + +def resolve_llm_runtime() -> Dict[str, str]: + """ + Resolve OpenAI-compatible runtime config. 
+ Supports both generic vars and Mistral aliases: + - generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY + - mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL + """ + provider = (os.environ.get("LLM_PROVIDER") or "").strip().lower() + base_url = (os.environ.get("LLM_BASE_URL") or "").strip() + model = (os.environ.get("LLM_MODEL") or "").strip() + api_key = (os.environ.get("LLM_API_KEY") or "").strip() + + mistral_key = (os.environ.get("MISTRAL_API_KEY") or "").strip() + mistral_model = (os.environ.get("MISTRAL_MODEL") or "").strip() + mistral_base = (os.environ.get("MISTRAL_BASE_URL") or "https://api.mistral.ai/v1").strip() + + if not api_key and mistral_key: + api_key = mistral_key + if not model and mistral_model: + model = mistral_model + if not base_url and (mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")): + base_url = mistral_base + + if base_url: + base_url = base_url.rstrip("/") + + if not provider: + if "mistral.ai" in base_url or (model and model.lower().startswith("mistral")): + provider = "mistral" + else: + provider = "generic" + + return { + "provider": provider, + "base_url": base_url, + "model": model, + "api_key": api_key, + } + + +# ------------- Public API ------------- + +def llm_parse_enabled() -> bool: + """ + Enabled only if httpx is available and both base_url/model are resolved. + Opt-out via LLM_PARSE_ENABLED=0. + """ + if httpx is None: + return False + if os.environ.get("LLM_PARSE_ENABLED", "1").lower() in ("0", "false", "no"): + return False + runtime = resolve_llm_runtime() + return bool(runtime["base_url"]) and bool(runtime["model"]) + + +_PROMPT_VERSION = "v3_sections_doc_type" +_REVIEW_PROMPT_VERSION = "v1_review_merge" + + +@dataclass +class LLMExtraction: + roles: List[str] + skills: List[str] + primary_languages: List[str] + seniority: Optional[str] + backend_focus: Optional[bool] + experience_years_total: Optional[float] + experience_years_engineering: Optional[float] + english_level: Optional[str] + location: Optional[str] + remote_ok: Optional[bool] + salary_min_usd: Optional[int] + salary_max_usd: Optional[int] + salary_min_rub: Optional[int] + salary_max_rub: Optional[int] + highlights: List[str] + keywords: List[str] + + @staticmethod + def from_obj(obj: Dict[str, Any]) -> "LLMExtraction": + def _as_list(v: Any) -> List[str]: + if v is None: + return [] + if isinstance(v, list): + return [str(x).strip() for x in v if str(x).strip()] + s = str(v).strip() + return [s] if s else [] + + def _as_float(v: Any) -> Optional[float]: + try: + return float(v) + except Exception: + return None + + def _as_int(v: Any) -> Optional[int]: + try: + return int(float(v)) + except Exception: + return None + + def _as_bool(v: Any) -> Optional[bool]: + if isinstance(v, bool): + return v + if v is None: + return None + s = str(v).strip().lower() + if s in ("true", "1", "yes", "y"): + return True + if s in ("false", "0", "no", "n"): + return False + return None + + return LLMExtraction( + roles=_as_list(obj.get("roles")), + skills=_as_list(obj.get("skills")), + primary_languages=_as_list(obj.get("primary_languages")), + seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None, + backend_focus=_as_bool(obj.get("backend_focus")), + experience_years_total=_as_float(obj.get("experience_years_total")), + experience_years_engineering=_as_float(obj.get("experience_years_engineering")), + english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else 
None, + location=(str(obj.get("location")).strip() or None) if obj.get("location") else None, + remote_ok=_as_bool(obj.get("remote_ok")), + salary_min_usd=_as_int(obj.get("salary_min_usd")), + salary_max_usd=_as_int(obj.get("salary_max_usd")), + salary_min_rub=_as_int(obj.get("salary_min_rub")), + salary_max_rub=_as_int(obj.get("salary_max_rub")), + highlights=_as_list(obj.get("highlights")), + keywords=_as_list(obj.get("keywords")), + ) + + +def llm_extract_profile( + clean_text: str, + *, + con: Optional[sqlite3.Connection] = None, + doc_type: Optional[str] = None, + sections: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + Returns (LLMExtraction | None, debug_info). + - Uses cache on disk/sqlite to keep throughput high. + - Silently degrades to None on any failure. + """ + runtime = resolve_llm_runtime() + dbg: Dict[str, Any] = { + "enabled": llm_parse_enabled(), + "provider": runtime.get("provider"), + "model": runtime.get("model"), + "from_cache": False, + "cache_backend": None, + "error": None, + "prompt_version": _PROMPT_VERSION, + } + if not llm_parse_enabled(): + return None, dbg + + text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest() + cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}" + + payload = _build_payload( + clean_text, + doc_type=doc_type, + sections=sections, + prompt_version=_PROMPT_VERSION, + temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)), + max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)), + system_prompt="You output ONLY JSON for structured resume extraction.", + prompt_template=_PROMPT_TEMPLATE, + ) + + data = _cached_llm_json_call( + con=con, + cache_key=cache_key, + model=runtime["model"], + payload=payload, + dbg=dbg, + ) + if data is None: + return None, dbg + return LLMExtraction.from_obj(data), dbg + + +def llm_review_profile( + clean_text: str, + *, + draft: Dict[str, Any], + con: Optional[sqlite3.Connection] = None, + doc_type: Optional[str] = None, + sections: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + Second-pass validator: + - Takes already parsed JSON (draft) + - Re-checks every field against resume text + - Returns corrected extraction for safe merge in pipeline + """ + runtime = resolve_llm_runtime() + dbg: Dict[str, Any] = { + "enabled": llm_parse_enabled(), + "provider": runtime.get("provider"), + "model": runtime.get("model"), + "from_cache": False, + "cache_backend": None, + "error": None, + "prompt_version": _REVIEW_PROMPT_VERSION, + "quality_score": None, + "changed_fields": [], + "issues_found": [], + } + if not llm_parse_enabled(): + return None, dbg + + clean_draft = _sanitize_review_draft(draft) + draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True) + text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest() + draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest() + cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}" + + payload = _build_payload( + clean_text, + doc_type=doc_type, + sections=sections, + prompt_version=_REVIEW_PROMPT_VERSION, + temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)), + max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)), + system_prompt="You output ONLY JSON for resume parsing quality review.", + prompt_template=_REVIEW_PROMPT_TEMPLATE, + extra_vars={"draft_json": draft_blob}, + ) + + data = 
_cached_llm_json_call( + con=con, + cache_key=cache_key, + model=runtime["model"], + payload=payload, + dbg=dbg, + ) + if data is None: + return None, dbg + + corrected_obj: Dict[str, Any] + if isinstance(data.get("corrected"), dict): + corrected_obj = data["corrected"] + else: + corrected_obj = data + + dbg["quality_score"] = _as_float(data.get("quality_score")) + dbg["changed_fields"] = _as_str_list(data.get("changed_fields")) + dbg["issues_found"] = _as_str_list(data.get("issues_found")) + + return LLMExtraction.from_obj(corrected_obj), dbg + + +# ------------- Internal helpers ------------- + +_PROMPT_TEMPLATE = """ +Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON. +Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список. +Схема: +{{ + "roles": ["backend","devops","frontend","qa","data engineer","android","ios"], + "skills": ["python","go","k8s","postgres","react", "..."], + "primary_languages": ["python","go","java","c++", "..."], + "seniority": "intern|junior|middle|senior|lead|principal|null", + "backend_focus": true|false|null, + "experience_years_total": number|null, + "experience_years_engineering": number|null, + "english_level": "A1|A2|B1|B2|C1|C2|null", + "location": "city, country|null", + "remote_ok": true|false|null, + "salary_min_usd": int|null, + "salary_max_usd": int|null, + "salary_min_rub": int|null, + "salary_max_rub": int|null, + "highlights": ["кратко достижения (1-2 предложения)"], + "keywords": ["уникальные ключевые слова, продукты или домены"] +}} +Не включай контактные данные в skills/keywords. +Detected doc_type: {doc_type} +Sections (if present): +{sections_block} + +Full text snippet (use only if needed): +```TEXT +{resume_text} +``` +""" + +_REVIEW_PROMPT_TEMPLATE = """ +Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON. +У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме. +Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать. + +Верни JSON строго такой формы: +{{ + "corrected": {{ + "roles": ["..."], + "skills": ["..."], + "primary_languages": ["..."], + "seniority": "intern|junior|middle|senior|lead|principal|null", + "backend_focus": true|false|null, + "experience_years_total": number|null, + "experience_years_engineering": number|null, + "english_level": "A1|A2|B1|B2|C1|C2|null", + "location": "city, country|null", + "remote_ok": true|false|null, + "salary_min_usd": int|null, + "salary_max_usd": int|null, + "salary_min_rub": int|null, + "salary_max_rub": int|null, + "highlights": ["..."], + "keywords": ["..."] + }}, + "changed_fields": ["field_name", "..."], + "issues_found": ["кратко что было неверно/сомнительно", "..."], + "quality_score": 0.0 +}} + +Черновик JSON: +```DRAFT +{draft_json} +``` + +Detected doc_type: {doc_type} +Sections (if present): +{sections_block} + +Full text snippet (use only if needed): +```TEXT +{resume_text} +``` +""" + + +def _trim_text(text: str, max_len: int = 9000) -> str: + """ + Keep head and tail to preserve summary + recent projects. 
+ """ + if len(text) <= max_len: + return text + head = text[: max_len // 2] + tail = text[-max_len // 2 :] + return head + "\n...\n" + tail + + +def _build_payload( + clean_text: str, + *, + doc_type: Optional[str], + sections: Optional[Dict[str, str]], + prompt_version: str, + temperature: float, + max_tokens: int, + system_prompt: str, + prompt_template: str, + extra_vars: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + runtime = resolve_llm_runtime() + base_url = runtime["base_url"] + model = runtime["model"] + + sections_block = _build_sections_block(sections) + tpl_vars = { + "resume_text": _trim_text(clean_text), + "doc_type": (doc_type or "unknown"), + "sections_block": sections_block or "(no sections detected)", + } + if extra_vars: + tpl_vars.update(extra_vars) + + prompt = prompt_template.format(**tpl_vars) + + return { + "base_url": base_url, + "model": model, + "prompt_version": prompt_version, + "payload": { + "model": model, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ], + "temperature": temperature, + "max_tokens": max_tokens, + }, + "headers": _build_headers(runtime), + "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)), + } + + +def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]: + headers = {"Content-Type": "application/json"} + api_key = runtime.get("api_key", "") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + +def _cached_llm_json_call( + *, + con: Optional[sqlite3.Connection], + cache_key: str, + model: str, + payload: Dict[str, Any], + dbg: Dict[str, Any], +) -> Optional[Dict[str, Any]]: + data = _cache_get_sqlite(con, cache_key) + if data: + dbg["from_cache"] = True + dbg["cache_backend"] = "sqlite" + return data + + cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve() + cache_ok = True + try: + cache_dir.mkdir(parents=True, exist_ok=True) + except Exception: + cache_ok = False + + safe_name = cache_key.replace(":", "_") + cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None + + if cache_path and cache_path.exists(): + try: + data = json.loads(cache_path.read_text(encoding="utf-8")) + dbg["from_cache"] = True + dbg["cache_backend"] = "disk" + return data + except Exception: + pass + + try: + data = _llm_call_json(payload) + if con: + _cache_put_sqlite(con, cache_key, model, data) + if cache_path: + cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8") + return data + except Exception as e: # pragma: no cover - network/LLM failures + dbg["error"] = repr(e) + return None + + +def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]: + if httpx is None: + raise RuntimeError("httpx is not installed") + + base_url: str = task["base_url"] + payload: Dict[str, Any] = task["payload"] + timeout = float(task.get("timeout", 18.0)) + + with httpx.Client(timeout=timeout) as client: + r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload) + r.raise_for_status() + data = r.json() + + content = data["choices"][0]["message"]["content"] + if isinstance(content, list): + parts = [] + for block in content: + if isinstance(block, dict): + parts.append(str(block.get("text") or "")) + else: + parts.append(str(block)) + content = "\n".join(parts) + content = str(content) + + m = re.search(r"\{.*\}", content, flags=re.S) + if not m: + raise ValueError("LLM did not return JSON") + return json.loads(m.group(0)) + + +def _build_sections_block(sections: 
Optional[Dict[str, str]]) -> str:
+    if not sections:
+        return ""
+    parts: List[str] = []
+    order = [
+        ("about", "ABOUT"),
+        ("skills", "SKILLS"),
+        ("experience", "EXPERIENCE"),
+        ("education", "EDUCATION"),
+        ("contacts", "CONTACTS"),
+    ]
+    for key, label in order:
+        text = sections.get(key)
+        if not text:
+            continue
+        snippet = _trim_text(text, max_len=1800)
+        parts.append(f"[{label}]\n{snippet}")
+    return "\n\n".join(parts)
+
+
+def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
+    if not isinstance(draft, dict):
+        draft = {}
+
+    allowed = {
+        "roles",
+        "skills",
+        "primary_languages",
+        "seniority",
+        "backend_focus",
+        "experience_years_total",
+        "experience_years_engineering",
+        "english_level",
+        "location",
+        "remote_ok",
+        "salary_min_usd",
+        "salary_max_usd",
+        "salary_min_rub",
+        "salary_max_rub",
+        "highlights",
+        "keywords",
+    }
+    cleaned = {k: v for k, v in draft.items() if k in allowed}
+    return asdict(LLMExtraction.from_obj(cleaned))
+
+
+def _as_float(v: Any) -> Optional[float]:
+    try:
+        x = float(v)
+    except Exception:
+        return None
+    if x < 0:
+        return None
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def _as_str_list(v: Any) -> List[str]:
+    if v is None:
+        return []
+    if isinstance(v, list):
+        return [str(x).strip() for x in v if str(x).strip()]
+    s = str(v).strip()
+    return [s] if s else []
+
+
+def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
+    if con is None:
+        return None
+    try:
+        row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
+        if row and row["result_json"]:
+            return json.loads(row["result_json"])
+    except Exception:
+        return None
+    return None
+
+
+def _cache_put_sqlite(
+    con: Optional[sqlite3.Connection],
+    cache_key: str,
+    model: str,
+    data: Dict[str, Any],
+) -> None:
+    if con is None:
+        return
+    try:
+        con.execute(
+            "INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
+            (cache_key, model, json.dumps(data, ensure_ascii=False)),
+        )
+    except Exception:
+        return
diff --git a/extract/parse.py b/extract/parse.py
new file mode 100644
index 0000000..f868fb4
--- /dev/null
+++ b/extract/parse.py
@@ -0,0 +1,659 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+from tg_resume_db.normalize import normalize_skill
+from tg_resume_db.extract.experience import extract_experience
+
+EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
+EMAIL_SPLIT_RE = re.compile(
+    r"(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
+    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
+    re.I,
+)
+PHONE_RE = re.compile(r"(? List[Tuple[str, re.Pattern]]:
+    patterns: List[Tuple[str, re.Pattern]] = []
+    for skill in sorted(SKILLS):
+        aliases = [skill] + _SKILL_ALIASES.get(skill, [])
+        for alias in aliases:
+            if skill == "java" and alias == "java":
+                # Do not match "java" inside "java script".
+                pat = re.compile(r"(? Dict[str, List[re.Pattern]]:
+    out: Dict[str, List[re.Pattern]] = {}
+    for role in ROLES:
+        aliases = _ROLE_ALIASES.get(role, [role])
+        out[role] = [
+            re.compile(r"(?\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
+    r"(?:[^0-9]{0,20}(?P\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
+)
+
+EXP_NEARBY_RE = re.compile(
+    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
+    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
+)
+
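+# Worked example for EXP_NEARBY_RE (sample phrase invented for illustration):
+# "опыт работы 6 лет 3 месяца" yields y=6, m=3 via the named groups, which
+# extract_experience_years() below combines into 6 + 3/12 = 6.25 years.
+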
+HH_FOOTER_RE = re.compile(
+    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
+    re.I,
+)
+NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
+NAME_LINE_RE = re.compile(
+    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
+)
+NAME_STOPWORDS = {
+    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
+    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
+    "контакты", "профиль", "цель", "резюме",
+    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
+    "backend developer", "frontend developer", "fullstack developer", "software engineer",
+    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
+    "top skills", "experience", "education", "languages", "certifications",
+    "skills & endorsements", "endorsements",
+    "university", "state university", "institute", "college", "academy", "school",
+    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
+    "колледж", "школа", "бакалавр", "магистр", "факультет",
+}
+
+_NAME_BAD_WORDS = {
+    "skills", "top skills", "experience", "education", "languages", "certifications",
+    "projects", "summary", "about", "profile", "endorsements",
+    "university", "institute", "college", "academy", "school",
+    "bachelor", "master", "degree", "faculty",
+}
+
+NAME_INSTITUTION_RE = re.compile(
+    r"\b("
+    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
+    r"mathematics|computer science|informatics|physics|economics|management|"
+    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
+    r"математик|информатик|физик|экономик|менеджмент"
+    r")\b",
+    re.I,
+)
+
+_EMAIL_PREFIX_STOP = {
+    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
+}
+
+
+def _prune_fragment_emails(values: List[str]) -> List[str]:
+    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
+    out: List[str] = []
+    for e in uniq:
+        local, domain = e.split("@", 1)
+        drop = False
+        for other in uniq:
+            if other == e:
+                continue
+            ol, od = other.split("@", 1)
+            if od != domain:
+                continue
+            if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
+                drop = True
+                break
+        if not drop:
+            out.append(e)
+    return out
+
+
+def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
+    """
+    Returns (total_years, engineering_years, confidence, debug).
+
+    Logic:
+    1. Calculate TOTAL experience from summary lines.
+    2. Check whether the candidate is primarily a recruiter/HR.
+       - If YES: engineering_years = 0.0 (prevents recruiters from showing up as senior devs).
+       - If NO: engineering_years = total_years (optimistic assumption for genuine developers).
+    """
+    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
+
+    total_years: Optional[float] = None
+    confidence = 0.0
+
+    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
+
+    # 1. Detect whether this is a recruiter:
+    # check the header (first ~15 lines) for HR titles.
+    header_text = "\n".join(lines[:15])
+    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
+    dbg["is_recruiter"] = is_recruiter
+
+    # 2.
Extract Total Duration + if lines: + # Strategy A: Explicit summary + for i, ln in enumerate(lines[:200]): + if AGE_LINE_RE.search(ln): continue + + # Look for summary line + if EXP_HEADER_RE.search(ln): + window = ln + if i + 1 < len(lines): window += " " + lines[i+1] + if i + 2 < len(lines): window += " " + lines[i+2] + + m = EXP_SUMMARY_RE.search(window) + if m: + y = int(m.group("y")) + mm = int(m.group("m")) if m.group("m") else 0 + total_years = float(round(y + (mm / 12.0), 2)) + if 0 <= total_years <= 60: + dbg["method"] = "summary" + dbg["matched"] = m.group(0) + confidence = 0.95 + break + + # Strategy B: Fallback nearby + if total_years is None: + safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)] + for i, ln in enumerate(safe_lines): + if not EXP_HEADER_RE.search(ln): continue + chunk = " ".join(safe_lines[i : i + 12]) + m = EXP_NEARBY_RE.search(chunk) + if m: + y = int(m.group("y")) + mm = int(m.group("m")) if m.group("m") else 0 + val = float(round(y + (mm / 12.0), 2)) + if 0 <= val <= 60: + total_years = val + dbg["method"] = "header_chunk" + dbg["matched"] = m.group(0) + confidence = 0.80 + break + + # 2.5 Timeline/range fallback-reconciliation + # Protects against cases where summary parser catches one short fragment + # while CV has a long timeline. + try: + alt = extract_experience(text or "") + except Exception: + alt = None + if alt and alt.years is not None: + if total_years is None: + total_years = alt.years + confidence = max(confidence, alt.confidence) + dbg["method"] = "timeline_fallback" + dbg["matched"] = "date_ranges" + elif alt.years > (total_years + 1.0): + strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78 + if strong_summary and (alt.years - float(total_years)) > 1.5: + dbg["reconcile"] = "timeline_skip_strong_summary" + else: + total_years = alt.years + confidence = max(confidence, min(0.82, alt.confidence)) + dbg["method"] = "timeline_reconcile" + dbg["matched"] = "date_ranges" + + # 3. Calculate Engineering Years + eng_years = total_years + if is_recruiter: + # If they are a recruiter, their "engineering" experience is effectively 0 + # for the purpose of finding a Developer. 
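+        # total_years itself is still returned unchanged below, so the
+        # candidate's real tenure stays visible; only the engineering
+        # figure is zeroed.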
+ eng_years = 0.0 + + return total_years, eng_years, confidence, dbg + + +def _norm_phone(p: str) -> str: + digits = re.sub(r"\D+", "", p) + if digits.startswith("8") and len(digits) == 11: + digits = "7" + digits[1:] + return "+" + digits if digits else "" + +def _norm_token(s: str) -> str: + return re.sub(r"\s+", " ", s.strip().lower()) + +def safe_json(v) -> str: + return json.dumps(v, ensure_ascii=False) + +def extract_contacts(text: str) -> Dict[str, List[str]]: + emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text or "")) + for m in EMAIL_SPLIT_RE.finditer(text or ""): + prefix = m.group("prefix").strip().lower().strip(".-_") + if not prefix or prefix in _EMAIL_PREFIX_STOP: + continue + if not re.search(r"[._\-\d]", prefix): + continue + tail = m.group("tail").lower() + if "@" not in tail: + continue + local_tail, domain = tail.split("@", 1) + local = f"{prefix}{local_tail}" + if len(local) > 64: + continue + cand = f"{local}@{domain}" + if EMAIL_RE.fullmatch(cand): + emails_set.add(cand) + emails = _prune_fragment_emails(sorted(emails_set)) + phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1)))) + tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text))) + gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text))) + li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text))) + urls = sorted(set(m.group(0) for m in URL_RE.finditer(text))) + return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls} + +def extract_name_guess(text: str) -> Optional[str]: + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + if not lines: + return None + + # 1) HH footer "Name • Резюме обновлено ..." + m = HH_FOOTER_RE.search(text or "") + if m: + cand = m.group("name").strip() + if _looks_like_name_line(cand): + return cand + + # 2) Key-value line: "Name: ..." / "Имя: ..." 
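+    # e.g. an (invented) line "Имя: Петров Иван | backend" keeps only
+    # "Петров Иван": everything after the first separator is cut, then the
+    # remainder is validated as a plausible name line.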
+ for ln in lines[:40]: + m2 = NAME_KV_RE.match(ln) + if m2: + cand = m2.group(2).strip() + cand = re.split(r"[|,/;]", cand)[0].strip() + if _looks_like_name_line(cand): + return cand + + # 3) Name-like in first ~40 lines + for ln in lines[:40]: + if _looks_like_heading_line(ln): + continue + if _looks_like_name_line(ln): + return ln + + # 4) Name-like near the end (pptx exports often put name there) + tail_start = max(0, len(lines) - 60) + for i in range(tail_start, len(lines)): + ln = lines[i] + if _looks_like_heading_line(ln): + continue + ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower() + if NAME_INSTITUTION_RE.search(ctx): + continue + if _looks_like_name_line(ln): + return ln + + return None + + +def _looks_like_heading_line(line: str) -> bool: + low = (line or "").strip().lower() + if not low: + return False + if low in _NAME_BAD_WORDS: + return True + if low.startswith("top skills"): + return True + if len(low.split()) <= 3 and any(w in low for w in ("skills", "experience", "education", "languages")): + return True + return False + + +def _looks_like_name_line(line: str) -> bool: + if not line: + return False + if len(line) > 80: + return False + low = line.lower().strip() + if low in NAME_STOPWORDS: + return False + if _looks_like_heading_line(line): + return False + if re.search(r"\b(resume|cv|резюме)\b", line, re.I): + return False + if NAME_INSTITUTION_RE.search(line): + return False + if not NAME_LINE_RE.match(line.strip()): + return False + return True + +def extract_remote(text: str) -> Optional[bool]: + if not text: + return None + for ln in text.splitlines()[:120]: + if REMOTE_RE.search(ln): + return True + return None + +def extract_english(text: str) -> Optional[str]: + t = text or "" + lines = [ln.strip() for ln in t.splitlines() if ln.strip()] + + # 1) CEFR levels anywhere are accepted. + m = EN_RE.search(t) + if m: + return m.group(1).replace("+", "").upper() + + # 2) Textual levels only when English context is present. + candidate_chunks: List[str] = [] + for i, ln in enumerate(lines): + if EN_LANG_RE.search(ln): + candidate_chunks.append(ln) + if i + 1 < len(lines): + candidate_chunks.append(lines[i + 1]) + + if not candidate_chunks: + return None + + m2 = EN_TEXT_RE.search("\n".join(candidate_chunks)) + if not m2: + return None + word = m2.group(1).lower() + if word in ("native", "fluent", "proficient", "advanced"): + return "C1" + if word.startswith("upper"): + return "B2" + if word == "intermediate": + return "B1" + if word == "elementary": + return "A2" + return None + +def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]: + """ + Extracts roles and skills, but strictly filters out HR/Recruitment context. + """ + lines = text.splitlines() + + # 1. Filter text: Remove lines that talk about hiring/vacancies + clean_lines = [] + for ln in lines: + if not HR_CONTEXT_RE.search(ln): + clean_lines.append(ln) + + clean_text = "\n".join(clean_lines).lower() + + # 2. Extract Skills from clean text only + skills = [] + for s, pat in _SKILL_PATTERNS: + if pat.search(clean_text): + skills.append(normalize_skill(s) or s) + skills = sorted(set(skills)) + + # 3. Extract Roles + # Priority: Header (first 10 lines) + header_text = "\n".join(lines[:10]).lower() + + found_roles = set() + + # Check if Recruiter + if NON_TECH_ROLES_RE.search(header_text): + # If explicit recruiter in header, do NOT add generic tech roles like "backend" + # even if they appear in the text (often describes who they hire). 
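+        # e.g. an "IT Recruiter" resume that says "hiring Python backend
+        # engineers" must not come back with roles=["backend"] (illustrative
+        # phrasing; NON_TECH_ROLES_RE drives the actual detection).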
+ pass + else: + # Normal extraction + for r in ROLES: + pats = _ROLE_PATTERNS.get(r, []) + if any(p.search(clean_text) for p in pats): + # extra guard: devops requires explicit evidence, not just CI/CD mentions + if r == "devops": + if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I): + continue + found_roles.add(r) + + return sorted(list(found_roles)), skills + +def norm_pipe(tokens: List[str]) -> str: + toks = [_norm_token(t) for t in tokens if _norm_token(t)] + uniq = sorted(set(toks)) + return "|" + "|".join(uniq) + "|" if uniq else "|" + +def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]: + dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []} + lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()] + if not lines: + return None, None, 0.0, dbg + + candidates: List[Tuple[int, str, bool, bool]] = [] + for i, ln in enumerate(lines): + has_hint = SALARY_HINT_RE.search(ln) is not None + has_pay = PAY_TOKEN_RE.search(ln) is not None + if not has_hint and not has_pay: + continue + if SALARY_NOISE_RE.search(ln) and not has_hint: + continue + candidates.append((i, ln, has_hint, has_pay)) + + if not candidates: + return None, None, 0.0, dbg + + has_hint = any(x[2] for x in candidates) + if not has_hint: + # Inline pay without "salary" is allowed only near header/contact block. + candidates = [x for x in candidates if x[0] < 15] + if not candidates: + return None, None, 0.0, dbg + + scan_chunks: List[str] = [] + for i, ln, hint, _ in candidates: + chunk = ln + if hint and (i + 1) < len(lines): + chunk = f"{chunk} {lines[i + 1]}" + scan_chunks.append(chunk) + dbg["used_lines"].append(ln) + if hint: + dbg["hint_lines"] += 1 + dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk)) + + nums: List[int] = [] + for chunk in scan_chunks: + for m in NUM_RE.finditer(chunk): + val = None + if m.group(1) and m.group(2): + val = int(m.group(1)) * 1000 + elif m.group(3): + val = int(re.sub(r"\s+", "", m.group(3))) + elif m.group(4): + val = int(m.group(4)) + if val and 20_000 <= val <= 30_000_000: + nums.append(val) + dbg["numbers"].append(val) + + if not nums: + return None, None, 0.0, dbg + + nums = sorted(nums) + salary_min = nums[0] + salary_max = nums[-1] if len(nums) > 1 else nums[0] + + if dbg["hint_lines"] > 0: + conf = 0.82 if dbg["currency_hits"] > 0 else 0.70 + else: + conf = 0.58 if dbg["currency_hits"] > 0 else 0.0 + + if salary_max > salary_min * 4: + conf -= 0.12 + if len(nums) == 1: + conf -= 0.06 + + conf = max(0.0, min(conf, 0.9)) + if conf < 0.45: + return None, None, conf, dbg + return salary_min, salary_max, conf, dbg + +def extract_location_best_effort(text: str) -> Optional[str]: + if not text: + return None + + def _clean_loc(val: str) -> str: + return re.sub(r"\s+", " ", (val or "").strip(" |,;")) + + def _is_loc_like(val: str, *, allow_single: bool = False) -> bool: + v = _clean_loc(val) + if not v or len(v) < 3 or len(v) > 90: + return False + if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v): + return False + if SECTION_HEADER_RE.match(v): + return False + if LOCATION_CITY_COUNTRY_RE.match(v): + return True + if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v): + return True + return False + + patterns = [ + re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"), + re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"), + re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"), + ] + for p in patterns: + m = 
p.search(text) + if m: + val = _clean_loc(m.group(2)) + if _is_loc_like(val, allow_single=True): + return val + + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + head: List[str] = [] + for ln in lines[:60]: + if SECTION_HEADER_RE.match(ln): + low = ln.lower() + if low in ("contacts", "contact", "contact info"): + continue + break + head.append(ln) + + for ln in head: + parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()] + for seg in parts: + if _is_loc_like(seg): + return _clean_loc(seg) + return None diff --git a/extract/pdf_extract.py b/extract/pdf_extract.py new file mode 100644 index 0000000..cb2da4b --- /dev/null +++ b/extract/pdf_extract.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import re +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Tuple + +try: # optional dependency + from pypdf import PdfReader # type: ignore +except Exception: # pragma: no cover + try: + from PyPDF2 import PdfReader # type: ignore + except Exception: # pragma: no cover + PdfReader = None # type: ignore + +try: # optional dependency + from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore +except Exception: # pragma: no cover + pdfminer_extract_text = None # type: ignore + + +@dataclass +class PdfExtractResult: + text: str + pages: List[dict] + method: str + score: float + flags: List[str] + + +_SECTION_HINTS = [ + "experience", "work experience", "skills", "education", "projects", "summary", "about", + "опыт работы", "навыки", "образование", "проекты", "о себе", +] + + +def _which_pdftotext() -> Optional[str]: + exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe") + return exe + + +def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str: + exe = _which_pdftotext() + if not exe: + return "" + cmd = [exe] + if layout: + cmd.append("-layout") + cmd += ["-nopgbrk", str(path), "-"] + try: + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout_sec, + check=False, + text=True, + encoding="utf-8", + errors="ignore", + ) + return (p.stdout or "").strip() + except Exception: + return "" + + +def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]: + if PdfReader is None: + return [] + try: + reader = PdfReader(str(path), strict=False) + except Exception: + return [] + pages: List[dict] = [] + for i, page in enumerate(getattr(reader, "pages", [])): + if max_pages and i >= max_pages: + break + try: + text = page.extract_text() or "" + except Exception: + text = "" + pages.append({"page": i + 1, "text": text}) + return pages + + +def _extract_pdfminer(path: Path) -> str: + if pdfminer_extract_text is None: + return "" + try: + return (pdfminer_extract_text(str(path)) or "").strip() + except Exception: + return "" + + +def _quality_score(text: str) -> Tuple[float, List[str]]: + flags: List[str] = [] + if not text: + return 0.0, ["empty"] + + total = len(text) + letters = sum(ch.isalpha() for ch in text) + spaces = text.count(" ") + alpha_ratio = letters / max(1, total) + space_ratio = spaces / max(1, total) + + words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text) + avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0 + + lines = [ln for ln in text.splitlines() if ln.strip()] + long_lines = [ln for ln in lines if len(ln) > 200] + long_line_ratio = (len(long_lines) / max(1, len(lines))) if lines else 0.0 + + glued_hits = 
len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text)) + + section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower()) + + score = 0.0 + if alpha_ratio >= 0.45: + score += 2.0 + elif alpha_ratio >= 0.30: + score += 1.0 + else: + flags.append("low_alpha") + + if 0.10 <= space_ratio <= 0.28: + score += 1.0 + else: + flags.append("odd_spacing") + + if 3.5 <= avg_word_len <= 9.0: + score += 1.0 + else: + flags.append("odd_word_len") + + if long_line_ratio <= 0.06: + score += 1.0 + else: + flags.append("long_lines") + + if glued_hits <= 6: + score += 1.0 + else: + flags.append("glued_text") + + if section_hits >= 2: + score += 1.0 + elif section_hits == 1: + score += 0.5 + + if total < 200: + flags.append("short_text") + + if alpha_ratio < 0.08 or total < 120: + flags.append("scan_like") + + return score, flags + + +def deglue_text(text: str) -> str: + if not text: + return text + t = text + t = re.sub(r"([a-zа-я])([A-ZА-Я])", r"\1 \2", t) + t = re.sub(r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2", t) + t = re.sub(r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2", t) + t = re.sub(r"([:;])([A-Za-zА-Яа-я])", r"\1 \2", t) + return t + + +def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult: + candidates: List[Tuple[str, str]] = [] + + txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec) + if txt_layout: + candidates.append(("pdftotext_layout", txt_layout)) + + txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec) + if txt_plain: + candidates.append(("pdftotext_plain", txt_plain)) + + txt_pypdf = "" + if PdfReader is not None: + pages = _extract_pages_pypdf(path) + if pages: + txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text")) + if txt_pypdf: + candidates.append(("pypdf", txt_pypdf)) + + txt_pdfminer = _extract_pdfminer(path) + if txt_pdfminer: + candidates.append(("pdfminer", txt_pdfminer)) + + if not candidates: + return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"]) + + best_method = "none" + best_text = "" + best_score = -1.0 + best_flags: List[str] = [] + for method, text in candidates: + score, flags = _quality_score(text) + if score > best_score: + best_score = score + best_method = method + best_text = text + best_flags = flags + + pages = _extract_pages_pypdf(path) + best_text = deglue_text(best_text) + return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags) diff --git a/extract/sections.py b/extract/sections.py new file mode 100644 index 0000000..8432149 --- /dev/null +++ b/extract/sections.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import re +from typing import Dict, List, Optional, Tuple + + +_SECTION_PATTERNS: dict[str, List[re.Pattern]] = { + "contacts": [ + re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I), + ], + "about": [ + re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I), + ], + "skills": [ + re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I), + ], + "experience": [ + re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I), + ], + "education": [ + re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I), + ], + "projects": [ + re.compile(r"^\s*(projects?|проекты)\s*$", re.I), + ], + "languages": [ + re.compile(r"^\s*(languages?|языки)\s*$", re.I), + ], + "certifications": [ + 
re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I), + ], + "publications": [ + re.compile(r"^\s*(publications?|публикации)\s*$", re.I), + ], +} + + +def _match_header(line: str) -> Optional[str]: + for key, patterns in _SECTION_PATTERNS.items(): + for rx in patterns: + if rx.match(line): + return key + return None + + +def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]: + lines = [ln.strip() for ln in (clean_text or "").splitlines()] + sections: Dict[str, List[str]] = {"header": []} + current = "header" + + for ln in lines: + if not ln: + continue + key = _match_header(ln) + if key: + current = key + sections.setdefault(current, []) + continue + sections.setdefault(current, []).append(ln) + + out: Dict[str, str] = {} + for k, vals in sections.items(): + text = "\n".join(vals).strip() + if text: + out[k] = text + return out + + +def sections_present(sections: Dict[str, str]) -> List[str]: + return sorted([k for k, v in (sections or {}).items() if v and k != "header"]) diff --git a/extract/templates/__init__.py b/extract/templates/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/extract/templates/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/extract/templates/generic.py b/extract/templates/generic.py new file mode 100644 index 0000000..e6712a4 --- /dev/null +++ b/extract/templates/generic.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + remote = extract_remote(text) + english = extract_english(text) + roles, skills = extract_roles_skills(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "generic_heur", + } diff --git a/extract/templates/hh.py b/extract/templates/hh.py new file mode 100644 index 0000000..418de83 --- /dev/null +++ b/extract/templates/hh.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + contacts_text = _pick(sections, "contacts", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + 
exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(contacts_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "hh_template", + } diff --git a/extract/templates/hh_ru.py b/extract/templates/hh_ru.py new file mode 100644 index 0000000..d6f1c7b --- /dev/null +++ b/extract/templates/hh_ru.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import re +from typing import Any, Dict, Optional + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)") +_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)") +_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)") +_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)") + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def _find_first(regex: re.Pattern, text: str) -> Optional[str]: + for ln in text.splitlines(): + m = regex.search(ln) + if m: + val = m.group(1).strip() + val = re.split(r"[|;/]", val)[0].strip() + if 2 <= len(val) <= 80: + return val + return None + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + contacts_text = _pick(sections, "contacts", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(contacts_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + desired_title = _find_first(_DESIRED_RE, clean_text) + specializations = _find_first(_SPEC_RE, clean_text) + schedule = _find_first(_SCHEDULE_RE, clean_text) + employment = _find_first(_EMPLOYMENT_RE, clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": 
exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "desired_title": desired_title, + "specializations": specializations, + "employment_type": employment, + "schedule": schedule, + "parse_method": "hh_template", + } diff --git a/extract/templates/linkedin.py b/extract/templates/linkedin.py new file mode 100644 index 0000000..294ad00 --- /dev/null +++ b/extract/templates/linkedin.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(clean_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "linkedin_template", + } diff --git a/extract/templates/one_page.py b/extract/templates/one_page.py new file mode 100644 index 0000000..5282df5 --- /dev/null +++ b/extract/templates/one_page.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + roles, skills = extract_roles_skills(text) + remote = extract_remote(text) + english = extract_english(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": 
"one_page_template", + } diff --git a/extract/templates/one_page_en.py b/extract/templates/one_page_en.py new file mode 100644 index 0000000..696e67e --- /dev/null +++ b/extract/templates/one_page_en.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.templates.one_page import parse_resume as _parse + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + out = _parse(clean_text, sections) + out["parse_method"] = "one_page_en" + return out diff --git a/extract/templates/one_page_ru.py b/extract/templates/one_page_ru.py new file mode 100644 index 0000000..24610cf --- /dev/null +++ b/extract/templates/one_page_ru.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.templates.one_page import parse_resume as _parse + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + out = _parse(clean_text, sections) + out["parse_method"] = "one_page_ru" + return out diff --git a/extract/templates/pptx_export.py b/extract/templates/pptx_export.py new file mode 100644 index 0000000..c0c8935 --- /dev/null +++ b/extract/templates/pptx_export.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + roles, skills = extract_roles_skills(text) + remote = extract_remote(text) + english = extract_english(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "pptx_template", + } diff --git a/extract/text_extract.py b/extract/text_extract.py new file mode 100644 index 0000000..17ed285 --- /dev/null +++ b/extract/text_extract.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import os +from pathlib import Path +import logging +from bs4 import BeautifulSoup + +try: # optional dependency for PDF fallback + from pypdf import PdfReader as _PdfReader # type: ignore +except Exception: # pragma: no cover - optional import + try: + from PyPDF2 import PdfReader as _PdfReader # type: ignore + except Exception: # pragma: no cover + _PdfReader = None # type: ignore + +def _read_bytes(path: Path) -> bytes: + return path.read_bytes() + +def extract_text_from_txt(path: Path) -> str: + data = _read_bytes(path) + for enc in ("utf-8", "utf-16", "cp1251", "latin-1"): + try: + return data.decode(enc, errors="ignore") + except Exception: + continue + return data.decode("utf-8", errors="ignore") + +def extract_text_from_html(path: Path) -> str: + html = extract_text_from_txt(path) + soup = BeautifulSoup(html, "lxml") + return soup.get_text("\n", 
strip=True) + +def extract_text_from_docx(path: Path) -> str: + from docx import Document + doc = Document(str(path)) + parts = [] + for p in doc.paragraphs: + if p.text and p.text.strip(): + parts.append(p.text.strip()) + for table in doc.tables: + for row in table.rows: + cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()] + if cells: + parts.append(" | ".join(cells)) + return "\n".join(parts) + +_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40")) +# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..." +logging.getLogger("pypdf").setLevel(logging.ERROR) +logging.getLogger("PyPDF2").setLevel(logging.ERROR) + + +def extract_text_from_pdf(path: Path) -> str: + """ + Lightweight PDF extractor; prefers optional PyPDF-based readers over heavy pdfminer. + Reads at most PDF_PAGE_LIMIT pages (default 40) to avoid pathological files. + """ + if _PdfReader is None: + raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)") + + try: + reader = _PdfReader(str(path), strict=False) + except Exception as exc: # pragma: no cover - pdf parser edge cases + raise RuntimeError(f"PDF read failed: {exc}") from exc + + parts = [] + for idx, page in enumerate(getattr(reader, "pages", [])): + if _PDF_PAGE_LIMIT and idx >= _PDF_PAGE_LIMIT: + break + try: + text = page.extract_text() # type: ignore[attr-defined] + except Exception: + text = None + if text: + parts.append(text) + return "\n".join(parts) + +def extract_text_from_doc_best_effort(path: Path) -> str: + # .doc requires external tools; best-effort if textract installed + try: + import textract # type: ignore + b = textract.process(str(path)) + return b.decode("utf-8", errors="ignore") + except Exception: + return "" + +def extract_text(path: Path) -> str: + ext = path.suffix.lower() + if ext in (".txt", ".log"): + return extract_text_from_txt(path) + if ext in (".html", ".htm"): + return extract_text_from_html(path) + if ext == ".docx": + return extract_text_from_docx(path) + if ext == ".pdf": + return extract_text_from_pdf(path) + if ext == ".doc": + return extract_text_from_doc_best_effort(path) + return "" diff --git a/importers/file_scan.py b/importers/file_scan.py new file mode 100644 index 0000000..66a2e8a --- /dev/null +++ b/importers/file_scan.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Dict, Iterator + +RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"} + +def iter_files(root: Path) -> Iterator[Dict]: + for p in root.rglob("*"): + if p.is_file() and p.suffix.lower() in RESUME_EXTS: + yield { + "origin_type": "file_scan", + "export_path": str(root), + "chat_title": None, + "message_id": None, + "message_date": None, + "message_text": "", + "file_path": str(p.resolve()), + "original_name": p.name, + "extra": {}, + } diff --git a/importers/telegram_html.py b/importers/telegram_html.py new file mode 100644 index 0000000..5336996 --- /dev/null +++ b/importers/telegram_html.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Dict, Iterator, List, Optional + +from bs4 import BeautifulSoup + +RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"} + +def find_messages_html(root: Path) -> List[Path]: + return [p for p in root.rglob("messages*.html") if p.is_file()] + +def iter_artifacts(messages_html: Path) -> Iterator[Dict]: + html = messages_html.read_text(encoding="utf-8", errors="ignore") + soup = BeautifulSoup(html, "lxml") + + chat_title 
= None
    h = soup.find(class_=re.compile(r"page_header", re.I))
    if h:
        chat_title = h.get_text(" ", strip=True)
    chat_title = chat_title or messages_html.parent.name

    for msg in soup.select(".message.default.clearfix, .message"):
        message_id = msg.get("id") or None
        date_div = msg.select_one(".date")
        msg_date = date_div.get("title") if date_div else None

        text_div = msg.select_one(".text")
        msg_text = text_div.get_text("\n", strip=True) if text_div else ""

        file_path = None
        original_name = None
        for a in msg.find_all("a", href=True):
            href = a["href"]
            p = (messages_html.parent / href).resolve()
            if p.exists() and p.suffix.lower() in RESUME_EXTS:
                file_path = str(p)
                original_name = p.name
                break

        if file_path:
            yield {
                "origin_type": "telegram_html",
                "export_path": str(messages_html.parent),
                "chat_title": chat_title,
                "message_id": str(message_id) if message_id else None,
                "message_date": msg_date,
                "message_text": msg_text or "",
                "file_path": file_path,
                "original_name": original_name,
                "extra": {"html_path": str(messages_html)},
            }
        else:
            if msg_text and len(msg_text.strip()) >= 500:
                yield {
                    "origin_type": "message_text",
                    "export_path": str(messages_html.parent),
                    "chat_title": chat_title,
                    "message_id": str(message_id) if message_id else None,
                    "message_date": msg_date,
                    "message_text": msg_text,
                    "file_path": None,
                    "original_name": None,
                    "extra": {"html_path": str(messages_html)},
                }
diff --git a/importers/telegram_json.py b/importers/telegram_json.py
new file mode 100644
index 0000000..5cc9985
--- /dev/null
+++ b/importers/telegram_json.py
@@ -0,0 +1,73 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional

RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}

def find_result_json(root: Path) -> List[Path]:
    return list(root.rglob("result.json"))

def _text_field_to_str(text_field) -> str:
    if isinstance(text_field, str):
        return text_field
    if isinstance(text_field, list):
        parts = []
        for item in text_field:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and "text" in item:
                parts.append(str(item["text"]))
        return "".join(parts)
    return ""

def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))

    chats = []
    if isinstance(data, dict):
        chats_field = data.get("chats") or []
        if isinstance(chats_field, dict):
            # modern exports: {"chats": {"list": [...]}}
            chats = chats_field.get("list", []) or []
        elif isinstance(chats_field, list):
            # some exports keep the list directly under "chats"; the previous
            # one-liner called .get() on the list and crashed before its
            # fallback branch could run
            chats = chats_field
    for chat in chats:
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages", []) or []
        for msg in messages:
            msg_id = str(msg.get("id") or "")
            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            text = _text_field_to_str(msg.get("text", ""))

            file_rel = msg.get("file") or None
            file_path = None
            original_name = None
            if file_rel:
                p = (result_json.parent / file_rel).resolve()
                if p.exists() and p.suffix.lower() in RESUME_EXTS:
                    file_path = str(p)
                    original_name = p.name

            if file_path:
                yield {
                    "origin_type": "telegram_json",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text or "",
                    "file_path": file_path,
                    "original_name": original_name,
                    "extra": {"json_path": str(result_json)},
                }
            else:
                # message-only resume paste (heuristic)
                if text and len(text.strip()) >= 500:
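                    # A message this long with no attachment is most likely a
                    # resume pasted directly into the chat; anything shorter
                    # is treated as ordinary chatter and skipped.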
yield { + "origin_type": "message_text", + "export_path": str(result_json.parent), + "chat_title": chat_title, + "message_id": msg_id, + "message_date": str(msg_date) if msg_date is not None else None, + "message_text": text, + "file_path": None, + "original_name": None, + "extra": {"json_path": str(result_json)}, + } diff --git a/normalize.py b/normalize.py new file mode 100644 index 0000000..ae7d21b --- /dev/null +++ b/normalize.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import re +from typing import Dict, List, Optional, Tuple + + +_SKILL_SYNONYMS: Dict[str, List[str]] = { + "python": ["py"], + "javascript": ["js", "node", "nodejs", "java script", "java-script"], + "typescript": ["ts", "type script", "type-script"], + "postgresql": ["postgres", "psql"], + "kubernetes": ["k8s"], + "docker": [], + "fastapi": [], + "django": ["drf", "django rest framework"], + "flask": [], + "golang": ["go"], + "c++": ["cpp"], + "c#": ["csharp"], + "redis": [], + "kafka": [], + "rabbitmq": [], + "grpc": [], + "rest": [], +} + +_SKILL_STOP = {"rest", "http", "json", "xml", "oop"} + +_ROLE_SYNONYMS: Dict[str, List[str]] = { + "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"], + "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"], + "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"], + "devops": ["sre", "site reliability"], + "qa": ["tester", "тестировщик"], + "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"], + "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"], +} + + +def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]: + alias = {} + for canonical, al in src.items(): + alias[canonical] = canonical + for a in al: + alias[a] = canonical + return {k.lower(): v for k, v in alias.items()} + + +_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS) +_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS) + + +def _normalize_skill_surface(token: str) -> str: + t = (token or "").strip().lower() + if not t: + return "" + t = t.replace("/", " ") + t = re.sub(r"[_\-]+", " ", t) + t = re.sub(r"\s+", " ", t).strip() + + # "java script", "type script", "postgre sql", "graph ql", "g rpc" + t = re.sub(r"\bjava\s+script\b", "javascript", t) + t = re.sub(r"\btype\s+script\b", "typescript", t) + t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t) + t = re.sub(r"\bgraph\s+ql\b", "graphql", t) + t = re.sub(r"\bg\s+rpc\b", "grpc", t) + t = re.sub(r"\bdocker\s+compose\b", "docker compose", t) + return t + + +def normalize_skill(token: str) -> Optional[str]: + t = _normalize_skill_surface(token) + if not t: + return None + + # Avoid false-positive java from "javascript" + if t == "java" and re.search(r"\bjava\s*script\b", _normalize_skill_surface(token)): + return "javascript" + + return _SKILL_ALIAS.get(t, t) + + +def normalize_skills(skills: List[str]) -> List[str]: + out: List[str] = [] + seen = set() + for s in skills or []: + canon = normalize_skill(s) + if not canon or canon in seen: + continue + seen.add(canon) + out.append(canon) + return out + + +def normalize_role(token: str) -> Optional[str]: + t = (token or "").strip().lower() + if not t: + return None + return _ROLE_ALIAS.get(t, t) + + +def normalize_roles(roles: List[str]) -> List[str]: + out: List[str] = [] + seen = set() + for r in roles or []: + canon = normalize_role(r) + if not canon or canon in seen: + continue + seen.add(canon) + out.append(canon) + return out + + +def 
split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    skills_section = (sections or {}).get("skills", "").lower()
    experience_section = (sections or {}).get("experience", "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        score = 1.0
        if s in skills_section:
            score += 2.2
        if s in experience_section:
            score += 1.2
        count = len(re.findall(r"\b" + re.escape(s) + r"\b", text))
        score += min(2.5, count * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    secondary = [s for s in ranked if s not in primary]
    return primary, secondary


def normalize_location(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    t = raw.strip()
    low = t.lower()
    if low in ("москва", "moscow", "moscow, russia"):
        return "Moscow, Russia"
    if low in ("санкт-петербург", "спб", "питер", "saint petersburg"):
        return "Saint Petersburg, Russia"
    return t


def find_skills_in_text(text: str) -> List[str]:
    if not text:
        return []
    found: List[str] = []
    seen = set()
    low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        key = _normalize_skill_surface(alias)
        if key in seen:
            continue
        if re.search(r"\b" + re.escape(key) + r"\b", low):
            if canon not in seen:
                found.append(canon)
                seen.add(canon)
    return found
diff --git a/pdf_merge.py b/pdf_merge.py
new file mode 100644
index 0000000..b2b31af
--- /dev/null
+++ b/pdf_merge.py
@@ -0,0 +1,45 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterable, List, Optional

from pypdf import PdfReader, PdfWriter


def merge_pdfs(pdf_paths: Iterable[str | Path], out_pdf_path: str | Path) -> dict:
    out_pdf_path = Path(out_pdf_path)
    out_pdf_path.parent.mkdir(parents=True, exist_ok=True)

    writer = PdfWriter()

    merged: List[str] = []
    skipped: List[str] = []

    for p in pdf_paths:
        path = Path(p)
        try:
            reader = PdfReader(str(path))
            # just append the pages one after another, in input order
            for page in reader.pages:
                writer.add_page(page)
            merged.append(str(path))
        except Exception:
            skipped.append(str(path))

    if merged:
        with out_pdf_path.open("wb") as f:
            writer.write(f)

    return {
        "out_pdf": str(out_pdf_path),
        "merged_count": len(merged),
        "skipped_count": len(skipped),
        "merged_files": merged,
        "skipped_files": skipped,
    }


def merge_all_pdfs_in_dir(files_dir: str | Path, out_pdf_path: str | Path) -> dict:
    files_dir = Path(files_dir)
    # both globs are needed on case-sensitive filesystems; on case-insensitive
    # ones the second glob re-yields the same files
    pdfs = sorted(files_dir.rglob("*.pdf")) + sorted(files_dir.rglob("*.PDF"))
    return merge_pdfs(pdfs, out_pdf_path)
diff --git a/pipeline.py b/pipeline.py
new file mode 100644
index 0000000..d7f5b86
--- /dev/null
+++ b/pipeline.py
@@ -0,0 +1,1990 @@
from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
import subprocess
import uuid
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from tg_resume_db.util import Logger, utc_iso
from tg_resume_db.extract.text_extract import extract_text as extract_text_generic
from tg_resume_db.extract.clean import normalize_text, to_fts_text
from tg_resume_db.extract.pdf_extract
import extract_pdf_best +from tg_resume_db.extract.llm import ( + LLMExtraction, + llm_extract_profile, + llm_parse_enabled, + llm_review_profile, +) +from tg_resume_db.extract.doc_type import detect_doc_type +from tg_resume_db.extract.sections import split_sections, sections_present +from tg_resume_db.extract.experience_timeline import extract_positions, positions_to_dicts +from tg_resume_db.extract.parse import ( + extract_contacts as extract_contacts_raw, + extract_name_guess, + extract_remote, + extract_english, + extract_salary, + extract_location_best_effort, + extract_experience_years, # Updated function + norm_pipe, + safe_json, +) +from tg_resume_db.extract.templates import generic as tpl_generic +from tg_resume_db.extract.templates import hh_ru as tpl_hh +from tg_resume_db.extract.templates import linkedin as tpl_linkedin +from tg_resume_db.extract.templates import one_page_en as tpl_one_page_en +from tg_resume_db.extract.templates import one_page_ru as tpl_one_page_ru +from tg_resume_db.extract.templates import pptx_export as tpl_pptx +from tg_resume_db.normalize import ( + normalize_skills, + normalize_roles, + split_skills_primary_secondary, + normalize_location, +) +from tg_resume_db.dedup.simhash import ( + sha256_file, + sha1_str, + simhash64, + simhash_bands, + hamming64, +) +from tg_resume_db.importers.telegram_json import find_result_json, iter_artifacts as iter_json_artifacts +from tg_resume_db.importers.telegram_html import find_messages_html, iter_artifacts as iter_html_artifacts +from tg_resume_db.importers.file_scan import iter_files as iter_file_scan + +_PARSE_VERSION = "v3_llm_review" + + +# ----------------------------- +# helpers: make everything text +# ----------------------------- + +def coerce_text(x: Any) -> str: + """Turn Telegram-export weird structures (dict/list/bytes) into plain text.""" + if x is None: + return "" + if isinstance(x, str): + return x + if isinstance(x, bytes): + for enc in ("utf-8", "utf-16", "cp1251", "latin-1"): + try: + return x.decode(enc, errors="ignore") + except Exception: + pass + return x.decode("utf-8", errors="ignore") + + if isinstance(x, list): + parts: List[str] = [] + for item in x: + if isinstance(item, dict): + parts.append(coerce_text(item.get("text") or item.get("href") or "")) + else: + parts.append(coerce_text(item)) + return "".join(parts) + + if isinstance(x, dict): + if "text" in x: + return coerce_text(x["text"]) + if "content" in x: + return coerce_text(x["content"]) + return json.dumps(x, ensure_ascii=False) + + return str(x) + + +# ----------------------------- +# PDF extraction: prefer pdftotext +# ----------------------------- + +def _which_pdftotext() -> Optional[str]: + if os.environ.get("PDFTOTEXT_ENABLE", "0").lower() not in ("1", "true", "yes"): + return None + exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe") + return exe + + +def extract_text_from_pdf_pdftotext(fp: Path, timeout_sec: int = 25) -> str: + exe = _which_pdftotext() + if not exe: + return "" + cmd = [exe, "-layout", "-nopgbrk", str(fp), "-"] + try: + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout_sec, + check=False, + text=True, + encoding="utf-8", + errors="ignore", + ) + return (p.stdout or "").strip() + except subprocess.TimeoutExpired: + return "" + except Exception: + return "" + + +def extract_text_resilient(fp: Path, log: Optional[Logger] = None, timeout_sec: int = 25) -> str: + ext = fp.suffix.lower() + + if ext == ".pdf": + out = extract_text_from_pdf_pdftotext(fp, 
timeout_sec=timeout_sec)
        if out:
            return out
        try:
            return extract_text_generic(fp) or ""
        except Exception as e:
            if log:
                log.warn("[extract] pdf failed - skipped", {"file": str(fp), "err": repr(e)})
            return ""

    try:
        return extract_text_generic(fp) or ""
    except Exception as e:
        if log:
            log.warn("[extract] file failed - skipped", {"file": str(fp), "err": repr(e)})
        return ""


# -----------------------------
# contacts normalization + phone/tg cleanup
# -----------------------------

_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+\-]{1,64}@[a-zA-Z0-9.\-]{1,253}\.[a-zA-Z]{2,}\b")
_EMAIL_SPLIT_RE = re.compile(
    r"(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
# NOTE: the angle-bracketed spans in this block were destroyed by markup
# stripping. The named groups above are certain (the code below calls
# m.group("prefix")/m.group("tail")); the four patterns/sets that follow are a
# best-effort reconstruction from how the rest of the module uses them, not
# the original literal values.
_TG_AT_RE = re.compile(r"(?<![\w@])@([A-Za-z0-9_]{5,32})\b")
_TG_LINK_RE = re.compile(r"(?:https?://)?(?:t|telegram)\.me/([A-Za-z0-9_]{5,32})", re.I)
_PHONE_CHUNK_RE = re.compile(r"(?:\+|\b[78])[\d\s().\-]{8,16}\d")
_TG_STOP = {"gmail", "mail", "yandex", "outlook", "icloud", "telegram", "channel", "username"}
_EMAIL_PREFIX_STOP = {"e", "mail", "email", "contact", "contacts"}


def _norm_email(s: str) -> Optional[str]:
    s = s.strip().lower()
    if _EMAIL_RE.fullmatch(s):
        return s
    return None


def _recover_split_emails(text: str) -> List[str]:
    out: List[str] = []
    for m in _EMAIL_SPLIT_RE.finditer(text or ""):
        prefix = (m.group("prefix") or "").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = (m.group("tail") or "").strip().lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            continue
        cand = f"{local}@{domain}"
        if _EMAIL_RE.fullmatch(cand):
            out.append(cand)
    return out


def _prune_fragment_emails(values: List[str]) -> List[str]:
    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
    out: List[str] = []
    for e in uniq:
        local, domain = e.split("@", 1)
        drop = False
        for other in uniq:
            if other == e:
                continue
            ol, od = other.split("@", 1)
            if od != domain:
                continue
            if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
                drop = True
                break
        if not drop:
            out.append(e)
    return out


def _looks_like_month_range(digits: str) -> bool:
    if len(digits) == 12:
        try:
            mm1 = int(digits[0:2]); yyyy1 = int(digits[2:6])
            mm2 = int(digits[6:8]); yyyy2 = int(digits[8:12])
            if 1 <= mm1 <= 12 and 1900 <= yyyy1 <= 2100 and 1 <= mm2 <= 12 and 1900 <= yyyy2 <= 2100:
                return True
        except Exception:
            return False
    return False


def _norm_phone(s: str) -> Optional[str]:
    raw = s.strip()
    if not (raw.startswith("+") or raw.startswith("7") or raw.startswith("8")):
        return None

    digits = re.sub(r"\D+", "", raw)
    if len(digits) < 10 or len(digits) > 15:
        return None

    if len(set(digits)) <= 2:
        return None

    if _looks_like_month_range(digits):
        return None

    if len(digits) == 12 and digits.startswith(("2", "3", "4", "5", "6", "7", "8", "9")):
        if digits.count("0") >= 6:
            return None

    return "+" + digits


def _norm_tg_handle(handle: str) -> Optional[str]:
    h = handle.strip().lstrip("@").lower()
    if not (5 <= len(h) <= 32):
        return None
    if not re.fullmatch(r"[a-z0-9_]+", h):
        return None
    if h.isdigit():
        return None
    if h in _TG_STOP:
        return None
    return h


def normalize_contacts(raw: Any, clean_text: str) -> Dict[str, List[str]]:
    out: Dict[str, List[str]] = {"email": [], "phone": [], "tg": [], "github": [], "linkedin": []}

    if isinstance(raw, dict):
        key_map = {
            "emails": "email", "email": "email",
            "phones": "phone", "phone": "phone",
            "telegram": "tg", "tg": "tg",
            "github": "github",
            "linkedin": "linkedin",
        }
        for k, v in raw.items():
            nk = key_map.get(k)
            if not nk:
                continue
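            # Contact values arrive either as a single string or as a list of
            # strings/dicts depending on the upstream extractor; coerce every
            # shape to plain text here, then let the per-type normalizers
            # below filter and canonicalize (e.g. _norm_phone turns
            # "+7 (912) 345-67-89" into "+79123456789").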
vals = [coerce_text(x) for x in v] if isinstance(v, list) else [coerce_text(v)] + out[nk].extend(vals) + + for e in _EMAIL_RE.findall(clean_text): + out["email"].append(e) + for e in _recover_split_emails(clean_text): + out["email"].append(e) + + for chunk in _PHONE_CHUNK_RE.findall(clean_text): + out["phone"].append(chunk) + + for h in _TG_AT_RE.findall(clean_text): + out["tg"].append(h) + for h in _TG_LINK_RE.findall(clean_text): + out["tg"].append(h) + + def uniq(seq: List[str]) -> List[str]: + seen = set() + res = [] + for x in seq: + if x in seen: + continue + seen.add(x) + res.append(x) + return res + + emails: List[str] = [] + for s in out["email"]: + n = _norm_email(s) + if n: + emails.append(n) + + phones: List[str] = [] + for s in out["phone"]: + n = _norm_phone(s) + if n: + phones.append(n) + + tgs: List[str] = [] + for s in out["tg"]: + n = _norm_tg_handle(s) + if n: + tgs.append(n) + + out["email"] = uniq(_prune_fragment_emails(emails)) + out["phone"] = uniq(phones) + out["tg"] = uniq(tgs) + + out["github"] = uniq([coerce_text(x).strip() for x in out["github"] if coerce_text(x).strip()]) + out["linkedin"] = uniq([coerce_text(x).strip() for x in out["linkedin"] if coerce_text(x).strip()]) + + return out + + +# ----------------------------- +# LLM helpers +# ----------------------------- + +_LANGUAGE_CANON = { + "python", + "java", + "kotlin", + "go", + "golang", + "c++", + "cpp", + "c#", + "javascript", + "typescript", + "ruby", + "php", + "swift", + "objective-c", + "scala", + "rust", + "dart", +} + +_LANGUAGE_ALIAS = { + "golang": "go", + "cpp": "c++", + "c plus plus": "c++", + "csharp": "c#", + "c#": "c#", + "js": "javascript", + "ts": "typescript", +} + + +_JAVA_REAL_RE = re.compile(r"\b(java\s*(8|11|17|21)|spring|jvm|maven|gradle|jakarta)\b", re.I) +_JAVASCRIPT_RE = re.compile(r"\b(java\s*script|javascript|js)\b", re.I) + + +def _norm_lang_token(token: str) -> Optional[str]: + raw = (token or "").strip().lower() + if not raw: + return None + norm = _LANGUAGE_ALIAS.get(raw, raw) + if norm in _LANGUAGE_CANON: + # collapse golang -> go, cpp -> c++ + if norm == "golang": + norm = "go" + if norm == "cpp": + norm = "c++" + return norm + return None + + +def _normalize_language_list(values: List[str]) -> List[str]: + seen = set() + out: List[str] = [] + for v in values or []: + tok = _norm_lang_token(v) + if not tok or tok in seen: + continue + seen.add(tok) + out.append(tok) + return out + + +def _drop_false_java( + skills: List[str], + primary_languages: List[str], + clean_text: str, +) -> Tuple[List[str], List[str]]: + norm_skills = [str(s).strip().lower() for s in (skills or [])] + if "java" not in norm_skills: + return skills, primary_languages + + txt = clean_text or "" + has_js = _JAVASCRIPT_RE.search(txt) is not None + has_real_java = _JAVA_REAL_RE.search(txt) is not None + if has_js and not has_real_java: + cleaned_skills = [s for s in skills if str(s).strip().lower() != "java"] + cleaned_langs = [s for s in primary_languages if str(s).strip().lower() != "java"] + return cleaned_skills, cleaned_langs + return skills, primary_languages + + +def _roles_from_desired_title(title: Optional[str]) -> List[str]: + if not title: + return [] + t = title.lower() + out: List[str] = [] + if "backend" in t or "бэкенд" in t or "бекенд" in t: + out.append("backend") + if "frontend" in t or "фронтенд" in t: + out.append("frontend") + if "fullstack" in t or "full stack" in t or "фулстек" in t: + out.append("fullstack") + if "devops" in t or "sre" in t: + out.append("devops") + if "qa" in 
t or "test" in t or "тестировщик" in t: + out.append("qa") + if "data" in t or "ml" in t or "machine learning" in t or "аналитик" in t: + out.append("data") + if "android" in t or "ios" in t or "mobile" in t or "мобиль" in t: + out.append("mobile") + return out + + +def _merge_lists(base: List[str], extra: List[str], limit: Optional[int] = None) -> List[str]: + seen = set() + out: List[str] = [] + for seq in (base or [], extra or []): + for x in seq: + t = str(x).strip() + if not t or t.lower() in seen: + continue + seen.add(t.lower()) + out.append(t) + if limit is not None and len(out) >= limit: + return out + return out + + +def _pick_salary( + heur_min: Optional[int], + heur_max: Optional[int], + heur_conf: Optional[float], + llm_min: Optional[int], + llm_max: Optional[int], +) -> Tuple[Optional[int], Optional[int], Optional[float]]: + if heur_min or heur_max: + if heur_conf is None: + heur_conf = 0.55 + return heur_min, heur_max, heur_conf + + if llm_min or llm_max: + return llm_min, llm_max, 0.65 + + return heur_min, heur_max, heur_conf + + +_EN_SIGNAL_RE = re.compile(r"\b(english|англий|ielts|toefl|cefr|a1|a2|b1|b2|c1|c2)\b", re.I) + + +def _has_english_signal(text: str) -> bool: + if not text: + return False + return _EN_SIGNAL_RE.search(text) is not None + + +def _can_accept_llm_english(clean_text: str, level: Optional[str]) -> bool: + if not level: + return False + # Require explicit language signal in CV to avoid invented C1/C2. + return _has_english_signal(clean_text) + + +_ROLE_EVIDENCE_PATTERNS: Dict[str, re.Pattern] = { + "qa": re.compile(r"\b(qa|quality assurance|tester|test engineer|test automation)\b", re.I), + "devops": re.compile(r"\b(devops|dev ops|sre|platform engineer|infrastructure engineer)\b", re.I), + "mobile": re.compile(r"\b(mobile|android|ios|react native|flutter)\b", re.I), + "data": re.compile(r"\b(data engineer|data scientist|ml engineer|machine learning)\b", re.I), + "architect": re.compile(r"\b(architect|solution architect|software architect)\b", re.I), +} + + +def _prune_roles_by_evidence(roles: List[str], clean_text: str) -> List[str]: + out: List[str] = [] + seen = set() + t = (clean_text or "").lower() + for role in roles or []: + r = str(role).strip().lower() + if not r or r in seen: + continue + seen.add(r) + pat = _ROLE_EVIDENCE_PATTERNS.get(r) + if pat is not None and not pat.search(t): + continue + out.append(r) + return out + + +def _parse_ym(date_iso: Optional[str]) -> Optional[Tuple[int, int]]: + if not date_iso: + return None + m = re.match(r"^\s*(\d{4})-(\d{2})", str(date_iso).strip()) + if not m: + return None + y = int(m.group(1)) + mm = int(m.group(2)) + if not (1900 <= y <= 2100 and 1 <= mm <= 12): + return None + return (y, mm) + + +def _months_between(a: Tuple[int, int], b: Tuple[int, int]) -> int: + return (b[0] - a[0]) * 12 + (b[1] - a[1]) + + +def _experience_years_from_positions(position_dicts: List[Dict[str, Any]]) -> Optional[float]: + intervals: List[Tuple[Tuple[int, int], Tuple[int, int]]] = [] + for p in position_dicts or []: + if not isinstance(p, dict): + continue + a = _parse_ym(p.get("date_from")) + b = _parse_ym(p.get("date_to")) + if not a or not b: + continue + if b < a: + a, b = b, a + intervals.append((a, b)) + + if not intervals: + return None + + intervals.sort(key=lambda x: x[0]) + merged: List[Tuple[Tuple[int, int], Tuple[int, int]]] = [intervals[0]] + for s, e in intervals[1:]: + ls, le = merged[-1] + if s <= le: + if e > le: + merged[-1] = (ls, e) + else: + merged.append((s, e)) + + months = 0 + for s, e in 
merged: + months += max(0, _months_between(s, e)) + years = round(months / 12.0, 2) + if 0.0 <= years <= 60.0: + return years + return None + + +def _reconcile_experience_fields( + *, + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + exp_dbg: Dict[str, Any], + positions: List[Dict[str, Any]], +) -> Tuple[Optional[float], Optional[float], Optional[float], Dict[str, Any]]: + dbg = dict(exp_dbg or {}) + source_notes: List[str] = [] + + pos_years = _experience_years_from_positions(positions) + if pos_years is not None: + dbg["positions_years"] = pos_years + + if exp_years is None and pos_years is not None: + exp_years = pos_years + exp_conf = max(float(exp_conf or 0.0), 0.74) + source_notes.append("positions_fallback") + elif exp_years is not None and pos_years is not None and pos_years > (float(exp_years) + 1.0): + method = str(dbg.get("method") or "") + strong_summary = method in ("summary", "header_chunk") and float(exp_conf or 0.0) >= 0.78 + if strong_summary and (pos_years - float(exp_years)) > 1.5: + source_notes.append("positions_reconcile_skip_strong_summary") + else: + exp_years = pos_years + exp_conf = max(float(exp_conf or 0.0), 0.75) + source_notes.append("positions_reconcile_up") + + # Prevent impossible split like total=1.5 while engineering=7.0. + try: + if exp_years is not None and exp_years_eng is not None: + if float(exp_years) < float(exp_years_eng) * 0.7: + exp_years = float(exp_years_eng) + exp_conf = max(float(exp_conf or 0.0), 0.74) + source_notes.append("eng_gt_total_fix") + except Exception: + pass + + is_recruiter = bool(dbg.get("is_recruiter")) + if exp_years_eng is None and exp_years is not None and not is_recruiter: + exp_years_eng = float(exp_years) + source_notes.append("eng_fill_from_total") + + if source_notes: + dbg["reconcile"] = source_notes + return exp_years, exp_years_eng, exp_conf, dbg + + +def _prefer_explicit_summary_experience( + *, + clean_text: str, + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + exp_dbg: Dict[str, Any], +) -> Tuple[Optional[float], Optional[float], Optional[float], Dict[str, Any]]: + try: + clean_total, clean_eng, clean_conf, clean_dbg = extract_experience_years(clean_text or "") + except Exception: + return exp_years, exp_years_eng, exp_conf, exp_dbg + + if clean_total is None: + return exp_years, exp_years_eng, exp_conf, exp_dbg + + if exp_years is None: + merged_dbg = dict(exp_dbg or {}) + merged_dbg["clean_exp_method"] = (clean_dbg or {}).get("method") + return clean_total, (clean_eng if clean_eng is not None else exp_years_eng), max(float(exp_conf or 0.0), float(clean_conf or 0.0)), merged_dbg + + parsed_method = str((exp_dbg or {}).get("method") or "") + clean_method = str((clean_dbg or {}).get("method") or "") + if clean_conf is not None and clean_conf >= 0.78 and clean_method in ("summary", "header_chunk"): + try: + if parsed_method.startswith("timeline") and float(clean_total) + 1.5 < float(exp_years): + merged_dbg = dict(exp_dbg or {}) + merged_dbg["clean_exp_method"] = clean_method + merged_dbg["reconcile_clean"] = "prefer_explicit_summary" + return clean_total, (clean_eng if clean_eng is not None else exp_years_eng), max(float(exp_conf or 0.0), float(clean_conf or 0.0)), merged_dbg + except Exception: + pass + + return exp_years, exp_years_eng, exp_conf, exp_dbg + + +def _need_llm_fallback( + *, + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], + name: 
Optional[str], + doc_type: Optional[str], +) -> bool: + if doc_type == "scan_pdf": + return False + if not name: + return True + if not roles and len(skills) < 2: + return True + if exp_conf is None or exp_conf < 0.4: + return True + if not english and not location and len(skills) < 2: + return True + return False + + +def _maybe_llm_enrich( + *, + con: sqlite3.Connection, + clean: str, + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], + name: Optional[str], + doc_type: Optional[str], + sections: Optional[Dict[str, str]], +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + LLM runs only as fallback when heuristics are weak, + unless forced via LLM_PARSE_FORCE=1. + """ + if not llm_parse_enabled(): + return None, {"enabled": False} + + forced = os.environ.get("LLM_PARSE_FORCE", "0").lower() in ("1", "true", "yes") + if not forced and not _need_llm_fallback( + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + name=name, + doc_type=doc_type, + ): + return None, {"enabled": True, "forced": False, "used": False, "reason": "heuristics_ok"} + + llm_res, llm_dbg = llm_extract_profile( + clean, + con=con, + doc_type=doc_type, + sections=sections, + ) + if isinstance(llm_dbg, dict): + llm_dbg["forced"] = forced + llm_dbg["used"] = bool(llm_res) + return llm_res, llm_dbg + + +_EN_ORDER = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6} + + +def _llm_review_mode() -> str: + mode = (os.environ.get("LLM_PARSE_REVIEW_MODE", "always") or "").strip().lower() + if mode in ("0", "false", "no", "off"): + return "off" + if mode in ("auto", "smart", "on_demand"): + return "auto" + return "always" + + +def _llm_review_rounds() -> int: + raw = (os.environ.get("LLM_PARSE_REVIEW_ROUNDS", "1") or "").strip() + try: + rounds = int(raw) + except Exception: + rounds = 1 + return max(1, min(rounds, 3)) + + +def _normalize_cefr(level: Optional[str]) -> Optional[str]: + if not level: + return None + m = re.search(r"\b(A1|A2|B1|B2|C1|C2)\b", str(level).upper()) + return m.group(1) if m else None + + +def _bounded_float(v: Any, lo: float, hi: float) -> Optional[float]: + try: + x = float(v) + except Exception: + return None + if x < lo or x > hi: + return None + return float(round(x, 2)) + + +def _bounded_int(v: Any, lo: int, hi: int) -> Optional[int]: + try: + x = int(float(v)) + except Exception: + return None + if x < lo or x > hi: + return None + return x + + +def _llm_review_needed( + *, + mode: str, + llm_enriched_used: bool, + name: Optional[str], + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], +) -> bool: + if mode == "off": + return False + if mode == "always": + return True + + if llm_enriched_used: + return True + if not name: + return True + if not roles or len(skills) < 3: + return True + if exp_conf is None or exp_conf < 0.65: + return True + if not english or not location: + return True + return False + + +def _build_llm_review_draft( + *, + roles: List[str], + skills: List[str], + primary_languages: List[str], + seniority: Optional[str], + backend_focus: Optional[bool], + exp_years: Optional[float], + exp_years_eng: Optional[float], + english: Optional[str], + location: Optional[str], + remote: Optional[bool], + sal_min: Optional[int], + sal_max: Optional[int], + highlights: List[str], + keywords: List[str], +) -> Dict[str, Any]: + return { + "roles": roles[:12], + "skills": skills[:64], + 
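        # The slice caps mirror the merge limits in _merge_review_result, so
        # the draft handed to the reviewer model stays bounded even when the
        # heuristics produced noisy, oversized lists.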
"primary_languages": primary_languages[:12], + "seniority": seniority, + "backend_focus": backend_focus, + "experience_years_total": exp_years, + "experience_years_engineering": exp_years_eng, + "english_level": _normalize_cefr(english), + "location": location, + "remote_ok": remote, + "salary_min_rub": sal_min, + "salary_max_rub": sal_max, + "highlights": highlights[:8], + "keywords": keywords[:40], + } + + +def _merge_review_result( + *, + review: LLMExtraction, + review_dbg: Dict[str, Any], + roles: List[str], + skills: List[str], + primary_languages: List[str], + seniority: Optional[str], + backend_focus: Optional[bool], + remote: Optional[bool], + location: Optional[str], + english: Optional[str], + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + sal_min: Optional[int], + sal_max: Optional[int], + sal_conf: Optional[float], + highlights: List[str], + keywords: List[str], + llm_summary: Optional[str], + llm_tags: List[str], +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + quality = review_dbg.get("quality_score") + try: + quality_f = float(quality) if quality is not None else None + except Exception: + quality_f = None + trusted = quality_f is None or quality_f >= 0.55 + + changed: List[str] = [] + model_changed_raw = review_dbg.get("model_changed_fields") or [] + model_changed = set() + if isinstance(model_changed_raw, list): + for x in model_changed_raw: + s = str(x).strip() + if s: + model_changed.add(s) + + roles_out = list(roles or []) + if review.roles: + if trusted and "roles" in model_changed: + merged_roles = _merge_lists(review.roles, [], limit=12) + else: + merged_roles = _merge_lists(review.roles, roles_out, limit=12) if trusted else _merge_lists(roles_out, review.roles, limit=12) + if merged_roles != roles_out: + changed.append("roles") + roles_out = merged_roles + + skills_out = list(skills or []) + if review.skills: + merged_skills = _merge_lists(review.skills, skills_out, limit=64) if trusted else _merge_lists(skills_out, review.skills, limit=64) + if merged_skills != skills_out: + changed.append("skills") + skills_out = merged_skills + + langs_out = list(primary_languages or []) + review_langs = _normalize_language_list(review.primary_languages) + if review_langs: + if trusted and "primary_languages" in model_changed: + merged_langs = _merge_lists(review_langs, [], limit=12) + else: + merged_langs = _merge_lists(review_langs, langs_out, limit=12) if trusted else _merge_lists(langs_out, review_langs, limit=12) + if merged_langs != langs_out: + changed.append("primary_languages") + langs_out = merged_langs + + seniority_out = seniority + if review.seniority and (trusted or not seniority_out): + if review.seniority != seniority_out: + changed.append("seniority") + seniority_out = review.seniority + + backend_focus_out = backend_focus + if review.backend_focus is not None and (trusted or backend_focus_out is None): + if review.backend_focus != backend_focus_out: + changed.append("backend_focus") + backend_focus_out = review.backend_focus + + remote_out = remote + if review.remote_ok is not None and (trusted or remote_out is None): + if review.remote_ok != remote_out: + changed.append("remote") + remote_out = review.remote_ok + + location_out = location + if review.location and (trusted or not location_out): + loc = review.location.strip() + if 2 <= len(loc) <= 120 and loc != (location_out or ""): + changed.append("location") + location_out = loc + + english_out = _normalize_cefr(english) + review_english = 
_normalize_cefr(review.english_level) + if review_english: + if english_out is None: + english_out = review_english + changed.append("english") + elif trusted and _EN_ORDER.get(review_english, 0) > _EN_ORDER.get(english_out, 0): + english_out = review_english + changed.append("english") + + exp_years_out = exp_years + exp_years_eng_out = exp_years_eng + exp_conf_out = exp_conf + review_exp_total = _bounded_float(review.experience_years_total, 0.0, 60.0) + review_exp_eng = _bounded_float(review.experience_years_engineering, 0.0, 60.0) + if review_exp_total is not None: + if exp_years_out is None or (trusted and ((exp_conf_out or 0.0) < 0.75)): + if exp_years_out != review_exp_total: + changed.append("experience_years_total") + exp_years_out = review_exp_total + exp_conf_out = max(float(exp_conf_out or 0.0), 0.78 if trusted else 0.65) + if review_exp_eng is not None: + if exp_years_eng_out is None or trusted: + if exp_years_eng_out != review_exp_eng: + changed.append("experience_years_engineering") + exp_years_eng_out = review_exp_eng + exp_conf_out = max(float(exp_conf_out or 0.0), 0.74 if trusted else 0.62) + + sal_min_out = sal_min + sal_max_out = sal_max + sal_conf_out = sal_conf + cand_min = _bounded_int(review.salary_min_rub, 10_000, 200_000_000) + cand_max = _bounded_int(review.salary_max_rub, 10_000, 200_000_000) + if cand_min is None and cand_max is None: + cand_min = _bounded_int(review.salary_min_usd, 100, 2_000_000) + cand_max = _bounded_int(review.salary_max_usd, 100, 2_000_000) + if cand_min is not None or cand_max is not None: + if cand_min is not None and cand_max is not None and cand_min > cand_max: + cand_min, cand_max = cand_max, cand_min + if (sal_min_out is None and sal_max_out is None) or (trusted and (sal_conf_out is None or sal_conf_out < 0.75)): + if cand_min is not None and cand_min != sal_min_out: + sal_min_out = cand_min + changed.append("salary") + if cand_max is not None and cand_max != sal_max_out: + sal_max_out = cand_max + changed.append("salary") + sal_conf_out = max(float(sal_conf_out or 0.0), 0.72 if trusted else 0.60) + + highlights_out = list(highlights or []) + if review.highlights: + merged_highlights = _merge_lists(review.highlights, highlights_out, limit=8) if trusted else _merge_lists(highlights_out, review.highlights, limit=8) + if merged_highlights != highlights_out: + highlights_out = merged_highlights + changed.append("highlights") + + keywords_out = list(keywords or []) + if review.keywords: + merged_keywords = _merge_lists(review.keywords, keywords_out, limit=40) if trusted else _merge_lists(keywords_out, review.keywords, limit=40) + if merged_keywords != keywords_out: + keywords_out = merged_keywords + changed.append("keywords") + + llm_tags_out = list(llm_tags or []) + llm_tags_out = _merge_lists(keywords_out, llm_tags_out, limit=40) + llm_tags_out = _merge_lists(skills_out, llm_tags_out, limit=40) + llm_tags_out = _merge_lists(langs_out, llm_tags_out, limit=40) + + llm_summary_out = llm_summary + if highlights_out: + merged_summary = "; ".join([h.strip() for h in highlights_out if h.strip()])[:800] + if merged_summary and merged_summary != (llm_summary_out or ""): + llm_summary_out = merged_summary + changed.append("llm_summary") + + changed_uniq = [] + changed_seen = set() + for item in changed: + if item in changed_seen: + continue + changed_seen.add(item) + changed_uniq.append(item) + + return ( + { + "roles": roles_out, + "skills": skills_out, + "primary_languages": langs_out, + "seniority": seniority_out, + "backend_focus": 
backend_focus_out, + "remote": remote_out, + "location": location_out, + "english": english_out, + "exp_years": exp_years_out, + "exp_years_eng": exp_years_eng_out, + "exp_conf": exp_conf_out, + "sal_min": sal_min_out, + "sal_max": sal_max_out, + "sal_conf": sal_conf_out, + "highlights": highlights_out, + "keywords": keywords_out, + "llm_summary": llm_summary_out, + "llm_tags": llm_tags_out, + }, + { + "trusted": trusted, + "quality_score": quality_f, + "changed_fields": changed_uniq, + "issues_found": review_dbg.get("issues_found") or [], + "model_changed_fields": review_dbg.get("changed_fields") or [], + }, + ) + + +# ----------------------------- +# candidate/resume DB helpers +# ----------------------------- + +def stable_candidate_id(contacts: Dict[str, List[str]], name: Optional[str], simh: int) -> str: + if contacts.get("email"): + return "cand_" + sha1_str("email:" + contacts["email"][0]) + if contacts.get("phone"): + return "cand_" + sha1_str("phone:" + contacts["phone"][0]) + if contacts.get("tg"): + return "cand_" + sha1_str("tg:" + contacts["tg"][0]) + if contacts.get("github"): + return "cand_" + sha1_str("gh:" + contacts["github"][0]) + if contacts.get("linkedin"): + return "cand_" + sha1_str("li:" + contacts["linkedin"][0]) + base = (name or "unknown").strip().lower() + return "cand_" + sha1_str(f"name:{base}:{simh}") + + +def _candidate_by_contact(con: sqlite3.Connection, contacts: Dict[str, List[str]]) -> Optional[str]: + checks = [ + ("email", contacts.get("email", [])), + ("phone", contacts.get("phone", [])), + ("tg", contacts.get("tg", [])), + ("github", contacts.get("github", [])), + ("linkedin", contacts.get("linkedin", [])), + ] + for ctype, vals in checks: + for v in vals: + row = con.execute( + "SELECT candidate_id FROM candidate_contacts WHERE contact_type=? 
AND contact_value=?", + (ctype, v), + ).fetchone() + if row: + return row["candidate_id"] + return None + + +def _upsert_contacts(con: sqlite3.Connection, candidate_id: str, contacts: Dict[str, List[str]]) -> None: + pairs: List[Tuple[str, str]] = [] + for e in contacts.get("email", []): + pairs.append(("email", e)) + for p in contacts.get("phone", []): + pairs.append(("phone", p)) + for t in contacts.get("tg", []): + pairs.append(("tg", t)) + for g in contacts.get("github", []): + pairs.append(("github", g)) + for l in contacts.get("linkedin", []): + pairs.append(("linkedin", l)) + + for ctype, val in pairs: + con.execute( + "INSERT OR IGNORE INTO candidate_contacts(contact_type, contact_value, candidate_id) VALUES (?,?,?)", + (ctype, val, candidate_id), + ) + + +def _upsert_candidate_skills( + con: sqlite3.Connection, + candidate_id: str, + skills_primary: List[str], + skills_secondary: List[str], + source: str, +) -> None: + for sk in skills_primary: + con.execute( + """INSERT OR REPLACE INTO candidate_skills(candidate_id, skill_id, skill_label, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, sk, sk, 0.90, source, "skills_primary"), + ) + for sk in skills_secondary: + con.execute( + """INSERT OR REPLACE INTO candidate_skills(candidate_id, skill_id, skill_label, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, sk, sk, 0.60, source, "skills_secondary"), + ) + + +def _upsert_candidate_roles( + con: sqlite3.Connection, + candidate_id: str, + roles: List[str], + source: str, +) -> None: + for r in roles: + con.execute( + """INSERT OR REPLACE INTO candidate_roles(candidate_id, role, confidence, source, evidence) + VALUES (?,?,?,?,?)""", + (candidate_id, r, 0.80, source, "roles"), + ) + + +def _upsert_candidate_languages( + con: sqlite3.Connection, + candidate_id: str, + english_level: Optional[str], + source: str, +) -> None: + if not english_level: + return + con.execute( + """INSERT OR REPLACE INTO candidate_languages(candidate_id, language, level, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, "english", english_level, 0.75, source, "english_level"), + ) + + +def _ensure_candidate(con: sqlite3.Connection, candidate_id: str, fields: Dict[str, Any]) -> None: + # Attempt to ensure the new column exists if migration didn't run + try: + con.execute("ALTER TABLE candidates ADD COLUMN experience_years_eng REAL") + except Exception: + pass # Column likely exists or basic sqlite error, proceed to insert + try: + con.execute("ALTER TABLE candidates ADD COLUMN primary_languages_json TEXT") + except Exception: + pass + try: + con.execute("ALTER TABLE candidates ADD COLUMN backend_focus INTEGER") + except Exception: + pass + + exists = con.execute("SELECT 1 FROM candidates WHERE candidate_id=?", (candidate_id,)).fetchone() is not None + + primary_languages_json = safe_json(fields.get("primary_languages", [])) + backend_focus_field = fields.get("backend_focus") + backend_focus_int = None if backend_focus_field is None else (1 if backend_focus_field else 0) + + if not exists: + con.execute( + """INSERT INTO candidates( + candidate_id, name, location, remote, + experience_years, experience_years_eng, experience_confidence, + salary_min, salary_max, salary_confidence, + english_level, roles_json, skills_json, primary_languages_json, + roles_norm, skills_norm, backend_focus, + created_at, updated_at + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + candidate_id, + fields.get("name"), + fields.get("location"), + 
fields.get("remote"), + fields.get("experience_years"), + fields.get("experience_years_eng"), # new field + fields.get("experience_confidence"), + fields.get("salary_min"), + fields.get("salary_max"), + fields.get("salary_confidence"), + fields.get("english_level"), + safe_json(fields.get("roles", [])), + safe_json(fields.get("skills", [])), + primary_languages_json, + fields.get("roles_norm") or "|", + fields.get("skills_norm") or "|", + backend_focus_int, + utc_iso(), + utc_iso(), + ), + ) + else: + con.execute( + """UPDATE candidates SET + name = COALESCE(?, name), + location = COALESCE(?, location), + remote = COALESCE(?, remote), + + experience_years = COALESCE(?, experience_years), + experience_years_eng = COALESCE(?, experience_years_eng), + experience_confidence = COALESCE(?, experience_confidence), + + salary_min = COALESCE(?, salary_min), + salary_max = COALESCE(?, salary_max), + salary_confidence = COALESCE(?, salary_confidence), + + english_level = COALESCE(?, english_level), + + roles_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE roles_json END, + skills_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE skills_json END, + primary_languages_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE primary_languages_json END, + roles_norm = CASE WHEN ? != '|' THEN ? ELSE roles_norm END, + skills_norm = CASE WHEN ? != '|' THEN ? ELSE skills_norm END, + backend_focus = COALESCE(?, backend_focus), + + updated_at = ? + WHERE candidate_id = ?""", + ( + fields.get("name"), + fields.get("location"), + fields.get("remote"), + fields.get("experience_years"), + fields.get("experience_years_eng"), # new field update + fields.get("experience_confidence"), + fields.get("salary_min"), + fields.get("salary_max"), + fields.get("salary_confidence"), + fields.get("english_level"), + + safe_json(fields.get("roles", [])), + safe_json(fields.get("roles", [])), + safe_json(fields.get("roles", [])), + + safe_json(fields.get("skills", [])), + safe_json(fields.get("skills", [])), + safe_json(fields.get("skills", [])), + + primary_languages_json, + primary_languages_json, + primary_languages_json, + + fields.get("roles_norm") or "|", + fields.get("roles_norm") or "|", + + fields.get("skills_norm") or "|", + fields.get("skills_norm") or "|", + + backend_focus_int, + utc_iso(), + candidate_id, + ), + ) + + +def _resume_by_sha(con: sqlite3.Connection, sha: str) -> Optional[str]: + row = con.execute("SELECT resume_id FROM resumes WHERE sha256=?", (sha,)).fetchone() + return row["resume_id"] if row else None + + +def _near_duplicate_active_resume(con: sqlite3.Connection, simh: int, max_dist: int) -> Optional[Tuple[str, int]]: + candidate_resume_ids = set() + for bucket, band in simhash_bands(simh): + cur = con.execute("SELECT resume_id FROM simhash_buckets WHERE bucket=? AND band=?", (bucket, band)) + for r in cur.fetchall(): + candidate_resume_ids.add(r["resume_id"]) + + best: Optional[Tuple[str, int]] = None + for rid in candidate_resume_ids: + row = con.execute("SELECT simhash FROM resumes WHERE resume_id=? 
AND is_active=1", (rid,)).fetchone() + if not row or row["simhash"] is None: + continue + try: + old = int(str(row["simhash"]), 16) + except Exception: + continue + dist = hamming64(old, simh) + if dist <= max_dist: + if best is None or dist < best[1]: + best = (rid, dist) + return best + + +def _insert_resume( + con: sqlite3.Connection, + candidate_id: str, + sha: Optional[str], + simh: int, + clean_text: str, + raw_text: str, + extraction_json: str, + llm_summary: Optional[str], + llm_tags: List[str], + extract_method: Optional[str], + extract_quality_score: Optional[float], + extract_quality_flags: Optional[str], + extract_pages_json: Optional[str], + doc_type: Optional[str], + doc_type_confidence: Optional[float], + parse_method: Optional[str], + parse_version: Optional[str], + sections_json: Optional[str], + file_path: Optional[str], + mtime: Optional[int], + size: Optional[int], + near_dup_of: Optional[str], +) -> str: + resume_id = "res_" + uuid.uuid4().hex + + if near_dup_of: + con.execute("UPDATE resumes SET is_active=0 WHERE resume_id=?", (near_dup_of,)) + + con.execute( + """INSERT INTO resumes( + resume_id, candidate_id, sha256, simhash, clean_text, raw_text, extraction_json, + llm_summary, llm_tags_json, + extract_method, extract_quality_score, extract_quality_flags, extract_pages_json, + doc_type, doc_type_confidence, parse_method, parse_version, sections_json, + is_active, duplicate_of_resume_id, file_path, file_mtime, file_size, created_at + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + resume_id, + candidate_id, + sha, + f"{simh:016x}", + clean_text, + raw_text[:250000], + extraction_json, + llm_summary, + safe_json(llm_tags), + extract_method, + extract_quality_score, + extract_quality_flags, + extract_pages_json, + doc_type, + doc_type_confidence, + parse_method, + parse_version, + sections_json, + 1, + near_dup_of, + file_path, + mtime, + size, + utc_iso(), + ), + ) + + for bucket, band in simhash_bands(simh): + con.execute( + "INSERT OR IGNORE INTO simhash_buckets(bucket, band, resume_id) VALUES (?,?,?)", + (bucket, band, resume_id), + ) + + return resume_id + + +def _insert_source(con: sqlite3.Connection, resume_id: str, src: Dict[str, Any]) -> None: + con.execute( + """INSERT INTO sources( + resume_id, export_path, chat_title, message_id, message_date, + origin_type, original_file_path, original_file_name, extra_json + ) VALUES (?,?,?,?,?,?,?,?,?)""", + ( + resume_id, + src.get("export_path"), + src.get("chat_title"), + src.get("message_id"), + src.get("message_date"), + src.get("origin_type"), + src.get("file_path"), + src.get("original_name"), + json.dumps(src.get("extra", {}), ensure_ascii=False), + ), + ) + + +def _insert_positions( + con: sqlite3.Connection, + resume_id: str, + candidate_id: str, + positions: List[Dict[str, Any]], +) -> None: + if not positions: + return + for p in positions: + pos_id = "pos_" + uuid.uuid4().hex + con.execute( + """INSERT INTO positions( + position_id, resume_id, candidate_id, title, company, + date_from, date_to, is_current, description, stack_json + ) VALUES (?,?,?,?,?,?,?,?,?,?)""", + ( + pos_id, + resume_id, + candidate_id, + p.get("title"), + p.get("company"), + p.get("date_from"), + p.get("date_to"), + 1 if p.get("is_current") else 0 if p.get("is_current") is not None else None, + p.get("description"), + json.dumps(p.get("stack") or [], ensure_ascii=False), + ), + ) + + +def _update_files_seen(con: sqlite3.Connection, sha: str, size: int, mtime: int, canonical_resume_id: str) -> None: + con.execute( 
+ """INSERT INTO files_seen(sha256, size, mtime, canonical_resume_id, first_seen_at, last_seen_at) + VALUES (?,?,?,?,?,?) + ON CONFLICT(sha256) DO UPDATE SET + size=excluded.size, + mtime=excluded.mtime, + canonical_resume_id=excluded.canonical_resume_id, + last_seen_at=excluded.last_seen_at + """, + (sha, size, mtime, canonical_resume_id, utc_iso(), utc_iso()), + ) + + +# ----------------------------- +# artifacts collection +# ----------------------------- + +def collect_artifacts(input_root: Path) -> List[Dict[str, Any]]: + artifacts: List[Dict[str, Any]] = [] + + for rj in find_result_json(input_root): + artifacts.extend(list(iter_json_artifacts(rj))) + + for mh in find_messages_html(input_root): + artifacts.extend(list(iter_html_artifacts(mh))) + + artifacts.extend(list(iter_file_scan(input_root))) + return artifacts + + +# ----------------------------- +# main pipeline +# ----------------------------- + +def import_exports( + con: sqlite3.Connection, + input_dir: str, + log: Logger, + max_near_dist: int = 6, + min_text_len: int = 250, + commit_every: int = 20, +) -> Dict[str, Any]: + root = Path(input_dir).resolve() + if not root.exists(): + raise SystemExit(f"Input not found: {root}") + + artifacts = collect_artifacts(root) + log.info(f"[import] artifacts found: {len(artifacts)}", {"input": str(root)}) + + stats: Dict[str, Any] = { + "input": str(root), + "artifacts": len(artifacts), + "processed_new": 0, + "dup_sha": 0, + "near_dup": 0, + "short_or_empty": 0, + "errors": 0, + "sources_added_only": 0, + "llm_enriched": 0, + "llm_reviewed": 0, + "llm_review_changed": 0, + } + near_dup_examples: List[Dict[str, Any]] = [] + + for i, a in enumerate(artifacts, start=1): + try: + raw_text = "" + sha = None + + file_path = a.get("file_path") + size = None + mtime = None + + extract_method = None + extract_score = None + extract_flags: List[str] = [] + pages: List[Dict[str, Any]] = [] + + if file_path: + fp = Path(file_path) + if not fp.exists(): + continue + st = fp.stat() + size = int(st.st_size) + mtime = int(st.st_mtime) + + sha = sha256_file(str(fp)) + + existing_resume = _resume_by_sha(con, sha) + if existing_resume: + stats["dup_sha"] += 1 + _insert_source(con, existing_resume, a) + _update_files_seen(con, sha, size, mtime, existing_resume) + stats["sources_added_only"] += 1 + continue + + if fp.suffix.lower() == ".pdf": + pdf_res = extract_pdf_best(fp, timeout_sec=25) + raw_text = pdf_res.text + extract_method = pdf_res.method + extract_score = pdf_res.score + extract_flags = pdf_res.flags + pages = pdf_res.pages + else: + try: + raw_text = extract_text_generic(fp) or "" + except Exception as e: + if log: + log.warn("[extract] file failed - skipped", {"file": str(fp), "err": repr(e)}) + raw_text = "" + extract_method = f"file_{fp.suffix.lower().lstrip('.') or 'unknown'}" + else: + raw_text = a.get("message_text") or "" + extract_method = "telegram_post" + + raw_text = coerce_text(raw_text) + + if not raw_text or len(raw_text.strip()) < min_text_len: + stats["short_or_empty"] += 1 + continue + + clean = normalize_text(raw_text) + if not clean or len(clean) < min_text_len: + stats["short_or_empty"] += 1 + continue + + file_ext = Path(file_path).suffix.lower() if file_path else None + dt = detect_doc_type(clean, file_ext=file_ext) + if a.get("origin_type") == "message_text": + from tg_resume_db.extract.doc_type import DocTypeResult + dt = DocTypeResult(doc_type="telegram_post", confidence=0.92, signals=["telegram_message"]) + if extract_flags and "scan_like" in extract_flags: + from 
tg_resume_db.extract.doc_type import DocTypeResult + dt = DocTypeResult(doc_type="scan_pdf", confidence=0.9, signals=dt.signals + ["scan_like"]) + sections = split_sections(clean, dt.doc_type) + sections_list = sections_present(sections) + exp_section_text = sections.get("experience") if isinstance(sections, dict) else None + positions = extract_positions(exp_section_text or clean) + position_dicts = positions_to_dicts(positions) + + parser = tpl_generic + if dt.confidence >= 0.8: + if dt.doc_type == "hh_ru": + parser = tpl_hh + elif dt.doc_type == "linkedin_pdf": + parser = tpl_linkedin + elif dt.doc_type == "one_page_en": + parser = tpl_one_page_en + elif dt.doc_type == "one_page_ru": + parser = tpl_one_page_ru + elif dt.doc_type == "pptx_export": + parser = tpl_pptx + + parsed = parser.parse_resume(clean, sections) + parse_method = parsed.get("parse_method") or "generic_heur" + + contacts_raw = parsed.get("contacts_raw") or extract_contacts_raw(clean) + contacts = normalize_contacts(contacts_raw, clean) + + name = parsed.get("name") or extract_name_guess(clean) + remote = parsed.get("remote") + if remote is None: + remote = extract_remote(clean) + english = parsed.get("english") or extract_english(clean) + + roles = parsed.get("roles") or [] + skills = parsed.get("skills") or [] + primary_languages: List[str] = [] + + location = parsed.get("location") or extract_location_best_effort(clean) + + exp_years = parsed.get("exp_years") + exp_years_eng = parsed.get("exp_years_eng") + exp_conf = parsed.get("exp_conf") + exp_dbg = parsed.get("exp_dbg") or {} + if exp_years is None and exp_years_eng is None: + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(clean) + exp_years, exp_years_eng, exp_conf, exp_dbg = _prefer_explicit_summary_experience( + clean_text=clean, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + ) + exp_years, exp_years_eng, exp_conf, exp_dbg = _reconcile_experience_fields( + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + positions=position_dicts, + ) + + sal_min = parsed.get("salary_min") + sal_max = parsed.get("salary_max") + sal_conf = parsed.get("salary_conf") + sal_dbg = parsed.get("salary_dbg") or {} + if sal_min is None and sal_max is None: + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean) + + llm_summary: Optional[str] = None + llm_tags: List[str] = [] + seniority: Optional[str] = None + highlights: List[str] = [] + keywords: List[str] = [] + + llm_enriched, llm_dbg = _maybe_llm_enrich( + con=con, + clean=clean, + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + name=name, + doc_type=dt.doc_type, + sections=sections, + ) + + backend_focus_flag: Optional[bool] = None + + if llm_enriched: + parse_method = "llm_rag" + stats["llm_enriched"] += 1 + roles = _merge_lists(llm_enriched.roles, roles, limit=8) + + normalized_llm_langs = _normalize_language_list(llm_enriched.primary_languages) + if normalized_llm_langs: + primary_languages = _merge_lists(normalized_llm_langs, primary_languages, limit=8) + skills = _merge_lists(normalized_llm_langs, skills, limit=48) + skills = _merge_lists(llm_enriched.skills, skills, limit=48) + + if remote is None and llm_enriched.remote_ok is not None: + remote = llm_enriched.remote_ok + if not location and llm_enriched.location: + location = llm_enriched.location + if not english and llm_enriched.english_level and _can_accept_llm_english(clean, llm_enriched.english_level): + 
english = llm_enriched.english_level + + backend_focus_flag = llm_enriched.backend_focus + if llm_enriched.backend_focus is True: + roles = _merge_lists(["backend"], roles, limit=8) + elif llm_enriched.backend_focus is False: + pruned_roles: List[str] = [] + seen_roles = set() + for r in roles: + if r.lower() == "backend": + continue + rl = r.lower() + if rl in seen_roles: + continue + seen_roles.add(rl) + pruned_roles.append(r) + roles = pruned_roles + + if (exp_conf is None or exp_conf < 0.6) and llm_enriched.experience_years_total is not None: + exp_years = llm_enriched.experience_years_total + exp_conf = 0.65 + if llm_enriched.experience_years_engineering is not None: + exp_years_eng = llm_enriched.experience_years_engineering + + sal_min, sal_max, sal_conf = _pick_salary( + sal_min, sal_max, sal_conf, llm_enriched.salary_min_rub, llm_enriched.salary_max_rub + ) + if sal_min is None and sal_max is None: + sal_min, sal_max, sal_conf = _pick_salary( + sal_min, sal_max, sal_conf, llm_enriched.salary_min_usd, llm_enriched.salary_max_usd + ) + + seniority = llm_enriched.seniority + highlights = [h.strip() for h in llm_enriched.highlights if h.strip()] + if highlights: + llm_summary = "; ".join(highlights)[:800] + keywords = _merge_lists(llm_enriched.keywords, keywords, limit=40) + llm_tags = _merge_lists(llm_enriched.keywords, llm_tags, limit=24) + llm_tags = _merge_lists(llm_enriched.skills, llm_tags, limit=24) + llm_tags = _merge_lists(llm_enriched.primary_languages, llm_tags, limit=24) + + desired_title = parsed.get("desired_title") + if desired_title: + roles = _merge_lists(_roles_from_desired_title(desired_title), roles, limit=8) + + llm_review_mode = _llm_review_mode() + llm_review_rounds_dbg: List[Dict[str, Any]] = [] + llm_review_merge_dbg: List[Dict[str, Any]] = [] + llm_review_used = False + llm_review_changed = False + + if llm_parse_enabled() and _llm_review_needed( + mode=llm_review_mode, + llm_enriched_used=bool(llm_enriched), + name=name, + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + ): + for _ in range(_llm_review_rounds()): + review_draft = _build_llm_review_draft( + roles=roles, + skills=skills, + primary_languages=primary_languages, + seniority=seniority, + backend_focus=backend_focus_flag, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + english=english, + location=location, + remote=remote, + sal_min=sal_min, + sal_max=sal_max, + highlights=highlights, + keywords=keywords, + ) + review_res, review_dbg = llm_review_profile( + clean, + draft=review_draft, + con=con, + doc_type=dt.doc_type, + sections=sections, + ) + llm_review_rounds_dbg.append(review_dbg) + if not review_res: + continue + + llm_review_used = True + merged, merge_dbg = _merge_review_result( + review=review_res, + review_dbg=review_dbg, + roles=roles, + skills=skills, + primary_languages=primary_languages, + seniority=seniority, + backend_focus=backend_focus_flag, + remote=remote, + location=location, + english=english, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + sal_min=sal_min, + sal_max=sal_max, + sal_conf=sal_conf, + highlights=highlights, + keywords=keywords, + llm_summary=llm_summary, + llm_tags=llm_tags, + ) + llm_review_merge_dbg.append(merge_dbg) + + roles = merged["roles"] + skills = merged["skills"] + primary_languages = merged["primary_languages"] + seniority = merged["seniority"] + backend_focus_flag = merged["backend_focus"] + remote = merged["remote"] + location = merged["location"] + english = 
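+
+                    # Adopt the reviewer's merged profile; the loop runs again only
+                    # while the review keeps changing fields (see the break below).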
merged["english"] + exp_years = merged["exp_years"] + exp_years_eng = merged["exp_years_eng"] + exp_conf = merged["exp_conf"] + sal_min = merged["sal_min"] + sal_max = merged["sal_max"] + sal_conf = merged["sal_conf"] + highlights = merged["highlights"] + keywords = merged["keywords"] + llm_summary = merged["llm_summary"] + llm_tags = merged["llm_tags"] + + if merge_dbg.get("changed_fields"): + llm_review_changed = True + else: + break + + if llm_review_used: + stats["llm_reviewed"] += 1 + if llm_review_changed: + stats["llm_review_changed"] += 1 + if "+llm_review" not in parse_method: + parse_method = f"{parse_method}+llm_review" + + llm_review_meta = { + "enabled": llm_parse_enabled(), + "mode": llm_review_mode, + "used": llm_review_used, + "changed": llm_review_changed, + "rounds": llm_review_rounds_dbg, + "merge": llm_review_merge_dbg, + } + + roles = normalize_roles(roles) + roles = _prune_roles_by_evidence(roles, clean) + skills = normalize_skills(skills) + + skills_primary, skills_secondary = split_skills_primary_secondary( + skills, + clean_text=clean, + sections=sections, + ) + + location = normalize_location(location) + + exp_years, exp_years_eng, exp_conf, exp_dbg = _reconcile_experience_fields( + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + positions=position_dicts, + ) + + if not primary_languages: + language_from_skills = [] + for sk in skills: + tok = _norm_lang_token(sk) + if tok: + language_from_skills.append(tok) + primary_languages = _merge_lists(language_from_skills, primary_languages, limit=8) + + skills, primary_languages = _drop_false_java(skills, primary_languages, clean) + + simh = simhash64(to_fts_text(clean)) + + candidate_id = _candidate_by_contact(con, contacts) or stable_candidate_id(contacts, name, simh) + + _ensure_candidate(con, candidate_id, { + "name": name, + "location": location, + "remote": (1 if remote else 0) if remote is not None else None, + "experience_years": exp_years, + "experience_years_eng": exp_years_eng, # Passed to DB + "experience_confidence": exp_conf if exp_years is not None else None, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_confidence": sal_conf if sal_min is not None else None, + "english_level": english, + "roles": roles, + "skills": skills, + "primary_languages": primary_languages, + "backend_focus": backend_focus_flag, + "roles_norm": norm_pipe(roles), + "skills_norm": norm_pipe(skills), + }) + _upsert_contacts(con, candidate_id, contacts) + _upsert_candidate_skills(con, candidate_id, skills_primary, skills_secondary, parse_method) + _upsert_candidate_roles(con, candidate_id, roles, parse_method) + _upsert_candidate_languages(con, candidate_id, english, parse_method) + + near = _near_duplicate_active_resume(con, simh, max_dist=max_near_dist) + near_dup_of = near[0] if near else None + if near_dup_of: + stats["near_dup"] += 1 + if len(near_dup_examples) < 10: + near_dup_examples.append({ + "new_file": file_path, + "dup_of": near_dup_of, + "dist": near[1], + "candidate_id": candidate_id, + }) + + extraction = { + "name_guess": name, + "contacts": contacts, + "doc_type": { + "type": dt.doc_type, + "confidence": dt.confidence, + "signals": dt.signals, + }, + "extract": { + "method": extract_method, + "quality_score": extract_score, + "quality_flags": extract_flags, + "pages": pages[:40], + }, + "sections_present": sections_list, + "parse": { + "method": parse_method, + "version": _PARSE_VERSION, + }, + "desired_title": desired_title, + "skills_primary": 
skills_primary, + "skills_secondary": skills_secondary, + "hh_meta": { + "specializations": parsed.get("specializations"), + "employment_type": parsed.get("employment_type"), + "schedule": parsed.get("schedule"), + }, + "positions": positions_to_dicts(positions), + "positions_count": len(position_dicts), + "experience": { + "years": exp_years, + "years_engineering": exp_years_eng, # Saved in JSON too + "confidence": exp_conf, + "debug": exp_dbg + }, + "salary": {"min": sal_min, "max": sal_max, "confidence": sal_conf, "debug": sal_dbg}, + "location_guess": location, + "roles": roles, + "skills": skills, + "primary_languages": primary_languages, + "remote_guess": remote, + "english": english, + "llm_summary": llm_summary, + "llm_tags": llm_tags, + "seniority": seniority, + "backend_focus": backend_focus_flag, + "highlights": highlights, + "keywords": keywords, + "llm": { + "used": bool(llm_enriched), + "debug": llm_dbg, + "data": asdict(llm_enriched) if llm_enriched else None, + "review": llm_review_meta, + }, + } + + resume_id = _insert_resume( + con=con, + candidate_id=candidate_id, + sha=sha, + simh=simh, + clean_text=clean, + raw_text=raw_text, + extraction_json=json.dumps(extraction, ensure_ascii=False), + llm_summary=llm_summary, + llm_tags=llm_tags, + extract_method=extract_method, + extract_quality_score=extract_score, + extract_quality_flags=json.dumps(extract_flags, ensure_ascii=False), + extract_pages_json=json.dumps(pages[:40], ensure_ascii=False), + doc_type=dt.doc_type, + doc_type_confidence=dt.confidence, + parse_method=parse_method, + parse_version=_PARSE_VERSION, + sections_json=json.dumps(sections, ensure_ascii=False), + file_path=file_path, + mtime=mtime, + size=size, + near_dup_of=near_dup_of, + ) + + _insert_source(con, resume_id, a) + _insert_positions(con, resume_id, candidate_id, position_dicts) + + if sha and size is not None and mtime is not None: + _update_files_seen(con, sha, size, mtime, resume_id) + + stats["processed_new"] += 1 + + if i % commit_every == 0: + con.commit() + log.info( + f"[import] progress {i}/{len(artifacts)} " + f"new={stats['processed_new']} dup_sha={stats['dup_sha']} " + f"near={stats['near_dup']} err={stats['errors']}", + {}, + ) + + except Exception as e: + stats["errors"] += 1 + log.error("[import] artifact failed", {"err": repr(e), "artifact": a}) + + con.commit() + stats["near_dup_examples"] = near_dup_examples + log.info("[import] done", stats) + return stats diff --git a/search.py b/search.py new file mode 100644 index 0000000..11ecfb2 --- /dev/null +++ b/search.py @@ -0,0 +1,393 @@ +from __future__ import annotations + +import json +import re +import sqlite3 +from typing import Any, Dict, List, Tuple + +from tg_resume_db.normalize import normalize_skill, find_skills_in_text + + +# ----------------------------- +# Normalization helpers +# ----------------------------- + +def _norm_token(v: str) -> str: + return " ".join(str(v).strip().lower().split()) + + +def _as_list(v: Any) -> List[str]: + """ + Accepts: + - None + - list + - "a,b,c" (csv string) + """ + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + if not s: + return [] + return [x.strip() for x in s.split(",") if x.strip()] + + +def _uniq_keep_order(xs: List[str]) -> List[str]: + seen = set() + out: List[str] = [] + for x in xs: + t = _norm_token(x) + if not t or t in seen: + continue + seen.add(t) + out.append(t) + return out + + +# ----------------------------- +# Pipe-normalized columns filters +# 
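+
+
+# Example (assumed data): a candidate with skills_norm = "|python|fastapi|postgres|"
+# matches _pipe_any_clause("c.skills_norm", ["fastapi", "django"]) because instr()
+# finds the exact "|fastapi|" token; a bare substring like "fast" cannot match.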
+
+
+# -----------------------------
+# Pipe-normalized column filters
+# skills_norm / roles_norm look like: "|python|fastapi|"
+# -----------------------------
+
+def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " OR ".join(parts) + ")", args
+
+
+def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " AND ".join(parts) + ")", args
+
+
+# -----------------------------
+# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
+# -----------------------------
+
+# allow longer queries (name lists, long prompts) without aggressive truncation
+_FTS_MAX_TERMS = 48
+
+def _fts_safe_query(q: str) -> str:
+    """
+    Turn free-form recruiter text into a safe FTS5 MATCH expression.
+    We intentionally DO NOT allow raw FTS syntax from user input,
+    because it easily breaks on commas/quotes/etc.
+
+    Example:
+      "Backend developer, опыт 5+ лет, Java C++ Python" ->
+      "\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
+    """
+    if not q:
+        return "resume"
+
+    s = q.strip().lower()
+
+    # normalize common tokens
+    s = s.replace("c++", "cpp")
+    s = s.replace("c#", "csharp")
+    s = s.replace(".net", "dotnet")
+
+    # remove punctuation that breaks MATCH
+    s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # tokens (latin/cyrillic + digits + a few chars)
+    terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
+    terms = terms[:_FTS_MAX_TERMS]
+
+    if not terms:
+        return "resume"
+
+    # quote every term => safe; join with OR => broad query
+    return " OR ".join([f"\"{t}\"" for t in terms])
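+
+
+# Example (illustrative; assumes normalize_skill keeps these tokens as-is):
+#   _parse_query_modifiers("java developer +spring -php")
+#   -> (["spring"], ["php"], "java developer")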
+ """ + if not q: + return [], [], "" + must_raw = re.findall(r"\+([A-Za-z0-9#.+-]{2,})", q) + excl_raw = re.findall(r"\-([A-Za-z0-9#.+-]{2,})", q) + must = [] + exclude = [] + for t in must_raw: + canon = normalize_skill(t) + if canon: + must.append(canon) + for t in excl_raw: + canon = normalize_skill(t) + if canon: + exclude.append(canon) + if " and " in q.lower() or " & " in q: + must += find_skills_in_text(q) + + cleaned = re.sub(r"[+-][A-Za-z0-9#.+-]{2,}", " ", q) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned + + +# ----------------------------- +# Contacts +# ----------------------------- + +def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]: + rows = con.execute( + "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?", + (candidate_id,), + ).fetchall() + + m: Dict[str, List[str]] = {} + for r in rows: + m.setdefault(r["contact_type"], []).append(r["contact_value"]) + + # чуть чище: уберём дубль-контакты + for k, vals in list(m.items()): + m[k] = _uniq_keep_order(vals) + + return m + + +# ----------------------------- +# Main search (FTS + filters) +# ----------------------------- + +def search( + con: sqlite3.Connection, + query: str, + filters: Dict[str, Any], + limit: int = 20, + offset: int = 0, +) -> List[Dict[str, Any]]: + """ + Search candidates using: + - FTS5 for ranking/snippet + - stack filters for skills/roles via pipe-normalized columns + - basic filters: remote/location/experience/salary/english + """ + + where: List[str] = ["r.is_active = 1"] + params: List[Any] = [] + + must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "") + + # -------- basic filters -------- + if filters.get("remote") is not None: + where.append("c.remote = ?") + params.append(1 if bool(filters["remote"]) else 0) + + if filters.get("location"): + where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?") + params.append("%" + str(filters["location"]).lower() + "%") + + # Используем experience_years для SQL-фильтрации (широкий поиск), + # а строгая проверка experience_years_eng будет на этапе пост-фильтрации в agent.py + if filters.get("experience_min") is not None: + where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?") + params.append(float(filters["experience_min"])) + + # Salary: "unknown salary doesn't exclude" + if filters.get("salary_min") is not None: + where.append("(c.salary_max IS NULL OR c.salary_max >= ?)") + params.append(int(filters["salary_min"])) + + if filters.get("salary_max") is not None: + where.append("(c.salary_min IS NULL OR c.salary_min <= ?)") + params.append(int(filters["salary_max"])) + + if filters.get("doc_type"): + where.append("r.doc_type = ?") + params.append(str(filters["doc_type"])) + + # English: не фильтруем на уровне SQL (иначе B2 не поймает C1/C2); постфильтр в agent.py + + # -------- roles/skills stack filters -------- + # backward compatibility + skills_any: List[str] = [] + skills_all: List[str] = [] + roles_any: List[str] = [] + + if filters.get("skill"): + skills_any.append(str(filters["skill"])) + if filters.get("role"): + roles_any.append(str(filters["role"])) + + skills_any += _as_list(filters.get("skills_any")) + skills_all += _as_list(filters.get("skills_all")) + roles_any += _as_list(filters.get("roles_any")) + + skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any]) + skills_all = _uniq_keep_order([normalize_skill(s) or s for s in 
skills_all]) + roles_any = _uniq_keep_order(roles_any) + + if must_skills: + skills_all = _uniq_keep_order(skills_all + must_skills) + + # Denis rule: if any skills were provided -> enforce ANY match + if skills_any: + clause, args = _pipe_any_clause("c.skills_norm", skills_any) + where.append(clause) + params.extend(args) + + if skills_all: + clause, args = _pipe_all_clause("c.skills_norm", skills_all) + where.append(clause) + params.extend(args) + + if roles_any: + clause, args = _pipe_any_clause("c.roles_norm", roles_any) + where.append(clause) + params.extend(args) + + if exclude_skills: + for sk in exclude_skills: + where.append("instr(c.skills_norm, ?) = 0") + params.append(f"|{sk}|") + + # -------- FTS query (SAFE) -------- + fts_q = _fts_safe_query(cleaned_query or "") + + limit = max(1, min(int(limit or 20), 100)) + offset = max(0, int(offset or 0)) + + # UPDATED SQL: Added experience_years_eng and language/backend metadata + sql = f""" + SELECT + c.candidate_id, + c.name, + c.location, + c.remote, + c.experience_years, + c.experience_years_eng, + c.experience_confidence, + c.salary_min, + c.salary_max, + c.salary_confidence, + c.english_level, + c.roles_json, + c.skills_json, + c.primary_languages_json, + c.backend_focus, + r.doc_type, + r.doc_type_confidence, + r.parse_method, + r.resume_id, + snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet, + bm25(resumes_fts) AS rank + FROM resumes_fts + JOIN resumes r ON r.resume_id = resumes_fts.resume_id + JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id + WHERE resumes_fts MATCH ? AND {" AND ".join(where)} + ORDER BY rank + LIMIT ? OFFSET ? + """ + + rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall() + + out: List[Dict[str, Any]] = [] + for row in rows: + cand_id = row["candidate_id"] + contacts_map = _fetch_contacts_map(con, cand_id) + + out.append( + { + "candidate_id": cand_id, + "name": row["name"], + "location": row["location"], + "remote": bool(row["remote"]) if row["remote"] is not None else None, + "experience_years": row["experience_years"], + "experience_years_eng": row["experience_years_eng"], # Passed to agent + "experience_confidence": row["experience_confidence"], + "salary_min": row["salary_min"], + "salary_max": row["salary_max"], + "salary_confidence": row["salary_confidence"], + "english_level": row["english_level"], + "roles": json.loads(row["roles_json"] or "[]"), + "skills": json.loads(row["skills_json"] or "[]"), + "primary_languages": json.loads(row["primary_languages_json"] or "[]"), + "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None), + "doc_type": row["doc_type"], + "doc_type_confidence": row["doc_type_confidence"], + "parse_method": row["parse_method"], + "contacts": contacts_map, + "resume_id": row["resume_id"], + "snippet": row["snippet"], + "rank": row["rank"], + } + ) + + return out + + +# ----------------------------- +# Agent helper (SearchPlan -> search()) +# ----------------------------- + +def _join_csv(xs: List[str]) -> str: + xs = [str(x).strip() for x in (xs or []) if str(x).strip()] + return ",".join(xs) + + +def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]: + """ + Wrapper for agent.py. 
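+
+
+# Usage sketch (SearchPlan from agent.py fits this shape; the values are assumed):
+#   plan = SearchPlan(query_text="python backend", skills_all=["python"], remote=True, limit=10)
+#   res = search_with_filters(con, plan)  # -> {"items": [...], "count": N}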
+
+
+def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
+    """
+    Wrapper for agent.py.
+    Expects `plan` with fields:
+      query_text, skills_any, skills_all, roles_any, location, remote,
+      english_min, exp_years_min, salary_min, salary_max, limit, sort
+    Returns:
+      { "items": [...], "count": N }
+    """
+    filters = {
+        "remote": getattr(plan, "remote", None),
+        "location": getattr(plan, "location", None),
+        "experience_min": getattr(plan, "exp_years_min", None),
+        "salary_min": getattr(plan, "salary_min", None),
+        "salary_max": getattr(plan, "salary_max", None),
+        "english": getattr(plan, "english_min", None),
+        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
+        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
+        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
+    }
+
+    items = search(
+        con,
+        query=(getattr(plan, "query_text", "") or "").strip(),
+        filters=filters,
+        limit=int(getattr(plan, "limit", 20) or 20),
+        offset=0,
+    )
+
+    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
+
+    if sort_mode == "exp_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("experience_years")
+            return (v is None, -(v or 0.0))
+        items = sorted(items, key=k)
+
+    elif sort_mode == "salary_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
+            return (v is None, -(v or 0))
+        items = sorted(items, key=k)
+
+    return {"items": items, "count": len(items)}
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..b27fe5d
--- /dev/null
+++ b/util.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+def utc_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None).isoformat() + "Z"  # utcnow() is deprecated
+
+class Logger:
+    def __init__(self, log_path: Optional[str] = None):
+        self.log_path = Path(log_path) if log_path else None
+        if self.log_path:
+            self.log_path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _write(self, level: str, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        line = f"{utc_iso()} [{level}] {msg}"
+        print(line, file=sys.stdout, flush=True)
+        if self.log_path:
+            payload = {"ts": utc_iso(), "level": level, "msg": msg, "extra": extra or {}}
+            with self.log_path.open("a", encoding="utf-8") as f:
+                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+    def info(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("INFO", msg, extra)
+
+    def warn(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("WARN", msg, extra)
+
+    def error(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("ERROR", msg, extra)